diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-01-22 22:00:18 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-01-22 22:00:18 -0800 |
commit | 84621c9b18d0bb6cb267e3395c7f3131ecf4d39c (patch) | |
tree | 28566fe0211798143136b5cd154e2239d38a7b68 /drivers/xen | |
parent | 7ebd3faa9b5b42caf2d5aa1352a93dcfa0098011 (diff) | |
parent | c9f6e9977e38de15da96b732a8dec0ef56cbf977 (diff) |
Merge tag 'stable/for-linus-3.14-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
Pull Xen updates from Konrad Rzeszutek Wilk:
"Two major features that Xen community is excited about:
The first is event channel scalability by David Vrabel - we switch
over from a two-level per-cpu bitmap of events (IRQs) to a FIFO
queue with priorities. This lets us handle more events, with lower
latency and better scalability. Good stuff.
The other is PVH by Mukesh Rathor. In short, PV is a mode where the
kernel lets the hypervisor program page-tables, segments, etc. With
EPT/NPT capabilities in current processors, the overhead of doing this
in an HVM (Hardware Virtual Machine) container is much lower than the
hypervisor doing it for us.
In short we let a PV guest run without doing page-table, segment,
syscall, etc updates through the hypervisor - instead it is all done
within the guest container. It is a "hybrid" PV - hence the 'PVH'
name - a PV guest within an HVM container.
The major benefits are less code to deal with - for example we only
use one function from the pv_mmu_ops (which has 39 function
calls); faster performance for syscall (no context switches into the
hypervisor); fewer traps on various operations; etc.
It is still being baked - the ABI is not yet set in stone. But it is
pretty awesome and we are excited about it.
Lastly, there are some changes to ARM code - you should get a simple
conflict which has been resolved in #linux-next.
In short, this pull has awesome features.
Features:
- FIFO event channels. Key advantages: support for over 100,000
events (2^17), 16 different event priorities, improved fairness in
event latency through the use of FIFOs.
- Xen PVH support. "It’s a fully PV kernel mode, running with
paravirtualized disk and network, paravirtualized interrupts and
timers, no emulated devices of any kind (and thus no qemu), no BIOS
or legacy boot — but instead of requiring PV MMU, it uses the HVM
hardware extensions to virtualize the pagetables, as well as system
calls and other privileged operations." (from "The
Paravirtualization Spectrum, Part 2: From poles to a spectrum")
Bug-fixes:
- Fixes in balloon driver (refactor and make it work under ARM)
- Allow xenfb to be used in HVM guests.
- Allow xen_platform_pci=0 to work properly.
- Refactors in event channels"
* tag 'stable/for-linus-3.14-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (52 commits)
xen/pvh: Set X86_CR0_WP and others in CR0 (v2)
MAINTAINERS: add git repository for Xen
xen/pvh: Use 'depend' instead of 'select'.
xen: delete new instances of __cpuinit usage
xen/fb: allow xenfb initialization for hvm guests
xen/evtchn_fifo: fix error return code in evtchn_fifo_setup()
xen-platform: fix error return code in platform_pci_init()
xen/pvh: remove duplicated include from enlighten.c
xen/pvh: Fix compile issues with xen_pvh_domain()
xen: Use dev_is_pci() to check whether it is pci device
xen/grant-table: Force to use v1 of grants.
xen/pvh: Support ParaVirtualized Hardware extensions (v3).
xen/pvh: Piggyback on PVHVM XenBus.
xen/pvh: Piggyback on PVHVM for grant driver (v4)
xen/grant: Implement an grant frame array struct (v3).
xen/grant-table: Refactor gnttab_init
xen/grants: Remove gnttab_max_grant_frames dependency on gnttab_init.
xen/pvh: Piggyback on PVHVM for event channels (v2)
xen/pvh: Update E820 to work with PVH (v2)
xen/pvh: Secondary VCPU bringup (non-bootup CPUs)
...
Diffstat (limited to 'drivers/xen')
-rw-r--r-- | drivers/xen/Kconfig | 1 | ||||
-rw-r--r-- | drivers/xen/Makefile | 3 | ||||
-rw-r--r-- | drivers/xen/balloon.c | 9 | ||||
-rw-r--r-- | drivers/xen/dbgp.c | 2 | ||||
-rw-r--r-- | drivers/xen/events/Makefile | 5 | ||||
-rw-r--r-- | drivers/xen/events/events_2l.c | 372 | ||||
-rw-r--r-- | drivers/xen/events/events_base.c (renamed from drivers/xen/events.c) | 797 | ||||
-rw-r--r-- | drivers/xen/events/events_fifo.c | 428 | ||||
-rw-r--r-- | drivers/xen/events/events_internal.h | 150 | ||||
-rw-r--r-- | drivers/xen/evtchn.c | 2 | ||||
-rw-r--r-- | drivers/xen/gntdev.c | 2 | ||||
-rw-r--r-- | drivers/xen/grant-table.c | 90 | ||||
-rw-r--r-- | drivers/xen/pci.c | 2 | ||||
-rw-r--r-- | drivers/xen/platform-pci.c | 11 | ||||
-rw-r--r-- | drivers/xen/xenbus/xenbus_client.c | 3 | ||||
-rw-r--r-- | drivers/xen/xenbus/xenbus_probe_frontend.c | 2 |
16 files changed, 1328 insertions, 551 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 12ba6db6514..38fb36e1c59 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -3,7 +3,6 @@ menu "Xen driver support" config XEN_BALLOON bool "Xen memory balloon driver" - depends on !ARM default y help The balloon driver allows the Xen domain to request more memory from diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 14fe79d8634..d75c811bfa5 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -2,7 +2,8 @@ ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o endif obj-$(CONFIG_X86) += fallback.o -obj-y += grant-table.o features.o events.o balloon.o manage.o +obj-y += grant-table.o features.o balloon.o manage.o +obj-y += events/ obj-y += xenbus/ nostackp := $(call cc-option, -fno-stack-protector) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 4c02e2b9410..37d06ea624a 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -157,13 +157,6 @@ static struct page *balloon_retrieve(bool prefer_highmem) return page; } -static struct page *balloon_first_page(void) -{ - if (list_empty(&ballooned_pages)) - return NULL; - return list_entry(ballooned_pages.next, struct page, lru); -} - static struct page *balloon_next_page(struct page *page) { struct list_head *next = page->lru.next; @@ -328,7 +321,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages) if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); - page = balloon_first_page(); + page = list_first_entry_or_null(&ballooned_pages, struct page, lru); for (i = 0; i < nr_pages; i++) { if (!page) { nr_pages = i; diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c index f3ccc80a455..8145a59fd9f 100644 --- a/drivers/xen/dbgp.c +++ b/drivers/xen/dbgp.c @@ -19,7 +19,7 @@ static int xen_dbgp_op(struct usb_hcd *hcd, int op) dbgp.op = op; #ifdef CONFIG_PCI - if (ctrlr->bus == &pci_bus_type) { + if (dev_is_pci(ctrlr)) { const struct 
pci_dev *pdev = to_pci_dev(ctrlr); dbgp.u.pci.seg = pci_domain_nr(pdev->bus); diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile new file mode 100644 index 00000000000..62be55cd981 --- /dev/null +++ b/drivers/xen/events/Makefile @@ -0,0 +1,5 @@ +obj-y += events.o + +events-y += events_base.o +events-y += events_2l.o +events-y += events_fifo.o diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c new file mode 100644 index 00000000000..d7ff9175730 --- /dev/null +++ b/drivers/xen/events/events_2l.c @@ -0,0 +1,372 @@ +/* + * Xen event channels (2-level ABI) + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +/* + * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be + * careful to only use bitops which allow for this (e.g + * test_bit/find_first_bit and friends but not __ffs) and to pass + * BITS_PER_EVTCHN_WORD as the bitmask length. + */ +#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) +/* + * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t + * array. Primarily to avoid long lines (hence the terse name). 
+ */ +#define BM(x) (unsigned long *)(x) +/* Find the first set bit in a evtchn mask */ +#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) + +static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD], + cpu_evtchn_mask); + +static unsigned evtchn_2l_max_channels(void) +{ + return EVTCHN_2L_NR_CHANNELS; +} + +static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu))); + set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); +} + +static void evtchn_2l_clear_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, BM(&s->evtchn_pending[0])); +} + +static void evtchn_2l_set_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_is_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_test_and_set_mask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_mask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_unmask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + int do_hypercall = 0, evtchn_pending = 0; + + BUG_ON(!irqs_disabled()); + + if (unlikely((cpu != cpu_from_evtchn(port)))) + do_hypercall = 1; + else { + /* + * Need to clear the mask before checking pending to + * avoid a race with an event becoming pending. + * + * EVTCHNOP_unmask will only trigger an upcall if the + * mask bit was set, so if a hypercall is needed + * remask the event. 
+ */ + sync_clear_bit(port, BM(&s->evtchn_mask[0])); + evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); + + if (unlikely(evtchn_pending && xen_hvm_domain())) { + sync_set_bit(port, BM(&s->evtchn_mask[0])); + do_hypercall = 1; + } + } + + /* Slow path (hypercall) if this is a non-local port or if this is + * an hvm domain and an event is pending (hvm domains don't have + * their own implementation of irq_enable). */ + if (do_hypercall) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. + */ + if (evtchn_pending && + !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, + BM(&vcpu_info->evtchn_pending_sel))) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) + +static inline xen_ulong_t active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return sh->evtchn_pending[idx] & + per_cpu(cpu_evtchn_mask, cpu)[idx] & + ~sh->evtchn_mask[idx]; +} + +/* + * Search the CPU's pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for handling. + * + * Xen uses a two-level bitmap to speed searching. The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. 
+ */ +static void evtchn_2l_handle_events(unsigned cpu) +{ + int irq; + xen_ulong_t pending_words; + xen_ulong_t pending_bits; + int start_word_idx, start_bit_idx; + int word_idx, bit_idx; + int i; + struct irq_desc *desc; + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* Timer interrupt has highest priority. */ + irq = irq_from_virq(cpu, VIRQ_TIMER); + if (irq != -1) { + unsigned int evtchn = evtchn_from_irq(irq); + word_idx = evtchn / BITS_PER_LONG; + bit_idx = evtchn % BITS_PER_LONG; + if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) { + desc = irq_to_desc(irq); + if (desc) + generic_handle_irq_desc(irq, desc); + } + } + + /* + * Master flag must be cleared /before/ clearing + * selector flag. xchg_xen_ulong must contain an + * appropriate barrier. + */ + pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); + + start_word_idx = __this_cpu_read(current_word_idx); + start_bit_idx = __this_cpu_read(current_bit_idx); + + word_idx = start_word_idx; + + for (i = 0; pending_words != 0; i++) { + xen_ulong_t words; + + words = MASK_LSBS(pending_words, word_idx); + + /* + * If we masked out all events, wrap to beginning. + */ + if (words == 0) { + word_idx = 0; + bit_idx = 0; + continue; + } + word_idx = EVTCHN_FIRST_BIT(words); + + pending_bits = active_evtchns(cpu, s, word_idx); + bit_idx = 0; /* usually scan entire word from start */ + /* + * We scan the starting word in two parts. + * + * 1st time: start in the middle, scanning the + * upper bits. + * + * 2nd time: scan the whole word (not just the + * parts skipped in the first pass) -- if an + * event in the previously scanned bits is + * pending again it would just be scanned on + * the next loop anyway. + */ + if (word_idx == start_word_idx) { + if (i == 0) + bit_idx = start_bit_idx; + } + + do { + xen_ulong_t bits; + int port; + + bits = MASK_LSBS(pending_bits, bit_idx); + + /* If we masked out all events, move on. 
*/ + if (bits == 0) + break; + + bit_idx = EVTCHN_FIRST_BIT(bits); + + /* Process port. */ + port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; + irq = get_evtchn_to_irq(port); + + if (irq != -1) { + desc = irq_to_desc(irq); + if (desc) + generic_handle_irq_desc(irq, desc); + } + + bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; + + /* Next caller starts at last processed + 1 */ + __this_cpu_write(current_word_idx, + bit_idx ? word_idx : + (word_idx+1) % BITS_PER_EVTCHN_WORD); + __this_cpu_write(current_bit_idx, bit_idx); + } while (bit_idx != 0); + + /* Scan start_l1i twice; all others once. */ + if ((word_idx != start_word_idx) || (i != 0)) + pending_words &= ~(1UL << word_idx); + + word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; + } +} + +irqreturn_t xen_debug_interrupt(int irq, void *dev_id) +{ + struct shared_info *sh = HYPERVISOR_shared_info; + int cpu = smp_processor_id(); + xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); + int i; + unsigned long flags; + static DEFINE_SPINLOCK(debug_lock); + struct vcpu_info *v; + + spin_lock_irqsave(&debug_lock, flags); + + printk("\nvcpu %d\n ", cpu); + + for_each_online_cpu(i) { + int pending; + v = per_cpu(xen_vcpu, i); + pending = (get_irq_regs() && i == cpu) + ? xen_irqs_disabled(get_irq_regs()) + : v->evtchn_upcall_mask; + printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, + pending, v->evtchn_upcall_pending, + (int)(sizeof(v->evtchn_pending_sel)*2), + v->evtchn_pending_sel); + } + v = per_cpu(xen_vcpu, cpu); + + printk("\npending:\n "); + for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)sizeof(sh->evtchn_pending[0])*2, + sh->evtchn_pending[i], + i % 8 == 0 ? "\n " : " "); + printk("\nglobal mask:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_mask[i], + i % 8 == 0 ? 
"\n " : " "); + + printk("\nglobally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocal cpu%d mask:\n ", cpu); + for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), + cpu_evtchn[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { + xen_ulong_t pending = sh->evtchn_pending[i] + & ~sh->evtchn_mask[i] + & cpu_evtchn[i]; + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + pending, i % 8 == 0 ? "\n " : " "); + } + + printk("\npending list:\n"); + for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { + if (sync_test_bit(i, BM(sh->evtchn_pending))) { + int word_idx = i / BITS_PER_EVTCHN_WORD; + printk(" %d: event %d -> irq %d%s%s%s\n", + cpu_from_evtchn(i), i, + get_evtchn_to_irq(i), + sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) + ? "" : " l2-clear", + !sync_test_bit(i, BM(sh->evtchn_mask)) + ? "" : " globally-masked", + sync_test_bit(i, BM(cpu_evtchn)) + ? 
"" : " locally-masked"); + } + } + + spin_unlock_irqrestore(&debug_lock, flags); + + return IRQ_HANDLED; +} + +static const struct evtchn_ops evtchn_ops_2l = { + .max_channels = evtchn_2l_max_channels, + .nr_channels = evtchn_2l_max_channels, + .bind_to_cpu = evtchn_2l_bind_to_cpu, + .clear_pending = evtchn_2l_clear_pending, + .set_pending = evtchn_2l_set_pending, + .is_pending = evtchn_2l_is_pending, + .test_and_set_mask = evtchn_2l_test_and_set_mask, + .mask = evtchn_2l_mask, + .unmask = evtchn_2l_unmask, + .handle_events = evtchn_2l_handle_events, +}; + +void __init xen_evtchn_2l_init(void) +{ + pr_info("Using 2-level ABI\n"); + evtchn_ops = &evtchn_ops_2l; +} diff --git a/drivers/xen/events.c b/drivers/xen/events/events_base.c index 4035e833ea2..4672e003c0a 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events/events_base.c @@ -59,6 +59,10 @@ #include <xen/interface/vcpu.h> #include <asm/hw_irq.h> +#include "events_internal.h" + +const struct evtchn_ops *evtchn_ops; + /* * This lock protects updates to the following mapping and reference-count * arrays. The lock does not need to be acquired to read the mapping tables. @@ -73,71 +77,15 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; -/* Interrupt types. 
*/ -enum xen_irq_type { - IRQT_UNBOUND = 0, - IRQT_PIRQ, - IRQT_VIRQ, - IRQT_IPI, - IRQT_EVTCHN -}; - -/* - * Packed IRQ information: - * type - enum xen_irq_type - * event channel - irq->event channel mapping - * cpu - cpu this event channel is bound to - * index - type-specific information: - * PIRQ - physical IRQ, GSI, flags, and owner domain - * VIRQ - virq number - * IPI - IPI vector - * EVTCHN - - */ -struct irq_info { - struct list_head list; - int refcnt; - enum xen_irq_type type; /* type */ - unsigned irq; - unsigned short evtchn; /* event channel */ - unsigned short cpu; /* cpu bound */ - - union { - unsigned short virq; - enum ipi_vector ipi; - struct { - unsigned short pirq; - unsigned short gsi; - unsigned char flags; - uint16_t domid; - } pirq; - } u; -}; -#define PIRQ_NEEDS_EOI (1 << 0) -#define PIRQ_SHAREABLE (1 << 1) - -static int *evtchn_to_irq; +int **evtchn_to_irq; #ifdef CONFIG_X86 static unsigned long *pirq_eoi_map; #endif static bool (*pirq_needs_eoi)(unsigned irq); -/* - * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be - * careful to only use bitops which allow for this (e.g - * test_bit/find_first_bit and friends but not __ffs) and to pass - * BITS_PER_EVTCHN_WORD as the bitmask length. - */ -#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) -/* - * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t - * array. Primarily to avoid long lines (hence the terse name). - */ -#define BM(x) (unsigned long *)(x) -/* Find the first set bit in a evtchn mask */ -#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) - -static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD], - cpu_evtchn_mask); +#define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq)) /* Xen will never allocate port zero for any purpose. 
*/ #define VALID_EVTCHN(chn) ((chn) != 0) @@ -148,19 +96,75 @@ static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); static void disable_dynirq(struct irq_data *data); +static void clear_evtchn_to_irq_row(unsigned row) +{ + unsigned col; + + for (col = 0; col < EVTCHN_PER_ROW; col++) + evtchn_to_irq[row][col] = -1; +} + +static void clear_evtchn_to_irq_all(void) +{ + unsigned row; + + for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) { + if (evtchn_to_irq[row] == NULL) + continue; + clear_evtchn_to_irq_row(row); + } +} + +static int set_evtchn_to_irq(unsigned evtchn, unsigned irq) +{ + unsigned row; + unsigned col; + + if (evtchn >= xen_evtchn_max_channels()) + return -EINVAL; + + row = EVTCHN_ROW(evtchn); + col = EVTCHN_COL(evtchn); + + if (evtchn_to_irq[row] == NULL) { + /* Unallocated irq entries return -1 anyway */ + if (irq == -1) + return 0; + + evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL); + if (evtchn_to_irq[row] == NULL) + return -ENOMEM; + + clear_evtchn_to_irq_row(row); + } + + evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)] = irq; + return 0; +} + +int get_evtchn_to_irq(unsigned evtchn) +{ + if (evtchn >= xen_evtchn_max_channels()) + return -1; + if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) + return -1; + return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]; +} + /* Get info for IRQ */ -static struct irq_info *info_for_irq(unsigned irq) +struct irq_info *info_for_irq(unsigned irq) { return irq_get_handler_data(irq); } /* Constructors for packed IRQ information. 
*/ -static void xen_irq_info_common_init(struct irq_info *info, +static int xen_irq_info_common_setup(struct irq_info *info, unsigned irq, enum xen_irq_type type, - unsigned short evtchn, + unsigned evtchn, unsigned short cpu) { + int ret; BUG_ON(info->type != IRQT_UNBOUND && info->type != type); @@ -169,68 +173,78 @@ static void xen_irq_info_common_init(struct irq_info *info, info->evtchn = evtchn; info->cpu = cpu; - evtchn_to_irq[evtchn] = irq; + ret = set_evtchn_to_irq(evtchn, irq); + if (ret < 0) + return ret; irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + + return xen_evtchn_port_setup(info); } -static void xen_irq_info_evtchn_init(unsigned irq, - unsigned short evtchn) +static int xen_irq_info_evtchn_setup(unsigned irq, + unsigned evtchn) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_EVTCHN, evtchn, 0); + return xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); } -static void xen_irq_info_ipi_init(unsigned cpu, +static int xen_irq_info_ipi_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, + unsigned evtchn, enum ipi_vector ipi) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_IPI, evtchn, 0); - info->u.ipi = ipi; per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); } -static void xen_irq_info_virq_init(unsigned cpu, +static int xen_irq_info_virq_setup(unsigned cpu, unsigned irq, - unsigned short evtchn, - unsigned short virq) + unsigned evtchn, + unsigned virq) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_VIRQ, evtchn, 0); - info->u.virq = virq; per_cpu(virq_to_irq, cpu)[virq] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0); } -static void xen_irq_info_pirq_init(unsigned irq, - unsigned short evtchn, - unsigned short pirq, - unsigned short gsi, +static int xen_irq_info_pirq_setup(unsigned irq, + unsigned evtchn, + unsigned 
pirq, + unsigned gsi, uint16_t domid, unsigned char flags) { struct irq_info *info = info_for_irq(irq); - xen_irq_info_common_init(info, irq, IRQT_PIRQ, evtchn, 0); - info->u.pirq.pirq = pirq; info->u.pirq.gsi = gsi; info->u.pirq.domid = domid; info->u.pirq.flags = flags; + + return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); +} + +static void xen_irq_info_cleanup(struct irq_info *info) +{ + set_evtchn_to_irq(info->evtchn, -1); + info->evtchn = 0; } /* * Accessors for packed IRQ information. */ -static unsigned int evtchn_from_irq(unsigned irq) +unsigned int evtchn_from_irq(unsigned irq) { if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq))) return 0; @@ -240,10 +254,15 @@ static unsigned int evtchn_from_irq(unsigned irq) unsigned irq_from_evtchn(unsigned int evtchn) { - return evtchn_to_irq[evtchn]; + return get_evtchn_to_irq(evtchn); } EXPORT_SYMBOL_GPL(irq_from_evtchn); +int irq_from_virq(unsigned int cpu, unsigned int virq) +{ + return per_cpu(virq_to_irq, cpu)[virq]; +} + static enum ipi_vector ipi_from_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); @@ -279,14 +298,14 @@ static enum xen_irq_type type_from_irq(unsigned irq) return info_for_irq(irq)->type; } -static unsigned cpu_from_irq(unsigned irq) +unsigned cpu_from_irq(unsigned irq) { return info_for_irq(irq)->cpu; } -static unsigned int cpu_from_evtchn(unsigned int evtchn) +unsigned int cpu_from_evtchn(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); unsigned ret = 0; if (irq != -1) @@ -310,67 +329,29 @@ static bool pirq_needs_eoi_flag(unsigned irq) return info->u.pirq.flags & PIRQ_NEEDS_EOI; } -static inline xen_ulong_t active_evtchns(unsigned int cpu, - struct shared_info *sh, - unsigned int idx) -{ - return sh->evtchn_pending[idx] & - per_cpu(cpu_evtchn_mask, cpu)[idx] & - ~sh->evtchn_mask[idx]; -} - static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) { - int irq = evtchn_to_irq[chn]; + int 
irq = get_evtchn_to_irq(chn); + struct irq_info *info = info_for_irq(irq); BUG_ON(irq == -1); #ifdef CONFIG_SMP cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); #endif - clear_bit(chn, BM(per_cpu(cpu_evtchn_mask, cpu_from_irq(irq)))); - set_bit(chn, BM(per_cpu(cpu_evtchn_mask, cpu))); - - info_for_irq(irq)->cpu = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ - int i; -#ifdef CONFIG_SMP - struct irq_info *info; - - /* By default all event channels notify CPU#0. */ - list_for_each_entry(info, &xen_irq_list_head, list) { - struct irq_desc *desc = irq_to_desc(info->irq); - cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); - } -#endif - - for_each_possible_cpu(i) - memset(per_cpu(cpu_evtchn_mask, i), - (i == 0) ? ~0 : 0, NR_EVENT_CHANNELS/8); -} + xen_evtchn_port_bind_to_cpu(info, cpu); -static inline void clear_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, BM(&s->evtchn_pending[0])); + info->cpu = cpu; } -static inline void set_evtchn(int port) +static void xen_evtchn_mask_all(void) { - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, BM(&s->evtchn_pending[0])); -} + unsigned int evtchn; -static inline int test_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_bit(port, BM(&s->evtchn_pending[0])); + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); } - /** * notify_remote_via_irq - send event to remote end of event channel via irq * @irq: irq of event channel to send event to @@ -388,63 +369,6 @@ void notify_remote_via_irq(int irq) } EXPORT_SYMBOL_GPL(notify_remote_via_irq); -static void mask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, BM(&s->evtchn_mask[0])); -} - -static void unmask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - unsigned int cpu = get_cpu(); - int do_hypercall = 0, evtchn_pending = 0; - - 
BUG_ON(!irqs_disabled()); - - if (unlikely((cpu != cpu_from_evtchn(port)))) - do_hypercall = 1; - else { - /* - * Need to clear the mask before checking pending to - * avoid a race with an event becoming pending. - * - * EVTCHNOP_unmask will only trigger an upcall if the - * mask bit was set, so if a hypercall is needed - * remask the event. - */ - sync_clear_bit(port, BM(&s->evtchn_mask[0])); - evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); - - if (unlikely(evtchn_pending && xen_hvm_domain())) { - sync_set_bit(port, BM(&s->evtchn_mask[0])); - do_hypercall = 1; - } - } - - /* Slow path (hypercall) if this is a non-local port or if this is - * an hvm domain and an event is pending (hvm domains don't have - * their own implementation of irq_enable). */ - if (do_hypercall) { - struct evtchn_unmask unmask = { .port = port }; - (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - - /* - * The following is basically the equivalent of - * 'hw_resend_irq'. Just like a real IO-APIC we 'lose - * the interrupt edge' if the channel is masked. - */ - if (evtchn_pending && - !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, - BM(&vcpu_info->evtchn_pending_sel))) - vcpu_info->evtchn_upcall_pending = 1; - } - - put_cpu(); -} - static void xen_irq_init(unsigned irq) { struct irq_info *info; @@ -538,6 +462,18 @@ static void xen_free_irq(unsigned irq) irq_free_desc(irq); } +static void xen_evtchn_close(unsigned int port) +{ + struct evtchn_close close; + + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + + /* Closed ports are implicitly re-bound to VCPU0. 
*/ + bind_evtchn_to_cpu(port, 0); +} + static void pirq_query_unmask(int irq) { struct physdev_irq_status_query irq_status; @@ -610,7 +546,13 @@ static unsigned int __startup_pirq(unsigned int irq) pirq_query_unmask(irq); - evtchn_to_irq[evtchn] = irq; + rc = set_evtchn_to_irq(evtchn, irq); + if (rc != 0) { + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", + irq, rc); + xen_evtchn_close(evtchn); + return 0; + } bind_evtchn_to_cpu(evtchn, 0); info->evtchn = evtchn; @@ -628,10 +570,9 @@ static unsigned int startup_pirq(struct irq_data *data) static void shutdown_pirq(struct irq_data *data) { - struct evtchn_close close; unsigned int irq = data->irq; struct irq_info *info = info_for_irq(irq); - int evtchn = evtchn_from_irq(irq); + unsigned evtchn = evtchn_from_irq(irq); BUG_ON(info->type != IRQT_PIRQ); @@ -639,14 +580,8 @@ static void shutdown_pirq(struct irq_data *data) return; mask_evtchn(evtchn); - - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - bind_evtchn_to_cpu(evtchn, 0); - evtchn_to_irq[evtchn] = -1; - info->evtchn = 0; + xen_evtchn_close(evtchn); + xen_irq_info_cleanup(info); } static void enable_pirq(struct irq_data *data) @@ -675,6 +610,41 @@ int xen_irq_from_gsi(unsigned gsi) } EXPORT_SYMBOL_GPL(xen_irq_from_gsi); +static void __unbind_from_irq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + struct irq_info *info = irq_get_handler_data(irq); + + if (info->refcnt > 0) { + info->refcnt--; + if (info->refcnt != 0) + return; + } + + if (VALID_EVTCHN(evtchn)) { + unsigned int cpu = cpu_from_irq(irq); + + xen_evtchn_close(evtchn); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; + break; + case IRQT_IPI: + per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; + break; + default: + break; + } + + xen_irq_info_cleanup(info); + } + + BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); + + xen_free_irq(irq); +} + /* * Do not make any 
assumptions regarding the relationship between the * IRQ number returned here and the Xen pirq argument. @@ -690,6 +660,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, { int irq = -1; struct physdev_irq irq_op; + int ret; mutex_lock(&irq_mapping_update_lock); @@ -717,8 +688,13 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, goto out; } - xen_irq_info_pirq_init(irq, 0, pirq, gsi, DOMID_SELF, + ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF, shareable ? PIRQ_SHAREABLE : 0); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } pirq_query_unmask(irq); /* We try to use the handler with the appropriate semantic for the @@ -778,7 +754,9 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, name); - xen_irq_info_pirq_init(irq, 0, pirq, 0, domid, 0); + ret = xen_irq_info_pirq_setup(irq, 0, pirq, 0, domid, 0); + if (ret < 0) + goto error_irq; ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_irq; @@ -786,8 +764,8 @@ out: mutex_unlock(&irq_mapping_update_lock); return irq; error_irq: + __unbind_from_irq(irq); mutex_unlock(&irq_mapping_update_lock); - xen_free_irq(irq); return ret; } #endif @@ -857,13 +835,18 @@ int xen_pirq_from_irq(unsigned irq) return pirq_from_irq(irq); } EXPORT_SYMBOL_GPL(xen_pirq_from_irq); + int bind_evtchn_to_irq(unsigned int evtchn) { int irq; + int ret; + + if (evtchn >= xen_evtchn_max_channels()) + return -ENOMEM; mutex_lock(&irq_mapping_update_lock); - irq = evtchn_to_irq[evtchn]; + irq = get_evtchn_to_irq(evtchn); if (irq == -1) { irq = xen_allocate_irq_dynamic(); @@ -873,7 +856,12 @@ int bind_evtchn_to_irq(unsigned int evtchn) irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, handle_edge_irq, "event"); - xen_irq_info_evtchn_init(irq, evtchn); + ret = xen_irq_info_evtchn_setup(irq, evtchn); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } } else { struct irq_info *info = 
info_for_irq(irq); WARN_ON(info == NULL || info->type != IRQT_EVTCHN); @@ -890,6 +878,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; int evtchn, irq; + int ret; mutex_lock(&irq_mapping_update_lock); @@ -909,8 +898,12 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) BUG(); evtchn = bind_ipi.port; - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); - + ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); } else { struct irq_info *info = info_for_irq(irq); @@ -943,7 +936,7 @@ static int find_virq(unsigned int virq, unsigned int cpu) int port, rc = -ENOENT; memset(&status, 0, sizeof(status)); - for (port = 0; port <= NR_EVENT_CHANNELS; port++) { + for (port = 0; port < xen_evtchn_max_channels(); port++) { status.dom = DOMID_SELF; status.port = port; rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); @@ -959,6 +952,19 @@ static int find_virq(unsigned int virq, unsigned int cpu) return rc; } +/** + * xen_evtchn_nr_channels - number of usable event channel ports + * + * This may be less than the maximum supported by the current + * hypervisor ABI. Use xen_evtchn_max_channels() for the maximum + * supported. 
+ */ +unsigned xen_evtchn_nr_channels(void) +{ + return evtchn_ops->nr_channels(); +} +EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels); + int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { struct evtchn_bind_virq bind_virq; @@ -989,7 +995,12 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) evtchn = ret; } - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } bind_evtchn_to_cpu(evtchn, cpu); } else { @@ -1005,50 +1016,8 @@ out: static void unbind_from_irq(unsigned int irq) { - struct evtchn_close close; - int evtchn = evtchn_from_irq(irq); - struct irq_info *info = irq_get_handler_data(irq); - - if (WARN_ON(!info)) - return; - mutex_lock(&irq_mapping_update_lock); - - if (info->refcnt > 0) { - info->refcnt--; - if (info->refcnt != 0) - goto done; - } - - if (VALID_EVTCHN(evtchn)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - switch (type_from_irq(irq)) { - case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [virq_from_irq(irq)] = -1; - break; - case IRQT_IPI: - per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) - [ipi_from_irq(irq)] = -1; - break; - default: - break; - } - - /* Closed ports are implicitly re-bound to VCPU0. */ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; - } - - BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); - - xen_free_irq(irq); - - done: + __unbind_from_irq(irq); mutex_unlock(&irq_mapping_update_lock); } @@ -1148,9 +1117,26 @@ void unbind_from_irqhandler(unsigned int irq, void *dev_id) } EXPORT_SYMBOL_GPL(unbind_from_irqhandler); +/** + * xen_set_irq_priority() - set an event channel priority. + * @irq:irq bound to an event channel. + * @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN. 
+ */ +int xen_set_irq_priority(unsigned irq, unsigned priority) +{ + struct evtchn_set_priority set_priority; + + set_priority.port = evtchn_from_irq(irq); + set_priority.priority = priority; + + return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority, + &set_priority); +} +EXPORT_SYMBOL_GPL(xen_set_irq_priority); + int evtchn_make_refcounted(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); struct irq_info *info; if (irq == -1) @@ -1175,12 +1161,12 @@ int evtchn_get(unsigned int evtchn) struct irq_info *info; int err = -ENOENT; - if (evtchn >= NR_EVENT_CHANNELS) + if (evtchn >= xen_evtchn_max_channels()) return -EINVAL; mutex_lock(&irq_mapping_update_lock); - irq = evtchn_to_irq[evtchn]; + irq = get_evtchn_to_irq(evtchn); if (irq == -1) goto done; @@ -1204,7 +1190,7 @@ EXPORT_SYMBOL_GPL(evtchn_get); void evtchn_put(unsigned int evtchn) { - int irq = evtchn_to_irq[evtchn]; + int irq = get_evtchn_to_irq(evtchn); if (WARN_ON(irq == -1)) return; unbind_from_irq(irq); @@ -1228,222 +1214,21 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) notify_remote_via_irq(irq); } -irqreturn_t xen_debug_interrupt(int irq, void *dev_id) -{ - struct shared_info *sh = HYPERVISOR_shared_info; - int cpu = smp_processor_id(); - xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); - int i; - unsigned long flags; - static DEFINE_SPINLOCK(debug_lock); - struct vcpu_info *v; - - spin_lock_irqsave(&debug_lock, flags); - - printk("\nvcpu %d\n ", cpu); - - for_each_online_cpu(i) { - int pending; - v = per_cpu(xen_vcpu, i); - pending = (get_irq_regs() && i == cpu) - ? 
xen_irqs_disabled(get_irq_regs()) - : v->evtchn_upcall_mask; - printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, - pending, v->evtchn_upcall_pending, - (int)(sizeof(v->evtchn_pending_sel)*2), - v->evtchn_pending_sel); - } - v = per_cpu(xen_vcpu, cpu); - - printk("\npending:\n "); - for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", - (int)sizeof(sh->evtchn_pending[0])*2, - sh->evtchn_pending[i], - i % 8 == 0 ? "\n " : " "); - printk("\nglobal mask:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nglobally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_pending[i] & ~sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocal cpu%d mask:\n ", cpu); - for (i = (NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), - cpu_evtchn[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nlocally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { - xen_ulong_t pending = sh->evtchn_pending[i] - & ~sh->evtchn_mask[i] - & cpu_evtchn[i]; - printk("%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - pending, i % 8 == 0 ? "\n " : " "); - } - - printk("\npending list:\n"); - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (sync_test_bit(i, BM(sh->evtchn_pending))) { - int word_idx = i / BITS_PER_EVTCHN_WORD; - printk(" %d: event %d -> irq %d%s%s%s\n", - cpu_from_evtchn(i), i, - evtchn_to_irq[i], - sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) - ? "" : " l2-clear", - !sync_test_bit(i, BM(sh->evtchn_mask)) - ? "" : " globally-masked", - sync_test_bit(i, BM(cpu_evtchn)) - ? 
"" : " locally-masked"); - } - } - - spin_unlock_irqrestore(&debug_lock, flags); - - return IRQ_HANDLED; -} - static DEFINE_PER_CPU(unsigned, xed_nesting_count); -static DEFINE_PER_CPU(unsigned int, current_word_idx); -static DEFINE_PER_CPU(unsigned int, current_bit_idx); - -/* - * Mask out the i least significant bits of w - */ -#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) -/* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. - */ static void __xen_evtchn_do_upcall(void) { - int start_word_idx, start_bit_idx; - int word_idx, bit_idx; - int i, irq; - int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + int cpu = get_cpu(); unsigned count; do { - xen_ulong_t pending_words; - xen_ulong_t pending_bits; - struct irq_desc *desc; - vcpu_info->evtchn_upcall_pending = 0; if (__this_cpu_inc_return(xed_nesting_count) - 1) goto out; - /* - * Master flag must be cleared /before/ clearing - * selector flag. xchg_xen_ulong must contain an - * appropriate barrier. 
- */ - if ((irq = per_cpu(virq_to_irq, cpu)[VIRQ_TIMER]) != -1) { - int evtchn = evtchn_from_irq(irq); - word_idx = evtchn / BITS_PER_LONG; - pending_bits = evtchn % BITS_PER_LONG; - if (active_evtchns(cpu, s, word_idx) & (1ULL << pending_bits)) { - desc = irq_to_desc(irq); - if (desc) - generic_handle_irq_desc(irq, desc); - } - } - - pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); - - start_word_idx = __this_cpu_read(current_word_idx); - start_bit_idx = __this_cpu_read(current_bit_idx); - - word_idx = start_word_idx; - - for (i = 0; pending_words != 0; i++) { - xen_ulong_t words; - - words = MASK_LSBS(pending_words, word_idx); - - /* - * If we masked out all events, wrap to beginning. - */ - if (words == 0) { - word_idx = 0; - bit_idx = 0; - continue; - } - word_idx = EVTCHN_FIRST_BIT(words); - - pending_bits = active_evtchns(cpu, s, word_idx); - bit_idx = 0; /* usually scan entire word from start */ - /* - * We scan the starting word in two parts. - * - * 1st time: start in the middle, scanning the - * upper bits. - * - * 2nd time: scan the whole word (not just the - * parts skipped in the first pass) -- if an - * event in the previously scanned bits is - * pending again it would just be scanned on - * the next loop anyway. - */ - if (word_idx == start_word_idx) { - if (i == 0) - bit_idx = start_bit_idx; - } - - do { - xen_ulong_t bits; - int port; - - bits = MASK_LSBS(pending_bits, bit_idx); - - /* If we masked out all events, move on. */ - if (bits == 0) - break; - - bit_idx = EVTCHN_FIRST_BIT(bits); - - /* Process port. */ - port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; - irq = evtchn_to_irq[port]; - - if (irq != -1) { - desc = irq_to_desc(irq); - if (desc) - generic_handle_irq_desc(irq, desc); - } - - bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; - - /* Next caller starts at last processed + 1 */ - __this_cpu_write(current_word_idx, - bit_idx ? 
word_idx : - (word_idx+1) % BITS_PER_EVTCHN_WORD); - __this_cpu_write(current_bit_idx, bit_idx); - } while (bit_idx != 0); - - /* Scan start_l1i twice; all others once. */ - if ((word_idx != start_word_idx) || (i != 0)) - pending_words &= ~(1UL << word_idx); - - word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; - } + xen_evtchn_handle_events(cpu); BUG_ON(!irqs_disabled()); @@ -1492,12 +1277,12 @@ void rebind_evtchn_irq(int evtchn, int irq) mutex_lock(&irq_mapping_update_lock); /* After resume the irq<->evtchn mappings are all cleared out */ - BUG_ON(evtchn_to_irq[evtchn] != -1); + BUG_ON(get_evtchn_to_irq(evtchn) != -1); /* Expect irq to have been bound before, so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); - xen_irq_info_evtchn_init(irq, evtchn); + (void)xen_irq_info_evtchn_setup(irq, evtchn); mutex_unlock(&irq_mapping_update_lock); @@ -1511,7 +1296,6 @@ void rebind_evtchn_irq(int evtchn, int irq) /* Rebind an evtchn so that it gets delivered to a specific cpu */ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) { - struct shared_info *s = HYPERVISOR_shared_info; struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); int masked; @@ -1534,7 +1318,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) * Mask the event while changing the VCPU binding to prevent * it being delivered on an unexpected VCPU. 
*/ - masked = sync_test_and_set_bit(evtchn, BM(s->evtchn_mask)); + masked = test_and_set_mask(evtchn); /* * If this fails, it usually just indicates that we're dealing with a @@ -1558,22 +1342,26 @@ static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, return rebind_irq_to_cpu(data->irq, tcpu); } -int resend_irq_on_evtchn(unsigned int irq) +static int retrigger_evtchn(int evtchn) { - int masked, evtchn = evtchn_from_irq(irq); - struct shared_info *s = HYPERVISOR_shared_info; + int masked; if (!VALID_EVTCHN(evtchn)) - return 1; + return 0; - masked = sync_test_and_set_bit(evtchn, BM(s->evtchn_mask)); - sync_set_bit(evtchn, BM(s->evtchn_pending)); + masked = test_and_set_mask(evtchn); + set_evtchn(evtchn); if (!masked) unmask_evtchn(evtchn); return 1; } +int resend_irq_on_evtchn(unsigned int irq) +{ + return retrigger_evtchn(evtchn_from_irq(irq)); +} + static void enable_dynirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); @@ -1608,21 +1396,7 @@ static void mask_ack_dynirq(struct irq_data *data) static int retrigger_dynirq(struct irq_data *data) { - int evtchn = evtchn_from_irq(data->irq); - struct shared_info *sh = HYPERVISOR_shared_info; - int ret = 0; - - if (VALID_EVTCHN(evtchn)) { - int masked; - - masked = sync_test_and_set_bit(evtchn, BM(sh->evtchn_mask)); - sync_set_bit(evtchn, BM(sh->evtchn_pending)); - if (!masked) - unmask_evtchn(evtchn); - ret = 1; - } - - return ret; + return retrigger_evtchn(evtchn_from_irq(data->irq)); } static void restore_pirqs(void) @@ -1683,7 +1457,7 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn = bind_virq.port; /* Record the new mapping. */ - xen_irq_info_virq_init(cpu, irq, evtchn, virq); + (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1707,7 +1481,7 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn = bind_ipi.port; /* Record the new mapping. 
*/ - xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); + (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1784,21 +1558,18 @@ EXPORT_SYMBOL_GPL(xen_test_irq_shared); void xen_irq_resume(void) { - unsigned int cpu, evtchn; + unsigned int cpu; struct irq_info *info; - init_evtchn_cpu_bindings(); - /* New event-channel space is not 'live' yet. */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - mask_evtchn(evtchn); + xen_evtchn_mask_all(); + xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. */ list_for_each_entry(info, &xen_irq_list_head, list) info->evtchn = 0; /* zap event-channel binding */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) - evtchn_to_irq[evtchn] = -1; + clear_evtchn_to_irq_all(); for_each_possible_cpu(cpu) { restore_cpu_virqs(cpu); @@ -1889,27 +1660,40 @@ void xen_callback_vector(void) void xen_callback_vector(void) {} #endif +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +static bool fifo_events = true; +module_param(fifo_events, bool, 0); + void __init xen_init_IRQ(void) { - int i; + int ret = -EINVAL; - evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), - GFP_KERNEL); - BUG_ON(!evtchn_to_irq); - for (i = 0; i < NR_EVENT_CHANNELS; i++) - evtchn_to_irq[i] = -1; + if (fifo_events) + ret = xen_evtchn_fifo_init(); + if (ret < 0) + xen_evtchn_2l_init(); - init_evtchn_cpu_bindings(); + evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), + sizeof(*evtchn_to_irq), GFP_KERNEL); + BUG_ON(!evtchn_to_irq); /* No event channels are 'live' right now. 
*/ - for (i = 0; i < NR_EVENT_CHANNELS; i++) - mask_evtchn(i); + xen_evtchn_mask_all(); pirq_needs_eoi = pirq_needs_eoi_flag; #ifdef CONFIG_X86 - if (xen_hvm_domain()) { + if (xen_pv_domain()) { + irq_ctx_init(smp_processor_id()); + if (xen_initial_domain()) + pci_xen_initial_domain(); + } + if (xen_feature(XENFEAT_hvm_callback_vector)) xen_callback_vector(); + + if (xen_hvm_domain()) { native_init_IRQ(); /* pci_xen_hvm_init must be called after native_init_IRQ so that * __acpi_register_gsi can point at the right function */ @@ -1918,13 +1702,10 @@ void __init xen_init_IRQ(void) int rc; struct physdev_pirq_eoi_gmfn eoi_gmfn; - irq_ctx_init(smp_processor_id()); - if (xen_initial_domain()) - pci_xen_initial_domain(); - pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map); rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); + /* TODO: No PVH support for PIRQ EOI */ if (rc != 0) { free_page((unsigned long) pirq_eoi_map); pirq_eoi_map = NULL; diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c new file mode 100644 index 00000000000..1de2a191b39 --- /dev/null +++ b/drivers/xen/events/events_fifo.c @@ -0,0 +1,428 @@ +/* + * Xen event channels (FIFO-based ABI) + * + * Copyright (C) 2013 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * Or, when distributed separately from the Linux kernel or + * incorporated into other software packages, subject to the following + * license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t)) +#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE) + +struct evtchn_fifo_queue { + uint32_t head[EVTCHN_FIFO_MAX_QUEUES]; +}; + +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block); +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue); +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly; +static unsigned event_array_pages __read_mostly; + +#define BM(w) ((unsigned long *)(w)) + +static inline event_word_t *event_word_from_port(unsigned port) +{ + unsigned i = port / EVENT_WORDS_PER_PAGE; + + return event_array[i] + port % EVENT_WORDS_PER_PAGE; +} + +static unsigned evtchn_fifo_max_channels(void) +{ + return EVTCHN_FIFO_NR_CHANNELS; +} + +static unsigned evtchn_fifo_nr_channels(void) +{ + return event_array_pages * EVENT_WORDS_PER_PAGE; +} + +static void free_unused_array_pages(void) +{ + unsigned i; + + for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) { + if (!event_array[i]) + break; + free_page((unsigned long)event_array[i]); + event_array[i] = NULL; + } +} + +static void init_array_page(event_word_t *array_page) +{ + unsigned i; + + for (i = 0; i < EVENT_WORDS_PER_PAGE; i++) + array_page[i] = 1 << EVTCHN_FIFO_MASKED; +} + +static int evtchn_fifo_setup(struct irq_info *info) +{ + unsigned port = info->evtchn; + unsigned new_array_pages; + int ret; + + new_array_pages = 
port / EVENT_WORDS_PER_PAGE + 1; + + if (new_array_pages > MAX_EVENT_ARRAY_PAGES) + return -EINVAL; + + while (event_array_pages < new_array_pages) { + void *array_page; + struct evtchn_expand_array expand_array; + + /* Might already have a page if we've resumed. */ + array_page = event_array[event_array_pages]; + if (!array_page) { + array_page = (void *)__get_free_page(GFP_KERNEL); + if (array_page == NULL) { + ret = -ENOMEM; + goto error; + } + event_array[event_array_pages] = array_page; + } + + /* Mask all events in this page before adding it. */ + init_array_page(array_page); + + expand_array.array_gfn = virt_to_mfn(array_page); + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array); + if (ret < 0) + goto error; + + event_array_pages++; + } + return 0; + + error: + if (event_array_pages == 0) + panic("xen: unable to expand event array with initial page (%d)\n", ret); + else + pr_err("unable to expand event array (%d)\n", ret); + free_unused_array_pages(); + return ret; +} + +static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + /* no-op */ +} + +static void evtchn_fifo_clear_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_clear_bit(EVTCHN_FIFO_PENDING, BM(word)); +} + +static void evtchn_fifo_set_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_PENDING, BM(word)); +} + +static bool evtchn_fifo_is_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_PENDING, BM(word)); +} + +static bool evtchn_fifo_test_and_set_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_and_set_bit(EVTCHN_FIFO_MASKED, BM(word)); +} + +static void evtchn_fifo_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_MASKED, BM(word)); +} + +/* + * Clear MASKED, spinning if BUSY is set. 
+ */ +static void clear_masked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w & ~(1 << EVTCHN_FIFO_BUSY); + new = old & ~(1 << EVTCHN_FIFO_MASKED); + w = sync_cmpxchg(word, old, new); + } while (w != old); +} + +static void evtchn_fifo_unmask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + + BUG_ON(!irqs_disabled()); + + clear_masked(word); + if (sync_test_bit(EVTCHN_FIFO_PENDING, BM(word))) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } +} + +static uint32_t clear_linked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w; + new = (w & ~((1 << EVTCHN_FIFO_LINKED) + | EVTCHN_FIFO_LINK_MASK)); + } while ((w = sync_cmpxchg(word, old, new)) != old); + + return w & EVTCHN_FIFO_LINK_MASK; +} + +static void handle_irq_for_port(unsigned port) +{ + int irq; + struct irq_desc *desc; + + irq = get_evtchn_to_irq(port); + if (irq != -1) { + desc = irq_to_desc(irq); + if (desc) + generic_handle_irq_desc(irq, desc); + } +} + +static void consume_one_event(unsigned cpu, + struct evtchn_fifo_control_block *control_block, + unsigned priority, uint32_t *ready) +{ + struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); + uint32_t head; + unsigned port; + event_word_t *word; + + head = q->head[priority]; + + /* + * Reached the tail last time? Read the new HEAD from the + * control block. + */ + if (head == 0) { + rmb(); /* Ensure word is up-to-date before reading head. */ + head = control_block->head[priority]; + } + + port = head; + word = event_word_from_port(port); + head = clear_linked(word); + + /* + * If the link is non-zero, there are more events in the + * queue, otherwise the queue is empty. + * + * If the queue is empty, clear this priority from our local + * copy of the ready word. 
+ */ + if (head == 0) + clear_bit(priority, BM(ready)); + + if (sync_test_bit(EVTCHN_FIFO_PENDING, BM(word)) + && !sync_test_bit(EVTCHN_FIFO_MASKED, BM(word))) + handle_irq_for_port(port); + + q->head[priority] = head; +} + +static void evtchn_fifo_handle_events(unsigned cpu) +{ + struct evtchn_fifo_control_block *control_block; + uint32_t ready; + unsigned q; + + control_block = per_cpu(cpu_control_block, cpu); + + ready = xchg(&control_block->ready, 0); + + while (ready) { + q = find_first_bit(BM(&ready), EVTCHN_FIFO_MAX_QUEUES); + consume_one_event(cpu, control_block, q, &ready); + ready |= xchg(&control_block->ready, 0); + } +} + +static void evtchn_fifo_resume(void) +{ + unsigned cpu; + + for_each_possible_cpu(cpu) { + void *control_block = per_cpu(cpu_control_block, cpu); + struct evtchn_init_control init_control; + int ret; + + if (!control_block) + continue; + + /* + * If this CPU is offline, take the opportunity to + * free the control block while it is not being + * used. + */ + if (!cpu_online(cpu)) { + free_page((unsigned long)control_block); + per_cpu(cpu_control_block, cpu) = NULL; + continue; + } + + init_control.control_gfn = virt_to_mfn(control_block); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, + &init_control); + if (ret < 0) + BUG(); + } + + /* + * The event array starts out as empty again and is extended + * as normal when events are bound. The existing pages will + * be reused. 
+ */ + event_array_pages = 0; +} + +static const struct evtchn_ops evtchn_ops_fifo = { + .max_channels = evtchn_fifo_max_channels, + .nr_channels = evtchn_fifo_nr_channels, + .setup = evtchn_fifo_setup, + .bind_to_cpu = evtchn_fifo_bind_to_cpu, + .clear_pending = evtchn_fifo_clear_pending, + .set_pending = evtchn_fifo_set_pending, + .is_pending = evtchn_fifo_is_pending, + .test_and_set_mask = evtchn_fifo_test_and_set_mask, + .mask = evtchn_fifo_mask, + .unmask = evtchn_fifo_unmask, + .handle_events = evtchn_fifo_handle_events, + .resume = evtchn_fifo_resume, +}; + +static int evtchn_fifo_init_control_block(unsigned cpu) +{ + struct page *control_block = NULL; + struct evtchn_init_control init_control; + int ret = -ENOMEM; + + control_block = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (control_block == NULL) + goto error; + + init_control.control_gfn = virt_to_mfn(page_address(control_block)); + init_control.offset = 0; + init_control.vcpu = cpu; + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control); + if (ret < 0) + goto error; + + per_cpu(cpu_control_block, cpu) = page_address(control_block); + + return 0; + + error: + __free_page(control_block); + return ret; +} + +static int evtchn_fifo_cpu_notification(struct notifier_block *self, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = 0; + + switch (action) { + case CPU_UP_PREPARE: + if (!per_cpu(cpu_control_block, cpu)) + ret = evtchn_fifo_init_control_block(cpu); + break; + default: + break; + } + return ret < 0 ? 
NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block evtchn_fifo_cpu_notifier = { + .notifier_call = evtchn_fifo_cpu_notification, +}; + +int __init xen_evtchn_fifo_init(void) +{ + int cpu = get_cpu(); + int ret; + + ret = evtchn_fifo_init_control_block(cpu); + if (ret < 0) + goto out; + + pr_info("Using FIFO-based ABI\n"); + + evtchn_ops = &evtchn_ops_fifo; + + register_cpu_notifier(&evtchn_fifo_cpu_notifier); +out: + put_cpu(); + return ret; +} diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h new file mode 100644 index 00000000000..677f41a0fff --- /dev/null +++ b/drivers/xen/events/events_internal.h @@ -0,0 +1,150 @@ +/* + * Xen Event Channels (internal header) + * + * Copyright (C) 2013 Citrix Systems R&D Ltd. + * + * This source code is licensed under the GNU General Public License, + * Version 2 or later. See the file COPYING for more details. + */ +#ifndef __EVENTS_INTERNAL_H__ +#define __EVENTS_INTERNAL_H__ + +/* Interrupt types. */ +enum xen_irq_type { + IRQT_UNBOUND = 0, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; + +/* + * Packed IRQ information: + * type - enum xen_irq_type + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: + * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM + * guest, or GSI (real passthrough IRQ) of the device. 
+ * VIRQ - virq number + * IPI - IPI vector + * EVTCHN - + */ +struct irq_info { + struct list_head list; + int refcnt; + enum xen_irq_type type; /* type */ + unsigned irq; + unsigned int evtchn; /* event channel */ + unsigned short cpu; /* cpu bound */ + + union { + unsigned short virq; + enum ipi_vector ipi; + struct { + unsigned short pirq; + unsigned short gsi; + unsigned char vector; + unsigned char flags; + uint16_t domid; + } pirq; + } u; +}; + +#define PIRQ_NEEDS_EOI (1 << 0) +#define PIRQ_SHAREABLE (1 << 1) + +struct evtchn_ops { + unsigned (*max_channels)(void); + unsigned (*nr_channels)(void); + + int (*setup)(struct irq_info *info); + void (*bind_to_cpu)(struct irq_info *info, unsigned cpu); + + void (*clear_pending)(unsigned port); + void (*set_pending)(unsigned port); + bool (*is_pending)(unsigned port); + bool (*test_and_set_mask)(unsigned port); + void (*mask)(unsigned port); + void (*unmask)(unsigned port); + + void (*handle_events)(unsigned cpu); + void (*resume)(void); +}; + +extern const struct evtchn_ops *evtchn_ops; + +extern int **evtchn_to_irq; +int get_evtchn_to_irq(unsigned int evtchn); + +struct irq_info *info_for_irq(unsigned irq); +unsigned cpu_from_irq(unsigned irq); +unsigned cpu_from_evtchn(unsigned int evtchn); + +static inline unsigned xen_evtchn_max_channels(void) +{ + return evtchn_ops->max_channels(); +} + +/* + * Do any ABI specific setup for a bound event channel before it can + * be unmasked and used. 
+ */ +static inline int xen_evtchn_port_setup(struct irq_info *info) +{ + if (evtchn_ops->setup) + return evtchn_ops->setup(info); + return 0; +} + +static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info, + unsigned cpu) +{ + evtchn_ops->bind_to_cpu(info, cpu); +} + +static inline void clear_evtchn(unsigned port) +{ + evtchn_ops->clear_pending(port); +} + +static inline void set_evtchn(unsigned port) +{ + evtchn_ops->set_pending(port); +} + +static inline bool test_evtchn(unsigned port) +{ + return evtchn_ops->is_pending(port); +} + +static inline bool test_and_set_mask(unsigned port) +{ + return evtchn_ops->test_and_set_mask(port); +} + +static inline void mask_evtchn(unsigned port) +{ + return evtchn_ops->mask(port); +} + +static inline void unmask_evtchn(unsigned port) +{ + return evtchn_ops->unmask(port); +} + +static inline void xen_evtchn_handle_events(unsigned cpu) +{ + return evtchn_ops->handle_events(cpu); +} + +static inline void xen_evtchn_resume(void) +{ + if (evtchn_ops->resume) + evtchn_ops->resume(); +} + +void xen_evtchn_2l_init(void); +int xen_evtchn_fifo_init(void); + +#endif /* #ifndef __EVENTS_INTERNAL_H__ */ diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 5de2063e16d..00f40f051d9 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -417,7 +417,7 @@ static long evtchn_ioctl(struct file *file, break; rc = -EINVAL; - if (unbind.port >= NR_EVENT_CHANNELS) + if (unbind.port >= xen_evtchn_nr_channels()) break; rc = -ENOTCONN; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index e41c79c986e..073b4a19a8b 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -846,7 +846,7 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; - use_ptemod = xen_pv_domain(); + use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap); err = misc_register(&gntdev_miscdev); if (err != 0) { diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index aa846a48f40..1ce1c40331f 100644 
--- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -62,12 +62,10 @@ static grant_ref_t **gnttab_list; static unsigned int nr_grant_frames; -static unsigned int boot_max_nr_grant_frames; static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); -unsigned long xen_hvm_resume_frames; -EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); +struct grant_frames xen_auto_xlat_grant_frames; static union { struct grant_entry_v1 *v1; @@ -827,6 +825,11 @@ static unsigned int __max_nr_grant_frames(void) unsigned int gnttab_max_grant_frames(void) { unsigned int xen_max = __max_nr_grant_frames(); + static unsigned int boot_max_nr_grant_frames; + + /* First time, initialize it properly. */ + if (!boot_max_nr_grant_frames) + boot_max_nr_grant_frames = __max_nr_grant_frames(); if (xen_max > boot_max_nr_grant_frames) return boot_max_nr_grant_frames; @@ -834,6 +837,51 @@ unsigned int gnttab_max_grant_frames(void) } EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_setup_auto_xlat_frames(unsigned long addr) +{ + xen_pfn_t *pfn; + unsigned int max_nr_gframes = __max_nr_grant_frames(); + unsigned int i; + void *vaddr; + + if (xen_auto_xlat_grant_frames.count) + return -EINVAL; + + vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes); + if (vaddr == NULL) { + pr_warn("Failed to ioremap gnttab share frames (addr=0x%08lx)!\n", + addr); + return -ENOMEM; + } + pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); + if (!pfn) { + xen_unmap(vaddr); + return -ENOMEM; + } + for (i = 0; i < max_nr_gframes; i++) + pfn[i] = PFN_DOWN(addr) + i; + + xen_auto_xlat_grant_frames.vaddr = vaddr; + xen_auto_xlat_grant_frames.pfn = pfn; + xen_auto_xlat_grant_frames.count = max_nr_gframes; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames); + +void gnttab_free_auto_xlat_frames(void) +{ + if (!xen_auto_xlat_grant_frames.count) + return; + kfree(xen_auto_xlat_grant_frames.pfn); + xen_unmap(xen_auto_xlat_grant_frames.vaddr); + + 
xen_auto_xlat_grant_frames.pfn = NULL; + xen_auto_xlat_grant_frames.count = 0; + xen_auto_xlat_grant_frames.vaddr = NULL; +} +EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames); + /* Handling of paged out grant targets (GNTST_eagain) */ #define MAX_DELAY 256 static inline void @@ -1060,10 +1108,11 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) unsigned int nr_gframes = end_idx + 1; int rc; - if (xen_hvm_domain()) { + if (xen_feature(XENFEAT_auto_translated_physmap)) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; + BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes); /* * Loop backwards, so that the first hypercall has the largest * index, ensuring that the table will grow only once. @@ -1072,7 +1121,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) xatp.domid = DOMID_SELF; xatp.idx = i; xatp.space = XENMAPSPACE_grant_table; - xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; + xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i]; rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); if (rc != 0) { pr_warn("grant table add_to_physmap failed, err=%d\n", @@ -1135,10 +1184,8 @@ static void gnttab_request_version(void) int rc; struct gnttab_set_version gsv; - if (xen_hvm_domain()) - gsv.version = 1; - else - gsv.version = 2; + gsv.version = 1; + rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); if (rc == 0 && gsv.version == 2) { grant_table_version = 2; @@ -1169,22 +1216,15 @@ static int gnttab_setup(void) if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - if (xen_pv_domain()) - return gnttab_map(0, nr_grant_frames - 1); - - if (gnttab_shared.addr == NULL) { - gnttab_shared.addr = xen_remap(xen_hvm_resume_frames, - PAGE_SIZE * max_nr_gframes); + if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { + gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; if (gnttab_shared.addr == NULL) { - pr_warn("Failed to ioremap gnttab share frames (addr=0x%08lx)!\n", - 
xen_hvm_resume_frames); + pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n", + (unsigned long)xen_auto_xlat_grant_frames.vaddr); return -ENOMEM; } } - - gnttab_map(0, nr_grant_frames - 1); - - return 0; + return gnttab_map(0, nr_grant_frames - 1); } int gnttab_resume(void) @@ -1227,13 +1267,12 @@ int gnttab_init(void) gnttab_request_version(); nr_grant_frames = 1; - boot_max_nr_grant_frames = __max_nr_grant_frames(); /* Determine the maximum number of frames required for the * grant reference free list on the current hypervisor. */ BUG_ON(grefs_per_grant_frame == 0); - max_nr_glist_frames = (boot_max_nr_grant_frames * + max_nr_glist_frames = (gnttab_max_grant_frames() * grefs_per_grant_frame / RPP); gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), @@ -1286,5 +1325,6 @@ static int __gnttab_init(void) return gnttab_init(); } - -core_initcall(__gnttab_init); +/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called + * beforehand to initialize xen_auto_xlat_grant_frames. 
*/ +core_initcall_sync(__gnttab_init); diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index 188825122aa..dd9c249ea31 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -26,7 +26,9 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include "../pci/pci.h" +#ifdef CONFIG_PCI_MMCONFIG #include <asm/pci_x86.h> +#endif static bool __read_mostly pci_seg_supported = true; diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 2f3528e93cb..a1361c312c0 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -108,6 +108,7 @@ static int platform_pci_init(struct pci_dev *pdev, long ioaddr; long mmio_addr, mmio_len; unsigned int max_nr_gframes; + unsigned long grant_frames; if (!xen_domain()) return -ENODEV; @@ -154,13 +155,17 @@ static int platform_pci_init(struct pci_dev *pdev, } max_nr_gframes = gnttab_max_grant_frames(); - xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); - ret = gnttab_init(); + grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + ret = gnttab_setup_auto_xlat_frames(grant_frames); if (ret) goto out; + ret = gnttab_init(); + if (ret) + goto grant_out; xenbus_probe(NULL); return 0; - +grant_out: + gnttab_free_auto_xlat_frames(); out: pci_release_region(pdev, 0); mem_out: diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index ec097d6f964..01d59e66565 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -45,6 +45,7 @@ #include <xen/grant_table.h> #include <xen/xenbus.h> #include <xen/xen.h> +#include <xen/features.h> #include "xenbus_probe.h" @@ -743,7 +744,7 @@ static const struct xenbus_ring_ops ring_ops_hvm = { void __init xenbus_ring_ops_init(void) { - if (xen_pv_domain()) + if (!xen_feature(XENFEAT_auto_translated_physmap)) ring_ops = &ring_ops_pv; else ring_ops = &ring_ops_hvm; diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 
129bf84c19e..cb385c10d2b 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -496,7 +496,7 @@ subsys_initcall(xenbus_probe_frontend_init); #ifndef MODULE static int __init boot_wait_for_devices(void) { - if (xen_hvm_domain() && !xen_platform_pci_unplug) + if (!xen_has_pv_devices()) return -ENODEV; ready_to_wait_for_devices = 1; |