From a6c372de6e4b9a8188b66badcee3e3792eccdd26 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:01 -0600 Subject: lguest: fix lguest wake on guest clock tick, or fd activity The Launcher could be inside the Guest on another CPU; wake_up_process will do nothing because it is "running". kick_process will knock it back into our kernel in this case, otherwise we'll miss it until the next guest exit. Signed-off-by: Rusty Russell --- drivers/lguest/interrupts_and_traps.c | 6 +++--- drivers/lguest/lguest_user.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 6e99adbe194..9ea26ad88c9 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -511,9 +511,9 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) /* Remember the first interrupt is the timer interrupt. */ set_bit(0, cpu->irqs_pending); - /* If the Guest is actually stopped, we need to wake it up. */ - if (cpu->halted) - wake_up_process(cpu->tsk); + /* Guest may be stopped or running on another CPU. */ + if (!wake_up_process(cpu->tsk)) + kick_process(cpu->tsk); return HRTIMER_NORESTART; } diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index b8ee103eed5..bcdcf3453e7 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -24,8 +24,8 @@ static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) if (on) { cpu->break_out = 1; - /* Pop it out of the Guest (may be running on different CPU) */ - wake_up_process(cpu->tsk); + if (!wake_up_process(cpu->tsk)) + kick_process(cpu->tsk); /* Wait for them to reset it */ return wait_event_interruptible(cpu->break_wq, !cpu->break_out); } else { -- cgit v1.2.3-70-g09d2 From abd41f037e1a64543000ed73b42f616d04d92700 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:02 -0600 Subject: lguest: fix race in halt code When the Guest does the LHCALL_HALT hypercall, we go to sleep, expecting that a timer or the Waker will wake_up_process() us. But we do it in a stupid way, leaving a classic missing wakeup race. So split maybe_do_interrupt() into interrupt_pending() and try_deliver_interrupt(), and check maybe_do_interrupt() and the "break_out" flag before calling schedule. Signed-off-by: Rusty Russell --- drivers/lguest/core.c | 14 ++++++++++++-- drivers/lguest/interrupts_and_traps.c | 26 +++++++++++++++++--------- drivers/lguest/lg.h | 3 ++- 3 files changed, 31 insertions(+), 12 deletions(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 4845fb3cf74..8ca1def5b14 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -188,6 +188,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) { /* We stop running once the Guest is dead. */ while (!cpu->lg->dead) { + unsigned int irq; + /* First we run any hypercalls the Guest wants done. */ if (cpu->hcall) do_hypercalls(cpu); @@ -211,7 +213,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* Check if there are any interrupts which can be delivered now: * if so, this sets up the hander to be executed when we next * run the Guest. */ - maybe_do_interrupt(cpu); + irq = interrupt_pending(cpu); + if (irq < LGUEST_IRQS) + try_deliver_interrupt(cpu, irq); /* All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, @@ -227,7 +231,13 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) * clock timer or LHREQ_BREAK from the Waker will wake us. */ if (cpu->halted) { set_current_state(TASK_INTERRUPTIBLE); - schedule(); + /* Just before we sleep, make sure nothing snuck in + * which we should be doing. */ + if (interrupt_pending(cpu) < LGUEST_IRQS + || cpu->break_out) + set_current_state(TASK_RUNNING); + else + schedule(); continue; } diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 9ea26ad88c9..a8c966fee1e 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -128,30 +128,38 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, /*H:205 * Virtual Interrupts. * - * maybe_do_interrupt() gets called before every entry to the Guest, to see if - * we should divert the Guest to running an interrupt handler. */ -void maybe_do_interrupt(struct lg_cpu *cpu) + * interrupt_pending() returns the first pending interrupt which isn't blocked + * by the Guest. It is called before every entry to the Guest, and just before + * we go to sleep when the Guest has halted itself. */ +unsigned int interrupt_pending(struct lg_cpu *cpu) { unsigned int irq; DECLARE_BITMAP(blk, LGUEST_IRQS); - struct desc_struct *idt; /* If the Guest hasn't even initialized yet, we can do nothing. */ if (!cpu->lg->lguest_data) - return; + return LGUEST_IRQS; /* Take our "irqs_pending" array and remove any interrupts the Guest * wants blocked: the result ends up in "blk". */ if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, sizeof(blk))) - return; + return LGUEST_IRQS; bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); /* Find the first interrupt. */ irq = find_first_bit(blk, LGUEST_IRQS); - /* None? Nothing to do */ - if (irq >= LGUEST_IRQS) - return; + + return irq; +} + +/* This actually diverts the Guest to running an interrupt handler, once an + * interrupt has been identified by interrupt_pending(). */ +void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq) +{ + struct desc_struct *idt; + + BUG_ON(irq >= LGUEST_IRQS); /* They may be in the middle of an iret, where they asked us never to * deliver interrupts. */ diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index af92a176697..6743cf147d9 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -139,7 +139,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ -void maybe_do_interrupt(struct lg_cpu *cpu); +unsigned int interrupt_pending(struct lg_cpu *cpu); +void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq); bool deliver_trap(struct lg_cpu *cpu, unsigned int num); void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, u32 low, u32 hi); -- cgit v1.2.3-70-g09d2 From a32a8813d0173163ba44d8f9556e0d89fdc4fb46 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:02 -0600 Subject: lguest: improve interrupt handling, speed up stream networking lguest never checked for pending interrupts when enabling interrupts, and things still worked. However, it makes a significant difference to TCP performance, so it's time we fixed it by introducing a pending_irq flag and checking it on irq_restore and irq_enable. These two routines are now too big to patch into the 8/10 bytes patch space, so we drop that code. Note: The high latency on interrupt delivery had a very curious effect: once everything else was optimized, networking without GSO was faster than networking with GSO, since more interrupts were sent and hence a greater chance of one getting through to the Guest! Note2: (Almost) Closing the same loophole for iret doesn't have any measurable effect, so I'm leaving that patch for the moment. Before: 1GB tcpblast Guest->Host: 30.7 seconds 1GB tcpblast Guest->Host (no GSO): 76.0 seconds After: 1GB tcpblast Guest->Host: 6.8 seconds 1GB tcpblast Guest->Host (no GSO): 27.8 seconds Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest_hcall.h | 1 + arch/x86/lguest/boot.c | 21 +++++++++++++++------ arch/x86/lguest/i386_head.S | 2 -- drivers/lguest/core.c | 7 ++++--- drivers/lguest/hypercalls.c | 4 ++++ drivers/lguest/interrupts_and_traps.c | 16 +++++++++++++--- drivers/lguest/lg.h | 4 ++-- include/linux/lguest.h | 4 ++++ 8 files changed, 43 insertions(+), 16 deletions(-) (limited to 'drivers/lguest') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index faae1996487..f9a9f781124 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -17,6 +17,7 @@ #define LHCALL_LOAD_TLS 16 #define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 +#define LHCALL_SEND_INTERRUPTS 19 #define LGUEST_TRAP_ENTRY 0x1F diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 2392a7a171c..37b8c1d3e02 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -205,6 +205,12 @@ PV_CALLEE_SAVE_REGS_THUNK(save_fl); static void restore_fl(unsigned long flags) { lguest_data.irq_enabled = flags; + mb(); + /* Null hcall forces interrupt delivery now, if irq_pending is + * set to X86_EFLAGS_IF (ie. an interrupt is pending, and flags + * enables interrupts. */ + if (flags & lguest_data.irq_pending) + kvm_hypercall0(LHCALL_SEND_INTERRUPTS); } PV_CALLEE_SAVE_REGS_THUNK(restore_fl); @@ -219,6 +225,11 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable); static void irq_enable(void) { lguest_data.irq_enabled = X86_EFLAGS_IF; + mb(); + /* Null hcall forces interrupt delivery now. */ + if (lguest_data.irq_pending) + kvm_hypercall0(LHCALL_SEND_INTERRUPTS); + } PV_CALLEE_SAVE_REGS_THUNK(irq_enable); @@ -972,10 +983,10 @@ static void lguest_restart(char *reason) * * Our current solution is to allow the paravirt back end to optionally patch * over the indirect calls to replace them with something more efficient. We - * patch the four most commonly called functions: disable interrupts, enable - * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 - * bytes to patch into: the Guest versions of these operations are small enough - * that we can fit comfortably. + * patch two of the simplest of the most commonly called functions: disable + * interrupts and save interrupts. We usually have 6 or 10 bytes to patch + * into: the Guest versions of these operations are small enough that we can + * fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, * and these are in i386_head.S. */ @@ -986,8 +997,6 @@ static const struct lguest_insns const char *start, *end; } lguest_insns[] = { [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti }, - [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf }, [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, }; diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index f7954198947..3e0c5545d59 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -46,8 +46,6 @@ ENTRY(lguest_entry) .globl lgstart_##name; .globl lgend_##name LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) /*:*/ diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 8ca1def5b14..03fbc88c002 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -189,6 +189,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* We stop running once the Guest is dead. */ while (!cpu->lg->dead) { unsigned int irq; + bool more; /* First we run any hypercalls the Guest wants done. */ if (cpu->hcall) @@ -213,9 +214,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* Check if there are any interrupts which can be delivered now: * if so, this sets up the hander to be executed when we next * run the Guest. */ - irq = interrupt_pending(cpu); + irq = interrupt_pending(cpu, &more); if (irq < LGUEST_IRQS) - try_deliver_interrupt(cpu, irq); + try_deliver_interrupt(cpu, irq, more); /* All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, @@ -233,7 +234,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) set_current_state(TASK_INTERRUPTIBLE); /* Just before we sleep, make sure nothing snuck in * which we should be doing. */ - if (interrupt_pending(cpu) < LGUEST_IRQS + if (interrupt_pending(cpu, &more) < LGUEST_IRQS || cpu->break_out) set_current_state(TASK_RUNNING); else diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 54d66f05fef..f252b71ae79 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -37,6 +37,10 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) /* This call does nothing, except by breaking out of the Guest * it makes us process all the asynchronous hypercalls. */ break; + case LHCALL_SEND_INTERRUPTS: + /* This call does nothing too, but by breaking out of the Guest + * it makes us process any pending interrupts. */ + break; case LHCALL_LGUEST_INIT: /* You can't get here unless you're already initialized. Don't * do that. */ diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index a8c966fee1e..5a10754b479 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -131,7 +131,7 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, * interrupt_pending() returns the first pending interrupt which isn't blocked * by the Guest. It is called before every entry to the Guest, and just before * we go to sleep when the Guest has halted itself. */ -unsigned int interrupt_pending(struct lg_cpu *cpu) +unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) { unsigned int irq; DECLARE_BITMAP(blk, LGUEST_IRQS); @@ -149,13 +149,14 @@ unsigned int interrupt_pending(struct lg_cpu *cpu) /* Find the first interrupt. */ irq = find_first_bit(blk, LGUEST_IRQS); + *more = find_next_bit(blk, LGUEST_IRQS, irq+1); return irq; } /* This actually diverts the Guest to running an interrupt handler, once an * interrupt has been identified by interrupt_pending(). */ -void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq) +void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) { struct desc_struct *idt; @@ -178,8 +179,12 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq) u32 irq_enabled; if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) irq_enabled = 0; - if (!irq_enabled) + if (!irq_enabled) { + /* Make sure they know an IRQ is pending. */ + put_user(X86_EFLAGS_IF, + &cpu->lg->lguest_data->irq_pending); return; + } } /* Look at the IDT entry the Guest gave us for this interrupt. The @@ -202,6 +207,11 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq) * here is a compromise which means at least it gets updated every * timer interrupt. */ write_timestamp(cpu); + + /* If there are no other interrupts we want to deliver, clear + * the pending flag. */ + if (!more) + put_user(0, &cpu->lg->lguest_data->irq_pending); } /*:*/ diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 6743cf147d9..573896533ac 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -139,8 +139,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ -unsigned int interrupt_pending(struct lg_cpu *cpu); -void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq); +unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); +void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more); bool deliver_trap(struct lg_cpu *cpu, unsigned int num); void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, u32 low, u32 hi); diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 175e63f4a8c..7bc1440fc47 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -30,6 +30,10 @@ struct lguest_data /* Wallclock time set by the Host. */ struct timespec time; + /* Interrupt pending set by the Host. The Guest should do a hypercall + * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */ + int irq_pending; + /* Async hypercall ring. Instead of directly making hypercalls, we can * place them in here for processing the next time the Host wants. * This batching can be quite efficient. */ -- cgit v1.2.3-70-g09d2 From 81b79b01d057f8c5a378c38d2f738775b972934a Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 20 May 2009 01:45:45 +0200 Subject: lguest: beyond ARRAY_SIZE of cpu->arch.gdt Do not go beyond ARRAY_SIZE of cpu->arch.gdt Signed-off-by: Roel Kluin Signed-off-by: Rusty Russell --- drivers/lguest/segments.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 7ede64ffeef..482ed5a1875 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c @@ -150,7 +150,7 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) { /* We assume the Guest has the same number of GDT entries as the * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ - if (num > ARRAY_SIZE(cpu->arch.gdt)) + if (num >= ARRAY_SIZE(cpu->arch.gdt)) kill_guest(cpu, "too many gdt entries %i", num); /* Set it up, then fix it. */ -- cgit v1.2.3-70-g09d2 From f086122bb6e885f926f935b1418fca3b293375f0 Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Fri, 12 Jun 2009 22:27:04 -0600 Subject: lguest: Segment selectors are 16-bit long. Fix lg_cpu.ss1 definition. If GDT_ENTRIES were every > 256, this could become a problem. Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell --- drivers/lguest/lg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 573896533ac..74af503ad63 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -49,7 +49,7 @@ struct lg_cpu { u32 cr2; int ts; u32 esp1; - u8 ss1; + u16 ss1; /* Bitmap of what has changed: see CHANGED_* above. */ int changed; -- cgit v1.2.3-70-g09d2 From ed1dc77810159a733240ba6751c1b31023bf8dd7 Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Sat, 30 May 2009 15:35:49 -0300 Subject: lguest: map switcher with executable page table entries Map switcher with executable page table entries. (This bug didn't matter before PAE and hence NX support -- RR) Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell --- drivers/lguest/core.c | 2 +- drivers/lguest/page_tables.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 03fbc88c002..d0298dc45d9 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -95,7 +95,7 @@ static __init int map_switcher(void) * array of struct pages. It increments that pointer, but we don't * care. */ pagep = switcher_page; - err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep); + err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); if (err) { printk("lguest: map_vm_area failed: %i\n", err); goto free_vma; diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a059cf9980f..496995370fb 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -714,7 +714,7 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) /* Make the last PGD entry for this Guest point to the Switcher's PTE * page for this CPU (with appropriate flags). */ - switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL); + switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; -- cgit v1.2.3-70-g09d2 From 90603d15fa95605d1d08235b73e220d766f04bb0 Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Fri, 12 Jun 2009 22:27:06 -0600 Subject: lguest: use native_set_* macros, which properly handle 64-bit entries when PAE is activated Some cleanups and replace direct assignment with native_set_* macros which properly handle 64-bit entries when PAE is activated Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 8 ++++---- drivers/lguest/page_tables.c | 35 ++++++++++++++++++----------------- 2 files changed, 22 insertions(+), 21 deletions(-) (limited to 'drivers/lguest') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 514f4d0d2bf..4f311e40d0a 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -525,7 +525,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - *ptep = pteval; + native_set_pte(ptep, pteval); lguest_pte_update(mm, addr, ptep); } @@ -534,9 +534,9 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, * changed. */ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { - *pmdp = pmdval; + native_set_pmd(pmdp, pmdval); lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / 4); + (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); } /* There are a couple of legacy places where the kernel sets a PTE, but we @@ -550,7 +550,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) * which brings boot back to 0.25 seconds. */ static void lguest_set_pte(pte_t *ptep, pte_t pteval) { - *ptep = pteval; + native_set_pte(ptep, pteval); if (cr3_changed) lazy_hcall1(LHCALL_FLUSH_TLB, 1); } diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 496995370fb..ffba723cd98 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -90,7 +90,7 @@ static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); /* You should never call this if the PGD entry wasn't valid */ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); - return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE]; + return &page[pte_index(vaddr)]; } /* These two functions just like the above two, except they access the Guest @@ -105,7 +105,7 @@ static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); - return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); + return gpage + pte_index(vaddr) * sizeof(pte_t); } /*:*/ @@ -171,7 +171,7 @@ static void release_pte(pte_t pte) /* Remember that get_user_pages_fast() took a reference to the page, in * get_pfn()? We have to put it back now. */ if (pte_flags(pte) & _PAGE_PRESENT) - put_page(pfn_to_page(pte_pfn(pte))); + put_page(pte_page(pte)); } /*:*/ @@ -273,7 +273,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so * we can update the Guest's _PAGE_DIRTY flag. */ - *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0); + native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); /* Finally, we write the Guest PTE entry back: we've set the * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ @@ -323,7 +323,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) } /*H:450 If we chase down the release_pgd() code, it looks like this: */ -static void release_pgd(struct lguest *lg, pgd_t *spgd) +static void release_pgd(pgd_t *spgd) { /* If the entry's not present, there's nothing to release. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { @@ -350,7 +350,7 @@ static void flush_user_mappings(struct lguest *lg, int idx) unsigned int i; /* Release every pgd entry up to the kernel's address. */ for (i = 0; i < pgd_index(lg->kernel_address); i++) - release_pgd(lg, lg->pgdirs[idx].pgdir + i); + release_pgd(lg->pgdirs[idx].pgdir + i); } /*H:440 (v) Flushing (throwing away) page tables, @@ -431,7 +431,7 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /*H:430 (iv) Switching page tables * - * Now we've seen all the page table setting and manipulation, let's see what + * Now we've seen all the page table setting and manipulation, let's see * what happens when the Guest changes page tables (ie. changes the top-level * pgdir). This occurs on almost every context switch. */ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) @@ -463,7 +463,7 @@ static void release_all_pagetables(struct lguest *lg) if (lg->pgdirs[i].pgdir) /* Every PGD entry except the Switcher at the top */ for (j = 0; j < SWITCHER_PGD_INDEX; j++) - release_pgd(lg, lg->pgdirs[i].pgdir + j); + release_pgd(lg->pgdirs[i].pgdir + j); } /* We also throw away everything when a Guest tells us it's changed a kernel @@ -581,7 +581,7 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) pgdir = find_pgdir(lg, gpgdir); if (pgdir < ARRAY_SIZE(lg->pgdirs)) /* ... throw it away. */ - release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); + release_pgd(lg->pgdirs[pgdir].pgdir + idx); } /* Once we know how much memory we have we can construct simple identity @@ -726,8 +726,9 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) * page is already mapped there, we don't have to copy them out * again. */ pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; - regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL)); - switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; + native_set_pte(®s_pte, pfn_pte(pfn, PAGE_KERNEL)); + native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], + regs_pte); } /*:*/ @@ -752,21 +753,21 @@ static __init void populate_switcher_pte_page(unsigned int cpu, /* The first entries are easy: they map the Switcher code. */ for (i = 0; i < pages; i++) { - pte[i] = mk_pte(switcher_page[i], - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); + native_set_pte(&pte[i], mk_pte(switcher_page[i], + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } /* The only other thing we map is this CPU's pair of pages. */ i = pages + cpu*2; /* First page (Guest registers) is writable from the Guest */ - pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); + native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); /* The second page contains the "struct lguest_ro_state", and is * read-only. */ - pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); + native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } /* We've made it through the page table code. Perhaps our tired brains are -- cgit v1.2.3-70-g09d2 From ebe0ba84f55950a89cb7af94c7ffc35ee3992f9e Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Sat, 30 May 2009 15:48:08 -0300 Subject: lguest: replace hypercall name LHCALL_SET_PMD with LHCALL_SET_PGD replace LHCALL_SET_PMD with LHCALL_SET_PGD hypercall name (That's really what it is, and the confusion gets worse with PAE support) Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell Reported-by: Jeremy Fitzhardinge --- arch/x86/include/asm/lguest_hcall.h | 2 +- arch/x86/lguest/boot.c | 2 +- drivers/lguest/hypercalls.c | 4 ++-- drivers/lguest/lg.h | 2 +- drivers/lguest/page_tables.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/lguest') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index f9a9f781124..05b9c198e4b 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -13,7 +13,7 @@ #define LHCALL_SET_CLOCKEVENT 9 #define LHCALL_HALT 10 #define LHCALL_SET_PTE 14 -#define LHCALL_SET_PMD 15 +#define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 #define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4f311e40d0a..943a75ef70b 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -535,7 +535,7 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, + lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); } diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index f252b71ae79..51149ca1461 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -79,8 +79,8 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) case LHCALL_SET_PTE: guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); break; - case LHCALL_SET_PMD: - guest_set_pmd(cpu->lg, args->arg1, args->arg2); + case LHCALL_SET_PGD: + guest_set_pgd(cpu->lg, args->arg1, args->arg2); break; case LHCALL_SET_CLOCKEVENT: guest_set_clockevent(cpu, args->arg1); diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 74af503ad63..cacc2da2058 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -169,7 +169,7 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); int init_guest_pagetable(struct lguest *lg); void free_guest_pagetable(struct lguest *lg); void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); -void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); +void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); void guest_pagetable_clear_all(struct lg_cpu *cpu); void guest_pagetable_flush_user(struct lg_cpu *cpu); void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index ffba723cd98..6a54d76b623 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -568,7 +568,7 @@ void guest_set_pte(struct lg_cpu *cpu, * * So with that in mind here's our code to to update a (top-level) PGD entry: */ -void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) +void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) { int pgdir; -- cgit v1.2.3-70-g09d2 From acdd0b6292b282c4511897ac2691a47befbf1c6a Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Fri, 12 Jun 2009 22:27:07 -0600 Subject: lguest: PAE support This version requires that host and guest have the same PAE status. NX cap is not offered to the guest, yet. Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.txt | 1 - arch/x86/include/asm/lguest.h | 7 +- arch/x86/include/asm/lguest_hcall.h | 3 +- arch/x86/lguest/Kconfig | 1 - arch/x86/lguest/boot.c | 71 +++++++- drivers/lguest/Kconfig | 2 +- drivers/lguest/hypercalls.c | 10 + drivers/lguest/lg.h | 5 + drivers/lguest/page_tables.c | 351 ++++++++++++++++++++++++++++++++---- 9 files changed, 403 insertions(+), 48 deletions(-) (limited to 'drivers/lguest') diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt index 28c747362f9..efb3a6a045a 100644 --- a/Documentation/lguest/lguest.txt +++ b/Documentation/lguest/lguest.txt @@ -37,7 +37,6 @@ Running Lguest: "Paravirtualized guest support" = Y "Lguest guest support" = Y "High Memory Support" = off/4GB - "PAE (Physical Address Extension) Support" = N "Alignment value to which kernel should be aligned" = 0x100000 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and CONFIG_PHYSICAL_ALIGN=0x100000) diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 1caf57628b9..313389cd50d 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -17,8 +17,13 @@ /* Pages for switcher itself, then two pages per cpu */ #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) -/* We map at -4M for ease of mapping into the guest (one PTE page). */ +/* We map at -4M (-2M when PAE is activated) for ease of mapping + * into the guest (one PTE page). */ +#ifdef CONFIG_X86_PAE +#define SWITCHER_ADDR 0xFFE00000 +#else #define SWITCHER_ADDR 0xFFC00000 +#endif /* Found in switcher.S */ extern unsigned long default_idt_entries[]; diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index b14b3552a4d..d31c4a68407 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -12,6 +12,7 @@ #define LHCALL_TS 8 #define LHCALL_SET_CLOCKEVENT 9 #define LHCALL_HALT 10 +#define LHCALL_SET_PMD 13 #define LHCALL_SET_PTE 14 #define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 @@ -33,7 +34,7 @@ * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism. Eighteen hypercalls are + * We use the KVM hypercall mechanism. Seventeen hypercalls are * available: the hypercall number is put in the %eax register, and the * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. * If a return value makes sense, it's returned in %eax. diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 8dab8f7844d..38718041efc 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -2,7 +2,6 @@ config LGUEST_GUEST bool "Lguest guest support" select PARAVIRT depends on X86_32 - depends on !X86_PAE select VIRTIO select VIRTIO_RING select VIRTIO_CONSOLE diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index d12f554e5f6..7bc65f0f62c 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -167,6 +167,7 @@ static void lazy_hcall3(unsigned long call, async_hcall(call, arg1, arg2, arg3, 0); } +#ifdef CONFIG_X86_PAE static void lazy_hcall4(unsigned long call, unsigned long arg1, unsigned long arg2, @@ -178,6 +179,7 @@ static void lazy_hcall4(unsigned long call, else async_hcall(call, arg1, arg2, arg3, arg4); } +#endif /* When lazy mode is turned off reset the per-cpu lazy mode variable and then * issue the do-nothing hypercall to flush any stored calls. */ @@ -380,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, case 1: /* Basic feature request. */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ *cx &= 0x00002201; - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ - *dx &= 0x07808111; + /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ + *dx &= 0x07808151; /* The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls @@ -400,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, if (*ax > 0x80000008) *ax = 0x80000008; break; + case 0x80000001: + /* Here we should fix nx cap depending on host. */ + /* For this version of PAE, we just clear NX bit. */ + *dx &= ~(1 << 20); + break; } } @@ -533,7 +540,12 @@ static void lguest_write_cr4(unsigned long val) static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { +#ifdef CONFIG_X86_PAE + lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, + ptep->pte_low, ptep->pte_high); +#else lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); +#endif } static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, @@ -543,15 +555,37 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, lguest_pte_update(mm, addr, ptep); } -/* The Guest calls this to set a top-level entry. Again, we set the entry then - * tell the Host which top-level page we changed, and the index of the entry we - * changed. */ +/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd + * to set a middle-level entry when PAE is activated. + * Again, we set the entry then tell the Host which page we changed, + * and the index of the entry we changed. */ +#ifdef CONFIG_X86_PAE +static void lguest_set_pud(pud_t *pudp, pud_t pudval) +{ + native_set_pud(pudp, pudval); + + /* 32 bytes aligned pdpt address and the index. */ + lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, + (__pa(pudp) & 0x1F) / sizeof(pud_t)); +} + +static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + native_set_pmd(pmdp, pmdval); + lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, + (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); +} +#else + +/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not + * activated. */ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { native_set_pmd(pmdp, pmdval); lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); } +#endif /* There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more. This is useless for us, since we don't @@ -569,6 +603,26 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) lazy_hcall1(LHCALL_FLUSH_TLB, 1); } +#ifdef CONFIG_X86_PAE +static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + native_set_pte_atomic(ptep, pte); + if (cr3_changed) + lazy_hcall1(LHCALL_FLUSH_TLB, 1); +} + +void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + native_pte_clear(mm, addr, ptep); + lguest_pte_update(mm, addr, ptep); +} + +void lguest_pmd_clear(pmd_t *pmdp) +{ + lguest_set_pmd(pmdp, __pmd(0)); +} +#endif + /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations. On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do @@ -1035,6 +1089,7 @@ __init void lguest_init(void) pv_info.name = "lguest"; pv_info.paravirt_enabled = 1; pv_info.kernel_rpl = 1; + pv_info.shared_kernel_pmd = 1; /* We set up all the lguest overrides for sensitive operations. These * are detailed with the operations themselves. */ @@ -1080,6 +1135,12 @@ __init void lguest_init(void) pv_mmu_ops.set_pte = lguest_set_pte; pv_mmu_ops.set_pte_at = lguest_set_pte_at; pv_mmu_ops.set_pmd = lguest_set_pmd; +#ifdef CONFIG_X86_PAE + pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; + pv_mmu_ops.pte_clear = lguest_pte_clear; + pv_mmu_ops.pmd_clear = lguest_pmd_clear; + pv_mmu_ops.set_pud = lguest_set_pud; +#endif pv_mmu_ops.read_cr2 = lguest_read_cr2; pv_mmu_ops.read_cr3 = lguest_read_cr3; pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index a3d3cbab359..8f63845db83 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig @@ -1,6 +1,6 @@ config LGUEST tristate "Linux hypervisor example code" - depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX + depends on X86_32 && EXPERIMENTAL && FUTEX select HVC_DRIVER ---help--- This is a very simple module which allows you to run diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 51149ca1461..c29ffa19cb7 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -77,11 +77,21 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); break; case LHCALL_SET_PTE: +#ifdef CONFIG_X86_PAE + guest_set_pte(cpu, args->arg1, args->arg2, + __pte(args->arg3 | (u64)args->arg4 << 32)); +#else guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); +#endif break; case LHCALL_SET_PGD: guest_set_pgd(cpu->lg, args->arg1, args->arg2); break; +#ifdef CONFIG_X86_PAE + case LHCALL_SET_PMD: + guest_set_pmd(cpu->lg, args->arg1, args->arg2); + break; +#endif case LHCALL_SET_CLOCKEVENT: guest_set_clockevent(cpu, args->arg1); break; diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index cacc2da2058..6201ce59e88 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -137,6 +137,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); * in the kernel. */ #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) +#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) +#define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); @@ -170,6 +172,9 @@ int init_guest_pagetable(struct lguest *lg); void free_guest_pagetable(struct lguest *lg); void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); +#ifdef CONFIG_X86_PAE +void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); +#endif void guest_pagetable_clear_all(struct lg_cpu *cpu); void guest_pagetable_flush_user(struct lg_cpu *cpu); void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 6a54d76b623..5e2c26adcf0 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -53,6 +53,17 @@ * page. */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) +/* For PAE we need the PMD index as well. We use the last 2MB, so we + * will need the last pmd entry of the last pmd page. */ +#ifdef CONFIG_X86_PAE +#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) +#define RESERVE_MEM 2U +#define CHECK_GPGD_MASK _PAGE_PRESENT +#else +#define RESERVE_MEM 4U +#define CHECK_GPGD_MASK _PAGE_TABLE +#endif + /* We actually need a separate PTE page for each CPU. Remember that after the * Switcher code itself comes two pages for each CPU, and we don't want this * CPU's guest to see the pages of any other CPU. */ @@ -73,23 +84,58 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); +#ifndef CONFIG_X86_PAE /* We kill any Guest trying to touch the Switcher addresses. */ if (index >= SWITCHER_PGD_INDEX) { kill_guest(cpu, "attempt to access switcher pages"); index = 0; } +#endif /* Return a pointer index'th pgd entry for the i'th page table. */ return &cpu->lg->pgdirs[i].pgdir[index]; } +#ifdef CONFIG_X86_PAE +/* This routine then takes the PGD entry given above, which contains the + * address of the PMD page. It then returns a pointer to the PMD entry for the + * given address. */ +static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) +{ + unsigned int index = pmd_index(vaddr); + pmd_t *page; + + /* We kill any Guest trying to touch the Switcher addresses. */ + if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && + index >= SWITCHER_PMD_INDEX) { + kill_guest(cpu, "attempt to access switcher pages"); + index = 0; + } + + /* You should never call this if the PGD entry wasn't valid */ + BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); + page = __va(pgd_pfn(spgd) << PAGE_SHIFT); + + return &page[index]; +} +#endif + /* This routine then takes the page directory entry returned above, which * contains the address of the page table entry (PTE) page. It then returns a * pointer to the PTE entry for the given address. */ -static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) +static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { +#ifdef CONFIG_X86_PAE + pmd_t *pmd = spmd_addr(cpu, spgd, vaddr); + pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT); + + /* You should never call this if the PMD entry wasn't valid */ + BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)); +#else pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); /* You should never call this if the PGD entry wasn't valid */ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); +#endif + return &page[pte_index(vaddr)]; } @@ -101,10 +147,31 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); } -static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) +#ifdef CONFIG_X86_PAE +static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); + return gpage + pmd_index(vaddr) * sizeof(pmd_t); +} +#endif + +static unsigned long gpte_addr(struct lg_cpu *cpu, + pgd_t gpgd, unsigned long vaddr) +{ +#ifdef CONFIG_X86_PAE + pmd_t gpmd; +#endif + unsigned long gpage; + + BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); +#ifdef CONFIG_X86_PAE + gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); + gpage = pmd_pfn(gpmd) << PAGE_SHIFT; + BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); +#else + gpage = pgd_pfn(gpgd) << PAGE_SHIFT; +#endif return gpage + pte_index(vaddr) * sizeof(pte_t); } /*:*/ @@ -184,11 +251,20 @@ static void check_gpte(struct lg_cpu *cpu, pte_t gpte) static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) { - if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || + if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) kill_guest(cpu, "bad page directory entry"); } +#ifdef CONFIG_X86_PAE +static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) +{ + if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || + (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) + kill_guest(cpu, "bad page middle directory entry"); +} +#endif + /*H:330 * (i) Looking up a page table entry when the Guest faults. * @@ -207,6 +283,11 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pte_t gpte; pte_t *spte; +#ifdef CONFIG_X86_PAE + pmd_t *spmd; + pmd_t gpmd; +#endif + /* First step: get the top-level Guest page table entry. */ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ @@ -228,12 +309,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) check_gpgd(cpu, gpgd); /* And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. */ - *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); + set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); } +#ifdef CONFIG_X86_PAE + gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); + /* middle level not present? We can't map it in. */ + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + return false; + + /* Now look at the matching shadow entry. */ + spmd = spmd_addr(cpu, *spgd, vaddr); + + if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { + /* No shadow entry: allocate a new shadow PTE page. */ + unsigned long ptepage = get_zeroed_page(GFP_KERNEL); + + /* This is not really the Guest's fault, but killing it is + * simple for this corner case. */ + if (!ptepage) { + kill_guest(cpu, "out of memory allocating pte page"); + return false; + } + + /* We check that the Guest pmd is OK. */ + check_gpmd(cpu, gpmd); + + /* And we copy the flags to the shadow PMD entry. The page + * number in the shadow PMD is the page we just allocated. */ + native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); + } +#endif /* OK, now we look at the lower level in the Guest page table: keep its * address, because we might update it later. */ - gpte_ptr = gpte_addr(gpgd, vaddr); + gpte_ptr = gpte_addr(cpu, gpgd, vaddr); gpte = lgread(cpu, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. */ @@ -259,7 +368,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = spte_addr(*spgd, vaddr); + spte = spte_addr(cpu, *spgd, vaddr); /* If there was a valid shadow PTE entry here before, we release it. * This can happen with a write to a previously read-only entry. */ release_pte(*spte); @@ -301,14 +410,23 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) pgd_t *spgd; unsigned long flags; +#ifdef CONFIG_X86_PAE + pmd_t *spmd; +#endif /* Look at the current top level entry: is it present? */ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) return false; +#ifdef CONFIG_X86_PAE + spmd = spmd_addr(cpu, *spgd, vaddr); + if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) + return false; +#endif + /* Check the flags on the pte entry itself: it must be present and * writable. */ - flags = pte_flags(*(spte_addr(*spgd, vaddr))); + flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } @@ -322,6 +440,41 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) kill_guest(cpu, "bad stack page %#lx", vaddr); } +#ifdef CONFIG_X86_PAE +static void release_pmd(pmd_t *spmd) +{ + /* If the entry's not present, there's nothing to release. */ + if (pmd_flags(*spmd) & _PAGE_PRESENT) { + unsigned int i; + pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT); + /* For each entry in the page, we might need to release it. */ + for (i = 0; i < PTRS_PER_PTE; i++) + release_pte(ptepage[i]); + /* Now we can free the page of PTEs */ + free_page((long)ptepage); + /* And zero out the PMD entry so we never release it twice. */ + native_set_pmd(spmd, __pmd(0)); + } +} + +static void release_pgd(pgd_t *spgd) +{ + /* If the entry's not present, there's nothing to release. */ + if (pgd_flags(*spgd) & _PAGE_PRESENT) { + unsigned int i; + pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); + + for (i = 0; i < PTRS_PER_PMD; i++) + release_pmd(&pmdpage[i]); + + /* Now we can free the page of PMDs */ + free_page((long)pmdpage); + /* And zero out the PGD entry so we never release it twice. */ + set_pgd(spgd, __pgd(0)); + } +} + +#else /* !CONFIG_X86_PAE */ /*H:450 If we chase down the release_pgd() code, it looks like this: */ static void release_pgd(pgd_t *spgd) { @@ -341,7 +494,7 @@ static void release_pgd(pgd_t *spgd) *spgd = __pgd(0); } } - +#endif /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. * It simply releases every PTE page from 0 up to the Guest's kernel address. */ @@ -370,6 +523,9 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) pgd_t gpgd; pte_t gpte; +#ifdef CONFIG_X86_PAE + pmd_t gpmd; +#endif /* First step: get the top-level Guest page table entry. */ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ @@ -378,7 +534,13 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) return -1UL; } - gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t); + gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); +#ifdef CONFIG_X86_PAE + gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + kill_guest(cpu, "Bad address %#lx", vaddr); +#endif + gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); if (!(pte_flags(gpte) & _PAGE_PRESENT)) kill_guest(cpu, "Bad address %#lx", vaddr); @@ -405,6 +567,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, int *blank_pgdir) { unsigned int next; +#ifdef CONFIG_X86_PAE + pmd_t *pmd_table; +#endif /* We pick one entry at random to throw out. Choosing the Least * Recently Used might be better, but this is easy. */ @@ -416,10 +581,27 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /* If the allocation fails, just keep using the one we have */ if (!cpu->lg->pgdirs[next].pgdir) next = cpu->cpu_pgd; - else - /* This is a blank page, so there are no kernel - * mappings: caller must map the stack! */ + else { +#ifdef CONFIG_X86_PAE + /* In PAE mode, allocate a pmd page and populate the + * last pgd entry. */ + pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); + if (!pmd_table) { + free_page((long)cpu->lg->pgdirs[next].pgdir); + set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); + next = cpu->cpu_pgd; + } else { + set_pgd(cpu->lg->pgdirs[next].pgdir + + SWITCHER_PGD_INDEX, + __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + /* This is a blank page, so there are no kernel + * mappings: caller must map the stack! */ + *blank_pgdir = 1; + } +#else *blank_pgdir = 1; +#endif + } } /* Record which Guest toplevel this shadows. */ cpu->lg->pgdirs[next].gpgdir = gpgdir; @@ -460,10 +642,25 @@ static void release_all_pagetables(struct lguest *lg) /* Every shadow pagetable this Guest has */ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].pgdir) + if (lg->pgdirs[i].pgdir) { +#ifdef CONFIG_X86_PAE + pgd_t *spgd; + pmd_t *pmdpage; + unsigned int k; + + /* Get the last pmd page. */ + spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; + pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); + + /* And release the pmd entries of that pmd page, + * except for the switcher pmd. */ + for (k = 0; k < SWITCHER_PMD_INDEX; k++) + release_pmd(&pmdpage[k]); +#endif /* Every PGD entry except the Switcher at the top */ for (j = 0; j < SWITCHER_PGD_INDEX; j++) release_pgd(lg->pgdirs[i].pgdir + j); + } } /* We also throw away everything when a Guest tells us it's changed a kernel @@ -504,24 +701,37 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, { /* Look up the matching shadow page directory entry. */ pgd_t *spgd = spgd_addr(cpu, idx, vaddr); +#ifdef CONFIG_X86_PAE + pmd_t *spmd; +#endif /* If the top level isn't present, there's no entry to update. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { - /* Otherwise, we start by releasing the existing entry. */ - pte_t *spte = spte_addr(*spgd, vaddr); - release_pte(*spte); - - /* If they're setting this entry as dirty or accessed, we might - * as well put that entry they've given us in now. This shaves - * 10% off a copy-on-write micro-benchmark. */ - if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { - check_gpte(cpu, gpte); - *spte = gpte_to_spte(cpu, gpte, - pte_flags(gpte) & _PAGE_DIRTY); - } else - /* Otherwise kill it and we can demand_page() it in - * later. */ - *spte = __pte(0); +#ifdef CONFIG_X86_PAE + spmd = spmd_addr(cpu, *spgd, vaddr); + if (pmd_flags(*spmd) & _PAGE_PRESENT) { +#endif + /* Otherwise, we start by releasing + * the existing entry. */ + pte_t *spte = spte_addr(cpu, *spgd, vaddr); + release_pte(*spte); + + /* If they're setting this entry as dirty or accessed, + * we might as well put that entry they've given us + * in now. This shaves 10% off a + * copy-on-write micro-benchmark. */ + if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { + check_gpte(cpu, gpte); + native_set_pte(spte, + gpte_to_spte(cpu, gpte, + pte_flags(gpte) & _PAGE_DIRTY)); + } else + /* Otherwise kill it and we can demand_page() + * it in later. */ + native_set_pte(spte, __pte(0)); +#ifdef CONFIG_X86_PAE + } +#endif } } @@ -572,8 +782,6 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) { int pgdir; - /* The kernel seems to try to initialize this early on: we ignore its - * attempts to map over the Switcher. */ if (idx >= SWITCHER_PGD_INDEX) return; @@ -583,6 +791,12 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) /* ... throw it away. */ release_pgd(lg->pgdirs[pgdir].pgdir + idx); } +#ifdef CONFIG_X86_PAE +void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) +{ + guest_pagetable_clear_all(&lg->cpus[0]); +} +#endif /* Once we know how much memory we have we can construct simple identity * (which set virtual == physical) and linear mappings @@ -596,8 +810,16 @@ static unsigned long setup_pagetables(struct lguest *lg, { pgd_t __user *pgdir; pte_t __user *linear; - unsigned int mapped_pages, i, linear_pages, phys_linear; unsigned long mem_base = (unsigned long)lg->mem_base; + unsigned int mapped_pages, i, linear_pages; +#ifdef CONFIG_X86_PAE + pmd_t __user *pmds; + unsigned int j; + pgd_t pgd; + pmd_t pmd; +#else + unsigned int phys_linear; +#endif /* We have mapped_pages frames to map, so we need * linear_pages page tables to map them. */ @@ -610,6 +832,9 @@ static unsigned long setup_pagetables(struct lguest *lg, /* Now we use the next linear_pages pages as pte pages */ linear = (void *)pgdir - linear_pages * PAGE_SIZE; +#ifdef CONFIG_X86_PAE + pmds = (void *)linear - PAGE_SIZE; +#endif /* Linear mapping is easy: put every page's address into the * mapping in order. */ for (i = 0; i < mapped_pages; i++) { @@ -621,6 +846,22 @@ static unsigned long setup_pagetables(struct lguest *lg, /* The top level points to the linear page table pages above. * We setup the identity and linear mappings here. */ +#ifdef CONFIG_X86_PAE + for (i = 0, j; i < mapped_pages && j < PTRS_PER_PMD; + i += PTRS_PER_PTE, j++) { + native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) + - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); + + if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) + return -EFAULT; + } + + set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); + if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) + return -EFAULT; + if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) + return -EFAULT; +#else phys_linear = (unsigned long)linear - mem_base; for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { pgd_t pgd; @@ -633,6 +874,7 @@ static unsigned long setup_pagetables(struct lguest *lg, &pgd, sizeof(pgd))) return -EFAULT; } +#endif /* We return the top level (guest-physical) address: remember where * this is. */ @@ -648,7 +890,10 @@ int init_guest_pagetable(struct lguest *lg) u64 mem; u32 initrd_size; struct boot_params __user *boot = (struct boot_params *)lg->mem_base; - +#ifdef CONFIG_X86_PAE + pgd_t *pgd; + pmd_t *pmd_table; +#endif /* Get the Guest memory size and the ramdisk size from the boot header * located at lg->mem_base (Guest address 0). */ if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) @@ -663,6 +908,15 @@ int init_guest_pagetable(struct lguest *lg) lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); if (!lg->pgdirs[0].pgdir) return -ENOMEM; +#ifdef CONFIG_X86_PAE + pgd = lg->pgdirs[0].pgdir; + pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); + if (!pmd_table) + return -ENOMEM; + + set_pgd(pgd + SWITCHER_PGD_INDEX, + __pgd(__pa(pmd_table) | _PAGE_PRESENT)); +#endif lg->cpus[0].cpu_pgd = 0; return 0; } @@ -672,17 +926,24 @@ void page_table_guest_data_init(struct lg_cpu *cpu) { /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, - &cpu->lg->lguest_data->kernel_address) - /* We tell the Guest that it can't use the top 4MB of virtual - * addresses used by the Switcher. */ - || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem) - || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) + &cpu->lg->lguest_data->kernel_address) + /* We tell the Guest that it can't use the top 2 or 4 MB + * of virtual addresses used by the Switcher. */ + || put_user(RESERVE_MEM * 1024 * 1024, + &cpu->lg->lguest_data->reserve_mem) + || put_user(cpu->lg->pgdirs[0].gpgdir, + &cpu->lg->lguest_data->pgdir)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); /* In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the * Switcher mappings, so check that now. */ +#ifdef CONFIG_X86_PAE + if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && + pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) +#else if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) +#endif kill_guest(cpu, "bad kernel address %#lx", cpu->lg->kernel_address); } @@ -708,16 +969,30 @@ void free_guest_pagetable(struct lguest *lg) void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); - pgd_t switcher_pgd; pte_t regs_pte; unsigned long pfn; +#ifdef CONFIG_X86_PAE + pmd_t switcher_pmd; + pmd_t *pmd_table; + + native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> + PAGE_SHIFT, PAGE_KERNEL_EXEC)); + + pmd_table = __va(pgd_pfn(cpu->lg-> + pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) + << PAGE_SHIFT); + native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); +#else + pgd_t switcher_pgd; + /* Make the last PGD entry for this Guest point to the Switcher's PTE * page for this CPU (with appropriate flags). */ switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; +#endif /* We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher * page for this CPU is. This is an optimization: when the Switcher -- cgit v1.2.3-70-g09d2 From 92b4d8df8436cdd74d22a2a5b6b23b9abc737a3e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:08 -0600 Subject: lguest: PAE fixes 1) j wasn't initialized in setup_pagetables, so they weren't set up for me causing immediate guest crashes. 2) gpte_addr should not re-read the pmd from the Guest. Especially not BUG_ON() based on the value. If we ever supported SMP guests, they could trigger that. And the Launcher could also trigger it (tho currently root-only). Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 5e2c26adcf0..a6fe1abda24 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -154,26 +154,25 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); return gpage + pmd_index(vaddr) * sizeof(pmd_t); } -#endif static unsigned long gpte_addr(struct lg_cpu *cpu, - pgd_t gpgd, unsigned long vaddr) + pmd_t gpmd, unsigned long vaddr) { -#ifdef CONFIG_X86_PAE - pmd_t gpmd; -#endif - unsigned long gpage; + unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT; - BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); -#ifdef CONFIG_X86_PAE - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - gpage = pmd_pfn(gpmd) << PAGE_SHIFT; BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); + return gpage + pte_index(vaddr) * sizeof(pte_t); +} #else - gpage = pgd_pfn(gpgd) << PAGE_SHIFT; -#endif +static unsigned long gpte_addr(struct lg_cpu *cpu, + pgd_t gpgd, unsigned long vaddr) +{ + unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; + + BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); return gpage + pte_index(vaddr) * sizeof(pte_t); } +#endif /*:*/ /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as @@ -339,10 +338,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * number in the shadow PMD is the page we just allocated. */ native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); } -#endif + + /* OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. */ + gpte_ptr = gpte_addr(cpu, gpmd, vaddr); +#else /* OK, now we look at the lower level in the Guest page table: keep its * address, because we might update it later. */ gpte_ptr = gpte_addr(cpu, gpgd, vaddr); +#endif gpte = lgread(cpu, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. */ @@ -522,7 +526,6 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) { pgd_t gpgd; pte_t gpte; - #ifdef CONFIG_X86_PAE pmd_t gpmd; #endif @@ -534,13 +537,14 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) return -1UL; } - gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); #ifdef CONFIG_X86_PAE gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) kill_guest(cpu, "Bad address %#lx", vaddr); -#endif + gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); +#else gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); +#endif if (!(pte_flags(gpte) & _PAGE_PRESENT)) kill_guest(cpu, "Bad address %#lx", vaddr); @@ -847,7 +851,7 @@ static unsigned long setup_pagetables(struct lguest *lg, /* The top level points to the linear page table pages above. * We setup the identity and linear mappings here. */ #ifdef CONFIG_X86_PAE - for (i = 0, j; i < mapped_pages && j < PTRS_PER_PMD; + for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; i += PTRS_PER_PTE, j++) { native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); -- cgit v1.2.3-70-g09d2 From 9f155a9b3d5a5444bcc5e049ec2547bb5107150e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:08 -0600 Subject: lguest: allow any process to send interrupts We currently only allow the Launcher process to send interrupts, but it as we already send interrupts from the hrtimer, it's a simple matter of extracting that code into a common set_interrupt routine. As we switch to a thread per virtqueue, this avoids a bottleneck through the main Launcher process. Signed-off-by: Rusty Russell --- drivers/lguest/interrupts_and_traps.c | 19 +++++++++++++++---- drivers/lguest/lg.h | 1 + drivers/lguest/lguest_user.c | 10 ++-------- 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 5a10754b479..0e9067b0d50 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -213,6 +213,20 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) if (!more) put_user(0, &cpu->lg->lguest_data->irq_pending); } + +/* And this is the routine when we want to set an interrupt for the Guest. */ +void set_interrupt(struct lg_cpu *cpu, unsigned int irq) +{ + /* Next time the Guest runs, the core code will see if it can deliver + * this interrupt. */ + set_bit(irq, cpu->irqs_pending); + + /* Make sure it sees it; it might be asleep (eg. halted), or + * running the Guest right now, in which case kick_process() + * will knock it out. */ + if (!wake_up_process(cpu->tsk)) + kick_process(cpu->tsk); +} /*:*/ /* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent @@ -528,10 +542,7 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); /* Remember the first interrupt is the timer interrupt. */ - set_bit(0, cpu->irqs_pending); - /* Guest may be stopped or running on another CPU. */ - if (!wake_up_process(cpu->tsk)) - kick_process(cpu->tsk); + set_interrupt(cpu, 0); return HRTIMER_NORESTART; } diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 6201ce59e88..040cb70780e 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -143,6 +143,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); /* interrupts_and_traps.c: */ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more); +void set_interrupt(struct lg_cpu *cpu, unsigned int irq); bool deliver_trap(struct lg_cpu *cpu, unsigned int num); void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, u32 low, u32 hi); diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index bcdcf3453e7..1982b45bd93 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -45,9 +45,8 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) return -EFAULT; if (irq >= LGUEST_IRQS) return -EINVAL; - /* Next time the Guest runs, the core code will see if it can deliver - * this interrupt. */ - set_bit(irq, cpu->irqs_pending); + + set_interrupt(cpu, irq); return 0; } @@ -252,11 +251,6 @@ static ssize_t write(struct file *file, const char __user *in, /* Once the Guest is dead, you can only read() why it died. */ if (lg->dead) return -ENOENT; - - /* If you're not the task which owns the Guest, all you can do - * is break the Launcher out of running the Guest. */ - if (current != cpu->tsk && req != LHREQ_BREAK) - return -EPERM; } switch (req) { -- cgit v1.2.3-70-g09d2 From df60aeef4f4fe0645d9a195a7689005520422de5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:09 -0600 Subject: lguest: use eventfds for device notification Currently, when a Guest wants to perform I/O it calls LHCALL_NOTIFY with an address: the main Launcher process returns with this address, and figures out what device to run. A far nicer model is to let processes bind an eventfd to an address: if we find one, we simply signal the eventfd. Signed-off-by: Rusty Russell Cc: Davide Libenzi --- drivers/lguest/Kconfig | 2 +- drivers/lguest/core.c | 8 ++-- drivers/lguest/lg.h | 13 ++++++ drivers/lguest/lguest_user.c | 98 ++++++++++++++++++++++++++++++++++++++++- include/linux/lguest_launcher.h | 1 + 5 files changed, 116 insertions(+), 6 deletions(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index 8f63845db83..0aaa0597a62 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig @@ -1,6 +1,6 @@ config LGUEST tristate "Linux hypervisor example code" - depends on X86_32 && EXPERIMENTAL && FUTEX + depends on X86_32 && EXPERIMENTAL && EVENTFD select HVC_DRIVER ---help--- This is a very simple module which allows you to run diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index d0298dc45d9..508569c9571 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -198,9 +198,11 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* It's possible the Guest did a NOTIFY hypercall to the * Launcher, in which case we return from the read() now. */ if (cpu->pending_notify) { - if (put_user(cpu->pending_notify, user)) - return -EFAULT; - return sizeof(cpu->pending_notify); + if (!send_notify_to_eventfd(cpu)) { + if (put_user(cpu->pending_notify, user)) + return -EFAULT; + return sizeof(cpu->pending_notify); + } } /* Check for signals */ diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 040cb70780e..32fefdc6ad3 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -82,6 +82,16 @@ struct lg_cpu { struct lg_cpu_arch arch; }; +struct lg_eventfd { + unsigned long addr; + struct file *event; +}; + +struct lg_eventfd_map { + unsigned int num; + struct lg_eventfd map[]; +}; + /* The private info the thread maintains about the guest. */ struct lguest { @@ -102,6 +112,8 @@ struct lguest unsigned int stack_pages; u32 tsc_khz; + struct lg_eventfd_map *eventfds; + /* Dead? */ const char *dead; }; @@ -154,6 +166,7 @@ void setup_default_idt_entries(struct lguest_ro_state *state, void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, const unsigned long *def); void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); +bool send_notify_to_eventfd(struct lg_cpu *cpu); void init_clockdev(struct lg_cpu *cpu); bool check_syscall_vector(struct lguest *lg); int init_interrupts(void); diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 1982b45bd93..f6bf255f183 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include "lg.h" /*L:055 When something happens, the Waker process needs a way to stop the @@ -35,6 +37,81 @@ static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) } } +bool send_notify_to_eventfd(struct lg_cpu *cpu) +{ + unsigned int i; + struct lg_eventfd_map *map; + + /* lg->eventfds is RCU-protected */ + rcu_read_lock(); + map = rcu_dereference(cpu->lg->eventfds); + for (i = 0; i < map->num; i++) { + if (map->map[i].addr == cpu->pending_notify) { + eventfd_signal(map->map[i].event, 1); + cpu->pending_notify = 0; + break; + } + } + rcu_read_unlock(); + return cpu->pending_notify == 0; +} + +static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) +{ + struct lg_eventfd_map *new, *old = lg->eventfds; + + if (!addr) + return -EINVAL; + + /* Replace the old array with the new one, carefully: others can + * be accessing it at the same time */ + new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), + GFP_KERNEL); + if (!new) + return -ENOMEM; + + /* First make identical copy. */ + memcpy(new->map, old->map, sizeof(old->map[0]) * old->num); + new->num = old->num; + + /* Now append new entry. */ + new->map[new->num].addr = addr; + new->map[new->num].event = eventfd_fget(fd); + if (IS_ERR(new->map[new->num].event)) { + kfree(new); + return PTR_ERR(new->map[new->num].event); + } + new->num++; + + /* Now put new one in place. */ + rcu_assign_pointer(lg->eventfds, new); + + /* We're not in a big hurry. Wait until noone's looking at old + * version, then delete it. */ + synchronize_rcu(); + kfree(old); + + return 0; +} + +static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) +{ + unsigned long addr, fd; + int err; + + if (get_user(addr, input) != 0) + return -EFAULT; + input++; + if (get_user(fd, input) != 0) + return -EFAULT; + + mutex_lock(&lguest_lock); + err = add_eventfd(lg, addr, fd); + mutex_unlock(&lguest_lock); + + return 0; +} + /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt * number to /dev/lguest. */ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) @@ -184,6 +261,13 @@ static int initialize(struct file *file, const unsigned long __user *input) goto unlock; } + lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL); + if (!lg->eventfds) { + err = -ENOMEM; + goto free_lg; + } + lg->eventfds->num = 0; + /* Populate the easy fields of our "struct lguest" */ lg->mem_base = (void __user *)args[0]; lg->pfn_limit = args[1]; @@ -191,7 +275,7 @@ static int initialize(struct file *file, const unsigned long __user *input) /* This is the first cpu (cpu 0) and it will start booting at args[2] */ err = lg_cpu_start(&lg->cpus[0], 0, args[2]); if (err) - goto release_guest; + goto free_eventfds; /* Initialize the Guest's shadow page tables, using the toplevel * address the Launcher gave us. This allocates memory, so can fail. */ @@ -210,7 +294,9 @@ static int initialize(struct file *file, const unsigned long __user *input) free_regs: /* FIXME: This should be in free_vcpu */ free_page(lg->cpus[0].regs_page); -release_guest: +free_eventfds: + kfree(lg->eventfds); +free_lg: kfree(lg); unlock: mutex_unlock(&lguest_lock); @@ -260,6 +346,8 @@ static ssize_t write(struct file *file, const char __user *in, return user_send_irq(cpu, input); case LHREQ_BREAK: return break_guest_out(cpu, input); + case LHREQ_EVENTFD: + return attach_eventfd(lg, input); default: return -EINVAL; } @@ -297,6 +385,12 @@ static int close(struct inode *inode, struct file *file) * the Launcher's memory management structure. */ mmput(lg->cpus[i].mm); } + + /* Release any eventfds they registered. */ + for (i = 0; i < lg->eventfds->num; i++) + fput(lg->eventfds->map[i].event); + kfree(lg->eventfds); + /* If lg->dead doesn't contain an error code it will be NULL or a * kmalloc()ed string, either of which is ok to hand to kfree(). */ if (!IS_ERR(lg->dead)) diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index a53407a4165..9de964b9058 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -58,6 +58,7 @@ enum lguest_req LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ + LHREQ_EVENTFD, /* + address, fd. */ }; /* The alignment to use between consumer and producer parts of vring. -- cgit v1.2.3-70-g09d2 From 5dac051bc6030963181b69faddd9e0ad04f85fa8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 12 Jun 2009 22:27:10 -0600 Subject: lguest: remove obsolete LHREQ_BREAK call We no longer need an efficient mechanism to force the Guest back into host userspace, as each device is serviced without bothering the main Guest process (aka. the Launcher). Signed-off-by: Rusty Russell --- drivers/lguest/core.c | 11 +++-------- drivers/lguest/lg.h | 4 +--- drivers/lguest/lguest_user.c | 31 ------------------------------- include/linux/lguest_launcher.h | 2 +- 4 files changed, 5 insertions(+), 43 deletions(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 508569c9571..a6974e9b8eb 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -209,10 +209,6 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (signal_pending(current)) return -ERESTARTSYS; - /* If Waker set break_out, return to Launcher. */ - if (cpu->break_out) - return -EAGAIN; - /* Check if there are any interrupts which can be delivered now: * if so, this sets up the hander to be executed when we next * run the Guest. */ @@ -231,13 +227,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) break; /* If the Guest asked to be stopped, we sleep. The Guest's - * clock timer or LHREQ_BREAK from the Waker will wake us. */ + * clock timer will wake us. */ if (cpu->halted) { set_current_state(TASK_INTERRUPTIBLE); - /* Just before we sleep, make sure nothing snuck in + /* Just before we sleep, make sure no interrupt snuck in * which we should be doing. */ - if (interrupt_pending(cpu, &more) < LGUEST_IRQS - || cpu->break_out) + if (interrupt_pending(cpu, &more) < LGUEST_IRQS) set_current_state(TASK_RUNNING); else schedule(); diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 32fefdc6ad3..d4e8979735c 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -71,9 +71,7 @@ struct lg_cpu { /* Virtual clock device */ struct hrtimer hrt; - /* Do we need to stop what we're doing and return to userspace? */ - int break_out; - wait_queue_head_t break_wq; + /* Did the Guest tell us to halt? */ int halted; /* Pending virtual interrupts */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index f6bf255f183..32e29712105 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -11,32 +11,6 @@ #include #include "lg.h" -/*L:055 When something happens, the Waker process needs a way to stop the - * kernel running the Guest and return to the Launcher. So the Waker writes - * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher - * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release - * the Waker. */ -static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) -{ - unsigned long on; - - /* Fetch whether they're turning break on or off. */ - if (get_user(on, input) != 0) - return -EFAULT; - - if (on) { - cpu->break_out = 1; - if (!wake_up_process(cpu->tsk)) - kick_process(cpu->tsk); - /* Wait for them to reset it */ - return wait_event_interruptible(cpu->break_wq, !cpu->break_out); - } else { - cpu->break_out = 0; - wake_up(&cpu->break_wq); - return 0; - } -} - bool send_notify_to_eventfd(struct lg_cpu *cpu) { unsigned int i; @@ -202,9 +176,6 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) * address. */ lguest_arch_setup_regs(cpu, start_ip); - /* Initialize the queue for the Waker to wait on */ - init_waitqueue_head(&cpu->break_wq); - /* We keep a pointer to the Launcher task (ie. current task) for when * other Guests want to wake this one (eg. console input). */ cpu->tsk = current; @@ -344,8 +315,6 @@ static ssize_t write(struct file *file, const char __user *in, return initialize(file, input); case LHREQ_IRQ: return user_send_irq(cpu, input); - case LHREQ_BREAK: - return break_guest_out(cpu, input); case LHREQ_EVENTFD: return attach_eventfd(lg, input); default: diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index 9de964b9058..bfefbdf7498 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -57,7 +57,7 @@ enum lguest_req LHREQ_INITIALIZE, /* + base, pfnlimit, start */ LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ - LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ + LHREQ_BREAK, /* No longer used */ LHREQ_EVENTFD, /* + address, fd. */ }; -- cgit v1.2.3-70-g09d2