From de56a948b9182fbcf92cb8212f114de096c2d574 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:21:34 +0000
Subject: KVM: PPC: Add support for Book3S processors in hypervisor mode

This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode.  Using hypervisor mode means
that the guest can use the processor's supervisor mode.  That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host.  This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.

This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses.  That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification.  In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.

Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.

This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.

With the guest running in supervisor mode, most exceptions go straight
to the guest.  We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest.  Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.

We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.

In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount.  Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.

The POWER7 processor has a restriction that all threads in a core have
to be in the same partition.  MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest.  At present we require the host and guest to run
in single-thread mode because of this hardware restriction.

This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA).  We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management.  This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.

This also adds a few new exports needed by the book3s_hv code.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/book3s_hv.c | 445 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 445 insertions(+)
 create mode 100644 arch/powerpc/kvm/book3s_hv.c

(limited to 'arch/powerpc/kvm/book3s_hv.c')

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
new file mode 100644
index 00000000000..60b7300568c
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Paul Mackerras <paulus@au1.ibm.com>
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * Description: KVM functions specific to running on Book 3S
+ * processors in hypervisor mode (specifically POWER7 and later).
+ *
+ * This file is derived from arch/powerpc/kvm/book3s.c,
+ * by Alexander Graf <agraf@suse.de>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/cpumask.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/lppaca.h>
+#include <asm/processor.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+/* #define EXIT_DEBUG */
+/* #define EXIT_DEBUG_SIMPLE */
+/* #define EXIT_DEBUG_INT */
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	local_paca->kvm_hstate.kvm_vcpu = vcpu;
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
+{
+	u64 now;
+	unsigned long dec_nsec;
+
+	now = get_tb();
+	if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
+		kvmppc_core_queue_dec(vcpu);
+	if (vcpu->arch.pending_exceptions)
+		return;
+	if (vcpu->arch.dec_expires != ~(u64)0) {
+		dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
+			tb_ticks_per_sec;
+		hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+			      HRTIMER_MODE_REL);
+	}
+
+	kvm_vcpu_block(vcpu);
+	vcpu->stat.halt_wakeup++;
+
+	if (vcpu->arch.dec_expires != ~(u64)0)
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+}
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+	vcpu->arch.shregs.msr = msr;
+}
+
+void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+{
+	vcpu->arch.pvr = pvr;
+}
+
+void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
+	pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n",
+	       vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
+	for (r = 0; r < 16; ++r)
+		pr_err("r%2d = %.16lx  r%d = %.16lx\n",
+		       r, kvmppc_get_gpr(vcpu, r),
+		       r+16, kvmppc_get_gpr(vcpu, r+16));
+	pr_err("ctr = %.16lx  lr  = %.16lx\n",
+	       vcpu->arch.ctr, vcpu->arch.lr);
+	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
+	       vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
+	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
+	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
+	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
+	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
+	pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
+	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
+	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
+	pr_err("fault dar = %.16lx dsisr = %.8x\n",
+	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
+	for (r = 0; r < vcpu->arch.slb_max; ++r)
+		pr_err("  ESID = %.16llx VSID = %.16llx\n",
+		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
+	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
+	       vcpu->arch.lpcr, vcpu->kvm->arch.sdr1,
+	       vcpu->arch.last_inst);
+}
+
+static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			      struct task_struct *tsk)
+{
+	int r = RESUME_HOST;
+
+	vcpu->stat.sum_exits++;
+
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	run->ready_for_interrupt_injection = 1;
+	switch (vcpu->arch.trap) {
+	/* We're good on these - the host merely wanted to get our attention */
+	case BOOK3S_INTERRUPT_HV_DECREMENTER:
+		vcpu->stat.dec_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_EXTERNAL:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PERFMON:
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PROGRAM:
+	{
+		ulong flags;
+		/*
+		 * Normally program interrupts are delivered directly
+		 * to the guest by the hardware, but we can get here
+		 * as a result of a hypervisor emulation interrupt
+		 * (e40) getting turned into a 700 by BML RTAS.
+		 */
+		flags = vcpu->arch.shregs.msr & 0x1f0000ull;
+		kvmppc_core_queue_program(vcpu, flags);
+		r = RESUME_GUEST;
+		break;
+	}
+	case BOOK3S_INTERRUPT_SYSCALL:
+	{
+		/* hcall - punt to userspace */
+		int i;
+
+		if (vcpu->arch.shregs.msr & MSR_PR) {
+			/* sc 1 from userspace - reflect to guest syscall */
+			kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
+			r = RESUME_GUEST;
+			break;
+		}
+		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
+		for (i = 0; i < 9; ++i)
+			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
+		run->exit_reason = KVM_EXIT_PAPR_HCALL;
+		vcpu->arch.hcall_needed = 1;
+		r = RESUME_HOST;
+		break;
+	}
+	/*
+	 * We get these next two if the guest does a bad real-mode access,
+	 * as we have enabled VRMA (virtualized real mode area) mode in the
+	 * LPCR.  We just generate an appropriate DSI/ISI to the guest.
+	 */
+	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+		vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
+		vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_H_INST_STORAGE:
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
+					0x08000000);
+		r = RESUME_GUEST;
+		break;
+	/*
+	 * This occurs if the guest executes an illegal instruction.
+	 * We just generate a program interrupt to the guest, since
+	 * we don't emulate any guest instructions at this stage.
+	 */
+	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+		kvmppc_core_queue_program(vcpu, 0x80000);
+		r = RESUME_GUEST;
+		break;
+	default:
+		kvmppc_dump_regs(vcpu);
+		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+			vcpu->arch.trap, kvmppc_get_pc(vcpu),
+			vcpu->arch.shregs.msr);
+		r = RESUME_HOST;
+		BUG();
+		break;
+	}
+
+
+	if (!(r & RESUME_HOST)) {
+		/* To avoid clobbering exit_reason, only check for signals if
+		 * we aren't already exiting to userspace for some other
+		 * reason. */
+		if (signal_pending(tsk)) {
+			vcpu->stat.signal_exits++;
+			run->exit_reason = KVM_EXIT_INTR;
+			r = -EINTR;
+		} else {
+			kvmppc_core_deliver_interrupts(vcpu);
+		}
+	}
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	int i;
+
+	sregs->pvr = vcpu->arch.pvr;
+
+	memset(sregs, 0, sizeof(struct kvm_sregs));
+	for (i = 0; i < vcpu->arch.slb_max; i++) {
+		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
+		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+	}
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	int i, j;
+
+	kvmppc_set_pvr(vcpu, sregs->pvr);
+
+	j = 0;
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
+			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
+			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
+			++j;
+		}
+	}
+	vcpu->arch.slb_max = j;
+
+	return 0;
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+	if (cpu_has_feature(CPU_FTR_HVMODE_206))
+		return 0;
+	return -EIO;
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	struct kvm_vcpu *vcpu;
+	int err = -ENOMEM;
+	unsigned long lpcr;
+
+	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+	if (!vcpu)
+		goto out;
+
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	vcpu->arch.shared = &vcpu->arch.shregs;
+	vcpu->arch.last_cpu = -1;
+	vcpu->arch.mmcr[0] = MMCR0_FC;
+	vcpu->arch.ctrl = CTRL_RUNLATCH;
+	/* default to host PVR, since we can't spoof it */
+	vcpu->arch.pvr = mfspr(SPRN_PVR);
+	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+
+	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
+	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
+	vcpu->arch.lpcr = lpcr;
+
+	kvmppc_mmu_book3s_hv_init(vcpu);
+
+	return vcpu;
+
+free_vcpu:
+	kfree(vcpu);
+out:
+	return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	kvm_vcpu_uninit(vcpu);
+	kfree(vcpu);
+}
+
+extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+
+int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	u64 now;
+
+	if (signal_pending(current)) {
+		run->exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+	flush_fp_to_thread(current);
+	flush_altivec_to_thread(current);
+	flush_vsx_to_thread(current);
+	preempt_disable();
+
+	/*
+	 * Make sure we are running on thread 0, and that
+	 * secondary threads are offline.
+	 * XXX we should also block attempts to bring any
+	 * secondary threads online.
+	 */
+	if (threads_per_core > 1) {
+		int cpu = smp_processor_id();
+		int thr = cpu_thread_in_core(cpu);
+
+		if (thr)
+			goto out;
+		while (++thr < threads_per_core)
+			if (cpu_online(cpu + thr))
+				goto out;
+	}
+
+	kvm_guest_enter();
+
+	__kvmppc_vcore_entry(NULL, vcpu);
+
+	kvm_guest_exit();
+
+	preempt_enable();
+	kvm_resched(vcpu);
+
+	now = get_tb();
+	/* cancel pending dec exception if dec is positive */
+	if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
+		kvmppc_core_dequeue_dec(vcpu);
+
+	return kvmppc_handle_exit(run, vcpu, current);
+
+ out:
+	preempt_enable();
+	return -EBUSY;
+}
+
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+		return kvmppc_prepare_vrma(kvm, mem);
+	return 0;
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+		kvmppc_map_vrma(kvm, mem);
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	long r;
+
+	/* Allocate hashed page table */
+	r = kvmppc_alloc_hpt(kvm);
+
+	return r;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+	kvmppc_free_hpt(kvm);
+}
+
+/* These are stubs for now */
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+}
+
+/* We don't need to emulate any privileged instructions or dcbz */
+int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                           unsigned int inst, int *advance)
+{
+	return EMULATE_FAIL;
+}
+
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+	return EMULATE_FAIL;
+}
+
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+	return EMULATE_FAIL;
+}
+
+static int kvmppc_book3s_hv_init(void)
+{
+	int r;
+
+	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+
+	if (r)
+		return r;
+
+	r = kvmppc_mmu_hv_init();
+
+	return r;
+}
+
+static void kvmppc_book3s_hv_exit(void)
+{
+	kvm_exit();
+}
+
+module_init(kvmppc_book3s_hv_init);
+module_exit(kvmppc_book3s_hv_exit);
-- 
cgit v1.2.3-70-g09d2


From a8606e20e41a8149456bafdf76ad29d47672027c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:22:05 +0000
Subject: KVM: PPC: Handle some PAPR hcalls in the kernel

This adds the infrastructure for handling PAPR hcalls in the kernel,
either early in the guest exit path while we are still in real mode,
or later once the MMU has been turned back on and we are in the full
kernel context.  The advantage of handling hcalls in real mode if
possible is that we avoid two partition switches -- and this will
become more important when we support SMT4 guests, since a partition
switch means we have to pull all of the threads in the core out of
the guest.  The disadvantage is that we can only access the kernel
linear mapping, not anything vmalloced or ioremapped, since the MMU
is off.

This also adds code to handle the following hcalls in real mode:

H_ENTER       Add an HPTE to the hashed page table
H_REMOVE      Remove an HPTE from the hashed page table
H_READ        Read HPTEs from the hashed page table
H_PROTECT     Change the protection bits in an HPTE
H_BULK_REMOVE Remove up to 4 HPTEs from the hashed page table
H_SET_DABR    Set the data address breakpoint register

Plus code to handle the following hcalls in the kernel:

H_CEDE        Idle the vcpu until an interrupt or H_PROD hcall arrives
H_PROD        Wake up a ceded vcpu
H_REGISTER_VPA Register a virtual processor area (VPA)

The code that runs in real mode has to be in the base kernel, not in
the module, if KVM is compiled as a module.  The real-mode code can
only access the kernel linear mapping, not vmalloc or ioremap space.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/hvcall.h       |   5 +
 arch/powerpc/include/asm/kvm_host.h     |  11 +
 arch/powerpc/include/asm/kvm_ppc.h      |   1 +
 arch/powerpc/kernel/asm-offsets.c       |   2 +
 arch/powerpc/kvm/Makefile               |   8 +-
 arch/powerpc/kvm/book3s_hv.c            | 170 ++++++++++++++-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c     | 368 ++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 158 +++++++++++++-
 arch/powerpc/kvm/powerpc.c              |   2 +-
 9 files changed, 718 insertions(+), 7 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_hv_rm_mmu.c

(limited to 'arch/powerpc/kvm/book3s_hv.c')

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index fd8201dddd4..1c324ff55ea 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -29,6 +29,10 @@
 #define H_LONG_BUSY_ORDER_100_SEC	9905  /* Long busy, hint that 100sec \
 						 is a good time to retry */
 #define H_LONG_BUSY_END_RANGE		9905  /* End of long busy range */
+
+/* Internal value used in book3s_hv kvm support; not returned to guests */
+#define H_TOO_HARD	9999
+
 #define H_HARDWARE	-1	/* Hardware error */
 #define H_FUNCTION	-2	/* Function not supported */
 #define H_PRIVILEGE	-3	/* Caller not privileged */
@@ -100,6 +104,7 @@
 #define H_PAGE_SET_ACTIVE	H_PAGE_STATE_CHANGE
 #define H_AVPN			(1UL<<(63-32))	/* An avpn is provided as a sanity test */
 #define H_ANDCOND		(1UL<<(63-33))
+#define H_LOCAL			(1UL<<(63-35))
 #define H_ICACHE_INVALIDATE	(1UL<<(63-40))	/* icbi, etc.  (ignored for IO pages) */
 #define H_ICACHE_SYNCHRONIZE	(1UL<<(63-41))	/* dcbst, icbi, etc (ignored for IO pages */
 #define H_COALESCE_CAND	(1UL<<(63-42))	/* page is a good candidate for coalescing */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 4a3f790d5fc..6ebf1721680 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -59,6 +59,10 @@ struct kvm;
 struct kvm_run;
 struct kvm_vcpu;
 
+struct lppaca;
+struct slb_shadow;
+struct dtl;
+
 struct kvm_vm_stat {
 	u32 remote_tlb_flush;
 };
@@ -344,7 +348,14 @@ struct kvm_vcpu_arch {
 	u64 dec_expires;
 	unsigned long pending_exceptions;
 	u16 last_cpu;
+	u8 ceded;
+	u8 prodded;
 	u32 last_inst;
+
+	struct lppaca *vpa;
+	struct slb_shadow *slb_shadow;
+	struct dtl *dtl;
+	struct dtl *dtl_end;
 	int trap;
 	struct kvm_vcpu_arch_shared *shared;
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0dafd53c30e..2afe92e6f62 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -118,6 +118,7 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
 extern void kvmppc_map_vrma(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem);
+extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 936267462ca..c70d106bf1a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -189,6 +189,7 @@ int main(void)
 	DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
 	DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
 	DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
+	DEFINE(LPPACA_YIELDCOUNT, offsetof(struct lppaca, yield_count));
 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
 #endif /* CONFIG_PPC_STD_MMU_64 */
 	DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp));
@@ -459,6 +460,7 @@ int main(void)
 	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
 	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
 	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
+	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 8a435a6da66..2ecffc0dc1b 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -54,14 +54,17 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv.o \
 	book3s_hv_interrupts.o \
 	book3s_64_mmu_hv.o
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
+	book3s_hv_rm_mmu.o
 
-kvm-book3s_64-objs := \
+kvm-book3s_64-module-objs := \
 	../../../virt/kvm/kvm_main.o \
 	powerpc.o \
 	emulate.o \
 	book3s.o \
 	$(kvm-book3s_64-objs-y)
-kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs)
+
+kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
 
 kvm-book3s_32-objs := \
 	$(common-objs-y) \
@@ -83,3 +86,4 @@ obj-$(CONFIG_KVM_E500) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o
 
+obj-y += $(kvm-book3s_64-builtin-objs-y)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 60b7300568c..af862c30b70 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -124,6 +124,158 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 	       vcpu->arch.last_inst);
 }
 
+struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
+{
+	int r;
+	struct kvm_vcpu *v, *ret = NULL;
+
+	mutex_lock(&kvm->lock);
+	kvm_for_each_vcpu(r, v, kvm) {
+		if (v->vcpu_id == id) {
+			ret = v;
+			break;
+		}
+	}
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
+static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
+{
+	vpa->shared_proc = 1;
+	vpa->yield_count = 1;
+}
+
+static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
+				       unsigned long flags,
+				       unsigned long vcpuid, unsigned long vpa)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long pg_index, ra, len;
+	unsigned long pg_offset;
+	void *va;
+	struct kvm_vcpu *tvcpu;
+
+	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
+	if (!tvcpu)
+		return H_PARAMETER;
+
+	flags >>= 63 - 18;
+	flags &= 7;
+	if (flags == 0 || flags == 4)
+		return H_PARAMETER;
+	if (flags < 4) {
+		if (vpa & 0x7f)
+			return H_PARAMETER;
+		/* registering new area; convert logical addr to real */
+		pg_index = vpa >> kvm->arch.ram_porder;
+		pg_offset = vpa & (kvm->arch.ram_psize - 1);
+		if (pg_index >= kvm->arch.ram_npages)
+			return H_PARAMETER;
+		if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
+			return H_PARAMETER;
+		ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
+		ra |= pg_offset;
+		va = __va(ra);
+		if (flags <= 1)
+			len = *(unsigned short *)(va + 4);
+		else
+			len = *(unsigned int *)(va + 4);
+		if (pg_offset + len > kvm->arch.ram_psize)
+			return H_PARAMETER;
+		switch (flags) {
+		case 1:		/* register VPA */
+			if (len < 640)
+				return H_PARAMETER;
+			tvcpu->arch.vpa = va;
+			init_vpa(vcpu, va);
+			break;
+		case 2:		/* register DTL */
+			if (len < 48)
+				return H_PARAMETER;
+			if (!tvcpu->arch.vpa)
+				return H_RESOURCE;
+			len -= len % 48;
+			tvcpu->arch.dtl = va;
+			tvcpu->arch.dtl_end = va + len;
+			break;
+		case 3:		/* register SLB shadow buffer */
+			if (len < 8)
+				return H_PARAMETER;
+			if (!tvcpu->arch.vpa)
+				return H_RESOURCE;
+			tvcpu->arch.slb_shadow = va;
+			len = (len - 16) / 16;
+			tvcpu->arch.slb_shadow = va;
+			break;
+		}
+	} else {
+		switch (flags) {
+		case 5:		/* unregister VPA */
+			if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
+				return H_RESOURCE;
+			tvcpu->arch.vpa = NULL;
+			break;
+		case 6:		/* unregister DTL */
+			tvcpu->arch.dtl = NULL;
+			break;
+		case 7:		/* unregister SLB shadow buffer */
+			tvcpu->arch.slb_shadow = NULL;
+			break;
+		}
+	}
+	return H_SUCCESS;
+}
+
+int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
+{
+	unsigned long req = kvmppc_get_gpr(vcpu, 3);
+	unsigned long target, ret = H_SUCCESS;
+	struct kvm_vcpu *tvcpu;
+
+	switch (req) {
+	case H_CEDE:
+		vcpu->arch.shregs.msr |= MSR_EE;
+		vcpu->arch.ceded = 1;
+		smp_mb();
+		if (!vcpu->arch.prodded)
+			kvmppc_vcpu_block(vcpu);
+		else
+			vcpu->arch.prodded = 0;
+		smp_mb();
+		vcpu->arch.ceded = 0;
+		break;
+	case H_PROD:
+		target = kvmppc_get_gpr(vcpu, 4);
+		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+		if (!tvcpu) {
+			ret = H_PARAMETER;
+			break;
+		}
+		tvcpu->arch.prodded = 1;
+		smp_mb();
+		if (vcpu->arch.ceded) {
+			if (waitqueue_active(&vcpu->wq)) {
+				wake_up_interruptible(&vcpu->wq);
+				vcpu->stat.halt_wakeup++;
+			}
+		}
+		break;
+	case H_CONFER:
+		break;
+	case H_REGISTER_VPA:
+		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
+					kvmppc_get_gpr(vcpu, 5),
+					kvmppc_get_gpr(vcpu, 6));
+		break;
+	default:
+		return RESUME_HOST;
+	}
+	kvmppc_set_gpr(vcpu, 3, ret);
+	vcpu->arch.hcall_needed = 0;
+	return RESUME_GUEST;
+}
+
 static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			      struct task_struct *tsk)
 {
@@ -318,7 +470,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 
 extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 
-int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
 	u64 now;
 
@@ -370,6 +522,22 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	return -EBUSY;
 }
 
+int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	do {
+		r = kvmppc_run_vcpu(run, vcpu);
+
+		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
+		    !(vcpu->arch.shregs.msr & MSR_PR)) {
+			r = kvmppc_pseries_do_hcall(vcpu);
+			kvmppc_core_deliver_interrupts(vcpu);
+		}
+	} while (r == RESUME_GUEST);
+	return r;
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
new file mode 100644
index 00000000000..edb0aae901a
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -0,0 +1,368 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/hugetlb.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER	24
+#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
+#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+
+#define HPTE_V_HVLOCK	0x40UL
+
+static inline long lock_hpte(unsigned long *hpte, unsigned long bits)
+{
+	unsigned long tmp, old;
+
+	asm volatile("	ldarx	%0,0,%2\n"
+		     "	and.	%1,%0,%3\n"
+		     "	bne	2f\n"
+		     "	ori	%0,%0,%4\n"
+		     "  stdcx.	%0,0,%2\n"
+		     "	beq+	2f\n"
+		     "	li	%1,%3\n"
+		     "2:	isync"
+		     : "=&r" (tmp), "=&r" (old)
+		     : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
+		     : "cc", "memory");
+	return old == 0;
+}
+
+long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+		    long pte_index, unsigned long pteh, unsigned long ptel)
+{
+	unsigned long porder;
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long i, lpn, pa;
+	unsigned long *hpte;
+
+	/* only handle 4k, 64k and 16M pages for now */
+	porder = 12;
+	if (pteh & HPTE_V_LARGE) {
+		if ((ptel & 0xf000) == 0x1000) {
+			/* 64k page */
+			porder = 16;
+		} else if ((ptel & 0xff000) == 0) {
+			/* 16M page */
+			porder = 24;
+			/* lowest AVA bit must be 0 for 16M pages */
+			if (pteh & 0x80)
+				return H_PARAMETER;
+		} else
+			return H_PARAMETER;
+	}
+	lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
+	if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder)
+		return H_PARAMETER;
+	pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
+	if (!pa)
+		return H_PARAMETER;
+	/* Check WIMG */
+	if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
+	    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+		return H_PARAMETER;
+	pteh &= ~0x60UL;
+	ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
+	ptel |= pa;
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	if (likely((flags & H_EXACT) == 0)) {
+		pte_index &= ~7UL;
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		for (i = 0; ; ++i) {
+			if (i == 8)
+				return H_PTEG_FULL;
+			if ((*hpte & HPTE_V_VALID) == 0 &&
+			    lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+				break;
+			hpte += 2;
+		}
+	} else {
+		i = 0;
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+			return H_PTEG_FULL;
+	}
+	hpte[1] = ptel;
+	eieio();
+	hpte[0] = pteh;
+	asm volatile("ptesync" : : : "memory");
+	atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
+	vcpu->arch.gpr[4] = pte_index + i;
+	return H_SUCCESS;
+}
+
+static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
+				      unsigned long pte_index)
+{
+	unsigned long rb, va_low;
+
+	rb = (v & ~0x7fUL) << 16;		/* AVA field */
+	va_low = pte_index >> 3;
+	if (v & HPTE_V_SECONDARY)
+		va_low = ~va_low;
+	/* xor vsid from AVA */
+	if (!(v & HPTE_V_1TB_SEG))
+		va_low ^= v >> 12;
+	else
+		va_low ^= v >> 24;
+	va_low &= 0x7ff;
+	if (v & HPTE_V_LARGE) {
+		rb |= 1;			/* L field */
+		if (r & 0xff000) {
+			/* non-16MB large page, must be 64k */
+			/* (masks depend on page size) */
+			rb |= 0x1000;		/* page encoding in LP field */
+			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
+			rb |= (va_low & 0xfe);	/* AVAL field (P7 doesn't seem to care) */
+		}
+	} else {
+		/* 4kB page */
+		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
+	}
+	rb |= (v >> 54) & 0x300;		/* B field */
+	return rb;
+}
+
+#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
+
+static inline int try_lock_tlbie(unsigned int *lock)
+{
+	unsigned int tmp, old;
+	unsigned int token = LOCK_TOKEN;
+
+	asm volatile("1:lwarx	%1,0,%2\n"
+		     "	cmpwi	cr0,%1,0\n"
+		     "	bne	2f\n"
+		     "  stwcx.	%3,0,%2\n"
+		     "	bne-	1b\n"
+		     "  isync\n"
+		     "2:"
+		     : "=&r" (tmp), "=&r" (old)
+		     : "r" (lock), "r" (token)
+		     : "cc", "memory");
+	return old == 0;
+}
+
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+		     unsigned long pte_index, unsigned long avpn,
+		     unsigned long va)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *hpte;
+	unsigned long v, r, rb;
+
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	if ((hpte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
+	    ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
+		hpte[0] &= ~HPTE_V_HVLOCK;
+		return H_NOT_FOUND;
+	}
+	if (atomic_read(&kvm->online_vcpus) == 1)
+		flags |= H_LOCAL;
+	vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK;
+	vcpu->arch.gpr[5] = r = hpte[1];
+	rb = compute_tlbie_rb(v, r, pte_index);
+	hpte[0] = 0;
+	if (!(flags & H_LOCAL)) {
+		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		asm volatile("ptesync" : : : "memory");
+		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+			     : : "r" (rb), "r" (kvm->arch.lpid));
+		asm volatile("ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		asm volatile("ptesync" : : : "memory");
+		asm volatile("tlbiel %0" : : "r" (rb));
+		asm volatile("ptesync" : : : "memory");
+	}
+	return H_SUCCESS;
+}
+
+long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *args = &vcpu->arch.gpr[4];
+	unsigned long *hp, tlbrb[4];
+	long int i, found;
+	long int n_inval = 0;
+	unsigned long flags, req, pte_index;
+	long int local = 0;
+	long int ret = H_SUCCESS;
+
+	if (atomic_read(&kvm->online_vcpus) == 1)
+		local = 1;
+	for (i = 0; i < 4; ++i) {
+		pte_index = args[i * 2];
+		flags = pte_index >> 56;
+		pte_index &= ((1ul << 56) - 1);
+		req = flags >> 6;
+		flags &= 3;
+		if (req == 3)
+			break;
+		if (req != 1 || flags == 3 ||
+		    pte_index >= (HPT_NPTEG << 3)) {
+			/* parameter error */
+			args[i * 2] = ((0xa0 | flags) << 56) + pte_index;
+			ret = H_PARAMETER;
+			break;
+		}
+		hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		while (!lock_hpte(hp, HPTE_V_HVLOCK))
+			cpu_relax();
+		found = 0;
+		if (hp[0] & HPTE_V_VALID) {
+			switch (flags & 3) {
+			case 0:		/* absolute */
+				found = 1;
+				break;
+			case 1:		/* andcond */
+				if (!(hp[0] & args[i * 2 + 1]))
+					found = 1;
+				break;
+			case 2:		/* AVPN */
+				if ((hp[0] & ~0x7fUL) == args[i * 2 + 1])
+					found = 1;
+				break;
+			}
+		}
+		if (!found) {
+			hp[0] &= ~HPTE_V_HVLOCK;
+			args[i * 2] = ((0x90 | flags) << 56) + pte_index;
+			continue;
+		}
+		/* insert R and C bits from PTE */
+		flags |= (hp[1] >> 5) & 0x0c;
+		args[i * 2] = ((0x80 | flags) << 56) + pte_index;
+		tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index);
+		hp[0] = 0;
+	}
+	if (n_inval == 0)
+		return ret;
+
+	if (!local) {
+		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < n_inval; ++i)
+			asm volatile(PPC_TLBIE(%1,%0)
+				     : : "r" (tlbrb[i]), "r" (kvm->arch.lpid));
+		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < n_inval; ++i)
+			asm volatile("tlbiel %0" : : "r" (tlbrb[i]));
+		asm volatile("ptesync" : : : "memory");
+	}
+	return ret;
+}
+
+long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
+		      unsigned long pte_index, unsigned long avpn,
+		      unsigned long va)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *hpte;
+	unsigned long v, r, rb;
+
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	if ((hpte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
+		hpte[0] &= ~HPTE_V_HVLOCK;
+		return H_NOT_FOUND;
+	}
+	if (atomic_read(&kvm->online_vcpus) == 1)
+		flags |= H_LOCAL;
+	v = hpte[0];
+	r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+			HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+	r |= (flags << 55) & HPTE_R_PP0;
+	r |= (flags << 48) & HPTE_R_KEY_HI;
+	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+	rb = compute_tlbie_rb(v, r, pte_index);
+	hpte[0] = v & ~HPTE_V_VALID;
+	if (!(flags & H_LOCAL)) {
+		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		asm volatile("ptesync" : : : "memory");
+		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+			     : : "r" (rb), "r" (kvm->arch.lpid));
+		asm volatile("ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		asm volatile("ptesync" : : : "memory");
+		asm volatile("tlbiel %0" : : "r" (rb));
+		asm volatile("ptesync" : : : "memory");
+	}
+	hpte[1] = r;
+	eieio();
+	hpte[0] = v & ~HPTE_V_HVLOCK;
+	asm volatile("ptesync" : : : "memory");
+	return H_SUCCESS;
+}
+
+static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
+{
+	long int i;
+	unsigned long offset, rpn;
+
+	offset = realaddr & (kvm->arch.ram_psize - 1);
+	rpn = (realaddr - offset) >> PAGE_SHIFT;
+	for (i = 0; i < kvm->arch.ram_npages; ++i)
+		if (rpn == kvm->arch.ram_pginfo[i].pfn)
+			return (i << PAGE_SHIFT) + offset;
+	return HPTE_R_RPN;	/* all 1s in the RPN field */
+}
+
+long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
+		   unsigned long pte_index)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *hpte, r;
+	int i, n = 1;
+
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	if (flags & H_READ_4) {
+		pte_index &= ~3;
+		n = 4;
+	}
+	for (i = 0; i < n; ++i, ++pte_index) {
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		r = hpte[1];
+		if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID))
+			r = reverse_xlate(kvm, r & HPTE_R_RPN) |
+				(r & ~HPTE_R_RPN);
+		vcpu->arch.gpr[4 + i * 2] = hpte[0];
+		vcpu->arch.gpr[5 + i * 2] = r;
+	}
+	return H_SUCCESS;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9af264840b9..319ff63b1f3 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -166,6 +166,14 @@ kvmppc_hv_entry:
 	/* Save R1 in the PACA */
 	std	r1, HSTATE_HOST_R1(r13)
 
+	/* Increment yield count if they have a VPA */
+	ld	r3, VCPU_VPA(r4)
+	cmpdi	r3, 0
+	beq	25f
+	lwz	r5, LPPACA_YIELDCOUNT(r3)
+	addi	r5, r5, 1
+	stw	r5, LPPACA_YIELDCOUNT(r3)
+25:
 	/* Load up DAR and DSISR */
 	ld	r5, VCPU_DAR(r4)
 	lwz	r6, VCPU_DSISR(r4)
@@ -401,6 +409,10 @@ kvmppc_interrupt:
 	cmpwi	r3,0
 	bge	ignore_hdec
 2:
+	/* See if this is something we can handle in real mode */
+	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL
+	beq	hcall_try_real_mode
+hcall_real_cont:
 
 	/* Check for mediated interrupts (could be done earlier really ...) */
 	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
@@ -579,13 +591,28 @@ hdec_soon:
 	std	r5, VCPU_SPRG2(r9)
 	std	r6, VCPU_SPRG3(r9)
 
-	/* Save PMU registers */
+	/* Increment yield count if they have a VPA */
+	ld	r8, VCPU_VPA(r9)	/* do they have a VPA? */
+	cmpdi	r8, 0
+	beq	25f
+	lwz	r3, LPPACA_YIELDCOUNT(r8)
+	addi	r3, r3, 1
+	stw	r3, LPPACA_YIELDCOUNT(r8)
+25:
+	/* Save PMU registers if requested */
+	/* r8 and cr0.eq are live here */
 	li	r3, 1
 	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
 	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
 	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
 	isync
-	mfspr	r5, SPRN_MMCR1
+	beq	21f			/* if no VPA, save PMU stuff anyway */
+	lbz	r7, LPPACA_PMCINUSE(r8)
+	cmpwi	r7, 0			/* did they ask for PMU stuff to be saved? */
+	bne	21f
+	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
+	b	22f
+21:	mfspr	r5, SPRN_MMCR1
 	mfspr	r6, SPRN_MMCRA
 	std	r4, VCPU_MMCR(r9)
 	std	r5, VCPU_MMCR + 8(r9)
@@ -676,6 +703,125 @@ hdec_soon:
 	mfspr	r7,SPRN_HDSISR
 	b	7b
 
+/*
+ * Try to handle an hcall in real mode.
+ * Returns to the guest if we handle it, or continues on up to
+ * the kernel if we can't (i.e. if we don't have a handler for
+ * it, or if the handler returns H_TOO_HARD).
+ */
+	.globl	hcall_try_real_mode
+hcall_try_real_mode:
+	ld	r3,VCPU_GPR(r3)(r9)
+	andi.	r0,r11,MSR_PR
+	bne	hcall_real_cont
+	clrrdi	r3,r3,2
+	cmpldi	r3,hcall_real_table_end - hcall_real_table
+	bge	hcall_real_cont
+	LOAD_REG_ADDR(r4, hcall_real_table)
+	lwzx	r3,r3,r4
+	cmpwi	r3,0
+	beq	hcall_real_cont
+	add	r3,r3,r4
+	mtctr	r3
+	mr	r3,r9		/* get vcpu pointer */
+	ld	r4,VCPU_GPR(r4)(r9)
+	bctrl
+	cmpdi	r3,H_TOO_HARD
+	beq	hcall_real_fallback
+	ld	r4,HSTATE_KVM_VCPU(r13)
+	std	r3,VCPU_GPR(r3)(r4)
+	ld	r10,VCPU_PC(r4)
+	ld	r11,VCPU_MSR(r4)
+	b	fast_guest_return
+
+	/* We've attempted a real mode hcall, but it's punted it back
+	 * to userspace.  We need to restore some clobbered volatiles
+	 * before resuming the pass-it-to-qemu path */
+hcall_real_fallback:
+	li	r12,BOOK3S_INTERRUPT_SYSCALL
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	ld	r11, VCPU_MSR(r9)
+
+	b	hcall_real_cont
+
+	.globl	hcall_real_table
+hcall_real_table:
+	.long	0		/* 0 - unused */
+	.long	.kvmppc_h_remove - hcall_real_table
+	.long	.kvmppc_h_enter - hcall_real_table
+	.long	.kvmppc_h_read - hcall_real_table
+	.long	0		/* 0x10 - H_CLEAR_MOD */
+	.long	0		/* 0x14 - H_CLEAR_REF */
+	.long	.kvmppc_h_protect - hcall_real_table
+	.long	0		/* 0x1c - H_GET_TCE */
+	.long	0		/* 0x20 - H_SET_TCE */
+	.long	0		/* 0x24 - H_SET_SPRG0 */
+	.long	.kvmppc_h_set_dabr - hcall_real_table
+	.long	0		/* 0x2c */
+	.long	0		/* 0x30 */
+	.long	0		/* 0x34 */
+	.long	0		/* 0x38 */
+	.long	0		/* 0x3c */
+	.long	0		/* 0x40 */
+	.long	0		/* 0x44 */
+	.long	0		/* 0x48 */
+	.long	0		/* 0x4c */
+	.long	0		/* 0x50 */
+	.long	0		/* 0x54 */
+	.long	0		/* 0x58 */
+	.long	0		/* 0x5c */
+	.long	0		/* 0x60 */
+	.long	0		/* 0x64 */
+	.long	0		/* 0x68 */
+	.long	0		/* 0x6c */
+	.long	0		/* 0x70 */
+	.long	0		/* 0x74 */
+	.long	0		/* 0x78 */
+	.long	0		/* 0x7c */
+	.long	0		/* 0x80 */
+	.long	0		/* 0x84 */
+	.long	0		/* 0x88 */
+	.long	0		/* 0x8c */
+	.long	0		/* 0x90 */
+	.long	0		/* 0x94 */
+	.long	0		/* 0x98 */
+	.long	0		/* 0x9c */
+	.long	0		/* 0xa0 */
+	.long	0		/* 0xa4 */
+	.long	0		/* 0xa8 */
+	.long	0		/* 0xac */
+	.long	0		/* 0xb0 */
+	.long	0		/* 0xb4 */
+	.long	0		/* 0xb8 */
+	.long	0		/* 0xbc */
+	.long	0		/* 0xc0 */
+	.long	0		/* 0xc4 */
+	.long	0		/* 0xc8 */
+	.long	0		/* 0xcc */
+	.long	0		/* 0xd0 */
+	.long	0		/* 0xd4 */
+	.long	0		/* 0xd8 */
+	.long	0		/* 0xdc */
+	.long	0		/* 0xe0 */
+	.long	0		/* 0xe4 */
+	.long	0		/* 0xe8 */
+	.long	0		/* 0xec */
+	.long	0		/* 0xf0 */
+	.long	0		/* 0xf4 */
+	.long	0		/* 0xf8 */
+	.long	0		/* 0xfc */
+	.long	0		/* 0x100 */
+	.long	0		/* 0x104 */
+	.long	0		/* 0x108 */
+	.long	0		/* 0x10c */
+	.long	0		/* 0x110 */
+	.long	0		/* 0x114 */
+	.long	0		/* 0x118 */
+	.long	0		/* 0x11c */
+	.long	0		/* 0x120 */
+	.long	.kvmppc_h_bulk_remove - hcall_real_table
+hcall_real_table_end:
+
 ignore_hdec:
 	mr	r4,r9
 	b	fast_guest_return
@@ -688,10 +834,16 @@ bounce_ext_interrupt:
 	LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
 	b	fast_guest_return
 
+_GLOBAL(kvmppc_h_set_dabr)
+	std	r4,VCPU_DABR(r3)
+	mtspr	SPRN_DABR,r4
+	li	r3,0
+	blr
+
 /*
  * Save away FP, VMX and VSX registers.
  * r3 = vcpu pointer
-*/
+ */
 _GLOBAL(kvmppc_save_fp)
 	mfmsr	r9
 	ori	r8,r9,MSR_FP
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 3a4f379ee70..6fc9ee499b6 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -42,7 +42,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 	return !(v->arch.shared->msr & MSR_WE) ||
 	       !!(v->arch.pending_exceptions);
 #else
-	return 1;
+	return !(v->arch.ceded) || !!(v->arch.pending_exceptions);
 #endif
 }
 
-- 
cgit v1.2.3-70-g09d2


From 54738c097163c3f01e67ccc85462b78d4d4f495f Mon Sep 17 00:00:00 2001
From: David Gibson <dwg@au1.ibm.com>
Date: Wed, 29 Jun 2011 00:22:41 +0000
Subject: KVM: PPC: Accelerate H_PUT_TCE by implementing it in real mode

This improves I/O performance for guests using the PAPR
paravirtualization interface by making the H_PUT_TCE hcall faster, by
implementing it in real mode.  H_PUT_TCE is used for updating virtual
IOMMU tables, and is used both for virtual I/O and for real I/O in the
PAPR interface.

Since this moves the IOMMU tables into the kernel, we define a new
KVM_CREATE_SPAPR_TCE ioctl to allow qemu to create the tables.  The
ioctl returns a file descriptor which can be used to mmap the newly
created table.  The qemu driver models use them in the same way as
userspace managed tables, but they can be updated directly by the
guest with a real-mode H_PUT_TCE implementation, reducing the number
of host/guest context switches during guest IO.

There are certain circumstances where it is useful for userland qemu
to write to the TCE table even if the kernel H_PUT_TCE path is used
most of the time.  Specifically, allowing this will avoid awkwardness
when we need to reset the table.  More importantly, we will in the
future need to write the table in order to restore its state after a
checkpoint resume or migration.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/virtual/kvm/api.txt        |  35 ++++++++++
 arch/powerpc/include/asm/kvm.h           |   9 +++
 arch/powerpc/include/asm/kvm_book3s_64.h |   2 +
 arch/powerpc/include/asm/kvm_host.h      |   9 +++
 arch/powerpc/include/asm/kvm_ppc.h       |   2 +
 arch/powerpc/kvm/Makefile                |   3 +-
 arch/powerpc/kvm/book3s_64_vio_hv.c      |  73 +++++++++++++++++++
 arch/powerpc/kvm/book3s_hv.c             | 116 ++++++++++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |   2 +-
 arch/powerpc/kvm/powerpc.c               |  18 +++++
 include/linux/kvm.h                      |   2 +
 11 files changed, 268 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_64_vio_hv.c

(limited to 'arch/powerpc/kvm/book3s_hv.c')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e8875fef3eb..a1d344d5ff4 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1350,6 +1350,41 @@ The following flags are defined:
 If datamatch flag is set, the event will be signaled only if the written value
 to the registered address is equal to datamatch in struct kvm_ioeventfd.
 
+4.62 KVM_CREATE_SPAPR_TCE
+
+Capability: KVM_CAP_SPAPR_TCE
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce (in)
+Returns: file descriptor for manipulating the created TCE table
+
+This creates a virtual TCE (translation control entry) table, which
+is an IOMMU for PAPR-style virtual I/O.  It is used to translate
+logical addresses used in virtual I/O into guest physical addresses,
+and provides a scatter/gather capability for PAPR virtual I/O.
+
+/* for KVM_CAP_SPAPR_TCE */
+struct kvm_create_spapr_tce {
+	__u64 liobn;
+	__u32 window_size;
+};
+
+The liobn field gives the logical IO bus number for which to create a
+TCE table.  The window_size field specifies the size of the DMA window
+which this TCE table will translate - the table will contain one 64
+bit TCE entry for every 4kiB of the DMA window.
+
+When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE
+table has been created using this ioctl(), the kernel will handle it
+in real mode, updating the TCE table.  H_PUT_TCE calls for other
+liobns will cause a vm exit and must be handled by userspace.
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the created TCE table into userspace.  This lets userspace read
+the entries written by kernel-handled H_PUT_TCE calls, and also lets
+userspace update the TCE table directly which is useful in some
+circumstances.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index d2ca5ed3877..c3ec990daf4 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -22,6 +22,9 @@
 
 #include <linux/types.h>
 
+/* Select powerpc specific features in <linux/kvm.h> */
+#define __KVM_HAVE_SPAPR_TCE
+
 struct kvm_regs {
 	__u64 pc;
 	__u64 cr;
@@ -272,4 +275,10 @@ struct kvm_guest_debug_arch {
 #define KVM_INTERRUPT_UNSET	-2U
 #define KVM_INTERRUPT_SET_LEVEL	-3U
 
+/* for KVM_CAP_SPAPR_TCE */
+struct kvm_create_spapr_tce {
+	__u64 liobn;
+	__u32 window_size;
+};
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 5f73388ea0a..e43fe42b987 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -27,4 +27,6 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
 }
 #endif
 
+#define SPAPR_TCE_SHIFT		12
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 6ebf1721680..5616e39a7fa 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -144,6 +144,14 @@ struct kvmppc_pginfo {
 	atomic_t refcnt;
 };
 
+struct kvmppc_spapr_tce_table {
+	struct list_head list;
+	struct kvm *kvm;
+	u64 liobn;
+	u32 window_size;
+	struct page *pages[0];
+};
+
 struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
@@ -157,6 +165,7 @@ struct kvm_arch {
 	unsigned long sdr1;
 	unsigned long host_sdr1;
 	int tlbie_lock;
+	struct list_head spapr_tce_tables;
 	unsigned short last_vcpu[NR_CPUS];
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2afe92e6f62..99f6fcf4cf8 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -119,6 +119,8 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				struct kvm_create_spapr_tce *args);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 2ecffc0dc1b..1de3d54901d 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -55,7 +55,8 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv_interrupts.o \
 	book3s_64_mmu_hv.o
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
-	book3s_hv_rm_mmu.o
+	book3s_hv_rm_mmu.o \
+	book3s_64_vio_hv.o
 
 kvm-book3s_64-module-objs := \
 	../../../virt/kvm/kvm_main.o \
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
new file mode 100644
index 00000000000..ea0f8c537c2
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -0,0 +1,73 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/list.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_host.h>
+#include <asm/udbg.h>
+
+#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
+
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba, unsigned long tce)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_spapr_tce_table *stt;
+
+	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
+	/* 	    liobn, ioba, tce); */
+
+	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn == liobn) {
+			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+			struct page *page;
+			u64 *tbl;
+
+			/* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
+			/* 	    liobn, stt, stt->window_size); */
+			if (ioba >= stt->window_size)
+				return H_PARAMETER;
+
+			page = stt->pages[idx / TCES_PER_PAGE];
+			tbl = (u64 *)page_address(page);
+
+			/* FIXME: Need to validate the TCE itself */
+			/* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
+			tbl[idx % TCES_PER_PAGE] = tce;
+			return H_SUCCESS;
+		}
+	}
+
+	/* Didn't find the liobn, punt it to userspace */
+	return H_TOO_HARD;
+}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index af862c30b70..6fe469eabce 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -538,6 +538,116 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	return r;
 }
 
+static long kvmppc_stt_npages(unsigned long window_size)
+{
+	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
+		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+}
+
+static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+{
+	struct kvm *kvm = stt->kvm;
+	int i;
+
+	mutex_lock(&kvm->lock);
+	list_del(&stt->list);
+	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+		__free_page(stt->pages[i]);
+	kfree(stt);
+	mutex_unlock(&kvm->lock);
+
+	kvm_put_kvm(kvm);
+}
+
+static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+		return VM_FAULT_SIGBUS;
+
+	page = stt->pages[vmf->pgoff];
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
+	.fault = kvm_spapr_tce_fault,
+};
+
+static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_spapr_tce_vm_ops;
+	return 0;
+}
+
+static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
+{
+	struct kvmppc_spapr_tce_table *stt = filp->private_data;
+
+	release_spapr_tce_table(stt);
+	return 0;
+}
+
+static struct file_operations kvm_spapr_tce_fops = {
+	.mmap           = kvm_spapr_tce_mmap,
+	.release	= kvm_spapr_tce_release,
+};
+
+long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				   struct kvm_create_spapr_tce *args)
+{
+	struct kvmppc_spapr_tce_table *stt = NULL;
+	long npages;
+	int ret = -ENOMEM;
+	int i;
+
+	/* Check this LIOBN hasn't been previously allocated */
+	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn == args->liobn)
+			return -EBUSY;
+	}
+
+	npages = kvmppc_stt_npages(args->window_size);
+
+	stt = kzalloc(sizeof(*stt) + npages* sizeof(struct page *),
+		      GFP_KERNEL);
+	if (!stt)
+		goto fail;
+
+	stt->liobn = args->liobn;
+	stt->window_size = args->window_size;
+	stt->kvm = kvm;
+
+	for (i = 0; i < npages; i++) {
+		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!stt->pages[i])
+			goto fail;
+	}
+
+	kvm_get_kvm(kvm);
+
+	mutex_lock(&kvm->lock);
+	list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+
+	mutex_unlock(&kvm->lock);
+
+	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				stt, O_RDWR);
+
+fail:
+	if (stt) {
+		for (i = 0; i < npages; i++)
+			if (stt->pages[i])
+				__free_page(stt->pages[i]);
+
+		kfree(stt);
+	}
+	return ret;
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
@@ -559,13 +669,17 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 
 	/* Allocate hashed page table */
 	r = kvmppc_alloc_hpt(kvm);
+	if (r)
+		return r;
 
-	return r;
+	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+	return 0;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
 	kvmppc_free_hpt(kvm);
+	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
 
 /* These are stubs for now */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 319ff63b1f3..e6adaadcdff 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -754,7 +754,7 @@ hcall_real_table:
 	.long	0		/* 0x14 - H_CLEAR_REF */
 	.long	.kvmppc_h_protect - hcall_real_table
 	.long	0		/* 0x1c - H_GET_TCE */
-	.long	0		/* 0x20 - H_SET_TCE */
+	.long	.kvmppc_h_put_tce - hcall_real_table
 	.long	0		/* 0x24 - H_SET_SPRG0 */
 	.long	.kvmppc_h_set_dabr - hcall_real_table
 	.long	0		/* 0x2c */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6fc9ee499b6..c78ceb9d560 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -202,6 +202,11 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
+#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CAP_SPAPR_TCE:
+		r = 1;
+		break;
 #endif
 	default:
 		r = 0;
@@ -653,6 +658,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 		break;
 	}
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CREATE_SPAPR_TCE: {
+		struct kvm_create_spapr_tce create_tce;
+		struct kvm *kvm = filp->private_data;
+
+		r = -EFAULT;
+		if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
+			goto out;
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
+		goto out;
+	}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index a156294fc22..61f56502732 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -550,6 +550,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_TSC_CONTROL 60
 #define KVM_CAP_GET_TSC_KHZ 61
 #define KVM_CAP_PPC_BOOKE_SREGS 62
+#define KVM_CAP_SPAPR_TCE 63
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -752,6 +753,7 @@ struct kvm_clock_data {
 /* Available with KVM_CAP_XCRS */
 #define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
 #define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
+#define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
-- 
cgit v1.2.3-70-g09d2


From 371fefd6f2dc46668e00871930dde613b88d4bde Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:23:08 +0000
Subject: KVM: PPC: Allow book3s_hv guests to use SMT processor modes

This lifts the restriction that book3s_hv guests can only run one
hardware thread per core, and allows them to use up to 4 threads
per core on POWER7.  The host still has to run single-threaded.

This capability is advertised to qemu through a new KVM_CAP_PPC_SMT
capability.  The return value of the ioctl querying this capability
is the number of vcpus per virtual CPU core (vcore), currently 4.

To use this, the host kernel should be booted with all threads
active, and then all the secondary threads should be offlined.
This will put the secondary threads into nap mode.  KVM will then
wake them from nap mode and use them for running guest code (while
they are still offline).  To wake the secondary threads, we send
them an IPI using a new xics_wake_cpu() function, implemented in
arch/powerpc/sysdev/xics/icp-native.c.  In other words, at this stage
we assume that the platform has a XICS interrupt controller and
we are using icp-native.c to drive it.  Since the woken thread will
need to acknowledge and clear the IPI, we also export the base
physical address of the XICS registers using kvmppc_set_xics_phys()
for use in the low-level KVM book3s code.

When a vcpu is created, it is assigned to a virtual CPU core.
The vcore number is obtained by dividing the vcpu number by the
number of threads per core in the host.  This number is exported
to userspace via the KVM_CAP_PPC_SMT capability.  If qemu wishes
to run the guest in single-threaded mode, it should make all vcpu
numbers be multiples of the number of threads per core.

We distinguish three states of a vcpu: runnable (i.e., ready to execute
the guest), blocked (that is, idle), and busy in host.  We currently
implement a policy that the vcore can run only when all its threads
are runnable or blocked.  This way, if a vcpu needs to execute elsewhere
in the kernel or in qemu, it can do so without being starved of CPU
by the other vcpus.

When a vcore starts to run, it executes in the context of one of the
vcpu threads.  The other vcpu threads all go to sleep and stay asleep
until something happens requiring the vcpu thread to return to qemu,
or to wake up to run the vcore (this can happen when another vcpu
thread goes from busy in host state to blocked).

It can happen that a vcpu goes from blocked to runnable state (e.g.
because of an interrupt), and the vcore it belongs to is already
running.  In that case it can start to run immediately as long as
the none of the vcpus in the vcore have started to exit the guest.
We send the next free thread in the vcore an IPI to get it to start
to execute the guest.  It synchronizes with the other threads via
the vcore->entry_exit_count field to make sure that it doesn't go
into the guest if the other vcpus are exiting by the time that it
is ready to actually enter the guest.

Note that there is no fixed relationship between the hardware thread
number and the vcpu number.  Hardware threads are assigned to vcpus
as they become runnable, so we will always use the lower-numbered
hardware threads in preference to higher-numbered threads if not all
the vcpus in the vcore are runnable, regardless of which vcpus are
runnable.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/virtual/kvm/api.txt         |  13 ++
 arch/powerpc/include/asm/kvm.h            |   1 +
 arch/powerpc/include/asm/kvm_book3s_asm.h |   2 +
 arch/powerpc/include/asm/kvm_host.h       |  46 ++++-
 arch/powerpc/include/asm/kvm_ppc.h        |  13 ++
 arch/powerpc/kernel/asm-offsets.c         |   6 +
 arch/powerpc/kernel/exceptions-64s.S      |  31 ++-
 arch/powerpc/kernel/idle_power7.S         |   2 -
 arch/powerpc/kvm/book3s_hv.c              | 316 +++++++++++++++++++++++++++---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 168 +++++++++++++++-
 arch/powerpc/kvm/powerpc.c                |   4 +
 arch/powerpc/sysdev/xics/icp-native.c     |   9 +
 include/linux/kvm.h                       |   1 +
 13 files changed, 567 insertions(+), 45 deletions(-)

(limited to 'arch/powerpc/kvm/book3s_hv.c')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index a1d344d5ff4..681871311d3 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -180,6 +180,19 @@ KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time.
 If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4
 cpus max.
 
+On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
+threads in one or more virtual CPU cores.  (This is because the
+hardware requires all the hardware threads in a CPU core to be in the
+same partition.)  The KVM_CAP_PPC_SMT capability indicates the number
+of vcpus per virtual core (vcore).  The vcore id is obtained by
+dividing the vcpu id by the number of vcpus per vcore.  The vcpus in a
+given vcore will always be in the same physical core as each other
+(though that might be a different physical core from time to time).
+Userspace can control the threading (SMT) mode of the guest by its
+allocation of vcpu ids.  For example, if userspace wants
+single-threaded guest vcpus, it should make all vcpu ids be a multiple
+of the number of vcpus per vcore.
+
 4.8 KVM_GET_DIRTY_LOG (vm ioctl)
 
 Capability: basic
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index c3ec990daf4..471bb3d85e0 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -24,6 +24,7 @@
 
 /* Select powerpc specific features in <linux/kvm.h> */
 #define __KVM_HAVE_SPAPR_TCE
+#define __KVM_HAVE_PPC_SMT
 
 struct kvm_regs {
 	__u64 pc;
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index b7b039532fb..9cfd5436782 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -78,6 +78,8 @@ struct kvmppc_host_state {
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	struct kvm_vcpu *kvm_vcpu;
+	struct kvmppc_vcore *kvm_vcore;
+	unsigned long xics_phys;
 	u64 dabr;
 	u64 host_mmcr[3];
 	u32 host_pmc[6];
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 5616e39a7fa..0d6d569e19c 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -25,10 +25,14 @@
 #include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/kvm_types.h>
+#include <linux/threads.h>
+#include <linux/spinlock.h>
 #include <linux/kvm_para.h>
 #include <asm/kvm_asm.h>
+#include <asm/processor.h>
 
-#define KVM_MAX_VCPUS 1
+#define KVM_MAX_VCPUS		NR_CPUS
+#define KVM_MAX_VCORES		NR_CPUS
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
@@ -167,9 +171,34 @@ struct kvm_arch {
 	int tlbie_lock;
 	struct list_head spapr_tce_tables;
 	unsigned short last_vcpu[NR_CPUS];
+	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
 
+/*
+ * Struct for a virtual core.
+ * Note: entry_exit_count combines an entry count in the bottom 8 bits
+ * and an exit count in the next 8 bits.  This is so that we can
+ * atomically increment the entry count iff the exit count is 0
+ * without taking the lock.
+ */
+struct kvmppc_vcore {
+	int n_runnable;
+	int n_blocked;
+	int num_threads;
+	int entry_exit_count;
+	int n_woken;
+	int nap_count;
+	u16 pcpu;
+	u8 vcore_running;
+	u8 in_guest;
+	struct list_head runnable_threads;
+	spinlock_t lock;
+};
+
+#define VCORE_ENTRY_COUNT(vc)	((vc)->entry_exit_count & 0xff)
+#define VCORE_EXIT_COUNT(vc)	((vc)->entry_exit_count >> 8)
+
 struct kvmppc_pte {
 	ulong eaddr;
 	u64 vpage;
@@ -365,14 +394,29 @@ struct kvm_vcpu_arch {
 	struct slb_shadow *slb_shadow;
 	struct dtl *dtl;
 	struct dtl *dtl_end;
+
+	struct kvmppc_vcore *vcore;
+	int ret;
 	int trap;
+	int state;
+	int ptid;
+	wait_queue_head_t cpu_run;
+
 	struct kvm_vcpu_arch_shared *shared;
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	struct kvm_vcpu_arch_shared shregs;
+
+	struct list_head run_list;
+	struct task_struct *run_task;
+	struct kvm_run *kvm_run;
 #endif
 };
 
+#define KVMPPC_VCPU_BUSY_IN_HOST	0
+#define KVMPPC_VCPU_BLOCKED		1
+#define KVMPPC_VCPU_RUNNABLE		2
+
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 99f6fcf4cf8..6ef73442863 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -33,6 +33,9 @@
 #else
 #include <asm/kvm_booke.h>
 #endif
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/paca.h>
+#endif
 
 enum emulation_result {
 	EMULATE_DONE,         /* no further processing */
@@ -169,4 +172,14 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+{
+	paca[cpu].kvm_hstate.xics_phys = addr;
+}
+#else
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+{}
+#endif
+
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index c70d106bf1a..d0f2387fd79 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -471,6 +471,10 @@ int main(void)
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
+	DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
+	DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
+	DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
+	DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
 	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
 			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
 	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
@@ -530,6 +534,8 @@ int main(void)
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
+	HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
+	HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
 	HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
 	HSTATE_FIELD(HSTATE_PMC, host_pmc);
 	HSTATE_FIELD(HSTATE_PURR, host_purr);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 163c041cec2..5bc06fdfa6c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -49,19 +49,32 @@ BEGIN_FTR_SECTION
 	 * state loss at this time.
 	 */
 	mfspr	r13,SPRN_SRR1
-	rlwinm	r13,r13,47-31,30,31
-	cmpwi	cr0,r13,1
-	bne	1f
-	b	.power7_wakeup_noloss
-1:	cmpwi	cr0,r13,2
-	bne	1f
-	b	.power7_wakeup_loss
+	rlwinm.	r13,r13,47-31,30,31
+	beq	9f
+
+	/* waking up from powersave (nap) state */
+	cmpwi	cr1,r13,2
 	/* Total loss of HV state is fatal, we could try to use the
 	 * PIR to locate a PACA, then use an emergency stack etc...
 	 * but for now, let's just stay stuck here
 	 */
-1:	cmpwi	cr0,r13,3
-	beq	.
+	bgt	cr1,.
+	GET_PACA(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	lbz	r0,PACAPROCSTART(r13)
+	cmpwi	r0,0x80
+	bne	1f
+	li	r0,0
+	stb	r0,PACAPROCSTART(r13)
+	b	kvm_start_guest
+1:
+#endif
+
+	beq	cr1,2f
+	b	.power7_wakeup_noloss
+2:	b	.power7_wakeup_loss
+9:
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
 #endif /* CONFIG_PPC_P7_NAP */
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index f8f0bc7f1d4..3a70845a51c 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -73,7 +73,6 @@ _GLOBAL(power7_idle)
 	b	.
 
 _GLOBAL(power7_wakeup_loss)
-	GET_PACA(r13)
 	ld	r1,PACAR1(r13)
 	REST_NVGPRS(r1)
 	REST_GPR(2, r1)
@@ -87,7 +86,6 @@ _GLOBAL(power7_wakeup_loss)
 	rfid
 
 _GLOBAL(power7_wakeup_noloss)
-	GET_PACA(r13)
 	ld	r1,PACAR1(r13)
 	ld	r4,_MSR(r1)
 	ld	r5,_NIP(r1)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6fe469eabce..36b6d98f119 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -39,6 +39,7 @@
 #include <asm/mmu_context.h>
 #include <asm/lppaca.h>
 #include <asm/processor.h>
+#include <asm/cputhreads.h>
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
@@ -51,12 +52,16 @@
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	local_paca->kvm_hstate.kvm_vcpu = vcpu;
+	local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore;
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
 }
 
+static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
+static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
+
 void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
 {
 	u64 now;
@@ -74,11 +79,15 @@ void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
 			      HRTIMER_MODE_REL);
 	}
 
+	kvmppc_vcpu_blocked(vcpu);
+
 	kvm_vcpu_block(vcpu);
 	vcpu->stat.halt_wakeup++;
 
 	if (vcpu->arch.dec_expires != ~(u64)0)
 		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+
+	kvmppc_vcpu_unblocked(vcpu);
 }
 
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
@@ -429,9 +438,16 @@ int kvmppc_core_check_processor_compat(void)
 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	struct kvm_vcpu *vcpu;
-	int err = -ENOMEM;
+	int err = -EINVAL;
+	int core;
+	struct kvmppc_vcore *vcore;
 	unsigned long lpcr;
 
+	core = id / threads_per_core;
+	if (core >= KVM_MAX_VCORES)
+		goto out;
+
+	err = -ENOMEM;
 	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
 	if (!vcpu)
 		goto out;
@@ -454,6 +470,38 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 
 	kvmppc_mmu_book3s_hv_init(vcpu);
 
+	/*
+	 * Some vcpus may start out in stopped state.  If we initialize
+	 * them to busy-in-host state they will stop other vcpus in the
+	 * vcore from running.  Instead we initialize them to blocked
+	 * state, effectively considering them to be stopped until we
+	 * see the first run ioctl for them.
+	 */
+	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+
+	init_waitqueue_head(&vcpu->arch.cpu_run);
+
+	mutex_lock(&kvm->lock);
+	vcore = kvm->arch.vcores[core];
+	if (!vcore) {
+		vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
+		if (vcore) {
+			INIT_LIST_HEAD(&vcore->runnable_threads);
+			spin_lock_init(&vcore->lock);
+		}
+		kvm->arch.vcores[core] = vcore;
+	}
+	mutex_unlock(&kvm->lock);
+
+	if (!vcore)
+		goto free_vcpu;
+
+	spin_lock(&vcore->lock);
+	++vcore->num_threads;
+	++vcore->n_blocked;
+	spin_unlock(&vcore->lock);
+	vcpu->arch.vcore = vcore;
+
 	return vcpu;
 
 free_vcpu:
@@ -468,21 +516,121 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	kfree(vcpu);
 }
 
+static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	spin_lock(&vc->lock);
+	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+	++vc->n_blocked;
+	if (vc->n_runnable > 0 &&
+	    vc->n_runnable + vc->n_blocked == vc->num_threads) {
+		vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
+					arch.run_list);
+		wake_up(&vcpu->arch.cpu_run);
+	}
+	spin_unlock(&vc->lock);
+}
+
+static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	spin_lock(&vc->lock);
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+	--vc->n_blocked;
+	spin_unlock(&vc->lock);
+}
+
 extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+extern void xics_wake_cpu(int cpu);
 
-static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu)
+static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
+				   struct kvm_vcpu *vcpu)
 {
-	u64 now;
+	struct kvm_vcpu *v;
 
-	if (signal_pending(current)) {
-		run->exit_reason = KVM_EXIT_INTR;
-		return -EINTR;
+	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
+		return;
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+	--vc->n_runnable;
+	/* decrement the physical thread id of each following vcpu */
+	v = vcpu;
+	list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
+		--v->arch.ptid;
+	list_del(&vcpu->arch.run_list);
+}
+
+static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
+{
+	int cpu;
+	struct paca_struct *tpaca;
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	cpu = vc->pcpu + vcpu->arch.ptid;
+	tpaca = &paca[cpu];
+	tpaca->kvm_hstate.kvm_vcpu = vcpu;
+	tpaca->kvm_hstate.kvm_vcore = vc;
+	smp_wmb();
+#ifdef CONFIG_PPC_ICP_NATIVE
+	if (vcpu->arch.ptid) {
+		tpaca->cpu_start = 0x80;
+		tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST;
+		wmb();
+		xics_wake_cpu(cpu);
+		++vc->n_woken;
 	}
+#endif
+}
 
-	flush_fp_to_thread(current);
-	flush_altivec_to_thread(current);
-	flush_vsx_to_thread(current);
-	preempt_disable();
+static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
+{
+	int i;
+
+	HMT_low();
+	i = 0;
+	while (vc->nap_count < vc->n_woken) {
+		if (++i >= 1000000) {
+			pr_err("kvmppc_wait_for_nap timeout %d %d\n",
+			       vc->nap_count, vc->n_woken);
+			break;
+		}
+		cpu_relax();
+	}
+	HMT_medium();
+}
+
+/*
+ * Check that we are on thread 0 and that any other threads in
+ * this core are off-line.
+ */
+static int on_primary_thread(void)
+{
+	int cpu = smp_processor_id();
+	int thr = cpu_thread_in_core(cpu);
+
+	if (thr)
+		return 0;
+	while (++thr < threads_per_core)
+		if (cpu_online(cpu + thr))
+			return 0;
+	return 1;
+}
+
+/*
+ * Run a set of guest threads on a physical core.
+ * Called with vc->lock held.
+ */
+static int kvmppc_run_core(struct kvmppc_vcore *vc)
+{
+	struct kvm_vcpu *vcpu, *vnext;
+	long ret;
+	u64 now;
+
+	/* don't start if any threads have a signal pending */
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+		if (signal_pending(vcpu->arch.run_task))
+			return 0;
 
 	/*
 	 * Make sure we are running on thread 0, and that
@@ -490,36 +638,150 @@ static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	 * XXX we should also block attempts to bring any
 	 * secondary threads online.
 	 */
-	if (threads_per_core > 1) {
-		int cpu = smp_processor_id();
-		int thr = cpu_thread_in_core(cpu);
-
-		if (thr)
-			goto out;
-		while (++thr < threads_per_core)
-			if (cpu_online(cpu + thr))
-				goto out;
+	if (threads_per_core > 1 && !on_primary_thread()) {
+		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+			vcpu->arch.ret = -EBUSY;
+		goto out;
 	}
 
-	kvm_guest_enter();
+	vc->n_woken = 0;
+	vc->nap_count = 0;
+	vc->entry_exit_count = 0;
+	vc->vcore_running = 1;
+	vc->in_guest = 0;
+	vc->pcpu = smp_processor_id();
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+		kvmppc_start_thread(vcpu);
+	vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
+				arch.run_list);
+
+	spin_unlock(&vc->lock);
 
+	preempt_disable();
+	kvm_guest_enter();
 	__kvmppc_vcore_entry(NULL, vcpu);
 
+	/* wait for secondary threads to finish writing their state to memory */
+	spin_lock(&vc->lock);
+	if (vc->nap_count < vc->n_woken)
+		kvmppc_wait_for_nap(vc);
+	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
+	vc->vcore_running = 2;
+	spin_unlock(&vc->lock);
+
+	/* make sure updates to secondary vcpu structs are visible now */
+	smp_mb();
 	kvm_guest_exit();
 
 	preempt_enable();
 	kvm_resched(vcpu);
 
 	now = get_tb();
-	/* cancel pending dec exception if dec is positive */
-	if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
-		kvmppc_core_dequeue_dec(vcpu);
-
-	return kvmppc_handle_exit(run, vcpu, current);
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+		/* cancel pending dec exception if dec is positive */
+		if (now < vcpu->arch.dec_expires &&
+		    kvmppc_core_pending_dec(vcpu))
+			kvmppc_core_dequeue_dec(vcpu);
+		if (!vcpu->arch.trap) {
+			if (signal_pending(vcpu->arch.run_task)) {
+				vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
+				vcpu->arch.ret = -EINTR;
+			}
+			continue;		/* didn't get to run */
+		}
+		ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
+					 vcpu->arch.run_task);
+		vcpu->arch.ret = ret;
+		vcpu->arch.trap = 0;
+	}
 
+	spin_lock(&vc->lock);
  out:
-	preempt_enable();
-	return -EBUSY;
+	vc->vcore_running = 0;
+	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+				 arch.run_list) {
+		if (vcpu->arch.ret != RESUME_GUEST) {
+			kvmppc_remove_runnable(vc, vcpu);
+			wake_up(&vcpu->arch.cpu_run);
+		}
+	}
+
+	return 1;
+}
+
+static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	int ptid;
+	int wait_state;
+	struct kvmppc_vcore *vc;
+	DEFINE_WAIT(wait);
+
+	/* No need to go into the guest when all we do is going out */
+	if (signal_pending(current)) {
+		kvm_run->exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+	kvm_run->exit_reason = 0;
+	vcpu->arch.ret = RESUME_GUEST;
+	vcpu->arch.trap = 0;
+
+	flush_fp_to_thread(current);
+	flush_altivec_to_thread(current);
+	flush_vsx_to_thread(current);
+
+	/*
+	 * Synchronize with other threads in this virtual core
+	 */
+	vc = vcpu->arch.vcore;
+	spin_lock(&vc->lock);
+	/* This happens the first time this is called for a vcpu */
+	if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
+		--vc->n_blocked;
+	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+	ptid = vc->n_runnable;
+	vcpu->arch.run_task = current;
+	vcpu->arch.kvm_run = kvm_run;
+	vcpu->arch.ptid = ptid;
+	list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
+	++vc->n_runnable;
+
+	wait_state = TASK_INTERRUPTIBLE;
+	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+		if (signal_pending(current)) {
+			if (!vc->vcore_running) {
+				kvm_run->exit_reason = KVM_EXIT_INTR;
+				vcpu->arch.ret = -EINTR;
+				break;
+			}
+			/* have to wait for vcore to stop executing guest */
+			wait_state = TASK_UNINTERRUPTIBLE;
+			smp_send_reschedule(vc->pcpu);
+		}
+
+		if (!vc->vcore_running &&
+		    vc->n_runnable + vc->n_blocked == vc->num_threads) {
+			/* we can run now */
+			if (kvmppc_run_core(vc))
+				continue;
+		}
+
+		if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0)
+			kvmppc_start_thread(vcpu);
+
+		/* wait for other threads to come in, or wait for vcore */
+		prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
+		spin_unlock(&vc->lock);
+		schedule();
+		finish_wait(&vcpu->arch.cpu_run, &wait);
+		spin_lock(&vc->lock);
+	}
+
+	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+		kvmppc_remove_runnable(vc, vcpu);
+	spin_unlock(&vc->lock);
+
+	return vcpu->arch.ret;
 }
 
 int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index e6adaadcdff..c9bf177b7cf 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -30,8 +30,6 @@
  *                                                                           *
  ****************************************************************************/
 
-#define SHADOW_VCPU_OFF		PACA_KVM_SVCPU
-
 	.globl	kvmppc_skip_interrupt
 kvmppc_skip_interrupt:
 	mfspr	r13,SPRN_SRR0
@@ -79,6 +77,32 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
  *                                                                            *
  *****************************************************************************/
 
+#define XICS_XIRR		4
+#define XICS_QIRR		0xc
+
+/*
+ * We come in here when wakened from nap mode on a secondary hw thread.
+ * Relocation is off and most register values are lost.
+ * r13 points to the PACA.
+ */
+	.globl	kvm_start_guest
+kvm_start_guest:
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,STACK_FRAME_OVERHEAD
+
+	/* get vcpu pointer */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+
+	/* We got here with an IPI; clear it */
+	ld	r5, HSTATE_XICS_PHYS(r13)
+	li	r0, 0xff
+	li	r6, XICS_QIRR
+	li	r7, XICS_XIRR
+	lwzcix	r8, r5, r7		/* ack the interrupt */
+	sync
+	stbcix	r0, r5, r6		/* clear it */
+	stwcix	r8, r5, r7		/* EOI it */
+
 .global kvmppc_hv_entry
 kvmppc_hv_entry:
 
@@ -200,7 +224,20 @@ kvmppc_hv_entry:
 	slbia
 	ptesync
 
-	/* Switch to guest partition. */
+	/* Increment entry count iff exit count is zero. */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	addi	r9,r5,VCORE_ENTRY_EXIT
+21:	lwarx	r3,0,r9
+	cmpwi	r3,0x100		/* any threads starting to exit? */
+	bge	secondary_too_late	/* if so we're too late to the party */
+	addi	r3,r3,1
+	stwcx.	r3,0,r9
+	bne	21b
+
+	/* Primary thread switches to guest partition. */
+	lwz	r6,VCPU_PTID(r4)
+	cmpwi	r6,0
+	bne	20f
 	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	ld	r6,KVM_SDR1(r9)
 	lwz	r7,KVM_LPID(r9)
@@ -210,7 +247,15 @@ kvmppc_hv_entry:
 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
 	mtspr	SPRN_LPID,r7
 	isync
-	ld	r8,VCPU_LPCR(r4)
+	li	r0,1
+	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
+	b	10f
+
+	/* Secondary threads wait for primary to have done partition switch */
+20:	lbz	r0,VCORE_IN_GUEST(r5)
+	cmpwi	r0,0
+	beq	20b
+10:	ld	r8,VCPU_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
 
@@ -225,10 +270,12 @@ kvmppc_hv_entry:
 	 * Invalidate the TLB if we could possibly have stale TLB
 	 * entries for this partition on this core due to the use
 	 * of tlbiel.
+	 * XXX maybe only need this on primary thread?
 	 */
 	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	lwz	r5,VCPU_VCPUID(r4)
 	lhz	r6,PACAPACAINDEX(r13)
+	rldimi	r6,r5,0,62		/* XXX map as if threads 1:1 p:v */
 	lhz	r8,VCPU_LAST_CPU(r4)
 	sldi	r7,r6,1			/* see if this is the same vcpu */
 	add	r7,r7,r9		/* as last ran on this pcpu */
@@ -512,8 +559,60 @@ hcall_real_cont:
 	ptesync
 
 hdec_soon:
-	/* Switch back to host partition */
+	/* Increment the threads-exiting-guest count in the 0xff00
+	   bits of vcore->entry_exit_count */
+	lwsync
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	addi	r6,r5,VCORE_ENTRY_EXIT
+41:	lwarx	r3,0,r6
+	addi	r0,r3,0x100
+	stwcx.	r0,0,r6
+	bne	41b
+
+	/*
+	 * At this point we have an interrupt that we have to pass
+	 * up to the kernel or qemu; we can't handle it in real mode.
+	 * Thus we have to do a partition switch, so we have to
+	 * collect the other threads, if we are the first thread
+	 * to take an interrupt.  To do this, we set the HDEC to 0,
+	 * which causes an HDEC interrupt in all threads within 2ns
+	 * because the HDEC register is shared between all 4 threads.
+	 * However, we don't need to bother if this is an HDEC
+	 * interrupt, since the other threads will already be on their
+	 * way here in that case.
+	 */
+	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	beq	40f
+	cmpwi	r3,0x100	/* Are we the first here? */
+	bge	40f
+	cmpwi	r3,1
+	ble	40f
+	li	r0,0
+	mtspr	SPRN_HDEC,r0
+40:
+
+	/* Secondary threads wait for primary to do partition switch */
 	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	lwz	r3,VCPU_PTID(r9)
+	cmpwi	r3,0
+	beq	15f
+	HMT_LOW
+13:	lbz	r3,VCORE_IN_GUEST(r5)
+	cmpwi	r3,0
+	bne	13b
+	HMT_MEDIUM
+	b	16f
+
+	/* Primary thread waits for all the secondaries to exit guest */
+15:	lwz	r3,VCORE_ENTRY_EXIT(r5)
+	srwi	r0,r3,8
+	clrldi	r3,r3,56
+	cmpw	r3,r0
+	bne	15b
+	isync
+
+	/* Primary thread switches back to host partition */
 	ld	r6,KVM_HOST_SDR1(r4)
 	lwz	r7,KVM_HOST_LPID(r4)
 	li	r8,LPID_RSVD		/* switch to reserved LPID */
@@ -522,10 +621,12 @@ hdec_soon:
 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
 	mtspr	SPRN_LPID,r7
 	isync
+	li	r0,0
+	stb	r0,VCORE_IN_GUEST(r5)
 	lis	r8,0x7fff		/* MAX_INT@h */
 	mtspr	SPRN_HDEC,r8
 
-	ld	r8,KVM_HOST_LPCR(r4)
+16:	ld	r8,KVM_HOST_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
 
@@ -634,6 +735,11 @@ hdec_soon:
 	mr	r3, r9
 	bl	.kvmppc_save_fp
 
+	/* Secondary threads go off to take a nap */
+	lwz	r0,VCPU_PTID(r3)
+	cmpwi	r0,0
+	bne	secondary_nap
+
 	/*
 	 * Reload DEC.  HDEC interrupts were disabled when
 	 * we reloaded the host's LPCR value.
@@ -840,6 +946,56 @@ _GLOBAL(kvmppc_h_set_dabr)
 	li	r3,0
 	blr
 
+secondary_too_late:
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	HMT_LOW
+13:	lbz	r3,VCORE_IN_GUEST(r5)
+	cmpwi	r3,0
+	bne	13b
+	HMT_MEDIUM
+	ld	r11,PACA_SLBSHADOWPTR(r13)
+
+	.rept	SLB_NUM_BOLTED
+	ld	r5,SLBSHADOW_SAVEAREA(r11)
+	ld	r6,SLBSHADOW_SAVEAREA+8(r11)
+	andis.	r7,r5,SLB_ESID_V@h
+	beq	1f
+	slbmte	r6,r5
+1:	addi	r11,r11,16
+	.endr
+	b	50f
+
+secondary_nap:
+	/* Clear any pending IPI */
+50:	ld	r5, HSTATE_XICS_PHYS(r13)
+	li	r0, 0xff
+	li	r6, XICS_QIRR
+	stbcix	r0, r5, r6
+
+	/* increment the nap count and then go to nap mode */
+	ld	r4, HSTATE_KVM_VCORE(r13)
+	addi	r4, r4, VCORE_NAP_COUNT
+	lwsync				/* make previous updates visible */
+51:	lwarx	r3, 0, r4
+	addi	r3, r3, 1
+	stwcx.	r3, 0, r4
+	bne	51b
+	isync
+
+	mfspr	r4, SPRN_LPCR
+	li	r0, LPCR_PECE
+	andc	r4, r4, r0
+	ori	r4, r4, LPCR_PECE0	/* exit nap on interrupt */
+	mtspr	SPRN_LPCR, r4
+	li	r0, 0
+	std	r0, HSTATE_SCRATCH0(r13)
+	ptesync
+	ld	r0, HSTATE_SCRATCH0(r13)
+1:	cmpd	r0, r0
+	bne	1b
+	nap
+	b	.
+
 /*
  * Save away FP, VMX and VSX registers.
  * r3 = vcpu pointer
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index c78ceb9d560..4c549664c98 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -30,6 +30,7 @@
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
 #include <asm/tlbflush.h>
+#include <asm/cputhreads.h>
 #include "timing.h"
 #include "../mm/mmu_decl.h"
 
@@ -207,6 +208,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_SPAPR_TCE:
 		r = 1;
 		break;
+	case KVM_CAP_PPC_SMT:
+		r = threads_per_core;
+		break;
 #endif
 	default:
 		r = 0;
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 1f15ad43614..ba382b59b92 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -17,6 +17,7 @@
 #include <linux/cpu.h>
 #include <linux/of.h>
 #include <linux/spinlock.h>
+#include <linux/module.h>
 
 #include <asm/prom.h>
 #include <asm/io.h>
@@ -24,6 +25,7 @@
 #include <asm/irq.h>
 #include <asm/errno.h>
 #include <asm/xics.h>
+#include <asm/kvm_ppc.h>
 
 struct icp_ipl {
 	union {
@@ -139,6 +141,12 @@ static void icp_native_cause_ipi(int cpu, unsigned long data)
 	icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
+void xics_wake_cpu(int cpu)
+{
+	icp_native_set_qirr(cpu, IPI_PRIORITY);
+}
+EXPORT_SYMBOL_GPL(xics_wake_cpu);
+
 static irqreturn_t icp_native_ipi_action(int irq, void *dev_id)
 {
 	int cpu = smp_processor_id();
@@ -185,6 +193,7 @@ static int __init icp_native_map_one_cpu(int hw_id, unsigned long addr,
 	}
 
 	icp_native_regs[cpu] = ioremap(addr, size);
+	kvmppc_set_xics_phys(cpu, addr);
 	if (!icp_native_regs[cpu]) {
 		pr_warning("icp_native: Failed ioremap for CPU %d, "
 			   "interrupt server #0x%x, addr %#lx\n",
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 61f56502732..e2a378d9716 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -551,6 +551,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_GET_TSC_KHZ 61
 #define KVM_CAP_PPC_BOOKE_SREGS 62
 #define KVM_CAP_SPAPR_TCE 63
+#define KVM_CAP_PPC_SMT 64
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3-70-g09d2


From aa04b4cc5be64b4fb9ef4e0fdf2418e2f4737fb2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:25:44 +0000
Subject: KVM: PPC: Allocate RMAs (Real Mode Areas) at boot for use by guests

This adds infrastructure which will be needed to allow book3s_hv KVM to
run on older POWER processors, including PPC970, which don't support
the Virtual Real Mode Area (VRMA) facility, but only the Real Mode
Offset (RMO) facility.  These processors require a physically
contiguous, aligned area of memory for each guest.  When the guest does
an access in real mode (MMU off), the address is compared against a
limit value, and if it is lower, the address is ORed with an offset
value (from the Real Mode Offset Register (RMOR)) and the result becomes
the real address for the access.  The size of the RMA has to be one of
a set of supported values, which usually includes 64MB, 128MB, 256MB
and some larger powers of 2.

Since we are unlikely to be able to allocate 64MB or more of physically
contiguous memory after the kernel has been running for a while, we
allocate a pool of RMAs at boot time using the bootmem allocator.  The
size and number of the RMAs can be set using the kvm_rma_size=xx and
kvm_rma_count=xx kernel command line options.

KVM exports a new capability, KVM_CAP_PPC_RMA, to signal the availability
of the pool of preallocated RMAs.  The capability value is 1 if the
processor can use an RMA but doesn't require one (because it supports
the VRMA facility), or 2 if the processor requires an RMA for each guest.

This adds a new ioctl, KVM_ALLOCATE_RMA, which allocates an RMA from the
pool and returns a file descriptor which can be used to map the RMA.  It
also returns the size of the RMA in the argument structure.

Having an RMA means we will get multiple KMV_SET_USER_MEMORY_REGION
ioctl calls from userspace.  To cope with this, we now preallocate the
kvm->arch.ram_pginfo array when the VM is created with a size sufficient
for up to 64GB of guest memory.  Subsequently we will get rid of this
array and use memory associated with each memslot instead.

This moves most of the code that translates the user addresses into
host pfns (page frame numbers) out of kvmppc_prepare_vrma up one level
to kvmppc_core_prepare_memory_region.  Also, instead of having to look
up the VMA for each page in order to check the page size, we now check
that the pages we get are compound pages of 16MB.  However, if we are
adding memory that is mapped to an RMA, we don't bother with calling
get_user_pages_fast and instead just offset from the base pfn for the
RMA.

Typically the RMA gets added after vcpus are created, which makes it
inconvenient to have the LPCR (logical partition control register) value
in the vcpu->arch struct, since the LPCR controls whether the processor
uses RMA or VRMA for the guest.  This moves the LPCR value into the
kvm->arch struct and arranges for the MER (mediated external request)
bit, which is the only bit that varies between vcpus, to be set in
assembly code when going into the guest if there is a pending external
interrupt request.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/virtual/kvm/api.txt       |  32 ++++
 arch/powerpc/include/asm/kvm.h          |   5 +
 arch/powerpc/include/asm/kvm_book3s.h   |   8 -
 arch/powerpc/include/asm/kvm_host.h     |  15 +-
 arch/powerpc/include/asm/kvm_ppc.h      |  10 ++
 arch/powerpc/include/asm/reg.h          |   1 +
 arch/powerpc/kernel/asm-offsets.c       |   4 +-
 arch/powerpc/kernel/setup_64.c          |   3 +
 arch/powerpc/kvm/Makefile               |   3 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c     |  97 +-----------
 arch/powerpc/kvm/book3s_hv.c            | 259 ++++++++++++++++++++++++++++++--
 arch/powerpc/kvm/book3s_hv_builtin.c    | 152 +++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |  19 ++-
 arch/powerpc/kvm/powerpc.c              |  13 ++
 include/linux/kvm.h                     |   3 +
 15 files changed, 505 insertions(+), 119 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_hv_builtin.c

(limited to 'arch/powerpc/kvm/book3s_hv.c')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 681871311d3..b0e4b9cd6a6 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1398,6 +1398,38 @@ the entries written by kernel-handled H_PUT_TCE calls, and also lets
 userspace update the TCE table directly which is useful in some
 circumstances.
 
+4.63 KVM_ALLOCATE_RMA
+
+Capability: KVM_CAP_PPC_RMA
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_allocate_rma (out)
+Returns: file descriptor for mapping the allocated RMA
+
+This allocates a Real Mode Area (RMA) from the pool allocated at boot
+time by the kernel.  An RMA is a physically-contiguous, aligned region
+of memory used on older POWER processors to provide the memory which
+will be accessed by real-mode (MMU off) accesses in a KVM guest.
+POWER processors support a set of sizes for the RMA that usually
+includes 64MB, 128MB, 256MB and some larger powers of two.
+
+/* for KVM_ALLOCATE_RMA */
+struct kvm_allocate_rma {
+	__u64 rma_size;
+};
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the allocated RMA into userspace.  The mapped area can then be
+passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the
+RMA for a virtual machine.  The size of the RMA in bytes (which is
+fixed at host kernel boot time) is returned in the rma_size field of
+the argument structure.
+
+The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl
+is supported; 2 if the processor requires all virtual machines to have
+an RMA, or 1 if the processor can use an RMA but doesn't require it,
+because it supports the Virtual RMA (VRMA) facility.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index 471bb3d85e0..a4f6c85431f 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -282,4 +282,9 @@ struct kvm_create_spapr_tce {
 	__u32 window_size;
 };
 
+/* for KVM_ALLOCATE_RMA */
+struct kvm_allocate_rma {
+	__u64 rma_size;
+};
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 5537c45d626..3f91ebd4ae4 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -298,14 +298,6 @@ static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
 static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
 			unsigned long pending_now, unsigned long old_pending)
 {
-	/* Recalculate LPCR:MER based on the presence of
-	 * a pending external interrupt
-	 */
-	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &pending_now) ||
-	    test_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &pending_now))
-		vcpu->arch.lpcr |= LPCR_MER;
-	else
-		vcpu->arch.lpcr &= ~((u64)LPCR_MER);
 }
 
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 0d6d569e19c..f572d9cc31b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -28,6 +28,8 @@
 #include <linux/threads.h>
 #include <linux/spinlock.h>
 #include <linux/kvm_para.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
 #include <asm/kvm_asm.h>
 #include <asm/processor.h>
 
@@ -156,6 +158,14 @@ struct kvmppc_spapr_tce_table {
 	struct page *pages[0];
 };
 
+struct kvmppc_rma_info {
+	void		*base_virt;
+	unsigned long	 base_pfn;
+	unsigned long	 npages;
+	struct list_head list;
+	atomic_t 	 use_count;
+};
+
 struct kvm_arch {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
@@ -169,6 +179,10 @@ struct kvm_arch {
 	unsigned long sdr1;
 	unsigned long host_sdr1;
 	int tlbie_lock;
+	int n_rma_pages;
+	unsigned long lpcr;
+	unsigned long rmor;
+	struct kvmppc_rma_info *rma;
 	struct list_head spapr_tce_tables;
 	unsigned short last_vcpu[NR_CPUS];
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
@@ -295,7 +309,6 @@ struct kvm_vcpu_arch {
 	ulong guest_owned_ext;
 	ulong purr;
 	ulong spurr;
-	ulong lpcr;
 	ulong dscr;
 	ulong amr;
 	ulong uamor;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 6ef73442863..d121f49d62b 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -124,6 +124,10 @@ extern void kvmppc_map_vrma(struct kvm *kvm,
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				struct kvm_create_spapr_tce *args);
+extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
+				struct kvm_allocate_rma *rma);
+extern struct kvmppc_rma_info *kvm_alloc_rma(void);
+extern void kvm_release_rma(struct kvmppc_rma_info *ri);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
@@ -177,9 +181,15 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
 	paca[cpu].kvm_hstate.xics_phys = addr;
 }
+
+extern void kvm_rma_init(void);
+
 #else
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {}
+
+static inline void kvm_rma_init(void)
+{}
 #endif
 
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 36a611b398c..20a053c1427 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -242,6 +242,7 @@
 #define   LPCR_VRMA_LP1	(1ul << (63-16))
 #define   LPCR_VRMASD_SH (63-16)
 #define   LPCR_RMLS    0x1C000000      /* impl dependent rmo limit sel */
+#define	  LPCR_RMLS_SH	(63-37)
 #define   LPCR_ILE     0x02000000      /* !HV irqs set MSR:LE */
 #define   LPCR_PECE	0x00007000	/* powersave exit cause enable */
 #define     LPCR_PECE0	0x00004000	/* ext. exceptions can cause exit */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index d0f2387fd79..f4aba938166 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -437,6 +437,8 @@ int main(void)
 	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
 	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
 	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+	DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
+	DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
 	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
 #endif
@@ -459,7 +461,7 @@ int main(void)
 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
 	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
 	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
-	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
+	DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
 	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index a88bf2713d4..532054f24ec 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -63,6 +63,7 @@
 #include <asm/kexec.h>
 #include <asm/mmu_context.h>
 #include <asm/code-patching.h>
+#include <asm/kvm_ppc.h>
 
 #include "setup.h"
 
@@ -580,6 +581,8 @@ void __init setup_arch(char **cmdline_p)
 	/* Initialize the MMU context management stuff */
 	mmu_context_init();
 
+	kvm_rma_init();
+
 	ppc64_boot_msg(0x15, "Setup Done");
 }
 
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 1de3d54901d..08428e2c188 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -56,7 +56,8 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_64_mmu_hv.o
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv_rm_mmu.o \
-	book3s_64_vio_hv.o
+	book3s_64_vio_hv.o \
+	book3s_hv_builtin.o
 
 kvm-book3s_64-module-objs := \
 	../../../virt/kvm/kvm_main.o \
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 4a4fbec61a1..96ba96a16ab 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -79,103 +79,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 
 void kvmppc_free_hpt(struct kvm *kvm)
 {
-	unsigned long i;
-	struct kvmppc_pginfo *pginfo;
-
 	clear_bit(kvm->arch.lpid, lpid_inuse);
 	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
-
-	if (kvm->arch.ram_pginfo) {
-		pginfo = kvm->arch.ram_pginfo;
-		kvm->arch.ram_pginfo = NULL;
-		for (i = 0; i < kvm->arch.ram_npages; ++i)
-			put_page(pfn_to_page(pginfo[i].pfn));
-		kfree(pginfo);
-	}
-}
-
-static unsigned long user_page_size(unsigned long addr)
-{
-	struct vm_area_struct *vma;
-	unsigned long size = PAGE_SIZE;
-
-	down_read(&current->mm->mmap_sem);
-	vma = find_vma(current->mm, addr);
-	if (vma)
-		size = vma_kernel_pagesize(vma);
-	up_read(&current->mm->mmap_sem);
-	return size;
-}
-
-static pfn_t hva_to_pfn(unsigned long addr)
-{
-	struct page *page[1];
-	int npages;
-
-	might_sleep();
-
-	npages = get_user_pages_fast(addr, 1, 1, page);
-
-	if (unlikely(npages != 1))
-		return 0;
-
-	return page_to_pfn(page[0]);
-}
-
-long kvmppc_prepare_vrma(struct kvm *kvm,
-			 struct kvm_userspace_memory_region *mem)
-{
-	unsigned long psize, porder;
-	unsigned long i, npages;
-	struct kvmppc_pginfo *pginfo;
-	pfn_t pfn;
-	unsigned long hva;
-
-	/* First see what page size we have */
-	psize = user_page_size(mem->userspace_addr);
-	/* For now, only allow 16MB pages */
-	if (psize != 1ul << VRMA_PAGE_ORDER || (mem->memory_size & (psize - 1))) {
-		pr_err("bad psize=%lx memory_size=%llx @ %llx\n",
-		       psize, mem->memory_size, mem->userspace_addr);
-		return -EINVAL;
-	}
-	porder = __ilog2(psize);
-
-	npages = mem->memory_size >> porder;
-	pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), GFP_KERNEL);
-	if (!pginfo) {
-		pr_err("kvmppc_prepare_vrma: couldn't alloc %lu bytes\n",
-		       npages * sizeof(struct kvmppc_pginfo));
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < npages; ++i) {
-		hva = mem->userspace_addr + (i << porder);
-		if (user_page_size(hva) != psize)
-			goto err;
-		pfn = hva_to_pfn(hva);
-		if (pfn == 0) {
-			pr_err("oops, no pfn for hva %lx\n", hva);
-			goto err;
-		}
-		if (pfn & ((1ul << (porder - PAGE_SHIFT)) - 1)) {
-			pr_err("oops, unaligned pfn %llx\n", pfn);
-			put_page(pfn_to_page(pfn));
-			goto err;
-		}
-		pginfo[i].pfn = pfn;
-	}
-
-	kvm->arch.ram_npages = npages;
-	kvm->arch.ram_psize = psize;
-	kvm->arch.ram_porder = porder;
-	kvm->arch.ram_pginfo = pginfo;
-
-	return 0;
-
- err:
-	kfree(pginfo);
-	return -EINVAL;
 }
 
 void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
@@ -199,6 +104,8 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 
 	for (i = 0; i < npages; ++i) {
 		pfn = pginfo[i].pfn;
+		if (!pfn)
+			break;
 		/* can't use hpt_hash since va > 64 bits */
 		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
 		/*
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 36b6d98f119..04da135cae6 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -27,6 +27,8 @@
 #include <linux/fs.h>
 #include <linux/anon_inodes.h>
 #include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -40,11 +42,22 @@
 #include <asm/lppaca.h>
 #include <asm/processor.h>
 #include <asm/cputhreads.h>
+#include <asm/page.h>
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 
+/*
+ * For now, limit memory to 64GB and require it to be large pages.
+ * This value is chosen because it makes the ram_pginfo array be
+ * 64kB in size, which is about as large as we want to be trying
+ * to allocate with kmalloc.
+ */
+#define MAX_MEM_ORDER		36
+
+#define LARGE_PAGE_ORDER	24	/* 16MB pages */
+
 /* #define EXIT_DEBUG */
 /* #define EXIT_DEBUG_SIMPLE */
 /* #define EXIT_DEBUG_INT */
@@ -129,7 +142,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 		pr_err("  ESID = %.16llx VSID = %.16llx\n",
 		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
 	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
-	       vcpu->arch.lpcr, vcpu->kvm->arch.sdr1,
+	       vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1,
 	       vcpu->arch.last_inst);
 }
 
@@ -441,7 +454,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	int err = -EINVAL;
 	int core;
 	struct kvmppc_vcore *vcore;
-	unsigned long lpcr;
 
 	core = id / threads_per_core;
 	if (core >= KVM_MAX_VCORES)
@@ -464,10 +476,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	vcpu->arch.pvr = mfspr(SPRN_PVR);
 	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
 
-	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
-	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
-	vcpu->arch.lpcr = lpcr;
-
 	kvmppc_mmu_book3s_hv_init(vcpu);
 
 	/*
@@ -910,24 +918,216 @@ fail:
 	return ret;
 }
 
+/* Work out RMLS (real mode limit selector) field value for a given RMA size.
+   Assumes POWER7. */
+static inline int lpcr_rmls(unsigned long rma_size)
+{
+	switch (rma_size) {
+	case 32ul << 20:	/* 32 MB */
+		return 8;
+	case 64ul << 20:	/* 64 MB */
+		return 3;
+	case 128ul << 20:	/* 128 MB */
+		return 7;
+	case 256ul << 20:	/* 256 MB */
+		return 4;
+	case 1ul << 30:		/* 1 GB */
+		return 2;
+	case 16ul << 30:	/* 16 GB */
+		return 1;
+	case 256ul << 30:	/* 256 GB */
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvmppc_rma_info *ri = vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= ri->npages)
+		return VM_FAULT_SIGBUS;
+
+	page = pfn_to_page(ri->base_pfn + vmf->pgoff);
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_rma_vm_ops = {
+	.fault = kvm_rma_fault,
+};
+
+static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_ops = &kvm_rma_vm_ops;
+	return 0;
+}
+
+static int kvm_rma_release(struct inode *inode, struct file *filp)
+{
+	struct kvmppc_rma_info *ri = filp->private_data;
+
+	kvm_release_rma(ri);
+	return 0;
+}
+
+static struct file_operations kvm_rma_fops = {
+	.mmap           = kvm_rma_mmap,
+	.release	= kvm_rma_release,
+};
+
+long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
+{
+	struct kvmppc_rma_info *ri;
+	long fd;
+
+	ri = kvm_alloc_rma();
+	if (!ri)
+		return -ENOMEM;
+
+	fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
+	if (fd < 0)
+		kvm_release_rma(ri);
+
+	ret->rma_size = ri->npages << PAGE_SHIFT;
+	return fd;
+}
+
+static struct page *hva_to_page(unsigned long addr)
+{
+	struct page *page[1];
+	int npages;
+
+	might_sleep();
+
+	npages = get_user_pages_fast(addr, 1, 1, page);
+
+	if (unlikely(npages != 1))
+		return 0;
+
+	return page[0];
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
-	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
-		return kvmppc_prepare_vrma(kvm, mem);
+	unsigned long psize, porder;
+	unsigned long i, npages, totalpages;
+	unsigned long pg_ix;
+	struct kvmppc_pginfo *pginfo;
+	unsigned long hva;
+	struct kvmppc_rma_info *ri = NULL;
+	struct page *page;
+
+	/* For now, only allow 16MB pages */
+	porder = LARGE_PAGE_ORDER;
+	psize = 1ul << porder;
+	if ((mem->memory_size & (psize - 1)) ||
+	    (mem->guest_phys_addr & (psize - 1))) {
+		pr_err("bad memory_size=%llx @ %llx\n",
+		       mem->memory_size, mem->guest_phys_addr);
+		return -EINVAL;
+	}
+
+	npages = mem->memory_size >> porder;
+	totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;
+
+	/* More memory than we have space to track? */
+	if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
+		return -EINVAL;
+
+	/* Do we already have an RMA registered? */
+	if (mem->guest_phys_addr == 0 && kvm->arch.rma)
+		return -EINVAL;
+
+	if (totalpages > kvm->arch.ram_npages)
+		kvm->arch.ram_npages = totalpages;
+
+	/* Is this one of our preallocated RMAs? */
+	if (mem->guest_phys_addr == 0) {
+		struct vm_area_struct *vma;
+
+		down_read(&current->mm->mmap_sem);
+		vma = find_vma(current->mm, mem->userspace_addr);
+		if (vma && vma->vm_file &&
+		    vma->vm_file->f_op == &kvm_rma_fops &&
+		    mem->userspace_addr == vma->vm_start)
+			ri = vma->vm_file->private_data;
+		up_read(&current->mm->mmap_sem);
+	}
+
+	if (ri) {
+		unsigned long rma_size;
+		unsigned long lpcr;
+		long rmls;
+
+		rma_size = ri->npages << PAGE_SHIFT;
+		if (rma_size > mem->memory_size)
+			rma_size = mem->memory_size;
+		rmls = lpcr_rmls(rma_size);
+		if (rmls < 0) {
+			pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
+			return -EINVAL;
+		}
+		atomic_inc(&ri->use_count);
+		kvm->arch.rma = ri;
+		kvm->arch.n_rma_pages = rma_size >> porder;
+		lpcr = kvm->arch.lpcr & ~(LPCR_VPM0 | LPCR_VRMA_L);
+		lpcr |= rmls << LPCR_RMLS_SH;
+		kvm->arch.lpcr = lpcr;
+		kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
+		pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
+			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
+	}
+
+	pg_ix = mem->guest_phys_addr >> porder;
+	pginfo = kvm->arch.ram_pginfo + pg_ix;
+	for (i = 0; i < npages; ++i, ++pg_ix) {
+		if (ri && pg_ix < kvm->arch.n_rma_pages) {
+			pginfo[i].pfn = ri->base_pfn +
+				(pg_ix << (porder - PAGE_SHIFT));
+			continue;
+		}
+		hva = mem->userspace_addr + (i << porder);
+		page = hva_to_page(hva);
+		if (!page) {
+			pr_err("oops, no pfn for hva %lx\n", hva);
+			goto err;
+		}
+		/* Check it's a 16MB page */
+		if (!PageHead(page) ||
+		    compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
+			pr_err("page at %lx isn't 16MB (o=%d)\n",
+			       hva, compound_order(page));
+			goto err;
+		}
+		pginfo[i].pfn = page_to_pfn(page);
+	}
+
 	return 0;
+
+ err:
+	return -EINVAL;
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
-	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
+	    !kvm->arch.rma)
 		kvmppc_map_vrma(kvm, mem);
 }
 
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
 	long r;
+	unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
+	long err = -ENOMEM;
+	unsigned long lpcr;
 
 	/* Allocate hashed page table */
 	r = kvmppc_alloc_hpt(kvm);
@@ -935,11 +1135,52 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 		return r;
 
 	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+
+	kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
+				       GFP_KERNEL);
+	if (!kvm->arch.ram_pginfo) {
+		pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
+		       npages * sizeof(struct kvmppc_pginfo));
+		goto out_free;
+	}
+
+	kvm->arch.ram_npages = 0;
+	kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
+	kvm->arch.ram_porder = LARGE_PAGE_ORDER;
+	kvm->arch.rma = NULL;
+	kvm->arch.n_rma_pages = 0;
+
+	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
+	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
+		LPCR_VPM0 | LPCR_VRMA_L;
+	kvm->arch.lpcr = lpcr;
+
+
 	return 0;
+
+ out_free:
+	kvmppc_free_hpt(kvm);
+	return err;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
+	struct kvmppc_pginfo *pginfo;
+	unsigned long i;
+
+	if (kvm->arch.ram_pginfo) {
+		pginfo = kvm->arch.ram_pginfo;
+		kvm->arch.ram_pginfo = NULL;
+		for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
+			if (pginfo[i].pfn)
+				put_page(pfn_to_page(pginfo[i].pfn));
+		kfree(pginfo);
+	}
+	if (kvm->arch.rma) {
+		kvm_release_rma(kvm->arch.rma);
+		kvm->arch.rma = NULL;
+	}
+
 	kvmppc_free_hpt(kvm);
 	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
new file mode 100644
index 00000000000..736df3cbbc5
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+
+#include <asm/cputable.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+/*
+ * This maintains a list of RMAs (real mode areas) for KVM guests to use.
+ * Each RMA has to be physically contiguous and of a size that the
+ * hardware supports.  PPC970 and POWER7 support 64MB, 128MB and 256MB,
+ * and other larger sizes.  Since we are unlikely to be allocate that
+ * much physically contiguous memory after the system is up and running,
+ * we preallocate a set of RMAs in early boot for KVM to use.
+ */
+static unsigned long kvm_rma_size = 64 << 20;	/* 64MB */
+static unsigned long kvm_rma_count;
+
+static int __init early_parse_rma_size(char *p)
+{
+	if (!p)
+		return 1;
+
+	kvm_rma_size = memparse(p, &p);
+
+	return 0;
+}
+early_param("kvm_rma_size", early_parse_rma_size);
+
+static int __init early_parse_rma_count(char *p)
+{
+	if (!p)
+		return 1;
+
+	kvm_rma_count = simple_strtoul(p, NULL, 0);
+
+	return 0;
+}
+early_param("kvm_rma_count", early_parse_rma_count);
+
+static struct kvmppc_rma_info *rma_info;
+static LIST_HEAD(free_rmas);
+static DEFINE_SPINLOCK(rma_lock);
+
+/* Work out RMLS (real mode limit selector) field value for a given RMA size.
+   Assumes POWER7. */
+static inline int lpcr_rmls(unsigned long rma_size)
+{
+	switch (rma_size) {
+	case 32ul << 20:	/* 32 MB */
+		return 8;
+	case 64ul << 20:	/* 64 MB */
+		return 3;
+	case 128ul << 20:	/* 128 MB */
+		return 7;
+	case 256ul << 20:	/* 256 MB */
+		return 4;
+	case 1ul << 30:		/* 1 GB */
+		return 2;
+	case 16ul << 30:	/* 16 GB */
+		return 1;
+	case 256ul << 30:	/* 256 GB */
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+/*
+ * Called at boot time while the bootmem allocator is active,
+ * to allocate contiguous physical memory for the real memory
+ * areas for guests.
+ */
+void kvm_rma_init(void)
+{
+	unsigned long i;
+	unsigned long j, npages;
+	void *rma;
+	struct page *pg;
+
+	/* Only do this on POWER7 in HV mode */
+	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
+		return;
+
+	if (!kvm_rma_size || !kvm_rma_count)
+		return;
+
+	/* Check that the requested size is one supported in hardware */
+	if (lpcr_rmls(kvm_rma_size) < 0) {
+		pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
+		return;
+	}
+
+	npages = kvm_rma_size >> PAGE_SHIFT;
+	rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info));
+	for (i = 0; i < kvm_rma_count; ++i) {
+		rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size);
+		pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma,
+			kvm_rma_size >> 20);
+		rma_info[i].base_virt = rma;
+		rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT;
+		rma_info[i].npages = npages;
+		list_add_tail(&rma_info[i].list, &free_rmas);
+		atomic_set(&rma_info[i].use_count, 0);
+
+		pg = pfn_to_page(rma_info[i].base_pfn);
+		for (j = 0; j < npages; ++j) {
+			atomic_inc(&pg->_count);
+			++pg;
+		}
+	}
+}
+
+struct kvmppc_rma_info *kvm_alloc_rma(void)
+{
+	struct kvmppc_rma_info *ri;
+
+	ri = NULL;
+	spin_lock(&rma_lock);
+	if (!list_empty(&free_rmas)) {
+		ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list);
+		list_del(&ri->list);
+		atomic_inc(&ri->use_count);
+	}
+	spin_unlock(&rma_lock);
+	return ri;
+}
+EXPORT_SYMBOL_GPL(kvm_alloc_rma);
+
+void kvm_release_rma(struct kvmppc_rma_info *ri)
+{
+	if (atomic_dec_and_test(&ri->use_count)) {
+		spin_lock(&rma_lock);
+		list_add_tail(&ri->list, &free_rmas);
+		spin_unlock(&rma_lock);
+
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_release_rma);
+
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index c9bf177b7cf..9ee223c3528 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -235,10 +235,10 @@ kvmppc_hv_entry:
 	bne	21b
 
 	/* Primary thread switches to guest partition. */
+	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	lwz	r6,VCPU_PTID(r4)
 	cmpwi	r6,0
 	bne	20f
-	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	ld	r6,KVM_SDR1(r9)
 	lwz	r7,KVM_LPID(r9)
 	li	r0,LPID_RSVD		/* switch to reserved LPID */
@@ -255,8 +255,18 @@ kvmppc_hv_entry:
 20:	lbz	r0,VCORE_IN_GUEST(r5)
 	cmpwi	r0,0
 	beq	20b
-10:	ld	r8,VCPU_LPCR(r4)
-	mtspr	SPRN_LPCR,r8
+
+	/* Set LPCR.  Set the MER bit if there is a pending external irq. */
+10:	ld	r8,KVM_LPCR(r9)
+	ld	r0,VCPU_PENDING_EXC(r4)
+	li	r7,(1 << BOOK3S_IRQPRIO_EXTERNAL)
+	oris	r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+	and.	r0,r0,r7
+	beq	11f
+	ori	r8,r8,LPCR_MER
+11:	mtspr	SPRN_LPCR,r8
+	ld	r8,KVM_RMOR(r9)
+	mtspr	SPRN_RMOR,r8
 	isync
 
 	/* Check if HDEC expires soon */
@@ -464,7 +474,8 @@ hcall_real_cont:
 	/* Check for mediated interrupts (could be done earlier really ...) */
 	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
 	bne+	1f
-	ld	r5,VCPU_LPCR(r9)
+	ld	r5,VCPU_KVM(r9)
+	ld	r5,KVM_LPCR(r5)
 	andi.	r0,r11,MSR_EE
 	beq	1f
 	andi.	r0,r5,LPCR_MER
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 4c549664c98..72c506505fa 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -211,6 +211,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_SMT:
 		r = threads_per_core;
 		break;
+	case KVM_CAP_PPC_RMA:
+		r = 1;
+		break;
 #endif
 	default:
 		r = 0;
@@ -673,6 +676,16 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
 		goto out;
 	}
+
+	case KVM_ALLOCATE_RMA: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_allocate_rma rma;
+
+		r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
+		if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
+			r = -EFAULT;
+		break;
+	}
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 
 	default:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index e2a378d9716..2c366b52f50 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -552,6 +552,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_BOOKE_SREGS 62
 #define KVM_CAP_SPAPR_TCE 63
 #define KVM_CAP_PPC_SMT 64
+#define KVM_CAP_PPC_RMA	65
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -755,6 +756,8 @@ struct kvm_clock_data {
 #define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
 #define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
 #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
+/* Available with KVM_CAP_RMA */
+#define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
-- 
cgit v1.2.3-70-g09d2


From 969391c58a4efb8411d6881179945f425ad9cbb5 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:26:11 +0000
Subject: powerpc, KVM: Split HVMODE_206 cpu feature bit into separate HV and
 architecture bits

This replaces the single CPU_FTR_HVMODE_206 bit with two bits, one to
indicate that we have a usable hypervisor mode, and another to indicate
that the processor conforms to PowerISA version 2.06.  We also add
another bit to indicate that the processor conforms to ISA version 2.01
and set that for PPC970 and derivatives.

Some PPC970 chips (specifically those in Apple machines) have a
hypervisor mode in that MSR[HV] is always 1, but the hypervisor mode
is not useful in the sense that there is no way to run any code in
supervisor mode (HV=0 PR=0).  On these processors, the LPES0 and LPES1
bits in HID4 are always 0, and we use that as a way of detecting that
hypervisor mode is not useful.

Where we have a feature section in assembly code around code that
only applies on POWER7 in hypervisor mode, we use a construct like

END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)

The definition of END_FTR_SECTION_IFSET is such that the code will
be enabled (not overwritten with nops) only if all bits in the
provided mask are set.

Note that the CPU feature check in __tlbie() only needs to check the
ARCH_206 bit, not the HVMODE bit, because __tlbie() can only get called
if we are running bare-metal, i.e. in hypervisor mode.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/cputable.h    | 14 ++++++++------
 arch/powerpc/include/asm/reg.h         | 16 ++++++++++++----
 arch/powerpc/kernel/cpu_setup_power7.S |  4 ++--
 arch/powerpc/kernel/cpu_setup_ppc970.S | 26 ++++++++++++++++++++++----
 arch/powerpc/kernel/exceptions-64s.S   |  4 ++--
 arch/powerpc/kernel/paca.c             |  2 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c    |  3 ++-
 arch/powerpc/kvm/book3s_hv.c           |  3 ++-
 arch/powerpc/kvm/book3s_hv_builtin.c   |  4 ++--
 arch/powerpc/kvm/book3s_segment.S      |  2 +-
 arch/powerpc/mm/hash_native_64.c       |  4 ++--
 11 files changed, 56 insertions(+), 26 deletions(-)

(limited to 'arch/powerpc/kvm/book3s_hv.c')

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index c0d842cfd01..e30442c539c 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -179,8 +179,9 @@ extern const char *powerpc_base_platform;
 #define LONG_ASM_CONST(x)		0
 #endif
 
-
-#define CPU_FTR_HVMODE_206		LONG_ASM_CONST(0x0000000800000000)
+#define CPU_FTR_HVMODE			LONG_ASM_CONST(0x0000000200000000)
+#define CPU_FTR_ARCH_201		LONG_ASM_CONST(0x0000000400000000)
+#define CPU_FTR_ARCH_206		LONG_ASM_CONST(0x0000000800000000)
 #define CPU_FTR_CFAR			LONG_ASM_CONST(0x0000001000000000)
 #define CPU_FTR_IABR			LONG_ASM_CONST(0x0000002000000000)
 #define CPU_FTR_MMCRA			LONG_ASM_CONST(0x0000004000000000)
@@ -401,9 +402,10 @@ extern const char *powerpc_base_platform;
 	    CPU_FTR_MMCRA | CPU_FTR_CP_USE_DCBTZ | \
 	    CPU_FTR_STCX_CHECKS_ADDRESS)
 #define CPU_FTRS_PPC970	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
+	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_201 | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA | \
-	    CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS)
+	    CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS | \
+	    CPU_FTR_HVMODE)
 #define CPU_FTRS_POWER5	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -417,13 +419,13 @@ extern const char *powerpc_base_platform;
 	    CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \
 	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_CFAR)
 #define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_HVMODE_206 |\
+	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\
 	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
 	    CPU_FTR_COHERENT_ICACHE | \
 	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
 	    CPU_FTR_DSCR | CPU_FTR_SAO  | CPU_FTR_ASYM_SMT | \
 	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-	    CPU_FTR_ICSWX | CPU_FTR_CFAR)
+	    CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE)
 #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 20a053c1427..ddbe57ae858 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -307,6 +307,7 @@
 #define SPRN_HASH1	0x3D2		/* Primary Hash Address Register */
 #define SPRN_HASH2	0x3D3		/* Secondary Hash Address Resgister */
 #define SPRN_HID0	0x3F0		/* Hardware Implementation Register 0 */
+#define HID0_HDICE_SH	(63 - 23)	/* 970 HDEC interrupt enable */
 #define HID0_EMCP	(1<<31)		/* Enable Machine Check pin */
 #define HID0_EBA	(1<<29)		/* Enable Bus Address Parity */
 #define HID0_EBD	(1<<28)		/* Enable Bus Data Parity */
@@ -362,6 +363,13 @@
 #define SPRN_IABR2	0x3FA		/* 83xx */
 #define SPRN_IBCR	0x135		/* 83xx Insn Breakpoint Control Reg */
 #define SPRN_HID4	0x3F4		/* 970 HID4 */
+#define  HID4_LPES0	 (1ul << (63-0)) /* LPAR env. sel. bit 0 */
+#define	 HID4_RMLS2_SH	 (63 - 2)	/* Real mode limit bottom 2 bits */
+#define	 HID4_LPID5_SH	 (63 - 6)	/* partition ID bottom 4 bits */
+#define	 HID4_RMOR_SH	 (63 - 22)	/* real mode offset (16 bits) */
+#define  HID4_LPES1	 (1 << (63-57))	/* LPAR env. sel. bit 1 */
+#define  HID4_RMLS0_SH	 (63 - 58)	/* Real mode limit top bit */
+#define	 HID4_LPID1_SH	 0		/* partition ID top 2 bits */
 #define SPRN_HID4_GEKKO	0x3F3		/* Gekko HID4 */
 #define SPRN_HID5	0x3F6		/* 970 HID5 */
 #define SPRN_HID6	0x3F9	/* BE HID 6 */
@@ -811,28 +819,28 @@
 	mfspr	rX,SPRN_SPRG_PACA;			\
 	FTR_SECTION_ELSE_NESTED(66);			\
 	mfspr	rX,SPRN_SPRG_HPACA;			\
-	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #define SET_PACA(rX)					\
 	BEGIN_FTR_SECTION_NESTED(66);			\
 	mtspr	SPRN_SPRG_PACA,rX;			\
 	FTR_SECTION_ELSE_NESTED(66);			\
 	mtspr	SPRN_SPRG_HPACA,rX;			\
-	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #define GET_SCRATCH0(rX)				\
 	BEGIN_FTR_SECTION_NESTED(66);			\
 	mfspr	rX,SPRN_SPRG_SCRATCH0;			\
 	FTR_SECTION_ELSE_NESTED(66);			\
 	mfspr	rX,SPRN_SPRG_HSCRATCH0;			\
-	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #define SET_SCRATCH0(rX)				\
 	BEGIN_FTR_SECTION_NESTED(66);			\
 	mtspr	SPRN_SPRG_SCRATCH0,rX;			\
 	FTR_SECTION_ELSE_NESTED(66);			\
 	mtspr	SPRN_SPRG_HSCRATCH0,rX;			\
-	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #else /* CONFIG_PPC_BOOK3S_64 */
 #define GET_SCRATCH0(rX)	mfspr	rX,SPRN_SPRG_SCRATCH0
diff --git a/arch/powerpc/kernel/cpu_setup_power7.S b/arch/powerpc/kernel/cpu_setup_power7.S
index 2ef6749688e..76797c5105d 100644
--- a/arch/powerpc/kernel/cpu_setup_power7.S
+++ b/arch/powerpc/kernel/cpu_setup_power7.S
@@ -45,12 +45,12 @@ _GLOBAL(__restore_cpu_power7)
 	blr
 
 __init_hvmode_206:
-	/* Disable CPU_FTR_HVMODE_206 and exit if MSR:HV is not set */
+	/* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */
 	mfmsr	r3
 	rldicl.	r0,r3,4,63
 	bnelr
 	ld	r5,CPU_SPEC_FEATURES(r4)
-	LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE_206)
+	LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
 	xor	r5,r5,r6
 	std	r5,CPU_SPEC_FEATURES(r4)
 	blr
diff --git a/arch/powerpc/kernel/cpu_setup_ppc970.S b/arch/powerpc/kernel/cpu_setup_ppc970.S
index 27f2507279d..12fac8df01c 100644
--- a/arch/powerpc/kernel/cpu_setup_ppc970.S
+++ b/arch/powerpc/kernel/cpu_setup_ppc970.S
@@ -76,7 +76,7 @@ _GLOBAL(__setup_cpu_ppc970)
 	/* Do nothing if not running in HV mode */
 	mfmsr	r0
 	rldicl.	r0,r0,4,63
-	beqlr
+	beq	no_hv_mode
 
 	mfspr	r0,SPRN_HID0
 	li	r11,5			/* clear DOZE and SLEEP */
@@ -90,7 +90,7 @@ _GLOBAL(__setup_cpu_ppc970MP)
 	/* Do nothing if not running in HV mode */
 	mfmsr	r0
 	rldicl.	r0,r0,4,63
-	beqlr
+	beq	no_hv_mode
 
 	mfspr	r0,SPRN_HID0
 	li	r11,0x15		/* clear DOZE and SLEEP */
@@ -109,6 +109,14 @@ load_hids:
 	sync
 	isync
 
+	/* Try to set LPES = 01 in HID4 */
+	mfspr	r0,SPRN_HID4
+	clrldi	r0,r0,1			/* clear LPES0 */
+	ori	r0,r0,HID4_LPES1	/* set LPES1 */
+	sync
+	mtspr	SPRN_HID4,r0
+	isync
+
 	/* Save away cpu state */
 	LOAD_REG_ADDR(r5,cpu_state_storage)
 
@@ -117,11 +125,21 @@ load_hids:
 	std	r3,CS_HID0(r5)
 	mfspr	r3,SPRN_HID1
 	std	r3,CS_HID1(r5)
-	mfspr	r3,SPRN_HID4
-	std	r3,CS_HID4(r5)
+	mfspr	r4,SPRN_HID4
+	std	r4,CS_HID4(r5)
 	mfspr	r3,SPRN_HID5
 	std	r3,CS_HID5(r5)
 
+	/* See if we successfully set LPES1 to 1; if not we are in Apple mode */
+	andi.	r4,r4,HID4_LPES1
+	bnelr
+
+no_hv_mode:
+	/* Disable CPU_FTR_HVMODE and exit, since we don't have HV mode */
+	ld	r5,CPU_SPEC_FEATURES(r4)
+	LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
+	andc	r5,r5,r6
+	std	r5,CPU_SPEC_FEATURES(r4)
 	blr
 
 /* Called with no MMU context (typically MSR:IR/DR off) to
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 5bc06fdfa6c..a5345380bef 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -75,7 +75,7 @@ BEGIN_FTR_SECTION
 	b	.power7_wakeup_noloss
 2:	b	.power7_wakeup_loss
 9:
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif /* CONFIG_PPC_P7_NAP */
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
 				 NOTEST, 0x100)
@@ -173,7 +173,7 @@ hardware_interrupt_hv:
 		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
 					    EXC_STD, SOFTEN_TEST_PR)
 		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
-	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE_206)
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 
 	STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
 	KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x600)
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index efeb8818418..0a5a899846b 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -167,7 +167,7 @@ void setup_paca(struct paca_struct *new_paca)
 	 * if we do a GET_PACA() before the feature fixups have been
 	 * applied
 	 */
-	if (cpu_has_feature(CPU_FTR_HVMODE_206))
+	if (cpu_has_feature(CPU_FTR_HVMODE))
 		mtspr(SPRN_SPRG_HPACA, local_paca);
 #endif
 	mtspr(SPRN_SPRG_PACA, local_paca);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 96ba96a16ab..212dcd8fc50 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -128,7 +128,8 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 
 int kvmppc_mmu_hv_init(void)
 {
-	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
+	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
+	    !cpu_has_feature(CPU_FTR_ARCH_206))
 		return -EINVAL;
 	memset(lpid_inuse, 0, sizeof(lpid_inuse));
 	set_bit(mfspr(SPRN_LPID), lpid_inuse);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 04da135cae6..dc70e7745ab 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -443,7 +443,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 int kvmppc_core_check_processor_compat(void)
 {
-	if (cpu_has_feature(CPU_FTR_HVMODE_206))
+	if (cpu_has_feature(CPU_FTR_HVMODE) &&
+	    cpu_has_feature(CPU_FTR_ARCH_206))
 		return 0;
 	return -EIO;
 }
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 736df3cbbc5..7315ec6e817 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -90,8 +90,8 @@ void kvm_rma_init(void)
 	void *rma;
 	struct page *pg;
 
-	/* Only do this on POWER7 in HV mode */
-	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
+	/* Only do this in HV mode */
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
 		return;
 
 	if (!kvm_rma_size || !kvm_rma_count)
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 134501691ad..aed32e51721 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -170,7 +170,7 @@ BEGIN_FTR_SECTION
 	mfspr	r4,SPRN_HSRR1
 	andi.	r12,r12,0x3ffd
 	b	2f
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 #endif
 1:	mfsrr0	r3
 	mfsrr1	r4
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index dfd764896db..b44f5f80305 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -51,7 +51,7 @@ static inline void __tlbie(unsigned long va, int psize, int ssize)
 		va &= ~0xffful;
 		va |= ssize << 8;
 		asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
-			     : : "r" (va), "r"(0), "i" (CPU_FTR_HVMODE_206)
+			     : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
 			     : "memory");
 		break;
 	default:
@@ -61,7 +61,7 @@ static inline void __tlbie(unsigned long va, int psize, int ssize)
 		va |= ssize << 8;
 		va |= 1; /* L */
 		asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2)
-			     : : "r" (va), "r"(0), "i" (CPU_FTR_HVMODE_206)
+			     : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
 			     : "memory");
 		break;
 	}
-- 
cgit v1.2.3-70-g09d2


From 9e368f2915601cd5bc7f5fd638b58435b018bbd7 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Jun 2011 00:40:08 +0000
Subject: KVM: PPC: book3s_hv: Add support for PPC970-family processors

This adds support for running KVM guests in supervisor mode on those
PPC970 processors that have a usable hypervisor mode.  Unfortunately,
Apple G5 machines have supervisor mode disabled (MSR[HV] is forced to
1), but the YDL PowerStation does have a usable hypervisor mode.

There are several differences between the PPC970 and POWER7 in how
guests are managed.  These differences are accommodated using the
CPU_FTR_ARCH_201 (PPC970) and CPU_FTR_ARCH_206 (POWER7) CPU feature
bits.  Notably, on PPC970:

* The LPCR, LPID or RMOR registers don't exist, and the functions of
  those registers are provided by bits in HID4 and one bit in HID0.

* External interrupts can be directed to the hypervisor, but unlike
  POWER7 they are masked by MSR[EE] in non-hypervisor modes and use
  SRR0/1 not HSRR0/1.

* There is no virtual RMA (VRMA) mode; the guest must use an RMO
  (real mode offset) area.

* The TLB entries are not tagged with the LPID, so it is necessary to
  flush the whole TLB on partition switch.  Furthermore, when switching
  partitions we have to ensure that no other CPU is executing the tlbie
  or tlbsync instructions in either the old or the new partition,
  otherwise undefined behaviour can occur.

* The PMU has 8 counters (PMC registers) rather than 6.

* The DSCR, PURR, SPURR, AMR, AMOR, UAMOR registers don't exist.

* The SLB has 64 entries rather than 32.

* There is no mediated external interrupt facility, so if we switch to
  a guest that has a virtual external interrupt pending but the guest
  has MSR[EE] = 0, we have to arrange to have an interrupt pending for
  it so that we can get control back once it re-enables interrupts.  We
  do that by sending ourselves an IPI with smp_send_reschedule after
  hard-disabling interrupts.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/exception-64s.h  |   4 +
 arch/powerpc/include/asm/kvm_book3s_asm.h |   2 +-
 arch/powerpc/include/asm/kvm_host.h       |   2 +-
 arch/powerpc/kernel/asm-offsets.c         |   1 +
 arch/powerpc/kernel/exceptions-64s.S      |   2 +-
 arch/powerpc/kvm/Kconfig                  |  13 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c       |  30 ++--
 arch/powerpc/kvm/book3s_hv.c              |  60 ++++++--
 arch/powerpc/kvm/book3s_hv_builtin.c      |  11 +-
 arch/powerpc/kvm/book3s_hv_interrupts.S   |  30 ++++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c       |   6 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 230 +++++++++++++++++++++++++++++-
 arch/powerpc/kvm/powerpc.c                |   3 +
 arch/powerpc/mm/hash_native_64.c          |   2 +-
 14 files changed, 354 insertions(+), 42 deletions(-)

(limited to 'arch/powerpc/kvm/book3s_hv.c')

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 69435da8f2b..8057f4f6980 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -246,6 +246,10 @@ label##_hv:						\
 	KVMTEST(vec);							\
 	_SOFTEN_TEST(EXC_HV)
 
+#define SOFTEN_TEST_HV_201(vec)						\
+	KVMTEST(vec);							\
+	_SOFTEN_TEST(EXC_STD)
+
 #define __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)		\
 	HMT_MEDIUM;							\
 	SET_SCRATCH0(r13);    /* save r13 */				\
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 9cfd5436782..ef7b3688c3b 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -82,7 +82,7 @@ struct kvmppc_host_state {
 	unsigned long xics_phys;
 	u64 dabr;
 	u64 host_mmcr[3];
-	u32 host_pmc[6];
+	u32 host_pmc[8];
 	u64 host_purr;
 	u64 host_spurr;
 	u64 host_dscr;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index f572d9cc31b..cc22b282d75 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -353,7 +353,7 @@ struct kvm_vcpu_arch {
 	u32 dbsr;
 
 	u64 mmcr[3];
-	u32 pmc[6];
+	u32 pmc[8];
 
 #ifdef CONFIG_KVM_EXIT_TIMING
 	struct mutex exit_timing_lock;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index f4aba938166..54b935f2f5d 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -128,6 +128,7 @@ int main(void)
 	DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page));
 	/* paca */
 	DEFINE(PACA_SIZE, sizeof(struct paca_struct));
+	DEFINE(PACA_LOCK_TOKEN, offsetof(struct paca_struct, lock_token));
 	DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index));
 	DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start));
 	DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack));
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index a5345380bef..41b02c792aa 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -171,7 +171,7 @@ hardware_interrupt_hv:
 		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
 	FTR_SECTION_ELSE
 		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
-					    EXC_STD, SOFTEN_TEST_PR)
+					    EXC_STD, SOFTEN_TEST_HV_201)
 		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 5d9b78ebbaa..eeb42e06f2d 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -67,23 +67,20 @@ config KVM_BOOK3S_64
 	  If unsure, say N.
 
 config KVM_BOOK3S_64_HV
-	bool "KVM support for POWER7 using hypervisor mode in host"
+	bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
 	depends on KVM_BOOK3S_64
 	---help---
 	  Support running unmodified book3s_64 guest kernels in
-	  virtual machines on POWER7 processors that have hypervisor
-	  mode available to the host.
+	  virtual machines on POWER7 and PPC970 processors that have
+	  hypervisor mode available to the host.
 
 	  If you say Y here, KVM will use the hardware virtualization
 	  facilities of POWER7 (and later) processors, meaning that
 	  guest operating systems will run at full hardware speed
 	  using supervisor and user modes.  However, this also means
 	  that KVM is not usable under PowerVM (pHyp), is only usable
-	  on POWER7 (or later) processors, and can only emulate
-	  POWER5+, POWER6 and POWER7 processors.
-
-	  This module provides access to the hardware capabilities through
-	  a character device node named /dev/kvm.
+	  on POWER7 (or later) processors and PPC970-family processors,
+	  and cannot emulate a different processor from the host processor.
 
 	  If unsure, say N.
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 212dcd8fc50..bc3a2ea9421 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -42,6 +42,8 @@
 #define VRMA_PAGE_ORDER	24
 #define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
 
+/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
+#define MAX_LPID_970	63
 #define NR_LPIDS	(LPID_RSVD + 1)
 unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
 
@@ -69,9 +71,6 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 
 	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
 	kvm->arch.lpid = lpid;
-	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
-	kvm->arch.host_lpid = mfspr(SPRN_LPID);
-	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);
 
 	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
 	return 0;
@@ -128,12 +127,24 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 
 int kvmppc_mmu_hv_init(void)
 {
-	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
-	    !cpu_has_feature(CPU_FTR_ARCH_206))
+	unsigned long host_lpid, rsvd_lpid;
+
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
 		return -EINVAL;
+
 	memset(lpid_inuse, 0, sizeof(lpid_inuse));
-	set_bit(mfspr(SPRN_LPID), lpid_inuse);
-	set_bit(LPID_RSVD, lpid_inuse);
+
+	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+		host_lpid = mfspr(SPRN_LPID);	/* POWER7 */
+		rsvd_lpid = LPID_RSVD;
+	} else {
+		host_lpid = 0;			/* PPC970 */
+		rsvd_lpid = MAX_LPID_970;
+	}
+
+	set_bit(host_lpid, lpid_inuse);
+	/* rsvd_lpid is reserved for use in partition switching */
+	set_bit(rsvd_lpid, lpid_inuse);
 
 	return 0;
 }
@@ -157,7 +168,10 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
 
-	vcpu->arch.slb_nr = 32;		/* Assume POWER7 for now */
+	if (cpu_has_feature(CPU_FTR_ARCH_206))
+		vcpu->arch.slb_nr = 32;		/* POWER7 */
+	else
+		vcpu->arch.slb_nr = 64;
 
 	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
 	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index dc70e7745ab..cc0d7f1b19a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -443,8 +443,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 int kvmppc_core_check_processor_compat(void)
 {
-	if (cpu_has_feature(CPU_FTR_HVMODE) &&
-	    cpu_has_feature(CPU_FTR_ARCH_206))
+	if (cpu_has_feature(CPU_FTR_HVMODE))
 		return 0;
 	return -EIO;
 }
@@ -731,6 +730,10 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		return -EINTR;
 	}
 
+	/* On PPC970, check that we have an RMA region */
+	if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
+		return -EPERM;
+
 	kvm_run->exit_reason = 0;
 	vcpu->arch.ret = RESUME_GUEST;
 	vcpu->arch.trap = 0;
@@ -920,12 +923,14 @@ fail:
 }
 
 /* Work out RMLS (real mode limit selector) field value for a given RMA size.
-   Assumes POWER7. */
+   Assumes POWER7 or PPC970. */
 static inline int lpcr_rmls(unsigned long rma_size)
 {
 	switch (rma_size) {
 	case 32ul << 20:	/* 32 MB */
-		return 8;
+		if (cpu_has_feature(CPU_FTR_ARCH_206))
+			return 8;	/* only supported on POWER7 */
+		return -1;
 	case 64ul << 20:	/* 64 MB */
 		return 3;
 	case 128ul << 20:	/* 128 MB */
@@ -1059,6 +1064,10 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		    mem->userspace_addr == vma->vm_start)
 			ri = vma->vm_file->private_data;
 		up_read(&current->mm->mmap_sem);
+		if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) {
+			pr_err("CPU requires an RMO\n");
+			return -EINVAL;
+		}
 	}
 
 	if (ri) {
@@ -1077,10 +1086,25 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 		atomic_inc(&ri->use_count);
 		kvm->arch.rma = ri;
 		kvm->arch.n_rma_pages = rma_size >> porder;
-		lpcr = kvm->arch.lpcr & ~(LPCR_VPM0 | LPCR_VRMA_L);
-		lpcr |= rmls << LPCR_RMLS_SH;
+
+		/* Update LPCR and RMOR */
+		lpcr = kvm->arch.lpcr;
+		if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+			/* PPC970; insert RMLS value (split field) in HID4 */
+			lpcr &= ~((1ul << HID4_RMLS0_SH) |
+				  (3ul << HID4_RMLS2_SH));
+			lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) |
+				((rmls & 3) << HID4_RMLS2_SH);
+			/* RMOR is also in HID4 */
+			lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
+				<< HID4_RMOR_SH;
+		} else {
+			/* POWER7 */
+			lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
+			lpcr |= rmls << LPCR_RMLS_SH;
+			kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
+		}
 		kvm->arch.lpcr = lpcr;
-		kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
 		pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
 			ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
 	}
@@ -1151,11 +1175,25 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 	kvm->arch.rma = NULL;
 	kvm->arch.n_rma_pages = 0;
 
-	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
-	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
-		LPCR_VPM0 | LPCR_VRMA_L;
-	kvm->arch.lpcr = lpcr;
+	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 
+	if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+		/* PPC970; HID4 is effectively the LPCR */
+		unsigned long lpid = kvm->arch.lpid;
+		kvm->arch.host_lpid = 0;
+		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
+		lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
+		lpcr |= ((lpid >> 4) << HID4_LPID1_SH) |
+			((lpid & 0xf) << HID4_LPID5_SH);
+	} else {
+		/* POWER7; init LPCR for virtual RMA mode */
+		kvm->arch.host_lpid = mfspr(SPRN_LPID);
+		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
+		lpcr &= LPCR_PECE | LPCR_LPES;
+		lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
+			LPCR_VPM0 | LPCR_VRMA_L;
+	}
+	kvm->arch.lpcr = lpcr;
 
 	return 0;
 
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 7315ec6e817..d43120355ee 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -55,12 +55,14 @@ static LIST_HEAD(free_rmas);
 static DEFINE_SPINLOCK(rma_lock);
 
 /* Work out RMLS (real mode limit selector) field value for a given RMA size.
-   Assumes POWER7. */
+   Assumes POWER7 or PPC970. */
 static inline int lpcr_rmls(unsigned long rma_size)
 {
 	switch (rma_size) {
 	case 32ul << 20:	/* 32 MB */
-		return 8;
+		if (cpu_has_feature(CPU_FTR_ARCH_206))
+			return 8;	/* only supported on POWER7 */
+		return -1;
 	case 64ul << 20:	/* 64 MB */
 		return 3;
 	case 128ul << 20:	/* 128 MB */
@@ -90,8 +92,9 @@ void kvm_rma_init(void)
 	void *rma;
 	struct page *pg;
 
-	/* Only do this in HV mode */
-	if (!cpu_has_feature(CPU_FTR_HVMODE))
+	/* Only do this on PPC970 in HV mode */
+	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
+	    !cpu_has_feature(CPU_FTR_ARCH_201))
 		return;
 
 	if (!kvm_rma_size || !kvm_rma_count)
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 532afaf1984..3f7b674dd4b 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -50,8 +50,10 @@ _GLOBAL(__kvmppc_vcore_entry)
 	SAVE_NVGPRS(r1)
 
 	/* Save host DSCR */
+BEGIN_FTR_SECTION
 	mfspr	r3, SPRN_DSCR
 	std	r3, HSTATE_DSCR(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Save host DABR */
 	mfspr	r3, SPRN_DABR
@@ -86,12 +88,20 @@ _GLOBAL(__kvmppc_vcore_entry)
 	mfspr	r7, SPRN_PMC4
 	mfspr	r8, SPRN_PMC5
 	mfspr	r9, SPRN_PMC6
+BEGIN_FTR_SECTION
+	mfspr	r10, SPRN_PMC7
+	mfspr	r11, SPRN_PMC8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	stw	r3, HSTATE_PMC(r13)
 	stw	r5, HSTATE_PMC + 4(r13)
 	stw	r6, HSTATE_PMC + 8(r13)
 	stw	r7, HSTATE_PMC + 12(r13)
 	stw	r8, HSTATE_PMC + 16(r13)
 	stw	r9, HSTATE_PMC + 20(r13)
+BEGIN_FTR_SECTION
+	stw	r10, HSTATE_PMC + 24(r13)
+	stw	r11, HSTATE_PMC + 28(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 31:
 
 	/*
@@ -105,6 +115,26 @@ _GLOBAL(__kvmppc_vcore_entry)
 	add	r8,r8,r7
 	std	r8,HSTATE_DECEXP(r13)
 
+	/*
+	 * On PPC970, if the guest vcpu has an external interrupt pending,
+	 * send ourselves an IPI so as to interrupt the guest once it
+	 * enables interrupts.  (It must have interrupts disabled,
+	 * otherwise we would already have delivered the interrupt.)
+	 */
+BEGIN_FTR_SECTION
+	ld	r0, VCPU_PENDING_EXC(r4)
+	li	r7, (1 << BOOK3S_IRQPRIO_EXTERNAL)
+	oris	r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+	and.	r0, r0, r7
+	beq	32f
+	mr	r31, r4
+	lhz	r3, PACAPACAINDEX(r13)
+	bl	smp_send_reschedule
+	nop
+	mr	r4, r31
+32:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+
 	/* Jump to partition switch code */
 	bl	.kvmppc_hv_entry_trampoline
 	nop
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index edb0aae901a..fcfe6b05555 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -56,7 +56,8 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	/* only handle 4k, 64k and 16M pages for now */
 	porder = 12;
 	if (pteh & HPTE_V_LARGE) {
-		if ((ptel & 0xf000) == 0x1000) {
+		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+		    (ptel & 0xf000) == 0x1000) {
 			/* 64k page */
 			porder = 16;
 		} else if ((ptel & 0xff000) == 0) {
@@ -126,7 +127,8 @@ static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 	va_low &= 0x7ff;
 	if (v & HPTE_V_LARGE) {
 		rb |= 1;			/* L field */
-		if (r & 0xff000) {
+		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+		    (r & 0xff000)) {
 			/* non-16MB large page, must be 64k */
 			/* (masks depend on page size) */
 			rb |= 0x1000;		/* page encoding in LP field */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9ee223c3528..6dd33581a22 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -148,12 +148,20 @@ kvmppc_hv_entry:
 	lwz	r7, VCPU_PMC + 12(r4)
 	lwz	r8, VCPU_PMC + 16(r4)
 	lwz	r9, VCPU_PMC + 20(r4)
+BEGIN_FTR_SECTION
+	lwz	r10, VCPU_PMC + 24(r4)
+	lwz	r11, VCPU_PMC + 28(r4)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_PMC1, r3
 	mtspr	SPRN_PMC2, r5
 	mtspr	SPRN_PMC3, r6
 	mtspr	SPRN_PMC4, r7
 	mtspr	SPRN_PMC5, r8
 	mtspr	SPRN_PMC6, r9
+BEGIN_FTR_SECTION
+	mtspr	SPRN_PMC7, r10
+	mtspr	SPRN_PMC8, r11
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	ld	r3, VCPU_MMCR(r4)
 	ld	r5, VCPU_MMCR + 8(r4)
 	ld	r6, VCPU_MMCR + 16(r4)
@@ -165,9 +173,11 @@ kvmppc_hv_entry:
 	/* Load up FP, VMX and VSX registers */
 	bl	kvmppc_load_fp
 
+BEGIN_FTR_SECTION
 	/* Switch DSCR to guest value */
 	ld	r5, VCPU_DSCR(r4)
 	mtspr	SPRN_DSCR, r5
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/*
 	 * Set the decrementer to the guest decrementer.
@@ -210,6 +220,7 @@ kvmppc_hv_entry:
 	mtspr	SPRN_DABRX,r5
 	mtspr	SPRN_DABR,r6
 
+BEGIN_FTR_SECTION
 	/* Restore AMR and UAMOR, set AMOR to all 1s */
 	ld	r5,VCPU_AMR(r4)
 	ld	r6,VCPU_UAMOR(r4)
@@ -217,6 +228,7 @@ kvmppc_hv_entry:
 	mtspr	SPRN_AMR,r5
 	mtspr	SPRN_UAMOR,r6
 	mtspr	SPRN_AMOR,r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Clear out SLB */
 	li	r6,0
@@ -224,6 +236,14 @@ kvmppc_hv_entry:
 	slbia
 	ptesync
 
+BEGIN_FTR_SECTION
+	b	30f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+	/*
+	 * POWER7 host -> guest partition switch code.
+	 * We don't have to lock against concurrent tlbies,
+	 * but we do have to coordinate across hardware threads.
+	 */
 	/* Increment entry count iff exit count is zero. */
 	ld	r5,HSTATE_KVM_VCORE(r13)
 	addi	r9,r5,VCORE_ENTRY_EXIT
@@ -315,9 +335,94 @@ kvmppc_hv_entry:
 	ld	r8,VCPU_SPURR(r4)
 	mtspr	SPRN_PURR,r7
 	mtspr	SPRN_SPURR,r8
+	b	31f
+
+	/*
+	 * PPC970 host -> guest partition switch code.
+	 * We have to lock against concurrent tlbies,
+	 * using native_tlbie_lock to lock against host tlbies
+	 * and kvm->arch.tlbie_lock to lock against guest tlbies.
+	 * We also have to invalidate the TLB since its
+	 * entries aren't tagged with the LPID.
+	 */
+30:	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
+
+	/* first take native_tlbie_lock */
+	.section ".toc","aw"
+toc_tlbie_lock:
+	.tc	native_tlbie_lock[TC],native_tlbie_lock
+	.previous
+	ld	r3,toc_tlbie_lock@toc(2)
+	lwz	r8,PACA_LOCK_TOKEN(r13)
+24:	lwarx	r0,0,r3
+	cmpwi	r0,0
+	bne	24b
+	stwcx.	r8,0,r3
+	bne	24b
+	isync
+
+	ld	r7,KVM_LPCR(r9)		/* use kvm->arch.lpcr to store HID4 */
+	li	r0,0x18f
+	rotldi	r0,r0,HID4_LPID5_SH	/* all lpid bits in HID4 = 1 */
+	or	r0,r7,r0
+	ptesync
+	sync
+	mtspr	SPRN_HID4,r0		/* switch to reserved LPID */
+	isync
+	li	r0,0
+	stw	r0,0(r3)		/* drop native_tlbie_lock */
+
+	/* invalidate the whole TLB */
+	li	r0,256
+	mtctr	r0
+	li	r6,0
+25:	tlbiel	r6
+	addi	r6,r6,0x1000
+	bdnz	25b
+	ptesync
+
+	/* Take the guest's tlbie_lock */
+	addi	r3,r9,KVM_TLBIE_LOCK
+24:	lwarx	r0,0,r3
+	cmpwi	r0,0
+	bne	24b
+	stwcx.	r8,0,r3
+	bne	24b
+	isync
+	ld	r6,KVM_SDR1(r9)
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+
+	/* Set up HID4 with the guest's LPID etc. */
+	sync
+	mtspr	SPRN_HID4,r7
+	isync
+
+	/* drop the guest's tlbie_lock */
+	li	r0,0
+	stw	r0,0(r3)
+
+	/* Check if HDEC expires soon */
+	mfspr	r3,SPRN_HDEC
+	cmpwi	r3,10
+	li	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	mr	r9,r4
+	blt	hdec_soon
+
+	/* Enable HDEC interrupts */
+	mfspr	r0,SPRN_HID0
+	li	r3,1
+	rldimi	r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1
+	sync
+	mtspr	SPRN_HID0,r0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
 
 	/* Load up guest SLB entries */
-	lwz	r5,VCPU_SLB_MAX(r4)
+31:	lwz	r5,VCPU_SLB_MAX(r4)
 	cmpwi	r5,0
 	beq	9f
 	mtctr	r5
@@ -472,6 +577,7 @@ kvmppc_interrupt:
 hcall_real_cont:
 
 	/* Check for mediated interrupts (could be done earlier really ...) */
+BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_EXTERNAL
 	bne+	1f
 	ld	r5,VCPU_KVM(r9)
@@ -481,6 +587,7 @@ hcall_real_cont:
 	andi.	r0,r5,LPCR_MER
 	bne	bounce_ext_interrupt
 1:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Save DEC */
 	mfspr	r5,SPRN_DEC
@@ -492,9 +599,11 @@ hcall_real_cont:
 	/* Save HEIR (HV emulation assist reg) in last_inst
 	   if this is an HEI (HV emulation interrupt, e40) */
 	li	r3,-1
+BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
 	bne	11f
 	mfspr	r3,SPRN_HEIR
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 11:	stw	r3,VCPU_LAST_INST(r9)
 
 	/* Save more register state  */
@@ -508,8 +617,10 @@ hcall_real_cont:
 	stw	r7, VCPU_DSISR(r9)
 	std	r8, VCPU_CTR(r9)
 	/* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */
+BEGIN_FTR_SECTION
 	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
 	beq	6f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 7:	std	r6, VCPU_FAULT_DAR(r9)
 	stw	r7, VCPU_FAULT_DSISR(r9)
 
@@ -543,6 +654,7 @@ hcall_real_cont:
 	/*
 	 * Save the guest PURR/SPURR
 	 */
+BEGIN_FTR_SECTION
 	mfspr	r5,SPRN_PURR
 	mfspr	r6,SPRN_SPURR
 	ld	r7,VCPU_PURR(r9)
@@ -562,6 +674,7 @@ hcall_real_cont:
 	add	r4,r4,r6
 	mtspr	SPRN_PURR,r3
 	mtspr	SPRN_SPURR,r4
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
 
 	/* Clear out SLB */
 	li	r5,0
@@ -570,6 +683,14 @@ hcall_real_cont:
 	ptesync
 
 hdec_soon:
+BEGIN_FTR_SECTION
+	b	32f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+	/*
+	 * POWER7 guest -> host partition switch code.
+	 * We don't have to lock against tlbies but we do
+	 * have to coordinate the hardware threads.
+	 */
 	/* Increment the threads-exiting-guest count in the 0xff00
 	   bits of vcore->entry_exit_count */
 	lwsync
@@ -640,9 +761,82 @@ hdec_soon:
 16:	ld	r8,KVM_HOST_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
+	b	33f
+
+	/*
+	 * PPC970 guest -> host partition switch code.
+	 * We have to lock against concurrent tlbies, and
+	 * we have to flush the whole TLB.
+	 */
+32:	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
+
+	/* Take the guest's tlbie_lock */
+	lwz	r8,PACA_LOCK_TOKEN(r13)
+	addi	r3,r4,KVM_TLBIE_LOCK
+24:	lwarx	r0,0,r3
+	cmpwi	r0,0
+	bne	24b
+	stwcx.	r8,0,r3
+	bne	24b
+	isync
+
+	ld	r7,KVM_HOST_LPCR(r4)	/* use kvm->arch.host_lpcr for HID4 */
+	li	r0,0x18f
+	rotldi	r0,r0,HID4_LPID5_SH	/* all lpid bits in HID4 = 1 */
+	or	r0,r7,r0
+	ptesync
+	sync
+	mtspr	SPRN_HID4,r0		/* switch to reserved LPID */
+	isync
+	li	r0,0
+	stw	r0,0(r3)		/* drop guest tlbie_lock */
+
+	/* invalidate the whole TLB */
+	li	r0,256
+	mtctr	r0
+	li	r6,0
+25:	tlbiel	r6
+	addi	r6,r6,0x1000
+	bdnz	25b
+	ptesync
+
+	/* take native_tlbie_lock */
+	ld	r3,toc_tlbie_lock@toc(2)
+24:	lwarx	r0,0,r3
+	cmpwi	r0,0
+	bne	24b
+	stwcx.	r8,0,r3
+	bne	24b
+	isync
+
+	ld	r6,KVM_HOST_SDR1(r4)
+	mtspr	SPRN_SDR1,r6		/* switch to host page table */
+
+	/* Set up host HID4 value */
+	sync
+	mtspr	SPRN_HID4,r7
+	isync
+	li	r0,0
+	stw	r0,0(r3)		/* drop native_tlbie_lock */
+
+	lis	r8,0x7fff		/* MAX_INT@h */
+	mtspr	SPRN_HDEC,r8
+
+	/* Disable HDEC interrupts */
+	mfspr	r0,SPRN_HID0
+	li	r3,0
+	rldimi	r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1
+	sync
+	mtspr	SPRN_HID0,r0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
+	mfspr	r0,SPRN_HID0
 
 	/* load host SLB entries */
-	ld	r8,PACA_SLBSHADOWPTR(r13)
+33:	ld	r8,PACA_SLBSHADOWPTR(r13)
 
 	.rept	SLB_NUM_BOLTED
 	ld	r5,SLBSHADOW_SAVEAREA(r8)
@@ -654,12 +848,14 @@ hdec_soon:
 	.endr
 
 	/* Save and reset AMR and UAMOR before turning on the MMU */
+BEGIN_FTR_SECTION
 	mfspr	r5,SPRN_AMR
 	mfspr	r6,SPRN_UAMOR
 	std	r5,VCPU_AMR(r9)
 	std	r6,VCPU_UAMOR(r9)
 	li	r6,0
 	mtspr	SPRN_AMR,r6
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Restore host DABR and DABRX */
 	ld	r5,HSTATE_DABR(r13)
@@ -668,10 +864,12 @@ hdec_soon:
 	mtspr	SPRN_DABRX,r6
 
 	/* Switch DSCR back to host value */
+BEGIN_FTR_SECTION
 	mfspr	r8, SPRN_DSCR
 	ld	r7, HSTATE_DSCR(r13)
 	std	r8, VCPU_DSCR(r7)
 	mtspr	SPRN_DSCR, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Save non-volatile GPRs */
 	std	r14, VCPU_GPR(r14)(r9)
@@ -735,21 +933,31 @@ hdec_soon:
 	mfspr	r6, SPRN_PMC4
 	mfspr	r7, SPRN_PMC5
 	mfspr	r8, SPRN_PMC6
+BEGIN_FTR_SECTION
+	mfspr	r10, SPRN_PMC7
+	mfspr	r11, SPRN_PMC8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	stw	r3, VCPU_PMC(r9)
 	stw	r4, VCPU_PMC + 4(r9)
 	stw	r5, VCPU_PMC + 8(r9)
 	stw	r6, VCPU_PMC + 12(r9)
 	stw	r7, VCPU_PMC + 16(r9)
 	stw	r8, VCPU_PMC + 20(r9)
+BEGIN_FTR_SECTION
+	stw	r10, VCPU_PMC + 24(r9)
+	stw	r11, VCPU_PMC + 28(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 22:
 	/* save FP state */
 	mr	r3, r9
 	bl	.kvmppc_save_fp
 
-	/* Secondary threads go off to take a nap */
+	/* Secondary threads go off to take a nap on POWER7 */
+BEGIN_FTR_SECTION
 	lwz	r0,VCPU_PTID(r3)
 	cmpwi	r0,0
 	bne	secondary_nap
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/*
 	 * Reload DEC.  HDEC interrupts were disabled when
@@ -771,12 +979,20 @@ hdec_soon:
 	lwz	r6, HSTATE_PMC + 12(r13)
 	lwz	r8, HSTATE_PMC + 16(r13)
 	lwz	r9, HSTATE_PMC + 20(r13)
+BEGIN_FTR_SECTION
+	lwz	r10, HSTATE_PMC + 24(r13)
+	lwz	r11, HSTATE_PMC + 28(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_PMC1, r3
 	mtspr	SPRN_PMC2, r4
 	mtspr	SPRN_PMC3, r5
 	mtspr	SPRN_PMC4, r6
 	mtspr	SPRN_PMC5, r8
 	mtspr	SPRN_PMC6, r9
+BEGIN_FTR_SECTION
+	mtspr	SPRN_PMC7, r10
+	mtspr	SPRN_PMC8, r11
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	ld	r3, HSTATE_MMCR(r13)
 	ld	r4, HSTATE_MMCR + 8(r13)
 	ld	r5, HSTATE_MMCR + 16(r13)
@@ -802,7 +1018,7 @@ hdec_soon:
 	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
 
 	/* RFI into the highmem handler, or branch to interrupt handler */
-	mfmsr	r6
+12:	mfmsr	r6
 	mtctr	r12
 	li	r0, MSR_RI
 	andc	r6, r6, r0
@@ -812,7 +1028,11 @@ hdec_soon:
 	beqctr
 	RFI
 
-11:	mtspr	SPRN_HSRR0, r8
+11:
+BEGIN_FTR_SECTION
+	b	12b
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+	mtspr	SPRN_HSRR0, r8
 	mtspr	SPRN_HSRR1, r7
 	ba	0x500
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 72c506505fa..a107c9be0fb 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -213,6 +213,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 		break;
 	case KVM_CAP_PPC_RMA:
 		r = 1;
+		/* PPC970 requires an RMA */
+		if (cpu_has_feature(CPU_FTR_ARCH_201))
+			r = 2;
 		break;
 #endif
 	default:
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index b44f5f80305..90039bc6411 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -37,7 +37,7 @@
 
 #define HPTE_LOCK_BIT 3
 
-static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
+DEFINE_RAW_SPINLOCK(native_tlbie_lock);
 
 static inline void __tlbie(unsigned long va, int psize, int ssize)
 {
-- 
cgit v1.2.3-70-g09d2