Merge tag 'efi-urgent' into x86/urgent

* Avoid WARN_ON() when mapping BGRT on Baytrail (EFI 32-bit). Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
author: H. Peter Anvin <hpa@linux.intel.com> 2014-02-07 11:27:30 -0800
committer: H. Peter Anvin <hpa@linux.intel.com> 2014-02-07 11:27:30 -0800
commit: a3b072cd180c12e8fe0ece9487b9065808327640 (patch)
tree: 62b982041be84748852d77cdf6ca5639ef40858f /arch/x86
parent: 75a1ba5b2c529db60ca49626bcaf0bddf4548438 (diff)
parent: 081cd62a010f97b5bc1d2b0cd123c5abc692b68a (diff)
86 files changed, 4228 insertions, 576 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3b6922ebf17..0af5250d914 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,6 +23,7 @@ config X86
 	def_bool y
 	select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
 	select ARCH_MIGHT_HAVE_PC_PARPORT
+	select ARCH_MIGHT_HAVE_PC_SERIO
 	select HAVE_AOUT if X86_32
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select ARCH_SUPPORTS_NUMA_BALANCING
@@ -279,13 +280,13 @@ config SMP
 	bool "Symmetric multi-processing support"
 	---help---
 	  This enables support for systems with more than one CPU. If you have
-	  a system with only one CPU, like most personal computers, say N. If
-	  you have a system with more than one CPU, say Y.
+	  a system with only one CPU, say N. If you have a system with more
+	  than one CPU, say Y.
 
-	  If you say N here, the kernel will run on single and multiprocessor
+	  If you say N here, the kernel will run on uni- and multiprocessor
 	  machines, but will use only one CPU of a multiprocessor machine. If
 	  you say Y here, the kernel will run on many, but not all,
-	  singleprocessor machines. On a singleprocessor machine, the kernel
+	  uniprocessor machines. On a uniprocessor machine, the kernel
 	  will run faster if you say N here.
 
 	  Note that if you say Y here and choose architecture "586" or
@@ -732,6 +733,7 @@ config APB_TIMER
 # The code disables itself when not needed.
 config DMI
 	default y
+	select DMI_SCAN_MACHINE_NON_EFI_FALLBACK
 	bool "Enable DMI scanning" if EXPERT
 	---help---
 	  Enabled scanning of DMI to identify machine quirks. Say Y
@@ -939,7 +941,7 @@ config X86_ANCIENT_MCE
 	depends on X86_32 && X86_MCE
 	---help---
 	  Include support for machine check handling on old Pentium 5 or WinChip
-	  systems. These typically need to be enabled explicitely on the command
+	  systems. These typically need to be enabled explicitly on the command
 	  line.
 
 config X86_MCE_THRESHOLD
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 13b22e0f681..eeda43abed6 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -11,6 +11,28 @@ else
         KBUILD_DEFCONFIG := $(ARCH)_defconfig
 endif
 
+# How to compile the 16-bit code.  Note we always compile for -march=i386;
+# that way we can complain to the user if the CPU is insufficient.
+#
+# The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For
+# older versions of GCC, we need to play evil and unreliable tricks to
+# attempt to ensure that our asm(".code16gcc") is first in the asm
+# output.
+CODE16GCC_CFLAGS := -m32 -include $(srctree)/arch/x86/boot/code16gcc.h \
+		    $(call cc-option, -fno-toplevel-reorder,\
+		      $(call cc-option, -fno-unit-at-a-time))
+M16_CFLAGS	 := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))
+
+REALMODE_CFLAGS	:= $(M16_CFLAGS) -g -Os -D__KERNEL__ \
+		   -DDISABLE_BRANCH_PROFILING \
+		   -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
+		   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
+		   -mno-mmx -mno-sse \
+		   $(call cc-option, -ffreestanding) \
+		   $(call cc-option, -fno-stack-protector) \
+		   $(call cc-option, -mpreferred-stack-boundary=2)
+export REALMODE_CFLAGS
+
 # BITS is used as extension for files which are available in a 32 bit
 # and a 64 bit version to simplify shared Makefiles.
 # e.g.: obj-y += foo_$(BITS).o
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index de706691800..878df7e88cd 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -51,20 +51,7 @@ $(obj)/cpustr.h: $(obj)/mkcpustr FORCE
 
 # ---------------------------------------------------------------------------
 
-# How to compile the 16-bit code.  Note we always compile for -march=i386,
-# that way we can complain to the user if the CPU is insufficient.
-KBUILD_CFLAGS	:= $(USERINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ \
-		   -DDISABLE_BRANCH_PROFILING \
-		   -Wall -Wstrict-prototypes \
-		   -march=i386 -mregparm=3 \
-		   -include $(srctree)/$(src)/code16gcc.h \
-		   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-		   -mno-mmx -mno-sse \
-		   $(call cc-option, -ffreestanding) \
-		   $(call cc-option, -fno-toplevel-reorder,\
-		   $(call cc-option, -fno-unit-at-a-time)) \
-		   $(call cc-option, -fno-stack-protector) \
-		   $(call cc-option, -mpreferred-stack-boundary=2)
+KBUILD_CFLAGS	:= $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP
 KBUILD_AFLAGS	:= $(KBUILD_CFLAGS) -D__ASSEMBLY__
 GCOV_PROFILE := n
 
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index a9fcb7cfb24..431fa5f8453 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -28,20 +28,35 @@ static int has_fpu(void)
 	return fsw == 0 && (fcw & 0x103f) == 0x003f;
 }
 
+/*
+ * For building the 16-bit code we want to explicitly specify 32-bit
+ * push/pop operations, rather than just saying 'pushf' or 'popf' and
+ * letting the compiler choose. But this is also included from the
+ * compressed/ directory where it may be 64-bit code, and thus needs
+ * to be 'pushfq' or 'popfq' in that case.
+ */
+#ifdef __x86_64__
+#define PUSHF "pushfq"
+#define POPF "popfq"
+#else
+#define PUSHF "pushfl"
+#define POPF "popfl"
+#endif
+
 int has_eflag(unsigned long mask)
 {
 	unsigned long f0, f1;
 
-	asm volatile("pushf	\n\t"
-		     "pushf	\n\t"
+	asm volatile(PUSHF "	\n\t"
+		     PUSHF "	\n\t"
 		     "pop %0	\n\t"
 		     "mov %0,%1	\n\t"
 		     "xor %2,%1	\n\t"
 		     "push %1	\n\t"
-		     "popf	\n\t"
-		     "pushf	\n\t"
+		     POPF "	\n\t"
+		     PUSHF "	\n\t"
 		     "pop %1	\n\t"
-		     "popf"
+		     POPF
 		     : "=&r" (f0), "=&r" (f1)
 		     : "ri" (mask));
 
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index ff339c5db31..0bb25491262 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -80,7 +80,7 @@ struct card_info {
 	u16 xmode_n;		/* Size of unprobed mode range */
 };
 
-#define __videocard struct card_info __attribute__((section(".videocards")))
+#define __videocard struct card_info __attribute__((used,section(".videocards")))
 extern struct card_info video_cards[], video_cards_end[];
 
 int mode_defined(u16 mode);	/* video.c */
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index e0fc24db234..6ba54d64038 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -76,6 +76,7 @@ ifeq ($(avx2_supported),yes)
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 crc32c-intel-y := crc32c-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
new file mode 100644
index 00000000000..522ab68d1c8
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -0,0 +1,2811 @@
+########################################################################
+# Copyright (c) 2013, Intel Corporation
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the
+#   distribution.
+#
+# * Neither the name of the Intel Corporation nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
+# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+##
+## Authors:
+##	Erdinc Ozturk <erdinc.ozturk@intel.com>
+##	Vinodh Gopal <vinodh.gopal@intel.com>
+##	James Guilford <james.guilford@intel.com>
+##	Tim Chen <tim.c.chen@linux.intel.com>
+##
+## References:
+##       This code was derived and highly optimized from the code described in paper:
+##               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
+##			on Intel Architecture Processors. August, 2010
+##       The details of the implementation is explained in:
+##               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
+##			on Intel Architecture Processors. October, 2012.
+##
+## Assumptions:
+##
+##
+##
+## iv:
+##       0                   1                   2                   3
+##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##       |                             Salt  (From the SA)               |
+##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##       |                     Initialization Vector                     |
+##       |         (This is the sequence number from IPSec header)       |
+##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##       |                              0x1                              |
+##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+##
+##
+## AAD:
+##       AAD padded to 128 bits with 0
+##       for example, assume AAD is a u32 vector
+##
+##       if AAD is 8 bytes:
+##       AAD[3] = {A0, A1}#
+##       padded AAD in xmm register = {A1 A0 0 0}
+##
+##       0                   1                   2                   3
+##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##       |                               SPI (A1)                        |
+##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##       |                     32-bit Sequence Number (A0)               |
+##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##       |                              0x0                              |
+##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+##                                       AAD Format with 32-bit Sequence Number
+##
+##       if AAD is 12 bytes:
+##       AAD[3] = {A0, A1, A2}#
+##       padded AAD in xmm register = {A2 A1 A0 0}
+##
+##       0                   1                   2                   3
+##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##       |                               SPI (A2)                        |
+##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##       |                 64-bit Extended Sequence Number {A1,A0}       |
+##       |                                                               |
+##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##       |                              0x0                              |
+##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+##        AAD Format with 64-bit Extended Sequence Number
+##
+##
+## aadLen:
+##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
+##	 The code additionally supports aadLen of length 16 bytes.
+##
+## TLen:
+##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+##
+## poly = x^128 + x^127 + x^126 + x^121 + 1
+## throughout the code, one tab and two tab indentations are used. one tab is
+## for GHASH part, two tabs is for AES part.
+##
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+.data
+.align 16
+
+POLY:            .octa     0xC2000000000000000000000000000001
+POLY2:           .octa     0xC20000000000000000000001C2000000
+TWOONE:          .octa     0x00000001000000000000000000000001
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
+
+SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
+SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
+ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
+ZERO:            .octa     0x00000000000000000000000000000000
+ONE:             .octa     0x00000000000000000000000000000001
+ONEf:            .octa     0x01000000000000000000000000000000
+
+.text
+
+
+##define the fields of the gcm aes context
+#{
+#        u8 expanded_keys[16*11] store expanded keys
+#        u8 shifted_hkey_1[16]   store HashKey <<1 mod poly here
+#        u8 shifted_hkey_2[16]   store HashKey^2 <<1 mod poly here
+#        u8 shifted_hkey_3[16]   store HashKey^3 <<1 mod poly here
+#        u8 shifted_hkey_4[16]   store HashKey^4 <<1 mod poly here
+#        u8 shifted_hkey_5[16]   store HashKey^5 <<1 mod poly here
+#        u8 shifted_hkey_6[16]   store HashKey^6 <<1 mod poly here
+#        u8 shifted_hkey_7[16]   store HashKey^7 <<1 mod poly here
+#        u8 shifted_hkey_8[16]   store HashKey^8 <<1 mod poly here
+#        u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
+#        u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+#        u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+#        u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+#        u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+#        u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+#        u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+#        u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+#} gcm_ctx#
+
+HashKey        = 16*11   # store HashKey <<1 mod poly here
+HashKey_2      = 16*12   # store HashKey^2 <<1 mod poly here
+HashKey_3      = 16*13   # store HashKey^3 <<1 mod poly here
+HashKey_4      = 16*14   # store HashKey^4 <<1 mod poly here
+HashKey_5      = 16*15   # store HashKey^5 <<1 mod poly here
+HashKey_6      = 16*16   # store HashKey^6 <<1 mod poly here
+HashKey_7      = 16*17   # store HashKey^7 <<1 mod poly here
+HashKey_8      = 16*18   # store HashKey^8 <<1 mod poly here
+HashKey_k      = 16*19   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
+HashKey_2_k    = 16*20   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+HashKey_3_k    = 16*21   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+HashKey_4_k    = 16*22   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+HashKey_5_k    = 16*23   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+HashKey_6_k    = 16*24   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+HashKey_7_k    = 16*25   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+HashKey_8_k    = 16*26   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+
+#define arg1 %rdi
+#define arg2 %rsi
+#define arg3 %rdx
+#define arg4 %rcx
+#define arg5 %r8
+#define arg6 %r9
+#define arg7 STACK_OFFSET+8*1(%r14)
+#define arg8 STACK_OFFSET+8*2(%r14)
+#define arg9 STACK_OFFSET+8*3(%r14)
+
+i = 0
+j = 0
+
+out_order = 0
+in_order = 1
+DEC = 0
+ENC = 1
+
+.macro define_reg r n
+reg_\r = %xmm\n
+.endm
+
+.macro setreg
+.altmacro
+define_reg i %i
+define_reg j %j
+.noaltmacro
+.endm
+
+# need to push 4 registers into stack to maintain
+STACK_OFFSET = 8*4
+
+TMP1 =   16*0    # Temporary storage for AAD
+TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+TMP3 =   16*2    # Temporary storage for AES State 3
+TMP4 =   16*3    # Temporary storage for AES State 4
+TMP5 =   16*4    # Temporary storage for AES State 5
+TMP6 =   16*5    # Temporary storage for AES State 6
+TMP7 =   16*6    # Temporary storage for AES State 7
+TMP8 =   16*7    # Temporary storage for AES State 8
+
+VARIABLE_OFFSET = 16*8
+
+################################
+# Utility Macros
+################################
+
+# Encryption of a single block
+.macro ENCRYPT_SINGLE_BLOCK XMM0
+                vpxor    (arg1), \XMM0, \XMM0
+		i = 1
+		setreg
+.rep 9
+                vaesenc  16*i(arg1), \XMM0, \XMM0
+		i = (i+1)
+		setreg
+.endr
+                vaesenclast 16*10(arg1), \XMM0, \XMM0
+.endm
+
+#ifdef CONFIG_AS_AVX
+###############################################################################
+# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+# Input: A and B (128-bits each, bit-reflected)
+# Output: C = A*B*x mod poly, (i.e. >>1 )
+# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+###############################################################################
+.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
+
+        vpshufd         $0b01001110, \GH, \T2
+        vpshufd         $0b01001110, \HK, \T3
+        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
+        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
+
+        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
+        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
+        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
+        vpxor           \GH, \T2,\T2
+        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
+
+        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
+        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
+        vpxor           \T3, \GH, \GH
+        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
+
+        #first phase of the reduction
+        vpslld  $31, \GH, \T2                   # packed right shifting << 31
+        vpslld  $30, \GH, \T3                   # packed right shifting shift << 30
+        vpslld  $25, \GH, \T4                   # packed right shifting shift << 25
+
+        vpxor   \T3, \T2, \T2                   # xor the shifted versions
+        vpxor   \T4, \T2, \T2
+
+        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
+
+        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
+        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
+
+        #second phase of the reduction
+
+        vpsrld  $1,\GH, \T2                     # packed left shifting >> 1
+        vpsrld  $2,\GH, \T3                     # packed left shifting >> 2
+        vpsrld  $7,\GH, \T4                     # packed left shifting >> 7
+        vpxor   \T3, \T2, \T2                   # xor the shifted versions
+        vpxor   \T4, \T2, \T2
+
+        vpxor   \T5, \T2, \T2
+        vpxor   \T2, \GH, \GH
+        vpxor   \T1, \GH, \GH                   # the result is in GH
+
+
+.endm
+
+.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
+
+        # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+        vmovdqa  \HK, \T5
+
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_k(arg1)
+
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
+        vmovdqa  \T5, HashKey_2(arg1)                    #  [HashKey_2] = HashKey^2<<1 mod poly
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_2_k(arg1)
+
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
+        vmovdqa  \T5, HashKey_3(arg1)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_3_k(arg1)
+
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
+        vmovdqa  \T5, HashKey_4(arg1)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_4_k(arg1)
+
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
+        vmovdqa  \T5, HashKey_5(arg1)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_5_k(arg1)
+
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
+        vmovdqa  \T5, HashKey_6(arg1)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_6_k(arg1)
+
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
+        vmovdqa  \T5, HashKey_7(arg1)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_7_k(arg1)
+
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
+        vmovdqa  \T5, HashKey_8(arg1)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_8_k(arg1)
+
+.endm
+
+## if a = number of total plaintext bytes
+## b = floor(a/16)
+## num_initial_blocks = b mod 4#
+## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+## r10, r11, r12, rax are clobbered
+## arg1, arg2, arg3, r14 are used as a pointer only, not modified
+
+.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
+	i = (8-\num_initial_blocks)
+	setreg
+
+        mov     arg6, %r10                      # r10 = AAD
+        mov     arg7, %r12                      # r12 = aadLen
+
+
+        mov     %r12, %r11
+
+        vpxor   reg_i, reg_i, reg_i
+_get_AAD_loop\@:
+        vmovd   (%r10), \T1
+        vpslldq $12, \T1, \T1
+        vpsrldq $4, reg_i, reg_i
+        vpxor   \T1, reg_i, reg_i
+
+        add     $4, %r10
+        sub     $4, %r12
+        jg      _get_AAD_loop\@
+
+
+        cmp     $16, %r11
+        je      _get_AAD_loop2_done\@
+        mov     $16, %r12
+
+_get_AAD_loop2\@:
+        vpsrldq $4, reg_i, reg_i
+        sub     $4, %r12
+        cmp     %r11, %r12
+        jg      _get_AAD_loop2\@
+
+_get_AAD_loop2_done\@:
+
+        #byte-reflect the AAD data
+        vpshufb SHUF_MASK(%rip), reg_i, reg_i
+
+	# initialize the data pointer offset as zero
+	xor     %r11, %r11
+
+	# start AES for num_initial_blocks blocks
+	mov     arg5, %rax                     # rax = *Y0
+	vmovdqu (%rax), \CTR                   # CTR = Y0
+	vpshufb SHUF_MASK(%rip), \CTR, \CTR
+
+
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
+                vmovdqa \CTR, reg_i
+                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
+	i = (i+1)
+	setreg
+.endr
+
+	vmovdqa  (arg1), \T_key
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+                vpxor   \T_key, reg_i, reg_i
+	i = (i+1)
+	setreg
+.endr
+
+	j = 1
+	setreg
+.rep 9
+	vmovdqa  16*j(arg1), \T_key
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+        vaesenc \T_key, reg_i, reg_i
+	i = (i+1)
+	setreg
+.endr
+
+	j = (j+1)
+	setreg
+.endr
+
+
+	vmovdqa  16*10(arg1), \T_key
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+        vaesenclast      \T_key, reg_i, reg_i
+	i = (i+1)
+	setreg
+.endr
+
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+                vmovdqu (arg3, %r11), \T1
+                vpxor   \T1, reg_i, reg_i
+                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for num_initial_blocks blocks
+                add     $16, %r11
+.if  \ENC_DEC == DEC
+                vmovdqa \T1, reg_i
+.endif
+                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
+	i = (i+1)
+	setreg
+.endr
+
+
+	i = (8-\num_initial_blocks)
+	j = (9-\num_initial_blocks)
+	setreg
+        GHASH_MUL_AVX       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
+
+.rep \num_initial_blocks
+        vpxor    reg_i, reg_j, reg_j
+        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
+	i = (i+1)
+	j = (j+1)
+	setreg
+.endr
+        # XMM8 has the combined result here
+
+        vmovdqa  \XMM8, TMP1(%rsp)
+        vmovdqa  \XMM8, \T3
+
+        cmp     $128, %r13
+        jl      _initial_blocks_done\@                  # no need for precomputed constants
+
+###############################################################################
+# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM1
+                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM2
+                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM3
+                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM4
+                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM5
+                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM6
+                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM7
+                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM8
+                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
+
+                vmovdqa  (arg1), \T_key
+                vpxor    \T_key, \XMM1, \XMM1
+                vpxor    \T_key, \XMM2, \XMM2
+                vpxor    \T_key, \XMM3, \XMM3
+                vpxor    \T_key, \XMM4, \XMM4
+                vpxor    \T_key, \XMM5, \XMM5
+                vpxor    \T_key, \XMM6, \XMM6
+                vpxor    \T_key, \XMM7, \XMM7
+                vpxor    \T_key, \XMM8, \XMM8
+
+		i = 1
+		setreg
+.rep    9       # do 9 rounds
+                vmovdqa  16*i(arg1), \T_key
+                vaesenc  \T_key, \XMM1, \XMM1
+                vaesenc  \T_key, \XMM2, \XMM2
+                vaesenc  \T_key, \XMM3, \XMM3
+                vaesenc  \T_key, \XMM4, \XMM4
+                vaesenc  \T_key, \XMM5, \XMM5
+                vaesenc  \T_key, \XMM6, \XMM6
+                vaesenc  \T_key, \XMM7, \XMM7
+                vaesenc  \T_key, \XMM8, \XMM8
+		i = (i+1)
+		setreg
+.endr
+
+
+                vmovdqa  16*i(arg1), \T_key
+                vaesenclast  \T_key, \XMM1, \XMM1
+                vaesenclast  \T_key, \XMM2, \XMM2
+                vaesenclast  \T_key, \XMM3, \XMM3
+                vaesenclast  \T_key, \XMM4, \XMM4
+                vaesenclast  \T_key, \XMM5, \XMM5
+                vaesenclast  \T_key, \XMM6, \XMM6
+                vaesenclast  \T_key, \XMM7, \XMM7
+                vaesenclast  \T_key, \XMM8, \XMM8
+
+                vmovdqu  (arg3, %r11), \T1
+                vpxor    \T1, \XMM1, \XMM1
+                vmovdqu  \XMM1, (arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM1
+                .endif
+
+                vmovdqu  16*1(arg3, %r11), \T1
+                vpxor    \T1, \XMM2, \XMM2
+                vmovdqu  \XMM2, 16*1(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM2
+                .endif
+
+                vmovdqu  16*2(arg3, %r11), \T1
+                vpxor    \T1, \XMM3, \XMM3
+                vmovdqu  \XMM3, 16*2(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM3
+                .endif
+
+                vmovdqu  16*3(arg3, %r11), \T1
+                vpxor    \T1, \XMM4, \XMM4
+                vmovdqu  \XMM4, 16*3(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM4
+                .endif
+
+                vmovdqu  16*4(arg3, %r11), \T1
+                vpxor    \T1, \XMM5, \XMM5
+                vmovdqu  \XMM5, 16*4(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM5
+                .endif
+
+                vmovdqu  16*5(arg3, %r11), \T1
+                vpxor    \T1, \XMM6, \XMM6
+                vmovdqu  \XMM6, 16*5(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM6
+                .endif
+
+                vmovdqu  16*6(arg3, %r11), \T1
+                vpxor    \T1, \XMM7, \XMM7
+                vmovdqu  \XMM7, 16*6(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM7
+                .endif
+
+                vmovdqu  16*7(arg3, %r11), \T1
+                vpxor    \T1, \XMM8, \XMM8
+                vmovdqu  \XMM8, 16*7(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM8
+                .endif
+
+                add     $128, %r11
+
+                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
+                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
+                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
+
+###############################################################################
+
+_initial_blocks_done\@:
+
+.endm
+
+# encrypt 8 blocks at a time
+# ghash the 8 previously encrypted ciphertext blocks
+# arg1, arg2, arg3 are used as pointers only, not modified
+# r11 is the data offset value
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+
+        vmovdqa \XMM1, \T2
+        vmovdqa \XMM2, TMP2(%rsp)
+        vmovdqa \XMM3, TMP3(%rsp)
+        vmovdqa \XMM4, TMP4(%rsp)
+        vmovdqa \XMM5, TMP5(%rsp)
+        vmovdqa \XMM6, TMP6(%rsp)
+        vmovdqa \XMM7, TMP7(%rsp)
+        vmovdqa \XMM8, TMP8(%rsp)
+
+.if \loop_idx == in_order
+                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
+                vpaddd  ONE(%rip), \XMM1, \XMM2
+                vpaddd  ONE(%rip), \XMM2, \XMM3
+                vpaddd  ONE(%rip), \XMM3, \XMM4
+                vpaddd  ONE(%rip), \XMM4, \XMM5
+                vpaddd  ONE(%rip), \XMM5, \XMM6
+                vpaddd  ONE(%rip), \XMM6, \XMM7
+                vpaddd  ONE(%rip), \XMM7, \XMM8
+                vmovdqa \XMM8, \CTR
+
+                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
+.else
+                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
+                vpaddd  ONEf(%rip), \XMM1, \XMM2
+                vpaddd  ONEf(%rip), \XMM2, \XMM3
+                vpaddd  ONEf(%rip), \XMM3, \XMM4
+                vpaddd  ONEf(%rip), \XMM4, \XMM5
+                vpaddd  ONEf(%rip), \XMM5, \XMM6
+                vpaddd  ONEf(%rip), \XMM6, \XMM7
+                vpaddd  ONEf(%rip), \XMM7, \XMM8
+                vmovdqa \XMM8, \CTR
+.endif
+
+
+        #######################################################################
+
+                vmovdqu (arg1), \T1
+                vpxor   \T1, \XMM1, \XMM1
+                vpxor   \T1, \XMM2, \XMM2
+                vpxor   \T1, \XMM3, \XMM3
+                vpxor   \T1, \XMM4, \XMM4
+                vpxor   \T1, \XMM5, \XMM5
+                vpxor   \T1, \XMM6, \XMM6
+                vpxor   \T1, \XMM7, \XMM7
+                vpxor   \T1, \XMM8, \XMM8
+
+        #######################################################################
+
+
+
+
+
+                vmovdqu 16*1(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+                vmovdqu 16*2(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+
+        #######################################################################
+
+        vmovdqa         HashKey_8(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
+        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
+
+        vpshufd         $0b01001110, \T2, \T6
+        vpxor           \T2, \T6, \T6
+
+        vmovdqa         HashKey_8_k(arg1), \T5
+        vpclmulqdq      $0x00, \T5, \T6, \T6
+
+                vmovdqu 16*3(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+        vmovdqa         TMP2(%rsp), \T1
+        vmovdqa         HashKey_7(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_7_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
+
+                vmovdqu 16*4(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+        #######################################################################
+
+        vmovdqa         TMP3(%rsp), \T1
+        vmovdqa         HashKey_6(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_6_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
+
+                vmovdqu 16*5(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+        vmovdqa         TMP4(%rsp), \T1
+        vmovdqa         HashKey_5(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_5_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
+
+                vmovdqu 16*6(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+
+        vmovdqa         TMP5(%rsp), \T1
+        vmovdqa         HashKey_4(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_4_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
+
+                vmovdqu 16*7(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+        vmovdqa         TMP6(%rsp), \T1
+        vmovdqa         HashKey_3(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_3_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
+
+
+                vmovdqu 16*8(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+        vmovdqa         TMP7(%rsp), \T1
+        vmovdqa         HashKey_2(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_2_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
+
+        #######################################################################
+
+                vmovdqu 16*9(arg1), \T5
+                vaesenc \T5, \XMM1, \XMM1
+                vaesenc \T5, \XMM2, \XMM2
+                vaesenc \T5, \XMM3, \XMM3
+                vaesenc \T5, \XMM4, \XMM4
+                vaesenc \T5, \XMM5, \XMM5
+                vaesenc \T5, \XMM6, \XMM6
+                vaesenc \T5, \XMM7, \XMM7
+                vaesenc \T5, \XMM8, \XMM8
+
+        vmovdqa         TMP8(%rsp), \T1
+        vmovdqa         HashKey(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
+
+        vpxor           \T4, \T6, \T6
+        vpxor           \T7, \T6, \T6
+
+                vmovdqu 16*10(arg1), \T5
+
+	i = 0
+	j = 1
+	setreg
+.rep 8
+		vpxor	16*i(arg3, %r11), \T5, \T2
+                .if \ENC_DEC == ENC
+                vaesenclast     \T2, reg_j, reg_j
+                .else
+                vaesenclast     \T2, reg_j, \T3
+                vmovdqu 16*i(arg3, %r11), reg_j
+                vmovdqu \T3, 16*i(arg2, %r11)
+                .endif
+	i = (i+1)
+	j = (j+1)
+	setreg
+.endr
+	#######################################################################
+
+
+	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
+	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
+	vpxor	\T3, \T7, \T7
+	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
+
+
+
+	#######################################################################
+	#first phase of the reduction
+	#######################################################################
+        vpslld  $31, \T7, \T2                           # packed right shifting << 31
+        vpslld  $30, \T7, \T3                           # packed right shifting shift << 30
+        vpslld  $25, \T7, \T4                           # packed right shifting shift << 25
+
+        vpxor   \T3, \T2, \T2                           # xor the shifted versions
+        vpxor   \T4, \T2, \T2
+
+        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
+
+        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
+        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
+	#######################################################################
+                .if \ENC_DEC == ENC
+		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
+                .endif
+
+	#######################################################################
+	#second phase of the reduction
+        vpsrld  $1, \T7, \T2                            # packed left shifting >> 1
+        vpsrld  $2, \T7, \T3                            # packed left shifting >> 2
+        vpsrld  $7, \T7, \T4                            # packed left shifting >> 7
+        vpxor   \T3, \T2, \T2                           # xor the shifted versions
+        vpxor   \T4, \T2, \T2
+
+        vpxor   \T1, \T2, \T2
+        vpxor   \T2, \T7, \T7
+        vpxor   \T7, \T6, \T6                           # the result is in T6
+	#######################################################################
+
+		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
+
+
+	vpxor	\T6, \XMM1, \XMM1
+
+
+
+.endm
+
+
+# GHASH the last 4 ciphertext blocks.
+.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
+
+        ## Karatsuba Method
+
+
+        vpshufd         $0b01001110, \XMM1, \T2
+        vpxor           \XMM1, \T2, \T2
+        vmovdqa         HashKey_8(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM1, \T6
+        vpclmulqdq      $0x00, \T5, \XMM1, \T7
+
+        vmovdqa         HashKey_8_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \XMM1
+
+        ######################
+
+        vpshufd         $0b01001110, \XMM2, \T2
+        vpxor           \XMM2, \T2, \T2
+        vmovdqa         HashKey_7(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM2, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM2, \T4
+        vpxor           \T4, \T7, \T7
+
+        vmovdqa         HashKey_7_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
+
+        ######################
+
+        vpshufd         $0b01001110, \XMM3, \T2
+        vpxor           \XMM3, \T2, \T2
+        vmovdqa         HashKey_6(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM3, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM3, \T4
+        vpxor           \T4, \T7, \T7
+
+        vmovdqa         HashKey_6_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
+
+        ######################
+
+        vpshufd         $0b01001110, \XMM4, \T2
+        vpxor           \XMM4, \T2, \T2
+        vmovdqa         HashKey_5(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM4, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM4, \T4
+        vpxor           \T4, \T7, \T7
+
+        vmovdqa         HashKey_5_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
+
+        ######################
+
+        vpshufd         $0b01001110, \XMM5, \T2
+        vpxor           \XMM5, \T2, \T2
+        vmovdqa         HashKey_4(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM5, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM5, \T4
+        vpxor           \T4, \T7, \T7
+
+        vmovdqa         HashKey_4_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
+
+        ######################
+
+        vpshufd         $0b01001110, \XMM6, \T2
+        vpxor           \XMM6, \T2, \T2
+        vmovdqa         HashKey_3(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM6, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM6, \T4
+        vpxor           \T4, \T7, \T7
+
+        vmovdqa         HashKey_3_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
+
+        ######################
+
+        vpshufd         $0b01001110, \XMM7, \T2
+        vpxor           \XMM7, \T2, \T2
+        vmovdqa         HashKey_2(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM7, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM7, \T4
+        vpxor           \T4, \T7, \T7
+
+        vmovdqa         HashKey_2_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
+
+        ######################
+
+        vpshufd         $0b01001110, \XMM8, \T2
+        vpxor           \XMM8, \T2, \T2
+        vmovdqa         HashKey(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM8, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM8, \T4
+        vpxor           \T4, \T7, \T7
+
+        vmovdqa         HashKey_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+
+        vpxor           \T2, \XMM1, \XMM1
+        vpxor           \T6, \XMM1, \XMM1
+        vpxor           \T7, \XMM1, \T2
+
+
+
+
+        vpslldq $8, \T2, \T4
+        vpsrldq $8, \T2, \T2
+
+        vpxor   \T4, \T7, \T7
+        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
+				# the accumulated carry-less multiplications
+
+        #######################################################################
+        #first phase of the reduction
+        vpslld  $31, \T7, \T2   # packed right shifting << 31
+        vpslld  $30, \T7, \T3   # packed right shifting shift << 30
+        vpslld  $25, \T7, \T4   # packed right shifting shift << 25
+
+        vpxor   \T3, \T2, \T2   # xor the shifted versions
+        vpxor   \T4, \T2, \T2
+
+        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
+
+        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
+        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
+        #######################################################################
+
+
+        #second phase of the reduction
+        vpsrld  $1, \T7, \T2    # packed left shifting >> 1
+        vpsrld  $2, \T7, \T3    # packed left shifting >> 2
+        vpsrld  $7, \T7, \T4    # packed left shifting >> 7
+        vpxor   \T3, \T2, \T2   # xor the shifted versions
+        vpxor   \T4, \T2, \T2
+
+        vpxor   \T1, \T2, \T2
+        vpxor   \T2, \T7, \T7
+        vpxor   \T7, \T6, \T6   # the result is in T6
+
+.endm
+
+
+# combined for GCM encrypt and decrypt functions
+# clobbering all xmm registers
+# clobbering r10, r11, r12, r13, r14, r15
+.macro  GCM_ENC_DEC_AVX     ENC_DEC
+
+        #the number of pushes must equal STACK_OFFSET
+        push    %r12
+        push    %r13
+        push    %r14
+        push    %r15
+
+        mov     %rsp, %r14
+
+
+
+
+        sub     $VARIABLE_OFFSET, %rsp
+        and     $~63, %rsp                  # align rsp to 64 bytes
+
+
+        vmovdqu  HashKey(arg1), %xmm13      # xmm13 = HashKey
+
+        mov     arg4, %r13                  # save the number of bytes of plaintext/ciphertext
+        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
+
+        mov     %r13, %r12
+        shr     $4, %r12
+        and     $7, %r12
+        jz      _initial_num_blocks_is_0\@
+
+        cmp     $7, %r12
+        je      _initial_num_blocks_is_7\@
+        cmp     $6, %r12
+        je      _initial_num_blocks_is_6\@
+        cmp     $5, %r12
+        je      _initial_num_blocks_is_5\@
+        cmp     $4, %r12
+        je      _initial_num_blocks_is_4\@
+        cmp     $3, %r12
+        je      _initial_num_blocks_is_3\@
+        cmp     $2, %r12
+        je      _initial_num_blocks_is_2\@
+
+        jmp     _initial_num_blocks_is_1\@
+
+_initial_num_blocks_is_7\@:
+        INITIAL_BLOCKS_AVX  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*7, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_6\@:
+        INITIAL_BLOCKS_AVX  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*6, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_5\@:
+        INITIAL_BLOCKS_AVX  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*5, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_4\@:
+        INITIAL_BLOCKS_AVX  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*4, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_3\@:
+        INITIAL_BLOCKS_AVX  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*3, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_2\@:
+        INITIAL_BLOCKS_AVX  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*2, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_1\@:
+        INITIAL_BLOCKS_AVX  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*1, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_0\@:
+        INITIAL_BLOCKS_AVX  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+
+
+_initial_blocks_encrypted\@:
+        cmp     $0, %r13
+        je      _zero_cipher_left\@
+
+        sub     $128, %r13
+        je      _eight_cipher_left\@
+
+
+
+
+        vmovd   %xmm9, %r15d
+        and     $255, %r15d
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+_encrypt_by_8_new\@:
+        cmp     $(255-8), %r15d
+        jg      _encrypt_by_8\@
+
+
+
+        add     $8, %r15b
+        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
+        add     $128, %r11
+        sub     $128, %r13
+        jne     _encrypt_by_8_new\@
+
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        jmp     _eight_cipher_left\@
+
+_encrypt_by_8\@:
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        add     $8, %r15b
+        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        add     $128, %r11
+        sub     $128, %r13
+        jne     _encrypt_by_8_new\@
+
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+
+
+_eight_cipher_left\@:
+        GHASH_LAST_8_AVX    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
+
+
+_zero_cipher_left\@:
+        cmp     $16, arg4
+        jl      _only_less_than_16\@
+
+        mov     arg4, %r13
+        and     $15, %r13                            # r13 = (arg4 mod 16)
+
+        je      _multiple_of_16_bytes\@
+
+        # handle the last <16 Byte block seperately
+
+
+        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
+
+        sub     $16, %r11
+        add     %r13, %r11
+        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
+
+        lea     SHIFT_MASK+16(%rip), %r12
+        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
+						     # able to shift 16-r13 bytes (r13 is the
+						     # number of bytes in plaintext mod 16)
+        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
+        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
+        jmp     _final_ghash_mul\@
+
+_only_less_than_16\@:
+        # check for 0 length
+        mov     arg4, %r13
+        and     $15, %r13                            # r13 = (arg4 mod 16)
+
+        je      _multiple_of_16_bytes\@
+
+        # handle the last <16 Byte block seperately
+
+
+        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
+
+
+        lea     SHIFT_MASK+16(%rip), %r12
+        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
+						     # able to shift 16-r13 bytes (r13 is the
+						     # number of bytes in plaintext mod 16)
+
+_get_last_16_byte_loop\@:
+        movb    (arg3, %r11),  %al
+        movb    %al,  TMP1 (%rsp , %r11)
+        add     $1, %r11
+        cmp     %r13,  %r11
+        jne     _get_last_16_byte_loop\@
+
+        vmovdqu  TMP1(%rsp), %xmm1
+
+        sub     $16, %r11
+
+_final_ghash_mul\@:
+        .if  \ENC_DEC ==  DEC
+        vmovdqa %xmm1, %xmm2
+        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
+        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
+						     # mask out top 16-r13 bytes of xmm9
+        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
+        vpand   %xmm1, %xmm2, %xmm2
+        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
+        vpxor   %xmm2, %xmm14, %xmm14
+	#GHASH computation for the last <16 Byte block
+        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+        sub     %r13, %r11
+        add     $16, %r11
+        .else
+        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
+        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
+						     # mask out top 16-r13 bytes of xmm9
+        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        vpxor   %xmm9, %xmm14, %xmm14
+	#GHASH computation for the last <16 Byte block
+        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+        sub     %r13, %r11
+        add     $16, %r11
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
+        .endif
+
+
+        #############################
+        # output r13 Bytes
+        vmovq   %xmm9, %rax
+        cmp     $8, %r13
+        jle     _less_than_8_bytes_left\@
+
+        mov     %rax, (arg2 , %r11)
+        add     $8, %r11
+        vpsrldq $8, %xmm9, %xmm9
+        vmovq   %xmm9, %rax
+        sub     $8, %r13
+
+_less_than_8_bytes_left\@:
+        movb    %al, (arg2 , %r11)
+        add     $1, %r11
+        shr     $8, %rax
+        sub     $1, %r13
+        jne     _less_than_8_bytes_left\@
+        #############################
+
+_multiple_of_16_bytes\@:
+        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
+        shl     $3, %r12                             # convert into number of bits
+        vmovd   %r12d, %xmm15                        # len(A) in xmm15
+
+        shl     $3, arg4                             # len(C) in bits  (*128)
+        vmovq   arg4, %xmm1
+        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
+        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
+
+        vpxor   %xmm15, %xmm14, %xmm14
+        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
+        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
+
+        mov     arg5, %rax                           # rax = *Y0
+        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
+
+        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
+
+        vpxor   %xmm14, %xmm9, %xmm9
+
+
+
+_return_T\@:
+        mov     arg8, %r10              # r10 = authTag
+        mov     arg9, %r11              # r11 = auth_tag_len
+
+        cmp     $16, %r11
+        je      _T_16\@
+
+        cmp     $12, %r11
+        je      _T_12\@
+
+_T_8\@:
+        vmovq   %xmm9, %rax
+        mov     %rax, (%r10)
+        jmp     _return_T_done\@
+_T_12\@:
+        vmovq   %xmm9, %rax
+        mov     %rax, (%r10)
+        vpsrldq $8, %xmm9, %xmm9
+        vmovd   %xmm9, %eax
+        mov     %eax, 8(%r10)
+        jmp     _return_T_done\@
+
+_T_16\@:
+        vmovdqu %xmm9, (%r10)
+
+_return_T_done\@:
+        mov     %r14, %rsp
+
+        pop     %r15
+        pop     %r14
+        pop     %r13
+        pop     %r12
+.endm
+
+
+#############################################################
+#void   aesni_gcm_precomp_avx_gen2
+#        (gcm_data     *my_ctx_data,
+#        u8     *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+#############################################################
+ENTRY(aesni_gcm_precomp_avx_gen2)
+        #the number of pushes must equal STACK_OFFSET
+        push    %r12
+        push    %r13
+        push    %r14
+        push    %r15
+
+        mov     %rsp, %r14
+
+
+
+        sub     $VARIABLE_OFFSET, %rsp
+        and     $~63, %rsp                  # align rsp to 64 bytes
+
+        vmovdqu  (arg2), %xmm6              # xmm6 = HashKey
+
+        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
+        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
+        vmovdqa  %xmm6, %xmm2
+        vpsllq   $1, %xmm6, %xmm6
+        vpsrlq   $63, %xmm2, %xmm2
+        vmovdqa  %xmm2, %xmm1
+        vpslldq  $8, %xmm2, %xmm2
+        vpsrldq  $8, %xmm1, %xmm1
+        vpor     %xmm2, %xmm6, %xmm6
+        #reduction
+        vpshufd  $0b00100100, %xmm1, %xmm2
+        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
+        vpand    POLY(%rip), %xmm2, %xmm2
+        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
+        #######################################################################
+        vmovdqa  %xmm6, HashKey(arg1)       # store HashKey<<1 mod poly
+
+
+        PRECOMPUTE_AVX  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+
+        mov     %r14, %rsp
+
+        pop     %r15
+        pop     %r14
+        pop     %r13
+        pop     %r12
+        ret
+ENDPROC(aesni_gcm_precomp_avx_gen2)
+
+###############################################################################
+#void   aesni_gcm_enc_avx_gen2(
+#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
+#        const   u8 *in, /* Plaintext input */
+#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
+#        u8      *iv, /* Pre-counter block j0: 4 byte salt
+#			(from Security Association) concatenated with 8 byte
+#			Initialisation Vector (from IPSec ESP Payload)
+#			concatenated with 0x00000001. 16-byte aligned pointer. */
+#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
+#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+#        u8      *auth_tag, /* Authenticated Tag output. */
+#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
+#				Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_enc_avx_gen2)
+        GCM_ENC_DEC_AVX     ENC
+	ret
+ENDPROC(aesni_gcm_enc_avx_gen2)
+
+###############################################################################
+#void   aesni_gcm_dec_avx_gen2(
+#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
+#        const   u8 *in, /* Ciphertext input */
+#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
+#        u8      *iv, /* Pre-counter block j0: 4 byte salt
+#			(from Security Association) concatenated with 8 byte
+#			Initialisation Vector (from IPSec ESP Payload)
+#			concatenated with 0x00000001. 16-byte aligned pointer. */
+#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
+#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+#        u8      *auth_tag, /* Authenticated Tag output. */
+#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
+#				Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_dec_avx_gen2)
+        GCM_ENC_DEC_AVX     DEC
+	ret
+ENDPROC(aesni_gcm_dec_avx_gen2)
+#endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX2
+###############################################################################
+# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+# Input: A and B (128-bits each, bit-reflected)
+# Output: C = A*B*x mod poly, (i.e. >>1 )
+# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+###############################################################################
+.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
+
+        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
+        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
+        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
+        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
+        vpxor           \T3, \GH, \GH
+
+
+        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
+        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
+
+        vpxor           \T3, \T1, \T1
+        vpxor           \T2, \GH, \GH
+
+        #######################################################################
+        #first phase of the reduction
+        vmovdqa         POLY2(%rip), \T3
+
+        vpclmulqdq      $0x01, \GH, \T3, \T2
+        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
+
+        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
+        #######################################################################
+        #second phase of the reduction
+        vpclmulqdq      $0x00, \GH, \T3, \T2
+        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+        vpclmulqdq      $0x10, \GH, \T3, \GH
+        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
+        #######################################################################
+        vpxor           \T1, \GH, \GH          # the result is in GH
+
+
+.endm
+
+.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
+
+        # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+        vmovdqa  \HK, \T5
+        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
+        vmovdqa  \T5, HashKey_2(arg1)                       #  [HashKey_2] = HashKey^2<<1 mod poly
+
+        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
+        vmovdqa  \T5, HashKey_3(arg1)
+
+        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
+        vmovdqa  \T5, HashKey_4(arg1)
+
+        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
+        vmovdqa  \T5, HashKey_5(arg1)
+
+        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
+        vmovdqa  \T5, HashKey_6(arg1)
+
+        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
+        vmovdqa  \T5, HashKey_7(arg1)
+
+        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
+        vmovdqa  \T5, HashKey_8(arg1)
+
+.endm
+
+
+## if a = number of total plaintext bytes
+## b = floor(a/16)
+## num_initial_blocks = b mod 4#
+## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+## r10, r11, r12, rax are clobbered
+## arg1, arg2, arg3, r14 are used as a pointer only, not modified
+
+.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
+	i = (8-\num_initial_blocks)
+	setreg
+
+        mov     arg6, %r10                       # r10 = AAD
+        mov     arg7, %r12                       # r12 = aadLen
+
+
+        mov     %r12, %r11
+
+        vpxor   reg_i, reg_i, reg_i
+_get_AAD_loop\@:
+        vmovd   (%r10), \T1
+        vpslldq $12, \T1, \T1
+        vpsrldq $4, reg_i, reg_i
+        vpxor   \T1, reg_i, reg_i
+
+        add     $4, %r10
+        sub     $4, %r12
+        jg      _get_AAD_loop\@
+
+
+        cmp     $16, %r11
+        je      _get_AAD_loop2_done\@
+        mov     $16, %r12
+
+_get_AAD_loop2\@:
+        vpsrldq $4, reg_i, reg_i
+        sub     $4, %r12
+        cmp     %r11, %r12
+        jg      _get_AAD_loop2\@
+
+_get_AAD_loop2_done\@:
+
+        #byte-reflect the AAD data
+        vpshufb SHUF_MASK(%rip), reg_i, reg_i
+
+	# initialize the data pointer offset as zero
+	xor     %r11, %r11
+
+	# start AES for num_initial_blocks blocks
+	mov     arg5, %rax                     # rax = *Y0
+	vmovdqu (%rax), \CTR                   # CTR = Y0
+	vpshufb SHUF_MASK(%rip), \CTR, \CTR
+
+
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
+                vmovdqa \CTR, reg_i
+                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
+	i = (i+1)
+	setreg
+.endr
+
+	vmovdqa  (arg1), \T_key
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+                vpxor   \T_key, reg_i, reg_i
+	i = (i+1)
+	setreg
+.endr
+
+	j = 1
+	setreg
+.rep 9
+	vmovdqa  16*j(arg1), \T_key
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+        vaesenc \T_key, reg_i, reg_i
+	i = (i+1)
+	setreg
+.endr
+
+	j = (j+1)
+	setreg
+.endr
+
+
+	vmovdqa  16*10(arg1), \T_key
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+        vaesenclast      \T_key, reg_i, reg_i
+	i = (i+1)
+	setreg
+.endr
+
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+                vmovdqu (arg3, %r11), \T1
+                vpxor   \T1, reg_i, reg_i
+                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for
+						       # num_initial_blocks blocks
+                add     $16, %r11
+.if  \ENC_DEC == DEC
+                vmovdqa \T1, reg_i
+.endif
+                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
+	i = (i+1)
+	setreg
+.endr
+
+
+	i = (8-\num_initial_blocks)
+	j = (9-\num_initial_blocks)
+	setreg
+        GHASH_MUL_AVX2       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
+
+.rep \num_initial_blocks
+        vpxor    reg_i, reg_j, reg_j
+        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
+	i = (i+1)
+	j = (j+1)
+	setreg
+.endr
+        # XMM8 has the combined result here
+
+        vmovdqa  \XMM8, TMP1(%rsp)
+        vmovdqa  \XMM8, \T3
+
+        cmp     $128, %r13
+        jl      _initial_blocks_done\@                  # no need for precomputed constants
+
+###############################################################################
+# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM1
+                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM2
+                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM3
+                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM4
+                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM5
+                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM6
+                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM7
+                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
+
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM8
+                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
+
+                vmovdqa  (arg1), \T_key
+                vpxor    \T_key, \XMM1, \XMM1
+                vpxor    \T_key, \XMM2, \XMM2
+                vpxor    \T_key, \XMM3, \XMM3
+                vpxor    \T_key, \XMM4, \XMM4
+                vpxor    \T_key, \XMM5, \XMM5
+                vpxor    \T_key, \XMM6, \XMM6
+                vpxor    \T_key, \XMM7, \XMM7
+                vpxor    \T_key, \XMM8, \XMM8
+
+		i = 1
+		setreg
+.rep    9       # do 9 rounds
+                vmovdqa  16*i(arg1), \T_key
+                vaesenc  \T_key, \XMM1, \XMM1
+                vaesenc  \T_key, \XMM2, \XMM2
+                vaesenc  \T_key, \XMM3, \XMM3
+                vaesenc  \T_key, \XMM4, \XMM4
+                vaesenc  \T_key, \XMM5, \XMM5
+                vaesenc  \T_key, \XMM6, \XMM6
+                vaesenc  \T_key, \XMM7, \XMM7
+                vaesenc  \T_key, \XMM8, \XMM8
+		i = (i+1)
+		setreg
+.endr
+
+
+                vmovdqa  16*i(arg1), \T_key
+                vaesenclast  \T_key, \XMM1, \XMM1
+                vaesenclast  \T_key, \XMM2, \XMM2
+                vaesenclast  \T_key, \XMM3, \XMM3
+                vaesenclast  \T_key, \XMM4, \XMM4
+                vaesenclast  \T_key, \XMM5, \XMM5
+                vaesenclast  \T_key, \XMM6, \XMM6
+                vaesenclast  \T_key, \XMM7, \XMM7
+                vaesenclast  \T_key, \XMM8, \XMM8
+
+                vmovdqu  (arg3, %r11), \T1
+                vpxor    \T1, \XMM1, \XMM1
+                vmovdqu  \XMM1, (arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM1
+                .endif
+
+                vmovdqu  16*1(arg3, %r11), \T1
+                vpxor    \T1, \XMM2, \XMM2
+                vmovdqu  \XMM2, 16*1(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM2
+                .endif
+
+                vmovdqu  16*2(arg3, %r11), \T1
+                vpxor    \T1, \XMM3, \XMM3
+                vmovdqu  \XMM3, 16*2(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM3
+                .endif
+
+                vmovdqu  16*3(arg3, %r11), \T1
+                vpxor    \T1, \XMM4, \XMM4
+                vmovdqu  \XMM4, 16*3(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM4
+                .endif
+
+                vmovdqu  16*4(arg3, %r11), \T1
+                vpxor    \T1, \XMM5, \XMM5
+                vmovdqu  \XMM5, 16*4(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM5
+                .endif
+
+                vmovdqu  16*5(arg3, %r11), \T1
+                vpxor    \T1, \XMM6, \XMM6
+                vmovdqu  \XMM6, 16*5(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM6
+                .endif
+
+                vmovdqu  16*6(arg3, %r11), \T1
+                vpxor    \T1, \XMM7, \XMM7
+                vmovdqu  \XMM7, 16*6(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM7
+                .endif
+
+                vmovdqu  16*7(arg3, %r11), \T1
+                vpxor    \T1, \XMM8, \XMM8
+                vmovdqu  \XMM8, 16*7(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM8
+                .endif
+
+                add     $128, %r11
+
+                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
+                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
+							   # the corresponding ciphertext
+                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
+
+###############################################################################
+
+_initial_blocks_done\@:
+
+
+.endm
+
+
+
+# encrypt 8 blocks at a time
+# ghash the 8 previously encrypted ciphertext blocks
+# arg1, arg2, arg3 are used as pointers only, not modified
+# r11 is the data offset value
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+
+        vmovdqa \XMM1, \T2
+        vmovdqa \XMM2, TMP2(%rsp)
+        vmovdqa \XMM3, TMP3(%rsp)
+        vmovdqa \XMM4, TMP4(%rsp)
+        vmovdqa \XMM5, TMP5(%rsp)
+        vmovdqa \XMM6, TMP6(%rsp)
+        vmovdqa \XMM7, TMP7(%rsp)
+        vmovdqa \XMM8, TMP8(%rsp)
+
+.if \loop_idx == in_order
+                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
+                vpaddd  ONE(%rip), \XMM1, \XMM2
+                vpaddd  ONE(%rip), \XMM2, \XMM3
+                vpaddd  ONE(%rip), \XMM3, \XMM4
+                vpaddd  ONE(%rip), \XMM4, \XMM5
+                vpaddd  ONE(%rip), \XMM5, \XMM6
+                vpaddd  ONE(%rip), \XMM6, \XMM7
+                vpaddd  ONE(%rip), \XMM7, \XMM8
+                vmovdqa \XMM8, \CTR
+
+                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
+.else
+                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
+                vpaddd  ONEf(%rip), \XMM1, \XMM2
+                vpaddd  ONEf(%rip), \XMM2, \XMM3
+                vpaddd  ONEf(%rip), \XMM3, \XMM4
+                vpaddd  ONEf(%rip), \XMM4, \XMM5
+                vpaddd  ONEf(%rip), \XMM5, \XMM6
+                vpaddd  ONEf(%rip), \XMM6, \XMM7
+                vpaddd  ONEf(%rip), \XMM7, \XMM8
+                vmovdqa \XMM8, \CTR
+.endif
+
+
+        #######################################################################
+
+                vmovdqu (arg1), \T1
+                vpxor   \T1, \XMM1, \XMM1
+                vpxor   \T1, \XMM2, \XMM2
+                vpxor   \T1, \XMM3, \XMM3
+                vpxor   \T1, \XMM4, \XMM4
+                vpxor   \T1, \XMM5, \XMM5
+                vpxor   \T1, \XMM6, \XMM6
+                vpxor   \T1, \XMM7, \XMM7
+                vpxor   \T1, \XMM8, \XMM8
+
+        #######################################################################
+
+
+
+
+
+                vmovdqu 16*1(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+                vmovdqu 16*2(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+
+        #######################################################################
+
+        vmovdqa         HashKey_8(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
+        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
+        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
+        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
+        vpxor           \T5, \T6, \T6
+
+                vmovdqu 16*3(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+        vmovdqa         TMP2(%rsp), \T1
+        vmovdqa         HashKey_7(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpclmulqdq      $0x01, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+        vpclmulqdq      $0x10, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+                vmovdqu 16*4(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+        #######################################################################
+
+        vmovdqa         TMP3(%rsp), \T1
+        vmovdqa         HashKey_6(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpclmulqdq      $0x01, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+        vpclmulqdq      $0x10, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+                vmovdqu 16*5(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+        vmovdqa         TMP4(%rsp), \T1
+        vmovdqa         HashKey_5(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpclmulqdq      $0x01, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+        vpclmulqdq      $0x10, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+                vmovdqu 16*6(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+
+        vmovdqa         TMP5(%rsp), \T1
+        vmovdqa         HashKey_4(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpclmulqdq      $0x01, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+        vpclmulqdq      $0x10, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+                vmovdqu 16*7(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+        vmovdqa         TMP6(%rsp), \T1
+        vmovdqa         HashKey_3(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpclmulqdq      $0x01, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+        vpclmulqdq      $0x10, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+                vmovdqu 16*8(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
+
+        vmovdqa         TMP7(%rsp), \T1
+        vmovdqa         HashKey_2(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpclmulqdq      $0x01, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+        vpclmulqdq      $0x10, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+
+        #######################################################################
+
+                vmovdqu 16*9(arg1), \T5
+                vaesenc \T5, \XMM1, \XMM1
+                vaesenc \T5, \XMM2, \XMM2
+                vaesenc \T5, \XMM3, \XMM3
+                vaesenc \T5, \XMM4, \XMM4
+                vaesenc \T5, \XMM5, \XMM5
+                vaesenc \T5, \XMM6, \XMM6
+                vaesenc \T5, \XMM7, \XMM7
+                vaesenc \T5, \XMM8, \XMM8
+
+        vmovdqa         TMP8(%rsp), \T1
+        vmovdqa         HashKey(arg1), \T5
+
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
+
+        vpclmulqdq      $0x01, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+        vpclmulqdq      $0x10, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
+
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T1
+
+
+                vmovdqu 16*10(arg1), \T5
+
+	i = 0
+	j = 1
+	setreg
+.rep 8
+		vpxor	16*i(arg3, %r11), \T5, \T2
+                .if \ENC_DEC == ENC
+                vaesenclast     \T2, reg_j, reg_j
+                .else
+                vaesenclast     \T2, reg_j, \T3
+                vmovdqu 16*i(arg3, %r11), reg_j
+                vmovdqu \T3, 16*i(arg2, %r11)
+                .endif
+	i = (i+1)
+	j = (j+1)
+	setreg
+.endr
+	#######################################################################
+
+
+	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
+	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
+	vpxor	\T3, \T7, \T7
+	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
+
+
+
+	#######################################################################
+	#first phase of the reduction
+	vmovdqa         POLY2(%rip), \T3
+
+	vpclmulqdq	$0x01, \T7, \T3, \T2
+	vpslldq		$8, \T2, \T2			# shift-L xmm2 2 DWs
+
+	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
+	#######################################################################
+                .if \ENC_DEC == ENC
+		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
+                .endif
+
+	#######################################################################
+	#second phase of the reduction
+	vpclmulqdq	$0x00, \T7, \T3, \T2
+	vpsrldq		$4, \T2, \T2			# shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+	vpclmulqdq	$0x10, \T7, \T3, \T4
+	vpslldq		$4, \T4, \T4			# shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
+	#######################################################################
+	vpxor		\T4, \T1, \T1			# the result is in T1
+
+		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
+
+
+	vpxor	\T1, \XMM1, \XMM1
+
+
+
+.endm
+
+
+# GHASH the last 4 ciphertext blocks.
+.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
+
+        ## Karatsuba Method
+
+        vmovdqa         HashKey_8(arg1), \T5
+
+        vpshufd         $0b01001110, \XMM1, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM1, \T2, \T2
+        vpxor           \T5, \T3, \T3
+
+        vpclmulqdq      $0x11, \T5, \XMM1, \T6
+        vpclmulqdq      $0x00, \T5, \XMM1, \T7
+
+        vpclmulqdq      $0x00, \T3, \T2, \XMM1
+
+        ######################
+
+        vmovdqa         HashKey_7(arg1), \T5
+        vpshufd         $0b01001110, \XMM2, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM2, \T2, \T2
+        vpxor           \T5, \T3, \T3
+
+        vpclmulqdq      $0x11, \T5, \XMM2, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM2, \T4
+        vpxor           \T4, \T7, \T7
+
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+
+        vpxor           \T2, \XMM1, \XMM1
+
+        ######################
+
+        vmovdqa         HashKey_6(arg1), \T5
+        vpshufd         $0b01001110, \XMM3, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM3, \T2, \T2
+        vpxor           \T5, \T3, \T3
+
+        vpclmulqdq      $0x11, \T5, \XMM3, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM3, \T4
+        vpxor           \T4, \T7, \T7
+
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+
+        vpxor           \T2, \XMM1, \XMM1
+
+        ######################
+
+        vmovdqa         HashKey_5(arg1), \T5
+        vpshufd         $0b01001110, \XMM4, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM4, \T2, \T2
+        vpxor           \T5, \T3, \T3
+
+        vpclmulqdq      $0x11, \T5, \XMM4, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM4, \T4
+        vpxor           \T4, \T7, \T7
+
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+
+        vpxor           \T2, \XMM1, \XMM1
+
+        ######################
+
+        vmovdqa         HashKey_4(arg1), \T5
+        vpshufd         $0b01001110, \XMM5, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM5, \T2, \T2
+        vpxor           \T5, \T3, \T3
+
+        vpclmulqdq      $0x11, \T5, \XMM5, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM5, \T4
+        vpxor           \T4, \T7, \T7
+
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+
+        vpxor           \T2, \XMM1, \XMM1
+
+        ######################
+
+        vmovdqa         HashKey_3(arg1), \T5
+        vpshufd         $0b01001110, \XMM6, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM6, \T2, \T2
+        vpxor           \T5, \T3, \T3
+
+        vpclmulqdq      $0x11, \T5, \XMM6, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM6, \T4
+        vpxor           \T4, \T7, \T7
+
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+
+        vpxor           \T2, \XMM1, \XMM1
+
+        ######################
+
+        vmovdqa         HashKey_2(arg1), \T5
+        vpshufd         $0b01001110, \XMM7, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM7, \T2, \T2
+        vpxor           \T5, \T3, \T3
+
+        vpclmulqdq      $0x11, \T5, \XMM7, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM7, \T4
+        vpxor           \T4, \T7, \T7
+
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+
+        vpxor           \T2, \XMM1, \XMM1
+
+        ######################
+
+        vmovdqa         HashKey(arg1), \T5
+        vpshufd         $0b01001110, \XMM8, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM8, \T2, \T2
+        vpxor           \T5, \T3, \T3
+
+        vpclmulqdq      $0x11, \T5, \XMM8, \T4
+        vpxor           \T4, \T6, \T6
+
+        vpclmulqdq      $0x00, \T5, \XMM8, \T4
+        vpxor           \T4, \T7, \T7
+
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+
+        vpxor           \T2, \XMM1, \XMM1
+        vpxor           \T6, \XMM1, \XMM1
+        vpxor           \T7, \XMM1, \T2
+
+
+
+
+        vpslldq $8, \T2, \T4
+        vpsrldq $8, \T2, \T2
+
+        vpxor   \T4, \T7, \T7
+        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
+						   # accumulated carry-less multiplications
+
+        #######################################################################
+        #first phase of the reduction
+        vmovdqa         POLY2(%rip), \T3
+
+        vpclmulqdq      $0x01, \T7, \T3, \T2
+        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
+
+        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
+        #######################################################################
+
+
+        #second phase of the reduction
+        vpclmulqdq      $0x00, \T7, \T3, \T2
+        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+        vpclmulqdq      $0x10, \T7, \T3, \T4
+        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
+        #######################################################################
+        vpxor           \T4, \T6, \T6              # the result is in T6
+.endm
+
+
+
+# combined for GCM encrypt and decrypt functions
+# clobbering all xmm registers
+# clobbering r10, r11, r12, r13, r14, r15
+.macro  GCM_ENC_DEC_AVX2     ENC_DEC
+
+        #the number of pushes must equal STACK_OFFSET
+        push    %r12
+        push    %r13
+        push    %r14
+        push    %r15
+
+        mov     %rsp, %r14
+
+
+
+
+        sub     $VARIABLE_OFFSET, %rsp
+        and     $~63, %rsp                         # align rsp to 64 bytes
+
+
+        vmovdqu  HashKey(arg1), %xmm13             # xmm13 = HashKey
+
+        mov     arg4, %r13                         # save the number of bytes of plaintext/ciphertext
+        and     $-16, %r13                         # r13 = r13 - (r13 mod 16)
+
+        mov     %r13, %r12
+        shr     $4, %r12
+        and     $7, %r12
+        jz      _initial_num_blocks_is_0\@
+
+        cmp     $7, %r12
+        je      _initial_num_blocks_is_7\@
+        cmp     $6, %r12
+        je      _initial_num_blocks_is_6\@
+        cmp     $5, %r12
+        je      _initial_num_blocks_is_5\@
+        cmp     $4, %r12
+        je      _initial_num_blocks_is_4\@
+        cmp     $3, %r12
+        je      _initial_num_blocks_is_3\@
+        cmp     $2, %r12
+        je      _initial_num_blocks_is_2\@
+
+        jmp     _initial_num_blocks_is_1\@
+
+_initial_num_blocks_is_7\@:
+        INITIAL_BLOCKS_AVX2  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*7, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_6\@:
+        INITIAL_BLOCKS_AVX2  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*6, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_5\@:
+        INITIAL_BLOCKS_AVX2  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*5, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_4\@:
+        INITIAL_BLOCKS_AVX2  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*4, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_3\@:
+        INITIAL_BLOCKS_AVX2  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*3, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_2\@:
+        INITIAL_BLOCKS_AVX2  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*2, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_1\@:
+        INITIAL_BLOCKS_AVX2  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*1, %r13
+        jmp     _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_0\@:
+        INITIAL_BLOCKS_AVX2  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+
+
+_initial_blocks_encrypted\@:
+        cmp     $0, %r13
+        je      _zero_cipher_left\@
+
+        sub     $128, %r13
+        je      _eight_cipher_left\@
+
+
+
+
+        vmovd   %xmm9, %r15d
+        and     $255, %r15d
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+_encrypt_by_8_new\@:
+        cmp     $(255-8), %r15d
+        jg      _encrypt_by_8\@
+
+
+
+        add     $8, %r15b
+        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
+        add     $128, %r11
+        sub     $128, %r13
+        jne     _encrypt_by_8_new\@
+
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        jmp     _eight_cipher_left\@
+
+_encrypt_by_8\@:
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        add     $8, %r15b
+        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        add     $128, %r11
+        sub     $128, %r13
+        jne     _encrypt_by_8_new\@
+
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+
+
+_eight_cipher_left\@:
+        GHASH_LAST_8_AVX2    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
+
+
+_zero_cipher_left\@:
+        cmp     $16, arg4
+        jl      _only_less_than_16\@
+
+        mov     arg4, %r13
+        and     $15, %r13                            # r13 = (arg4 mod 16)
+
+        je      _multiple_of_16_bytes\@
+
+        # handle the last <16 Byte block seperately
+
+
+        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
+
+        sub     $16, %r11
+        add     %r13, %r11
+        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
+
+        lea     SHIFT_MASK+16(%rip), %r12
+        sub     %r13, %r12                           # adjust the shuffle mask pointer
+						     # to be able to shift 16-r13 bytes
+						     # (r13 is the number of bytes in plaintext mod 16)
+        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
+        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
+        jmp     _final_ghash_mul\@
+
+_only_less_than_16\@:
+        # check for 0 length
+        mov     arg4, %r13
+        and     $15, %r13                            # r13 = (arg4 mod 16)
+
+        je      _multiple_of_16_bytes\@
+
+        # handle the last <16 Byte block seperately
+
+
+        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
+
+
+        lea     SHIFT_MASK+16(%rip), %r12
+        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
+						     # able to shift 16-r13 bytes (r13 is the
+						     # number of bytes in plaintext mod 16)
+
+_get_last_16_byte_loop\@:
+        movb    (arg3, %r11),  %al
+        movb    %al,  TMP1 (%rsp , %r11)
+        add     $1, %r11
+        cmp     %r13,  %r11
+        jne     _get_last_16_byte_loop\@
+
+        vmovdqu  TMP1(%rsp), %xmm1
+
+        sub     $16, %r11
+
+_final_ghash_mul\@:
+        .if  \ENC_DEC ==  DEC
+        vmovdqa %xmm1, %xmm2
+        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
+        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
+        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
+        vpand   %xmm1, %xmm2, %xmm2
+        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
+        vpxor   %xmm2, %xmm14, %xmm14
+	#GHASH computation for the last <16 Byte block
+        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+        sub     %r13, %r11
+        add     $16, %r11
+        .else
+        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
+        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
+        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        vpxor   %xmm9, %xmm14, %xmm14
+	#GHASH computation for the last <16 Byte block
+        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+        sub     %r13, %r11
+        add     $16, %r11
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
+        .endif
+
+
+        #############################
+        # output r13 Bytes
+        vmovq   %xmm9, %rax
+        cmp     $8, %r13
+        jle     _less_than_8_bytes_left\@
+
+        mov     %rax, (arg2 , %r11)
+        add     $8, %r11
+        vpsrldq $8, %xmm9, %xmm9
+        vmovq   %xmm9, %rax
+        sub     $8, %r13
+
+_less_than_8_bytes_left\@:
+        movb    %al, (arg2 , %r11)
+        add     $1, %r11
+        shr     $8, %rax
+        sub     $1, %r13
+        jne     _less_than_8_bytes_left\@
+        #############################
+
+_multiple_of_16_bytes\@:
+        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
+        shl     $3, %r12                             # convert into number of bits
+        vmovd   %r12d, %xmm15                        # len(A) in xmm15
+
+        shl     $3, arg4                             # len(C) in bits  (*128)
+        vmovq   arg4, %xmm1
+        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
+        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
+
+        vpxor   %xmm15, %xmm14, %xmm14
+        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
+        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14              # perform a 16Byte swap
+
+        mov     arg5, %rax                           # rax = *Y0
+        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
+
+        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
+
+        vpxor   %xmm14, %xmm9, %xmm9
+
+
+
+_return_T\@:
+        mov     arg8, %r10              # r10 = authTag
+        mov     arg9, %r11              # r11 = auth_tag_len
+
+        cmp     $16, %r11
+        je      _T_16\@
+
+        cmp     $12, %r11
+        je      _T_12\@
+
+_T_8\@:
+        vmovq   %xmm9, %rax
+        mov     %rax, (%r10)
+        jmp     _return_T_done\@
+_T_12\@:
+        vmovq   %xmm9, %rax
+        mov     %rax, (%r10)
+        vpsrldq $8, %xmm9, %xmm9
+        vmovd   %xmm9, %eax
+        mov     %eax, 8(%r10)
+        jmp     _return_T_done\@
+
+_T_16\@:
+        vmovdqu %xmm9, (%r10)
+
+_return_T_done\@:
+        mov     %r14, %rsp
+
+        pop     %r15
+        pop     %r14
+        pop     %r13
+        pop     %r12
+.endm
+
+
+#############################################################
+#void   aesni_gcm_precomp_avx_gen4
+#        (gcm_data     *my_ctx_data,
+#        u8     *hash_subkey)# /* H, the Hash sub key input.
+#				Data starts on a 16-byte boundary. */
+#############################################################
+ENTRY(aesni_gcm_precomp_avx_gen4)
+        #the number of pushes must equal STACK_OFFSET
+        push    %r12
+        push    %r13
+        push    %r14
+        push    %r15
+
+        mov     %rsp, %r14
+
+
+
+        sub     $VARIABLE_OFFSET, %rsp
+        and     $~63, %rsp                    # align rsp to 64 bytes
+
+        vmovdqu  (arg2), %xmm6                # xmm6 = HashKey
+
+        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
+        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
+        vmovdqa  %xmm6, %xmm2
+        vpsllq   $1, %xmm6, %xmm6
+        vpsrlq   $63, %xmm2, %xmm2
+        vmovdqa  %xmm2, %xmm1
+        vpslldq  $8, %xmm2, %xmm2
+        vpsrldq  $8, %xmm1, %xmm1
+        vpor     %xmm2, %xmm6, %xmm6
+        #reduction
+        vpshufd  $0b00100100, %xmm1, %xmm2
+        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
+        vpand    POLY(%rip), %xmm2, %xmm2
+        vpxor    %xmm2, %xmm6, %xmm6          # xmm6 holds the HashKey<<1 mod poly
+        #######################################################################
+        vmovdqa  %xmm6, HashKey(arg1)         # store HashKey<<1 mod poly
+
+
+        PRECOMPUTE_AVX2  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+
+        mov     %r14, %rsp
+
+        pop     %r15
+        pop     %r14
+        pop     %r13
+        pop     %r12
+        ret
+ENDPROC(aesni_gcm_precomp_avx_gen4)
+
+
+###############################################################################
+#void   aesni_gcm_enc_avx_gen4(
+#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
+#        const   u8 *in, /* Plaintext input */
+#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
+#        u8      *iv, /* Pre-counter block j0: 4 byte salt
+#			(from Security Association) concatenated with 8 byte
+#			 Initialisation Vector (from IPSec ESP Payload)
+#			 concatenated with 0x00000001. 16-byte aligned pointer. */
+#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
+#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+#        u8      *auth_tag, /* Authenticated Tag output. */
+#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
+#				Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_enc_avx_gen4)
+        GCM_ENC_DEC_AVX2     ENC
+	ret
+ENDPROC(aesni_gcm_enc_avx_gen4)
+
+###############################################################################
+#void   aesni_gcm_dec_avx_gen4(
+#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
+#        const   u8 *in, /* Ciphertext input */
+#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
+#        u8      *iv, /* Pre-counter block j0: 4 byte salt
+#			(from Security Association) concatenated with 8 byte
+#			Initialisation Vector (from IPSec ESP Payload)
+#			concatenated with 0x00000001. 16-byte aligned pointer. */
+#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
+#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+#        u8      *auth_tag, /* Authenticated Tag output. */
+#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
+#				Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_dec_avx_gen4)
+        GCM_ENC_DEC_AVX2     DEC
+	ret
+ENDPROC(aesni_gcm_dec_avx_gen4)
+
+#endif /* CONFIG_AS_AVX2 */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 835488b745e..948ad0e7774 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -101,6 +101,9 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 int crypto_fpu_init(void);
 void crypto_fpu_exit(void);
 
+#define AVX_GEN2_OPTSIZE 640
+#define AVX_GEN4_OPTSIZE 4096
+
 #ifdef CONFIG_X86_64
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
@@ -150,6 +153,123 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
 			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len);
 
+
+#ifdef CONFIG_AS_AVX
+/*
+ * asmlinkage void aesni_gcm_precomp_avx_gen2()
+ * gcm_data *my_ctx_data, context data
+ * u8 *hash_subkey,  the Hash sub key input. Data starts on a 16-byte boundary.
+ */
+asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
+
+asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+static void aesni_gcm_enc_avx(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len)
+{
+	if (plaintext_len < AVX_GEN2_OPTSIZE) {
+		aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
+				aad_len, auth_tag, auth_tag_len);
+	} else {
+		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+		aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
+					aad_len, auth_tag, auth_tag_len);
+	}
+}
+
+static void aesni_gcm_dec_avx(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len)
+{
+	if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+		aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad,
+				aad_len, auth_tag, auth_tag_len);
+	} else {
+		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+		aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
+					aad_len, auth_tag, auth_tag_len);
+	}
+}
+#endif
+
+#ifdef CONFIG_AS_AVX2
+/*
+ * asmlinkage void aesni_gcm_precomp_avx_gen4()
+ * gcm_data *my_ctx_data, context data
+ * u8 *hash_subkey,  the Hash sub key input. Data starts on a 16-byte boundary.
+ */
+asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
+
+asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len)
+{
+	if (plaintext_len < AVX_GEN2_OPTSIZE) {
+		aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
+				aad_len, auth_tag, auth_tag_len);
+	} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
+		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+		aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
+					aad_len, auth_tag, auth_tag_len);
+	} else {
+		aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
+		aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad,
+					aad_len, auth_tag, auth_tag_len);
+	}
+}
+
+static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len)
+{
+	if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+		aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey,
+				aad, aad_len, auth_tag, auth_tag_len);
+	} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
+		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+		aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
+					aad_len, auth_tag, auth_tag_len);
+	} else {
+		aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
+		aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad,
+					aad_len, auth_tag, auth_tag_len);
+	}
+}
+#endif
+
+static void (*aesni_gcm_enc_tfm)(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
 static inline struct
 aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
 {
@@ -915,7 +1035,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
 		dst = src;
 	}
 
-	aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
+	aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
 		ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
 		+ ((unsigned long)req->cryptlen), auth_tag_len);
 
@@ -996,12 +1116,12 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 		dst = src;
 	}
 
-	aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
+	aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv,
 		ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
 		authTag, auth_tag_len);
 
 	/* Compare generated tag with passed in tag. */
-	retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
+	retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ?
 		-EBADMSG : 0;
 
 	if (one_entry_in_sg) {
@@ -1353,6 +1473,27 @@ static int __init aesni_init(void)
 
 	if (!x86_match_cpu(aesni_cpu_id))
 		return -ENODEV;
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_AS_AVX2
+	if (boot_cpu_has(X86_FEATURE_AVX2)) {
+		pr_info("AVX2 version of gcm_enc/dec engaged.\n");
+		aesni_gcm_enc_tfm = aesni_gcm_enc_avx2;
+		aesni_gcm_dec_tfm = aesni_gcm_dec_avx2;
+	} else
+#endif
+#ifdef CONFIG_AS_AVX
+	if (boot_cpu_has(X86_FEATURE_AVX)) {
+		pr_info("AVX version of gcm_enc/dec engaged.\n");
+		aesni_gcm_enc_tfm = aesni_gcm_enc_avx;
+		aesni_gcm_dec_tfm = aesni_gcm_dec_avx;
+	} else
+#endif
+	{
+		pr_info("SSE version of gcm_enc/dec engaged.\n");
+		aesni_gcm_enc_tfm = aesni_gcm_enc;
+		aesni_gcm_dec_tfm = aesni_gcm_dec;
+	}
+#endif
 
 	err = crypto_fpu_init();
 	if (err)
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index fd8f9e2ca35..535192f6bfa 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -13,7 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len)
 }
 
 /* Use early IO mappings for DMI because it's initialized early */
-#define dmi_ioremap early_ioremap
-#define dmi_iounmap early_iounmap
+#define dmi_early_remap		early_ioremap
+#define dmi_early_unmap		early_iounmap
+#define dmi_remap		ioremap
+#define dmi_unmap		iounmap
 
 #endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index e846225265e..7252cd33917 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -175,64 +175,7 @@ static inline void __set_fixmap(enum fixed_addresses idx,
 }
 #endif
 
-#define set_fixmap(idx, phys)				\
-	__set_fixmap(idx, phys, PAGE_KERNEL)
-
-/*
- * Some hardware wants to get fixmapped without caching.
- */
-#define set_fixmap_nocache(idx, phys)			\
-	__set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
-
-#define clear_fixmap(idx)			\
-	__set_fixmap(idx, 0, __pgprot(0))
-
-#define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x)	((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without translation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
-{
-	/*
-	 * this branch gets completely eliminated after inlining,
-	 * except when someone tries to use fixaddr indices in an
-	 * illegal way. (such as mixing up address types or using
-	 * out-of-range indices).
-	 *
-	 * If it doesn't get removed, the linker will complain
-	 * loudly with a reasonably clear error message..
-	 */
-	if (idx >= __end_of_fixed_addresses)
-		__this_fixmap_does_not_exist();
-
-	return __fix_to_virt(idx);
-}
-
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
-	BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
-	return __virt_to_fix(vaddr);
-}
-
-/* Return an pointer with offset calculated */
-static __always_inline unsigned long
-__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
-{
-	__set_fixmap(idx, phys, flags);
-	return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
-}
-
-#define set_fixmap_offset(idx, phys)			\
-	__set_fixmap_offset(idx, phys, PAGE_KERNEL)
-
-#define set_fixmap_offset_nocache(idx, phys)			\
-	__set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
+#include <asm-generic/fixmap.h>
 
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/hash.h b/arch/x86/include/asm/hash.h
new file mode 100644
index 00000000000..e8c58f88b1d
--- /dev/null
+++ b/arch/x86/include/asm/hash.h
@@ -0,0 +1,7 @@
+#ifndef _ASM_X86_HASH_H
+#define _ASM_X86_HASH_H
+
+struct fast_hash_ops;
+extern void setup_arch_fast_hash(struct fast_hash_ops *ops);
+
+#endif /* _ASM_X86_HASH_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ae5d7830855..fdf83afbb7d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -605,6 +605,7 @@ struct kvm_arch {
 	/* fields used by HYPER-V emulation */
 	u64 hv_guest_os_id;
 	u64 hv_hypercall;
+	u64 hv_tsc_page;
 
 	#ifdef CONFIG_KVM_MMU_AUDIT
 	int audit_point;
@@ -699,6 +700,8 @@ struct kvm_x86_ops {
 	void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 	void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 	void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	u64 (*get_dr6)(struct kvm_vcpu *vcpu);
+	void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
 	void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
 	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 1df11590975..c7678e43465 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -85,28 +85,9 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
 	return ret;
 }
 
-static inline uint32_t kvm_cpuid_base(void)
-{
-	if (boot_cpu_data.cpuid_level < 0)
-		return 0;	/* So we don't blow up on old processors */
-
-	if (cpu_has_hypervisor)
-		return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
-
-	return 0;
-}
-
-static inline bool kvm_para_available(void)
-{
-	return kvm_cpuid_base() != 0;
-}
-
-static inline unsigned int kvm_arch_para_features(void)
-{
-	return cpuid_eax(KVM_CPUID_FEATURES);
-}
-
 #ifdef CONFIG_KVM_GUEST
+bool kvm_para_available(void);
+unsigned int kvm_arch_para_features(void);
 void __init kvm_guest_init(void);
 void kvm_async_pf_task_wait(u32 token);
 void kvm_async_pf_task_wake(u32 token);
@@ -126,6 +107,16 @@ static inline void kvm_spinlock_init(void)
 #define kvm_async_pf_task_wait(T) do {} while(0)
 #define kvm_async_pf_task_wake(T) do {} while(0)
 
+static inline bool kvm_para_available(void)
+{
+	return 0;
+}
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+	return 0;
+}
+
 static inline u32 kvm_read_and_reset_pf_reason(void)
 {
 	return 0;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 401f350ef71..cd6e1610e29 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -781,9 +781,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
  */
 #define PV_CALLEE_SAVE_REGS_THUNK(func)					\
 	extern typeof(func) __raw_callee_save_##func;			\
-	static void *__##func##__ __used = func;			\
 									\
 	asm(".pushsection .text;"					\
+	    ".globl __raw_callee_save_" #func " ; "			\
 	    "__raw_callee_save_" #func ": "				\
 	    PV_SAVE_ALL_CALLER_REGS					\
 	    "call " #func ";"						\
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index aab8f671b52..7549b8b369e 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -388,10 +388,11 @@ extern struct pv_lock_ops pv_lock_ops;
 	_paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
 
 /* Simple instruction patching code. */
-#define DEF_NATIVE(ops, name, code) 					\
-	extern const char start_##ops##_##name[] __visible,		\
-			  end_##ops##_##name[] __visible;		\
-	asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
+#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
+
+#define DEF_NATIVE(ops, name, code)					\
+	__visible extern const char start_##ops##_##name[], end_##ops##_##name[];	\
+	asm(NATIVE_LABEL("start_", ops, name) code NATIVE_LABEL("end_", ops, name))
 
 unsigned paravirt_patch_nop(void);
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 947b5c417e8..1ac6114c9ea 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -104,7 +104,7 @@ extern void pci_iommu_alloc(void);
 struct msi_desc;
 int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void native_teardown_msi_irq(unsigned int irq);
-void native_restore_msi_irqs(struct pci_dev *dev, int irq);
+void native_restore_msi_irqs(struct pci_dev *dev);
 int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
 		  unsigned int irq_base, unsigned int irq_offset);
 #else
@@ -125,7 +125,6 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
 
 /* generic pci stuff */
 #include <asm-generic/pci.h>
-#define PCIBIOS_MAX_MEM_32 0xffffffff
 
 #ifdef CONFIG_NUMA
 /* Returns the node based on pci bus */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index a83aa44bb1f..1aa9ccd4322 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -121,7 +121,8 @@
 
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |		\
-			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY |	\
+			 _PAGE_SOFT_DIRTY)
 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
 #define _PAGE_CACHE_MASK	(_PAGE_PCD | _PAGE_PWT)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 3ba3de457d0..e1940c06ed0 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -163,9 +163,11 @@ struct thread_info {
  */
 #ifndef __ASSEMBLY__
 
-
-/* how to get the current stack pointer from C */
-register unsigned long current_stack_pointer asm("esp") __used;
+#define current_stack_pointer ({		\
+	unsigned long sp;			\
+	asm("mov %%esp,%0" : "=g" (sp));	\
+	sp;					\
+})
 
 /* how to get the thread information struct from C */
 static inline struct thread_info *current_thread_info(void)
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 6b964a0b86d..062921ef34e 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -12,7 +12,6 @@ extern enum uv_system_type get_uv_system_type(void);
 extern int is_uv_system(void);
 extern void uv_cpu_init(void);
 extern void uv_nmi_init(void);
-extern void uv_register_nmi_notifier(void);
 extern void uv_system_init(void);
 extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 						 struct mm_struct *mm,
@@ -26,7 +25,6 @@ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
 static inline int is_uv_system(void)	{ return 0; }
 static inline void uv_cpu_init(void)	{ }
 static inline void uv_system_init(void)	{ }
-static inline void uv_register_nmi_notifier(void) { }
 static inline const struct cpumask *
 uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
 		    unsigned long start, unsigned long end, unsigned int cpu)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 966502d4682..2067264fb7f 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -100,6 +100,7 @@
 
 #define VMX_MISC_PREEMPTION_TIMER_RATE_MASK	0x0000001f
 #define VMX_MISC_SAVE_EFER_LMA			0x00000020
+#define VMX_MISC_ACTIVITY_HLT			0x00000040
 
 /* VMCS Encodings */
 enum vmcs_field {
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 0f1be11e43d..e45e4da96bf 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -181,7 +181,7 @@ struct x86_msi_ops {
 			       u8 hpet_id);
 	void (*teardown_msi_irq)(unsigned int irq);
 	void (*teardown_msi_irqs)(struct pci_dev *dev);
-	void (*restore_msi_irqs)(struct pci_dev *dev, int irq);
+	void (*restore_msi_irqs)(struct pci_dev *dev);
 	int  (*setup_hpet_msi)(unsigned int irq, unsigned int id);
 	u32 (*msi_mask_irq)(struct msi_desc *desc, u32 mask, u32 flag);
 	u32 (*msix_mask_irq)(struct msi_desc *desc, u32 flag);
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index b913915e8e6..787e1bb5aaf 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -52,7 +52,8 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s,
 extern int m2p_add_override(unsigned long mfn, struct page *page,
 			    struct gnttab_map_grant_ref *kmap_op);
 extern int m2p_remove_override(struct page *page,
-				struct gnttab_map_grant_ref *kmap_op);
+			       struct gnttab_map_grant_ref *kmap_op,
+			       unsigned long mfn);
 extern struct page *m2p_find_override(unsigned long mfn);
 extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
 
@@ -121,7 +122,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
 		pfn = m2p_find_override_pfn(mfn, ~0);
 	}
 
-	/* 
+	/*
 	 * pfn is ~0 if there are no entries in the m2p for mfn or if the
 	 * entry doesn't map back to the mfn and m2p_override doesn't have a
 	 * valid entry for it.
@@ -167,7 +168,12 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
  */
 static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
 {
-	unsigned long pfn = mfn_to_pfn(mfn);
+	unsigned long pfn;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return mfn;
+
+	pfn = mfn_to_pfn(mfn);
 	if (get_phys_to_machine(pfn) != mfn)
 		return -1; /* force !pfn_valid() */
 	return pfn;
@@ -222,5 +228,6 @@ void make_lowmem_page_readonly(void *vaddr);
 void make_lowmem_page_readwrite(void *vaddr);
 
 #define xen_remap(cookie, size) ioremap((cookie), (size));
+#define xen_unmap(cookie) iounmap((cookie))
 
 #endif /* _ASM_X86_XEN_PAGE_H */
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index b8f1c0176cb..462efe746d7 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -28,6 +28,9 @@
 /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
 #define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE	(1 << 1)
 
+/* A partition's reference time stamp counter (TSC) page */
+#define HV_X64_MSR_REFERENCE_TSC		0x40000021
+
 /*
  * There is a single feature flag that signifies the presence of the MSR
  * that can be used to retrieve both the local APIC Timer frequency as
@@ -198,6 +201,9 @@
 #define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK	\
 		(~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
 
+#define HV_X64_MSR_TSC_REFERENCE_ENABLE		0x00000001
+#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT	12
+
 #define HV_PROCESSOR_POWER_STATE_C0		0
 #define HV_PROCESSOR_POWER_STATE_C1		1
 #define HV_PROCESSOR_POWER_STATE_C2		2
@@ -210,4 +216,11 @@
 #define HV_STATUS_INVALID_ALIGNMENT		4
 #define HV_STATUS_INSUFFICIENT_BUFFERS		19
 
+typedef struct _HV_REFERENCE_TSC_PAGE {
+	__u32 tsc_sequence;
+	__u32 res1;
+	__u64 tsc_scale;
+	__s64 tsc_offset;
+} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
+
 #endif
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 59cea185ad1..c19fc60ff06 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -528,6 +528,7 @@
 #define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
 #define MSR_IA32_VMX_TRUE_EXIT_CTLS      0x0000048f
 #define MSR_IA32_VMX_TRUE_ENTRY_CTLS     0x00000490
+#define MSR_IA32_VMX_VMFUNC             0x00000491
 
 /* VMX_BASIC bits and bitmasks */
 #define VMX_BASIC_VMCS_SIZE_SHIFT	32
diff --git a/arch/x86/include/uapi/asm/sembuf.h b/arch/x86/include/uapi/asm/sembuf.h
index ee50c801f7b..cc2d6a3aeae 100644
--- a/arch/x86/include/uapi/asm/sembuf.h
+++ b/arch/x86/include/uapi/asm/sembuf.h
@@ -13,12 +13,12 @@
 struct semid64_ds {
 	struct ipc64_perm sem_perm;	/* permissions .. see ipc.h */
 	__kernel_time_t	sem_otime;	/* last semop time */
-	unsigned long	__unused1;
+	__kernel_ulong_t __unused1;
 	__kernel_time_t	sem_ctime;	/* last change time */
-	unsigned long	__unused2;
-	unsigned long	sem_nsems;	/* no. of semaphores in array */
-	unsigned long	__unused3;
-	unsigned long	__unused4;
+	__kernel_ulong_t __unused2;
+	__kernel_ulong_t sem_nsems;	/* no. of semaphores in array */
+	__kernel_ulong_t __unused3;
+	__kernel_ulong_t __unused4;
 };
 
 #endif /* _ASM_X86_SEMBUF_H */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6c0b43bd024..1dac94265b5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -46,7 +46,6 @@
 
 #include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
 static int __initdata acpi_force = 0;
-u32 acpi_rsdt_forced;
 int acpi_disabled;
 EXPORT_SYMBOL(acpi_disabled);
 
@@ -1034,9 +1033,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
 
 	if (!acpi_ioapic)
 		return 0;
-	if (!dev)
-		return 0;
-	if (dev->bus != &pci_bus_type)
+	if (!dev || !dev_is_pci(dev))
 		return 0;
 
 	pdev = to_pci_dev(dev);
@@ -1564,7 +1561,7 @@ static int __init parse_acpi(char *arg)
 	}
 	/* acpi=rsdt use RSDT instead of XSDT */
 	else if (strcmp(arg, "rsdt") == 0) {
-		acpi_rsdt_forced = 1;
+		acpi_gbl_do_not_use_xsdt = TRUE;
 	}
 	/* "acpi=noirq" disables ACPI interrupt routing */
 	else if (strcmp(arg, "noirq") == 0) {
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 5d5b9eb2b7a..2c621a6b901 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -20,9 +20,7 @@
 #include <asm/apic.h>
 #include <asm/ipi.h>
 
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
+#include <linux/acpi.h>
 
 static struct apic apic_physflat;
 static struct apic apic_flat;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a43f068ebec..6ad4658de70 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -37,9 +37,6 @@
 #include <linux/kthread.h>
 #include <linux/jiffies.h>	/* time_after() */
 #include <linux/slab.h>
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
 #include <linux/bootmem.h>
 #include <linux/dmar.h>
 #include <linux/hpet.h>
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index ad0dc0428ba..d263b1307de 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -980,7 +980,6 @@ void __init uv_system_init(void)
 	uv_nmi_setup();
 	uv_cpu_init();
 	uv_scir_register_cpu_notifier();
-	uv_register_nmi_notifier();
 	proc_mkdir("sgi_uv", NULL);
 
 	/* register Legacy VGA I/O redirection handler */
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index e2dbcb7dabd..83a7995625a 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void)
 
 	corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
 
-	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+	for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
 		start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
 				PAGE_SIZE, corruption_check_size);
 		end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 4a6ff747aaa..8fffd845e22 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -433,7 +433,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
 	if (c->x86 >= 0x15)
 		snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
 
-	if (request_firmware(&fw, (const char *)fw_name, device)) {
+	if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
 		pr_debug("failed to load file %s\n", fw_name);
 		goto out;
 	}
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 5fb2cebf556..a276fa75d9b 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -278,7 +278,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
 	sprintf(name, "intel-ucode/%02x-%02x-%02x",
 		c->x86, c->x86_model, c->x86_mask);
 
-	if (request_firmware(&firmware, name, device)) {
+	if (request_firmware_direct(&firmware, name, device)) {
 		pr_debug("data file %s load failed\n", name);
 		return UCODE_NFOUND;
 	}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 174da5fc5a7..988c00a1f60 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1120,7 +1120,7 @@ void __init memblock_find_dma_reserve(void)
 		nr_pages += end_pfn - start_pfn;
 	}
 
-	for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
+	for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
 		start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
 		end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
 		if (start_pfn < end_pfn)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 6dd802c6d78..713f1b3bad5 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -500,6 +500,38 @@ void __init kvm_guest_init(void)
 #endif
 }
 
+static noinline uint32_t __kvm_cpuid_base(void)
+{
+	if (boot_cpu_data.cpuid_level < 0)
+		return 0;	/* So we don't blow up on old processors */
+
+	if (cpu_has_hypervisor)
+		return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
+
+	return 0;
+}
+
+static inline uint32_t kvm_cpuid_base(void)
+{
+	static int kvm_cpuid_base = -1;
+
+	if (kvm_cpuid_base == -1)
+		kvm_cpuid_base = __kvm_cpuid_base();
+
+	return kvm_cpuid_base;
+}
+
+bool kvm_para_available(void)
+{
+	return kvm_cpuid_base() != 0;
+}
+EXPORT_SYMBOL_GPL(kvm_para_available);
+
+unsigned int kvm_arch_para_features(void)
+{
+	return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
+}
+
 static uint32_t __init kvm_detect(void)
 {
 	return kvm_cpuid_base();
@@ -673,7 +705,7 @@ static cpumask_t waiting_cpus;
 /* Track spinlock on which a cpu is waiting */
 static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting);
 
-static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
+__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 {
 	struct kvm_lock_waiting *w;
 	int cpu;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a3acbac2ee7..19e5adb49a2 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -180,7 +180,7 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
 
 static void cyc2ns_data_init(struct cyc2ns_data *data)
 {
-	data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR;
+	data->cyc2ns_mul = 0;
 	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
 	data->cyc2ns_offset = 0;
 	data->__count = 0;
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 992f890283e..f6584a90aba 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -33,7 +33,7 @@
  * and vice versa.
  */
 
-static unsigned long vsmp_save_fl(void)
+asmlinkage unsigned long vsmp_save_fl(void)
 {
 	unsigned long flags = native_save_fl();
 
@@ -43,7 +43,7 @@ static unsigned long vsmp_save_fl(void)
 }
 PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
 
-static void vsmp_restore_fl(unsigned long flags)
+__visible void vsmp_restore_fl(unsigned long flags)
 {
 	if (flags & X86_EFLAGS_IF)
 		flags &= ~X86_EFLAGS_AC;
@@ -53,7 +53,7 @@ static void vsmp_restore_fl(unsigned long flags)
 }
 PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
 
-static void vsmp_irq_disable(void)
+asmlinkage void vsmp_irq_disable(void)
 {
 	unsigned long flags = native_save_fl();
 
@@ -61,7 +61,7 @@ static void vsmp_irq_disable(void)
 }
 PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
 
-static void vsmp_irq_enable(void)
+asmlinkage void vsmp_irq_enable(void)
 {
 	unsigned long flags = native_save_fl();
 
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 021783b1f46..e48b674639c 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -136,9 +136,9 @@ void arch_teardown_msi_irq(unsigned int irq)
 	x86_msi.teardown_msi_irq(irq);
 }
 
-void arch_restore_msi_irqs(struct pci_dev *dev, int irq)
+void arch_restore_msi_irqs(struct pci_dev *dev)
 {
-	x86_msi.restore_msi_irqs(dev, irq);
+	x86_msi.restore_msi_irqs(dev);
 }
 u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
 {
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b89c5db2b83..287e4c85fff 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -80,7 +80,7 @@ config KVM_MMU_AUDIT
 	depends on KVM && TRACEPOINTS
 	---help---
 	 This option adds a R/W kVM module parameter 'mmu_audit', which allows
-	 audit  KVM MMU at runtime.
+	 auditing of KVM MMU events at runtime.
 
 config KVM_DEVICE_ASSIGNMENT
 	bool "KVM legacy PCI device assignment support"
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index f1e4895174b..a2a1bb7ed8c 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -72,4 +72,12 @@ static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu)
 	return best && (best->ecx & bit(X86_FEATURE_PCID));
 }
 
+static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	return best && (best->ecx & bit(X86_FEATURE_X2APIC));
+}
+
 #endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 412a5aa0ef9..518d86471b7 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -37,6 +37,7 @@
 
 #include "irq.h"
 #include "i8254.h"
+#include "x86.h"
 
 #ifndef CONFIG_X86_64
 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -349,6 +350,23 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 	atomic_set(&ps->pending, 0);
 	ps->irq_ack = 1;
 
+	/*
+	 * Do not allow the guest to program periodic timers with small
+	 * interval, since the hrtimers are not throttled by the host
+	 * scheduler.
+	 */
+	if (ps->is_periodic) {
+		s64 min_period = min_timer_period_us * 1000LL;
+
+		if (ps->period < min_period) {
+			pr_info_ratelimited(
+			    "kvm: requested %lld ns "
+			    "i8254 timer period limited to %lld ns\n",
+			    ps->period, min_period);
+			ps->period = min_period;
+		}
+	}
+
 	hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
 		      HRTIMER_MODE_ABS);
 }
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 775702f649c..9736529ade0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -71,9 +71,6 @@
 #define VEC_POS(v) ((v) & (32 - 1))
 #define REG_POS(v) (((v) >> 5) << 4)
 
-static unsigned int min_timer_period_us = 500;
-module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
-
 static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
 {
 	*((u32 *) (apic->regs + reg_off)) = val;
@@ -435,7 +432,7 @@ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
 	u8 val;
 	if (pv_eoi_get_user(vcpu, &val) < 0)
 		apic_debug("Can't read EOI MSR value: 0x%llx\n",
-			   (unsigned long long)vcpi->arch.pv_eoi.msr_val);
+			   (unsigned long long)vcpu->arch.pv_eoi.msr_val);
 	return val & 0x1;
 }
 
@@ -443,7 +440,7 @@ static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
 {
 	if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
 		apic_debug("Can't set EOI MSR value: 0x%llx\n",
-			   (unsigned long long)vcpi->arch.pv_eoi.msr_val);
+			   (unsigned long long)vcpu->arch.pv_eoi.msr_val);
 		return;
 	}
 	__set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
@@ -453,7 +450,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
 {
 	if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
 		apic_debug("Can't clear EOI MSR value: 0x%llx\n",
-			   (unsigned long long)vcpi->arch.pv_eoi.msr_val);
+			   (unsigned long long)vcpu->arch.pv_eoi.msr_val);
 		return;
 	}
 	__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c8b0d0d2da5..6a11845fd8b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -65,7 +65,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 		struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
+int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
 void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 		struct kvm_lapic_state *s);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 40772ef0f2b..e50425d0f5f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2659,6 +2659,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 	int emulate = 0;
 	gfn_t pseudo_gfn;
 
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return 0;
+
 	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
 		if (iterator.level == level) {
 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
@@ -2829,6 +2832,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 	bool ret = false;
 	u64 spte = 0ull;
 
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return false;
+
 	if (!page_fault_can_be_fast(error_code))
 		return false;
 
@@ -3224,6 +3230,9 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
 	struct kvm_shadow_walk_iterator iterator;
 	u64 spte = 0ull;
 
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return spte;
+
 	walk_shadow_page_lockless_begin(vcpu);
 	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
 		if (!is_shadow_present_pte(spte))
@@ -4510,6 +4519,9 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 	u64 spte;
 	int nr_sptes = 0;
 
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return nr_sptes;
+
 	walk_shadow_page_lockless_begin(vcpu);
 	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
 		sptes[iterator.level-1] = spte;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ad75d77999d..cba218a2f08 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -569,6 +569,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (FNAME(gpte_changed)(vcpu, gw, top_level))
 		goto out_gpte_changed;
 
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		goto out_gpte_changed;
+
 	for (shadow_walk_init(&it, vcpu, addr);
 	     shadow_walk_okay(&it) && it.level > gw->level;
 	     shadow_walk_next(&it)) {
@@ -820,6 +823,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 	 */
 	mmu_topup_memory_caches(vcpu);
 
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+		WARN_ON(1);
+		return;
+	}
+
 	spin_lock(&vcpu->kvm->mmu_lock);
 	for_each_shadow_entry(vcpu, gva, iterator) {
 		level = iterator.level;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c7168a5cff1..e81df8fce02 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1671,6 +1671,19 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
 	mark_dirty(svm->vmcb, VMCB_ASID);
 }
 
+static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
+{
+	return to_svm(vcpu)->vmcb->save.dr6;
+}
+
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->save.dr6 = value;
+	mark_dirty(svm->vmcb, VMCB_DR);
+}
+
 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4286,6 +4299,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_idt = svm_set_idt,
 	.get_gdt = svm_get_gdt,
 	.set_gdt = svm_set_gdt,
+	.get_dr6 = svm_get_dr6,
+	.set_dr6 = svm_set_dr6,
 	.set_dr7 = svm_set_dr7,
 	.cache_reg = svm_cache_reg,
 	.get_rflags = svm_get_rflags,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index da7837e1349..a06f101ef64 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -418,6 +418,8 @@ struct vcpu_vmx {
 	u64 		      msr_host_kernel_gs_base;
 	u64 		      msr_guest_kernel_gs_base;
 #endif
+	u32 vm_entry_controls_shadow;
+	u32 vm_exit_controls_shadow;
 	/*
 	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
 	 * non-nested (L1) guest, it always points to vmcs01. For a nested
@@ -1056,7 +1058,9 @@ static inline bool is_exception(u32 intr_info)
 		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
 }
 
-static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+			      u32 exit_intr_info,
+			      unsigned long exit_qualification);
 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
 			struct vmcs12 *vmcs12,
 			u32 reason, unsigned long qualification);
@@ -1326,6 +1330,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask)
 	vmcs_writel(field, vmcs_readl(field) | mask);
 }
 
+static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
+{
+	vmcs_write32(VM_ENTRY_CONTROLS, val);
+	vmx->vm_entry_controls_shadow = val;
+}
+
+static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
+{
+	if (vmx->vm_entry_controls_shadow != val)
+		vm_entry_controls_init(vmx, val);
+}
+
+static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
+{
+	return vmx->vm_entry_controls_shadow;
+}
+
+
+static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
+{
+	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
+}
+
+static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
+{
+	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
+}
+
+static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
+{
+	vmcs_write32(VM_EXIT_CONTROLS, val);
+	vmx->vm_exit_controls_shadow = val;
+}
+
+static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
+{
+	if (vmx->vm_exit_controls_shadow != val)
+		vm_exit_controls_init(vmx, val);
+}
+
+static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
+{
+	return vmx->vm_exit_controls_shadow;
+}
+
+
+static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
+{
+	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
+}
+
+static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
+{
+	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
+}
+
 static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
 {
 	vmx->segment_cache.bitmask = 0;
@@ -1410,11 +1470,11 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
-static void clear_atomic_switch_msr_special(unsigned long entry,
-		unsigned long exit)
+static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+		unsigned long entry, unsigned long exit)
 {
-	vmcs_clear_bits(VM_ENTRY_CONTROLS, entry);
-	vmcs_clear_bits(VM_EXIT_CONTROLS, exit);
+	vm_entry_controls_clearbit(vmx, entry);
+	vm_exit_controls_clearbit(vmx, exit);
 }
 
 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
@@ -1425,14 +1485,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 	switch (msr) {
 	case MSR_EFER:
 		if (cpu_has_load_ia32_efer) {
-			clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
+			clear_atomic_switch_msr_special(vmx,
+					VM_ENTRY_LOAD_IA32_EFER,
 					VM_EXIT_LOAD_IA32_EFER);
 			return;
 		}
 		break;
 	case MSR_CORE_PERF_GLOBAL_CTRL:
 		if (cpu_has_load_perf_global_ctrl) {
-			clear_atomic_switch_msr_special(
+			clear_atomic_switch_msr_special(vmx,
 					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
 					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
 			return;
@@ -1453,14 +1514,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
 }
 
-static void add_atomic_switch_msr_special(unsigned long entry,
-		unsigned long exit, unsigned long guest_val_vmcs,
-		unsigned long host_val_vmcs, u64 guest_val, u64 host_val)
+static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+		unsigned long entry, unsigned long exit,
+		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
+		u64 guest_val, u64 host_val)
 {
 	vmcs_write64(guest_val_vmcs, guest_val);
 	vmcs_write64(host_val_vmcs, host_val);
-	vmcs_set_bits(VM_ENTRY_CONTROLS, entry);
-	vmcs_set_bits(VM_EXIT_CONTROLS, exit);
+	vm_entry_controls_setbit(vmx, entry);
+	vm_exit_controls_setbit(vmx, exit);
 }
 
 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
@@ -1472,7 +1534,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
 	switch (msr) {
 	case MSR_EFER:
 		if (cpu_has_load_ia32_efer) {
-			add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
+			add_atomic_switch_msr_special(vmx,
+					VM_ENTRY_LOAD_IA32_EFER,
 					VM_EXIT_LOAD_IA32_EFER,
 					GUEST_IA32_EFER,
 					HOST_IA32_EFER,
@@ -1482,7 +1545,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
 		break;
 	case MSR_CORE_PERF_GLOBAL_CTRL:
 		if (cpu_has_load_perf_global_ctrl) {
-			add_atomic_switch_msr_special(
+			add_atomic_switch_msr_special(vmx,
 					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
 					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
 					GUEST_IA32_PERF_GLOBAL_CTRL,
@@ -1906,7 +1969,9 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
 	if (!(vmcs12->exception_bitmap & (1u << nr)))
 		return 0;
 
-	nested_vmx_vmexit(vcpu);
+	nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
+			  vmcs_read32(VM_EXIT_INTR_INFO),
+			  vmcs_readl(EXIT_QUALIFICATION));
 	return 1;
 }
 
@@ -2279,6 +2344,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
 	nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
 		VMX_MISC_SAVE_EFER_LMA;
+	nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT;
 	nested_vmx_misc_high = 0;
 }
 
@@ -2295,32 +2361,10 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
 	return low | ((u64)high << 32);
 }
 
-/*
- * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
- * also let it use VMX-specific MSRs.
- * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
- * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
- * like all other MSRs).
- */
+/* Returns 0 on success, non-0 otherwise. */
 static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 {
-	if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
-		     msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
-		/*
-		 * According to the spec, processors which do not support VMX
-		 * should throw a #GP(0) when VMX capability MSRs are read.
-		 */
-		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
-		return 1;
-	}
-
 	switch (msr_index) {
-	case MSR_IA32_FEATURE_CONTROL:
-		if (nested_vmx_allowed(vcpu)) {
-			*pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control;
-			break;
-		}
-		return 0;
 	case MSR_IA32_VMX_BASIC:
 		/*
 		 * This MSR reports some information about VMX support. We
@@ -2387,34 +2431,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		*pdata = nested_vmx_ept_caps;
 		break;
 	default:
-		return 0;
-	}
-
-	return 1;
-}
-
-static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
-	u32 msr_index = msr_info->index;
-	u64 data = msr_info->data;
-	bool host_initialized = msr_info->host_initiated;
-
-	if (!nested_vmx_allowed(vcpu))
-		return 0;
-
-	if (msr_index == MSR_IA32_FEATURE_CONTROL) {
-		if (!host_initialized &&
-				to_vmx(vcpu)->nested.msr_ia32_feature_control
-				& FEATURE_CONTROL_LOCKED)
-			return 0;
-		to_vmx(vcpu)->nested.msr_ia32_feature_control = data;
 		return 1;
 	}
 
-	/*
-	 * No need to treat VMX capability MSRs specially: If we don't handle
-	 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
-	 */
 	return 0;
 }
 
@@ -2460,13 +2479,20 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 	case MSR_IA32_SYSENTER_ESP:
 		data = vmcs_readl(GUEST_SYSENTER_ESP);
 		break;
+	case MSR_IA32_FEATURE_CONTROL:
+		if (!nested_vmx_allowed(vcpu))
+			return 1;
+		data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+		break;
+	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+		if (!nested_vmx_allowed(vcpu))
+			return 1;
+		return vmx_get_vmx_msr(vcpu, msr_index, pdata);
 	case MSR_TSC_AUX:
 		if (!to_vmx(vcpu)->rdtscp_enabled)
 			return 1;
 		/* Otherwise falls through */
 	default:
-		if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
-			return 0;
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
 			data = msr->data;
@@ -2479,6 +2505,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 	return 0;
 }
 
+static void vmx_leave_nested(struct kvm_vcpu *vcpu);
+
 /*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -2533,6 +2561,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_TSC_ADJUST:
 		ret = kvm_set_msr_common(vcpu, msr_info);
 		break;
+	case MSR_IA32_FEATURE_CONTROL:
+		if (!nested_vmx_allowed(vcpu) ||
+		    (to_vmx(vcpu)->nested.msr_ia32_feature_control &
+		     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
+			return 1;
+		vmx->nested.msr_ia32_feature_control = data;
+		if (msr_info->host_initiated && data == 0)
+			vmx_leave_nested(vcpu);
+		break;
+	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+		return 1; /* they are read-only */
 	case MSR_TSC_AUX:
 		if (!vmx->rdtscp_enabled)
 			return 1;
@@ -2541,8 +2580,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		/* Otherwise falls through */
 	default:
-		if (vmx_set_vmx_msr(vcpu, msr_info))
-			break;
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
 			msr->data = data;
@@ -3182,14 +3219,10 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	vmx_load_host_state(to_vmx(vcpu));
 	vcpu->arch.efer = efer;
 	if (efer & EFER_LMA) {
-		vmcs_write32(VM_ENTRY_CONTROLS,
-			     vmcs_read32(VM_ENTRY_CONTROLS) |
-			     VM_ENTRY_IA32E_MODE);
+		vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
 		msr->data = efer;
 	} else {
-		vmcs_write32(VM_ENTRY_CONTROLS,
-			     vmcs_read32(VM_ENTRY_CONTROLS) &
-			     ~VM_ENTRY_IA32E_MODE);
+		vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
 
 		msr->data = efer & ~EFER_LME;
 	}
@@ -3217,9 +3250,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
 
 static void exit_lmode(struct kvm_vcpu *vcpu)
 {
-	vmcs_write32(VM_ENTRY_CONTROLS,
-		     vmcs_read32(VM_ENTRY_CONTROLS)
-		     & ~VM_ENTRY_IA32E_MODE);
+	vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
 	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
 }
 
@@ -4346,10 +4377,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		++vmx->nmsrs;
 	}
 
-	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+	vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
 
 	/* 22.2.1, 20.8.1 */
-	vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
+	vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
 
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
 	set_cr4_guest_host_mask(vmx);
@@ -4360,7 +4392,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 msr;
+	struct msr_data apic_base_msr;
 
 	vmx->rmode.vm86_active = 0;
 
@@ -4368,10 +4400,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
 	kvm_set_cr8(&vmx->vcpu, 0);
-	msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+	apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
 	if (kvm_vcpu_is_bsp(&vmx->vcpu))
-		msr |= MSR_IA32_APICBASE_BSP;
-	kvm_set_apic_base(&vmx->vcpu, msr);
+		apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
+	apic_base_msr.host_initiated = true;
+	kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
 
 	vmx_segment_cache_clear(vmx);
 
@@ -4588,15 +4621,12 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	if (is_guest_mode(vcpu)) {
-		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
 		if (to_vmx(vcpu)->nested.nested_run_pending)
 			return 0;
 		if (nested_exit_on_nmi(vcpu)) {
-			nested_vmx_vmexit(vcpu);
-			vmcs12->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI;
-			vmcs12->vm_exit_intr_info = NMI_VECTOR |
-				INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK;
+			nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+					  NMI_VECTOR | INTR_TYPE_NMI_INTR |
+					  INTR_INFO_VALID_MASK, 0);
 			/*
 			 * The NMI-triggered VM exit counts as injection:
 			 * clear this one and block further NMIs.
@@ -4618,15 +4648,11 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
 	if (is_guest_mode(vcpu)) {
-		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
 		if (to_vmx(vcpu)->nested.nested_run_pending)
 			return 0;
 		if (nested_exit_on_intr(vcpu)) {
-			nested_vmx_vmexit(vcpu);
-			vmcs12->vm_exit_reason =
-				EXIT_REASON_EXTERNAL_INTERRUPT;
-			vmcs12->vm_exit_intr_info = 0;
+			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+					  0, 0);
 			/*
 			 * fall through to normal code, but now in L1, not L2
 			 */
@@ -4812,7 +4838,8 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 		dr6 = vmcs_readl(EXIT_QUALIFICATION);
 		if (!(vcpu->guest_debug &
 		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
-			vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
+			vcpu->arch.dr6 &= ~15;
+			vcpu->arch.dr6 |= dr6;
 			kvm_queue_exception(vcpu, DB_VECTOR);
 			return 1;
 		}
@@ -5080,14 +5107,27 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
 	if (exit_qualification & TYPE_MOV_FROM_DR) {
 		unsigned long val;
-		if (!kvm_get_dr(vcpu, dr, &val))
-			kvm_register_write(vcpu, reg, val);
+
+		if (kvm_get_dr(vcpu, dr, &val))
+			return 1;
+		kvm_register_write(vcpu, reg, val);
 	} else
-		kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
+		if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]))
+			return 1;
+
 	skip_emulated_instruction(vcpu);
 	return 1;
 }
 
+static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.dr6;
+}
+
+static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+}
+
 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
 {
 	vmcs_writel(GUEST_DR7, val);
@@ -6460,11 +6500,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
 	int size;
 	u8 b;
 
-	if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING))
-		return 1;
-
 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
-		return 0;
+		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
@@ -6628,6 +6665,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	u32 exit_reason = vmx->exit_reason;
 
+	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
+				vmcs_readl(EXIT_QUALIFICATION),
+				vmx->idt_vectoring_info,
+				intr_info,
+				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+				KVM_ISA_VMX);
+
 	if (vmx->nested.nested_run_pending)
 		return 0;
 
@@ -6777,7 +6821,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		return handle_invalid_guest_state(vcpu);
 
 	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
-		nested_vmx_vmexit(vcpu);
+		nested_vmx_vmexit(vcpu, exit_reason,
+				  vmcs_read32(VM_EXIT_INTR_INFO),
+				  vmcs_readl(EXIT_QUALIFICATION));
 		return 1;
 	}
 
@@ -7332,8 +7378,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	free_vpid(vmx);
-	free_nested(vmx);
 	free_loaded_vmcs(vmx->loaded_vmcs);
+	free_nested(vmx);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -7518,15 +7564,14 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
 		struct x86_exception *fault)
 {
-	struct vmcs12 *vmcs12;
-	nested_vmx_vmexit(vcpu);
-	vmcs12 = get_vmcs12(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	u32 exit_reason;
 
 	if (fault->error_code & PFERR_RSVD_MASK)
-		vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+		exit_reason = EXIT_REASON_EPT_MISCONFIG;
 	else
-		vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
-	vmcs12->exit_qualification = vcpu->arch.exit_qualification;
+		exit_reason = EXIT_REASON_EPT_VIOLATION;
+	nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification);
 	vmcs12->guest_physical_address = fault->address;
 }
 
@@ -7564,7 +7609,9 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 
 	/* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
 	if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
-		nested_vmx_vmexit(vcpu);
+		nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
+				  vmcs_read32(VM_EXIT_INTR_INFO),
+				  vmcs_readl(EXIT_QUALIFICATION));
 	else
 		kvm_inject_page_fault(vcpu, fault);
 }
@@ -7706,6 +7753,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 			else
 				vmcs_write64(APIC_ACCESS_ADDR,
 				  page_to_phys(vmx->nested.apic_access_page));
+		} else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
+			exec_control |=
+				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+			vmcs_write64(APIC_ACCESS_ADDR,
+				page_to_phys(vcpu->kvm->arch.apic_access_page));
 		}
 
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
@@ -7759,12 +7811,12 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	exit_control = vmcs_config.vmexit_ctrl;
 	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
 		exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
-	vmcs_write32(VM_EXIT_CONTROLS, exit_control);
+	vm_exit_controls_init(vmx, exit_control);
 
 	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
 	 * emulated by vmx_set_efer(), below.
 	 */
-	vmcs_write32(VM_ENTRY_CONTROLS,
+	vm_entry_controls_init(vmx, 
 		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
 			~VM_ENTRY_IA32E_MODE) |
 		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
@@ -7882,7 +7934,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return 1;
 	}
 
-	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) {
+	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
+	    vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
 	}
@@ -7994,8 +8047,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
 	enter_guest_mode(vcpu);
 
-	vmx->nested.nested_run_pending = 1;
-
 	vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
 
 	cpu = get_cpu();
@@ -8011,6 +8062,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
 	prepare_vmcs02(vcpu, vmcs12);
 
+	if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
+		return kvm_emulate_halt(vcpu);
+
+	vmx->nested.nested_run_pending = 1;
+
 	/*
 	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
 	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
@@ -8110,7 +8166,9 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
  * which already writes to vmcs12 directly.
  */
-static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+			   u32 exit_reason, u32 exit_intr_info,
+			   unsigned long exit_qualification)
 {
 	/* update guest state fields: */
 	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
@@ -8162,6 +8220,10 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
 	vmcs12->guest_pending_dbg_exceptions =
 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+		vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
+	else
+		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
 
 	if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
 	    (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
@@ -8186,7 +8248,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	vmcs12->vm_entry_controls =
 		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
-		(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
+		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
 
 	/* TODO: These cannot have changed unless we have MSR bitmaps and
 	 * the relevant bit asks not to trap the change */
@@ -8201,10 +8263,10 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	/* update exit information fields: */
 
-	vmcs12->vm_exit_reason  = to_vmx(vcpu)->exit_reason;
-	vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	vmcs12->vm_exit_reason = exit_reason;
+	vmcs12->exit_qualification = exit_qualification;
 
-	vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	vmcs12->vm_exit_intr_info = exit_intr_info;
 	if ((vmcs12->vm_exit_intr_info &
 	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
 	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
@@ -8370,7 +8432,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
  * and modify vmcs12 to make it see what it would expect to see there if
  * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
  */
-static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+			      u32 exit_intr_info,
+			      unsigned long exit_qualification)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int cpu;
@@ -8380,7 +8444,15 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 	WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
 	leave_guest_mode(vcpu);
-	prepare_vmcs12(vcpu, vmcs12);
+	prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+		       exit_qualification);
+
+	trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+				       vmcs12->exit_qualification,
+				       vmcs12->idt_vectoring_info_field,
+				       vmcs12->vm_exit_intr_info,
+				       vmcs12->vm_exit_intr_error_code,
+				       KVM_ISA_VMX);
 
 	cpu = get_cpu();
 	vmx->loaded_vmcs = &vmx->vmcs01;
@@ -8389,6 +8461,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 	vcpu->cpu = cpu;
 	put_cpu();
 
+	vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
+	vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
 	vmx_segment_cache_clear(vmx);
 
 	/* if no vmcs02 cache requested, remove the one we used */
@@ -8424,6 +8498,16 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 }
 
 /*
+ * Forcibly leave nested mode in order to be able to reset the VCPU later on.
+ */
+static void vmx_leave_nested(struct kvm_vcpu *vcpu)
+{
+	if (is_guest_mode(vcpu))
+		nested_vmx_vmexit(vcpu, -1, 0, 0);
+	free_nested(to_vmx(vcpu));
+}
+
+/*
  * L1's failure to enter L2 is a subset of a normal exit, as explained in
  * 23.7 "VM-entry failures during or after loading guest state" (this also
  * lists the acceptable exit-reason and exit-qualification parameters).
@@ -8486,6 +8570,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_idt = vmx_set_idt,
 	.get_gdt = vmx_get_gdt,
 	.set_gdt = vmx_set_gdt,
+	.get_dr6 = vmx_get_dr6,
+	.set_dr6 = vmx_set_dr6,
 	.set_dr7 = vmx_set_dr7,
 	.cache_reg = vmx_cache_reg,
 	.get_rflags = vmx_get_rflags,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5d004da1e35..39c28f09dfd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -94,6 +94,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
 static bool ignore_msrs = 0;
 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
+unsigned int min_timer_period_us = 500;
+module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+
 bool kvm_has_tsc_control;
 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
 u32  kvm_max_guest_tsc_khz;
@@ -254,10 +257,26 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
-{
-	/* TODO: reserve bits check */
-	kvm_lapic_set_base(vcpu, data);
+int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+	u64 old_state = vcpu->arch.apic_base &
+		(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
+	u64 new_state = msr_info->data &
+		(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
+	u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) |
+		0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE);
+
+	if (!msr_info->host_initiated &&
+	    ((msr_info->data & reserved_bits) != 0 ||
+	     new_state == X2APIC_ENABLE ||
+	     (new_state == MSR_IA32_APICBASE_ENABLE &&
+	      old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
+	     (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
+	      old_state == 0)))
+		return 1;
+
+	kvm_lapic_set_base(vcpu, msr_info->data);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 
@@ -719,6 +738,12 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
+static void kvm_update_dr6(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+		kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
+}
+
 static void kvm_update_dr7(struct kvm_vcpu *vcpu)
 {
 	unsigned long dr7;
@@ -747,6 +772,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 		if (val & 0xffffffff00000000ULL)
 			return -1; /* #GP */
 		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+		kvm_update_dr6(vcpu);
 		break;
 	case 5:
 		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
@@ -788,7 +814,10 @@ static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 			return 1;
 		/* fall through */
 	case 6:
-		*val = vcpu->arch.dr6;
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+			*val = vcpu->arch.dr6;
+		else
+			*val = kvm_x86_ops->get_dr6(vcpu);
 		break;
 	case 5:
 		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
@@ -836,11 +865,12 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	10
+#define KVM_SAVE_MSRS_BEGIN	12
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
 	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 	MSR_KVM_PV_EOI_EN,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
@@ -1275,8 +1305,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	kvm->arch.last_tsc_write = data;
 	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
 
-	/* Reset of TSC must disable overshoot protection below */
-	vcpu->arch.hv_clock.tsc_timestamp = 0;
 	vcpu->arch.last_guest_tsc = data;
 
 	/* Keep track of which generation this VCPU has synchronized to */
@@ -1484,7 +1512,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	unsigned long flags, this_tsc_khz;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	struct kvm_arch *ka = &v->kvm->arch;
-	s64 kernel_ns, max_kernel_ns;
+	s64 kernel_ns;
 	u64 tsc_timestamp, host_tsc;
 	struct pvclock_vcpu_time_info guest_hv_clock;
 	u8 pvclock_flags;
@@ -1543,37 +1571,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	if (!vcpu->pv_time_enabled)
 		return 0;
 
-	/*
-	 * Time as measured by the TSC may go backwards when resetting the base
-	 * tsc_timestamp.  The reason for this is that the TSC resolution is
-	 * higher than the resolution of the other clock scales.  Thus, many
-	 * possible measurments of the TSC correspond to one measurement of any
-	 * other clock, and so a spread of values is possible.  This is not a
-	 * problem for the computation of the nanosecond clock; with TSC rates
-	 * around 1GHZ, there can only be a few cycles which correspond to one
-	 * nanosecond value, and any path through this code will inevitably
-	 * take longer than that.  However, with the kernel_ns value itself,
-	 * the precision may be much lower, down to HZ granularity.  If the
-	 * first sampling of TSC against kernel_ns ends in the low part of the
-	 * range, and the second in the high end of the range, we can get:
-	 *
-	 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
-	 *
-	 * As the sampling errors potentially range in the thousands of cycles,
-	 * it is possible such a time value has already been observed by the
-	 * guest.  To protect against this, we must compute the system time as
-	 * observed by the guest and ensure the new system time is greater.
-	 */
-	max_kernel_ns = 0;
-	if (vcpu->hv_clock.tsc_timestamp) {
-		max_kernel_ns = vcpu->last_guest_tsc -
-				vcpu->hv_clock.tsc_timestamp;
-		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
-				    vcpu->hv_clock.tsc_to_system_mul,
-				    vcpu->hv_clock.tsc_shift);
-		max_kernel_ns += vcpu->last_kernel_ns;
-	}
-
 	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
 		kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
 				   &vcpu->hv_clock.tsc_shift,
@@ -1581,14 +1578,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->hw_tsc_khz = this_tsc_khz;
 	}
 
-	/* with a master <monotonic time, tsc value> tuple,
-	 * pvclock clock reads always increase at the (scaled) rate
-	 * of guest TSC - no need to deal with sampling errors.
-	 */
-	if (!use_master_clock) {
-		if (max_kernel_ns > kernel_ns)
-			kernel_ns = max_kernel_ns;
-	}
 	/* With all the info we got, fill in the values */
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
@@ -1826,6 +1815,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
 	switch (msr) {
 	case HV_X64_MSR_GUEST_OS_ID:
 	case HV_X64_MSR_HYPERCALL:
+	case HV_X64_MSR_REFERENCE_TSC:
+	case HV_X64_MSR_TIME_REF_COUNT:
 		r = true;
 		break;
 	}
@@ -1865,6 +1856,21 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		if (__copy_to_user((void __user *)addr, instructions, 4))
 			return 1;
 		kvm->arch.hv_hypercall = data;
+		mark_page_dirty(kvm, gfn);
+		break;
+	}
+	case HV_X64_MSR_REFERENCE_TSC: {
+		u64 gfn;
+		HV_REFERENCE_TSC_PAGE tsc_ref;
+		memset(&tsc_ref, 0, sizeof(tsc_ref));
+		kvm->arch.hv_tsc_page = data;
+		if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+			break;
+		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+		if (kvm_write_guest(kvm, data,
+			&tsc_ref, sizeof(tsc_ref)))
+			return 1;
+		mark_page_dirty(kvm, gfn);
 		break;
 	}
 	default:
@@ -1879,19 +1885,21 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	switch (msr) {
 	case HV_X64_MSR_APIC_ASSIST_PAGE: {
+		u64 gfn;
 		unsigned long addr;
 
 		if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
 			vcpu->arch.hv_vapic = data;
 			break;
 		}
-		addr = gfn_to_hva(vcpu->kvm, data >>
-				  HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
+		gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
+		addr = gfn_to_hva(vcpu->kvm, gfn);
 		if (kvm_is_error_hva(addr))
 			return 1;
 		if (__clear_user((void __user *)addr, PAGE_SIZE))
 			return 1;
 		vcpu->arch.hv_vapic = data;
+		mark_page_dirty(vcpu->kvm, gfn);
 		break;
 	}
 	case HV_X64_MSR_EOI:
@@ -2017,8 +2025,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case 0x200 ... 0x2ff:
 		return set_msr_mtrr(vcpu, msr, data);
 	case MSR_IA32_APICBASE:
-		kvm_set_apic_base(vcpu, data);
-		break;
+		return kvm_set_apic_base(vcpu, msr_info);
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 		return kvm_x2apic_msr_write(vcpu, msr, data);
 	case MSR_IA32_TSCDEADLINE:
@@ -2291,6 +2298,14 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case HV_X64_MSR_HYPERCALL:
 		data = kvm->arch.hv_hypercall;
 		break;
+	case HV_X64_MSR_TIME_REF_COUNT: {
+		data =
+		     div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+		break;
+	}
+	case HV_X64_MSR_REFERENCE_TSC:
+		data = kvm->arch.hv_tsc_page;
+		break;
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -2601,6 +2616,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_GET_TSC_KHZ:
 	case KVM_CAP_KVMCLOCK_CTRL:
 	case KVM_CAP_READONLY_MEM:
+	case KVM_CAP_HYPERV_TIME:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_PCI_2_3:
@@ -2972,8 +2988,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
 					     struct kvm_debugregs *dbgregs)
 {
+	unsigned long val;
+
 	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
-	dbgregs->dr6 = vcpu->arch.dr6;
+	_kvm_get_dr(vcpu, 6, &val);
+	dbgregs->dr6 = val;
 	dbgregs->dr7 = vcpu->arch.dr7;
 	dbgregs->flags = 0;
 	memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
@@ -2987,7 +3006,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 
 	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
 	vcpu->arch.dr6 = dbgregs->dr6;
+	kvm_update_dr6(vcpu);
 	vcpu->arch.dr7 = dbgregs->dr7;
+	kvm_update_dr7(vcpu);
 
 	return 0;
 }
@@ -5834,6 +5855,11 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 	kvm_apic_update_tmr(vcpu, tmr);
 }
 
+/*
+ * Returns 1 to let __vcpu_run() continue the guest execution loop without
+ * exiting to the userspace.  Otherwise, the value will be returned to the
+ * userspace.
+ */
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -6089,7 +6115,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 		}
 		if (need_resched()) {
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-			kvm_resched(vcpu);
+			cond_resched();
 			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 		}
 	}
@@ -6401,6 +6427,7 @@ EXPORT_SYMBOL_GPL(kvm_task_switch);
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
+	struct msr_data apic_base_msr;
 	int mmu_reset_needed = 0;
 	int pending_vec, max_bits, idx;
 	struct desc_ptr dt;
@@ -6424,7 +6451,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 	mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
 	kvm_x86_ops->set_efer(vcpu, sregs->efer);
-	kvm_set_apic_base(vcpu, sregs->apic_base);
+	apic_base_msr.data = sregs->apic_base;
+	apic_base_msr.host_initiated = true;
+	kvm_set_apic_base(vcpu, &apic_base_msr);
 
 	mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
 	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
@@ -6717,6 +6746,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
 	vcpu->arch.dr6 = DR6_FIXED_1;
+	kvm_update_dr6(vcpu);
 	vcpu->arch.dr7 = DR7_FIXED_1;
 	kvm_update_dr7(vcpu);
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 587fb9ede43..8da5823bcde 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -125,5 +125,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 #define KVM_SUPPORTED_XCR0	(XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
 extern u64 host_xcr0;
 
+extern unsigned int min_timer_period_us;
+
 extern struct static_key kvm_no_apic_vcpu;
 #endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index bdf8532494f..ad1fb5f5392 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -233,13 +233,13 @@ static void lguest_end_context_switch(struct task_struct *next)
  * flags word contains all kind of stuff, but in practice Linux only cares
  * about the interrupt flag.  Our "save_flags()" just returns that.
  */
-static unsigned long save_fl(void)
+asmlinkage unsigned long lguest_save_fl(void)
 {
 	return lguest_data.irq_enabled;
 }
 
 /* Interrupts go off... */
-static void irq_disable(void)
+asmlinkage void lguest_irq_disable(void)
 {
 	lguest_data.irq_enabled = 0;
 }
@@ -253,8 +253,8 @@ static void irq_disable(void)
  * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
  * C function, then restores it.
  */
-PV_CALLEE_SAVE_REGS_THUNK(save_fl);
-PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
+PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl);
+PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable);
 /*:*/
 
 /* These are in i386_head.S */
@@ -1291,9 +1291,9 @@ __init void lguest_init(void)
 	 */
 
 	/* Interrupt-related operations */
-	pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
+	pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl);
 	pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
-	pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
+	pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable);
 	pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
 	pv_irq_ops.safe_halt = lguest_safe_halt;
 
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 992d63bb154..eabcb6e6a90 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -24,7 +24,7 @@ lib-$(CONFIG_SMP) += rwlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
-obj-y += msr.o msr-reg.o msr-reg-export.o
+obj-y += msr.o msr-reg.o msr-reg-export.o hash.o
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c
new file mode 100644
index 00000000000..3056702e81f
--- /dev/null
+++ b/arch/x86/lib/hash.c
@@ -0,0 +1,88 @@
+/*
+ * Some portions derived from code covered by the following notice:
+ *
+ * Copyright (c) 2010-2013 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/hash.h>
+
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+#include <asm/hash.h>
+
+static inline u32 crc32_u32(u32 crc, u32 val)
+{
+	asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val));
+	return crc;
+}
+
+static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed)
+{
+	const u32 *p32 = (const u32 *) data;
+	u32 i, tmp = 0;
+
+	for (i = 0; i < len / 4; i++)
+		seed = crc32_u32(*p32++, seed);
+
+	switch (3 - (len & 0x03)) {
+	case 0:
+		tmp |= *((const u8 *) p32 + 2) << 16;
+		/* fallthrough */
+	case 1:
+		tmp |= *((const u8 *) p32 + 1) << 8;
+		/* fallthrough */
+	case 2:
+		tmp |= *((const u8 *) p32);
+		seed = crc32_u32(tmp, seed);
+	default:
+		break;
+	}
+
+	return seed;
+}
+
+static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
+{
+	const u32 *p32 = (const u32 *) data;
+	u32 i;
+
+	for (i = 0; i < len; i++)
+		seed = crc32_u32(*p32++, seed);
+
+	return seed;
+}
+
+void setup_arch_fast_hash(struct fast_hash_ops *ops)
+{
+	if (cpu_has_xmm4_2) {
+		ops->hash  = intel_crc4_2_hash;
+		ops->hash2 = intel_crc4_2_hash2;
+	}
+}
diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c
index 59d353d2c59..a5449089cd9 100644
--- a/arch/x86/math-emu/errors.c
+++ b/arch/x86/math-emu/errors.c
@@ -330,11 +330,6 @@ asmlinkage void FPU_exception(int n)
 
 	RE_ENTRANT_CHECK_OFF;
 	if ((~control_word & n & CW_Exceptions) || (n == EX_INTERNAL)) {
-#ifdef PRINT_MESSAGES
-		/* My message from the sponsor */
-		printk(FPU_VERSION " " __DATE__ " (C) W. Metzenthen.\n");
-#endif /* PRINT_MESSAGES */
-
 		/* Get a name string for error reporting */
 		for (i = 0; exception_names[i].type; i++)
 			if ((exception_names[i].type & n) ==
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 0596e8e0cc1..207d9aef662 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -108,8 +108,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 
 static inline void get_head_page_multiple(struct page *page, int nr)
 {
-	VM_BUG_ON(page != compound_head(page));
-	VM_BUG_ON(page_count(page) == 0);
+	VM_BUG_ON_PAGE(page != compound_head(page), page);
+	VM_BUG_ON_PAGE(page_count(page) == 0, page);
 	atomic_add(nr, &page->_count);
 	SetPageReferenced(page);
 }
@@ -135,7 +135,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 	head = pte_page(pte);
 	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
 	do {
-		VM_BUG_ON(compound_head(page) != head);
+		VM_BUG_ON_PAGE(compound_head(page) != head, page);
 		pages[*nr] = page;
 		if (PageTail(page))
 			get_huge_page_tail(page);
@@ -212,7 +212,7 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
 	head = pte_page(pte);
 	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
 	do {
-		VM_BUG_ON(compound_head(page) != head);
+		VM_BUG_ON_PAGE(compound_head(page) != head, page);
 		pages[*nr] = page;
 		if (PageTail(page))
 			get_huge_page_tail(page);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 5bdc5430597..e39504878ae 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -665,7 +665,7 @@ void __init initmem_init(void)
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
 
-	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
 	sparse_memory_present_with_active_regions(0);
 
 #ifdef CONFIG_FLATMEM
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 104d56a9245..f35c66c5959 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -643,7 +643,7 @@ kernel_physical_mapping_init(unsigned long start,
 #ifndef CONFIG_NUMA
 void __init initmem_init(void)
 {
-	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
 }
 #endif
 
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 8dabbed409e..1e9da795767 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -74,7 +74,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)
 	u64 i;
 	phys_addr_t this_start, this_end;
 
-	for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) {
+	for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
 		this_start = clamp_t(phys_addr_t, this_start, start, end);
 		this_end = clamp_t(phys_addr_t, this_end, start, end);
 		if (this_start < this_end) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index c85da7bb6b6..81b2750f366 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -491,7 +491,16 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 
 	for (i = 0; i < mi->nr_blks; i++) {
 		struct numa_memblk *mb = &mi->blk[i];
-		memblock_set_node(mb->start, mb->end - mb->start, mb->nid);
+		memblock_set_node(mb->start, mb->end - mb->start,
+				  &memblock.memory, mb->nid);
+
+		/*
+		 * At this time, all memory regions reserved by memblock are
+		 * used by the kernel. Set the nid in memblock.reserved will
+		 * mark out all the nodes the kernel resides in.
+		 */
+		memblock_set_node(mb->start, mb->end - mb->start,
+				  &memblock.reserved, mb->nid);
 	}
 
 	/*
@@ -553,6 +562,30 @@ static void __init numa_init_array(void)
 	}
 }
 
+static void __init numa_clear_kernel_node_hotplug(void)
+{
+	int i, nid;
+	nodemask_t numa_kernel_nodes;
+	unsigned long start, end;
+	struct memblock_type *type = &memblock.reserved;
+
+	/* Mark all kernel nodes. */
+	for (i = 0; i < type->cnt; i++)
+		node_set(type->regions[i].nid, numa_kernel_nodes);
+
+	/* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
+	for (i = 0; i < numa_meminfo.nr_blks; i++) {
+		nid = numa_meminfo.blk[i].nid;
+		if (!node_isset(nid, numa_kernel_nodes))
+			continue;
+
+		start = numa_meminfo.blk[i].start;
+		end = numa_meminfo.blk[i].end;
+
+		memblock_clear_hotplug(start, end - start);
+	}
+}
+
 static int __init numa_init(int (*init_func)(void))
 {
 	int i;
@@ -565,7 +598,12 @@ static int __init numa_init(int (*init_func)(void))
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
-	WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
+	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
+				  MAX_NUMNODES));
+	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
+				  MAX_NUMNODES));
+	/* In case that parsing SRAT failed. */
+	WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
 	numa_reset_distance();
 
 	ret = init_func();
@@ -601,6 +639,16 @@ static int __init numa_init(int (*init_func)(void))
 			numa_clear_node(i);
 	}
 	numa_init_array();
+
+	/*
+	 * At very early time, the kernel have to use some memory such as
+	 * loading the kernel image. We cannot prevent this anyway. So any
+	 * node the kernel resides in should be un-hotpluggable.
+	 *
+	 * And when we come here, numa_init() won't fail.
+	 */
+	numa_clear_kernel_node_hotplug();
+
 	return 0;
 }
 
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 5ecf65117e6..1953e9c9391 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -191,6 +191,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 		(unsigned long long) start, (unsigned long long) end - 1,
 		hotpluggable ? " hotplug" : "");
 
+	/* Mark hotplug range in memblock. */
+	if (hotpluggable && memblock_mark_hotplug(start, ma->length))
+		pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
+			(unsigned long long)start, (unsigned long long)end - 1);
+
 	return 0;
 out_err_bad_srat:
 	bad_srat();
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 082e8812971..248642f4bab 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -12,7 +12,6 @@
 
 #include <linux/pci.h>
 #include <linux/init.h>
-#include <linux/acpi.h>
 #include <linux/sfi_acpi.h>
 #include <linux/bitmap.h>
 #include <linux/dmi.h>
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 5c90975cdf0..43984bc1665 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -14,7 +14,6 @@
 #include <linux/rcupdate.h>
 #include <asm/e820.h>
 #include <asm/pci_x86.h>
-#include <acpi/acpi.h>
 
 /* Assume systems with more busses have correct MCFG */
 #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 5eee4959785..103e702ec5a 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -337,7 +337,7 @@ out:
 	return ret;
 }
 
-static void xen_initdom_restore_msi_irqs(struct pci_dev *dev, int irq)
+static void xen_initdom_restore_msi_irqs(struct pci_dev *dev)
 {
 	int ret = 0;
 
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index 7145ec63c52..4df9591eada 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -49,7 +49,8 @@ void __init efi_bgrt_init(void)
 
 	image = efi_lookup_mapped_addr(bgrt_tab->image_address);
 	if (!image) {
-		image = ioremap(bgrt_tab->image_address, sizeof(bmp_header));
+		image = early_memremap(bgrt_tab->image_address,
+				       sizeof(bmp_header));
 		ioremapped = true;
 		if (!image)
 			return;
@@ -57,7 +58,7 @@ void __init efi_bgrt_init(void)
 
 	memcpy_fromio(&bmp_header, image, sizeof(bmp_header));
 	if (ioremapped)
-		iounmap(image);
+		early_iounmap(image, sizeof(bmp_header));
 	bgrt_image_size = bmp_header.size;
 
 	bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL);
@@ -65,7 +66,8 @@ void __init efi_bgrt_init(void)
 		return;
 
 	if (ioremapped) {
-		image = ioremap(bgrt_tab->image_address, bmp_header.size);
+		image = early_memremap(bgrt_tab->image_address,
+				       bmp_header.size);
 		if (!image) {
 			kfree(bgrt_image);
 			bgrt_image = NULL;
@@ -75,5 +77,5 @@ void __init efi_bgrt_init(void)
 
 	memcpy_fromio(bgrt_image, image, bgrt_image_size);
 	if (ioremapped)
-		iounmap(image);
+		early_iounmap(image, bmp_header.size);
 }
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h b/arch/x86/platform/intel-mid/device_libs/platform_ipc.h
index 8f568dd7960..79bb09d4f71 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h
+++ b/arch/x86/platform/intel-mid/device_libs/platform_ipc.h
@@ -12,6 +12,7 @@
 #ifndef _PLATFORM_IPC_H_
 #define _PLATFORM_IPC_H_
 
-extern void __init ipc_device_handler(struct sfi_device_table_entry *pentry,
-			struct devs_id *dev) __attribute__((weak));
+void __init
+ipc_device_handler(struct sfi_device_table_entry *pentry, struct devs_id *dev);
+
 #endif
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.h b/arch/x86/platform/intel-mid/device_libs/platform_msic.h
index 917eb56d77d..b7be1d041da 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic.h
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic.h
@@ -14,6 +14,6 @@
 
 extern struct intel_msic_platform_data msic_pdata;
 
-extern void *msic_generic_platform_data(void *info,
-			enum intel_msic_block block) __attribute__((weak));
+void *msic_generic_platform_data(void *info, enum intel_msic_block block);
+
 #endif
diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
index a537ffc1629..46aa25c8ce0 100644
--- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
+++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
@@ -14,6 +14,6 @@
 /* For every CPU addition a new get_<cpuname>_ops interface needs
  * to be added.
  */
-extern void * __cpuinit get_penwell_ops(void) __attribute__((weak));
-extern void * __cpuinit get_cloverview_ops(void) __attribute__((weak));
-extern void * __init get_tangier_ops(void) __attribute__((weak));
+extern void *get_penwell_ops(void) __attribute__((weak));
+extern void *get_cloverview_ops(void) __attribute__((weak));
+extern void *get_tangier_ops(void) __attribute__((weak));
diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c
index 4f7884eebc1..23381d2174a 100644
--- a/arch/x86/platform/intel-mid/mfld.c
+++ b/arch/x86/platform/intel-mid/mfld.c
@@ -58,18 +58,18 @@ static unsigned long __init mfld_calibrate_tsc(void)
 	return 0;
 }
 
-static void __init penwell_arch_setup()
+static void __init penwell_arch_setup(void)
 {
 	x86_platform.calibrate_tsc = mfld_calibrate_tsc;
 	pm_power_off = mfld_power_off;
 }
 
-void * __cpuinit get_penwell_ops()
+void *get_penwell_ops(void)
 {
 	return &penwell_ops;
 }
 
-void * __cpuinit get_cloverview_ops()
+void *get_cloverview_ops(void)
 {
 	return &penwell_ops;
 }
diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfl.c
index 09d10159e7b..aaca91753d3 100644
--- a/arch/x86/platform/intel-mid/mrfl.c
+++ b/arch/x86/platform/intel-mid/mrfl.c
@@ -97,7 +97,7 @@ static struct intel_mid_ops tangier_ops = {
 	.arch_setup = tangier_arch_setup,
 };
 
-void * __cpuinit get_tangier_ops()
+void *get_tangier_ops(void)
 {
 	return &tangier_ops;
 }
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 649a12befba..08e350e757d 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -15,8 +15,7 @@
 #include <linux/power_supply.h>
 #include <linux/olpc-ec.h>
 
-#include <acpi/acpi_bus.h>
-#include <acpi/acpi_drivers.h>
+#include <linux/acpi.h>
 #include <asm/olpc.h>
 
 #define DRV_NAME			"olpc-xo15-sci"
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 8eeccba7313..be27da60dc8 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -74,7 +74,6 @@ static atomic_t	uv_in_nmi;
 static atomic_t uv_nmi_cpu = ATOMIC_INIT(-1);
 static atomic_t uv_nmi_cpus_in_nmi = ATOMIC_INIT(-1);
 static atomic_t uv_nmi_slave_continue;
-static atomic_t uv_nmi_kexec_failed;
 static cpumask_var_t uv_nmi_cpu_mask;
 
 /* Values for uv_nmi_slave_continue */
@@ -149,7 +148,8 @@ module_param_named(retry_count, uv_nmi_retry_count, int, 0644);
  *  "dump"	- dump process stack for each cpu
  *  "ips"	- dump IP info for each cpu
  *  "kdump"	- do crash dump
- *  "kdb"	- enter KDB/KGDB (default)
+ *  "kdb"	- enter KDB (default)
+ *  "kgdb"	- enter KGDB
  */
 static char uv_nmi_action[8] = "kdb";
 module_param_string(action, uv_nmi_action, sizeof(uv_nmi_action), 0644);
@@ -504,6 +504,7 @@ static void uv_nmi_touch_watchdogs(void)
 }
 
 #if defined(CONFIG_KEXEC)
+static atomic_t uv_nmi_kexec_failed;
 static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
 	/* Call crash to dump system state */
@@ -537,18 +538,45 @@ static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 }
 #endif /* !CONFIG_KEXEC */
 
+#ifdef CONFIG_KGDB
 #ifdef CONFIG_KGDB_KDB
-/* Call KDB from NMI handler */
-static void uv_call_kdb(int cpu, struct pt_regs *regs, int master)
+static inline int uv_nmi_kdb_reason(void)
 {
-	int ret;
+	return KDB_REASON_SYSTEM_NMI;
+}
+#else /* !CONFIG_KGDB_KDB */
+static inline int uv_nmi_kdb_reason(void)
+{
+	/* Insure user is expecting to attach gdb remote */
+	if (uv_nmi_action_is("kgdb"))
+		return 0;
+
+	pr_err("UV: NMI error: KDB is not enabled in this kernel\n");
+	return -1;
+}
+#endif /* CONFIG_KGDB_KDB */
 
+/*
+ * Call KGDB/KDB from NMI handler
+ *
+ * Note that if both KGDB and KDB are configured, then the action of 'kgdb' or
+ * 'kdb' has no affect on which is used.  See the KGDB documention for further
+ * information.
+ */
+static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master)
+{
 	if (master) {
+		int reason = uv_nmi_kdb_reason();
+		int ret;
+
+		if (reason < 0)
+			return;
+
 		/* call KGDB NMI handler as MASTER */
-		ret = kgdb_nmicallin(cpu, X86_TRAP_NMI, regs,
-					&uv_nmi_slave_continue);
+		ret = kgdb_nmicallin(cpu, X86_TRAP_NMI, regs, reason,
+				&uv_nmi_slave_continue);
 		if (ret) {
-			pr_alert("KDB returned error, is kgdboc set?\n");
+			pr_alert("KGDB returned error, is kgdboc set?\n");
 			atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT);
 		}
 	} else {
@@ -567,12 +595,12 @@ static void uv_call_kdb(int cpu, struct pt_regs *regs, int master)
 	uv_nmi_sync_exit(master);
 }
 
-#else /* !CONFIG_KGDB_KDB */
-static inline void uv_call_kdb(int cpu, struct pt_regs *regs, int master)
+#else /* !CONFIG_KGDB */
+static inline void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master)
 {
-	pr_err("UV: NMI error: KGDB/KDB is not enabled in this kernel\n");
+	pr_err("UV: NMI error: KGDB is not enabled in this kernel\n");
 }
-#endif /* !CONFIG_KGDB_KDB */
+#endif /* !CONFIG_KGDB */
 
 /*
  * UV NMI handler
@@ -606,9 +634,9 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
 	if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump"))
 		uv_nmi_dump_state(cpu, regs, master);
 
-	/* Call KDB if enabled */
-	else if (uv_nmi_action_is("kdb"))
-		uv_call_kdb(cpu, regs, master);
+	/* Call KGDB/KDB if enabled */
+	else if (uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb"))
+		uv_call_kgdb_kdb(cpu, regs, master);
 
 	/* Clear per_cpu "in nmi" flag */
 	atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_OUT);
@@ -634,7 +662,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
 /*
  * NMI handler for pulling in CPUs when perf events are grabbing our NMI
  */
-int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs)
+static int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs)
 {
 	int ret;
 
@@ -651,7 +679,7 @@ int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs)
 	return ret;
 }
 
-void uv_register_nmi_notifier(void)
+static void uv_register_nmi_notifier(void)
 {
 	if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv"))
 		pr_warn("UV: NMI handler failed to register\n");
@@ -695,6 +723,5 @@ void uv_nmi_setup(void)
 		uv_hub_nmi_per(cpu) = uv_hub_nmi_list[nid];
 	}
 	BUG_ON(!alloc_cpumask_var(&uv_nmi_cpu_mask, GFP_KERNEL));
+	uv_register_nmi_notifier();
 }
-
-
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 9cac82588cb..3497f14e4de 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -64,20 +64,7 @@ $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE
 
 # ---------------------------------------------------------------------------
 
-# How to compile the 16-bit code.  Note we always compile for -march=i386,
-# that way we can complain to the user if the CPU is insufficient.
-KBUILD_CFLAGS	:= $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \
-		   -I$(srctree)/arch/x86/boot \
-		   -DDISABLE_BRANCH_PROFILING \
-		   -Wall -Wstrict-prototypes \
-		   -march=i386 -mregparm=3 \
-		   -include $(srctree)/$(src)/../../boot/code16gcc.h \
-		   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-		   -mno-mmx -mno-sse \
-		   $(call cc-option, -ffreestanding) \
-		   $(call cc-option, -fno-toplevel-reorder,\
-		   $(call cc-option, -fno-unit-at-a-time)) \
-		   $(call cc-option, -fno-stack-protector) \
-		   $(call cc-option, -mpreferred-stack-boundary=2)
+KBUILD_CFLAGS	:= $(LINUXINCLUDE) $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \
+		   -I$(srctree)/arch/x86/boot
 KBUILD_AFLAGS	:= $(KBUILD_CFLAGS) -D__ASSEMBLY__
 GCOV_PROFILE := n
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 11f9285a2ff..cfbdbdb4e17 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -1025,6 +1025,29 @@ static void emit_relocs(int as_text, int use_real_mode)
 	}
 }
 
+/*
+ * As an aid to debugging problems with different linkers
+ * print summary information about the relocs.
+ * Since different linkers tend to emit the sections in
+ * different orders we use the section names in the output.
+ */
+static int do_reloc_info(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
+				const char *symname)
+{
+	printf("%s\t%s\t%s\t%s\n",
+		sec_name(sec->shdr.sh_info),
+		rel_type(ELF_R_TYPE(rel->r_info)),
+		symname,
+		sec_name(sym->st_shndx));
+	return 0;
+}
+
+static void print_reloc_info(void)
+{
+	printf("reloc section\treloc type\tsymbol\tsymbol section\n");
+	walk_relocs(do_reloc_info);
+}
+
 #if ELF_BITS == 64
 # define process process_64
 #else
@@ -1032,7 +1055,8 @@ static void emit_relocs(int as_text, int use_real_mode)
 #endif
 
 void process(FILE *fp, int use_real_mode, int as_text,
-	     int show_absolute_syms, int show_absolute_relocs)
+	     int show_absolute_syms, int show_absolute_relocs,
+	     int show_reloc_info)
 {
 	regex_init(use_real_mode);
 	read_ehdr(fp);
@@ -1050,5 +1074,9 @@ void process(FILE *fp, int use_real_mode, int as_text,
 		print_absolute_relocs();
 		return;
 	}
+	if (show_reloc_info) {
+		print_reloc_info();
+		return;
+	}
 	emit_relocs(as_text, use_real_mode);
 }
diff --git a/arch/x86/tools/relocs.h b/arch/x86/tools/relocs.h
index 07cdb1eca4f..f59590645b6 100644
--- a/arch/x86/tools/relocs.h
+++ b/arch/x86/tools/relocs.h
@@ -29,8 +29,9 @@ enum symtype {
 };
 
 void process_32(FILE *fp, int use_real_mode, int as_text,
-		int show_absolute_syms, int show_absolute_relocs);
+		int show_absolute_syms, int show_absolute_relocs,
+		int show_reloc_info);
 void process_64(FILE *fp, int use_real_mode, int as_text,
-		int show_absolute_syms, int show_absolute_relocs);
-
+		int show_absolute_syms, int show_absolute_relocs,
+		int show_reloc_info);
 #endif /* RELOCS_H */
diff --git a/arch/x86/tools/relocs_common.c b/arch/x86/tools/relocs_common.c
index 44d396823a5..acab636bcb3 100644
--- a/arch/x86/tools/relocs_common.c
+++ b/arch/x86/tools/relocs_common.c
@@ -11,12 +11,13 @@ void die(char *fmt, ...)
 
 static void usage(void)
 {
-	die("relocs [--abs-syms|--abs-relocs|--text|--realmode] vmlinux\n");
+	die("relocs [--abs-syms|--abs-relocs|--reloc-info|--text|--realmode]" \
+	    " vmlinux\n");
 }
 
 int main(int argc, char **argv)
 {
-	int show_absolute_syms, show_absolute_relocs;
+	int show_absolute_syms, show_absolute_relocs, show_reloc_info;
 	int as_text, use_real_mode;
 	const char *fname;
 	FILE *fp;
@@ -25,6 +26,7 @@ int main(int argc, char **argv)
 
 	show_absolute_syms = 0;
 	show_absolute_relocs = 0;
+	show_reloc_info = 0;
 	as_text = 0;
 	use_real_mode = 0;
 	fname = NULL;
@@ -39,6 +41,10 @@ int main(int argc, char **argv)
 				show_absolute_relocs = 1;
 				continue;
 			}
+			if (strcmp(arg, "--reloc-info") == 0) {
+				show_reloc_info = 1;
+				continue;
+			}
 			if (strcmp(arg, "--text") == 0) {
 				as_text = 1;
 				continue;
@@ -67,10 +73,12 @@ int main(int argc, char **argv)
 	rewind(fp);
 	if (e_ident[EI_CLASS] == ELFCLASS64)
 		process_64(fp, use_real_mode, as_text,
-			   show_absolute_syms, show_absolute_relocs);
+			   show_absolute_syms, show_absolute_relocs,
+			   show_reloc_info);
 	else
 		process_32(fp, use_real_mode, as_text,
-			   show_absolute_syms, show_absolute_relocs);
+			   show_absolute_syms, show_absolute_relocs,
+			   show_reloc_info);
 	fclose(fp);
 	return 0;
 }
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 1a3c7650564..01b90261fa3 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -51,3 +51,7 @@ config XEN_DEBUG_FS
 	  Enable statistics output and various tuning options in debugfs.
 	  Enabling this option may incur a significant performance overhead.
 
+config XEN_PVH
+	bool "Support for running as a PVH guest"
+	depends on X86_64 && XEN && XEN_PVHVM
+	def_bool n
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index fa6ade76ef3..a4d7b647867 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -262,8 +262,9 @@ static void __init xen_banner(void)
 	struct xen_extraversion extra;
 	HYPERVISOR_xen_version(XENVER_extraversion, &extra);
 
-	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
-	       pv_info.name);
+	pr_info("Booting paravirtualized kernel %son %s\n",
+		xen_feature(XENFEAT_auto_translated_physmap) ?
+			"with PVH extensions " : "", pv_info.name);
 	printk(KERN_INFO "Xen version: %d.%d%s%s\n",
 	       version >> 16, version & 0xffff, extra.extraversion,
 	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
@@ -433,7 +434,7 @@ static void __init xen_init_cpuid_mask(void)
 
 	ax = 1;
 	cx = 0;
-	xen_cpuid(&ax, &bx, &cx, &dx);
+	cpuid(1, &ax, &bx, &cx, &dx);
 
 	xsave_mask =
 		(1 << (X86_FEATURE_XSAVE % 32)) |
@@ -1142,8 +1143,9 @@ void xen_setup_vcpu_info_placement(void)
 		xen_vcpu_setup(cpu);
 
 	/* xen_vcpu_setup managed to place the vcpu_info within the
-	   percpu area for all cpus, so make use of it */
-	if (have_vcpu_info_placement) {
+	 * percpu area for all cpus, so make use of it. Note that for
+	 * PVH we want to use native IRQ mechanism. */
+	if (have_vcpu_info_placement && !xen_pvh_domain()) {
 		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
 		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
 		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -1407,9 +1409,49 @@ static void __init xen_boot_params_init_edd(void)
  * Set up the GDT and segment registers for -fstack-protector.  Until
  * we do this, we have to be careful not to call any stack-protected
  * function, which is most of the kernel.
+ *
+ * Note, that it is __ref because the only caller of this after init
+ * is PVH which is not going to use xen_load_gdt_boot or other
+ * __init functions.
  */
-static void __init xen_setup_stackprotector(void)
+static void __ref xen_setup_gdt(int cpu)
 {
+	if (xen_feature(XENFEAT_auto_translated_physmap)) {
+#ifdef CONFIG_X86_64
+		unsigned long dummy;
+
+		load_percpu_segment(cpu); /* We need to access per-cpu area */
+		switch_to_new_gdt(cpu); /* GDT and GS set */
+
+		/* We are switching of the Xen provided GDT to our HVM mode
+		 * GDT. The new GDT has  __KERNEL_CS with CS.L = 1
+		 * and we are jumping to reload it.
+		 */
+		asm volatile ("pushq %0\n"
+			      "leaq 1f(%%rip),%0\n"
+			      "pushq %0\n"
+			      "lretq\n"
+			      "1:\n"
+			      : "=&r" (dummy) : "0" (__KERNEL_CS));
+
+		/*
+		 * While not needed, we also set the %es, %ds, and %fs
+		 * to zero. We don't care about %ss as it is NULL.
+		 * Strictly speaking this is not needed as Xen zeros those
+		 * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE)
+		 *
+		 * Linux zeros them in cpu_init() and in secondary_startup_64
+		 * (for BSP).
+		 */
+		loadsegment(es, 0);
+		loadsegment(ds, 0);
+		loadsegment(fs, 0);
+#else
+		/* PVH: TODO Implement. */
+		BUG();
+#endif
+		return; /* PVH does not need any PV GDT ops. */
+	}
 	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
 	pv_cpu_ops.load_gdt = xen_load_gdt_boot;
 
@@ -1420,6 +1462,46 @@ static void __init xen_setup_stackprotector(void)
 	pv_cpu_ops.load_gdt = xen_load_gdt;
 }
 
+/*
+ * A PV guest starts with default flags that are not set for PVH, set them
+ * here asap.
+ */
+static void xen_pvh_set_cr_flags(int cpu)
+{
+
+	/* Some of these are setup in 'secondary_startup_64'. The others:
+	 * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests
+	 * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */
+	write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM);
+}
+
+/*
+ * Note, that it is ref - because the only caller of this after init
+ * is PVH which is not going to use xen_load_gdt_boot or other
+ * __init functions.
+ */
+void __ref xen_pvh_secondary_vcpu_init(int cpu)
+{
+	xen_setup_gdt(cpu);
+	xen_pvh_set_cr_flags(cpu);
+}
+
+static void __init xen_pvh_early_guest_init(void)
+{
+	if (!xen_feature(XENFEAT_auto_translated_physmap))
+		return;
+
+	if (!xen_feature(XENFEAT_hvm_callback_vector))
+		return;
+
+	xen_have_vector_callback = 1;
+	xen_pvh_set_cr_flags(0);
+
+#ifdef CONFIG_X86_32
+	BUG(); /* PVH: Implement proper support. */
+#endif
+}
+
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -1431,13 +1513,16 @@ asmlinkage void __init xen_start_kernel(void)
 
 	xen_domain_type = XEN_PV_DOMAIN;
 
+	xen_setup_features();
+	xen_pvh_early_guest_init();
 	xen_setup_machphys_mapping();
 
 	/* Install Xen paravirt ops */
 	pv_info = xen_info;
 	pv_init_ops = xen_init_ops;
-	pv_cpu_ops = xen_cpu_ops;
 	pv_apic_ops = xen_apic_ops;
+	if (!xen_pvh_domain())
+		pv_cpu_ops = xen_cpu_ops;
 
 	x86_init.resources.memory_setup = xen_memory_setup;
 	x86_init.oem.arch_setup = xen_arch_setup;
@@ -1469,17 +1554,14 @@ asmlinkage void __init xen_start_kernel(void)
 	/* Work out if we support NX */
 	x86_configure_nx();
 
-	xen_setup_features();
-
 	/* Get mfn list */
-	if (!xen_feature(XENFEAT_auto_translated_physmap))
-		xen_build_dynamic_phys_to_machine();
+	xen_build_dynamic_phys_to_machine();
 
 	/*
 	 * Set up kernel GDT and segment registers, mainly so that
 	 * -fstack-protector code can be executed.
 	 */
-	xen_setup_stackprotector();
+	xen_setup_gdt(0);
 
 	xen_init_irq_ops();
 	xen_init_cpuid_mask();
@@ -1548,14 +1630,18 @@ asmlinkage void __init xen_start_kernel(void)
 	/* set the limit of our address space */
 	xen_reserve_top();
 
-	/* We used to do this in xen_arch_setup, but that is too late on AMD
-	 * were early_cpu_init (run before ->arch_setup()) calls early_amd_init
-	 * which pokes 0xcf8 port.
-	 */
-	set_iopl.iopl = 1;
-	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
-	if (rc != 0)
-		xen_raw_printk("physdev_op failed %d\n", rc);
+	/* PVH: runs at default kernel iopl of 0 */
+	if (!xen_pvh_domain()) {
+		/*
+		 * We used to do this in xen_arch_setup, but that is too late
+		 * on AMD were early_cpu_init (run before ->arch_setup()) calls
+		 * early_amd_init which pokes 0xcf8 port.
+		 */
+		set_iopl.iopl = 1;
+		rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+		if (rc != 0)
+			xen_raw_printk("physdev_op failed %d\n", rc);
+	}
 
 #ifdef CONFIG_X86_32
 	/* set up basic CPUID stuff */
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 3a5f55d5190..c9858358858 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -125,3 +125,67 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
 	apply_to_page_range(&init_mm, (unsigned long)shared,
 			    PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
 }
+#ifdef CONFIG_XEN_PVH
+#include <xen/balloon.h>
+#include <xen/events.h>
+#include <xen/xen.h>
+#include <linux/slab.h>
+static int __init xlated_setup_gnttab_pages(void)
+{
+	struct page **pages;
+	xen_pfn_t *pfns;
+	int rc;
+	unsigned int i;
+	unsigned long nr_grant_frames = gnttab_max_grant_frames();
+
+	BUG_ON(nr_grant_frames == 0);
+	pages = kcalloc(nr_grant_frames, sizeof(pages[0]), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL);
+	if (!pfns) {
+		kfree(pages);
+		return -ENOMEM;
+	}
+	rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */);
+	if (rc) {
+		pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__,
+			nr_grant_frames, rc);
+		kfree(pages);
+		kfree(pfns);
+		return rc;
+	}
+	for (i = 0; i < nr_grant_frames; i++)
+		pfns[i] = page_to_pfn(pages[i]);
+
+	rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames,
+				    &xen_auto_xlat_grant_frames.vaddr);
+
+	if (rc) {
+		pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__,
+			nr_grant_frames, rc);
+		free_xenballooned_pages(nr_grant_frames, pages);
+		kfree(pages);
+		kfree(pfns);
+		return rc;
+	}
+	kfree(pages);
+
+	xen_auto_xlat_grant_frames.pfn = pfns;
+	xen_auto_xlat_grant_frames.count = nr_grant_frames;
+
+	return 0;
+}
+
+static int __init xen_pvh_gnttab_setup(void)
+{
+	if (!xen_pvh_domain())
+		return -ENODEV;
+
+	return xlated_setup_gnttab_pages();
+}
+/* Call it _before_ __gnttab_init as we need to initialize the
+ * xen_auto_xlat_grant_frames first. */
+core_initcall(xen_pvh_gnttab_setup);
+#endif
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 0da7f863056..08f763de26f 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -5,6 +5,7 @@
 #include <xen/interface/xen.h>
 #include <xen/interface/sched.h>
 #include <xen/interface/vcpu.h>
+#include <xen/features.h>
 #include <xen/events.h>
 
 #include <asm/xen/hypercall.h>
@@ -22,7 +23,7 @@ void xen_force_evtchn_callback(void)
 	(void)HYPERVISOR_xen_version(0, NULL);
 }
 
-static unsigned long xen_save_fl(void)
+asmlinkage unsigned long xen_save_fl(void)
 {
 	struct vcpu_info *vcpu;
 	unsigned long flags;
@@ -40,7 +41,7 @@ static unsigned long xen_save_fl(void)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
 
-static void xen_restore_fl(unsigned long flags)
+__visible void xen_restore_fl(unsigned long flags)
 {
 	struct vcpu_info *vcpu;
 
@@ -62,7 +63,7 @@ static void xen_restore_fl(unsigned long flags)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
 
-static void xen_irq_disable(void)
+asmlinkage void xen_irq_disable(void)
 {
 	/* There's a one instruction preempt window here.  We need to
 	   make sure we're don't switch CPUs between getting the vcpu
@@ -73,7 +74,7 @@ static void xen_irq_disable(void)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
 
-static void xen_irq_enable(void)
+asmlinkage void xen_irq_enable(void)
 {
 	struct vcpu_info *vcpu;
 
@@ -128,6 +129,8 @@ static const struct pv_irq_ops xen_irq_ops __initconst = {
 
 void __init xen_init_irq_ops(void)
 {
-	pv_irq_ops = xen_irq_ops;
+	/* For PVH we use default pv_irq_ops settings. */
+	if (!xen_feature(XENFEAT_hvm_callback_vector))
+		pv_irq_ops = xen_irq_ops;
 	x86_init.irqs.intr_init = xen_init_IRQ;
 }
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ce563be09cc..2423ef04ffe 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -431,7 +431,7 @@ static pteval_t iomap_pte(pteval_t val)
 	return val;
 }
 
-static pteval_t xen_pte_val(pte_t pte)
+__visible pteval_t xen_pte_val(pte_t pte)
 {
 	pteval_t pteval = pte.pte;
 #if 0
@@ -448,7 +448,7 @@ static pteval_t xen_pte_val(pte_t pte)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 
-static pgdval_t xen_pgd_val(pgd_t pgd)
+__visible pgdval_t xen_pgd_val(pgd_t pgd)
 {
 	return pte_mfn_to_pfn(pgd.pgd);
 }
@@ -479,7 +479,7 @@ void xen_set_pat(u64 pat)
 	WARN_ON(pat != 0x0007010600070106ull);
 }
 
-static pte_t xen_make_pte(pteval_t pte)
+__visible pte_t xen_make_pte(pteval_t pte)
 {
 	phys_addr_t addr = (pte & PTE_PFN_MASK);
 #if 0
@@ -514,14 +514,14 @@ static pte_t xen_make_pte(pteval_t pte)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 
-static pgd_t xen_make_pgd(pgdval_t pgd)
+__visible pgd_t xen_make_pgd(pgdval_t pgd)
 {
 	pgd = pte_pfn_to_mfn(pgd);
 	return native_make_pgd(pgd);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 
-static pmdval_t xen_pmd_val(pmd_t pmd)
+__visible pmdval_t xen_pmd_val(pmd_t pmd)
 {
 	return pte_mfn_to_pfn(pmd.pmd);
 }
@@ -580,7 +580,7 @@ static void xen_pmd_clear(pmd_t *pmdp)
 }
 #endif	/* CONFIG_X86_PAE */
 
-static pmd_t xen_make_pmd(pmdval_t pmd)
+__visible pmd_t xen_make_pmd(pmdval_t pmd)
 {
 	pmd = pte_pfn_to_mfn(pmd);
 	return native_make_pmd(pmd);
@@ -588,13 +588,13 @@ static pmd_t xen_make_pmd(pmdval_t pmd)
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 
 #if PAGETABLE_LEVELS == 4
-static pudval_t xen_pud_val(pud_t pud)
+__visible pudval_t xen_pud_val(pud_t pud)
 {
 	return pte_mfn_to_pfn(pud.pud);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 
-static pud_t xen_make_pud(pudval_t pud)
+__visible pud_t xen_make_pud(pudval_t pud)
 {
 	pud = pte_pfn_to_mfn(pud);
 
@@ -1198,44 +1198,40 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
 	 * instead of somewhere later and be confusing. */
 	xen_mc_flush();
 }
-#endif
-static void __init xen_pagetable_init(void)
+static void __init xen_pagetable_p2m_copy(void)
 {
-#ifdef CONFIG_X86_64
 	unsigned long size;
 	unsigned long addr;
-#endif
-	paging_init();
-	xen_setup_shared_info();
-#ifdef CONFIG_X86_64
-	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-		unsigned long new_mfn_list;
-
-		size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
-
-		/* On 32-bit, we get zero so this never gets executed. */
-		new_mfn_list = xen_revector_p2m_tree();
-		if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) {
-			/* using __ka address and sticking INVALID_P2M_ENTRY! */
-			memset((void *)xen_start_info->mfn_list, 0xff, size);
-
-			/* We should be in __ka space. */
-			BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
-			addr = xen_start_info->mfn_list;
-			/* We roundup to the PMD, which means that if anybody at this stage is
-			 * using the __ka address of xen_start_info or xen_start_info->shared_info
-			 * they are in going to crash. Fortunatly we have already revectored
-			 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
-			size = roundup(size, PMD_SIZE);
-			xen_cleanhighmap(addr, addr + size);
-
-			size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
-			memblock_free(__pa(xen_start_info->mfn_list), size);
-			/* And revector! Bye bye old array */
-			xen_start_info->mfn_list = new_mfn_list;
-		} else
-			goto skip;
-	}
+	unsigned long new_mfn_list;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return;
+
+	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+
+	new_mfn_list = xen_revector_p2m_tree();
+	/* No memory or already called. */
+	if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list)
+		return;
+
+	/* using __ka address and sticking INVALID_P2M_ENTRY! */
+	memset((void *)xen_start_info->mfn_list, 0xff, size);
+
+	/* We should be in __ka space. */
+	BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
+	addr = xen_start_info->mfn_list;
+	/* We roundup to the PMD, which means that if anybody at this stage is
+	 * using the __ka address of xen_start_info or xen_start_info->shared_info
+	 * they are in going to crash. Fortunatly we have already revectored
+	 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
+	size = roundup(size, PMD_SIZE);
+	xen_cleanhighmap(addr, addr + size);
+
+	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+	memblock_free(__pa(xen_start_info->mfn_list), size);
+	/* And revector! Bye bye old array */
+	xen_start_info->mfn_list = new_mfn_list;
+
 	/* At this stage, cleanup_highmap has already cleaned __ka space
 	 * from _brk_limit way up to the max_pfn_mapped (which is the end of
 	 * the ramdisk). We continue on, erasing PMD entries that point to page
@@ -1255,7 +1251,15 @@ static void __init xen_pagetable_init(void)
 	 * anything at this stage. */
 	xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
 #endif
-skip:
+}
+#endif
+
+static void __init xen_pagetable_init(void)
+{
+	paging_init();
+	xen_setup_shared_info();
+#ifdef CONFIG_X86_64
+	xen_pagetable_p2m_copy();
 #endif
 	xen_post_allocator_init();
 }
@@ -1753,6 +1757,10 @@ static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
 	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
 	pte_t pte = pfn_pte(pfn, prot);
 
+	/* For PVH no need to set R/O or R/W to pin them or unpin them. */
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return;
+
 	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
 		BUG();
 }
@@ -1863,6 +1871,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
  * but that's enough to get __va working.  We need to fill in the rest
  * of the physical mapping once some sort of allocator has been set
  * up.
+ * NOTE: for PVH, the page tables are native.
  */
 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 {
@@ -1884,17 +1893,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 	/* Zap identity mapping */
 	init_level4_pgt[0] = __pgd(0);
 
-	/* Pre-constructed entries are in pfn, so convert to mfn */
-	/* L4[272] -> level3_ident_pgt
-	 * L4[511] -> level3_kernel_pgt */
-	convert_pfn_mfn(init_level4_pgt);
-
-	/* L3_i[0] -> level2_ident_pgt */
-	convert_pfn_mfn(level3_ident_pgt);
-	/* L3_k[510] -> level2_kernel_pgt
-	 * L3_i[511] -> level2_fixmap_pgt */
-	convert_pfn_mfn(level3_kernel_pgt);
-
+	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+		/* Pre-constructed entries are in pfn, so convert to mfn */
+		/* L4[272] -> level3_ident_pgt
+		 * L4[511] -> level3_kernel_pgt */
+		convert_pfn_mfn(init_level4_pgt);
+
+		/* L3_i[0] -> level2_ident_pgt */
+		convert_pfn_mfn(level3_ident_pgt);
+		/* L3_k[510] -> level2_kernel_pgt
+		 * L3_i[511] -> level2_fixmap_pgt */
+		convert_pfn_mfn(level3_kernel_pgt);
+	}
 	/* We get [511][511] and have Xen's version of level2_kernel_pgt */
 	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
 	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
@@ -1918,31 +1928,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 	copy_page(level2_fixmap_pgt, l2);
 	/* Note that we don't do anything with level1_fixmap_pgt which
 	 * we don't need. */
+	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+		/* Make pagetable pieces RO */
+		set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+		set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+		set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+		set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+		set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
+		set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+		set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+		/* Pin down new L4 */
+		pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+				  PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+		/* Unpin Xen-provided one */
+		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
-	/* Make pagetable pieces RO */
-	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
-	set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
-
-	/* Pin down new L4 */
-	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
-			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
-
-	/* Unpin Xen-provided one */
-	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-
-	/*
-	 * At this stage there can be no user pgd, and no page
-	 * structure to attach it to, so make sure we just set kernel
-	 * pgd.
-	 */
-	xen_mc_batch();
-	__xen_write_cr3(true, __pa(init_level4_pgt));
-	xen_mc_issue(PARAVIRT_LAZY_CPU);
+		/*
+		 * At this stage there can be no user pgd, and no page
+		 * structure to attach it to, so make sure we just set kernel
+		 * pgd.
+		 */
+		xen_mc_batch();
+		__xen_write_cr3(true, __pa(init_level4_pgt));
+		xen_mc_issue(PARAVIRT_LAZY_CPU);
+	} else
+		native_write_cr3(__pa(init_level4_pgt));
 
 	/* We can't that easily rip out L3 and L2, as the Xen pagetables are
 	 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ...  for
@@ -2103,6 +2115,9 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 
 static void __init xen_post_allocator_init(void)
 {
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return;
+
 	pv_mmu_ops.set_pte = xen_set_pte;
 	pv_mmu_ops.set_pmd = xen_set_pmd;
 	pv_mmu_ops.set_pud = xen_set_pud;
@@ -2207,6 +2222,15 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
 void __init xen_init_mmu_ops(void)
 {
 	x86_init.paging.pagetable_init = xen_pagetable_init;
+
+	/* Optimization - we can use the HVM one but it has no idea which
+	 * VCPUs are descheduled - which means that it will needlessly IPI
+	 * them. Xen knows so let it do the job.
+	 */
+	if (xen_feature(XENFEAT_auto_translated_physmap)) {
+		pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
+		return;
+	}
 	pv_mmu_ops = xen_mmu_ops;
 
 	memset(dummy_mapping, 0xff, PAGE_SIZE);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 2ae8699e876..8009acbe41e 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -280,6 +280,9 @@ void __ref xen_build_mfn_list_list(void)
 {
 	unsigned long pfn;
 
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return;
+
 	/* Pre-initialize p2m_top_mfn to be completely missing */
 	if (p2m_top_mfn == NULL) {
 		p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
@@ -336,6 +339,9 @@ void __ref xen_build_mfn_list_list(void)
 
 void xen_setup_mfn_list_list(void)
 {
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return;
+
 	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
 	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
@@ -346,10 +352,15 @@ void xen_setup_mfn_list_list(void)
 /* Set up p2m_top to point to the domain-builder provided p2m pages */
 void __init xen_build_dynamic_phys_to_machine(void)
 {
-	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
-	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
+	unsigned long *mfn_list;
+	unsigned long max_pfn;
 	unsigned long pfn;
 
+	 if (xen_feature(XENFEAT_auto_translated_physmap))
+		return;
+
+	mfn_list = (unsigned long *)xen_start_info->mfn_list;
+	max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
 	xen_max_p2m_pfn = max_pfn;
 
 	p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
@@ -888,13 +899,6 @@ int m2p_add_override(unsigned long mfn, struct page *page,
 					"m2p_add_override: pfn %lx not mapped", pfn))
 			return -EINVAL;
 	}
-	WARN_ON(PagePrivate(page));
-	SetPagePrivate(page);
-	set_page_private(page, mfn);
-	page->index = pfn_to_mfn(pfn);
-
-	if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
-		return -ENOMEM;
 
 	if (kmap_op != NULL) {
 		if (!PageHighMem(page)) {
@@ -933,19 +937,16 @@ int m2p_add_override(unsigned long mfn, struct page *page,
 }
 EXPORT_SYMBOL_GPL(m2p_add_override);
 int m2p_remove_override(struct page *page,
-		struct gnttab_map_grant_ref *kmap_op)
+			struct gnttab_map_grant_ref *kmap_op,
+			unsigned long mfn)
 {
 	unsigned long flags;
-	unsigned long mfn;
 	unsigned long pfn;
 	unsigned long uninitialized_var(address);
 	unsigned level;
 	pte_t *ptep = NULL;
 
 	pfn = page_to_pfn(page);
-	mfn = get_phys_to_machine(pfn);
-	if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
-		return -EINVAL;
 
 	if (!PageHighMem(page)) {
 		address = (unsigned long)__va(pfn << PAGE_SHIFT);
@@ -959,10 +960,7 @@ int m2p_remove_override(struct page *page,
 	spin_lock_irqsave(&m2p_override_lock, flags);
 	list_del(&page->lru);
 	spin_unlock_irqrestore(&m2p_override_lock, flags);
-	WARN_ON(!PagePrivate(page));
-	ClearPagePrivate(page);
 
-	set_phys_to_machine(pfn, page->index);
 	if (kmap_op != NULL) {
 		if (!PageHighMem(page)) {
 			struct multicall_space mcs;
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 0a7852483ff..a8261716d58 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -30,10 +30,9 @@
 #define XEN_PLATFORM_ERR_PROTOCOL -2
 #define XEN_PLATFORM_ERR_BLACKLIST -3
 
-/* store the value of xen_emul_unplug after the unplug is done */
-int xen_platform_pci_unplug;
-EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
 #ifdef CONFIG_XEN_PVHVM
+/* store the value of xen_emul_unplug after the unplug is done */
+static int xen_platform_pci_unplug;
 static int xen_emul_unplug;
 
 static int check_platform_magic(void)
@@ -69,6 +68,80 @@ static int check_platform_magic(void)
 	return 0;
 }
 
+bool xen_has_pv_devices()
+{
+	if (!xen_domain())
+		return false;
+
+	/* PV domains always have them. */
+	if (xen_pv_domain())
+		return true;
+
+	/* And user has xen_platform_pci=0 set in guest config as
+	 * driver did not modify the value. */
+	if (xen_platform_pci_unplug == 0)
+		return false;
+
+	if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER)
+		return false;
+
+	if (xen_platform_pci_unplug & XEN_UNPLUG_ALL)
+		return true;
+
+	/* This is an odd one - we are going to run legacy
+	 * and PV drivers at the same time. */
+	if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
+		return true;
+
+	/* And the caller has to follow with xen_pv_{disk,nic}_devices
+	 * to be certain which driver can load. */
+	return false;
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_devices);
+
+static bool __xen_has_pv_device(int state)
+{
+	/* HVM domains might or might not */
+	if (xen_hvm_domain() && (xen_platform_pci_unplug & state))
+		return true;
+
+	return xen_has_pv_devices();
+}
+
+bool xen_has_pv_nic_devices(void)
+{
+	return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL);
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices);
+
+bool xen_has_pv_disk_devices(void)
+{
+	return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS |
+				   XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL);
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices);
+
+/*
+ * This one is odd - it determines whether you want to run PV _and_
+ * legacy (IDE) drivers together. This combination is only possible
+ * under HVM.
+ */
+bool xen_has_pv_and_legacy_disk_devices(void)
+{
+	if (!xen_domain())
+		return false;
+
+	/* N.B. This is only ever used in HVM mode */
+	if (xen_pv_domain())
+		return false;
+
+	if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices);
+
 void xen_unplug_emulated_devices(void)
 {
 	int r;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 68c054f59de..0982233b9b8 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -27,6 +27,7 @@
 #include <xen/interface/memory.h>
 #include <xen/interface/physdev.h>
 #include <xen/features.h>
+#include "mmu.h"
 #include "xen-ops.h"
 #include "vdso.h"
 
@@ -34,7 +35,7 @@
 extern const char xen_hypervisor_callback[];
 extern const char xen_failsafe_callback[];
 #ifdef CONFIG_X86_64
-extern const char nmi[];
+extern asmlinkage void nmi(void);
 #endif
 extern void xen_sysenter_target(void);
 extern void xen_syscall_target(void);
@@ -81,6 +82,9 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
 
 	memblock_reserve(start, size);
 
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return;
+
 	xen_max_p2m_pfn = PFN_DOWN(start + size);
 	for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
 		unsigned long mfn = pfn_to_mfn(pfn);
@@ -103,6 +107,7 @@ static unsigned long __init xen_do_chunk(unsigned long start,
 		.domid        = DOMID_SELF
 	};
 	unsigned long len = 0;
+	int xlated_phys = xen_feature(XENFEAT_auto_translated_physmap);
 	unsigned long pfn;
 	int ret;
 
@@ -116,7 +121,7 @@ static unsigned long __init xen_do_chunk(unsigned long start,
 				continue;
 			frame = mfn;
 		} else {
-			if (mfn != INVALID_P2M_ENTRY)
+			if (!xlated_phys && mfn != INVALID_P2M_ENTRY)
 				continue;
 			frame = pfn;
 		}
@@ -154,6 +159,13 @@ static unsigned long __init xen_do_chunk(unsigned long start,
 static unsigned long __init xen_release_chunk(unsigned long start,
 					      unsigned long end)
 {
+	/*
+	 * Xen already ballooned out the E820 non RAM regions for us
+	 * and set them up properly in EPT.
+	 */
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return end - start;
+
 	return xen_do_chunk(start, end, true);
 }
 
@@ -222,7 +234,13 @@ static void __init xen_set_identity_and_release_chunk(
 	 * (except for the ISA region which must be 1:1 mapped) to
 	 * release the refcounts (in Xen) on the original frames.
 	 */
-	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
+
+	/*
+	 * PVH E820 matches the hypervisor's P2M which means we need to
+	 * account for the proper values of *release and *identity.
+	 */
+	for (pfn = start_pfn; !xen_feature(XENFEAT_auto_translated_physmap) &&
+	     pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
 		pte_t pte = __pte_ma(0);
 
 		if (pfn < PFN_UP(ISA_END_ADDRESS))
@@ -559,20 +577,17 @@ void xen_enable_syscall(void)
 void xen_enable_nmi(void)
 {
 #ifdef CONFIG_X86_64
-	if (register_callback(CALLBACKTYPE_nmi, nmi))
+	if (register_callback(CALLBACKTYPE_nmi, (char *)nmi))
 		BUG();
 #endif
 }
-void __init xen_arch_setup(void)
+void __init xen_pvmmu_arch_setup(void)
 {
-	xen_panic_handler_init();
-
 	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
 	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
 
-	if (!xen_feature(XENFEAT_auto_translated_physmap))
-		HYPERVISOR_vm_assist(VMASST_CMD_enable,
-				     VMASST_TYPE_pae_extended_cr3);
+	HYPERVISOR_vm_assist(VMASST_CMD_enable,
+			     VMASST_TYPE_pae_extended_cr3);
 
 	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
 	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
@@ -581,6 +596,15 @@ void __init xen_arch_setup(void)
 	xen_enable_sysenter();
 	xen_enable_syscall();
 	xen_enable_nmi();
+}
+
+/* This function is not called for HVM domains */
+void __init xen_arch_setup(void)
+{
+	xen_panic_handler_init();
+	if (!xen_feature(XENFEAT_auto_translated_physmap))
+		xen_pvmmu_arch_setup();
+
 #ifdef CONFIG_ACPI
 	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
 		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c36b325abd8..a18eadd8bb4 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -73,9 +73,11 @@ static void cpu_bringup(void)
 	touch_softlockup_watchdog();
 	preempt_disable();
 
-	xen_enable_sysenter();
-	xen_enable_syscall();
-
+	/* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
+	if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
+		xen_enable_sysenter();
+		xen_enable_syscall();
+	}
 	cpu = smp_processor_id();
 	smp_store_cpu_info(cpu);
 	cpu_data(cpu).x86_max_cores = 1;
@@ -97,8 +99,14 @@ static void cpu_bringup(void)
 	wmb();			/* make sure everything is out */
 }
 
-static void cpu_bringup_and_idle(void)
+/* Note: cpu parameter is only relevant for PVH */
+static void cpu_bringup_and_idle(int cpu)
 {
+#ifdef CONFIG_X86_64
+	if (xen_feature(XENFEAT_auto_translated_physmap) &&
+	    xen_feature(XENFEAT_supervisor_mode_kernel))
+		xen_pvh_secondary_vcpu_init(cpu);
+#endif
 	cpu_bringup();
 	cpu_startup_entry(CPUHP_ONLINE);
 }
@@ -274,9 +282,10 @@ static void __init xen_smp_prepare_boot_cpu(void)
 	native_smp_prepare_boot_cpu();
 
 	if (xen_pv_domain()) {
-		/* We've switched to the "real" per-cpu gdt, so make sure the
-		   old memory can be recycled */
-		make_lowmem_page_readwrite(xen_initial_gdt);
+		if (!xen_feature(XENFEAT_writable_page_tables))
+			/* We've switched to the "real" per-cpu gdt, so make
+			 * sure the old memory can be recycled. */
+			make_lowmem_page_readwrite(xen_initial_gdt);
 
 #ifdef CONFIG_X86_32
 		/*
@@ -360,22 +369,21 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 
 	gdt = get_cpu_gdt_table(cpu);
 
-	ctxt->flags = VGCF_IN_KERNEL;
-	ctxt->user_regs.ss = __KERNEL_DS;
 #ifdef CONFIG_X86_32
+	/* Note: PVH is not yet supported on x86_32. */
 	ctxt->user_regs.fs = __KERNEL_PERCPU;
 	ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
-#else
-	ctxt->gs_base_kernel = per_cpu_offset(cpu);
 #endif
 	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
 
 	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
 
-	{
+	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+		ctxt->flags = VGCF_IN_KERNEL;
 		ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
 		ctxt->user_regs.ds = __USER_DS;
 		ctxt->user_regs.es = __USER_DS;
+		ctxt->user_regs.ss = __KERNEL_DS;
 
 		xen_copy_trap_info(ctxt->trap_ctxt);
 
@@ -396,18 +404,27 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 #ifdef CONFIG_X86_32
 		ctxt->event_callback_cs     = __KERNEL_CS;
 		ctxt->failsafe_callback_cs  = __KERNEL_CS;
+#else
+		ctxt->gs_base_kernel = per_cpu_offset(cpu);
 #endif
 		ctxt->event_callback_eip    =
 					(unsigned long)xen_hypervisor_callback;
 		ctxt->failsafe_callback_eip =
 					(unsigned long)xen_failsafe_callback;
+		ctxt->user_regs.cs = __KERNEL_CS;
+		per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+#ifdef CONFIG_X86_32
 	}
-	ctxt->user_regs.cs = __KERNEL_CS;
+#else
+	} else
+		/* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with
+		 * %rdi having the cpu number - which means are passing in
+		 * as the first parameter the cpu. Subtle!
+		 */
+		ctxt->user_regs.rdi = cpu;
+#endif
 	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
-
-	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
 	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
-
 	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
 		BUG();
 
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 0e36cde12f7..581521c843a 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -106,7 +106,7 @@ static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
 static cpumask_t waiting_cpus;
 
 static bool xen_pvspin = true;
-static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
+__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 {
 	int irq = __this_cpu_read(lock_kicker_irq);
 	struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting);
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 12a1ca707b9..7b78f88c170 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -446,6 +446,7 @@ void xen_setup_timer(int cpu)
 				      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
 				      IRQF_FORCE_RESUME,
 				      name, NULL);
+	(void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
 
 	memcpy(evt, xen_clockevent, sizeof(*evt));
 
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7faed5869e5..485b6958554 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -11,8 +11,28 @@
 #include <asm/page_types.h>
 
 #include <xen/interface/elfnote.h>
+#include <xen/interface/features.h>
 #include <asm/xen/interface.h>
 
+#ifdef CONFIG_XEN_PVH
+#define PVH_FEATURES_STR  "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel"
+/* Note the lack of 'hvm_callback_vector'. Older hypervisor will
+ * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in
+ * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore.
+ */
+#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \
+		      (1 << XENFEAT_auto_translated_physmap) | \
+		      (1 << XENFEAT_supervisor_mode_kernel) | \
+		      (1 << XENFEAT_hvm_callback_vector))
+/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that
+ * up regardless whether this CONFIG option is enabled or not, but it
+ * clarifies what the right flags need to be.
+ */
+#else
+#define PVH_FEATURES_STR  ""
+#define PVH_FEATURES (0)
+#endif
+
 	__INIT
 ENTRY(startup_xen)
 	cld
@@ -95,7 +115,10 @@ NEXT_HYPERCALL(arch_6)
 #endif
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)
 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
-	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb")
+	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR)
+	ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) |
+						(1 << XENFEAT_writable_page_tables) |
+						(1 << XENFEAT_dom0))
 	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 95f8c614232..1cb6f4c3730 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -123,4 +123,5 @@ __visible void xen_adjust_exception_frame(void);
 
 extern int xen_panic_handler_init(void);
 
+void xen_pvh_secondary_vcpu_init(int cpu);
 #endif /* XEN_OPS_H */
author	H. Peter Anvin <hpa@linux.intel.com>	2014-02-07 11:27:30 -0800
committer	H. Peter Anvin <hpa@linux.intel.com>	2014-02-07 11:27:30 -0800
commit	a3b072cd180c12e8fe0ece9487b9065808327640 (patch)
tree	62b982041be84748852d77cdf6ca5639ef40858f /arch/x86
parent	75a1ba5b2c529db60ca49626bcaf0bddf4548438 (diff)
parent	081cd62a010f97b5bc1d2b0cd123c5abc692b68a (diff)