13 files changed, 5548 insertions, 0 deletions
diff --git a/drivers/misc/sgi-gru/Makefile b/drivers/misc/sgi-gru/Makefile
new file mode 100644
index 00000000000..d03597a521b
--- /dev/null
+++ b/drivers/misc/sgi-gru/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_SGI_GRU) := gru.o
+gru-y := grufile.o grumain.o grufault.o grutlbpurge.o gruprocfs.o grukservices.o
+
diff --git a/drivers/misc/sgi-gru/gru.h b/drivers/misc/sgi-gru/gru.h
new file mode 100644
index 00000000000..40df7cb3f0a
--- /dev/null
+++ b/drivers/misc/sgi-gru/gru.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation; either version 2.1 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef __GRU_H__
+#define __GRU_H__
+
+/*
+ * GRU architectural definitions
+ */
+#define GRU_CACHE_LINE_BYTES		64
+#define GRU_HANDLE_STRIDE		256
+#define GRU_CB_BASE			0
+#define GRU_DS_BASE			0x20000
+
+/*
+ * Size used to map GRU GSeg
+ */
+#if defined CONFIG_IA64
+#define GRU_GSEG_PAGESIZE	(256 * 1024UL)
+#elif defined CONFIG_X86_64
+#define GRU_GSEG_PAGESIZE	(256 * 1024UL)		/* ZZZ 2MB ??? */
+#else
+#error "Unsupported architecture"
+#endif
+
+/*
+ * Structure for obtaining GRU resource information
+ */
+struct gru_chiplet_info {
+	int	node;
+	int	chiplet;
+	int	blade;
+	int	total_dsr_bytes;
+	int	total_cbr;
+	int	total_user_dsr_bytes;
+	int	total_user_cbr;
+	int	free_user_dsr_bytes;
+	int	free_user_cbr;
+};
+
+/* Flags for GRU options on the gru_create_context() call */
+/* Select one of the follow 4 options to specify how TLB misses are handled */
+#define GRU_OPT_MISS_DEFAULT	0x0000	/* Use default mode */
+#define GRU_OPT_MISS_USER_POLL	0x0001	/* User will poll CB for faults */
+#define GRU_OPT_MISS_FMM_INTR	0x0002	/* Send interrupt to cpu to
+					   handle fault */
+#define GRU_OPT_MISS_FMM_POLL	0x0003	/* Use system polling thread */
+#define GRU_OPT_MISS_MASK	0x0003	/* Mask for TLB MISS option */
+
+
+
+#endif		/* __GRU_H__ */
diff --git a/drivers/misc/sgi-gru/gru_instructions.h b/drivers/misc/sgi-gru/gru_instructions.h
new file mode 100644
index 00000000000..0dc36225c7c
--- /dev/null
+++ b/drivers/misc/sgi-gru/gru_instructions.h
@@ -0,0 +1,669 @@
+/*
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation; either version 2.1 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef __GRU_INSTRUCTIONS_H__
+#define __GRU_INSTRUCTIONS_H__
+
+#define gru_flush_cache_hook(p)
+#define gru_emulator_wait_hook(p, w)
+
+/*
+ * Architecture dependent functions
+ */
+
+#if defined CONFIG_IA64
+#include <linux/compiler.h>
+#include <asm/intrinsics.h>
+#define __flush_cache(p)		ia64_fc(p)
+/* Use volatile on IA64 to ensure ordering via st4.rel */
+#define gru_ordered_store_int(p,v)					\
+		do {							\
+			barrier();					\
+			*((volatile int *)(p)) = v; /* force st.rel */	\
+		} while (0)
+#elif defined CONFIG_X86_64
+#define __flush_cache(p)		clflush(p)
+#define gru_ordered_store_int(p,v)					\
+		do {							\
+			barrier();					\
+			*(int *)p = v;					\
+		} while (0)
+#else
+#error "Unsupported architecture"
+#endif
+
+/*
+ * Control block status and exception codes
+ */
+#define CBS_IDLE			0
+#define CBS_EXCEPTION			1
+#define CBS_ACTIVE			2
+#define CBS_CALL_OS			3
+
+/* CB substatus bitmasks */
+#define CBSS_MSG_QUEUE_MASK		7
+#define CBSS_IMPLICIT_ABORT_ACTIVE_MASK	8
+
+/* CB substatus message queue values (low 3 bits of substatus) */
+#define CBSS_NO_ERROR			0
+#define CBSS_LB_OVERFLOWED		1
+#define CBSS_QLIMIT_REACHED		2
+#define CBSS_PAGE_OVERFLOW		3
+#define CBSS_AMO_NACKED			4
+#define CBSS_PUT_NACKED			5
+
+/*
+ * Structure used to fetch exception detail for CBs that terminate with
+ * CBS_EXCEPTION
+ */
+struct control_block_extended_exc_detail {
+	unsigned long	cb;
+	int		opc;
+	int		ecause;
+	int		exopc;
+	long		exceptdet0;
+	int		exceptdet1;
+};
+
+/*
+ * Instruction formats
+ */
+
+/*
+ * Generic instruction format.
+ * This definition has precise bit field definitions.
+ */
+struct gru_instruction_bits {
+    /* DW 0  - low */
+    unsigned int		icmd:      1;
+    unsigned char		ima:	   3;	/* CB_DelRep, unmapped mode */
+    unsigned char		reserved0: 4;
+    unsigned int		xtype:     3;
+    unsigned int		iaa0:      2;
+    unsigned int		iaa1:      2;
+    unsigned char		reserved1: 1;
+    unsigned char		opc:       8;	/* opcode */
+    unsigned char		exopc:     8;	/* extended opcode */
+    /* DW 0  - high */
+    unsigned int		idef2:    22;	/* TRi0 */
+    unsigned char		reserved2: 2;
+    unsigned char		istatus:   2;
+    unsigned char		isubstatus:4;
+    unsigned char		reserved3: 2;
+    /* DW 1 */
+    unsigned long		idef4;		/* 42 bits: TRi1, BufSize */
+    /* DW 2-6 */
+    unsigned long		idef1;		/* BAddr0 */
+    unsigned long		idef5;		/* Nelem */
+    unsigned long		idef6;		/* Stride, Operand1 */
+    unsigned long		idef3;		/* BAddr1, Value, Operand2 */
+    unsigned long		reserved4;
+    /* DW 7 */
+    unsigned long		avalue;		 /* AValue */
+};
+
+/*
+ * Generic instruction with friendlier names. This format is used
+ * for inline instructions.
+ */
+struct gru_instruction {
+    /* DW 0 */
+    unsigned int		op32;    /* icmd,xtype,iaa0,ima,opc */
+    unsigned int		tri0;
+    unsigned long		tri1_bufsize;		/* DW 1 */
+    unsigned long		baddr0;			/* DW 2 */
+    unsigned long		nelem;			/* DW 3 */
+    unsigned long		op1_stride;		/* DW 4 */
+    unsigned long		op2_value_baddr1;	/* DW 5 */
+    unsigned long		reserved0;		/* DW 6 */
+    unsigned long		avalue;			/* DW 7 */
+};
+
+/* Some shifts and masks for the low 32 bits of a GRU command */
+#define GRU_CB_ICMD_SHFT	0
+#define GRU_CB_ICMD_MASK	0x1
+#define GRU_CB_XTYPE_SHFT	8
+#define GRU_CB_XTYPE_MASK	0x7
+#define GRU_CB_IAA0_SHFT	11
+#define GRU_CB_IAA0_MASK	0x3
+#define GRU_CB_IAA1_SHFT	13
+#define GRU_CB_IAA1_MASK	0x3
+#define GRU_CB_IMA_SHFT		1
+#define GRU_CB_IMA_MASK		0x3
+#define GRU_CB_OPC_SHFT		16
+#define GRU_CB_OPC_MASK		0xff
+#define GRU_CB_EXOPC_SHFT	24
+#define GRU_CB_EXOPC_MASK	0xff
+
+/* GRU instruction opcodes (opc field) */
+#define OP_NOP		0x00
+#define OP_BCOPY	0x01
+#define OP_VLOAD	0x02
+#define OP_IVLOAD	0x03
+#define OP_VSTORE	0x04
+#define OP_IVSTORE	0x05
+#define OP_VSET		0x06
+#define OP_IVSET	0x07
+#define OP_MESQ		0x08
+#define OP_GAMXR	0x09
+#define OP_GAMIR	0x0a
+#define OP_GAMIRR	0x0b
+#define OP_GAMER	0x0c
+#define OP_GAMERR	0x0d
+#define OP_BSTORE	0x0e
+#define OP_VFLUSH	0x0f
+
+
+/* Extended opcodes values (exopc field) */
+
+/* GAMIR - AMOs with implicit operands */
+#define EOP_IR_FETCH	0x01 /* Plain fetch of memory */
+#define EOP_IR_CLR	0x02 /* Fetch and clear */
+#define EOP_IR_INC	0x05 /* Fetch and increment */
+#define EOP_IR_DEC	0x07 /* Fetch and decrement */
+#define EOP_IR_QCHK1	0x0d /* Queue check, 64 byte msg */
+#define EOP_IR_QCHK2	0x0e /* Queue check, 128 byte msg */
+
+/* GAMIRR - Registered AMOs with implicit operands */
+#define EOP_IRR_FETCH	0x01 /* Registered fetch of memory */
+#define EOP_IRR_CLR	0x02 /* Registered fetch and clear */
+#define EOP_IRR_INC	0x05 /* Registered fetch and increment */
+#define EOP_IRR_DEC	0x07 /* Registered fetch and decrement */
+#define EOP_IRR_DECZ	0x0f /* Registered fetch and decrement, update on zero*/
+
+/* GAMER - AMOs with explicit operands */
+#define EOP_ER_SWAP	0x00 /* Exchange argument and memory */
+#define EOP_ER_OR	0x01 /* Logical OR with memory */
+#define EOP_ER_AND	0x02 /* Logical AND with memory */
+#define EOP_ER_XOR	0x03 /* Logical XOR with memory */
+#define EOP_ER_ADD	0x04 /* Add value to memory */
+#define EOP_ER_CSWAP	0x08 /* Compare with operand2, write operand1 if match*/
+#define EOP_ER_CADD	0x0c /* Queue check, operand1*64 byte msg */
+
+/* GAMERR - Registered AMOs with explicit operands */
+#define EOP_ERR_SWAP	0x00 /* Exchange argument and memory */
+#define EOP_ERR_OR	0x01 /* Logical OR with memory */
+#define EOP_ERR_AND	0x02 /* Logical AND with memory */
+#define EOP_ERR_XOR	0x03 /* Logical XOR with memory */
+#define EOP_ERR_ADD	0x04 /* Add value to memory */
+#define EOP_ERR_CSWAP	0x08 /* Compare with operand2, write operand1 if match*/
+#define EOP_ERR_EPOLL	0x09 /* Poll for equality */
+#define EOP_ERR_NPOLL	0x0a /* Poll for inequality */
+
+/* GAMXR - SGI Arithmetic unit */
+#define EOP_XR_CSWAP	0x0b /* Masked compare exchange */
+
+
+/* Transfer types (xtype field) */
+#define XTYPE_B		0x0	/* byte */
+#define XTYPE_S		0x1	/* short (2-byte) */
+#define XTYPE_W		0x2	/* word (4-byte) */
+#define XTYPE_DW	0x3	/* doubleword (8-byte) */
+#define XTYPE_CL	0x6	/* cacheline (64-byte) */
+
+
+/* Instruction access attributes (iaa0, iaa1 fields) */
+#define IAA_RAM		0x0	/* normal cached RAM access */
+#define IAA_NCRAM	0x2	/* noncoherent RAM access */
+#define IAA_MMIO	0x1	/* noncoherent memory-mapped I/O space */
+#define IAA_REGISTER	0x3	/* memory-mapped registers, etc. */
+
+
+/* Instruction mode attributes (ima field) */
+#define IMA_MAPPED	0x0	/* Virtual mode  */
+#define IMA_CB_DELAY	0x1	/* hold read responses until status changes */
+#define IMA_UNMAPPED	0x2	/* bypass the TLBs (OS only) */
+#define IMA_INTERRUPT	0x4	/* Interrupt when instruction completes */
+
+/* CBE ecause bits */
+#define CBE_CAUSE_RI				(1 << 0)
+#define CBE_CAUSE_INVALID_INSTRUCTION		(1 << 1)
+#define CBE_CAUSE_UNMAPPED_MODE_FORBIDDEN	(1 << 2)
+#define CBE_CAUSE_PE_CHECK_DATA_ERROR		(1 << 3)
+#define CBE_CAUSE_IAA_GAA_MISMATCH		(1 << 4)
+#define CBE_CAUSE_DATA_SEGMENT_LIMIT_EXCEPTION	(1 << 5)
+#define CBE_CAUSE_OS_FATAL_TLB_FAULT		(1 << 6)
+#define CBE_CAUSE_EXECUTION_HW_ERROR		(1 << 7)
+#define CBE_CAUSE_TLBHW_ERROR			(1 << 8)
+#define CBE_CAUSE_RA_REQUEST_TIMEOUT		(1 << 9)
+#define CBE_CAUSE_HA_REQUEST_TIMEOUT		(1 << 10)
+#define CBE_CAUSE_RA_RESPONSE_FATAL		(1 << 11)
+#define CBE_CAUSE_RA_RESPONSE_NON_FATAL		(1 << 12)
+#define CBE_CAUSE_HA_RESPONSE_FATAL		(1 << 13)
+#define CBE_CAUSE_HA_RESPONSE_NON_FATAL		(1 << 14)
+#define CBE_CAUSE_ADDRESS_SPACE_DECODE_ERROR	(1 << 15)
+#define CBE_CAUSE_RESPONSE_DATA_ERROR		(1 << 16)
+#define CBE_CAUSE_PROTOCOL_STATE_DATA_ERROR	(1 << 17)
+
+/*
+ * Exceptions are retried for the following cases. If any OTHER bits are set
+ * in ecause, the exception is not retryable.
+ */
+#define EXCEPTION_RETRY_BITS (CBE_CAUSE_RESPONSE_DATA_ERROR |		\
+			      CBE_CAUSE_RA_REQUEST_TIMEOUT |		\
+			      CBE_CAUSE_TLBHW_ERROR |			\
+			      CBE_CAUSE_HA_REQUEST_TIMEOUT)
+
+/* Message queue head structure */
+union gru_mesqhead {
+	unsigned long	val;
+	struct {
+		unsigned int	head;
+		unsigned int	limit;
+	};
+};
+
+
+/* Generate the low word of a GRU instruction */
+static inline unsigned int
+__opword(unsigned char opcode, unsigned char exopc, unsigned char xtype,
+       unsigned char iaa0, unsigned char iaa1,
+       unsigned char ima)
+{
+    return (1 << GRU_CB_ICMD_SHFT) |
+	   (iaa0 << GRU_CB_IAA0_SHFT) |
+	   (iaa1 << GRU_CB_IAA1_SHFT) |
+	   (ima << GRU_CB_IMA_SHFT) |
+	   (xtype << GRU_CB_XTYPE_SHFT) |
+	   (opcode << GRU_CB_OPC_SHFT) |
+	   (exopc << GRU_CB_EXOPC_SHFT);
+}
+
+/*
+ * Architecture specific intrinsics
+ */
+static inline void gru_flush_cache(void *p)
+{
+	__flush_cache(p);
+}
+
+/*
+ * Store the lower 32 bits of the command including the "start" bit. Then
+ * start the instruction executing.
+ */
+static inline void gru_start_instruction(struct gru_instruction *ins, int op32)
+{
+	gru_ordered_store_int(ins, op32);
+}
+
+
+/* Convert "hints" to IMA */
+#define CB_IMA(h)		((h) | IMA_UNMAPPED)
+
+/* Convert data segment cache line index into TRI0 / TRI1 value */
+#define GRU_DINDEX(i)		((i) * GRU_CACHE_LINE_BYTES)
+
+/* Inline functions for GRU instructions.
+ *     Note:
+ *     	- nelem and stride are in elements
+ *     	- tri0/tri1 is in bytes for the beginning of the data segment.
+ */
+static inline void gru_vload(void *cb, unsigned long mem_addr,
+		unsigned int tri0, unsigned char xtype, unsigned long nelem,
+		unsigned long stride, unsigned long hints)
+{
+	struct gru_instruction *ins = (struct gru_instruction *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->nelem = nelem;
+	ins->tri0 = tri0;
+	ins->op1_stride = stride;
+	gru_start_instruction(ins, __opword(OP_VLOAD, 0, xtype, IAA_RAM, 0,
+					CB_IMA(hints)));
+}
+
+static inline void gru_vstore(void *cb, unsigned long mem_addr,
+		unsigned int tri0, unsigned char xtype, unsigned long nelem,
+		unsigned long stride, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->nelem = nelem;
+	ins->tri0 = tri0;
+	ins->op1_stride = stride;
+	gru_start_instruction(ins, __opword(OP_VSTORE, 0, xtype, IAA_RAM, 0,
+					CB_IMA(hints)));
+}
+
+static inline void gru_ivload(void *cb, unsigned long mem_addr,
+		unsigned int tri0, unsigned int tri1, unsigned char xtype,
+		unsigned long nelem, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->nelem = nelem;
+	ins->tri0 = tri0;
+	ins->tri1_bufsize = tri1;
+	gru_start_instruction(ins, __opword(OP_IVLOAD, 0, xtype, IAA_RAM, 0,
+					CB_IMA(hints)));
+}
+
+static inline void gru_ivstore(void *cb, unsigned long mem_addr,
+		unsigned int tri0, unsigned int tri1,
+		unsigned char xtype, unsigned long nelem, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->nelem = nelem;
+	ins->tri0 = tri0;
+	ins->tri1_bufsize = tri1;
+	gru_start_instruction(ins, __opword(OP_IVSTORE, 0, xtype, IAA_RAM, 0,
+					CB_IMA(hints)));
+}
+
+static inline void gru_vset(void *cb, unsigned long mem_addr,
+		unsigned long value, unsigned char xtype, unsigned long nelem,
+		unsigned long stride, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->op2_value_baddr1 = value;
+	ins->nelem = nelem;
+	ins->op1_stride = stride;
+	gru_start_instruction(ins, __opword(OP_VSET, 0, xtype, IAA_RAM, 0,
+					 CB_IMA(hints)));
+}
+
+static inline void gru_ivset(void *cb, unsigned long mem_addr,
+		unsigned int tri1, unsigned long value, unsigned char xtype,
+		unsigned long nelem, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->op2_value_baddr1 = value;
+	ins->nelem = nelem;
+	ins->tri1_bufsize = tri1;
+	gru_start_instruction(ins, __opword(OP_IVSET, 0, xtype, IAA_RAM, 0,
+					CB_IMA(hints)));
+}
+
+static inline void gru_vflush(void *cb, unsigned long mem_addr,
+		unsigned long nelem, unsigned char xtype, unsigned long stride,
+		unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)mem_addr;
+	ins->op1_stride = stride;
+	ins->nelem = nelem;
+	gru_start_instruction(ins, __opword(OP_VFLUSH, 0, xtype, IAA_RAM, 0,
+					CB_IMA(hints)));
+}
+
+static inline void gru_nop(void *cb, int hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	gru_start_instruction(ins, __opword(OP_NOP, 0, 0, 0, 0, CB_IMA(hints)));
+}
+
+
+static inline void gru_bcopy(void *cb, const unsigned long src,
+		unsigned long dest,
+		unsigned int tri0, unsigned int xtype, unsigned long nelem,
+		unsigned int bufsize, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)src;
+	ins->op2_value_baddr1 = (long)dest;
+	ins->nelem = nelem;
+	ins->tri0 = tri0;
+	ins->tri1_bufsize = bufsize;
+	gru_start_instruction(ins, __opword(OP_BCOPY, 0, xtype, IAA_RAM,
+					IAA_RAM, CB_IMA(hints)));
+}
+
+static inline void gru_bstore(void *cb, const unsigned long src,
+		unsigned long dest, unsigned int tri0, unsigned int xtype,
+		unsigned long nelem, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)src;
+	ins->op2_value_baddr1 = (long)dest;
+	ins->nelem = nelem;
+	ins->tri0 = tri0;
+	gru_start_instruction(ins, __opword(OP_BSTORE, 0, xtype, 0, IAA_RAM,
+					CB_IMA(hints)));
+}
+
+static inline void gru_gamir(void *cb, int exopc, unsigned long src,
+		unsigned int xtype, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)src;
+	gru_start_instruction(ins, __opword(OP_GAMIR, exopc, xtype, IAA_RAM, 0,
+					CB_IMA(hints)));
+}
+
+static inline void gru_gamirr(void *cb, int exopc, unsigned long src,
+		unsigned int xtype, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)src;
+	gru_start_instruction(ins, __opword(OP_GAMIRR, exopc, xtype, IAA_RAM, 0,
+					CB_IMA(hints)));
+}
+
+static inline void gru_gamer(void *cb, int exopc, unsigned long src,
+		unsigned int xtype,
+		unsigned long operand1, unsigned long operand2,
+		unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)src;
+	ins->op1_stride = operand1;
+	ins->op2_value_baddr1 = operand2;
+	gru_start_instruction(ins, __opword(OP_GAMER, exopc, xtype, IAA_RAM, 0,
+					CB_IMA(hints)));
+}
+
+static inline void gru_gamerr(void *cb, int exopc, unsigned long src,
+		unsigned int xtype, unsigned long operand1,
+		unsigned long operand2, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)src;
+	ins->op1_stride = operand1;
+	ins->op2_value_baddr1 = operand2;
+	gru_start_instruction(ins, __opword(OP_GAMERR, exopc, xtype, IAA_RAM, 0,
+					CB_IMA(hints)));
+}
+
+static inline void gru_gamxr(void *cb, unsigned long src,
+		unsigned int tri0, unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)src;
+	ins->nelem = 4;
+	gru_start_instruction(ins, __opword(OP_GAMXR, EOP_XR_CSWAP, XTYPE_DW,
+				 IAA_RAM, 0, CB_IMA(hints)));
+}
+
+static inline void gru_mesq(void *cb, unsigned long queue,
+		unsigned long tri0, unsigned long nelem,
+		unsigned long hints)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	ins->baddr0 = (long)queue;
+	ins->nelem = nelem;
+	ins->tri0 = tri0;
+	gru_start_instruction(ins, __opword(OP_MESQ, 0, XTYPE_CL, IAA_RAM, 0,
+					CB_IMA(hints)));
+}
+
+static inline unsigned long gru_get_amo_value(void *cb)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	return ins->avalue;
+}
+
+static inline int gru_get_amo_value_head(void *cb)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	return ins->avalue & 0xffffffff;
+}
+
+static inline int gru_get_amo_value_limit(void *cb)
+{
+	struct gru_instruction *ins = (void *)cb;
+
+	return ins->avalue >> 32;
+}
+
+static inline union gru_mesqhead  gru_mesq_head(int head, int limit)
+{
+	union gru_mesqhead mqh;
+
+	mqh.head = head;
+	mqh.limit = limit;
+	return mqh;
+}
+
+/*
+ * Get struct control_block_extended_exc_detail for CB.
+ */
+extern int gru_get_cb_exception_detail(void *cb,
+		       struct control_block_extended_exc_detail *excdet);
+
+#define GRU_EXC_STR_SIZE		256
+
+extern int gru_check_status_proc(void *cb);
+extern int gru_wait_proc(void *cb);
+extern void gru_wait_abort_proc(void *cb);
+
+/*
+ * Control block definition for checking status
+ */
+struct gru_control_block_status {
+	unsigned int	icmd		:1;
+	unsigned int	unused1		:31;
+	unsigned int	unused2		:24;
+	unsigned int	istatus		:2;
+	unsigned int	isubstatus	:4;
+	unsigned int	inused3		:2;
+};
+
+/* Get CB status */
+static inline int gru_get_cb_status(void *cb)
+{
+	struct gru_control_block_status *cbs = (void *)cb;
+
+	return cbs->istatus;
+}
+
+/* Get CB message queue substatus */
+static inline int gru_get_cb_message_queue_substatus(void *cb)
+{
+	struct gru_control_block_status *cbs = (void *)cb;
+
+	return cbs->isubstatus & CBSS_MSG_QUEUE_MASK;
+}
+
+/* Get CB substatus */
+static inline int gru_get_cb_substatus(void *cb)
+{
+	struct gru_control_block_status *cbs = (void *)cb;
+
+	return cbs->isubstatus;
+}
+
+/* Check the status of a CB. If the CB is in UPM mode, call the
+ * OS to handle the UPM status.
+ * Returns the CB status field value (0 for normal completion)
+ */
+static inline int gru_check_status(void *cb)
+{
+	struct gru_control_block_status *cbs = (void *)cb;
+	int ret = cbs->istatus;
+
+	if (ret == CBS_CALL_OS)
+		ret = gru_check_status_proc(cb);
+	return ret;
+}
+
+/* Wait for CB to complete.
+ * Returns the CB status field value (0 for normal completion)
+ */
+static inline int gru_wait(void *cb)
+{
+	struct gru_control_block_status *cbs = (void *)cb;
+	int ret = cbs->istatus;;
+
+	if (ret != CBS_IDLE)
+		ret = gru_wait_proc(cb);
+	return ret;
+}
+
+/* Wait for CB to complete. Aborts program if error. (Note: error does NOT
+ * mean TLB mis - only fatal errors such as memory parity error or user
+ * bugs will cause termination.
+ */
+static inline void gru_wait_abort(void *cb)
+{
+	struct gru_control_block_status *cbs = (void *)cb;
+
+	if (cbs->istatus != CBS_IDLE)
+		gru_wait_abort_proc(cb);
+}
+
+
+/*
+ * Get a pointer to a control block
+ * 	gseg	- GSeg address returned from gru_get_thread_gru_segment()
+ * 	index	- index of desired CB
+ */
+static inline void *gru_get_cb_pointer(void *gseg,
+						      int index)
+{
+	return gseg + GRU_CB_BASE + index * GRU_HANDLE_STRIDE;
+}
+
+/*
+ * Get a pointer to a cacheline in the data segment portion of a GSeg
+ * 	gseg	- GSeg address returned from gru_get_thread_gru_segment()
+ * 	index	- index of desired cache line
+ */
+static inline void *gru_get_data_pointer(void *gseg, int index)
+{
+	return gseg + GRU_DS_BASE + index * GRU_CACHE_LINE_BYTES;
+}
+
+/*
+ * Convert a vaddr into the tri index within the GSEG
+ * 	vaddr		- virtual address of within gseg
+ */
+static inline int gru_get_tri(void *vaddr)
+{
+	return ((unsigned long)vaddr & (GRU_GSEG_PAGESIZE - 1)) - GRU_DS_BASE;
+}
+#endif		/* __GRU_INSTRUCTIONS_H__ */
diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c
new file mode 100644
index 00000000000..3d33015bbf3
--- /dev/null
+++ b/drivers/misc/sgi-gru/grufault.c
@@ -0,0 +1,633 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *              FAULT HANDLER FOR GRU DETECTED TLB MISSES
+ *
+ * This file contains code that handles TLB misses within the GRU.
+ * These misses are reported either via interrupts or user polling of
+ * the user CB.
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include "gru.h"
+#include "grutables.h"
+#include "grulib.h"
+#include "gru_instructions.h"
+#include <asm/uv/uv_hub.h>
+
+/*
+ * Test if a physical address is a valid GRU GSEG address
+ */
+static inline int is_gru_paddr(unsigned long paddr)
+{
+	return paddr >= gru_start_paddr && paddr < gru_end_paddr;
+}
+
+/*
+ * Find the vma of a GRU segment. Caller must hold mmap_sem.
+ */
+struct vm_area_struct *gru_find_vma(unsigned long vaddr)
+{
+	struct vm_area_struct *vma;
+
+	vma = find_vma(current->mm, vaddr);
+	if (vma && vma->vm_start <= vaddr && vma->vm_ops == &gru_vm_ops)
+		return vma;
+	return NULL;
+}
+
+/*
+ * Find and lock the gts that contains the specified user vaddr.
+ *
+ * Returns:
+ * 	- *gts with the mmap_sem locked for read and the GTS locked.
+ *	- NULL if vaddr invalid OR is not a valid GSEG vaddr.
+ */
+
+static struct gru_thread_state *gru_find_lock_gts(unsigned long vaddr)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct gru_thread_state *gts = NULL;
+
+	down_read(&mm->mmap_sem);
+	vma = gru_find_vma(vaddr);
+	if (vma)
+		gts = gru_find_thread_state(vma, TSID(vaddr, vma));
+	if (gts)
+		mutex_lock(&gts->ts_ctxlock);
+	else
+		up_read(&mm->mmap_sem);
+	return gts;
+}
+
+static struct gru_thread_state *gru_alloc_locked_gts(unsigned long vaddr)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct gru_thread_state *gts = NULL;
+
+	down_write(&mm->mmap_sem);
+	vma = gru_find_vma(vaddr);
+	if (vma)
+		gts = gru_alloc_thread_state(vma, TSID(vaddr, vma));
+	if (gts) {
+		mutex_lock(&gts->ts_ctxlock);
+		downgrade_write(&mm->mmap_sem);
+	} else {
+		up_write(&mm->mmap_sem);
+	}
+
+	return gts;
+}
+
+/*
+ * Unlock a GTS that was previously locked with gru_find_lock_gts().
+ */
+static void gru_unlock_gts(struct gru_thread_state *gts)
+{
+	mutex_unlock(&gts->ts_ctxlock);
+	up_read(&current->mm->mmap_sem);
+}
+
+/*
+ * Set a CB.istatus to active using a user virtual address. This must be done
+ * just prior to a TFH RESTART. The new cb.istatus is an in-cache status ONLY.
+ * If the line is evicted, the status may be lost. The in-cache update
+ * is necessary to prevent the user from seeing a stale cb.istatus that will
+ * change as soon as the TFH restart is complete. Races may cause an
+ * occasional failure to clear the cb.istatus, but that is ok.
+ *
+ * If the cb address is not valid (should not happen, but...), nothing
+ * bad will happen.. The get_user()/put_user() will fail but there
+ * are no bad side-effects.
+ */
+static void gru_cb_set_istatus_active(unsigned long __user *cb)
+{
+	union {
+		struct gru_instruction_bits bits;
+		unsigned long dw;
+	} u;
+
+	if (cb) {
+		get_user(u.dw, cb);
+		u.bits.istatus = CBS_ACTIVE;
+		put_user(u.dw, cb);
+	}
+}
+
+/*
+ * Convert a interrupt IRQ to a pointer to the GRU GTS that caused the
+ * interrupt. Interrupts are always sent to a cpu on the blade that contains the
+ * GRU (except for headless blades which are not currently supported). A blade
+ * has N grus; a block of N consecutive IRQs is assigned to the GRUs. The IRQ
+ * number uniquely identifies the GRU chiplet on the local blade that caused the
+ * interrupt. Always called in interrupt context.
+ */
+static inline struct gru_state *irq_to_gru(int irq)
+{
+	return &gru_base[uv_numa_blade_id()]->bs_grus[irq - IRQ_GRU];
+}
+
+/*
+ * Read & clear a TFM
+ *
+ * The GRU has an array of fault maps. A map is private to a cpu
+ * Only one cpu will be accessing a cpu's fault map.
+ *
+ * This function scans the cpu-private fault map & clears all bits that
+ * are set. The function returns a bitmap that indicates the bits that
+ * were cleared. Note that sense the maps may be updated asynchronously by
+ * the GRU, atomic operations must be used to clear bits.
+ */
+static void get_clear_fault_map(struct gru_state *gru,
+				struct gru_tlb_fault_map *map)
+{
+	unsigned long i, k;
+	struct gru_tlb_fault_map *tfm;
+
+	tfm = get_tfm_for_cpu(gru, gru_cpu_fault_map_id());
+	prefetchw(tfm);		/* Helps on hardware, required for emulator */
+	for (i = 0; i < BITS_TO_LONGS(GRU_NUM_CBE); i++) {
+		k = tfm->fault_bits[i];
+		if (k)
+			k = xchg(&tfm->fault_bits[i], 0UL);
+		map->fault_bits[i] = k;
+	}
+
+	/*
+	 * Not functionally required but helps performance. (Required
+	 * on emulator)
+	 */
+	gru_flush_cache(tfm);
+}
+
+/*
+ * Atomic (interrupt context) & non-atomic (user context) functions to
+ * convert a vaddr into a physical address. The size of the page
+ * is returned in pageshift.
+ * 	returns:
+ * 		  0 - successful
+ * 		< 0 - error code
+ * 		  1 - (atomic only) try again in non-atomic context
+ */
+static int non_atomic_pte_lookup(struct vm_area_struct *vma,
+				 unsigned long vaddr, int write,
+				 unsigned long *paddr, int *pageshift)
+{
+	struct page *page;
+
+	/* ZZZ Need to handle HUGE pages */
+	if (is_vm_hugetlb_page(vma))
+		return -EFAULT;
+	*pageshift = PAGE_SHIFT;
+	if (get_user_pages
+	    (current, current->mm, vaddr, 1, write, 0, &page, NULL) <= 0)
+		return -EFAULT;
+	*paddr = page_to_phys(page);
+	put_page(page);
+	return 0;
+}
+
+/*
+ *
+ * atomic_pte_lookup
+ *
+ * Convert a user virtual address to a physical address
+ * Only supports Intel large pages (2MB only) on x86_64.
+ *	ZZZ - hugepage support is incomplete
+ */
+static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr,
+	int write, unsigned long *paddr, int *pageshift)
+{
+	pgd_t *pgdp;
+	pmd_t *pmdp;
+	pud_t *pudp;
+	pte_t pte;
+
+	WARN_ON(irqs_disabled());		/* ZZZ debug */
+
+	local_irq_disable();
+	pgdp = pgd_offset(vma->vm_mm, vaddr);
+	if (unlikely(pgd_none(*pgdp)))
+		goto err;
+
+	pudp = pud_offset(pgdp, vaddr);
+	if (unlikely(pud_none(*pudp)))
+		goto err;
+
+	pmdp = pmd_offset(pudp, vaddr);
+	if (unlikely(pmd_none(*pmdp)))
+		goto err;
+#ifdef CONFIG_X86_64
+	if (unlikely(pmd_large(*pmdp)))
+		pte = *(pte_t *) pmdp;
+	else
+#endif
+		pte = *pte_offset_kernel(pmdp, vaddr);
+
+	local_irq_enable();
+
+	if (unlikely(!pte_present(pte) ||
+		     (write && (!pte_write(pte) || !pte_dirty(pte)))))
+		return 1;
+
+	*paddr = pte_pfn(pte) << PAGE_SHIFT;
+	*pageshift = is_vm_hugetlb_page(vma) ? HPAGE_SHIFT : PAGE_SHIFT;
+	return 0;
+
+err:
+	local_irq_enable();
+	return 1;
+}
+
+/*
+ * Drop a TLB entry into the GRU. The fault is described by info in an TFH.
+ *	Input:
+ *		cb    Address of user CBR. Null if not running in user context
+ * 	Return:
+ * 		  0 = dropin, exception, or switch to UPM successful
+ * 		  1 = range invalidate active
+ * 		< 0 = error code
+ *
+ */
+static int gru_try_dropin(struct gru_thread_state *gts,
+			  struct gru_tlb_fault_handle *tfh,
+			  unsigned long __user *cb)
+{
+	struct mm_struct *mm = gts->ts_mm;
+	struct vm_area_struct *vma;
+	int pageshift, asid, write, ret;
+	unsigned long paddr, gpa, vaddr;
+
+	/*
+	 * NOTE: The GRU contains magic hardware that eliminates races between
+	 * TLB invalidates and TLB dropins. If an invalidate occurs
+	 * in the window between reading the TFH and the subsequent TLB dropin,
+	 * the dropin is ignored. This eliminates the need for additional locks.
+	 */
+
+	/*
+	 * Error if TFH state is IDLE or FMM mode & the user issuing a UPM call.
+	 * Might be a hardware race OR a stupid user. Ignore FMM because FMM
+	 * is a transient state.
+	 */
+	if (tfh->state == TFHSTATE_IDLE)
+		goto failidle;
+	if (tfh->state == TFHSTATE_MISS_FMM && cb)
+		goto failfmm;
+
+	write = (tfh->cause & TFHCAUSE_TLB_MOD) != 0;
+	vaddr = tfh->missvaddr;
+	asid = tfh->missasid;
+	if (asid == 0)
+		goto failnoasid;
+
+	rmb();	/* TFH must be cache resident before reading ms_range_active */
+
+	/*
+	 * TFH is cache resident - at least briefly. Fail the dropin
+	 * if a range invalidate is active.
+	 */
+	if (atomic_read(&gts->ts_gms->ms_range_active))
+		goto failactive;
+
+	vma = find_vma(mm, vaddr);
+	if (!vma)
+		goto failinval;
+
+	/*
+	 * Atomic lookup is faster & usually works even if called in non-atomic
+	 * context.
+	 */
+	ret = atomic_pte_lookup(vma, vaddr, write, &paddr, &pageshift);
+	if (ret) {
+		if (!cb)
+			goto failupm;
+		if (non_atomic_pte_lookup(vma, vaddr, write, &paddr,
+					  &pageshift))
+			goto failinval;
+	}
+	if (is_gru_paddr(paddr))
+		goto failinval;
+
+	paddr = paddr & ~((1UL << pageshift) - 1);
+	gpa = uv_soc_phys_ram_to_gpa(paddr);
+	gru_cb_set_istatus_active(cb);
+	tfh_write_restart(tfh, gpa, GAA_RAM, vaddr, asid, write,
+			  GRU_PAGESIZE(pageshift));
+	STAT(tlb_dropin);
+	gru_dbg(grudev,
+		"%s: tfh 0x%p, vaddr 0x%lx, asid 0x%x, ps %d, gpa 0x%lx\n",
+		ret ? "non-atomic" : "atomic", tfh, vaddr, asid,
+		pageshift, gpa);
+	return 0;
+
+failnoasid:
+	/* No asid (delayed unload). */
+	STAT(tlb_dropin_fail_no_asid);
+	gru_dbg(grudev, "FAILED no_asid tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
+	if (!cb)
+		tfh_user_polling_mode(tfh);
+	else
+		gru_flush_cache(tfh);
+	return -EAGAIN;
+
+failupm:
+	/* Atomic failure switch CBR to UPM */
+	tfh_user_polling_mode(tfh);
+	STAT(tlb_dropin_fail_upm);
+	gru_dbg(grudev, "FAILED upm tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
+	return 1;
+
+failfmm:
+	/* FMM state on UPM call */
+	STAT(tlb_dropin_fail_fmm);
+	gru_dbg(grudev, "FAILED fmm tfh: 0x%p, state %d\n", tfh, tfh->state);
+	return 0;
+
+failidle:
+	/* TFH was idle  - no miss pending */
+	gru_flush_cache(tfh);
+	if (cb)
+		gru_flush_cache(cb);
+	STAT(tlb_dropin_fail_idle);
+	gru_dbg(grudev, "FAILED idle tfh: 0x%p, state %d\n", tfh, tfh->state);
+	return 0;
+
+failinval:
+	/* All errors (atomic & non-atomic) switch CBR to EXCEPTION state */
+	tfh_exception(tfh);
+	STAT(tlb_dropin_fail_invalid);
+	gru_dbg(grudev, "FAILED inval tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
+	return -EFAULT;
+
+failactive:
+	/* Range invalidate active. Switch to UPM iff atomic */
+	if (!cb)
+		tfh_user_polling_mode(tfh);
+	else
+		gru_flush_cache(tfh);
+	STAT(tlb_dropin_fail_range_active);
+	gru_dbg(grudev, "FAILED range active: tfh 0x%p, vaddr 0x%lx\n",
+		tfh, vaddr);
+	return 1;
+}
+
+/*
+ * Process an external interrupt from the GRU. This interrupt is
+ * caused by a TLB miss.
+ * Note that this is the interrupt handler that is registered with linux
+ * interrupt handlers.
+ */
+irqreturn_t gru_intr(int irq, void *dev_id)
+{
+	struct gru_state *gru;
+	struct gru_tlb_fault_map map;
+	struct gru_thread_state *gts;
+	struct gru_tlb_fault_handle *tfh = NULL;
+	int cbrnum, ctxnum;
+
+	STAT(intr);
+
+	gru = irq_to_gru(irq);
+	if (!gru) {
+		dev_err(grudev, "GRU: invalid interrupt: cpu %d, irq %d\n",
+			raw_smp_processor_id(), irq);
+		return IRQ_NONE;
+	}
+	get_clear_fault_map(gru, &map);
+	gru_dbg(grudev, "irq %d, gru %x, map 0x%lx\n", irq, gru->gs_gid,
+		map.fault_bits[0]);
+
+	for_each_cbr_in_tfm(cbrnum, map.fault_bits) {
+		tfh = get_tfh_by_index(gru, cbrnum);
+		prefetchw(tfh);	/* Helps on hdw, required for emulator */
+
+		/*
+		 * When hardware sets a bit in the faultmap, it implicitly
+		 * locks the GRU context so that it cannot be unloaded.
+		 * The gts cannot change until a TFH start/writestart command
+		 * is issued.
+		 */
+		ctxnum = tfh->ctxnum;
+		gts = gru->gs_gts[ctxnum];
+
+		/*
+		 * This is running in interrupt context. Trylock the mmap_sem.
+		 * If it fails, retry the fault in user context.
+		 */
+		if (down_read_trylock(&gts->ts_mm->mmap_sem)) {
+			gru_try_dropin(gts, tfh, NULL);
+			up_read(&gts->ts_mm->mmap_sem);
+		} else {
+			tfh_user_polling_mode(tfh);
+		}
+	}
+	return IRQ_HANDLED;
+}
+
+
+static int gru_user_dropin(struct gru_thread_state *gts,
+			   struct gru_tlb_fault_handle *tfh,
+			   unsigned long __user *cb)
+{
+	struct gru_mm_struct *gms = gts->ts_gms;
+	int ret;
+
+	while (1) {
+		wait_event(gms->ms_wait_queue,
+			   atomic_read(&gms->ms_range_active) == 0);
+		prefetchw(tfh);	/* Helps on hdw, required for emulator */
+		ret = gru_try_dropin(gts, tfh, cb);
+		if (ret <= 0)
+			return ret;
+		STAT(call_os_wait_queue);
+	}
+}
+
+/*
+ * This interface is called as a result of a user detecting a "call OS" bit
+ * in a user CB. Normally means that a TLB fault has occurred.
+ * 	cb - user virtual address of the CB
+ */
+int gru_handle_user_call_os(unsigned long cb)
+{
+	struct gru_tlb_fault_handle *tfh;
+	struct gru_thread_state *gts;
+	unsigned long __user *cbp;
+	int ucbnum, cbrnum, ret = -EINVAL;
+
+	STAT(call_os);
+	gru_dbg(grudev, "address 0x%lx\n", cb);
+
+	/* sanity check the cb pointer */
+	ucbnum = get_cb_number((void *)cb);
+	if ((cb & (GRU_HANDLE_STRIDE - 1)) || ucbnum >= GRU_NUM_CB)
+		return -EINVAL;
+	cbp = (unsigned long *)cb;
+
+	gts = gru_find_lock_gts(cb);
+	if (!gts)
+		return -EINVAL;
+
+	if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE) {
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	/*
+	 * If force_unload is set, the UPM TLB fault is phony. The task
+	 * has migrated to another node and the GSEG must be moved. Just
+	 * unload the context. The task will page fault and assign a new
+	 * context.
+	 */
+	ret = -EAGAIN;
+	cbrnum = thread_cbr_number(gts, ucbnum);
+	if (gts->ts_force_unload) {
+		gru_unload_context(gts, 1);
+	} else if (gts->ts_gru) {
+		tfh = get_tfh_by_index(gts->ts_gru, cbrnum);
+		ret = gru_user_dropin(gts, tfh, cbp);
+	}
+exit:
+	gru_unlock_gts(gts);
+	return ret;
+}
+
+/*
+ * Fetch the exception detail information for a CB that terminated with
+ * an exception.
+ */
+int gru_get_exception_detail(unsigned long arg)
+{
+	struct control_block_extended_exc_detail excdet;
+	struct gru_control_block_extended *cbe;
+	struct gru_thread_state *gts;
+	int ucbnum, cbrnum, ret;
+
+	STAT(user_exception);
+	if (copy_from_user(&excdet, (void __user *)arg, sizeof(excdet)))
+		return -EFAULT;
+
+	gru_dbg(grudev, "address 0x%lx\n", excdet.cb);
+	gts = gru_find_lock_gts(excdet.cb);
+	if (!gts)
+		return -EINVAL;
+
+	if (gts->ts_gru) {
+		ucbnum = get_cb_number((void *)excdet.cb);
+		cbrnum = thread_cbr_number(gts, ucbnum);
+		cbe = get_cbe_by_index(gts->ts_gru, cbrnum);
+		excdet.opc = cbe->opccpy;
+		excdet.exopc = cbe->exopccpy;
+		excdet.ecause = cbe->ecause;
+		excdet.exceptdet0 = cbe->idef1upd;
+		excdet.exceptdet1 = cbe->idef3upd;
+		ret = 0;
+	} else {
+		ret = -EAGAIN;
+	}
+	gru_unlock_gts(gts);
+
+	gru_dbg(grudev, "address 0x%lx, ecause 0x%x\n", excdet.cb,
+		excdet.ecause);
+	if (!ret && copy_to_user((void __user *)arg, &excdet, sizeof(excdet)))
+		ret = -EFAULT;
+	return ret;
+}
+
+/*
+ * User request to unload a context. Content is saved for possible reload.
+ */
+int gru_user_unload_context(unsigned long arg)
+{
+	struct gru_thread_state *gts;
+	struct gru_unload_context_req req;
+
+	STAT(user_unload_context);
+	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+		return -EFAULT;
+
+	gru_dbg(grudev, "gseg 0x%lx\n", req.gseg);
+
+	gts = gru_find_lock_gts(req.gseg);
+	if (!gts)
+		return -EINVAL;
+
+	if (gts->ts_gru)
+		gru_unload_context(gts, 1);
+	gru_unlock_gts(gts);
+
+	return 0;
+}
+
+/*
+ * User request to flush a range of virtual addresses from the GRU TLB
+ * (Mainly for testing).
+ */
+int gru_user_flush_tlb(unsigned long arg)
+{
+	struct gru_thread_state *gts;
+	struct gru_flush_tlb_req req;
+
+	STAT(user_flush_tlb);
+	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+		return -EFAULT;
+
+	gru_dbg(grudev, "gseg 0x%lx, vaddr 0x%lx, len 0x%lx\n", req.gseg,
+		req.vaddr, req.len);
+
+	gts = gru_find_lock_gts(req.gseg);
+	if (!gts)
+		return -EINVAL;
+
+	gru_flush_tlb_range(gts->ts_gms, req.vaddr, req.vaddr + req.len);
+	gru_unlock_gts(gts);
+
+	return 0;
+}
+
+/*
+ * Register the current task as the user of the GSEG slice.
+ * Needed for TLB fault interrupt targeting.
+ */
+int gru_set_task_slice(long address)
+{
+	struct gru_thread_state *gts;
+
+	STAT(set_task_slice);
+	gru_dbg(grudev, "address 0x%lx\n", address);
+	gts = gru_alloc_locked_gts(address);
+	if (!gts)
+		return -EINVAL;
+
+	gts->ts_tgid_owner = current->tgid;
+	gru_unlock_gts(gts);
+
+	return 0;
+}
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
new file mode 100644
index 00000000000..23c91f5f6b6
--- /dev/null
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -0,0 +1,485 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *              FILE OPERATIONS & DRIVER INITIALIZATION
+ *
+ * This file supports the user system call for file open, close, mmap, etc.
+ * This also incudes the driver initialization code.
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/uaccess.h>
+#include "gru.h"
+#include "grulib.h"
+#include "grutables.h"
+
+#if defined CONFIG_X86_64
+#include <asm/genapic.h>
+#include <asm/irq.h>
+#define IS_UV()		is_uv_system()
+#elif defined CONFIG_IA64
+#include <asm/system.h>
+#include <asm/sn/simulator.h>
+/* temp support for running on hardware simulator */
+#define IS_UV()		IS_MEDUSA() || ia64_platform_is("uv")
+#else
+#define IS_UV()		0
+#endif
+
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_mmrs.h>
+
+struct gru_blade_state *gru_base[GRU_MAX_BLADES] __read_mostly;
+unsigned long gru_start_paddr, gru_end_paddr __read_mostly;
+struct gru_stats_s gru_stats;
+
+/* Guaranteed user available resources on each node */
+static int max_user_cbrs, max_user_dsr_bytes;
+
+static struct file_operations gru_fops;
+static struct miscdevice gru_miscdev;
+
+
+/*
+ * gru_vma_close
+ *
+ * Called when unmapping a device mapping. Frees all gru resources
+ * and tables belonging to the vma.
+ */
+static void gru_vma_close(struct vm_area_struct *vma)
+{
+	struct gru_vma_data *vdata;
+	struct gru_thread_state *gts;
+	struct list_head *entry, *next;
+
+	if (!vma->vm_private_data)
+		return;
+
+	vdata = vma->vm_private_data;
+	vma->vm_private_data = NULL;
+	gru_dbg(grudev, "vma %p, file %p, vdata %p\n", vma, vma->vm_file,
+				vdata);
+	list_for_each_safe(entry, next, &vdata->vd_head) {
+		gts =
+		    list_entry(entry, struct gru_thread_state, ts_next);
+		list_del(&gts->ts_next);
+		mutex_lock(&gts->ts_ctxlock);
+		if (gts->ts_gru)
+			gru_unload_context(gts, 0);
+		mutex_unlock(&gts->ts_ctxlock);
+		gts_drop(gts);
+	}
+	kfree(vdata);
+	STAT(vdata_free);
+}
+
+/*
+ * gru_file_mmap
+ *
+ * Called when mmaping the device.  Initializes the vma with a fault handler
+ * and private data structure necessary to allocate, track, and free the
+ * underlying pages.
+ */
+static int gru_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) != (VM_SHARED | VM_WRITE))
+		return -EPERM;
+
+	if (vma->vm_start & (GRU_GSEG_PAGESIZE - 1) ||
+			vma->vm_end & (GRU_GSEG_PAGESIZE - 1))
+		return -EINVAL;
+
+	vma->vm_flags |=
+	    (VM_IO | VM_DONTCOPY | VM_LOCKED | VM_DONTEXPAND | VM_PFNMAP |
+			VM_RESERVED);
+	vma->vm_page_prot = PAGE_SHARED;
+	vma->vm_ops = &gru_vm_ops;
+
+	vma->vm_private_data = gru_alloc_vma_data(vma, 0);
+	if (!vma->vm_private_data)
+		return -ENOMEM;
+
+	gru_dbg(grudev, "file %p, vaddr 0x%lx, vma %p, vdata %p\n",
+		file, vma->vm_start, vma, vma->vm_private_data);
+	return 0;
+}
+
+/*
+ * Create a new GRU context
+ */
+static int gru_create_new_context(unsigned long arg)
+{
+	struct gru_create_context_req req;
+	struct vm_area_struct *vma;
+	struct gru_vma_data *vdata;
+	int ret = -EINVAL;
+
+
+	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
+		return -EFAULT;
+
+	if (req.data_segment_bytes == 0 ||
+				req.data_segment_bytes > max_user_dsr_bytes)
+		return -EINVAL;
+	if (!req.control_blocks || !req.maximum_thread_count ||
+				req.control_blocks > max_user_cbrs)
+		return -EINVAL;
+
+	if (!(req.options & GRU_OPT_MISS_MASK))
+		req.options |= GRU_OPT_MISS_FMM_INTR;
+
+	down_write(&current->mm->mmap_sem);
+	vma = gru_find_vma(req.gseg);
+	if (vma) {
+		vdata = vma->vm_private_data;
+		vdata->vd_user_options = req.options;
+		vdata->vd_dsr_au_count =
+		    GRU_DS_BYTES_TO_AU(req.data_segment_bytes);
+		vdata->vd_cbr_au_count = GRU_CB_COUNT_TO_AU(req.control_blocks);
+		ret = 0;
+	}
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+/*
+ * Get GRU configuration info (temp - for emulator testing)
+ */
+static long gru_get_config_info(unsigned long arg)
+{
+	struct gru_config_info info;
+	int nodesperblade;
+
+	if (num_online_nodes() > 1 &&
+			(uv_node_to_blade_id(1) == uv_node_to_blade_id(0)))
+		nodesperblade = 2;
+	else
+		nodesperblade = 1;
+	info.cpus = num_online_cpus();
+	info.nodes = num_online_nodes();
+	info.blades = info.nodes / nodesperblade;
+	info.chiplets = GRU_CHIPLETS_PER_BLADE * info.blades;
+
+	if (copy_to_user((void __user *)arg, &info, sizeof(info)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Get GRU chiplet status
+ */
+static long gru_get_chiplet_status(unsigned long arg)
+{
+	struct gru_state *gru;
+	struct gru_chiplet_info info;
+
+	if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+		return -EFAULT;
+
+	if (info.node == -1)
+		info.node = numa_node_id();
+	if (info.node >= num_possible_nodes() ||
+			info.chiplet >= GRU_CHIPLETS_PER_HUB ||
+			info.node < 0 || info.chiplet < 0)
+		return -EINVAL;
+
+	info.blade = uv_node_to_blade_id(info.node);
+	gru = get_gru(info.blade, info.chiplet);
+
+	info.total_dsr_bytes = GRU_NUM_DSR_BYTES;
+	info.total_cbr = GRU_NUM_CB;
+	info.total_user_dsr_bytes = GRU_NUM_DSR_BYTES -
+		gru->gs_reserved_dsr_bytes;
+	info.total_user_cbr = GRU_NUM_CB - gru->gs_reserved_cbrs;
+	info.free_user_dsr_bytes = hweight64(gru->gs_dsr_map) *
+			GRU_DSR_AU_BYTES;
+	info.free_user_cbr = hweight64(gru->gs_cbr_map) * GRU_CBR_AU_SIZE;
+
+	if (copy_to_user((void __user *)arg, &info, sizeof(info)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * gru_file_unlocked_ioctl
+ *
+ * Called to update file attributes via IOCTL calls.
+ */
+static long gru_file_unlocked_ioctl(struct file *file, unsigned int req,
+				    unsigned long arg)
+{
+	int err = -EBADRQC;
+
+	gru_dbg(grudev, "file %p\n", file);
+
+	switch (req) {
+	case GRU_CREATE_CONTEXT:
+		err = gru_create_new_context(arg);
+		break;
+	case GRU_SET_TASK_SLICE:
+		err = gru_set_task_slice(arg);
+		break;
+	case GRU_USER_GET_EXCEPTION_DETAIL:
+		err = gru_get_exception_detail(arg);
+		break;
+	case GRU_USER_UNLOAD_CONTEXT:
+		err = gru_user_unload_context(arg);
+		break;
+	case GRU_GET_CHIPLET_STATUS:
+		err = gru_get_chiplet_status(arg);
+		break;
+	case GRU_USER_FLUSH_TLB:
+		err = gru_user_flush_tlb(arg);
+		break;
+	case GRU_USER_CALL_OS:
+		err = gru_handle_user_call_os(arg);
+		break;
+	case GRU_GET_CONFIG_INFO:
+		err = gru_get_config_info(arg);
+		break;
+	}
+	return err;
+}
+
+/*
+ * Called at init time to build tables for all GRUs that are present in the
+ * system.
+ */
+static void gru_init_chiplet(struct gru_state *gru, unsigned long paddr,
+			     void *vaddr, int nid, int bid, int grunum)
+{
+	spin_lock_init(&gru->gs_lock);
+	spin_lock_init(&gru->gs_asid_lock);
+	gru->gs_gru_base_paddr = paddr;
+	gru->gs_gru_base_vaddr = vaddr;
+	gru->gs_gid = bid * GRU_CHIPLETS_PER_BLADE + grunum;
+	gru->gs_blade = gru_base[bid];
+	gru->gs_blade_id = bid;
+	gru->gs_cbr_map = (GRU_CBR_AU == 64) ? ~0 : (1UL << GRU_CBR_AU) - 1;
+	gru->gs_dsr_map = (1UL << GRU_DSR_AU) - 1;
+	gru_tgh_flush_init(gru);
+	gru_dbg(grudev, "bid %d, nid %d, gru %x, vaddr %p (0x%lx)\n",
+		bid, nid, gru->gs_gid, gru->gs_gru_base_vaddr,
+		gru->gs_gru_base_paddr);
+	gru_kservices_init(gru);
+}
+
+static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr)
+{
+	int pnode, nid, bid, chip;
+	int cbrs, dsrbytes, n;
+	int order = get_order(sizeof(struct gru_blade_state));
+	struct page *page;
+	struct gru_state *gru;
+	unsigned long paddr;
+	void *vaddr;
+
+	max_user_cbrs = GRU_NUM_CB;
+	max_user_dsr_bytes = GRU_NUM_DSR_BYTES;
+	for_each_online_node(nid) {
+		bid = uv_node_to_blade_id(nid);
+		pnode = uv_node_to_pnode(nid);
+		if (gru_base[bid])
+			continue;
+		page = alloc_pages_node(nid, GFP_KERNEL, order);
+		if (!page)
+			goto fail;
+		gru_base[bid] = page_address(page);
+		memset(gru_base[bid], 0, sizeof(struct gru_blade_state));
+		gru_base[bid]->bs_lru_gru = &gru_base[bid]->bs_grus[0];
+		spin_lock_init(&gru_base[bid]->bs_lock);
+
+		dsrbytes = 0;
+		cbrs = 0;
+		for (gru = gru_base[bid]->bs_grus, chip = 0;
+		     		chip < GRU_CHIPLETS_PER_BLADE;
+				chip++, gru++) {
+			paddr = gru_chiplet_paddr(gru_base_paddr, pnode, chip);
+			vaddr = gru_chiplet_vaddr(gru_base_vaddr, pnode, chip);
+			gru_init_chiplet(gru, paddr, vaddr, bid, nid, chip);
+			n = hweight64(gru->gs_cbr_map) * GRU_CBR_AU_SIZE;
+			cbrs = max(cbrs, n);
+			n = hweight64(gru->gs_dsr_map) * GRU_DSR_AU_BYTES;
+			dsrbytes = max(dsrbytes, n);
+		}
+		max_user_cbrs = min(max_user_cbrs, cbrs);
+		max_user_dsr_bytes = min(max_user_dsr_bytes, dsrbytes);
+	}
+
+	return 0;
+
+fail:
+	for (nid--; nid >= 0; nid--)
+		free_pages((unsigned long)gru_base[nid], order);
+	return -ENOMEM;
+}
+
+#ifdef CONFIG_IA64
+
+static int get_base_irq(void)
+{
+	return IRQ_GRU;
+}
+
+#elif defined CONFIG_X86_64
+
+static void noop(unsigned int irq)
+{
+}
+
+static struct irq_chip gru_chip = {
+	.name		= "gru",
+	.mask		= noop,
+	.unmask		= noop,
+	.ack		= noop,
+};
+
+static int get_base_irq(void)
+{
+	set_irq_chip(IRQ_GRU, &gru_chip);
+	set_irq_chip(IRQ_GRU + 1, &gru_chip);
+	return IRQ_GRU;
+}
+#endif
+
+/*
+ * gru_init
+ *
+ * Called at boot or module load time to initialize the GRUs.
+ */
+static int __init gru_init(void)
+{
+	int ret, irq, chip;
+	char id[10];
+	void *gru_start_vaddr;
+
+	if (!IS_UV())
+		return 0;
+
+#if defined CONFIG_IA64
+	gru_start_paddr = 0xd000000000UL; /* ZZZZZZZZZZZZZZZZZZZ fixme */
+#else
+	gru_start_paddr = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR) &
+				0x7fffffffffffUL;
+
+#endif
+	gru_start_vaddr = __va(gru_start_paddr);
+	gru_end_paddr = gru_start_paddr + MAX_NUMNODES * GRU_SIZE;
+	printk(KERN_INFO "GRU space: 0x%lx - 0x%lx\n",
+	       gru_start_paddr, gru_end_paddr);
+	irq = get_base_irq();
+	for (chip = 0; chip < GRU_CHIPLETS_PER_BLADE; chip++) {
+		ret = request_irq(irq + chip, gru_intr, 0, id, NULL);
+		if (ret) {
+			printk(KERN_ERR "%s: request_irq failed\n",
+			       GRU_DRIVER_ID_STR);
+			goto exit1;
+		}
+	}
+
+	ret = misc_register(&gru_miscdev);
+	if (ret) {
+		printk(KERN_ERR "%s: misc_register failed\n",
+		       GRU_DRIVER_ID_STR);
+		goto exit1;
+	}
+
+	ret = gru_proc_init();
+	if (ret) {
+		printk(KERN_ERR "%s: proc init failed\n", GRU_DRIVER_ID_STR);
+		goto exit2;
+	}
+
+	ret = gru_init_tables(gru_start_paddr, gru_start_vaddr);
+	if (ret) {
+		printk(KERN_ERR "%s: init tables failed\n", GRU_DRIVER_ID_STR);
+		goto exit3;
+	}
+
+	printk(KERN_INFO "%s: v%s\n", GRU_DRIVER_ID_STR,
+	       GRU_DRIVER_VERSION_STR);
+	return 0;
+
+exit3:
+	gru_proc_exit();
+exit2:
+	misc_deregister(&gru_miscdev);
+exit1:
+	for (--chip; chip >= 0; chip--)
+		free_irq(irq + chip, NULL);
+	return ret;
+
+}
+
+static void __exit gru_exit(void)
+{
+	int i, bid;
+	int order = get_order(sizeof(struct gru_state) *
+			      GRU_CHIPLETS_PER_BLADE);
+
+	for (i = 0; i < GRU_CHIPLETS_PER_BLADE; i++)
+		free_irq(IRQ_GRU + i, NULL);
+
+	for (bid = 0; bid < GRU_MAX_BLADES; bid++)
+		free_pages((unsigned long)gru_base[bid], order);
+
+	misc_deregister(&gru_miscdev);
+	gru_proc_exit();
+}
+
+static struct file_operations gru_fops = {
+	.owner		= THIS_MODULE,
+	.unlocked_ioctl	= gru_file_unlocked_ioctl,
+	.mmap		= gru_file_mmap,
+};
+
+static struct miscdevice gru_miscdev = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "gru",
+	.fops		= &gru_fops,
+};
+
+struct vm_operations_struct gru_vm_ops = {
+	.close		= gru_vma_close,
+	.fault		= gru_fault,
+};
+
+module_init(gru_init);
+module_exit(gru_exit);
+
+module_param(gru_options, ulong, 0644);
+MODULE_PARM_DESC(gru_options, "Various debug options");
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(GRU_DRIVER_ID_STR GRU_DRIVER_VERSION_STR);
+MODULE_VERSION(GRU_DRIVER_VERSION_STR);
+
diff --git a/drivers/misc/sgi-gru/gruhandles.h b/drivers/misc/sgi-gru/gruhandles.h
new file mode 100644
index 00000000000..d16031d6267
--- /dev/null
+++ b/drivers/misc/sgi-gru/gruhandles.h
@@ -0,0 +1,663 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *              GRU HANDLE DEFINITION
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef __GRUHANDLES_H__
+#define __GRUHANDLES_H__
+#include "gru_instructions.h"
+
+/*
+ * Manifest constants for GRU Memory Map
+ */
+#define GRU_GSEG0_BASE		0
+#define GRU_MCS_BASE		(64 * 1024 * 1024)
+#define GRU_SIZE		(128UL * 1024 * 1024)
+
+/* Handle & resource counts */
+#define GRU_NUM_CB		128
+#define GRU_NUM_DSR_BYTES	(32 * 1024)
+#define GRU_NUM_TFM		16
+#define GRU_NUM_TGH		24
+#define GRU_NUM_CBE		128
+#define GRU_NUM_TFH		128
+#define GRU_NUM_CCH		16
+#define GRU_NUM_GSH		1
+
+/* Maximum resource counts that can be reserved by user programs */
+#define GRU_NUM_USER_CBR	GRU_NUM_CBE
+#define GRU_NUM_USER_DSR_BYTES	GRU_NUM_DSR_BYTES
+
+/* Bytes per handle & handle stride. Code assumes all cb, tfh, cbe handles
+ * are the same */
+#define GRU_HANDLE_BYTES	64
+#define GRU_HANDLE_STRIDE	256
+
+/* Base addresses of handles */
+#define GRU_TFM_BASE		(GRU_MCS_BASE + 0x00000)
+#define GRU_TGH_BASE		(GRU_MCS_BASE + 0x08000)
+#define GRU_CBE_BASE		(GRU_MCS_BASE + 0x10000)
+#define GRU_TFH_BASE		(GRU_MCS_BASE + 0x18000)
+#define GRU_CCH_BASE		(GRU_MCS_BASE + 0x20000)
+#define GRU_GSH_BASE		(GRU_MCS_BASE + 0x30000)
+
+/* User gseg constants */
+#define GRU_GSEG_STRIDE		(4 * 1024 * 1024)
+#define GSEG_BASE(a)		((a) & ~(GRU_GSEG_PAGESIZE - 1))
+
+/* Data segment constants */
+#define GRU_DSR_AU_BYTES	1024
+#define GRU_DSR_CL		(GRU_NUM_DSR_BYTES / GRU_CACHE_LINE_BYTES)
+#define GRU_DSR_AU_CL		(GRU_DSR_AU_BYTES / GRU_CACHE_LINE_BYTES)
+#define GRU_DSR_AU		(GRU_NUM_DSR_BYTES / GRU_DSR_AU_BYTES)
+
+/* Control block constants */
+#define GRU_CBR_AU_SIZE		2
+#define GRU_CBR_AU		(GRU_NUM_CBE / GRU_CBR_AU_SIZE)
+
+/* Convert resource counts to the number of AU */
+#define GRU_DS_BYTES_TO_AU(n)	DIV_ROUND_UP(n, GRU_DSR_AU_BYTES)
+#define GRU_CB_COUNT_TO_AU(n)	DIV_ROUND_UP(n, GRU_CBR_AU_SIZE)
+
+/* UV limits */
+#define GRU_CHIPLETS_PER_HUB	2
+#define GRU_HUBS_PER_BLADE	1
+#define GRU_CHIPLETS_PER_BLADE	(GRU_HUBS_PER_BLADE * GRU_CHIPLETS_PER_HUB)
+
+/* User GRU Gseg offsets */
+#define GRU_CB_BASE		0
+#define GRU_CB_LIMIT		(GRU_CB_BASE + GRU_HANDLE_STRIDE * GRU_NUM_CBE)
+#define GRU_DS_BASE		0x20000
+#define GRU_DS_LIMIT		(GRU_DS_BASE + GRU_NUM_DSR_BYTES)
+
+/* Convert a GRU physical address to the chiplet offset */
+#define GSEGPOFF(h) 		((h) & (GRU_SIZE - 1))
+
+/* Convert an arbitrary handle address to the beginning of the GRU segment */
+#ifndef __PLUGIN__
+#define GRUBASE(h)		((void *)((unsigned long)(h) & ~(GRU_SIZE - 1)))
+#else
+extern void *gmu_grubase(void *h);
+#define GRUBASE(h)		gmu_grubase(h)
+#endif
+
+/* General addressing macros. */
+static inline void *get_gseg_base_address(void *base, int ctxnum)
+{
+	return (void *)(base + GRU_GSEG0_BASE + GRU_GSEG_STRIDE * ctxnum);
+}
+
+static inline void *get_gseg_base_address_cb(void *base, int ctxnum, int line)
+{
+	return (void *)(get_gseg_base_address(base, ctxnum) +
+			GRU_CB_BASE + GRU_HANDLE_STRIDE * line);
+}
+
+static inline void *get_gseg_base_address_ds(void *base, int ctxnum, int line)
+{
+	return (void *)(get_gseg_base_address(base, ctxnum) + GRU_DS_BASE +
+			GRU_CACHE_LINE_BYTES * line);
+}
+
+static inline struct gru_tlb_fault_map *get_tfm(void *base, int ctxnum)
+{
+	return (struct gru_tlb_fault_map *)(base + GRU_TFM_BASE +
+					ctxnum * GRU_HANDLE_STRIDE);
+}
+
+static inline struct gru_tlb_global_handle *get_tgh(void *base, int ctxnum)
+{
+	return (struct gru_tlb_global_handle *)(base + GRU_TGH_BASE +
+					ctxnum * GRU_HANDLE_STRIDE);
+}
+
+static inline struct gru_control_block_extended *get_cbe(void *base, int ctxnum)
+{
+	return (struct gru_control_block_extended *)(base + GRU_CBE_BASE +
+					ctxnum * GRU_HANDLE_STRIDE);
+}
+
+static inline struct gru_tlb_fault_handle *get_tfh(void *base, int ctxnum)
+{
+	return (struct gru_tlb_fault_handle *)(base + GRU_TFH_BASE +
+					ctxnum * GRU_HANDLE_STRIDE);
+}
+
+static inline struct gru_context_configuration_handle *get_cch(void *base,
+					int ctxnum)
+{
+	return (struct gru_context_configuration_handle *)(base +
+				GRU_CCH_BASE + ctxnum * GRU_HANDLE_STRIDE);
+}
+
+static inline unsigned long get_cb_number(void *cb)
+{
+	return (((unsigned long)cb - GRU_CB_BASE) % GRU_GSEG_PAGESIZE) /
+					GRU_HANDLE_STRIDE;
+}
+
+/* byte offset to a specific GRU chiplet. (p=pnode, c=chiplet (0 or 1)*/
+static inline unsigned long gru_chiplet_paddr(unsigned long paddr, int pnode,
+							int chiplet)
+{
+	return paddr + GRU_SIZE * (2 * pnode  + chiplet);
+}
+
+static inline void *gru_chiplet_vaddr(void *vaddr, int pnode, int chiplet)
+{
+	return vaddr + GRU_SIZE * (2 * pnode  + chiplet);
+}
+
+
+
+/*
+ * Global TLB Fault Map
+ * 	Bitmap of outstanding TLB misses needing interrupt/polling service.
+ *
+ */
+struct gru_tlb_fault_map {
+	unsigned long fault_bits[BITS_TO_LONGS(GRU_NUM_CBE)];
+	unsigned long fill0[2];
+	unsigned long done_bits[BITS_TO_LONGS(GRU_NUM_CBE)];
+	unsigned long fill1[2];
+};
+
+/*
+ * TGH - TLB Global Handle
+ * 	Used for TLB flushing.
+ *
+ */
+struct gru_tlb_global_handle {
+	unsigned int cmd:1;		/* DW 0 */
+	unsigned int delresp:1;
+	unsigned int opc:1;
+	unsigned int fill1:5;
+
+	unsigned int fill2:8;
+
+	unsigned int status:2;
+	unsigned long fill3:2;
+	unsigned int state:3;
+	unsigned long fill4:1;
+
+	unsigned int cause:3;
+	unsigned long fill5:37;
+
+	unsigned long vaddr:64;		/* DW 1 */
+
+	unsigned int asid:24;		/* DW 2 */
+	unsigned int fill6:8;
+
+	unsigned int pagesize:5;
+	unsigned int fill7:11;
+
+	unsigned int global:1;
+	unsigned int fill8:15;
+
+	unsigned long vaddrmask:39;	/* DW 3 */
+	unsigned int fill9:9;
+	unsigned int n:10;
+	unsigned int fill10:6;
+
+	unsigned int ctxbitmap:16;	/* DW4 */
+	unsigned long fill11[3];
+};
+
+enum gru_tgh_cmd {
+	TGHCMD_START
+};
+
+enum gru_tgh_opc {
+	TGHOP_TLBNOP,
+	TGHOP_TLBINV
+};
+
+enum gru_tgh_status {
+	TGHSTATUS_IDLE,
+	TGHSTATUS_EXCEPTION,
+	TGHSTATUS_ACTIVE
+};
+
+enum gru_tgh_state {
+	TGHSTATE_IDLE,
+	TGHSTATE_PE_INVAL,
+	TGHSTATE_INTERRUPT_INVAL,
+	TGHSTATE_WAITDONE,
+	TGHSTATE_RESTART_CTX,
+};
+
+/*
+ * TFH - TLB Global Handle
+ * 	Used for TLB dropins into the GRU TLB.
+ *
+ */
+struct gru_tlb_fault_handle {
+	unsigned int cmd:1;		/* DW 0 - low 32*/
+	unsigned int delresp:1;
+	unsigned int fill0:2;
+	unsigned int opc:3;
+	unsigned int fill1:9;
+
+	unsigned int status:2;
+	unsigned int fill2:1;
+	unsigned int color:1;
+	unsigned int state:3;
+	unsigned int fill3:1;
+
+	unsigned int cause:7;		/* DW 0 - high 32 */
+	unsigned int fill4:1;
+
+	unsigned int indexway:12;
+	unsigned int fill5:4;
+
+	unsigned int ctxnum:4;
+	unsigned int fill6:12;
+
+	unsigned long missvaddr:64;	/* DW 1 */
+
+	unsigned int missasid:24;	/* DW 2 */
+	unsigned int fill7:8;
+	unsigned int fillasid:24;
+	unsigned int dirty:1;
+	unsigned int gaa:2;
+	unsigned long fill8:5;
+
+	unsigned long pfn:41;		/* DW 3 */
+	unsigned int fill9:7;
+	unsigned int pagesize:5;
+	unsigned int fill10:11;
+
+	unsigned long fillvaddr:64;	/* DW 4 */
+
+	unsigned long fill11[3];
+};
+
+enum gru_tfh_opc {
+	TFHOP_NOOP,
+	TFHOP_RESTART,
+	TFHOP_WRITE_ONLY,
+	TFHOP_WRITE_RESTART,
+	TFHOP_EXCEPTION,
+	TFHOP_USER_POLLING_MODE = 7,
+};
+
+enum tfh_status {
+	TFHSTATUS_IDLE,
+	TFHSTATUS_EXCEPTION,
+	TFHSTATUS_ACTIVE,
+};
+
+enum tfh_state {
+	TFHSTATE_INACTIVE,
+	TFHSTATE_IDLE,
+	TFHSTATE_MISS_UPM,
+	TFHSTATE_MISS_FMM,
+	TFHSTATE_HW_ERR,
+	TFHSTATE_WRITE_TLB,
+	TFHSTATE_RESTART_CBR,
+};
+
+/* TFH cause bits */
+enum tfh_cause {
+	TFHCAUSE_NONE,
+	TFHCAUSE_TLB_MISS,
+	TFHCAUSE_TLB_MOD,
+	TFHCAUSE_HW_ERROR_RR,
+	TFHCAUSE_HW_ERROR_MAIN_ARRAY,
+	TFHCAUSE_HW_ERROR_VALID,
+	TFHCAUSE_HW_ERROR_PAGESIZE,
+	TFHCAUSE_INSTRUCTION_EXCEPTION,
+	TFHCAUSE_UNCORRECTIBLE_ERROR,
+};
+
+/* GAA values */
+#define GAA_RAM				0x0
+#define GAA_NCRAM			0x2
+#define GAA_MMIO			0x1
+#define GAA_REGISTER			0x3
+
+/* GRU paddr shift for pfn. (NOTE: shift is NOT by actual pagesize) */
+#define GRU_PADDR_SHIFT			12
+
+/*
+ * Context Configuration handle
+ * 	Used to allocate resources to a GSEG context.
+ *
+ */
+struct gru_context_configuration_handle {
+	unsigned int cmd:1;			/* DW0 */
+	unsigned int delresp:1;
+	unsigned int opc:3;
+	unsigned int unmap_enable:1;
+	unsigned int req_slice_set_enable:1;
+	unsigned int req_slice:2;
+	unsigned int cb_int_enable:1;
+	unsigned int tlb_int_enable:1;
+	unsigned int tfm_fault_bit_enable:1;
+	unsigned int tlb_int_select:4;
+
+	unsigned int status:2;
+	unsigned int state:2;
+	unsigned int reserved2:4;
+
+	unsigned int cause:4;
+	unsigned int tfm_done_bit_enable:1;
+	unsigned int unused:3;
+
+	unsigned int dsr_allocation_map;
+
+	unsigned long cbr_allocation_map;	/* DW1 */
+
+	unsigned int asid[8];			/* DW 2 - 5 */
+	unsigned short sizeavail[8];		/* DW 6 - 7 */
+} __attribute__ ((packed));
+
+enum gru_cch_opc {
+	CCHOP_START = 1,
+	CCHOP_ALLOCATE,
+	CCHOP_INTERRUPT,
+	CCHOP_DEALLOCATE,
+	CCHOP_INTERRUPT_SYNC,
+};
+
+enum gru_cch_status {
+	CCHSTATUS_IDLE,
+	CCHSTATUS_EXCEPTION,
+	CCHSTATUS_ACTIVE,
+};
+
+enum gru_cch_state {
+	CCHSTATE_INACTIVE,
+	CCHSTATE_MAPPED,
+	CCHSTATE_ACTIVE,
+	CCHSTATE_INTERRUPTED,
+};
+
+/* CCH Exception cause */
+enum gru_cch_cause {
+	CCHCAUSE_REGION_REGISTER_WRITE_ERROR = 1,
+	CCHCAUSE_ILLEGAL_OPCODE = 2,
+	CCHCAUSE_INVALID_START_REQUEST = 3,
+	CCHCAUSE_INVALID_ALLOCATION_REQUEST = 4,
+	CCHCAUSE_INVALID_DEALLOCATION_REQUEST = 5,
+	CCHCAUSE_INVALID_INTERRUPT_REQUEST = 6,
+	CCHCAUSE_CCH_BUSY = 7,
+	CCHCAUSE_NO_CBRS_TO_ALLOCATE = 8,
+	CCHCAUSE_BAD_TFM_CONFIG = 9,
+	CCHCAUSE_CBR_RESOURCES_OVERSUBSCRIPED = 10,
+	CCHCAUSE_DSR_RESOURCES_OVERSUBSCRIPED = 11,
+	CCHCAUSE_CBR_DEALLOCATION_ERROR = 12,
+};
+/*
+ * CBE - Control Block Extended
+ * 	Maintains internal GRU state for active CBs.
+ *
+ */
+struct gru_control_block_extended {
+	unsigned int reserved0:1;	/* DW 0  - low */
+	unsigned int imacpy:3;
+	unsigned int reserved1:4;
+	unsigned int xtypecpy:3;
+	unsigned int iaa0cpy:2;
+	unsigned int iaa1cpy:2;
+	unsigned int reserved2:1;
+	unsigned int opccpy:8;
+	unsigned int exopccpy:8;
+
+	unsigned int idef2cpy:22;	/* DW 0  - high */
+	unsigned int reserved3:10;
+
+	unsigned int idef4cpy:22;	/* DW 1 */
+	unsigned int reserved4:10;
+	unsigned int idef4upd:22;
+	unsigned int reserved5:10;
+
+	unsigned long idef1upd:64;	/* DW 2 */
+
+	unsigned long idef5cpy:64;	/* DW 3 */
+
+	unsigned long idef6cpy:64;	/* DW 4 */
+
+	unsigned long idef3upd:64;	/* DW 5 */
+
+	unsigned long idef5upd:64;	/* DW 6 */
+
+	unsigned int idef2upd:22;	/* DW 7 */
+	unsigned int reserved6:10;
+
+	unsigned int ecause:20;
+	unsigned int cbrstate:4;
+	unsigned int cbrexecstatus:8;
+};
+
+enum gru_cbr_state {
+	CBRSTATE_INACTIVE,
+	CBRSTATE_IDLE,
+	CBRSTATE_PE_CHECK,
+	CBRSTATE_QUEUED,
+	CBRSTATE_WAIT_RESPONSE,
+	CBRSTATE_INTERRUPTED,
+	CBRSTATE_INTERRUPTED_MISS_FMM,
+	CBRSTATE_BUSY_INTERRUPT_MISS_FMM,
+	CBRSTATE_INTERRUPTED_MISS_UPM,
+	CBRSTATE_BUSY_INTERRUPTED_MISS_UPM,
+	CBRSTATE_REQUEST_ISSUE,
+	CBRSTATE_BUSY_INTERRUPT,
+};
+
+/* CBE cbrexecstatus bits */
+#define CBR_EXS_ABORT_OCC_BIT			0
+#define CBR_EXS_INT_OCC_BIT			1
+#define CBR_EXS_PENDING_BIT			2
+#define CBR_EXS_QUEUED_BIT			3
+#define CBR_EXS_TLBHW_BIT			4
+#define CBR_EXS_EXCEPTION_BIT			5
+
+#define CBR_EXS_ABORT_OCC			(1 << CBR_EXS_ABORT_OCC_BIT)
+#define CBR_EXS_INT_OCC				(1 << CBR_EXS_INT_OCC_BIT)
+#define CBR_EXS_PENDING				(1 << CBR_EXS_PENDING_BIT)
+#define CBR_EXS_QUEUED				(1 << CBR_EXS_QUEUED_BIT)
+#define CBR_EXS_TLBHW				(1 << CBR_EXS_TLBHW_BIT)
+#define CBR_EXS_EXCEPTION			(1 << CBR_EXS_EXCEPTION_BIT)
+
+/* CBE ecause bits  - defined in gru_instructions.h */
+
+/*
+ * Convert a processor pagesize into the strange encoded pagesize used by the
+ * GRU. Processor pagesize is encoded as log of bytes per page. (or PAGE_SHIFT)
+ * 	pagesize	log pagesize	grupagesize
+ * 	  4k			12	0
+ * 	 16k 			14	1
+ * 	 64k			16	2
+ * 	256k			18	3
+ * 	  1m			20	4
+ * 	  2m			21	5
+ * 	  4m			22	6
+ * 	 16m			24	7
+ * 	 64m			26	8
+ * 	...
+ */
+#define GRU_PAGESIZE(sh)	((((sh) > 20 ? (sh) + 2: (sh)) >> 1) - 6)
+#define GRU_SIZEAVAIL(sh)	(1UL << GRU_PAGESIZE(sh))
+
+/* minimum TLB purge count to ensure a full purge */
+#define GRUMAXINVAL		1024UL
+
+
+/* Extract the status field from a kernel handle */
+#define GET_MSEG_HANDLE_STATUS(h)	(((*(unsigned long *)(h)) >> 16) & 3)
+
+static inline void start_instruction(void *h)
+{
+	unsigned long *w0 = h;
+
+	wmb();		/* setting CMD bit must be last */
+	*w0 = *w0 | 1;
+	gru_flush_cache(h);
+}
+
+static inline int wait_instruction_complete(void *h)
+{
+	int status;
+
+	do {
+		cpu_relax();
+		barrier();
+		status = GET_MSEG_HANDLE_STATUS(h);
+	} while (status == CCHSTATUS_ACTIVE);
+	return status;
+}
+
+#if defined CONFIG_IA64
+static inline void cch_allocate_set_asids(
+		  struct gru_context_configuration_handle *cch, int asidval)
+{
+	int i;
+
+	for (i = 0; i <= RGN_HPAGE; i++) {  /*  assume HPAGE is last region */
+		cch->asid[i] = (asidval++);
+#if 0
+		/* ZZZ hugepages not supported yet */
+		if (i == RGN_HPAGE)
+			cch->sizeavail[i] = GRU_SIZEAVAIL(hpage_shift);
+		else
+#endif
+			cch->sizeavail[i] = GRU_SIZEAVAIL(PAGE_SHIFT);
+	}
+}
+#elif defined CONFIG_X86_64
+static inline void cch_allocate_set_asids(
+		  struct gru_context_configuration_handle *cch, int asidval)
+{
+	int i;
+
+	for (i = 0; i < 8; i++) {
+		cch->asid[i] = asidval++;
+		cch->sizeavail[i] = GRU_SIZEAVAIL(PAGE_SHIFT) |
+			GRU_SIZEAVAIL(21);
+	}
+}
+#endif
+
+static inline int cch_allocate(struct gru_context_configuration_handle *cch,
+			       int asidval, unsigned long cbrmap,
+			       unsigned long dsrmap)
+{
+	cch_allocate_set_asids(cch, asidval);
+	cch->dsr_allocation_map = dsrmap;
+	cch->cbr_allocation_map = cbrmap;
+	cch->opc = CCHOP_ALLOCATE;
+	start_instruction(cch);
+	return wait_instruction_complete(cch);
+}
+
+static inline int cch_start(struct gru_context_configuration_handle *cch)
+{
+	cch->opc = CCHOP_START;
+	start_instruction(cch);
+	return wait_instruction_complete(cch);
+}
+
+static inline int cch_interrupt(struct gru_context_configuration_handle *cch)
+{
+	cch->opc = CCHOP_INTERRUPT;
+	start_instruction(cch);
+	return wait_instruction_complete(cch);
+}
+
+static inline int cch_deallocate(struct gru_context_configuration_handle *cch)
+{
+	cch->opc = CCHOP_DEALLOCATE;
+	start_instruction(cch);
+	return wait_instruction_complete(cch);
+}
+
+static inline int cch_interrupt_sync(struct gru_context_configuration_handle
+				     *cch)
+{
+	cch->opc = CCHOP_INTERRUPT_SYNC;
+	start_instruction(cch);
+	return wait_instruction_complete(cch);
+}
+
+static inline int tgh_invalidate(struct gru_tlb_global_handle *tgh,
+				 unsigned long vaddr, unsigned long vaddrmask,
+				 int asid, int pagesize, int global, int n,
+				 unsigned short ctxbitmap)
+{
+	tgh->vaddr = vaddr;
+	tgh->asid = asid;
+	tgh->pagesize = pagesize;
+	tgh->n = n;
+	tgh->global = global;
+	tgh->vaddrmask = vaddrmask;
+	tgh->ctxbitmap = ctxbitmap;
+	tgh->opc = TGHOP_TLBINV;
+	start_instruction(tgh);
+	return wait_instruction_complete(tgh);
+}
+
+static inline void tfh_write_only(struct gru_tlb_fault_handle *tfh,
+				  unsigned long pfn, unsigned long vaddr,
+				  int asid, int dirty, int pagesize)
+{
+	tfh->fillasid = asid;
+	tfh->fillvaddr = vaddr;
+	tfh->pfn = pfn;
+	tfh->dirty = dirty;
+	tfh->pagesize = pagesize;
+	tfh->opc = TFHOP_WRITE_ONLY;
+	start_instruction(tfh);
+}
+
+static inline void tfh_write_restart(struct gru_tlb_fault_handle *tfh,
+				     unsigned long paddr, int gaa,
+				     unsigned long vaddr, int asid, int dirty,
+				     int pagesize)
+{
+	tfh->fillasid = asid;
+	tfh->fillvaddr = vaddr;
+	tfh->pfn = paddr >> GRU_PADDR_SHIFT;
+	tfh->gaa = gaa;
+	tfh->dirty = dirty;
+	tfh->pagesize = pagesize;
+	tfh->opc = TFHOP_WRITE_RESTART;
+	start_instruction(tfh);
+}
+
+static inline void tfh_restart(struct gru_tlb_fault_handle *tfh)
+{
+	tfh->opc = TFHOP_RESTART;
+	start_instruction(tfh);
+}
+
+static inline void tfh_user_polling_mode(struct gru_tlb_fault_handle *tfh)
+{
+	tfh->opc = TFHOP_USER_POLLING_MODE;
+	start_instruction(tfh);
+}
+
+static inline void tfh_exception(struct gru_tlb_fault_handle *tfh)
+{
+	tfh->opc = TFHOP_EXCEPTION;
+	start_instruction(tfh);
+}
+
+#endif /* __GRUHANDLES_H__ */
diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c
new file mode 100644
index 00000000000..dfd49af0fe1
--- /dev/null
+++ b/drivers/misc/sgi-gru/grukservices.c
@@ -0,0 +1,679 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *              KERNEL SERVICES THAT USE THE GRU
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/interrupt.h>
+#include <linux/uaccess.h>
+#include "gru.h"
+#include "grulib.h"
+#include "grutables.h"
+#include "grukservices.h"
+#include "gru_instructions.h"
+#include <asm/uv/uv_hub.h>
+
+/*
+ * Kernel GRU Usage
+ *
+ * The following is an interim algorithm for management of kernel GRU
+ * resources. This will likely be replaced when we better understand the
+ * kernel/user requirements.
+ *
+ * At boot time, the kernel permanently reserves a fixed number of
+ * CBRs/DSRs for each cpu to use. The resources are all taken from
+ * the GRU chiplet 1 on the blade. This leaves the full set of resources
+ * of chiplet 0 available to be allocated to a single user.
+ */
+
+/* Blade percpu resources PERMANENTLY reserved for kernel use */
+#define GRU_NUM_KERNEL_CBR      1
+#define GRU_NUM_KERNEL_DSR_BYTES 256
+#define KERNEL_CTXNUM           15
+
+/* GRU instruction attributes for all instructions */
+#define IMA			IMA_CB_DELAY
+
+/* GRU cacheline size is always 64 bytes - even on arches with 128 byte lines */
+#define __gru_cacheline_aligned__                               \
+	__attribute__((__aligned__(GRU_CACHE_LINE_BYTES)))
+
+#define MAGIC	0x1234567887654321UL
+
+/* Default retry count for GRU errors on kernel instructions */
+#define EXCEPTION_RETRY_LIMIT	3
+
+/* Status of message queue sections */
+#define MQS_EMPTY		0
+#define MQS_FULL		1
+#define MQS_NOOP		2
+
+/*----------------- RESOURCE MANAGEMENT -------------------------------------*/
+/* optimized for x86_64 */
+struct message_queue {
+	union gru_mesqhead	head __gru_cacheline_aligned__;	/* CL 0 */
+	int			qlines;				/* DW 1 */
+	long 			hstatus[2];
+	void 			*next __gru_cacheline_aligned__;/* CL 1 */
+	void 			*limit;
+	void 			*start;
+	void 			*start2;
+	char			data ____cacheline_aligned;	/* CL 2 */
+};
+
+/* First word in every message - used by mesq interface */
+struct message_header {
+	char	present;
+	char	present2;
+	char 	lines;
+	char	fill;
+};
+
+#define QLINES(mq)	((mq) + offsetof(struct message_queue, qlines))
+#define HSTATUS(mq, h)	((mq) + offsetof(struct message_queue, hstatus[h]))
+
+static int gru_get_cpu_resources(int dsr_bytes, void **cb, void **dsr)
+{
+	struct gru_blade_state *bs;
+	int lcpu;
+
+	BUG_ON(dsr_bytes > GRU_NUM_KERNEL_DSR_BYTES);
+	preempt_disable();
+	bs = gru_base[uv_numa_blade_id()];
+	lcpu = uv_blade_processor_id();
+	*cb = bs->kernel_cb + lcpu * GRU_HANDLE_STRIDE;
+	*dsr = bs->kernel_dsr + lcpu * GRU_NUM_KERNEL_DSR_BYTES;
+	return 0;
+}
+
+static void gru_free_cpu_resources(void *cb, void *dsr)
+{
+	preempt_enable();
+}
+
+int gru_get_cb_exception_detail(void *cb,
+		struct control_block_extended_exc_detail *excdet)
+{
+	struct gru_control_block_extended *cbe;
+
+	cbe = get_cbe(GRUBASE(cb), get_cb_number(cb));
+	excdet->opc = cbe->opccpy;
+	excdet->exopc = cbe->exopccpy;
+	excdet->ecause = cbe->ecause;
+	excdet->exceptdet0 = cbe->idef1upd;
+	excdet->exceptdet1 = cbe->idef3upd;
+	return 0;
+}
+
+char *gru_get_cb_exception_detail_str(int ret, void *cb,
+				      char *buf, int size)
+{
+	struct gru_control_block_status *gen = (void *)cb;
+	struct control_block_extended_exc_detail excdet;
+
+	if (ret > 0 && gen->istatus == CBS_EXCEPTION) {
+		gru_get_cb_exception_detail(cb, &excdet);
+		snprintf(buf, size,
+			"GRU exception: cb %p, opc %d, exopc %d, ecause 0x%x,"
+			"excdet0 0x%lx, excdet1 0x%x",
+			gen, excdet.opc, excdet.exopc, excdet.ecause,
+			excdet.exceptdet0, excdet.exceptdet1);
+	} else {
+		snprintf(buf, size, "No exception");
+	}
+	return buf;
+}
+
+static int gru_wait_idle_or_exception(struct gru_control_block_status *gen)
+{
+	while (gen->istatus >= CBS_ACTIVE) {
+		cpu_relax();
+		barrier();
+	}
+	return gen->istatus;
+}
+
+static int gru_retry_exception(void *cb)
+{
+	struct gru_control_block_status *gen = (void *)cb;
+	struct control_block_extended_exc_detail excdet;
+	int retry = EXCEPTION_RETRY_LIMIT;
+
+	while (1)  {
+		if (gru_get_cb_message_queue_substatus(cb))
+			break;
+		if (gru_wait_idle_or_exception(gen) == CBS_IDLE)
+			return CBS_IDLE;
+
+		gru_get_cb_exception_detail(cb, &excdet);
+		if (excdet.ecause & ~EXCEPTION_RETRY_BITS)
+			break;
+		if (retry-- == 0)
+			break;
+		gen->icmd = 1;
+		gru_flush_cache(gen);
+	}
+	return CBS_EXCEPTION;
+}
+
+int gru_check_status_proc(void *cb)
+{
+	struct gru_control_block_status *gen = (void *)cb;
+	int ret;
+
+	ret = gen->istatus;
+	if (ret != CBS_EXCEPTION)
+		return ret;
+	return gru_retry_exception(cb);
+
+}
+
+int gru_wait_proc(void *cb)
+{
+	struct gru_control_block_status *gen = (void *)cb;
+	int ret;
+
+	ret = gru_wait_idle_or_exception(gen);
+	if (ret == CBS_EXCEPTION)
+		ret = gru_retry_exception(cb);
+
+	return ret;
+}
+
+void gru_abort(int ret, void *cb, char *str)
+{
+	char buf[GRU_EXC_STR_SIZE];
+
+	panic("GRU FATAL ERROR: %s - %s\n", str,
+	      gru_get_cb_exception_detail_str(ret, cb, buf, sizeof(buf)));
+}
+
+void gru_wait_abort_proc(void *cb)
+{
+	int ret;
+
+	ret = gru_wait_proc(cb);
+	if (ret)
+		gru_abort(ret, cb, "gru_wait_abort");
+}
+
+
+/*------------------------------ MESSAGE QUEUES -----------------------------*/
+
+/* Internal status . These are NOT returned to the user. */
+#define MQIE_AGAIN		-1	/* try again */
+
+
+/*
+ * Save/restore the "present" flag that is in the second line of 2-line
+ * messages
+ */
+static inline int get_present2(void *p)
+{
+	struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
+	return mhdr->present;
+}
+
+static inline void restore_present2(void *p, int val)
+{
+	struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES;
+	mhdr->present = val;
+}
+
+/*
+ * Create a message queue.
+ * 	qlines - message queue size in cache lines. Includes 2-line header.
+ */
+int gru_create_message_queue(void *p, unsigned int bytes)
+{
+	struct message_queue *mq = p;
+	unsigned int qlines;
+
+	qlines = bytes / GRU_CACHE_LINE_BYTES - 2;
+	memset(mq, 0, bytes);
+	mq->start = &mq->data;
+	mq->start2 = &mq->data + (qlines / 2 - 1) * GRU_CACHE_LINE_BYTES;
+	mq->next = &mq->data;
+	mq->limit = &mq->data + (qlines - 2) * GRU_CACHE_LINE_BYTES;
+	mq->qlines = qlines;
+	mq->hstatus[0] = 0;
+	mq->hstatus[1] = 1;
+	mq->head = gru_mesq_head(2, qlines / 2 + 1);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(gru_create_message_queue);
+
+/*
+ * Send a NOOP message to a message queue
+ * 	Returns:
+ * 		 0 - if queue is full after the send. This is the normal case
+ * 		     but various races can change this.
+ *		-1 - if mesq sent successfully but queue not full
+ *		>0 - unexpected error. MQE_xxx returned
+ */
+static int send_noop_message(void *cb,
+				unsigned long mq, void *mesg)
+{
+	const struct message_header noop_header = {
+					.present = MQS_NOOP, .lines = 1};
+	unsigned long m;
+	int substatus, ret;
+	struct message_header save_mhdr, *mhdr = mesg;
+
+	STAT(mesq_noop);
+	save_mhdr = *mhdr;
+	*mhdr = noop_header;
+	gru_mesq(cb, mq, gru_get_tri(mhdr), 1, IMA);
+	ret = gru_wait(cb);
+
+	if (ret) {
+		substatus = gru_get_cb_message_queue_substatus(cb);
+		switch (substatus) {
+		case CBSS_NO_ERROR:
+			STAT(mesq_noop_unexpected_error);
+			ret = MQE_UNEXPECTED_CB_ERR;
+			break;
+		case CBSS_LB_OVERFLOWED:
+			STAT(mesq_noop_lb_overflow);
+			ret = MQE_CONGESTION;
+			break;
+		case CBSS_QLIMIT_REACHED:
+			STAT(mesq_noop_qlimit_reached);
+			ret = 0;
+			break;
+		case CBSS_AMO_NACKED:
+			STAT(mesq_noop_amo_nacked);
+			ret = MQE_CONGESTION;
+			break;
+		case CBSS_PUT_NACKED:
+			STAT(mesq_noop_put_nacked);
+			m = mq + (gru_get_amo_value_head(cb) << 6);
+			gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, 1, 1,
+						IMA);
+			if (gru_wait(cb) == CBS_IDLE)
+				ret = MQIE_AGAIN;
+			else
+				ret = MQE_UNEXPECTED_CB_ERR;
+			break;
+		case CBSS_PAGE_OVERFLOW:
+		default:
+			BUG();
+		}
+	}
+	*mhdr = save_mhdr;
+	return ret;
+}
+
+/*
+ * Handle a gru_mesq full.
+ */
+static int send_message_queue_full(void *cb,
+			   unsigned long mq, void *mesg, int lines)
+{
+	union gru_mesqhead mqh;
+	unsigned int limit, head;
+	unsigned long avalue;
+	int half, qlines, save;
+
+	/* Determine if switching to first/second half of q */
+	avalue = gru_get_amo_value(cb);
+	head = gru_get_amo_value_head(cb);
+	limit = gru_get_amo_value_limit(cb);
+
+	/*
+	 * Fetch "qlines" from the queue header. Since the queue may be
+	 * in memory that can't be accessed using socket addresses, use
+	 * the GRU to access the data. Use DSR space from the message.
+	 */
+	save = *(int *)mesg;
+	gru_vload(cb, QLINES(mq), gru_get_tri(mesg), XTYPE_W, 1, 1, IMA);
+	if (gru_wait(cb) != CBS_IDLE)
+		goto cberr;
+	qlines = *(int *)mesg;
+	*(int *)mesg = save;
+	half = (limit != qlines);
+
+	if (half)
+		mqh = gru_mesq_head(qlines / 2 + 1, qlines);
+	else
+		mqh = gru_mesq_head(2, qlines / 2 + 1);
+
+	/* Try to get lock for switching head pointer */
+	gru_gamir(cb, EOP_IR_CLR, HSTATUS(mq, half), XTYPE_DW, IMA);
+	if (gru_wait(cb) != CBS_IDLE)
+		goto cberr;
+	if (!gru_get_amo_value(cb)) {
+		STAT(mesq_qf_locked);
+		return MQE_QUEUE_FULL;
+	}
+
+	/* Got the lock. Send optional NOP if queue not full, */
+	if (head != limit) {
+		if (send_noop_message(cb, mq, mesg)) {
+			gru_gamir(cb, EOP_IR_INC, HSTATUS(mq, half),
+					XTYPE_DW, IMA);
+			if (gru_wait(cb) != CBS_IDLE)
+				goto cberr;
+			STAT(mesq_qf_noop_not_full);
+			return MQIE_AGAIN;
+		}
+		avalue++;
+	}
+
+	/* Then flip queuehead to other half of queue. */
+	gru_gamer(cb, EOP_ERR_CSWAP, mq, XTYPE_DW, mqh.val, avalue, IMA);
+	if (gru_wait(cb) != CBS_IDLE)
+		goto cberr;
+
+	/* If not successfully in swapping queue head, clear the hstatus lock */
+	if (gru_get_amo_value(cb) != avalue) {
+		STAT(mesq_qf_switch_head_failed);
+		gru_gamir(cb, EOP_IR_INC, HSTATUS(mq, half), XTYPE_DW, IMA);
+		if (gru_wait(cb) != CBS_IDLE)
+			goto cberr;
+	}
+	return MQIE_AGAIN;
+cberr:
+	STAT(mesq_qf_unexpected_error);
+	return MQE_UNEXPECTED_CB_ERR;
+}
+
+
+/*
+ * Handle a gru_mesq failure. Some of these failures are software recoverable
+ * or retryable.
+ */
+static int send_message_failure(void *cb,
+				unsigned long mq,
+				void *mesg,
+				int lines)
+{
+	int substatus, ret = 0;
+	unsigned long m;
+
+	substatus = gru_get_cb_message_queue_substatus(cb);
+	switch (substatus) {
+	case CBSS_NO_ERROR:
+		STAT(mesq_send_unexpected_error);
+		ret = MQE_UNEXPECTED_CB_ERR;
+		break;
+	case CBSS_LB_OVERFLOWED:
+		STAT(mesq_send_lb_overflow);
+		ret = MQE_CONGESTION;
+		break;
+	case CBSS_QLIMIT_REACHED:
+		STAT(mesq_send_qlimit_reached);
+		ret = send_message_queue_full(cb, mq, mesg, lines);
+		break;
+	case CBSS_AMO_NACKED:
+		STAT(mesq_send_amo_nacked);
+		ret = MQE_CONGESTION;
+		break;
+	case CBSS_PUT_NACKED:
+		STAT(mesq_send_put_nacked);
+		m =mq + (gru_get_amo_value_head(cb) << 6);
+		gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, lines, 1, IMA);
+		if (gru_wait(cb) == CBS_IDLE)
+			ret = MQE_OK;
+		else
+			ret = MQE_UNEXPECTED_CB_ERR;
+		break;
+	default:
+		BUG();
+	}
+	return ret;
+}
+
+/*
+ * Send a message to a message queue
+ * 	cb	GRU control block to use to send message
+ * 	mq	message queue
+ * 	mesg	message. ust be vaddr within a GSEG
+ * 	bytes	message size (<= 2 CL)
+ */
+int gru_send_message_gpa(unsigned long mq, void *mesg, unsigned int bytes)
+{
+	struct message_header *mhdr;
+	void *cb;
+	void *dsr;
+	int istatus, clines, ret;
+
+	STAT(mesq_send);
+	BUG_ON(bytes < sizeof(int) || bytes > 2 * GRU_CACHE_LINE_BYTES);
+
+	clines = (bytes + GRU_CACHE_LINE_BYTES - 1) / GRU_CACHE_LINE_BYTES;
+	if (gru_get_cpu_resources(bytes, &cb, &dsr))
+		return MQE_BUG_NO_RESOURCES;
+	memcpy(dsr, mesg, bytes);
+	mhdr = dsr;
+	mhdr->present = MQS_FULL;
+	mhdr->lines = clines;
+	if (clines == 2) {
+		mhdr->present2 = get_present2(mhdr);
+		restore_present2(mhdr, MQS_FULL);
+	}
+
+	do {
+		ret = MQE_OK;
+		gru_mesq(cb, mq, gru_get_tri(mhdr), clines, IMA);
+		istatus = gru_wait(cb);
+		if (istatus != CBS_IDLE)
+			ret = send_message_failure(cb, mq, dsr, clines);
+	} while (ret == MQIE_AGAIN);
+	gru_free_cpu_resources(cb, dsr);
+
+	if (ret)
+		STAT(mesq_send_failed);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(gru_send_message_gpa);
+
+/*
+ * Advance the receive pointer for the queue to the next message.
+ */
+void gru_free_message(void *rmq, void *mesg)
+{
+	struct message_queue *mq = rmq;
+	struct message_header *mhdr = mq->next;
+	void *next, *pnext;
+	int half = -1;
+	int lines = mhdr->lines;
+
+	if (lines == 2)
+		restore_present2(mhdr, MQS_EMPTY);
+	mhdr->present = MQS_EMPTY;
+
+	pnext = mq->next;
+	next = pnext + GRU_CACHE_LINE_BYTES * lines;
+	if (next == mq->limit) {
+		next = mq->start;
+		half = 1;
+	} else if (pnext < mq->start2 && next >= mq->start2) {
+		half = 0;
+	}
+
+	if (half >= 0)
+		mq->hstatus[half] = 1;
+	mq->next = next;
+}
+EXPORT_SYMBOL_GPL(gru_free_message);
+
+/*
+ * Get next message from message queue. Return NULL if no message
+ * present. User must call next_message() to move to next message.
+ * 	rmq	message queue
+ */
+void *gru_get_next_message(void *rmq)
+{
+	struct message_queue *mq = rmq;
+	struct message_header *mhdr = mq->next;
+	int present = mhdr->present;
+
+	/* skip NOOP messages */
+	STAT(mesq_receive);
+	while (present == MQS_NOOP) {
+		gru_free_message(rmq, mhdr);
+		mhdr = mq->next;
+		present = mhdr->present;
+	}
+
+	/* Wait for both halves of 2 line messages */
+	if (present == MQS_FULL && mhdr->lines == 2 &&
+				get_present2(mhdr) == MQS_EMPTY)
+		present = MQS_EMPTY;
+
+	if (!present) {
+		STAT(mesq_receive_none);
+		return NULL;
+	}
+
+	if (mhdr->lines == 2)
+		restore_present2(mhdr, mhdr->present2);
+
+	return mhdr;
+}
+EXPORT_SYMBOL_GPL(gru_get_next_message);
+
+/* ---------------------- GRU DATA COPY FUNCTIONS ---------------------------*/
+
+/*
+ * Copy a block of data using the GRU resources
+ */
+int gru_copy_gpa(unsigned long dest_gpa, unsigned long src_gpa,
+				unsigned int bytes)
+{
+	void *cb;
+	void *dsr;
+	int ret;
+
+	STAT(copy_gpa);
+	if (gru_get_cpu_resources(GRU_NUM_KERNEL_DSR_BYTES, &cb, &dsr))
+		return MQE_BUG_NO_RESOURCES;
+	gru_bcopy(cb, src_gpa, dest_gpa, gru_get_tri(dsr),
+		  XTYPE_B, bytes, GRU_NUM_KERNEL_DSR_BYTES, IMA);
+	ret = gru_wait(cb);
+	gru_free_cpu_resources(cb, dsr);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(gru_copy_gpa);
+
+/* ------------------- KERNEL QUICKTESTS RUN AT STARTUP ----------------*/
+/* 	Temp - will delete after we gain confidence in the GRU		*/
+static __cacheline_aligned unsigned long word0;
+static __cacheline_aligned unsigned long word1;
+
+static int quicktest(struct gru_state *gru)
+{
+	void *cb;
+	void *ds;
+	unsigned long *p;
+
+	cb = get_gseg_base_address_cb(gru->gs_gru_base_vaddr, KERNEL_CTXNUM, 0);
+	ds = get_gseg_base_address_ds(gru->gs_gru_base_vaddr, KERNEL_CTXNUM, 0);
+	p = ds;
+	word0 = MAGIC;
+
+	gru_vload(cb, uv_gpa(&word0), 0, XTYPE_DW, 1, 1, IMA);
+	if (gru_wait(cb) != CBS_IDLE)
+		BUG();
+
+	if (*(unsigned long *)ds != MAGIC)
+		BUG();
+	gru_vstore(cb, uv_gpa(&word1), 0, XTYPE_DW, 1, 1, IMA);
+	if (gru_wait(cb) != CBS_IDLE)
+		BUG();
+
+	if (word0 != word1 || word0 != MAGIC) {
+		printk
+		    ("GRU quicktest err: gru %d, found 0x%lx, expected 0x%lx\n",
+		     gru->gs_gid, word1, MAGIC);
+		BUG();		/* ZZZ should not be fatal */
+	}
+
+	return 0;
+}
+
+
+int gru_kservices_init(struct gru_state *gru)
+{
+	struct gru_blade_state *bs;
+	struct gru_context_configuration_handle *cch;
+	unsigned long cbr_map, dsr_map;
+	int err, num, cpus_possible;
+
+	/*
+	 * Currently, resources are reserved ONLY on the second chiplet
+	 * on each blade. This leaves ALL resources on chiplet 0 available
+	 * for user code.
+	 */
+	bs = gru->gs_blade;
+	if (gru != &bs->bs_grus[1])
+		return 0;
+
+	cpus_possible = uv_blade_nr_possible_cpus(gru->gs_blade_id);
+
+	num = GRU_NUM_KERNEL_CBR * cpus_possible;
+	cbr_map = gru_reserve_cb_resources(gru, GRU_CB_COUNT_TO_AU(num), NULL);
+	gru->gs_reserved_cbrs += num;
+
+	num = GRU_NUM_KERNEL_DSR_BYTES * cpus_possible;
+	dsr_map = gru_reserve_ds_resources(gru, GRU_DS_BYTES_TO_AU(num), NULL);
+	gru->gs_reserved_dsr_bytes += num;
+
+	gru->gs_active_contexts++;
+	__set_bit(KERNEL_CTXNUM, &gru->gs_context_map);
+	cch = get_cch(gru->gs_gru_base_vaddr, KERNEL_CTXNUM);
+
+	bs->kernel_cb = get_gseg_base_address_cb(gru->gs_gru_base_vaddr,
+					KERNEL_CTXNUM, 0);
+	bs->kernel_dsr = get_gseg_base_address_ds(gru->gs_gru_base_vaddr,
+					KERNEL_CTXNUM, 0);
+
+	lock_cch_handle(cch);
+	cch->tfm_fault_bit_enable = 0;
+	cch->tlb_int_enable = 0;
+	cch->tfm_done_bit_enable = 0;
+	cch->unmap_enable = 1;
+	err = cch_allocate(cch, 0, cbr_map, dsr_map);
+	if (err) {
+		gru_dbg(grudev,
+			"Unable to allocate kernel CCH: gru %d, err %d\n",
+			gru->gs_gid, err);
+		BUG();
+	}
+	if (cch_start(cch)) {
+		gru_dbg(grudev, "Unable to start kernel CCH: gru %d, err %d\n",
+			gru->gs_gid, err);
+		BUG();
+	}
+	unlock_cch_handle(cch);
+
+	if (gru_options & GRU_QUICKLOOK)
+		quicktest(gru);
+	return 0;
+}
diff --git a/drivers/misc/sgi-gru/grukservices.h b/drivers/misc/sgi-gru/grukservices.h
new file mode 100644
index 00000000000..eb17e0a3ac6
--- /dev/null
+++ b/drivers/misc/sgi-gru/grukservices.h
@@ -0,0 +1,134 @@
+
+/*
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+#ifndef __GRU_KSERVICES_H_
+#define __GRU_KSERVICES_H_
+
+
+/*
+ * Message queues using the GRU to send/receive messages.
+ *
+ * These function allow the user to create a message queue for
+ * sending/receiving 1 or 2 cacheline messages using the GRU.
+ *
+ * Processes SENDING messages will use a kernel CBR/DSR to send
+ * the message. This is transparent to the caller.
+ *
+ * The receiver does not use any GRU resources.
+ *
+ * The functions support:
+ * 	- single receiver
+ * 	- multiple senders
+ *	- cross partition message
+ *
+ * Missing features ZZZ:
+ * 	- user options for dealing with timeouts, queue full, etc.
+ * 	- gru_create_message_queue() needs interrupt vector info
+ */
+
+/*
+ * Initialize a user allocated chunk of memory to be used as
+ * a message queue. The caller must ensure that the queue is
+ * in contiguous physical memory and is cacheline aligned.
+ *
+ * Message queue size is the total number of bytes allocated
+ * to the queue including a 2 cacheline header that is used
+ * to manage the queue.
+ *
+ *  Input:
+ * 	p	pointer to user allocated memory.
+ * 	bytes	size of message queue in bytes
+ *
+ *  Errors:
+ *  	0	OK
+ *  	>0	error
+ */
+extern int gru_create_message_queue(void *p, unsigned int bytes);
+
+/*
+ * Send a message to a message queue.
+ *
+ * Note: The message queue transport mechanism uses the first 32
+ * bits of the message. Users should avoid using these bits.
+ *
+ *
+ *   Input:
+ * 	xmq	message queue - must be a UV global physical address
+ * 	mesg	pointer to message. Must be 64-bit aligned
+ * 	bytes	size of message in bytes
+ *
+ *   Output:
+ *      0	message sent
+ *     >0	Send failure - see error codes below
+ *
+ */
+extern int gru_send_message_gpa(unsigned long mq_gpa, void *mesg,
+						unsigned int bytes);
+
+/* Status values for gru_send_message() */
+#define MQE_OK			0	/* message sent successfully */
+#define MQE_CONGESTION		1	/* temporary congestion, try again */
+#define MQE_QUEUE_FULL		2	/* queue is full */
+#define MQE_UNEXPECTED_CB_ERR	3	/* unexpected CB error */
+#define MQE_PAGE_OVERFLOW	10	/* BUG - queue overflowed a page */
+#define MQE_BUG_NO_RESOURCES	11	/* BUG - could not alloc GRU cb/dsr */
+
+/*
+ * Advance the receive pointer for the message queue to the next message.
+ * Note: current API requires messages to be gotten & freed in order. Future
+ * API extensions may allow for out-of-order freeing.
+ *
+ *   Input
+ * 	mq	message queue
+ * 	mesq	message being freed
+ */
+extern void gru_free_message(void *mq, void *mesq);
+
+/*
+ * Get next message from message queue. Returns pointer to
+ * message OR NULL if no message present.
+ * User must call gru_free_message() after message is processed
+ * in order to move the queue pointers to next message.
+ *
+ *   Input
+ * 	mq	message queue
+ *
+ *   Output:
+ *	p	pointer to message
+ *	NULL	no message available
+ */
+extern void *gru_get_next_message(void *mq);
+
+
+/*
+ * Copy data using the GRU. Source or destination can be located in a remote
+ * partition.
+ *
+ *    Input:
+ *    	dest_gpa	destination global physical address
+ *    	src_gpa		source global physical address
+ *    	bytes		number of bytes to copy
+ *
+ *    Output:
+ *	0		OK
+ *	>0		error
+ */
+extern int gru_copy_gpa(unsigned long dest_gpa, unsigned long src_gpa,
+							unsigned int bytes);
+
+#endif 		/* __GRU_KSERVICES_H_ */
diff --git a/drivers/misc/sgi-gru/grulib.h b/drivers/misc/sgi-gru/grulib.h
new file mode 100644
index 00000000000..e56e196a699
--- /dev/null
+++ b/drivers/misc/sgi-gru/grulib.h
@@ -0,0 +1,97 @@
+/*
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation; either version 2.1 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef __GRULIB_H__
+#define __GRULIB_H__
+
+#define GRU_BASENAME		"gru"
+#define GRU_FULLNAME		"/dev/gru"
+#define GRU_IOCTL_NUM 		 'G'
+
+/*
+ * Maximum number of GRU segments that a user can have open
+ * ZZZ temp - set high for testing. Revisit.
+ */
+#define GRU_MAX_OPEN_CONTEXTS		32
+
+/* Set Number of Request Blocks */
+#define GRU_CREATE_CONTEXT		_IOWR(GRU_IOCTL_NUM, 1, void *)
+
+/* Register task as using the slice */
+#define GRU_SET_TASK_SLICE		_IOWR(GRU_IOCTL_NUM, 5, void *)
+
+/* Fetch exception detail */
+#define GRU_USER_GET_EXCEPTION_DETAIL	_IOWR(GRU_IOCTL_NUM, 6, void *)
+
+/* For user call_os handling - normally a TLB fault */
+#define GRU_USER_CALL_OS		_IOWR(GRU_IOCTL_NUM, 8, void *)
+
+/* For user unload context */
+#define GRU_USER_UNLOAD_CONTEXT		_IOWR(GRU_IOCTL_NUM, 9, void *)
+
+/* For fetching GRU chiplet status */
+#define GRU_GET_CHIPLET_STATUS		_IOWR(GRU_IOCTL_NUM, 10, void *)
+
+/* For user TLB flushing (primarily for tests) */
+#define GRU_USER_FLUSH_TLB		_IOWR(GRU_IOCTL_NUM, 50, void *)
+
+/* Get some config options (primarily for tests & emulator) */
+#define GRU_GET_CONFIG_INFO		_IOWR(GRU_IOCTL_NUM, 51, void *)
+
+#define CONTEXT_WINDOW_BYTES(th)        (GRU_GSEG_PAGESIZE * (th))
+#define THREAD_POINTER(p, th)		(p + GRU_GSEG_PAGESIZE * (th))
+
+/*
+ * Structure used to pass TLB flush parameters to the driver
+ */
+struct gru_create_context_req {
+	unsigned long		gseg;
+	unsigned int		data_segment_bytes;
+	unsigned int		control_blocks;
+	unsigned int		maximum_thread_count;
+	unsigned int		options;
+};
+
+/*
+ * Structure used to pass unload context parameters to the driver
+ */
+struct gru_unload_context_req {
+	unsigned long	gseg;
+};
+
+/*
+ * Structure used to pass TLB flush parameters to the driver
+ */
+struct gru_flush_tlb_req {
+	unsigned long	gseg;
+	unsigned long	vaddr;
+	size_t		len;
+};
+
+/*
+ * GRU configuration info (temp - for testing)
+ */
+struct gru_config_info {
+	int		cpus;
+	int		blades;
+	int		nodes;
+	int		chiplets;
+	int		fill[16];
+};
+
+#endif /* __GRULIB_H__ */
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
new file mode 100644
index 00000000000..0eeb8dddd2f
--- /dev/null
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -0,0 +1,802 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *            DRIVER TABLE MANAGER + GRU CONTEXT LOAD/UNLOAD
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <asm/uv/uv_hub.h>
+#include "gru.h"
+#include "grutables.h"
+#include "gruhandles.h"
+
+unsigned long gru_options __read_mostly;
+
+static struct device_driver gru_driver = {
+	.name = "gru"
+};
+
+static struct device gru_device = {
+	.bus_id = {0},
+	.driver = &gru_driver,
+};
+
+struct device *grudev = &gru_device;
+
+/*
+ * Select a gru fault map to be used by the current cpu. Note that
+ * multiple cpus may be using the same map.
+ *	ZZZ should "shift" be used?? Depends on HT cpu numbering
+ *	ZZZ should be inline but did not work on emulator
+ */
+int gru_cpu_fault_map_id(void)
+{
+	return uv_blade_processor_id() % GRU_NUM_TFM;
+}
+
+/*--------- ASID Management -------------------------------------------
+ *
+ *  Initially, assign asids sequentially from MIN_ASID .. MAX_ASID.
+ *  Once MAX is reached, flush the TLB & start over. However,
+ *  some asids may still be in use. There won't be many (percentage wise) still
+ *  in use. Search active contexts & determine the value of the first
+ *  asid in use ("x"s below). Set "limit" to this value.
+ *  This defines a block of assignable asids.
+ *
+ *  When "limit" is reached, search forward from limit+1 and determine the
+ *  next block of assignable asids.
+ *
+ *  Repeat until MAX_ASID is reached, then start over again.
+ *
+ *  Each time MAX_ASID is reached, increment the asid generation. Since
+ *  the search for in-use asids only checks contexts with GRUs currently
+ *  assigned, asids in some contexts will be missed. Prior to loading
+ *  a context, the asid generation of the GTS asid is rechecked. If it
+ *  doesn't match the current generation, a new asid will be assigned.
+ *
+ *   	0---------------x------------x---------------------x----|
+ *	  ^-next	^-limit	   				^-MAX_ASID
+ *
+ * All asid manipulation & context loading/unloading is protected by the
+ * gs_lock.
+ */
+
+/* Hit the asid limit. Start over */
+static int gru_wrap_asid(struct gru_state *gru)
+{
+	gru_dbg(grudev, "gru %p\n", gru);
+	STAT(asid_wrap);
+	gru->gs_asid_gen++;
+	gru_flush_all_tlb(gru);
+	return MIN_ASID;
+}
+
+/* Find the next chunk of unused asids */
+static int gru_reset_asid_limit(struct gru_state *gru, int asid)
+{
+	int i, gid, inuse_asid, limit;
+
+	gru_dbg(grudev, "gru %p, asid 0x%x\n", gru, asid);
+	STAT(asid_next);
+	limit = MAX_ASID;
+	if (asid >= limit)
+		asid = gru_wrap_asid(gru);
+	gid = gru->gs_gid;
+again:
+	for (i = 0; i < GRU_NUM_CCH; i++) {
+		if (!gru->gs_gts[i])
+			continue;
+		inuse_asid = gru->gs_gts[i]->ts_gms->ms_asids[gid].mt_asid;
+		gru_dbg(grudev, "gru %p, inuse_asid 0x%x, cxtnum %d, gts %p\n",
+			gru, inuse_asid, i, gru->gs_gts[i]);
+		if (inuse_asid == asid) {
+			asid += ASID_INC;
+			if (asid >= limit) {
+				/*
+				 * empty range: reset the range limit and
+				 * start over
+				 */
+				limit = MAX_ASID;
+				if (asid >= MAX_ASID)
+					asid = gru_wrap_asid(gru);
+				goto again;
+			}
+		}
+
+		if ((inuse_asid > asid) && (inuse_asid < limit))
+			limit = inuse_asid;
+	}
+	gru->gs_asid_limit = limit;
+	gru->gs_asid = asid;
+	gru_dbg(grudev, "gru %p, new asid 0x%x, new_limit 0x%x\n", gru, asid,
+		limit);
+	return asid;
+}
+
+/* Assign a new ASID to a thread context.  */
+static int gru_assign_asid(struct gru_state *gru)
+{
+	int asid;
+
+	spin_lock(&gru->gs_asid_lock);
+	gru->gs_asid += ASID_INC;
+	asid = gru->gs_asid;
+	if (asid >= gru->gs_asid_limit)
+		asid = gru_reset_asid_limit(gru, asid);
+	spin_unlock(&gru->gs_asid_lock);
+
+	gru_dbg(grudev, "gru %p, asid 0x%x\n", gru, asid);
+	return asid;
+}
+
+/*
+ * Clear n bits in a word. Return a word indicating the bits that were cleared.
+ * Optionally, build an array of chars that contain the bit numbers allocated.
+ */
+static unsigned long reserve_resources(unsigned long *p, int n, int mmax,
+				       char *idx)
+{
+	unsigned long bits = 0;
+	int i;
+
+	do {
+		i = find_first_bit(p, mmax);
+		if (i == mmax)
+			BUG();
+		__clear_bit(i, p);
+		__set_bit(i, &bits);
+		if (idx)
+			*idx++ = i;
+	} while (--n);
+	return bits;
+}
+
+unsigned long gru_reserve_cb_resources(struct gru_state *gru, int cbr_au_count,
+				       char *cbmap)
+{
+	return reserve_resources(&gru->gs_cbr_map, cbr_au_count, GRU_CBR_AU,
+				 cbmap);
+}
+
+unsigned long gru_reserve_ds_resources(struct gru_state *gru, int dsr_au_count,
+				       char *dsmap)
+{
+	return reserve_resources(&gru->gs_dsr_map, dsr_au_count, GRU_DSR_AU,
+				 dsmap);
+}
+
+static void reserve_gru_resources(struct gru_state *gru,
+				  struct gru_thread_state *gts)
+{
+	gru->gs_active_contexts++;
+	gts->ts_cbr_map =
+	    gru_reserve_cb_resources(gru, gts->ts_cbr_au_count,
+				     gts->ts_cbr_idx);
+	gts->ts_dsr_map =
+	    gru_reserve_ds_resources(gru, gts->ts_dsr_au_count, NULL);
+}
+
+static void free_gru_resources(struct gru_state *gru,
+			       struct gru_thread_state *gts)
+{
+	gru->gs_active_contexts--;
+	gru->gs_cbr_map |= gts->ts_cbr_map;
+	gru->gs_dsr_map |= gts->ts_dsr_map;
+}
+
+/*
+ * Check if a GRU has sufficient free resources to satisfy an allocation
+ * request. Note: GRU locks may or may not be held when this is called. If
+ * not held, recheck after acquiring the appropriate locks.
+ *
+ * Returns 1 if sufficient resources, 0 if not
+ */
+static int check_gru_resources(struct gru_state *gru, int cbr_au_count,
+			       int dsr_au_count, int max_active_contexts)
+{
+	return hweight64(gru->gs_cbr_map) >= cbr_au_count
+		&& hweight64(gru->gs_dsr_map) >= dsr_au_count
+		&& gru->gs_active_contexts < max_active_contexts;
+}
+
+/*
+ * TLB manangment requires tracking all GRU chiplets that have loaded a GSEG
+ * context.
+ */
+static int gru_load_mm_tracker(struct gru_state *gru, struct gru_mm_struct *gms,
+			       int ctxnum)
+{
+	struct gru_mm_tracker *asids = &gms->ms_asids[gru->gs_gid];
+	unsigned short ctxbitmap = (1 << ctxnum);
+	int asid;
+
+	spin_lock(&gms->ms_asid_lock);
+	asid = asids->mt_asid;
+
+	if (asid == 0 || asids->mt_asid_gen != gru->gs_asid_gen) {
+		asid = gru_assign_asid(gru);
+		asids->mt_asid = asid;
+		asids->mt_asid_gen = gru->gs_asid_gen;
+		STAT(asid_new);
+	} else {
+		STAT(asid_reuse);
+	}
+
+	BUG_ON(asids->mt_ctxbitmap & ctxbitmap);
+	asids->mt_ctxbitmap |= ctxbitmap;
+	if (!test_bit(gru->gs_gid, gms->ms_asidmap))
+		__set_bit(gru->gs_gid, gms->ms_asidmap);
+	spin_unlock(&gms->ms_asid_lock);
+
+	gru_dbg(grudev,
+		"gru %x, gms %p, ctxnum 0x%d, asid 0x%x, asidmap 0x%lx\n",
+		gru->gs_gid, gms, ctxnum, asid, gms->ms_asidmap[0]);
+	return asid;
+}
+
+static void gru_unload_mm_tracker(struct gru_state *gru,
+				  struct gru_mm_struct *gms, int ctxnum)
+{
+	struct gru_mm_tracker *asids;
+	unsigned short ctxbitmap;
+
+	asids = &gms->ms_asids[gru->gs_gid];
+	ctxbitmap = (1 << ctxnum);
+	spin_lock(&gms->ms_asid_lock);
+	BUG_ON((asids->mt_ctxbitmap & ctxbitmap) != ctxbitmap);
+	asids->mt_ctxbitmap ^= ctxbitmap;
+	gru_dbg(grudev, "gru %x, gms %p, ctxnum 0x%d, asidmap 0x%lx\n",
+		gru->gs_gid, gms, ctxnum, gms->ms_asidmap[0]);
+	spin_unlock(&gms->ms_asid_lock);
+}
+
+/*
+ * Decrement the reference count on a GTS structure. Free the structure
+ * if the reference count goes to zero.
+ */
+void gts_drop(struct gru_thread_state *gts)
+{
+	if (gts && atomic_dec_return(&gts->ts_refcnt) == 0) {
+		gru_drop_mmu_notifier(gts->ts_gms);
+		kfree(gts);
+		STAT(gts_free);
+	}
+}
+
+/*
+ * Locate the GTS structure for the current thread.
+ */
+static struct gru_thread_state *gru_find_current_gts_nolock(struct gru_vma_data
+			    *vdata, int tsid)
+{
+	struct gru_thread_state *gts;
+
+	list_for_each_entry(gts, &vdata->vd_head, ts_next)
+	    if (gts->ts_tsid == tsid)
+		return gts;
+	return NULL;
+}
+
+/*
+ * Allocate a thread state structure.
+ */
+static struct gru_thread_state *gru_alloc_gts(struct vm_area_struct *vma,
+					      struct gru_vma_data *vdata,
+					      int tsid)
+{
+	struct gru_thread_state *gts;
+	int bytes;
+
+	bytes = DSR_BYTES(vdata->vd_dsr_au_count) +
+				CBR_BYTES(vdata->vd_cbr_au_count);
+	bytes += sizeof(struct gru_thread_state);
+	gts = kzalloc(bytes, GFP_KERNEL);
+	if (!gts)
+		return NULL;
+
+	STAT(gts_alloc);
+	atomic_set(&gts->ts_refcnt, 1);
+	mutex_init(&gts->ts_ctxlock);
+	gts->ts_cbr_au_count = vdata->vd_cbr_au_count;
+	gts->ts_dsr_au_count = vdata->vd_dsr_au_count;
+	gts->ts_user_options = vdata->vd_user_options;
+	gts->ts_tsid = tsid;
+	gts->ts_user_options = vdata->vd_user_options;
+	gts->ts_ctxnum = NULLCTX;
+	gts->ts_mm = current->mm;
+	gts->ts_vma = vma;
+	gts->ts_tlb_int_select = -1;
+	gts->ts_gms = gru_register_mmu_notifier();
+	if (!gts->ts_gms)
+		goto err;
+
+	gru_dbg(grudev, "alloc vdata %p, new gts %p\n", vdata, gts);
+	return gts;
+
+err:
+	gts_drop(gts);
+	return NULL;
+}
+
+/*
+ * Allocate a vma private data structure.
+ */
+struct gru_vma_data *gru_alloc_vma_data(struct vm_area_struct *vma, int tsid)
+{
+	struct gru_vma_data *vdata = NULL;
+
+	vdata = kmalloc(sizeof(*vdata), GFP_KERNEL);
+	if (!vdata)
+		return NULL;
+
+	INIT_LIST_HEAD(&vdata->vd_head);
+	spin_lock_init(&vdata->vd_lock);
+	gru_dbg(grudev, "alloc vdata %p\n", vdata);
+	return vdata;
+}
+
+/*
+ * Find the thread state structure for the current thread.
+ */
+struct gru_thread_state *gru_find_thread_state(struct vm_area_struct *vma,
+					int tsid)
+{
+	struct gru_vma_data *vdata = vma->vm_private_data;
+	struct gru_thread_state *gts;
+
+	spin_lock(&vdata->vd_lock);
+	gts = gru_find_current_gts_nolock(vdata, tsid);
+	spin_unlock(&vdata->vd_lock);
+	gru_dbg(grudev, "vma %p, gts %p\n", vma, gts);
+	return gts;
+}
+
+/*
+ * Allocate a new thread state for a GSEG. Note that races may allow
+ * another thread to race to create a gts.
+ */
+struct gru_thread_state *gru_alloc_thread_state(struct vm_area_struct *vma,
+					int tsid)
+{
+	struct gru_vma_data *vdata = vma->vm_private_data;
+	struct gru_thread_state *gts, *ngts;
+
+	gts = gru_alloc_gts(vma, vdata, tsid);
+	if (!gts)
+		return NULL;
+
+	spin_lock(&vdata->vd_lock);
+	ngts = gru_find_current_gts_nolock(vdata, tsid);
+	if (ngts) {
+		gts_drop(gts);
+		gts = ngts;
+		STAT(gts_double_allocate);
+	} else {
+		list_add(&gts->ts_next, &vdata->vd_head);
+	}
+	spin_unlock(&vdata->vd_lock);
+	gru_dbg(grudev, "vma %p, gts %p\n", vma, gts);
+	return gts;
+}
+
+/*
+ * Free the GRU context assigned to the thread state.
+ */
+static void gru_free_gru_context(struct gru_thread_state *gts)
+{
+	struct gru_state *gru;
+
+	gru = gts->ts_gru;
+	gru_dbg(grudev, "gts %p, gru %p\n", gts, gru);
+
+	spin_lock(&gru->gs_lock);
+	gru->gs_gts[gts->ts_ctxnum] = NULL;
+	free_gru_resources(gru, gts);
+	BUG_ON(test_bit(gts->ts_ctxnum, &gru->gs_context_map) == 0);
+	__clear_bit(gts->ts_ctxnum, &gru->gs_context_map);
+	gts->ts_ctxnum = NULLCTX;
+	gts->ts_gru = NULL;
+	spin_unlock(&gru->gs_lock);
+
+	gts_drop(gts);
+	STAT(free_context);
+}
+
+/*
+ * Prefetching cachelines help hardware performance.
+ * (Strictly a performance enhancement. Not functionally required).
+ */
+static void prefetch_data(void *p, int num, int stride)
+{
+	while (num-- > 0) {
+		prefetchw(p);
+		p += stride;
+	}
+}
+
+static inline long gru_copy_handle(void *d, void *s)
+{
+	memcpy(d, s, GRU_HANDLE_BYTES);
+	return GRU_HANDLE_BYTES;
+}
+
+/* rewrite in assembly & use lots of prefetch */
+static void gru_load_context_data(void *save, void *grubase, int ctxnum,
+				  unsigned long cbrmap, unsigned long dsrmap)
+{
+	void *gseg, *cb, *cbe;
+	unsigned long length;
+	int i, scr;
+
+	gseg = grubase + ctxnum * GRU_GSEG_STRIDE;
+	length = hweight64(dsrmap) * GRU_DSR_AU_BYTES;
+	prefetch_data(gseg + GRU_DS_BASE, length / GRU_CACHE_LINE_BYTES,
+		      GRU_CACHE_LINE_BYTES);
+
+	cb = gseg + GRU_CB_BASE;
+	cbe = grubase + GRU_CBE_BASE;
+	for_each_cbr_in_allocation_map(i, &cbrmap, scr) {
+		prefetch_data(cb, 1, GRU_CACHE_LINE_BYTES);
+		prefetch_data(cbe + i * GRU_HANDLE_STRIDE, 1,
+			      GRU_CACHE_LINE_BYTES);
+		cb += GRU_HANDLE_STRIDE;
+	}
+
+	cb = gseg + GRU_CB_BASE;
+	for_each_cbr_in_allocation_map(i, &cbrmap, scr) {
+		save += gru_copy_handle(cb, save);
+		save += gru_copy_handle(cbe + i * GRU_HANDLE_STRIDE, save);
+		cb += GRU_HANDLE_STRIDE;
+	}
+
+	memcpy(gseg + GRU_DS_BASE, save, length);
+}
+
+static void gru_unload_context_data(void *save, void *grubase, int ctxnum,
+				    unsigned long cbrmap, unsigned long dsrmap)
+{
+	void *gseg, *cb, *cbe;
+	unsigned long length;
+	int i, scr;
+
+	gseg = grubase + ctxnum * GRU_GSEG_STRIDE;
+
+	cb = gseg + GRU_CB_BASE;
+	cbe = grubase + GRU_CBE_BASE;
+	for_each_cbr_in_allocation_map(i, &cbrmap, scr) {
+		save += gru_copy_handle(save, cb);
+		save += gru_copy_handle(save, cbe + i * GRU_HANDLE_STRIDE);
+		cb += GRU_HANDLE_STRIDE;
+	}
+	length = hweight64(dsrmap) * GRU_DSR_AU_BYTES;
+	memcpy(save, gseg + GRU_DS_BASE, length);
+}
+
+void gru_unload_context(struct gru_thread_state *gts, int savestate)
+{
+	struct gru_state *gru = gts->ts_gru;
+	struct gru_context_configuration_handle *cch;
+	int ctxnum = gts->ts_ctxnum;
+
+	zap_vma_ptes(gts->ts_vma, UGRUADDR(gts), GRU_GSEG_PAGESIZE);
+	cch = get_cch(gru->gs_gru_base_vaddr, ctxnum);
+
+	lock_cch_handle(cch);
+	if (cch_interrupt_sync(cch))
+		BUG();
+	gru_dbg(grudev, "gts %p\n", gts);
+
+	gru_unload_mm_tracker(gru, gts->ts_gms, gts->ts_ctxnum);
+	if (savestate)
+		gru_unload_context_data(gts->ts_gdata, gru->gs_gru_base_vaddr,
+					ctxnum, gts->ts_cbr_map,
+					gts->ts_dsr_map);
+
+	if (cch_deallocate(cch))
+		BUG();
+	gts->ts_force_unload = 0;	/* ts_force_unload locked by CCH lock */
+	unlock_cch_handle(cch);
+
+	gru_free_gru_context(gts);
+	STAT(unload_context);
+}
+
+/*
+ * Load a GRU context by copying it from the thread data structure in memory
+ * to the GRU.
+ */
+static void gru_load_context(struct gru_thread_state *gts)
+{
+	struct gru_state *gru = gts->ts_gru;
+	struct gru_context_configuration_handle *cch;
+	int err, asid, ctxnum = gts->ts_ctxnum;
+
+	gru_dbg(grudev, "gts %p\n", gts);
+	cch = get_cch(gru->gs_gru_base_vaddr, ctxnum);
+
+	lock_cch_handle(cch);
+	asid = gru_load_mm_tracker(gru, gts->ts_gms, gts->ts_ctxnum);
+	cch->tfm_fault_bit_enable =
+	    (gts->ts_user_options == GRU_OPT_MISS_FMM_POLL
+	     || gts->ts_user_options == GRU_OPT_MISS_FMM_INTR);
+	cch->tlb_int_enable = (gts->ts_user_options == GRU_OPT_MISS_FMM_INTR);
+	if (cch->tlb_int_enable) {
+		gts->ts_tlb_int_select = gru_cpu_fault_map_id();
+		cch->tlb_int_select = gts->ts_tlb_int_select;
+	}
+	cch->tfm_done_bit_enable = 0;
+	err = cch_allocate(cch, asid, gts->ts_cbr_map, gts->ts_dsr_map);
+	if (err) {
+		gru_dbg(grudev,
+			"err %d: cch %p, gts %p, cbr 0x%lx, dsr 0x%lx\n",
+			err, cch, gts, gts->ts_cbr_map, gts->ts_dsr_map);
+		BUG();
+	}
+
+	gru_load_context_data(gts->ts_gdata, gru->gs_gru_base_vaddr, ctxnum,
+			      gts->ts_cbr_map, gts->ts_dsr_map);
+
+	if (cch_start(cch))
+		BUG();
+	unlock_cch_handle(cch);
+
+	STAT(load_context);
+}
+
+/*
+ * Update fields in an active CCH:
+ * 	- retarget interrupts on local blade
+ * 	- force a delayed context unload by clearing the CCH asids. This
+ * 	  forces TLB misses for new GRU instructions. The context is unloaded
+ * 	  when the next TLB miss occurs.
+ */
+static int gru_update_cch(struct gru_thread_state *gts, int int_select)
+{
+	struct gru_context_configuration_handle *cch;
+	struct gru_state *gru = gts->ts_gru;
+	int i, ctxnum = gts->ts_ctxnum, ret = 0;
+
+	cch = get_cch(gru->gs_gru_base_vaddr, ctxnum);
+
+	lock_cch_handle(cch);
+	if (cch->state == CCHSTATE_ACTIVE) {
+		if (gru->gs_gts[gts->ts_ctxnum] != gts)
+			goto exit;
+		if (cch_interrupt(cch))
+			BUG();
+		if (int_select >= 0) {
+			gts->ts_tlb_int_select = int_select;
+			cch->tlb_int_select = int_select;
+		} else {
+			for (i = 0; i < 8; i++)
+				cch->asid[i] = 0;
+			cch->tfm_fault_bit_enable = 0;
+			cch->tlb_int_enable = 0;
+			gts->ts_force_unload = 1;
+		}
+		if (cch_start(cch))
+			BUG();
+		ret = 1;
+	}
+exit:
+	unlock_cch_handle(cch);
+	return ret;
+}
+
+/*
+ * Update CCH tlb interrupt select. Required when all the following is true:
+ * 	- task's GRU context is loaded into a GRU
+ * 	- task is using interrupt notification for TLB faults
+ * 	- task has migrated to a different cpu on the same blade where
+ * 	  it was previously running.
+ */
+static int gru_retarget_intr(struct gru_thread_state *gts)
+{
+	if (gts->ts_tlb_int_select < 0
+	    || gts->ts_tlb_int_select == gru_cpu_fault_map_id())
+		return 0;
+
+	gru_dbg(grudev, "retarget from %d to %d\n", gts->ts_tlb_int_select,
+		gru_cpu_fault_map_id());
+	return gru_update_cch(gts, gru_cpu_fault_map_id());
+}
+
+
+/*
+ * Insufficient GRU resources available on the local blade. Steal a context from
+ * a process. This is a hack until a _real_ resource scheduler is written....
+ */
+#define next_ctxnum(n)	((n) <  GRU_NUM_CCH - 2 ? (n) + 1 : 0)
+#define next_gru(b, g)	(((g) < &(b)->bs_grus[GRU_CHIPLETS_PER_BLADE - 1]) ?  \
+				 ((g)+1) : &(b)->bs_grus[0])
+
+static void gru_steal_context(struct gru_thread_state *gts)
+{
+	struct gru_blade_state *blade;
+	struct gru_state *gru, *gru0;
+	struct gru_thread_state *ngts = NULL;
+	int ctxnum, ctxnum0, flag = 0, cbr, dsr;
+
+	cbr = gts->ts_cbr_au_count;
+	dsr = gts->ts_dsr_au_count;
+
+	preempt_disable();
+	blade = gru_base[uv_numa_blade_id()];
+	spin_lock(&blade->bs_lock);
+
+	ctxnum = next_ctxnum(blade->bs_lru_ctxnum);
+	gru = blade->bs_lru_gru;
+	if (ctxnum == 0)
+		gru = next_gru(blade, gru);
+	ctxnum0 = ctxnum;
+	gru0 = gru;
+	while (1) {
+		if (check_gru_resources(gru, cbr, dsr, GRU_NUM_CCH))
+			break;
+		spin_lock(&gru->gs_lock);
+		for (; ctxnum < GRU_NUM_CCH; ctxnum++) {
+			if (flag && gru == gru0 && ctxnum == ctxnum0)
+				break;
+			ngts = gru->gs_gts[ctxnum];
+			/*
+			 * We are grabbing locks out of order, so trylock is
+			 * needed. GTSs are usually not locked, so the odds of
+			 * success are high. If trylock fails, try to steal a
+			 * different GSEG.
+			 */
+			if (ngts && mutex_trylock(&ngts->ts_ctxlock))
+				break;
+			ngts = NULL;
+			flag = 1;
+		}
+		spin_unlock(&gru->gs_lock);
+		if (ngts || (flag && gru == gru0 && ctxnum == ctxnum0))
+			break;
+		ctxnum = 0;
+		gru = next_gru(blade, gru);
+	}
+	blade->bs_lru_gru = gru;
+	blade->bs_lru_ctxnum = ctxnum;
+	spin_unlock(&blade->bs_lock);
+	preempt_enable();
+
+	if (ngts) {
+		STAT(steal_context);
+		ngts->ts_steal_jiffies = jiffies;
+		gru_unload_context(ngts, 1);
+		mutex_unlock(&ngts->ts_ctxlock);
+	} else {
+		STAT(steal_context_failed);
+	}
+	gru_dbg(grudev,
+		"stole gru %x, ctxnum %d from gts %p. Need cb %d, ds %d;"
+		" avail cb %ld, ds %ld\n",
+		gru->gs_gid, ctxnum, ngts, cbr, dsr, hweight64(gru->gs_cbr_map),
+		hweight64(gru->gs_dsr_map));
+}
+
+/*
+ * Scan the GRUs on the local blade & assign a GRU context.
+ */
+static struct gru_state *gru_assign_gru_context(struct gru_thread_state *gts)
+{
+	struct gru_state *gru, *grux;
+	int i, max_active_contexts;
+
+	preempt_disable();
+
+again:
+	gru = NULL;
+	max_active_contexts = GRU_NUM_CCH;
+	for_each_gru_on_blade(grux, uv_numa_blade_id(), i) {
+		if (check_gru_resources(grux, gts->ts_cbr_au_count,
+					gts->ts_dsr_au_count,
+					max_active_contexts)) {
+			gru = grux;
+			max_active_contexts = grux->gs_active_contexts;
+			if (max_active_contexts == 0)
+				break;
+		}
+	}
+
+	if (gru) {
+		spin_lock(&gru->gs_lock);
+		if (!check_gru_resources(gru, gts->ts_cbr_au_count,
+					 gts->ts_dsr_au_count, GRU_NUM_CCH)) {
+			spin_unlock(&gru->gs_lock);
+			goto again;
+		}
+		reserve_gru_resources(gru, gts);
+		gts->ts_gru = gru;
+		gts->ts_ctxnum =
+		    find_first_zero_bit(&gru->gs_context_map, GRU_NUM_CCH);
+		BUG_ON(gts->ts_ctxnum == GRU_NUM_CCH);
+		atomic_inc(&gts->ts_refcnt);
+		gru->gs_gts[gts->ts_ctxnum] = gts;
+		__set_bit(gts->ts_ctxnum, &gru->gs_context_map);
+		spin_unlock(&gru->gs_lock);
+
+		STAT(assign_context);
+		gru_dbg(grudev,
+			"gseg %p, gts %p, gru %x, ctx %d, cbr %d, dsr %d\n",
+			gseg_virtual_address(gts->ts_gru, gts->ts_ctxnum), gts,
+			gts->ts_gru->gs_gid, gts->ts_ctxnum,
+			gts->ts_cbr_au_count, gts->ts_dsr_au_count);
+	} else {
+		gru_dbg(grudev, "failed to allocate a GTS %s\n", "");
+		STAT(assign_context_failed);
+	}
+
+	preempt_enable();
+	return gru;
+}
+
+/*
+ * gru_nopage
+ *
+ * Map the user's GRU segment
+ *
+ * 	Note: gru segments alway mmaped on GRU_GSEG_PAGESIZE boundaries.
+ */
+int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct gru_thread_state *gts;
+	unsigned long paddr, vaddr;
+
+	vaddr = (unsigned long)vmf->virtual_address;
+	gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n",
+		vma, vaddr, GSEG_BASE(vaddr));
+	STAT(nopfn);
+
+	/* The following check ensures vaddr is a valid address in the VMA */
+	gts = gru_find_thread_state(vma, TSID(vaddr, vma));
+	if (!gts)
+		return VM_FAULT_SIGBUS;
+
+again:
+	preempt_disable();
+	mutex_lock(&gts->ts_ctxlock);
+	if (gts->ts_gru) {
+		if (gts->ts_gru->gs_blade_id != uv_numa_blade_id()) {
+			STAT(migrated_nopfn_unload);
+			gru_unload_context(gts, 1);
+		} else {
+			if (gru_retarget_intr(gts))
+				STAT(migrated_nopfn_retarget);
+		}
+	}
+
+	if (!gts->ts_gru) {
+		if (!gru_assign_gru_context(gts)) {
+			mutex_unlock(&gts->ts_ctxlock);
+			preempt_enable();
+			schedule_timeout(GRU_ASSIGN_DELAY);  /* true hack ZZZ */
+			if (gts->ts_steal_jiffies + GRU_STEAL_DELAY < jiffies)
+				gru_steal_context(gts);
+			goto again;
+		}
+		gru_load_context(gts);
+		paddr = gseg_physical_address(gts->ts_gru, gts->ts_ctxnum);
+		remap_pfn_range(vma, vaddr & ~(GRU_GSEG_PAGESIZE - 1),
+				paddr >> PAGE_SHIFT, GRU_GSEG_PAGESIZE,
+				vma->vm_page_prot);
+	}
+
+	mutex_unlock(&gts->ts_ctxlock);
+	preempt_enable();
+
+	return VM_FAULT_NOPAGE;
+}
+
diff --git a/drivers/misc/sgi-gru/gruprocfs.c b/drivers/misc/sgi-gru/gruprocfs.c
new file mode 100644
index 00000000000..533923f83f1
--- /dev/null
+++ b/drivers/misc/sgi-gru/gruprocfs.c
@@ -0,0 +1,336 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *              PROC INTERFACES
+ *
+ * This file supports the /proc interfaces for the GRU driver
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/device.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include "gru.h"
+#include "grulib.h"
+#include "grutables.h"
+
+#define printstat(s, f)		printstat_val(s, &gru_stats.f, #f)
+
+static void printstat_val(struct seq_file *s, atomic_long_t *v, char *id)
+{
+	unsigned long val = atomic_long_read(v);
+
+	if (val)
+		seq_printf(s, "%16lu %s\n", val, id);
+}
+
+static int statistics_show(struct seq_file *s, void *p)
+{
+	printstat(s, vdata_alloc);
+	printstat(s, vdata_free);
+	printstat(s, gts_alloc);
+	printstat(s, gts_free);
+	printstat(s, vdata_double_alloc);
+	printstat(s, gts_double_allocate);
+	printstat(s, assign_context);
+	printstat(s, assign_context_failed);
+	printstat(s, free_context);
+	printstat(s, load_context);
+	printstat(s, unload_context);
+	printstat(s, steal_context);
+	printstat(s, steal_context_failed);
+	printstat(s, nopfn);
+	printstat(s, break_cow);
+	printstat(s, asid_new);
+	printstat(s, asid_next);
+	printstat(s, asid_wrap);
+	printstat(s, asid_reuse);
+	printstat(s, intr);
+	printstat(s, call_os);
+	printstat(s, call_os_check_for_bug);
+	printstat(s, call_os_wait_queue);
+	printstat(s, user_flush_tlb);
+	printstat(s, user_unload_context);
+	printstat(s, user_exception);
+	printstat(s, set_task_slice);
+	printstat(s, migrate_check);
+	printstat(s, migrated_retarget);
+	printstat(s, migrated_unload);
+	printstat(s, migrated_unload_delay);
+	printstat(s, migrated_nopfn_retarget);
+	printstat(s, migrated_nopfn_unload);
+	printstat(s, tlb_dropin);
+	printstat(s, tlb_dropin_fail_no_asid);
+	printstat(s, tlb_dropin_fail_upm);
+	printstat(s, tlb_dropin_fail_invalid);
+	printstat(s, tlb_dropin_fail_range_active);
+	printstat(s, tlb_dropin_fail_idle);
+	printstat(s, tlb_dropin_fail_fmm);
+	printstat(s, mmu_invalidate_range);
+	printstat(s, mmu_invalidate_page);
+	printstat(s, mmu_clear_flush_young);
+	printstat(s, flush_tlb);
+	printstat(s, flush_tlb_gru);
+	printstat(s, flush_tlb_gru_tgh);
+	printstat(s, flush_tlb_gru_zero_asid);
+	printstat(s, copy_gpa);
+	printstat(s, mesq_receive);
+	printstat(s, mesq_receive_none);
+	printstat(s, mesq_send);
+	printstat(s, mesq_send_failed);
+	printstat(s, mesq_noop);
+	printstat(s, mesq_send_unexpected_error);
+	printstat(s, mesq_send_lb_overflow);
+	printstat(s, mesq_send_qlimit_reached);
+	printstat(s, mesq_send_amo_nacked);
+	printstat(s, mesq_send_put_nacked);
+	printstat(s, mesq_qf_not_full);
+	printstat(s, mesq_qf_locked);
+	printstat(s, mesq_qf_noop_not_full);
+	printstat(s, mesq_qf_switch_head_failed);
+	printstat(s, mesq_qf_unexpected_error);
+	printstat(s, mesq_noop_unexpected_error);
+	printstat(s, mesq_noop_lb_overflow);
+	printstat(s, mesq_noop_qlimit_reached);
+	printstat(s, mesq_noop_amo_nacked);
+	printstat(s, mesq_noop_put_nacked);
+	return 0;
+}
+
+static ssize_t statistics_write(struct file *file, const char __user *userbuf,
+				size_t count, loff_t *data)
+{
+	memset(&gru_stats, 0, sizeof(gru_stats));
+	return count;
+}
+
+static int options_show(struct seq_file *s, void *p)
+{
+	seq_printf(s, "0x%lx\n", gru_options);
+	return 0;
+}
+
+static ssize_t options_write(struct file *file, const char __user *userbuf,
+			     size_t count, loff_t *data)
+{
+	unsigned long val;
+	char buf[80];
+
+	if (copy_from_user
+	    (buf, userbuf, count < sizeof(buf) ? count : sizeof(buf)))
+		return -EFAULT;
+	if (!strict_strtoul(buf, 10, &val))
+		gru_options = val;
+
+	return count;
+}
+
+static int cch_seq_show(struct seq_file *file, void *data)
+{
+	long gid = *(long *)data;
+	int i;
+	struct gru_state *gru = GID_TO_GRU(gid);
+	struct gru_thread_state *ts;
+	const char *mode[] = { "??", "UPM", "INTR", "OS_POLL" };
+
+	if (gid == 0)
+		seq_printf(file, "#%5s%5s%6s%9s%6s%8s%8s\n", "gid", "bid",
+			   "ctx#", "pid", "cbrs", "dsbytes", "mode");
+	if (gru)
+		for (i = 0; i < GRU_NUM_CCH; i++) {
+			ts = gru->gs_gts[i];
+			if (!ts)
+				continue;
+			seq_printf(file, " %5d%5d%6d%9d%6d%8d%8s\n",
+				   gru->gs_gid, gru->gs_blade_id, i,
+				   ts->ts_tgid_owner,
+				   ts->ts_cbr_au_count * GRU_CBR_AU_SIZE,
+				   ts->ts_cbr_au_count * GRU_DSR_AU_BYTES,
+				   mode[ts->ts_user_options &
+					GRU_OPT_MISS_MASK]);
+		}
+
+	return 0;
+}
+
+static int gru_seq_show(struct seq_file *file, void *data)
+{
+	long gid = *(long *)data, ctxfree, cbrfree, dsrfree;
+	struct gru_state *gru = GID_TO_GRU(gid);
+
+	if (gid == 0) {
+		seq_printf(file, "#%5s%5s%7s%6s%6s%8s%6s%6s\n", "gid", "nid",
+			   "ctx", "cbr", "dsr", "ctx", "cbr", "dsr");
+		seq_printf(file, "#%5s%5s%7s%6s%6s%8s%6s%6s\n", "", "", "busy",
+			   "busy", "busy", "free", "free", "free");
+	}
+	if (gru) {
+		ctxfree = GRU_NUM_CCH - gru->gs_active_contexts;
+		cbrfree = hweight64(gru->gs_cbr_map) * GRU_CBR_AU_SIZE;
+		dsrfree = hweight64(gru->gs_dsr_map) * GRU_DSR_AU_BYTES;
+		seq_printf(file, " %5d%5d%7ld%6ld%6ld%8ld%6ld%6ld\n",
+			   gru->gs_gid, gru->gs_blade_id, GRU_NUM_CCH - ctxfree,
+			   GRU_NUM_CBE - cbrfree, GRU_NUM_DSR_BYTES - dsrfree,
+			   ctxfree, cbrfree, dsrfree);
+	}
+
+	return 0;
+}
+
+static void seq_stop(struct seq_file *file, void *data)
+{
+}
+
+static void *seq_start(struct seq_file *file, loff_t *gid)
+{
+	if (*gid < GRU_MAX_GRUS)
+		return gid;
+	return NULL;
+}
+
+static void *seq_next(struct seq_file *file, void *data, loff_t *gid)
+{
+	(*gid)++;
+	if (*gid < GRU_MAX_GRUS)
+		return gid;
+	return NULL;
+}
+
+static const struct seq_operations cch_seq_ops = {
+	.start	= seq_start,
+	.next	= seq_next,
+	.stop	= seq_stop,
+	.show	= cch_seq_show
+};
+
+static const struct seq_operations gru_seq_ops = {
+	.start	= seq_start,
+	.next	= seq_next,
+	.stop	= seq_stop,
+	.show	= gru_seq_show
+};
+
+static int statistics_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, statistics_show, NULL);
+}
+
+static int options_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, options_show, NULL);
+}
+
+static int cch_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cch_seq_ops);
+}
+
+static int gru_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &gru_seq_ops);
+}
+
+/* *INDENT-OFF* */
+static const struct file_operations statistics_fops = {
+	.open 		= statistics_open,
+	.read 		= seq_read,
+	.write 		= statistics_write,
+	.llseek 	= seq_lseek,
+	.release 	= single_release,
+};
+
+static const struct file_operations options_fops = {
+	.open 		= options_open,
+	.read 		= seq_read,
+	.write 		= options_write,
+	.llseek 	= seq_lseek,
+	.release 	= single_release,
+};
+
+static const struct file_operations cch_fops = {
+	.open 		= cch_open,
+	.read 		= seq_read,
+	.llseek 	= seq_lseek,
+	.release 	= seq_release,
+};
+static const struct file_operations gru_fops = {
+	.open 		= gru_open,
+	.read 		= seq_read,
+	.llseek 	= seq_lseek,
+	.release 	= seq_release,
+};
+
+static struct proc_entry {
+	char *name;
+	int mode;
+	const struct file_operations *fops;
+	struct proc_dir_entry *entry;
+} proc_files[] = {
+	{"statistics", 0644, &statistics_fops},
+	{"debug_options", 0644, &options_fops},
+	{"cch_status", 0444, &cch_fops},
+	{"gru_status", 0444, &gru_fops},
+	{NULL}
+};
+/* *INDENT-ON* */
+
+static struct proc_dir_entry *proc_gru __read_mostly;
+
+static int create_proc_file(struct proc_entry *p)
+{
+	p->entry = create_proc_entry(p->name, p->mode, proc_gru);
+	if (!p->entry)
+		return -1;
+	p->entry->proc_fops = p->fops;
+	return 0;
+}
+
+static void delete_proc_files(void)
+{
+	struct proc_entry *p;
+
+	if (proc_gru) {
+		for (p = proc_files; p->name; p++)
+			if (p->entry)
+				remove_proc_entry(p->name, proc_gru);
+		remove_proc_entry("gru", NULL);
+	}
+}
+
+int gru_proc_init(void)
+{
+	struct proc_entry *p;
+
+	proc_mkdir("sgi_uv", NULL);
+	proc_gru = proc_mkdir("sgi_uv/gru", NULL);
+
+	for (p = proc_files; p->name; p++)
+		if (create_proc_file(p))
+			goto err;
+	return 0;
+
+err:
+	delete_proc_files();
+	return -1;
+}
+
+void gru_proc_exit(void)
+{
+	delete_proc_files();
+}
diff --git a/drivers/misc/sgi-gru/grutables.h b/drivers/misc/sgi-gru/grutables.h
new file mode 100644
index 00000000000..4251018f70f
--- /dev/null
+++ b/drivers/misc/sgi-gru/grutables.h
@@ -0,0 +1,609 @@
+/*
+ * SN Platform GRU Driver
+ *
+ *            GRU DRIVER TABLES, MACROS, externs, etc
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef __GRUTABLES_H__
+#define __GRUTABLES_H__
+
+/*
+ * GRU Chiplet:
+ *   The GRU is a user addressible memory accelerator. It provides
+ *   several forms of load, store, memset, bcopy instructions. In addition, it
+ *   contains special instructions for AMOs, sending messages to message
+ *   queues, etc.
+ *
+ *   The GRU is an integral part of the node controller. It connects
+ *   directly to the cpu socket. In its current implementation, there are 2
+ *   GRU chiplets in the node controller on each blade (~node).
+ *
+ *   The entire GRU memory space is fully coherent and cacheable by the cpus.
+ *
+ *   Each GRU chiplet has a physical memory map that looks like the following:
+ *
+ *   	+-----------------+
+ *   	|/////////////////|
+ *   	|/////////////////|
+ *   	|/////////////////|
+ *   	|/////////////////|
+ *   	|/////////////////|
+ *   	|/////////////////|
+ *   	|/////////////////|
+ *   	|/////////////////|
+ *   	+-----------------+
+ *   	|  system control |
+ *   	+-----------------+        _______ +-------------+
+ *   	|/////////////////|       /        |             |
+ *   	|/////////////////|      /         |             |
+ *   	|/////////////////|     /          | instructions|
+ *   	|/////////////////|    /           |             |
+ *   	|/////////////////|   /            |             |
+ *   	|/////////////////|  /             |-------------|
+ *   	|/////////////////| /              |             |
+ *   	+-----------------+                |             |
+ *   	|   context 15    |                |  data       |
+ *   	+-----------------+                |             |
+ *   	|    ......       | \              |             |
+ *   	+-----------------+  \____________ +-------------+
+ *   	|   context 1     |
+ *   	+-----------------+
+ *   	|   context 0     |
+ *   	+-----------------+
+ *
+ *   Each of the "contexts" is a chunk of memory that can be mmaped into user
+ *   space. The context consists of 2 parts:
+ *
+ *  	- an instruction space that can be directly accessed by the user
+ *  	  to issue GRU instructions and to check instruction status.
+ *
+ *  	- a data area that acts as normal RAM.
+ *
+ *   User instructions contain virtual addresses of data to be accessed by the
+ *   GRU. The GRU contains a TLB that is used to convert these user virtual
+ *   addresses to physical addresses.
+ *
+ *   The "system control" area of the GRU chiplet is used by the kernel driver
+ *   to manage user contexts and to perform functions such as TLB dropin and
+ *   purging.
+ *
+ *   One context may be reserved for the kernel and used for cross-partition
+ *   communication. The GRU will also be used to asynchronously zero out
+ *   large blocks of memory (not currently implemented).
+ *
+ *
+ * Tables:
+ *
+ * 	VDATA-VMA Data		- Holds a few parameters. Head of linked list of
+ * 				  GTS tables for threads using the GSEG
+ * 	GTS - Gru Thread State  - contains info for managing a GSEG context. A
+ * 				  GTS is allocated for each thread accessing a
+ * 				  GSEG.
+ *     	GTD - GRU Thread Data   - contains shadow copy of GRU data when GSEG is
+ *     				  not loaded into a GRU
+ *	GMS - GRU Memory Struct - Used to manage TLB shootdowns. Tracks GRUs
+ *				  where a GSEG has been loaded. Similar to
+ *				  an mm_struct but for GRU.
+ *
+ *	GS  - GRU State 	- Used to manage the state of a GRU chiplet
+ *	BS  - Blade State	- Used to manage state of all GRU chiplets
+ *				  on a blade
+ *
+ *
+ *  Normal task tables for task using GRU.
+ *  		- 2 threads in process
+ *  		- 2 GSEGs open in process
+ *  		- GSEG1 is being used by both threads
+ *  		- GSEG2 is used only by thread 2
+ *
+ *       task -->|
+ *       task ---+---> mm ->------ (notifier) -------+-> gms
+ *                     |                             |
+ *                     |--> vma -> vdata ---> gts--->|		GSEG1 (thread1)
+ *                     |                  |          |
+ *                     |                  +-> gts--->|		GSEG1 (thread2)
+ *                     |                             |
+ *                     |--> vma -> vdata ---> gts--->|		GSEG2 (thread2)
+ *                     .
+ *                     .
+ *
+ *  GSEGs are marked DONTCOPY on fork
+ *
+ * At open
+ * 	file.private_data -> NULL
+ *
+ * At mmap,
+ * 	vma -> vdata
+ *
+ * After gseg reference
+ * 	vma -> vdata ->gts
+ *
+ * After fork
+ *   parent
+ * 	vma -> vdata -> gts
+ *   child
+ * 	(vma is not copied)
+ *
+ */
+
+#include <linux/rmap.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/mmu_notifier.h>
+#include "gru.h"
+#include "gruhandles.h"
+
+extern struct gru_stats_s gru_stats;
+extern struct gru_blade_state *gru_base[];
+extern unsigned long gru_start_paddr, gru_end_paddr;
+
+#define GRU_MAX_BLADES		MAX_NUMNODES
+#define GRU_MAX_GRUS		(GRU_MAX_BLADES * GRU_CHIPLETS_PER_BLADE)
+
+#define GRU_DRIVER_ID_STR	"SGI GRU Device Driver"
+#define GRU_DRIVER_VERSION_STR	"0.80"
+
+/*
+ * GRU statistics.
+ */
+struct gru_stats_s {
+	atomic_long_t vdata_alloc;
+	atomic_long_t vdata_free;
+	atomic_long_t gts_alloc;
+	atomic_long_t gts_free;
+	atomic_long_t vdata_double_alloc;
+	atomic_long_t gts_double_allocate;
+	atomic_long_t assign_context;
+	atomic_long_t assign_context_failed;
+	atomic_long_t free_context;
+	atomic_long_t load_context;
+	atomic_long_t unload_context;
+	atomic_long_t steal_context;
+	atomic_long_t steal_context_failed;
+	atomic_long_t nopfn;
+	atomic_long_t break_cow;
+	atomic_long_t asid_new;
+	atomic_long_t asid_next;
+	atomic_long_t asid_wrap;
+	atomic_long_t asid_reuse;
+	atomic_long_t intr;
+	atomic_long_t call_os;
+	atomic_long_t call_os_check_for_bug;
+	atomic_long_t call_os_wait_queue;
+	atomic_long_t user_flush_tlb;
+	atomic_long_t user_unload_context;
+	atomic_long_t user_exception;
+	atomic_long_t set_task_slice;
+	atomic_long_t migrate_check;
+	atomic_long_t migrated_retarget;
+	atomic_long_t migrated_unload;
+	atomic_long_t migrated_unload_delay;
+	atomic_long_t migrated_nopfn_retarget;
+	atomic_long_t migrated_nopfn_unload;
+	atomic_long_t tlb_dropin;
+	atomic_long_t tlb_dropin_fail_no_asid;
+	atomic_long_t tlb_dropin_fail_upm;
+	atomic_long_t tlb_dropin_fail_invalid;
+	atomic_long_t tlb_dropin_fail_range_active;
+	atomic_long_t tlb_dropin_fail_idle;
+	atomic_long_t tlb_dropin_fail_fmm;
+	atomic_long_t mmu_invalidate_range;
+	atomic_long_t mmu_invalidate_page;
+	atomic_long_t mmu_clear_flush_young;
+	atomic_long_t flush_tlb;
+	atomic_long_t flush_tlb_gru;
+	atomic_long_t flush_tlb_gru_tgh;
+	atomic_long_t flush_tlb_gru_zero_asid;
+
+	atomic_long_t copy_gpa;
+
+	atomic_long_t mesq_receive;
+	atomic_long_t mesq_receive_none;
+	atomic_long_t mesq_send;
+	atomic_long_t mesq_send_failed;
+	atomic_long_t mesq_noop;
+	atomic_long_t mesq_send_unexpected_error;
+	atomic_long_t mesq_send_lb_overflow;
+	atomic_long_t mesq_send_qlimit_reached;
+	atomic_long_t mesq_send_amo_nacked;
+	atomic_long_t mesq_send_put_nacked;
+	atomic_long_t mesq_qf_not_full;
+	atomic_long_t mesq_qf_locked;
+	atomic_long_t mesq_qf_noop_not_full;
+	atomic_long_t mesq_qf_switch_head_failed;
+	atomic_long_t mesq_qf_unexpected_error;
+	atomic_long_t mesq_noop_unexpected_error;
+	atomic_long_t mesq_noop_lb_overflow;
+	atomic_long_t mesq_noop_qlimit_reached;
+	atomic_long_t mesq_noop_amo_nacked;
+	atomic_long_t mesq_noop_put_nacked;
+
+};
+
+#define OPT_DPRINT	1
+#define OPT_STATS	2
+#define GRU_QUICKLOOK	4
+
+
+#define IRQ_GRU			110	/* Starting IRQ number for interrupts */
+
+/* Delay in jiffies between attempts to assign a GRU context */
+#define GRU_ASSIGN_DELAY	((HZ * 20) / 1000)
+
+/*
+ * If a process has it's context stolen, min delay in jiffies before trying to
+ * steal a context from another process.
+ */
+#define GRU_STEAL_DELAY		((HZ * 200) / 1000)
+
+#define STAT(id)	do {						\
+				if (gru_options & OPT_STATS)		\
+					atomic_long_inc(&gru_stats.id);	\
+			} while (0)
+
+#ifdef CONFIG_SGI_GRU_DEBUG
+#define gru_dbg(dev, fmt, x...)						\
+	do {								\
+		if (gru_options & OPT_DPRINT)				\
+			dev_dbg(dev, "%s: " fmt, __func__, x);		\
+	} while (0)
+#else
+#define gru_dbg(x...)
+#endif
+
+/*-----------------------------------------------------------------------------
+ * ASID management
+ */
+#define MAX_ASID	0xfffff0
+#define MIN_ASID	8
+#define ASID_INC	8	/* number of regions */
+
+/* Generate a GRU asid value from a GRU base asid & a virtual address. */
+#if defined CONFIG_IA64
+#define VADDR_HI_BIT		64
+#define GRUREGION(addr)		((addr) >> (VADDR_HI_BIT - 3) & 3)
+#elif defined __x86_64
+#define VADDR_HI_BIT		48
+#define GRUREGION(addr)		(0)		/* ZZZ could do better */
+#else
+#error "Unsupported architecture"
+#endif
+#define GRUASID(asid, addr)	((asid) + GRUREGION(addr))
+
+/*------------------------------------------------------------------------------
+ *  File & VMS Tables
+ */
+
+struct gru_state;
+
+/*
+ * This structure is pointed to from the mmstruct via the notifier pointer.
+ * There is one of these per address space.
+ */
+struct gru_mm_tracker {
+	unsigned int		mt_asid_gen;	/* ASID wrap count */
+	int			mt_asid;	/* current base ASID for gru */
+	unsigned short		mt_ctxbitmap;	/* bitmap of contexts using
+						   asid */
+};
+
+struct gru_mm_struct {
+	struct mmu_notifier	ms_notifier;
+	atomic_t		ms_refcnt;
+	spinlock_t		ms_asid_lock;	/* protects ASID assignment */
+	atomic_t		ms_range_active;/* num range_invals active */
+	char			ms_released;
+	wait_queue_head_t	ms_wait_queue;
+	DECLARE_BITMAP(ms_asidmap, GRU_MAX_GRUS);
+	struct gru_mm_tracker	ms_asids[GRU_MAX_GRUS];
+};
+
+/*
+ * One of these structures is allocated when a GSEG is mmaped. The
+ * structure is pointed to by the vma->vm_private_data field in the vma struct.
+ */
+struct gru_vma_data {
+	spinlock_t		vd_lock;	/* Serialize access to vma */
+	struct list_head	vd_head;	/* head of linked list of gts */
+	long			vd_user_options;/* misc user option flags */
+	int			vd_cbr_au_count;
+	int			vd_dsr_au_count;
+};
+
+/*
+ * One of these is allocated for each thread accessing a mmaped GRU. A linked
+ * list of these structure is hung off the struct gru_vma_data in the mm_struct.
+ */
+struct gru_thread_state {
+	struct list_head	ts_next;	/* list - head at vma-private */
+	struct mutex		ts_ctxlock;	/* load/unload CTX lock */
+	struct mm_struct	*ts_mm;		/* mm currently mapped to
+						   context */
+	struct vm_area_struct	*ts_vma;	/* vma of GRU context */
+	struct gru_state	*ts_gru;	/* GRU where the context is
+						   loaded */
+	struct gru_mm_struct	*ts_gms;	/* asid & ioproc struct */
+	unsigned long		ts_cbr_map;	/* map of allocated CBRs */
+	unsigned long		ts_dsr_map;	/* map of allocated DATA
+						   resources */
+	unsigned long		ts_steal_jiffies;/* jiffies when context last
+						    stolen */
+	long			ts_user_options;/* misc user option flags */
+	pid_t			ts_tgid_owner;	/* task that is using the
+						   context - for migration */
+	int			ts_tsid;	/* thread that owns the
+						   structure */
+	int			ts_tlb_int_select;/* target cpu if interrupts
+						     enabled */
+	int			ts_ctxnum;	/* context number where the
+						   context is loaded */
+	atomic_t		ts_refcnt;	/* reference count GTS */
+	unsigned char		ts_dsr_au_count;/* Number of DSR resources
+						   required for contest */
+	unsigned char		ts_cbr_au_count;/* Number of CBR resources
+						   required for contest */
+	char			ts_force_unload;/* force context to be unloaded
+						   after migration */
+	char			ts_cbr_idx[GRU_CBR_AU];/* CBR numbers of each
+							  allocated CB */
+	unsigned long		ts_gdata[0];	/* save area for GRU data (CB,
+						   DS, CBE) */
+};
+
+/*
+ * Threaded programs actually allocate an array of GSEGs when a context is
+ * created. Each thread uses a separate GSEG. TSID is the index into the GSEG
+ * array.
+ */
+#define TSID(a, v)		(((a) - (v)->vm_start) / GRU_GSEG_PAGESIZE)
+#define UGRUADDR(gts)		((gts)->ts_vma->vm_start +		\
+					(gts)->ts_tsid * GRU_GSEG_PAGESIZE)
+
+#define NULLCTX			(-1)	/* if context not loaded into GRU */
+
+/*-----------------------------------------------------------------------------
+ *  GRU State Tables
+ */
+
+/*
+ * One of these exists for each GRU chiplet.
+ */
+struct gru_state {
+	struct gru_blade_state	*gs_blade;		/* GRU state for entire
+							   blade */
+	unsigned long		gs_gru_base_paddr;	/* Physical address of
+							   gru segments (64) */
+	void			*gs_gru_base_vaddr;	/* Virtual address of
+							   gru segments (64) */
+	unsigned char		gs_gid;			/* unique GRU number */
+	unsigned char		gs_tgh_local_shift;	/* used to pick TGH for
+							   local flush */
+	unsigned char		gs_tgh_first_remote;	/* starting TGH# for
+							   remote flush */
+	unsigned short		gs_blade_id;		/* blade of GRU */
+	spinlock_t		gs_asid_lock;		/* lock used for
+							   assigning asids */
+	spinlock_t		gs_lock;		/* lock used for
+							   assigning contexts */
+
+	/* -- the following are protected by the gs_asid_lock spinlock ---- */
+	unsigned int		gs_asid;		/* Next availe ASID */
+	unsigned int		gs_asid_limit;		/* Limit of available
+							   ASIDs */
+	unsigned int		gs_asid_gen;		/* asid generation.
+							   Inc on wrap */
+
+	/* --- the following fields are protected by the gs_lock spinlock --- */
+	unsigned long		gs_context_map;		/* bitmap to manage
+							   contexts in use */
+	unsigned long		gs_cbr_map;		/* bitmap to manage CB
+							   resources */
+	unsigned long		gs_dsr_map;		/* bitmap used to manage
+							   DATA resources */
+	unsigned int		gs_reserved_cbrs;	/* Number of kernel-
+							   reserved cbrs */
+	unsigned int		gs_reserved_dsr_bytes;	/* Bytes of kernel-
+							   reserved dsrs */
+	unsigned short		gs_active_contexts;	/* number of contexts
+							   in use */
+	struct gru_thread_state	*gs_gts[GRU_NUM_CCH];	/* GTS currently using
+							   the context */
+};
+
+/*
+ * This structure contains the GRU state for all the GRUs on a blade.
+ */
+struct gru_blade_state {
+	void			*kernel_cb;		/* First kernel
+							   reserved cb */
+	void			*kernel_dsr;		/* First kernel
+							   reserved DSR */
+	/* ---- the following are protected by the bs_lock spinlock ---- */
+	spinlock_t		bs_lock;		/* lock used for
+							   stealing contexts */
+	int			bs_lru_ctxnum;		/* STEAL - last context
+							   stolen */
+	struct gru_state	*bs_lru_gru;		/* STEAL - last gru
+							   stolen */
+
+	struct gru_state	bs_grus[GRU_CHIPLETS_PER_BLADE];
+};
+
+/*-----------------------------------------------------------------------------
+ * Address Primitives
+ */
+#define get_tfm_for_cpu(g, c)						\
+	((struct gru_tlb_fault_map *)get_tfm((g)->gs_gru_base_vaddr, (c)))
+#define get_tfh_by_index(g, i)						\
+	((struct gru_tlb_fault_handle *)get_tfh((g)->gs_gru_base_vaddr, (i)))
+#define get_tgh_by_index(g, i)						\
+	((struct gru_tlb_global_handle *)get_tgh((g)->gs_gru_base_vaddr, (i)))
+#define get_cbe_by_index(g, i)						\
+	((struct gru_control_block_extended *)get_cbe((g)->gs_gru_base_vaddr,\
+			(i)))
+
+/*-----------------------------------------------------------------------------
+ * Useful Macros
+ */
+
+/* Given a blade# & chiplet#, get a pointer to the GRU */
+#define get_gru(b, c)		(&gru_base[b]->bs_grus[c])
+
+/* Number of bytes to save/restore when unloading/loading GRU contexts */
+#define DSR_BYTES(dsr)		((dsr) * GRU_DSR_AU_BYTES)
+#define CBR_BYTES(cbr)		((cbr) * GRU_HANDLE_BYTES * GRU_CBR_AU_SIZE * 2)
+
+/* Convert a user CB number to the actual CBRNUM */
+#define thread_cbr_number(gts, n) ((gts)->ts_cbr_idx[(n) / GRU_CBR_AU_SIZE] \
+				  * GRU_CBR_AU_SIZE + (n) % GRU_CBR_AU_SIZE)
+
+/* Convert a gid to a pointer to the GRU */
+#define GID_TO_GRU(gid)							\
+	(gru_base[(gid) / GRU_CHIPLETS_PER_BLADE] ?			\
+		(&gru_base[(gid) / GRU_CHIPLETS_PER_BLADE]->		\
+			bs_grus[(gid) % GRU_CHIPLETS_PER_BLADE]) :	\
+	 NULL)
+
+/* Scan all active GRUs in a GRU bitmap */
+#define for_each_gru_in_bitmap(gid, map)				\
+	for ((gid) = find_first_bit((map), GRU_MAX_GRUS); (gid) < GRU_MAX_GRUS;\
+		(gid)++, (gid) = find_next_bit((map), GRU_MAX_GRUS, (gid)))
+
+/* Scan all active GRUs on a specific blade */
+#define for_each_gru_on_blade(gru, nid, i)				\
+	for ((gru) = gru_base[nid]->bs_grus, (i) = 0;			\
+			(i) < GRU_CHIPLETS_PER_BLADE;			\
+			(i)++, (gru)++)
+
+/* Scan all active GTSs on a gru. Note: must hold ss_lock to use this macro. */
+#define for_each_gts_on_gru(gts, gru, ctxnum)				\
+	for ((ctxnum) = 0; (ctxnum) < GRU_NUM_CCH; (ctxnum)++)		\
+		if (((gts) = (gru)->gs_gts[ctxnum]))
+
+/* Scan each CBR whose bit is set in a TFM (or copy of) */
+#define for_each_cbr_in_tfm(i, map)					\
+	for ((i) = find_first_bit(map, GRU_NUM_CBE);			\
+			(i) < GRU_NUM_CBE;				\
+			(i)++, (i) = find_next_bit(map, GRU_NUM_CBE, i))
+
+/* Scan each CBR in a CBR bitmap. Note: multiple CBRs in an allocation unit */
+#define for_each_cbr_in_allocation_map(i, map, k)			\
+	for ((k) = find_first_bit(map, GRU_CBR_AU); (k) < GRU_CBR_AU;	\
+			(k) = find_next_bit(map, GRU_CBR_AU, (k) + 1)) 	\
+		for ((i) = (k)*GRU_CBR_AU_SIZE;				\
+				(i) < ((k) + 1) * GRU_CBR_AU_SIZE; (i)++)
+
+/* Scan each DSR in a DSR bitmap. Note: multiple DSRs in an allocation unit */
+#define for_each_dsr_in_allocation_map(i, map, k)			\
+	for ((k) = find_first_bit((const unsigned long *)map, GRU_DSR_AU);\
+			(k) < GRU_DSR_AU;				\
+			(k) = find_next_bit((const unsigned long *)map,	\
+					  GRU_DSR_AU, (k) + 1))		\
+		for ((i) = (k) * GRU_DSR_AU_CL;				\
+				(i) < ((k) + 1) * GRU_DSR_AU_CL; (i)++)
+
+#define gseg_physical_address(gru, ctxnum)				\
+		((gru)->gs_gru_base_paddr + ctxnum * GRU_GSEG_STRIDE)
+#define gseg_virtual_address(gru, ctxnum)				\
+		((gru)->gs_gru_base_vaddr + ctxnum * GRU_GSEG_STRIDE)
+
+/*-----------------------------------------------------------------------------
+ * Lock / Unlock GRU handles
+ * 	Use the "delresp" bit in the handle as a "lock" bit.
+ */
+
+/* Lock hierarchy checking enabled only in emulator */
+
+static inline void __lock_handle(void *h)
+{
+	while (test_and_set_bit(1, h))
+		cpu_relax();
+}
+
+static inline void __unlock_handle(void *h)
+{
+	clear_bit(1, h);
+}
+
+static inline void lock_cch_handle(struct gru_context_configuration_handle *cch)
+{
+	__lock_handle(cch);
+}
+
+static inline void unlock_cch_handle(struct gru_context_configuration_handle
+				     *cch)
+{
+	__unlock_handle(cch);
+}
+
+static inline void lock_tgh_handle(struct gru_tlb_global_handle *tgh)
+{
+	__lock_handle(tgh);
+}
+
+static inline void unlock_tgh_handle(struct gru_tlb_global_handle *tgh)
+{
+	__unlock_handle(tgh);
+}
+
+/*-----------------------------------------------------------------------------
+ * Function prototypes & externs
+ */
+struct gru_unload_context_req;
+
+extern struct vm_operations_struct gru_vm_ops;
+extern struct device *grudev;
+
+extern struct gru_vma_data *gru_alloc_vma_data(struct vm_area_struct *vma,
+				int tsid);
+extern struct gru_thread_state *gru_find_thread_state(struct vm_area_struct
+				*vma, int tsid);
+extern struct gru_thread_state *gru_alloc_thread_state(struct vm_area_struct
+				*vma, int tsid);
+extern void gru_unload_context(struct gru_thread_state *gts, int savestate);
+extern void gts_drop(struct gru_thread_state *gts);
+extern void gru_tgh_flush_init(struct gru_state *gru);
+extern int gru_kservices_init(struct gru_state *gru);
+extern irqreturn_t gru_intr(int irq, void *dev_id);
+extern int gru_handle_user_call_os(unsigned long address);
+extern int gru_user_flush_tlb(unsigned long arg);
+extern int gru_user_unload_context(unsigned long arg);
+extern int gru_get_exception_detail(unsigned long arg);
+extern int gru_set_task_slice(long address);
+extern int gru_cpu_fault_map_id(void);
+extern struct vm_area_struct *gru_find_vma(unsigned long vaddr);
+extern void gru_flush_all_tlb(struct gru_state *gru);
+extern int gru_proc_init(void);
+extern void gru_proc_exit(void);
+
+extern unsigned long gru_reserve_cb_resources(struct gru_state *gru,
+		int cbr_au_count, char *cbmap);
+extern unsigned long gru_reserve_ds_resources(struct gru_state *gru,
+		int dsr_au_count, char *dsmap);
+extern int gru_fault(struct vm_area_struct *, struct vm_fault *vmf);
+extern struct gru_mm_struct *gru_register_mmu_notifier(void);
+extern void gru_drop_mmu_notifier(struct gru_mm_struct *gms);
+
+extern void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start,
+					unsigned long len);
+
+extern unsigned long gru_options;
+
+#endif /* __GRUTABLES_H__ */
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
new file mode 100644
index 00000000000..c84496a7769
--- /dev/null
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -0,0 +1,371 @@
+/*
+ * SN Platform GRU Driver
+ *
+ * 		MMUOPS callbacks  + TLB flushing
+ *
+ * This file handles emu notifier callbacks from the core kernel. The callbacks
+ * are used to update the TLB in the GRU as a result of changes in the
+ * state of a process address space. This file also handles TLB invalidates
+ * from the GRU driver.
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/hugetlb.h>
+#include <linux/delay.h>
+#include <linux/timex.h>
+#include <linux/srcu.h>
+#include <asm/processor.h>
+#include "gru.h"
+#include "grutables.h"
+#include <asm/uv/uv_hub.h>
+
+#define gru_random()	get_cycles()
+
+/* ---------------------------------- TLB Invalidation functions --------
+ * get_tgh_handle
+ *
+ * Find a TGH to use for issuing a TLB invalidate. For GRUs that are on the
+ * local blade, use a fixed TGH that is a function of the blade-local cpu
+ * number. Normally, this TGH is private to the cpu & no contention occurs for
+ * the TGH. For offblade GRUs, select a random TGH in the range above the
+ * private TGHs. A spinlock is required to access this TGH & the lock must be
+ * released when the invalidate is completes. This sucks, but it is the best we
+ * can do.
+ *
+ * Note that the spinlock is IN the TGH handle so locking does not involve
+ * additional cache lines.
+ *
+ */
+static inline int get_off_blade_tgh(struct gru_state *gru)
+{
+	int n;
+
+	n = GRU_NUM_TGH - gru->gs_tgh_first_remote;
+	n = gru_random() % n;
+	n += gru->gs_tgh_first_remote;
+	return n;
+}
+
+static inline int get_on_blade_tgh(struct gru_state *gru)
+{
+	return uv_blade_processor_id() >> gru->gs_tgh_local_shift;
+}
+
+static struct gru_tlb_global_handle *get_lock_tgh_handle(struct gru_state
+							 *gru)
+{
+	struct gru_tlb_global_handle *tgh;
+	int n;
+
+	preempt_disable();
+	if (uv_numa_blade_id() == gru->gs_blade_id)
+		n = get_on_blade_tgh(gru);
+	else
+		n = get_off_blade_tgh(gru);
+	tgh = get_tgh_by_index(gru, n);
+	lock_tgh_handle(tgh);
+
+	return tgh;
+}
+
+static void get_unlock_tgh_handle(struct gru_tlb_global_handle *tgh)
+{
+	unlock_tgh_handle(tgh);
+	preempt_enable();
+}
+
+/*
+ * gru_flush_tlb_range
+ *
+ * General purpose TLB invalidation function. This function scans every GRU in
+ * the ENTIRE system (partition) looking for GRUs where the specified MM has
+ * been accessed by the GRU. For each GRU found, the TLB must be invalidated OR
+ * the ASID invalidated. Invalidating an ASID causes a new ASID to be assigned
+ * on the next fault. This effectively flushes the ENTIRE TLB for the MM at the
+ * cost of (possibly) a large number of future TLBmisses.
+ *
+ * The current algorithm is optimized based on the following (somewhat true)
+ * assumptions:
+ * 	- GRU contexts are not loaded into a GRU unless a reference is made to
+ * 	  the data segment or control block (this is true, not an assumption).
+ * 	  If a DS/CB is referenced, the user will also issue instructions that
+ * 	  cause TLBmisses. It is not necessary to optimize for the case where
+ * 	  contexts are loaded but no instructions cause TLB misses. (I know
+ * 	  this will happen but I'm not optimizing for it).
+ * 	- GRU instructions to invalidate TLB entries are SLOOOOWWW - normally
+ * 	  a few usec but in unusual cases, it could be longer. Avoid if
+ * 	  possible.
+ * 	- intrablade process migration between cpus is not frequent but is
+ * 	  common.
+ * 	- a GRU context is not typically migrated to a different GRU on the
+ * 	  blade because of intrablade migration
+ *	- interblade migration is rare. Processes migrate their GRU context to
+ *	  the new blade.
+ *	- if interblade migration occurs, migration back to the original blade
+ *	  is very very rare (ie., no optimization for this case)
+ *	- most GRU instruction operate on a subset of the user REGIONS. Code
+ *	  & shared library regions are not likely targets of GRU instructions.
+ *
+ * To help improve the efficiency of TLB invalidation, the GMS data
+ * structure is maintained for EACH address space (MM struct). The GMS is
+ * also the structure that contains the pointer to the mmu callout
+ * functions. This structure is linked to the mm_struct for the address space
+ * using the mmu "register" function. The mmu interfaces are used to
+ * provide the callbacks for TLB invalidation. The GMS contains:
+ *
+ * 	- asid[maxgrus] array. ASIDs are assigned to a GRU when a context is
+ * 	  loaded into the GRU.
+ * 	- asidmap[maxgrus]. bitmap to make it easier to find non-zero asids in
+ * 	  the above array
+ *	- ctxbitmap[maxgrus]. Indicates the contexts that are currently active
+ *	  in the GRU for the address space. This bitmap must be passed to the
+ *	  GRU to do an invalidate.
+ *
+ * The current algorithm for invalidating TLBs is:
+ * 	- scan the asidmap for GRUs where the context has been loaded, ie,
+ * 	  asid is non-zero.
+ * 	- for each gru found:
+ * 		- if the ctxtmap is non-zero, there are active contexts in the
+ * 		  GRU. TLB invalidate instructions must be issued to the GRU.
+ *		- if the ctxtmap is zero, no context is active. Set the ASID to
+ *		  zero to force a full TLB invalidation. This is fast but will
+ *		  cause a lot of TLB misses if the context is reloaded onto the
+ *		  GRU
+ *
+ */
+
+void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start,
+			 unsigned long len)
+{
+	struct gru_state *gru;
+	struct gru_mm_tracker *asids;
+	struct gru_tlb_global_handle *tgh;
+	unsigned long num;
+	int grupagesize, pagesize, pageshift, gid, asid;
+
+	/* ZZZ TODO - handle huge pages */
+	pageshift = PAGE_SHIFT;
+	pagesize = (1UL << pageshift);
+	grupagesize = GRU_PAGESIZE(pageshift);
+	num = min(((len + pagesize - 1) >> pageshift), GRUMAXINVAL);
+
+	STAT(flush_tlb);
+	gru_dbg(grudev, "gms %p, start 0x%lx, len 0x%lx, asidmap 0x%lx\n", gms,
+		start, len, gms->ms_asidmap[0]);
+
+	spin_lock(&gms->ms_asid_lock);
+	for_each_gru_in_bitmap(gid, gms->ms_asidmap) {
+		STAT(flush_tlb_gru);
+		gru = GID_TO_GRU(gid);
+		asids = gms->ms_asids + gid;
+		asid = asids->mt_asid;
+		if (asids->mt_ctxbitmap && asid) {
+			STAT(flush_tlb_gru_tgh);
+			asid = GRUASID(asid, start);
+			gru_dbg(grudev,
+	"  FLUSH gruid %d, asid 0x%x, num %ld, cbmap 0x%x\n",
+				gid, asid, num, asids->mt_ctxbitmap);
+			tgh = get_lock_tgh_handle(gru);
+			tgh_invalidate(tgh, start, 0, asid, grupagesize, 0,
+				       num - 1, asids->mt_ctxbitmap);
+			get_unlock_tgh_handle(tgh);
+		} else {
+			STAT(flush_tlb_gru_zero_asid);
+			asids->mt_asid = 0;
+			__clear_bit(gru->gs_gid, gms->ms_asidmap);
+			gru_dbg(grudev,
+	"  CLEARASID gruid %d, asid 0x%x, cbtmap 0x%x, asidmap 0x%lx\n",
+				gid, asid, asids->mt_ctxbitmap,
+				gms->ms_asidmap[0]);
+		}
+	}
+	spin_unlock(&gms->ms_asid_lock);
+}
+
+/*
+ * Flush the entire TLB on a chiplet.
+ */
+void gru_flush_all_tlb(struct gru_state *gru)
+{
+	struct gru_tlb_global_handle *tgh;
+
+	gru_dbg(grudev, "gru %p, gid %d\n", gru, gru->gs_gid);
+	tgh = get_lock_tgh_handle(gru);
+	tgh_invalidate(tgh, 0, ~0, 0, 1, 1, GRUMAXINVAL - 1, 0);
+	get_unlock_tgh_handle(tgh);
+	preempt_enable();
+}
+
+/*
+ * MMUOPS notifier callout functions
+ */
+static void gru_invalidate_range_start(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long start, unsigned long end)
+{
+	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
+						 ms_notifier);
+
+	STAT(mmu_invalidate_range);
+	atomic_inc(&gms->ms_range_active);
+	gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms,
+		start, end, atomic_read(&gms->ms_range_active));
+	gru_flush_tlb_range(gms, start, end - start);
+}
+
+static void gru_invalidate_range_end(struct mmu_notifier *mn,
+				     struct mm_struct *mm, unsigned long start,
+				     unsigned long end)
+{
+	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
+						 ms_notifier);
+
+	/* ..._and_test() provides needed barrier */
+	(void)atomic_dec_and_test(&gms->ms_range_active);
+
+	wake_up_all(&gms->ms_wait_queue);
+	gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx\n", gms, start, end);
+}
+
+static void gru_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm,
+				unsigned long address)
+{
+	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
+						 ms_notifier);
+
+	STAT(mmu_invalidate_page);
+	gru_flush_tlb_range(gms, address, PAGE_SIZE);
+	gru_dbg(grudev, "gms %p, address 0x%lx\n", gms, address);
+}
+
+static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
+						 ms_notifier);
+
+	gms->ms_released = 1;
+	gru_dbg(grudev, "gms %p\n", gms);
+}
+
+
+static const struct mmu_notifier_ops gru_mmuops = {
+	.invalidate_page	= gru_invalidate_page,
+	.invalidate_range_start	= gru_invalidate_range_start,
+	.invalidate_range_end	= gru_invalidate_range_end,
+	.release		= gru_release,
+};
+
+/* Move this to the basic mmu_notifier file. But for now... */
+static struct mmu_notifier *mmu_find_ops(struct mm_struct *mm,
+			const struct mmu_notifier_ops *ops)
+{
+	struct mmu_notifier *mn, *gru_mn = NULL;
+	struct hlist_node *n;
+
+	if (mm->mmu_notifier_mm) {
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list,
+					 hlist)
+		    if (mn->ops == ops) {
+			gru_mn = mn;
+			break;
+		}
+		rcu_read_unlock();
+	}
+	return gru_mn;
+}
+
+struct gru_mm_struct *gru_register_mmu_notifier(void)
+{
+	struct gru_mm_struct *gms;
+	struct mmu_notifier *mn;
+
+	mn = mmu_find_ops(current->mm, &gru_mmuops);
+	if (mn) {
+		gms = container_of(mn, struct gru_mm_struct, ms_notifier);
+		atomic_inc(&gms->ms_refcnt);
+	} else {
+		gms = kzalloc(sizeof(*gms), GFP_KERNEL);
+		if (gms) {
+			spin_lock_init(&gms->ms_asid_lock);
+			gms->ms_notifier.ops = &gru_mmuops;
+			atomic_set(&gms->ms_refcnt, 1);
+			init_waitqueue_head(&gms->ms_wait_queue);
+			__mmu_notifier_register(&gms->ms_notifier, current->mm);
+		}
+	}
+	gru_dbg(grudev, "gms %p, refcnt %d\n", gms,
+		atomic_read(&gms->ms_refcnt));
+	return gms;
+}
+
+void gru_drop_mmu_notifier(struct gru_mm_struct *gms)
+{
+	gru_dbg(grudev, "gms %p, refcnt %d, released %d\n", gms,
+		atomic_read(&gms->ms_refcnt), gms->ms_released);
+	if (atomic_dec_return(&gms->ms_refcnt) == 0) {
+		if (!gms->ms_released)
+			mmu_notifier_unregister(&gms->ms_notifier, current->mm);
+		kfree(gms);
+	}
+}
+
+/*
+ * Setup TGH parameters. There are:
+ * 	- 24 TGH handles per GRU chiplet
+ * 	- a portion (MAX_LOCAL_TGH) of the handles are reserved for
+ * 	  use by blade-local cpus
+ * 	- the rest are used by off-blade cpus. This usage is
+ * 	  less frequent than blade-local usage.
+ *
+ * For now, use 16 handles for local flushes, 8 for remote flushes. If the blade
+ * has less tan or equal to 16 cpus, each cpu has a unique handle that it can
+ * use.
+ */
+#define MAX_LOCAL_TGH	16
+
+void gru_tgh_flush_init(struct gru_state *gru)
+{
+	int cpus, shift = 0, n;
+
+	cpus = uv_blade_nr_possible_cpus(gru->gs_blade_id);
+
+	/* n = cpus rounded up to next power of 2 */
+	if (cpus) {
+		n = 1 << fls(cpus - 1);
+
+		/*
+		 * shift count for converting local cpu# to TGH index
+		 *      0 if cpus <= MAX_LOCAL_TGH,
+		 *      1 if cpus <= 2*MAX_LOCAL_TGH,
+		 *      etc
+		 */
+		shift = max(0, fls(n - 1) - fls(MAX_LOCAL_TGH - 1));
+	}
+	gru->gs_tgh_local_shift = shift;
+
+	/* first starting TGH index to use for remote purges */
+	gru->gs_tgh_first_remote = (cpus + (1 << shift) - 1) >> shift;
+
+}