From b299eb5cde1a91706c450804006c6559b0826df8 Mon Sep 17 00:00:00 2001
From: Durgadoss R <durgadoss.r@intel.com>
Date: Thu, 3 Mar 2011 04:30:13 +0530
Subject: ACPI:Fix goto flows in thermal-sys

This patch fixes two minor bugs in thermal_sys:
(a) The flow of goto's in thermal_hwmon_add_sysfs.
(b) Remove the temp*_crit only if there is a get_crit_temp defined, in
    thermal_remove_hwmon_sysfs.

Signed-off-by: Durgadoss R <durgadoss.r@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/thermal/thermal_sys.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c
index 713b7ea4a60..0b1c82ad680 100644
--- a/drivers/thermal/thermal_sys.c
+++ b/drivers/thermal/thermal_sys.c
@@ -499,7 +499,7 @@ thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
 	dev_set_drvdata(hwmon->device, hwmon);
 	result = device_create_file(hwmon->device, &dev_attr_name);
 	if (result)
-		goto unregister_hwmon_device;
+		goto free_mem;
 
  register_sys_interface:
 	tz->hwmon = hwmon;
@@ -513,7 +513,7 @@ thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
 	sysfs_attr_init(&tz->temp_input.attr.attr);
 	result = device_create_file(hwmon->device, &tz->temp_input.attr);
 	if (result)
-		goto unregister_hwmon_device;
+		goto unregister_name;
 
 	if (tz->ops->get_crit_temp) {
 		unsigned long temperature;
@@ -527,7 +527,7 @@ thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
 			result = device_create_file(hwmon->device,
 						    &tz->temp_crit.attr);
 			if (result)
-				goto unregister_hwmon_device;
+				goto unregister_input;
 		}
 	}
 
@@ -539,9 +539,9 @@ thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
 
 	return 0;
 
- unregister_hwmon_device:
-	device_remove_file(hwmon->device, &tz->temp_crit.attr);
+ unregister_input:
 	device_remove_file(hwmon->device, &tz->temp_input.attr);
+ unregister_name:
 	if (new_hwmon_device) {
 		device_remove_file(hwmon->device, &dev_attr_name);
 		hwmon_device_unregister(hwmon->device);
@@ -560,7 +560,8 @@ thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz)
 
 	tz->hwmon = NULL;
 	device_remove_file(hwmon->device, &tz->temp_input.attr);
-	device_remove_file(hwmon->device, &tz->temp_crit.attr);
+	if (tz->ops->get_crit_temp)
+		device_remove_file(hwmon->device, &tz->temp_crit.attr);
 
 	mutex_lock(&thermal_list_lock);
 	list_del(&tz->hwmon_node);
-- 
cgit v1.2.3-70-g09d2


From 9f63b88bd7a1ac1afbb4358772a39abaeddbdd13 Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Wed, 23 Mar 2011 17:26:34 +0800
Subject: ACPI: osl, add acpi_os_create_lock interface

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/osl.c      | 33 +++++++++++++++++++++++++--------
 include/acpi/acpiosxf.h |  3 +++
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index c90c76aa7f8..cf750a7a952 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -883,14 +883,6 @@ void acpi_os_wait_events_complete(void *context)
 
 EXPORT_SYMBOL(acpi_os_wait_events_complete);
 
-/*
- * Deallocate the memory for a spinlock.
- */
-void acpi_os_delete_lock(acpi_spinlock handle)
-{
-	return;
-}
-
 acpi_status
 acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle)
 {
@@ -1321,6 +1313,31 @@ int acpi_resources_are_enforced(void)
 }
 EXPORT_SYMBOL(acpi_resources_are_enforced);
 
+/*
+ * Create and initialize a spinlock.
+ */
+acpi_status
+acpi_os_create_lock(acpi_spinlock *out_handle)
+{
+	spinlock_t *lock;
+
+	lock = ACPI_ALLOCATE(sizeof(spinlock_t));
+	if (!lock)
+		return AE_NO_MEMORY;
+	spin_lock_init(lock);
+	*out_handle = lock;
+
+	return AE_OK;
+}
+
+/*
+ * Deallocate the memory for a spinlock.
+ */
+void acpi_os_delete_lock(acpi_spinlock handle)
+{
+	ACPI_FREE(handle);
+}
+
 /*
  * Acquire a spinlock.
  *
diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h
index a3252a5ead6..a756bc8d866 100644
--- a/include/acpi/acpiosxf.h
+++ b/include/acpi/acpiosxf.h
@@ -98,6 +98,9 @@ acpi_os_table_override(struct acpi_table_header *existing_table,
 /*
  * Spinlock primitives
  */
+acpi_status
+acpi_os_create_lock(acpi_spinlock *out_handle);
+
 void acpi_os_delete_lock(acpi_spinlock handle);
 
 acpi_cpu_flags acpi_os_acquire_lock(acpi_spinlock handle);
-- 
cgit v1.2.3-70-g09d2


From 3854c8e32f46ffa6ee0bf2eb01137f5a48b2754f Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Wed, 23 Mar 2011 17:26:35 +0800
Subject: ACPICA: Use acpi_os_create_lock interface

Replace spin_lock_init with acpi_os_create_lock.

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/acpica/acglobal.h |  9 +++------
 drivers/acpi/acpica/utmutex.c  | 17 ++++++++++++++---
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
index d69750b83b3..6d512fcabdb 100644
--- a/drivers/acpi/acpica/acglobal.h
+++ b/drivers/acpi/acpica/acglobal.h
@@ -226,12 +226,9 @@ ACPI_EXTERN u8 acpi_gbl_global_lock_present;
  * Spinlocks are used for interfaces that can be possibly called at
  * interrupt level
  */
-ACPI_EXTERN spinlock_t _acpi_gbl_gpe_lock;	/* For GPE data structs and registers */
-ACPI_EXTERN spinlock_t _acpi_gbl_hardware_lock;	/* For ACPI H/W except GPE registers */
-ACPI_EXTERN spinlock_t _acpi_ev_global_lock_pending_lock; /* For global lock */
-#define acpi_gbl_gpe_lock	&_acpi_gbl_gpe_lock
-#define acpi_gbl_hardware_lock	&_acpi_gbl_hardware_lock
-#define acpi_ev_global_lock_pending_lock &_acpi_ev_global_lock_pending_lock
+ACPI_EXTERN acpi_spinlock acpi_gbl_gpe_lock;	/* For GPE data structs and registers */
+ACPI_EXTERN acpi_spinlock acpi_gbl_hardware_lock;	/* For ACPI H/W except GPE registers */
+ACPI_EXTERN acpi_spinlock acpi_ev_global_lock_pending_lock; /* For global lock */
 
 /*****************************************************************************
  *
diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
index a946c689f03..519d4ee9b45 100644
--- a/drivers/acpi/acpica/utmutex.c
+++ b/drivers/acpi/acpica/utmutex.c
@@ -83,9 +83,20 @@ acpi_status acpi_ut_mutex_initialize(void)
 
 	/* Create the spinlocks for use at interrupt level */
 
-	spin_lock_init(acpi_gbl_gpe_lock);
-	spin_lock_init(acpi_gbl_hardware_lock);
-	spin_lock_init(acpi_ev_global_lock_pending_lock);
+	status = acpi_os_create_lock (&acpi_gbl_gpe_lock);
+	if (ACPI_FAILURE (status)) {
+		return_ACPI_STATUS (status);
+	}
+
+	status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
+	if (ACPI_FAILURE (status)) {
+		return_ACPI_STATUS (status);
+	}
+
+	status = acpi_os_create_lock (&acpi_ev_global_lock_pending_lock);
+	if (ACPI_FAILURE (status)) {
+		return_ACPI_STATUS (status);
+	}
 
 	/* Mutex for _OSI support */
 	status = acpi_os_create_mutex(&acpi_gbl_osi_mutex);
-- 
cgit v1.2.3-70-g09d2


From 749c27639b95c5c4a8185e02a4efb189034944ed Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Wed, 23 Mar 2011 17:26:36 +0800
Subject: ACPICA: Fix code divergence of global lock handling

Commit 9cd0314(ACPI / ACPICA: Fix global lock acquisition) was backported
into ACPICA code base, and some divergence was introduced.

This patch fixed it,
- rename acpi_ev_global_lock_pending/acpi_ev_global_lock_pending_lock
  to acpi_gbl_global_lock_pending/acpi_gbl_global_lock_pending_lock.

- move the initialization of acpi_gbl_global_lock_pending_lock from
  acpi_ut_mutex_initialize to acpi_ev_init_global_lock_handler.

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Reviewed-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/acpica/acglobal.h |  6 ++--
 drivers/acpi/acpica/evmisc.c   | 74 +++++++++++++++++++++++-------------------
 drivers/acpi/acpica/utmutex.c  |  5 ---
 3 files changed, 44 insertions(+), 41 deletions(-)

diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
index 6d512fcabdb..73863d86f02 100644
--- a/drivers/acpi/acpica/acglobal.h
+++ b/drivers/acpi/acpica/acglobal.h
@@ -214,13 +214,16 @@ ACPI_EXTERN struct acpi_mutex_info acpi_gbl_mutex_info[ACPI_NUM_MUTEX];
 
 /*
  * Global lock mutex is an actual AML mutex object
- * Global lock semaphore works in conjunction with the HW global lock
+ * Global lock semaphore works in conjunction with the actual global lock
+ * Global lock spinlock is used for "pending" handshake
  */
 ACPI_EXTERN union acpi_operand_object *acpi_gbl_global_lock_mutex;
 ACPI_EXTERN acpi_semaphore acpi_gbl_global_lock_semaphore;
+ACPI_EXTERN acpi_spinlock acpi_gbl_global_lock_pending_lock;
 ACPI_EXTERN u16 acpi_gbl_global_lock_handle;
 ACPI_EXTERN u8 acpi_gbl_global_lock_acquired;
 ACPI_EXTERN u8 acpi_gbl_global_lock_present;
+ACPI_EXTERN u8 acpi_gbl_global_lock_pending;
 
 /*
  * Spinlocks are used for interfaces that can be possibly called at
@@ -228,7 +231,6 @@ ACPI_EXTERN u8 acpi_gbl_global_lock_present;
  */
 ACPI_EXTERN acpi_spinlock acpi_gbl_gpe_lock;	/* For GPE data structs and registers */
 ACPI_EXTERN acpi_spinlock acpi_gbl_hardware_lock;	/* For ACPI H/W except GPE registers */
-ACPI_EXTERN acpi_spinlock acpi_ev_global_lock_pending_lock; /* For global lock */
 
 /*****************************************************************************
  *
diff --git a/drivers/acpi/acpica/evmisc.c b/drivers/acpi/acpica/evmisc.c
index 7dc80946f7b..69a3b4aa862 100644
--- a/drivers/acpi/acpica/evmisc.c
+++ b/drivers/acpi/acpica/evmisc.c
@@ -284,39 +284,41 @@ static void ACPI_SYSTEM_XFACE acpi_ev_notify_dispatch(void *context)
  * RETURN:      ACPI_INTERRUPT_HANDLED
  *
  * DESCRIPTION: Invoked directly from the SCI handler when a global lock
- *              release interrupt occurs.  If there's a thread waiting for
- *              the global lock, signal it.
- *
- * NOTE: Assumes that the semaphore can be signaled from interrupt level. If
- * this is not possible for some reason, a separate thread will have to be
- * scheduled to do this.
+ *              release interrupt occurs. If there is actually a pending
+ *              request for the lock, signal the waiting thread.
  *
  ******************************************************************************/
-static u8 acpi_ev_global_lock_pending;
 
 static u32 acpi_ev_global_lock_handler(void *context)
 {
 	acpi_status status;
 	acpi_cpu_flags flags;
 
-	flags = acpi_os_acquire_lock(acpi_ev_global_lock_pending_lock);
+	flags = acpi_os_acquire_lock(acpi_gbl_global_lock_pending_lock);
 
-	if (!acpi_ev_global_lock_pending) {
-		goto out;
+	/*
+	 * If a request for the global lock is not actually pending,
+	 * we are done. This handles "spurious" global lock interrupts
+	 * which are possible (and have been seen) with bad BIOSs.
+	 */
+	if (!acpi_gbl_global_lock_pending) {
+		goto cleanup_and_exit;
 	}
 
-	/* Send a unit to the semaphore */
-
+	/*
+	 * Send a unit to the global lock semaphore. The actual acquisition
+	 * of the global lock will be performed by the waiting thread.
+	 */
 	status = acpi_os_signal_semaphore(acpi_gbl_global_lock_semaphore, 1);
 	if (ACPI_FAILURE(status)) {
 		ACPI_ERROR((AE_INFO, "Could not signal Global Lock semaphore"));
 	}
 
-	acpi_ev_global_lock_pending = FALSE;
+	acpi_gbl_global_lock_pending = FALSE;
 
- out:
-	acpi_os_release_lock(acpi_ev_global_lock_pending_lock, flags);
+cleanup_and_exit:
 
+	acpi_os_release_lock(acpi_gbl_global_lock_pending_lock, flags);
 	return (ACPI_INTERRUPT_HANDLED);
 }
 
@@ -350,14 +352,20 @@ acpi_status acpi_ev_init_global_lock_handler(void)
 	 * Map to AE_OK, but mark global lock as not present. Any attempt to
 	 * actually use the global lock will be flagged with an error.
 	 */
+	acpi_gbl_global_lock_present = FALSE;
 	if (status == AE_NO_HARDWARE_RESPONSE) {
 		ACPI_ERROR((AE_INFO,
 			    "No response from Global Lock hardware, disabling lock"));
 
-		acpi_gbl_global_lock_present = FALSE;
 		return_ACPI_STATUS(AE_OK);
 	}
 
+	status = acpi_os_create_lock(&acpi_gbl_global_lock_pending_lock);
+	if (ACPI_FAILURE(status)) {
+		return_ACPI_STATUS(status);
+	}
+
+	acpi_gbl_global_lock_pending = FALSE;
 	acpi_gbl_global_lock_present = TRUE;
 	return_ACPI_STATUS(status);
 }
@@ -414,7 +422,7 @@ static int acpi_ev_global_lock_acquired;
 acpi_status acpi_ev_acquire_global_lock(u16 timeout)
 {
 	acpi_cpu_flags flags;
-	acpi_status status = AE_OK;
+	acpi_status status;
 	u8 acquired = FALSE;
 
 	ACPI_FUNCTION_TRACE(ev_acquire_global_lock);
@@ -458,15 +466,15 @@ acpi_status acpi_ev_acquire_global_lock(u16 timeout)
 	}
 
 	/*
-	 * Make sure that a global lock actually exists. If not, just treat the
-	 * lock as a standard mutex.
+	 * Make sure that a global lock actually exists. If not, just
+	 * treat the lock as a standard mutex.
 	 */
 	if (!acpi_gbl_global_lock_present) {
 		acpi_gbl_global_lock_acquired = TRUE;
 		return_ACPI_STATUS(AE_OK);
 	}
 
-	flags = acpi_os_acquire_lock(acpi_ev_global_lock_pending_lock);
+	flags = acpi_os_acquire_lock(acpi_gbl_global_lock_pending_lock);
 
 	do {
 
@@ -475,20 +483,19 @@ acpi_status acpi_ev_acquire_global_lock(u16 timeout)
 		ACPI_ACQUIRE_GLOBAL_LOCK(acpi_gbl_FACS, acquired);
 		if (acquired) {
 			acpi_gbl_global_lock_acquired = TRUE;
-
 			ACPI_DEBUG_PRINT((ACPI_DB_EXEC,
 					  "Acquired hardware Global Lock\n"));
 			break;
 		}
 
-		acpi_ev_global_lock_pending = TRUE;
-
-		acpi_os_release_lock(acpi_ev_global_lock_pending_lock, flags);
-
 		/*
-		 * Did not get the lock. The pending bit was set above, and we
-		 * must wait until we get the global lock released interrupt.
+		 * Did not get the lock. The pending bit was set above, and
+		 * we must now wait until we receive the global lock
+		 * released interrupt.
 		 */
+		acpi_gbl_global_lock_pending = TRUE;
+		acpi_os_release_lock(acpi_gbl_global_lock_pending_lock, flags);
+
 		ACPI_DEBUG_PRINT((ACPI_DB_EXEC,
 				  "Waiting for hardware Global Lock\n"));
 
@@ -496,17 +503,16 @@ acpi_status acpi_ev_acquire_global_lock(u16 timeout)
 		 * Wait for handshake with the global lock interrupt handler.
 		 * This interface releases the interpreter if we must wait.
 		 */
-		status = acpi_ex_system_wait_semaphore(
-						acpi_gbl_global_lock_semaphore,
-						ACPI_WAIT_FOREVER);
+		status =
+		    acpi_ex_system_wait_semaphore
+		    (acpi_gbl_global_lock_semaphore, ACPI_WAIT_FOREVER);
 
-		flags = acpi_os_acquire_lock(acpi_ev_global_lock_pending_lock);
+		flags = acpi_os_acquire_lock(acpi_gbl_global_lock_pending_lock);
 
 	} while (ACPI_SUCCESS(status));
 
-	acpi_ev_global_lock_pending = FALSE;
-
-	acpi_os_release_lock(acpi_ev_global_lock_pending_lock, flags);
+	acpi_gbl_global_lock_pending = FALSE;
+	acpi_os_release_lock(acpi_gbl_global_lock_pending_lock, flags);
 
 	return_ACPI_STATUS(status);
 }
diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
index 519d4ee9b45..7d797e2baec 100644
--- a/drivers/acpi/acpica/utmutex.c
+++ b/drivers/acpi/acpica/utmutex.c
@@ -93,11 +93,6 @@ acpi_status acpi_ut_mutex_initialize(void)
 		return_ACPI_STATUS (status);
 	}
 
-	status = acpi_os_create_lock (&acpi_ev_global_lock_pending_lock);
-	if (ACPI_FAILURE (status)) {
-		return_ACPI_STATUS (status);
-	}
-
 	/* Mutex for _OSI support */
 	status = acpi_os_create_mutex(&acpi_gbl_osi_mutex);
 	if (ACPI_FAILURE(status)) {
-- 
cgit v1.2.3-70-g09d2


From 84adccfb8cd2a6b8237da6752668ba25cd90c20b Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@st.com>
Date: Thu, 24 Mar 2011 11:32:15 +0530
Subject: dmaengine/dw_dmac fix: dwc_scan_descriptors must compare first desc
 address also with llp

dwc_scan_descriptors scans all descriptors from active_list in case transfer is
not completed.  It compares first_desc->lli.llp, and then all childrens of its
tx_list. But it doesn't compare its own address, i.e. first_desc->txd.phys, as
this is what we have initially programmed into the controller register. So this
causes dma to stop and finish a transfer, which was never started. And thus
fail.

Signed-off-by: Viresh Kumar <viresh.kumar@st.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/dw_dmac.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c
index 9c25c7d099e..b15c32ca0ef 100644
--- a/drivers/dma/dw_dmac.c
+++ b/drivers/dma/dw_dmac.c
@@ -304,6 +304,11 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc)
 	dev_vdbg(chan2dev(&dwc->chan), "scan_descriptors: llp=0x%x\n", llp);
 
 	list_for_each_entry_safe(desc, _desc, &dwc->active_list, desc_node) {
+		/* check first descriptors addr */
+		if (desc->txd.phys == llp)
+			return;
+
+		/* check first descriptors llp */
 		if (desc->lli.llp == llp)
 			/* This one is currently in progress */
 			return;
-- 
cgit v1.2.3-70-g09d2


From e2ec771a99a5cf231c9dea4da26238bf073e1e9c Mon Sep 17 00:00:00 2001
From: Coly Li <bosong.ly@taobao.com>
Date: Sun, 27 Mar 2011 01:26:52 +0800
Subject: dma: use BUG_ON correctly in iop-adma.c, v4

This patch makes BUG_ON() usage correct in drivers/dma/iop-adma.c.

Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Vinod Koul <vinod.koul@intel.com>
Signed-off-by: Coly Li <bosong.ly@taobao.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/iop-adma.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index c6b01f535b2..e03f811a83d 100644
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -619,7 +619,7 @@ iop_adma_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dma_dest,
 
 	if (unlikely(!len))
 		return NULL;
-	BUG_ON(unlikely(len > IOP_ADMA_MAX_BYTE_COUNT));
+	BUG_ON(len > IOP_ADMA_MAX_BYTE_COUNT);
 
 	dev_dbg(iop_chan->device->common.dev, "%s len: %u\n",
 		__func__, len);
@@ -652,7 +652,7 @@ iop_adma_prep_dma_memset(struct dma_chan *chan, dma_addr_t dma_dest,
 
 	if (unlikely(!len))
 		return NULL;
-	BUG_ON(unlikely(len > IOP_ADMA_MAX_BYTE_COUNT));
+	BUG_ON(len > IOP_ADMA_MAX_BYTE_COUNT);
 
 	dev_dbg(iop_chan->device->common.dev, "%s len: %u\n",
 		__func__, len);
@@ -686,7 +686,7 @@ iop_adma_prep_dma_xor(struct dma_chan *chan, dma_addr_t dma_dest,
 
 	if (unlikely(!len))
 		return NULL;
-	BUG_ON(unlikely(len > IOP_ADMA_XOR_MAX_BYTE_COUNT));
+	BUG_ON(len > IOP_ADMA_XOR_MAX_BYTE_COUNT);
 
 	dev_dbg(iop_chan->device->common.dev,
 		"%s src_cnt: %d len: %u flags: %lx\n",
-- 
cgit v1.2.3-70-g09d2


From 7912d30007d0c958bcf11cd5ce19f77856cf041b Mon Sep 17 00:00:00 2001
From: Coly Li <bosong.ly@taobao.com>
Date: Sun, 27 Mar 2011 01:26:53 +0800
Subject: dma: use BUG_ON correctly in mv_xor.c, v4

This patch makes BUG_ON() usage correct in drivers/dma/mv_xor.c

Cc: Vinod Koul <vinod.koul@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Coly Li <bosong.ly@taobao.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/mv_xor.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index a25f5f61e0e..954e334e01b 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -671,7 +671,7 @@ mv_xor_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 	if (unlikely(len < MV_XOR_MIN_BYTE_COUNT))
 		return NULL;
 
-	BUG_ON(unlikely(len > MV_XOR_MAX_BYTE_COUNT));
+	BUG_ON(len > MV_XOR_MAX_BYTE_COUNT);
 
 	spin_lock_bh(&mv_chan->lock);
 	slot_cnt = mv_chan_memcpy_slot_count(len);
@@ -710,7 +710,7 @@ mv_xor_prep_dma_memset(struct dma_chan *chan, dma_addr_t dest, int value,
 	if (unlikely(len < MV_XOR_MIN_BYTE_COUNT))
 		return NULL;
 
-	BUG_ON(unlikely(len > MV_XOR_MAX_BYTE_COUNT));
+	BUG_ON(len > MV_XOR_MAX_BYTE_COUNT);
 
 	spin_lock_bh(&mv_chan->lock);
 	slot_cnt = mv_chan_memset_slot_count(len);
@@ -744,7 +744,7 @@ mv_xor_prep_dma_xor(struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
 	if (unlikely(len < MV_XOR_MIN_BYTE_COUNT))
 		return NULL;
 
-	BUG_ON(unlikely(len > MV_XOR_MAX_BYTE_COUNT));
+	BUG_ON(len > MV_XOR_MAX_BYTE_COUNT);
 
 	dev_dbg(mv_chan->device->common.dev,
 		"%s src_cnt: %d len: dest %x %u flags: %ld\n",
-- 
cgit v1.2.3-70-g09d2


From 427cdf19b97e509e21e5d347e18d8b0b34723dfc Mon Sep 17 00:00:00 2001
From: Coly Li <bosong.ly@taobao.com>
Date: Sun, 27 Mar 2011 01:26:54 +0800
Subject: dma: use BUG_ON correctly in ppc4xx/adam.c, v4

This patch makes BUG_ON() usage correct in drivers/dma/ppc4xx/adam.c

Cc: Vinod Koul <vinod.koul@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: Anatolij Gustschin <agust@denx.de>
Cc: Sean MacLennan <smaclennan@pikatech.com>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Coly Li <bosong.ly@taobao.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/ppc4xx/adma.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/dma/ppc4xx/adma.c b/drivers/dma/ppc4xx/adma.c
index cef584533ee..a2b62e93a14 100644
--- a/drivers/dma/ppc4xx/adma.c
+++ b/drivers/dma/ppc4xx/adma.c
@@ -2313,7 +2313,7 @@ static struct dma_async_tx_descriptor *ppc440spe_adma_prep_dma_memcpy(
 	if (unlikely(!len))
 		return NULL;
 
-	BUG_ON(unlikely(len > PPC440SPE_ADMA_DMA_MAX_BYTE_COUNT));
+	BUG_ON(len > PPC440SPE_ADMA_DMA_MAX_BYTE_COUNT);
 
 	spin_lock_bh(&ppc440spe_chan->lock);
 
@@ -2354,7 +2354,7 @@ static struct dma_async_tx_descriptor *ppc440spe_adma_prep_dma_memset(
 	if (unlikely(!len))
 		return NULL;
 
-	BUG_ON(unlikely(len > PPC440SPE_ADMA_DMA_MAX_BYTE_COUNT));
+	BUG_ON(len > PPC440SPE_ADMA_DMA_MAX_BYTE_COUNT);
 
 	spin_lock_bh(&ppc440spe_chan->lock);
 
@@ -2397,7 +2397,7 @@ static struct dma_async_tx_descriptor *ppc440spe_adma_prep_dma_xor(
 				     dma_dest, dma_src, src_cnt));
 	if (unlikely(!len))
 		return NULL;
-	BUG_ON(unlikely(len > PPC440SPE_ADMA_XOR_MAX_BYTE_COUNT));
+	BUG_ON(len > PPC440SPE_ADMA_XOR_MAX_BYTE_COUNT);
 
 	dev_dbg(ppc440spe_chan->device->common.dev,
 		"ppc440spe adma%d: %s src_cnt: %d len: %u int_en: %d\n",
@@ -2887,7 +2887,7 @@ static struct dma_async_tx_descriptor *ppc440spe_adma_prep_dma_pq(
 	ADMA_LL_DBG(prep_dma_pq_dbg(ppc440spe_chan->device->id,
 				    dst, src, src_cnt));
 	BUG_ON(!len);
-	BUG_ON(unlikely(len > PPC440SPE_ADMA_XOR_MAX_BYTE_COUNT));
+	BUG_ON(len > PPC440SPE_ADMA_XOR_MAX_BYTE_COUNT);
 	BUG_ON(!src_cnt);
 
 	if (src_cnt == 1 && dst[1] == src[0]) {
-- 
cgit v1.2.3-70-g09d2


From 1cb7b1e0de6a1f8f071f4a146e3d10f3a662f707 Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Thu, 31 Mar 2011 13:36:38 +0200
Subject: ACPI EC: remove dead code

static void acpi_ec_gpe_query(void *ec_cxt);
-> The function is right above this declaration -> not needed.

poll_force is also not used, cleaned up in ec.c and its users:
compal-laptop and msi-laptop.

Signed-off-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/ec.c                    |  6 +-----
 drivers/platform/x86/compal-laptop.c | 12 ++++++------
 drivers/platform/x86/msi-laptop.c    | 12 ++++++------
 include/linux/acpi.h                 |  3 +--
 4 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index fa848c4116a..b3f1d6f52a8 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -69,7 +69,6 @@ enum ec_command {
 
 #define ACPI_EC_DELAY		500	/* Wait 500ms max. during EC ops */
 #define ACPI_EC_UDELAY_GLK	1000	/* Wait 1ms max. to get global lock */
-#define ACPI_EC_CDELAY		10	/* Wait 10us before polling EC */
 #define ACPI_EC_MSI_UDELAY	550	/* Wait 550us for MSI EC */
 
 #define ACPI_EC_STORM_THRESHOLD 8	/* number of false interrupts
@@ -433,8 +432,7 @@ EXPORT_SYMBOL(ec_write);
 
 int ec_transaction(u8 command,
 		   const u8 * wdata, unsigned wdata_len,
-		   u8 * rdata, unsigned rdata_len,
-		   int force_poll)
+		   u8 * rdata, unsigned rdata_len)
 {
 	struct transaction t = {.command = command,
 				.wdata = wdata, .rdata = rdata,
@@ -592,8 +590,6 @@ static void acpi_ec_gpe_query(void *ec_cxt)
 	mutex_unlock(&ec->lock);
 }
 
-static void acpi_ec_gpe_query(void *ec_cxt);
-
 static int ec_check_sci(struct acpi_ec *ec, u8 state)
 {
 	if (state & ACPI_EC_FLAG_SCI) {
diff --git a/drivers/platform/x86/compal-laptop.c b/drivers/platform/x86/compal-laptop.c
index 034572b980c..f4f43e65475 100644
--- a/drivers/platform/x86/compal-laptop.c
+++ b/drivers/platform/x86/compal-laptop.c
@@ -200,7 +200,7 @@ static bool extra_features;
  * watching the output of address 0x4F (do an ec_transaction writing 0x33
  * into 0x4F and read a few bytes from the output, like so:
  *	u8 writeData = 0x33;
- *	ec_transaction(0x4F, &writeData, 1, buffer, 32, 0);
+ *	ec_transaction(0x4F, &writeData, 1, buffer, 32);
  * That address is labled "fan1 table information" in the service manual.
  * It should be clear which value in 'buffer' changes). This seems to be
  * related to fan speed. It isn't a proper 'realtime' fan speed value
@@ -286,7 +286,7 @@ static int get_backlight_level(void)
 static void set_backlight_state(bool on)
 {
 	u8 data = on ? BACKLIGHT_STATE_ON_DATA : BACKLIGHT_STATE_OFF_DATA;
-	ec_transaction(BACKLIGHT_STATE_ADDR, &data, 1, NULL, 0, 0);
+	ec_transaction(BACKLIGHT_STATE_ADDR, &data, 1, NULL, 0);
 }
 
 
@@ -294,24 +294,24 @@ static void set_backlight_state(bool on)
 static void pwm_enable_control(void)
 {
 	unsigned char writeData = PWM_ENABLE_DATA;
-	ec_transaction(PWM_ENABLE_ADDR, &writeData, 1, NULL, 0, 0);
+	ec_transaction(PWM_ENABLE_ADDR, &writeData, 1, NULL, 0);
 }
 
 static void pwm_disable_control(void)
 {
 	unsigned char writeData = PWM_DISABLE_DATA;
-	ec_transaction(PWM_DISABLE_ADDR, &writeData, 1, NULL, 0, 0);
+	ec_transaction(PWM_DISABLE_ADDR, &writeData, 1, NULL, 0);
 }
 
 static void set_pwm(int pwm)
 {
-	ec_transaction(PWM_ADDRESS, &pwm_lookup_table[pwm], 1, NULL, 0, 0);
+	ec_transaction(PWM_ADDRESS, &pwm_lookup_table[pwm], 1, NULL, 0);
 }
 
 static int get_fan_rpm(void)
 {
 	u8 value, data = FAN_DATA;
-	ec_transaction(FAN_ADDRESS, &data, 1, &value, 1, 0);
+	ec_transaction(FAN_ADDRESS, &data, 1, &value, 1);
 	return 100 * (int)value;
 }
 
diff --git a/drivers/platform/x86/msi-laptop.c b/drivers/platform/x86/msi-laptop.c
index 7e9bb6df9d3..918a65dd2ab 100644
--- a/drivers/platform/x86/msi-laptop.c
+++ b/drivers/platform/x86/msi-laptop.c
@@ -120,7 +120,7 @@ static int set_lcd_level(int level)
 	buf[1] = (u8) (level*31);
 
 	return ec_transaction(MSI_EC_COMMAND_LCD_LEVEL, buf, sizeof(buf),
-			      NULL, 0, 1);
+			      NULL, 0);
 }
 
 static int get_lcd_level(void)
@@ -129,7 +129,7 @@ static int get_lcd_level(void)
 	int result;
 
 	result = ec_transaction(MSI_EC_COMMAND_LCD_LEVEL, &wdata, 1,
-				&rdata, 1, 1);
+				&rdata, 1);
 	if (result < 0)
 		return result;
 
@@ -142,7 +142,7 @@ static int get_auto_brightness(void)
 	int result;
 
 	result = ec_transaction(MSI_EC_COMMAND_LCD_LEVEL, &wdata, 1,
-				&rdata, 1, 1);
+				&rdata, 1);
 	if (result < 0)
 		return result;
 
@@ -157,7 +157,7 @@ static int set_auto_brightness(int enable)
 	wdata[0] = 4;
 
 	result = ec_transaction(MSI_EC_COMMAND_LCD_LEVEL, wdata, 1,
-				&rdata, 1, 1);
+				&rdata, 1);
 	if (result < 0)
 		return result;
 
@@ -165,7 +165,7 @@ static int set_auto_brightness(int enable)
 	wdata[1] = (rdata & 0xF7) | (enable ? 8 : 0);
 
 	return ec_transaction(MSI_EC_COMMAND_LCD_LEVEL, wdata, 2,
-			      NULL, 0, 1);
+			      NULL, 0);
 }
 
 static ssize_t set_device_state(const char *buf, size_t count, u8 mask)
@@ -202,7 +202,7 @@ static int get_wireless_state(int *wlan, int *bluetooth)
 	u8 wdata = 0, rdata;
 	int result;
 
-	result = ec_transaction(MSI_EC_COMMAND_WIRELESS, &wdata, 1, &rdata, 1, 1);
+	result = ec_transaction(MSI_EC_COMMAND_WIRELESS, &wdata, 1, &rdata, 1);
 	if (result < 0)
 		return -1;
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a2e910e0129..1deb2a73c2d 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -150,8 +150,7 @@ extern int ec_read(u8 addr, u8 *val);
 extern int ec_write(u8 addr, u8 val);
 extern int ec_transaction(u8 command,
                           const u8 *wdata, unsigned wdata_len,
-                          u8 *rdata, unsigned rdata_len,
-			  int force_poll);
+                          u8 *rdata, unsigned rdata_len);
 
 #if defined(CONFIG_ACPI_WMI) || defined(CONFIG_ACPI_WMI_MODULE)
 
-- 
cgit v1.2.3-70-g09d2


From e2142df7ec7184ed4a77ada686bc1eb41075490f Mon Sep 17 00:00:00 2001
From: Kristen Carlson Accardi <kristen@linux.intel.com>
Date: Thu, 31 Mar 2011 11:02:43 -0700
Subject: intel_mid_dma: fix runtime pm issues

Use the correct api in probe to enable runtime pm for this driver.
Additionally, do not just call legacy suspend for runtime_suspend,
as this duplicates some work the pci core does for you.

Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/intel_mid_dma.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/dma/intel_mid_dma.c b/drivers/dma/intel_mid_dma.c
index 798f46a4590..f153adfcace 100644
--- a/drivers/dma/intel_mid_dma.c
+++ b/drivers/dma/intel_mid_dma.c
@@ -1292,8 +1292,7 @@ static int __devinit intel_mid_dma_probe(struct pci_dev *pdev,
 	if (err)
 		goto err_dma;
 
-	pm_runtime_set_active(&pdev->dev);
-	pm_runtime_enable(&pdev->dev);
+	pm_runtime_put_noidle(&pdev->dev);
 	pm_runtime_allow(&pdev->dev);
 	return 0;
 
@@ -1322,6 +1321,9 @@ err_enable_device:
 static void __devexit intel_mid_dma_remove(struct pci_dev *pdev)
 {
 	struct middma_device *device = pci_get_drvdata(pdev);
+
+	pm_runtime_get_noresume(&pdev->dev);
+	pm_runtime_forbid(&pdev->dev);
 	middma_shutdown(pdev);
 	pci_dev_put(pdev);
 	kfree(device);
@@ -1385,13 +1387,20 @@ int dma_resume(struct pci_dev *pci)
 static int dma_runtime_suspend(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	return dma_suspend(pci_dev, PMSG_SUSPEND);
+	struct middma_device *device = pci_get_drvdata(pci_dev);
+
+	device->state = SUSPENDED;
+	return 0;
 }
 
 static int dma_runtime_resume(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	return dma_resume(pci_dev);
+	struct middma_device *device = pci_get_drvdata(pci_dev);
+
+	device->state = RUNNING;
+	iowrite32(REG_BIT0, device->dma_base + DMA_CFG);
+	return 0;
 }
 
 static int dma_runtime_idle(struct device *dev)
-- 
cgit v1.2.3-70-g09d2


From 364de77831213be20f7f33c39ca1c194593b5c11 Mon Sep 17 00:00:00 2001
From: Liu Yuan <tailai.ly@taobao.com>
Date: Sat, 2 Apr 2011 14:20:47 +0800
Subject: drivers, pch_dma: Fix uninitialized var before use

In the function pdc_desc_get(), var 'i' is not
initialized before use. This patch fixes it.

Signed-off-by: Liu Yuan <tailai.ly@taobao.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/pch_dma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma/pch_dma.c b/drivers/dma/pch_dma.c
index 8d8fef1480a..6eebc6205c6 100644
--- a/drivers/dma/pch_dma.c
+++ b/drivers/dma/pch_dma.c
@@ -403,7 +403,7 @@ static struct pch_dma_desc *pdc_desc_get(struct pch_dma_chan *pd_chan)
 {
 	struct pch_dma_desc *desc, *_d;
 	struct pch_dma_desc *ret = NULL;
-	int i;
+	int i = 0;
 
 	spin_lock(&pd_chan->lock);
 	list_for_each_entry_safe(desc, _d, &pd_chan->free_list, desc_node) {
-- 
cgit v1.2.3-70-g09d2


From 0601f793921157603831d00a9541d92e8f5763f6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 18 May 2009 17:47:56 -0400
Subject: SUNRPC: requeue tcp socket less frequently

Don't requeue the socket in some cases where we know it's unnecessary.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svcsock.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index b7d435c3f19..7a3e4bfd895 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -965,7 +965,6 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 		goto err_again;	/* record not complete */
 	}
 	len = svsk->sk_reclen;
-	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
 	return len;
  error:
@@ -1115,6 +1114,10 @@ out:
 	/* Reset TCP read info */
 	svsk->sk_reclen = 0;
 	svsk->sk_tcplen = 0;
+	/* If we have more data, signal svc_xprt_enqueue() to try again */
+	if (svc_recv_available(svsk) > sizeof(rpc_fraghdr))
+		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+
 
 	svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
 	if (serv->sv_stats)
-- 
cgit v1.2.3-70-g09d2


From 5ee78d483c5812228e971e145b912e0a7e35e571 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 18 May 2009 17:47:56 -0400
Subject: SUNRPC: svc_tcp_recvfrom cleanup

Minor cleanup in preparation for later patches.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svcsock.c | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 7a3e4bfd895..733c2f6a185 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -893,6 +893,7 @@ failed:
 static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 {
 	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
+	unsigned int want;
 	int len;
 
 	if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
@@ -915,9 +916,9 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
 	if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) {
-		int		want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
 		struct kvec	iov;
 
+		want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
 		iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
 		iov.iov_len  = want;
 		if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0)
@@ -1040,8 +1041,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
 	int		len;
 	struct kvec *vec;
-	int pnum, vlen;
 	struct rpc_rqst *req = NULL;
+	unsigned int vlen;
+	int pnum;
 
 	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
 		svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
@@ -1072,7 +1074,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	}
 
 	pnum = 1;
-	while (vlen < len) {
+	while (vlen < svsk->sk_reclen - 8) {
 		vec[pnum].iov_base = (req) ?
 			page_address(req->rq_private_buf.pages[pnum - 1]) :
 			page_address(rqstp->rq_pages[pnum]);
@@ -1083,29 +1085,23 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	rqstp->rq_respages = &rqstp->rq_pages[pnum];
 
 	/* Now receive data */
-	len = svc_recvfrom(rqstp, vec, pnum, len);
+	len = svc_recvfrom(rqstp, vec, pnum, svsk->sk_reclen - 8);
 	if (len < 0)
 		goto err_again;
 
-	/*
-	 * Account for the 8 bytes we read earlier
-	 */
-	len += 8;
-
 	if (req) {
-		xprt_complete_rqst(req->rq_task, len);
-		len = 0;
+		xprt_complete_rqst(req->rq_task, svsk->sk_reclen);
+		rqstp->rq_arg.len = 0;
 		goto out;
 	}
-	dprintk("svc: TCP complete record (%d bytes)\n", len);
-	rqstp->rq_arg.len = len;
+	dprintk("svc: TCP complete record (%d bytes)\n", svsk->sk_reclen);
+	rqstp->rq_arg.len = svsk->sk_reclen;
 	rqstp->rq_arg.page_base = 0;
-	if (len <= rqstp->rq_arg.head[0].iov_len) {
-		rqstp->rq_arg.head[0].iov_len = len;
+	if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) {
+		rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len;
 		rqstp->rq_arg.page_len = 0;
-	} else {
-		rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
-	}
+	} else
+		rqstp->rq_arg.page_len = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
 
 	rqstp->rq_xprt_ctxt   = NULL;
 	rqstp->rq_prot	      = IPPROTO_TCP;
@@ -1123,7 +1119,7 @@ out:
 	if (serv->sv_stats)
 		serv->sv_stats->nettcpcnt++;
 
-	return len;
+	return rqstp->rq_arg.len;
 
 err_again:
 	if (len == -EAGAIN) {
-- 
cgit v1.2.3-70-g09d2


From 48e6555c7b3bf0d92f8167d8b8b8ecf4a3fdab84 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 14 Feb 2011 14:52:03 -0500
Subject: svcrpc: note network-order types in svc_process_calldir

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svcsock.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 733c2f6a185..1955e1a1e39 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -982,9 +982,9 @@ static int svc_process_calldir(struct svc_sock *svsk, struct svc_rqst *rqstp,
 			       struct rpc_rqst **reqpp, struct kvec *vec)
 {
 	struct rpc_rqst *req = NULL;
-	u32 *p;
-	u32 xid;
-	u32 calldir;
+	__be32 *p;
+	__be32 xid;
+	__be32 calldir;
 	int len;
 
 	len = svc_recvfrom(rqstp, vec, 1, 8);
-- 
cgit v1.2.3-70-g09d2


From cc6c2127f2316c2b2ad1e8919b45cde5e03f65aa Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 14 Feb 2011 15:03:35 -0500
Subject: svcrpc: close connection if client sends short packet

If the client sents a record too short to contain even the beginning of
the rpc header, then just close the connection.

The current code drops the record data and continues.  I don't see the
point.  It's a hopeless situation and simpler just to cut off the
connection completely.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svcsock.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 1955e1a1e39..62ff7c5c09c 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -955,6 +955,9 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 		}
 	}
 
+	if (svsk->sk_reclen < 8)
+		goto err_delete; /* client is nuts. */
+
 	/* Check whether enough data is available */
 	len = svc_recv_available(svsk);
 	if (len < 0)
@@ -1058,20 +1061,10 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	vec[0] = rqstp->rq_arg.head[0];
 	vlen = PAGE_SIZE;
 
-	/*
-	 * We have enough data for the whole tcp record. Let's try and read the
-	 * first 8 bytes to get the xid and the call direction. We can use this
-	 * to figure out if this is a call or a reply to a callback. If
-	 * sk_reclen is < 8 (xid and calldir), then this is a malformed packet.
-	 * In that case, don't bother with the calldir and just read the data.
-	 * It will be rejected in svc_process.
-	 */
-	if (len >= 8) {
-		len = svc_process_calldir(svsk, rqstp, &req, vec);
-		if (len < 0)
-			goto err_again;
-		vlen -= 8;
-	}
+	len = svc_process_calldir(svsk, rqstp, &req, vec);
+	if (len < 0)
+		goto err_again;
+	vlen -= 8;
 
 	pnum = 1;
 	while (vlen < svsk->sk_reclen - 8) {
-- 
cgit v1.2.3-70-g09d2


From 586c52cc61b5b84c70102208b78269ef5924bf49 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 18 May 2009 17:47:56 -0400
Subject: svcrpc: copy cb reply instead of pages

It's much simpler just to copy the cb reply data than to play tricks
with pages.  Callback replies will typically be very small (at least
until we implement cb_getattr, in which case files with very long ACLs
could pose a problem), so there's no loss in efficiency.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svcsock.c | 122 +++++++++++++++++++++++----------------------------
 1 file changed, 56 insertions(+), 66 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 62ff7c5c09c..40b502b1144 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -981,57 +981,58 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 	return -EAGAIN;
 }
 
-static int svc_process_calldir(struct svc_sock *svsk, struct svc_rqst *rqstp,
-			       struct rpc_rqst **reqpp, struct kvec *vec)
+static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
 {
+	struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt;
 	struct rpc_rqst *req = NULL;
-	__be32 *p;
+	struct kvec *src, *dst;
+	__be32 *p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
 	__be32 xid;
 	__be32 calldir;
-	int len;
-
-	len = svc_recvfrom(rqstp, vec, 1, 8);
-	if (len < 0)
-		goto error;
 
-	p = (u32 *)rqstp->rq_arg.head[0].iov_base;
 	xid = *p++;
 	calldir = *p;
 
-	if (calldir == 0) {
-		/* REQUEST is the most common case */
-		vec[0] = rqstp->rq_arg.head[0];
-	} else {
-		/* REPLY */
-		struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt;
-
-		if (bc_xprt)
-			req = xprt_lookup_rqst(bc_xprt, xid);
-
-		if (!req) {
-			printk(KERN_NOTICE
-				"%s: Got unrecognized reply: "
-				"calldir 0x%x xpt_bc_xprt %p xid %08x\n",
-				__func__, ntohl(calldir),
-				bc_xprt, xid);
-			vec[0] = rqstp->rq_arg.head[0];
-			goto out;
-		}
+	if (bc_xprt)
+		req = xprt_lookup_rqst(bc_xprt, xid);
 
-		memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
-		       sizeof(struct xdr_buf));
-		/* copy the xid and call direction */
-		memcpy(req->rq_private_buf.head[0].iov_base,
-		       rqstp->rq_arg.head[0].iov_base, 8);
-		vec[0] = req->rq_private_buf.head[0];
+	if (!req) {
+		printk(KERN_NOTICE
+			"%s: Got unrecognized reply: "
+			"calldir 0x%x xpt_bc_xprt %p xid %08x\n",
+			__func__, ntohl(calldir),
+			bc_xprt, xid);
+		return -EAGAIN;
 	}
- out:
-	vec[0].iov_base += 8;
-	vec[0].iov_len -= 8;
-	len = svsk->sk_reclen - 8;
- error:
-	*reqpp = req;
-	return len;
+
+	memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
+	/*
+	 * XXX!: cheating for now!  Only copying HEAD.
+	 * But we know this is good enough for now (in fact, for any
+	 * callback reply in the forseeable future).
+	 */
+	dst = &req->rq_private_buf.head[0];
+	src = &rqstp->rq_arg.head[0];
+	if (dst->iov_len < src->iov_len)
+		return -EAGAIN; /* whatever; just giving up. */
+	memcpy(dst->iov_base, src->iov_base, src->iov_len);
+	xprt_complete_rqst(req->rq_task, svsk->sk_reclen);
+	rqstp->rq_arg.len = 0;
+	return 0;
+}
+
+static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len)
+{
+	int i = 0;
+	int t = 0;
+
+	while (t < len) {
+		vec[i].iov_base = page_address(pages[i]);
+		vec[i].iov_len = PAGE_SIZE;
+		i++;
+		t += PAGE_SIZE;
+	}
+	return i;
 }
 
 /*
@@ -1044,8 +1045,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
 	int		len;
 	struct kvec *vec;
-	struct rpc_rqst *req = NULL;
-	unsigned int vlen;
+	__be32 *p;
+	__be32 calldir;
 	int pnum;
 
 	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
@@ -1058,35 +1059,17 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 		goto error;
 
 	vec = rqstp->rq_vec;
-	vec[0] = rqstp->rq_arg.head[0];
-	vlen = PAGE_SIZE;
 
-	len = svc_process_calldir(svsk, rqstp, &req, vec);
-	if (len < 0)
-		goto err_again;
-	vlen -= 8;
-
-	pnum = 1;
-	while (vlen < svsk->sk_reclen - 8) {
-		vec[pnum].iov_base = (req) ?
-			page_address(req->rq_private_buf.pages[pnum - 1]) :
-			page_address(rqstp->rq_pages[pnum]);
-		vec[pnum].iov_len = PAGE_SIZE;
-		pnum++;
-		vlen += PAGE_SIZE;
-	}
+	pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0],
+						svsk->sk_reclen);
+
 	rqstp->rq_respages = &rqstp->rq_pages[pnum];
 
 	/* Now receive data */
-	len = svc_recvfrom(rqstp, vec, pnum, svsk->sk_reclen - 8);
+	len = svc_recvfrom(rqstp, vec, pnum, svsk->sk_reclen);
 	if (len < 0)
 		goto err_again;
 
-	if (req) {
-		xprt_complete_rqst(req->rq_task, svsk->sk_reclen);
-		rqstp->rq_arg.len = 0;
-		goto out;
-	}
 	dprintk("svc: TCP complete record (%d bytes)\n", svsk->sk_reclen);
 	rqstp->rq_arg.len = svsk->sk_reclen;
 	rqstp->rq_arg.page_base = 0;
@@ -1099,7 +1082,14 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	rqstp->rq_xprt_ctxt   = NULL;
 	rqstp->rq_prot	      = IPPROTO_TCP;
 
-out:
+	p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
+	calldir = p[1];
+	if (calldir) {
+		len = receive_cb_reply(svsk, rqstp);
+		if (len < 0)
+			goto err_again;
+	}
+
 	/* Reset TCP read info */
 	svsk->sk_reclen = 0;
 	svsk->sk_tcplen = 0;
-- 
cgit v1.2.3-70-g09d2


From 31d68ef65c7d49def19c1bae4e01b87d66cf5a56 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 24 Feb 2011 11:25:33 -0800
Subject: SUNRPC: Don't wait for full record to receive tcp data

Ensure that we immediately read and buffer data from the incoming TCP
stream so that we grow the receive window quickly, and don't deadlock on
large READ or WRITE requests.

Also do some minor exit cleanup.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svcsock.h |   1 +
 net/sunrpc/svcsock.c           | 144 ++++++++++++++++++++++++++++++++---------
 2 files changed, 113 insertions(+), 32 deletions(-)

diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 04dba23c59f..85c50b40759 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -28,6 +28,7 @@ struct svc_sock {
 	/* private TCP part */
 	u32			sk_reclen;	/* length of record */
 	u32			sk_tcplen;	/* current read length */
+	struct page *		sk_pages[RPCSVC_MAXPAGES];	/* received data */
 };
 
 /*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 40b502b1144..a4fafcbc6ea 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -387,6 +387,33 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
 	return len;
 }
 
+static int svc_partial_recvfrom(struct svc_rqst *rqstp,
+				struct kvec *iov, int nr,
+				int buflen, unsigned int base)
+{
+	size_t save_iovlen;
+	void __user *save_iovbase;
+	unsigned int i;
+	int ret;
+
+	if (base == 0)
+		return svc_recvfrom(rqstp, iov, nr, buflen);
+
+	for (i = 0; i < nr; i++) {
+		if (iov[i].iov_len > base)
+			break;
+		base -= iov[i].iov_len;
+	}
+	save_iovlen = iov[i].iov_len;
+	save_iovbase = iov[i].iov_base;
+	iov[i].iov_len -= base;
+	iov[i].iov_base += base;
+	ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen);
+	iov[i].iov_len = save_iovlen;
+	iov[i].iov_base = save_iovbase;
+	return ret;
+}
+
 /*
  * Set socket snd and rcv buffer lengths
  */
@@ -884,6 +911,56 @@ failed:
 	return NULL;
 }
 
+static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
+{
+	unsigned int i, len, npages;
+
+	if (svsk->sk_tcplen <= sizeof(rpc_fraghdr))
+		return 0;
+	len = svsk->sk_tcplen - sizeof(rpc_fraghdr);
+	npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	for (i = 0; i < npages; i++) {
+		if (rqstp->rq_pages[i] != NULL)
+			put_page(rqstp->rq_pages[i]);
+		BUG_ON(svsk->sk_pages[i] == NULL);
+		rqstp->rq_pages[i] = svsk->sk_pages[i];
+		svsk->sk_pages[i] = NULL;
+	}
+	rqstp->rq_arg.head[0].iov_base = page_address(rqstp->rq_pages[0]);
+	return len;
+}
+
+static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
+{
+	unsigned int i, len, npages;
+
+	if (svsk->sk_tcplen <= sizeof(rpc_fraghdr))
+		return;
+	len = svsk->sk_tcplen - sizeof(rpc_fraghdr);
+	npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	for (i = 0; i < npages; i++) {
+		svsk->sk_pages[i] = rqstp->rq_pages[i];
+		rqstp->rq_pages[i] = NULL;
+	}
+}
+
+static void svc_tcp_clear_pages(struct svc_sock *svsk)
+{
+	unsigned int i, len, npages;
+
+	if (svsk->sk_tcplen <= sizeof(rpc_fraghdr))
+		goto out;
+	len = svsk->sk_tcplen - sizeof(rpc_fraghdr);
+	npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	for (i = 0; i < npages; i++) {
+		BUG_ON(svsk->sk_pages[i] == NULL);
+		put_page(svsk->sk_pages[i]);
+		svsk->sk_pages[i] = NULL;
+	}
+out:
+	svsk->sk_tcplen = 0;
+}
+
 /*
  * Receive data.
  * If we haven't gotten the record length yet, get the next four bytes.
@@ -928,7 +1005,7 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 		if (len < want) {
 			dprintk("svc: short recvfrom while reading record "
 				"length (%d of %d)\n", len, want);
-			goto err_again; /* record header not complete */
+			return -EAGAIN;
 		}
 
 		svsk->sk_reclen = ntohl(svsk->sk_reclen);
@@ -958,26 +1035,14 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 	if (svsk->sk_reclen < 8)
 		goto err_delete; /* client is nuts. */
 
-	/* Check whether enough data is available */
-	len = svc_recv_available(svsk);
-	if (len < 0)
-		goto error;
-
-	if (len < svsk->sk_reclen) {
-		dprintk("svc: incomplete TCP record (%d of %d)\n",
-			len, svsk->sk_reclen);
-		goto err_again;	/* record not complete */
-	}
 	len = svsk->sk_reclen;
 
 	return len;
- error:
-	if (len == -EAGAIN)
-		dprintk("RPC: TCP recv_record got EAGAIN\n");
+error:
+	dprintk("RPC: TCP recv_record got %d\n", len);
 	return len;
- err_delete:
+err_delete:
 	set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
- err_again:
 	return -EAGAIN;
 }
 
@@ -1035,6 +1100,7 @@ static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len)
 	return i;
 }
 
+
 /*
  * Receive data from a TCP socket.
  */
@@ -1045,6 +1111,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
 	int		len;
 	struct kvec *vec;
+	unsigned int want, base;
 	__be32 *p;
 	__be32 calldir;
 	int pnum;
@@ -1058,6 +1125,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	if (len < 0)
 		goto error;
 
+	base = svc_tcp_restore_pages(svsk, rqstp);
+	want = svsk->sk_reclen - base;
+
 	vec = rqstp->rq_vec;
 
 	pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0],
@@ -1066,11 +1136,18 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	rqstp->rq_respages = &rqstp->rq_pages[pnum];
 
 	/* Now receive data */
-	len = svc_recvfrom(rqstp, vec, pnum, svsk->sk_reclen);
-	if (len < 0)
-		goto err_again;
+	len = svc_partial_recvfrom(rqstp, vec, pnum, want, base);
+	if (len >= 0)
+		svsk->sk_tcplen += len;
+	if (len != want) {
+		if (len < 0 && len != -EAGAIN)
+			goto err_other;
+		svc_tcp_save_pages(svsk, rqstp);
+		dprintk("svc: incomplete TCP record (%d of %d)\n",
+			svsk->sk_tcplen, svsk->sk_reclen);
+		goto err_noclose;
+	}
 
-	dprintk("svc: TCP complete record (%d bytes)\n", svsk->sk_reclen);
 	rqstp->rq_arg.len = svsk->sk_reclen;
 	rqstp->rq_arg.page_base = 0;
 	if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) {
@@ -1087,7 +1164,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	if (calldir) {
 		len = receive_cb_reply(svsk, rqstp);
 		if (len < 0)
-			goto err_again;
+			goto error;
 	}
 
 	/* Reset TCP read info */
@@ -1102,20 +1179,20 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	if (serv->sv_stats)
 		serv->sv_stats->nettcpcnt++;
 
+	dprintk("svc: TCP complete record (%d bytes)\n", rqstp->rq_arg.len);
 	return rqstp->rq_arg.len;
 
-err_again:
-	if (len == -EAGAIN) {
-		dprintk("RPC: TCP recvfrom got EAGAIN\n");
-		return len;
-	}
 error:
-	if (len != -EAGAIN) {
-		printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
-		       svsk->sk_xprt.xpt_server->sv_name, -len);
-		set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
-	}
+	if (len != -EAGAIN)
+		goto err_other;
+	dprintk("RPC: TCP recvfrom got EAGAIN\n");
 	return -EAGAIN;
+err_other:
+	printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
+	       svsk->sk_xprt.xpt_server->sv_name, -len);
+	set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+err_noclose:
+	return -EAGAIN;	/* record not complete */
 }
 
 /*
@@ -1286,6 +1363,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 
 		svsk->sk_reclen = 0;
 		svsk->sk_tcplen = 0;
+		memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages));
 
 		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
 
@@ -1544,8 +1622,10 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt)
 
 	svc_sock_detach(xprt);
 
-	if (!test_bit(XPT_LISTENER, &xprt->xpt_flags))
+	if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
+		svc_tcp_clear_pages(svsk);
 		kernel_sock_shutdown(svsk->sk_sock, SHUT_RDWR);
+	}
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 9660439861aa8dbd5e2b8087f33e20760c2c9afc Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <aglo@citi.umich.edu>
Date: Tue, 21 Oct 2008 14:13:47 -0400
Subject: svcrpc: take advantage of tcp autotuning

Allow the NFSv4 server to make use of TCP autotuning behaviour, which
was previously disabled by setting the sk_userlocks variable.

Set the receive buffers to be big enough to receive the whole RPC
request, and set this for the listening socket, not the accept socket.

Remove the code that readjusts the receive/send buffer sizes for the
accepted socket. Previously this code was used to influence the TCP
window management behaviour, which is no longer needed when autotuning
is enabled.

This can improve IO bandwidth on networks with high bandwidth-delay
products, where a large tcp window is required.  It also simplifies
performance tuning, since getting adequate tcp buffers previously
required increasing the number of nfsd threads.

Signed-off-by: Olga Kornievskaia <aglo@citi.umich.edu>
Cc: Jim Rees <rees@umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svcsock.c | 35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index a4fafcbc6ea..213dea8b283 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -436,7 +436,6 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
 	lock_sock(sock->sk);
 	sock->sk->sk_sndbuf = snd * 2;
 	sock->sk->sk_rcvbuf = rcv * 2;
-	sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
 	sock->sk->sk_write_space(sock->sk);
 	release_sock(sock->sk);
 #endif
@@ -973,23 +972,6 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
 	unsigned int want;
 	int len;
 
-	if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
-		/* sndbuf needs to have room for one request
-		 * per thread, otherwise we can stall even when the
-		 * network isn't a bottleneck.
-		 *
-		 * We count all threads rather than threads in a
-		 * particular pool, which provides an upper bound
-		 * on the number of threads which will access the socket.
-		 *
-		 * rcvbuf just needs to be able to hold a few requests.
-		 * Normally they will be removed from the queue
-		 * as soon a a complete request arrives.
-		 */
-		svc_sock_setbufsize(svsk->sk_sock,
-				    (serv->sv_nrthreads+3) * serv->sv_max_mesg,
-				    3 * serv->sv_max_mesg);
-
 	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
 	if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) {
@@ -1367,15 +1349,6 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 
 		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
 
-		/* initialise setting must have enough space to
-		 * receive and respond to one request.
-		 * svc_tcp_recvfrom will re-adjust if necessary
-		 */
-		svc_sock_setbufsize(svsk->sk_sock,
-				    3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
-				    3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
-
-		set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 		if (sk->sk_state != TCP_ESTABLISHED)
 			set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
@@ -1439,8 +1412,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 	/* Initialize the socket */
 	if (sock->type == SOCK_DGRAM)
 		svc_udp_init(svsk, serv);
-	else
+	else {
+		/* initialise setting must have enough space to
+		 * receive and respond to one request.
+		 */
+		svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg,
+					4 * serv->sv_max_mesg);
 		svc_tcp_init(svsk, serv);
+	}
 
 	dprintk("svc: svc_setup_socket created %p (inet %p)\n",
 				svsk, svsk->sk_sk);
-- 
cgit v1.2.3-70-g09d2


From 8985ef0b8af895c3b85a8c1b7108e0169fcbd20b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 9 Apr 2011 10:03:10 -0400
Subject: svcrpc: complete svsk processing on cb receive failure

Currently when there's some failure to receive a callback (because we
couldn't find a matching xid, for example), we exit svc_recv with
sk_tcplen still set but without any pages saved with the socket.  This
will cause a crash later in svc_tcp_restore_pages.

Instead, make sure we reset that tcp information whether the callback
received failed or succeeded.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svcsock.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 213dea8b283..af04f779ce9 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1143,11 +1143,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 
 	p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
 	calldir = p[1];
-	if (calldir) {
+	if (calldir)
 		len = receive_cb_reply(svsk, rqstp);
-		if (len < 0)
-			goto error;
-	}
 
 	/* Reset TCP read info */
 	svsk->sk_reclen = 0;
@@ -1156,6 +1153,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	if (svc_recv_available(svsk) > sizeof(rpc_fraghdr))
 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
+	if (len < 0)
+		goto error;
 
 	svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
 	if (serv->sv_stats)
-- 
cgit v1.2.3-70-g09d2


From aea93397db4b39c9d15443a0e7cc9a380ba990c6 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sun, 10 Apr 2011 10:35:12 -0400
Subject: nfsd: distinguish functions of NFSD_MAY_* flags

Most of the NFSD_MAY_* flags actually request permissions, but over the
years we've accreted a few that modify the behavior of the permission or
open code in other ways.

Distinguish the two cases a little more.  In particular, allow the
shortcut at the start of nfsd_permission to ignore the
non-permission-requesting bits.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 2 +-
 fs/nfsd/vfs.h | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2e1cebde90d..a76ef7e0b3d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2027,7 +2027,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	struct inode	*inode = dentry->d_inode;
 	int		err;
 
-	if (acc == NFSD_MAY_NOP)
+	if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP)
 		return 0;
 #if 0
 	dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 9a370a5e36b..1036913e6e8 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -17,6 +17,9 @@
 #define NFSD_MAY_SATTR		8
 #define NFSD_MAY_TRUNC		16
 #define NFSD_MAY_LOCK		32
+#define NFSD_MAY_MASK		63
+
+/* extra hints to permission and open routines: */
 #define NFSD_MAY_OWNER_OVERRIDE	64
 #define NFSD_MAY_LOCAL_ACCESS	128 /* IRIX doing local access check on device special file*/
 #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
-- 
cgit v1.2.3-70-g09d2


From 204f4ce75434c3453907813f8a819d4cf2a5728f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 8 Apr 2011 16:32:54 -0400
Subject: nfsd4: allow fh_verify caller to skip pseudoflavor checks

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsfh.c | 2 +-
 fs/nfsd/vfs.h   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 55c8e63af0b..90c6aa6d5e0 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -344,7 +344,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 	 * which clients virtually always use auth_sys for,
 	 * even while using RPCSEC_GSS for NFS.
 	 */
-	if (access & NFSD_MAY_LOCK)
+	if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS)
 		goto skip_pseudoflavor_check;
 	/*
 	 * Clients may expect to be able to use auth_sys during mount,
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 1036913e6e8..4d2509f766d 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -24,6 +24,7 @@
 #define NFSD_MAY_LOCAL_ACCESS	128 /* IRIX doing local access check on device special file*/
 #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
 #define NFSD_MAY_NOT_BREAK_LEASE 512
+#define NFSD_MAY_BYPASS_GSS	1024
 
 #define NFSD_MAY_CREATE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE)
 #define NFSD_MAY_REMOVE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
-- 
cgit v1.2.3-70-g09d2


From 22b03214962ec2a9748abc9987fc2e66dec4626d Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 9 Apr 2011 11:23:24 -0400
Subject: nfsd4: introduce OPDESC helper

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5fcb1396a7e..126b8f75b57 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1031,6 +1031,11 @@ static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
 	return nfs_ok;
 }
 
+static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
+{
+	return &nfsd4_ops[op->opnum];
+}
+
 /*
  * COMPOUND call.
  */
@@ -1108,7 +1113,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 			goto encode_op;
 		}
 
-		opdesc = &nfsd4_ops[op->opnum];
+		opdesc = OPDESC(op);
 
 		if (!cstate->current_fh.fh_dentry) {
 			if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) {
-- 
cgit v1.2.3-70-g09d2


From 29a78a3ed7fc9c4ee49962751eb321b038c190a2 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 9 Apr 2011 11:28:53 -0400
Subject: nfsd4: make fh_verify responsibility of nfsd_lookup_dentry caller

The secinfo caller actually won't want this.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 3 +++
 fs/nfsd/vfs.c      | 9 +++------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 126b8f75b57..8059adae013 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -762,6 +762,9 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	__be32 err;
 
 	fh_init(&resfh, NFS4_FHSIZE);
+	err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC);
+	if (err)
+		return err;
 	err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
 				    secinfo->si_name, secinfo->si_namelen,
 				    &exp, &dentry);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a76ef7e0b3d..e53313972c3 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -181,16 +181,10 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	struct svc_export	*exp;
 	struct dentry		*dparent;
 	struct dentry		*dentry;
-	__be32			err;
 	int			host_err;
 
 	dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
 
-	/* Obtain dentry and export. */
-	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
-	if (err)
-		return err;
-
 	dparent = fhp->fh_dentry;
 	exp  = fhp->fh_export;
 	exp_get(exp);
@@ -254,6 +248,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 	struct dentry		*dentry;
 	__be32 err;
 
+	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+	if (err)
+		return err;
 	err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
 	if (err)
 		return err;
-- 
cgit v1.2.3-70-g09d2


From af986d101d141f10231ffa7e40ae397dc7356857 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <peter@pcc.me.uk>
Date: Sun, 24 Apr 2011 22:09:32 +0100
Subject: ACPI: EC: add another DMI check for ASUS hardware

Commit 0adf3c746a73684b3f8c2821a584e1db998f61e9 introduced a regression
by making the ECDT validation test for ASUS hardware more restrictive.
The previous test used the dmi_name_in_vendors function which searches
a number of DMI fields, while the new test checked only the BIOS
vendor, which is known to not match on an ASUS F5GL laptop which
requires ECDT validation.

Add a rule to ec_dmi_table based on an alternative DMI pattern for
ASUS hardware as found elsewhere in the kernel.

Signed-off-by: Peter Collingbourne <peter@pcc.me.uk>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/ec.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index fa848c4116a..30ca71791bf 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -940,6 +940,9 @@ static struct dmi_system_id __initdata ec_dmi_table[] = {
 	{
 	ec_validate_ecdt, "ASUS hardware", {
 	DMI_MATCH(DMI_BIOS_VENDOR, "ASUS") }, NULL},
+	{
+	ec_validate_ecdt, "ASUS hardware", {
+	DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer Inc.") }, NULL},
 	{},
 };
 
-- 
cgit v1.2.3-70-g09d2


From 68d93184352f2e723f135b0a9bad93b58f9d120b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 8 Apr 2011 17:00:50 -0400
Subject: nfsd4: fix wrongsec handling for PUTFH + op cases

When PUTFH is followed by an operation that uses the filehandle, and
when the current client is using a security flavor that is inconsistent
with the given filehandle, we have a choice: we can return WRONGSEC
either when the current filehandle is set using the PUTFH, or when the
filehandle is first used by the following operation.

Follow the recommendations of RFC 5661 in making this choice.

(Our current behavior prevented the client from doing security
negotiation by returning WRONGSEC on PUTFH+SECINFO_NO_NAME.)

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/export.c   |  6 ------
 fs/nfsd/nfs4proc.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index ad000aeb21a..b9566e46219 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1354,12 +1354,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
 	if (IS_ERR(exp))
 		return nfserrno(PTR_ERR(exp));
 	rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
-	if (rv)
-		goto out;
-	rv = check_nfsd_access(exp, rqstp);
-	if (rv)
-		fh_put(fhp);
-out:
 	exp_put(exp);
 	return rv;
 }
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8059adae013..ad32568a1aa 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -403,7 +403,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
 	memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
 	       putfh->pf_fhlen);
-	return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
+	return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
 }
 
 static __be32
@@ -989,6 +989,9 @@ enum nfsd4_op_flags {
 	ALLOWED_WITHOUT_FH = 1 << 0,	/* No current filehandle required */
 	ALLOWED_ON_ABSENT_FS = 1 << 1,	/* ops processed on absent fs */
 	ALLOWED_AS_FIRST_OP = 1 << 2,	/* ops reqired first in compound */
+	/* For rfc 5661 section 2.6.3.1.1: */
+	OP_HANDLES_WRONGSEC = 1 << 3,
+	OP_IS_PUTFH_LIKE = 1 << 4,
 };
 
 struct nfsd4_operation {
@@ -1039,6 +1042,39 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
 	return &nfsd4_ops[op->opnum];
 }
 
+static bool need_wrongsec_check(struct svc_rqst *rqstp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+	struct nfsd4_op *this = &argp->ops[resp->opcnt - 1];
+	struct nfsd4_op *next = &argp->ops[resp->opcnt];
+	struct nfsd4_operation *thisd;
+	struct nfsd4_operation *nextd;
+
+	thisd = OPDESC(this);
+	/*
+	 * Most ops check wronsec on our own; only the putfh-like ops
+	 * have special rules.
+	 */
+	if (!(thisd->op_flags & OP_IS_PUTFH_LIKE))
+		return false;
+	/*
+	 * rfc 5661 2.6.3.1.1.6: don't bother erroring out a
+	 * put-filehandle operation if we're not going to use the
+	 * result:
+	 */
+	if (argp->opcnt == resp->opcnt)
+		return false;
+
+	nextd = OPDESC(next);
+	/*
+	 * Rest of 2.6.3.1.1: certain operations will return WRONGSEC
+	 * errors themselves as necessary; others should check for them
+	 * now:
+	 */
+	return !(nextd->op_flags & OP_HANDLES_WRONGSEC);
+}
+
 /*
  * COMPOUND call.
  */
@@ -1134,6 +1170,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		else
 			BUG_ON(op->status == nfs_ok);
 
+		if (!op->status && need_wrongsec_check(rqstp))
+			op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp);
+
 encode_op:
 		/* Only from SEQUENCE */
 		if (resp->cstate.status == nfserr_replay_cache) {
@@ -1225,10 +1264,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_LOOKUP] = {
 		.op_func = (nfsd4op_func)nfsd4_lookup,
+		.op_flags = OP_HANDLES_WRONGSEC,
 		.op_name = "OP_LOOKUP",
 	},
 	[OP_LOOKUPP] = {
 		.op_func = (nfsd4op_func)nfsd4_lookupp,
+		.op_flags = OP_HANDLES_WRONGSEC,
 		.op_name = "OP_LOOKUPP",
 	},
 	[OP_NVERIFY] = {
@@ -1237,6 +1278,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_OPEN] = {
 		.op_func = (nfsd4op_func)nfsd4_open,
+		.op_flags = OP_HANDLES_WRONGSEC,
 		.op_name = "OP_OPEN",
 	},
 	[OP_OPEN_CONFIRM] = {
@@ -1249,17 +1291,20 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_PUTFH] = {
 		.op_func = (nfsd4op_func)nfsd4_putfh,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_IS_PUTFH_LIKE,
 		.op_name = "OP_PUTFH",
 	},
 	[OP_PUTPUBFH] = {
 		.op_func = (nfsd4op_func)nfsd4_putrootfh,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_IS_PUTFH_LIKE,
 		.op_name = "OP_PUTPUBFH",
 	},
 	[OP_PUTROOTFH] = {
 		.op_func = (nfsd4op_func)nfsd4_putrootfh,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_IS_PUTFH_LIKE,
 		.op_name = "OP_PUTROOTFH",
 	},
 	[OP_READ] = {
@@ -1289,15 +1334,18 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_RESTOREFH] = {
 		.op_func = (nfsd4op_func)nfsd4_restorefh,
-		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+				| OP_IS_PUTFH_LIKE,
 		.op_name = "OP_RESTOREFH",
 	},
 	[OP_SAVEFH] = {
 		.op_func = (nfsd4op_func)nfsd4_savefh,
+		.op_flags = OP_HANDLES_WRONGSEC,
 		.op_name = "OP_SAVEFH",
 	},
 	[OP_SECINFO] = {
 		.op_func = (nfsd4op_func)nfsd4_secinfo,
+		.op_flags = OP_HANDLES_WRONGSEC,
 		.op_name = "OP_SECINFO",
 	},
 	[OP_SETATTR] = {
@@ -1361,6 +1409,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_SECINFO_NO_NAME] = {
 		.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
+		.op_flags = OP_HANDLES_WRONGSEC,
 		.op_name = "OP_SECINFO_NO_NAME",
 	},
 };
-- 
cgit v1.2.3-70-g09d2


From ac6721a13e5b1a90728e790600f827a5e5f5da2f Mon Sep 17 00:00:00 2001
From: Mi Jinlong <mijinlong@cn.fujitsu.com>
Date: Wed, 20 Apr 2011 17:06:25 +0800
Subject: nfsd41: make sure nfs server process OPEN with EXCLUSIVE4_1 correctly

The NFS server uses nfsd_create_v3 to handle EXCLUSIVE4_1 opens, but
that function is not prepared to handle them.

Rename nfsd_create_v3() to do_nfsd_create(), and add handling of
EXCLUSIVE4_1.

Signed-off-by: Mi Jinlong <mijinlong@cn.fujitsu.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs3proc.c |  2 +-
 fs/nfsd/nfs4proc.c |  4 ++--
 fs/nfsd/vfs.c      | 20 ++++++++++++++++----
 fs/nfsd/vfs.h      |  2 +-
 4 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 2247fc91d5e..9095f3c21df 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -245,7 +245,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 	}
 
 	/* Now create the file and set attributes */
-	nfserr = nfsd_create_v3(rqstp, dirfhp, argp->name, argp->len,
+	nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
 				attr, newfhp,
 				argp->createmode, argp->verf, NULL, NULL);
 
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ad32568a1aa..3a6dbd70b34 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -196,9 +196,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 
 		/*
 		 * Note: create modes (UNCHECKED,GUARDED...) are the same
-		 * in NFSv4 as in v3.
+		 * in NFSv4 as in v3 except EXCLUSIVE4_1.
 		 */
-		status = nfsd_create_v3(rqstp, current_fh, open->op_fname.data,
+		status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
 					open->op_fname.len, &open->op_iattr,
 					&resfh, open->op_createmode,
 					(u32 *)open->op_verf.data,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index e53313972c3..389e45ad200 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1337,11 +1337,18 @@ out_nfserr:
 }
 
 #ifdef CONFIG_NFSD_V3
+
+static inline int nfsd_create_is_exclusive(int createmode)
+{
+	return createmode == NFS3_CREATE_EXCLUSIVE
+	       || createmode == NFS4_CREATE_EXCLUSIVE4_1;
+}
+
 /*
- * NFSv3 version of nfsd_create
+ * NFSv3 and NFSv4 version of nfsd_create
  */
 __be32
-nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
+do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		char *fname, int flen, struct iattr *iap,
 		struct svc_fh *resfhp, int createmode, u32 *verifier,
 	        int *truncp, int *created)
@@ -1386,7 +1393,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (err)
 		goto out;
 
-	if (createmode == NFS3_CREATE_EXCLUSIVE) {
+	if (nfsd_create_is_exclusive(createmode)) {
 		/* solaris7 gets confused (bugid 4218508) if these have
 		 * the high bit set, so just clear the high bits. If this is
 		 * ever changed to use different attrs for storing the
@@ -1427,6 +1434,11 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			    && dchild->d_inode->i_atime.tv_sec == v_atime
 			    && dchild->d_inode->i_size  == 0 )
 				break;
+		case NFS4_CREATE_EXCLUSIVE4_1:
+			if (   dchild->d_inode->i_mtime.tv_sec == v_mtime
+			    && dchild->d_inode->i_atime.tv_sec == v_atime
+			    && dchild->d_inode->i_size  == 0 )
+				goto set_attr;
 			 /* fallthru */
 		case NFS3_CREATE_GUARDED:
 			err = nfserr_exist;
@@ -1445,7 +1457,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 
 	nfsd_check_ignore_resizing(iap);
 
-	if (createmode == NFS3_CREATE_EXCLUSIVE) {
+	if (nfsd_create_is_exclusive(createmode)) {
 		/* Cram the verifier into atime/mtime */
 		iap->ia_valid = ATTR_MTIME|ATTR_ATIME
 			| ATTR_MTIME_SET|ATTR_ATIME_SET;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 4d2509f766d..e0bbac04d1d 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -58,7 +58,7 @@ __be32		nfsd_create(struct svc_rqst *, struct svc_fh *,
 				int type, dev_t rdev, struct svc_fh *res);
 #ifdef CONFIG_NFSD_V3
 __be32		nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
-__be32		nfsd_create_v3(struct svc_rqst *, struct svc_fh *,
+__be32		do_nfsd_create(struct svc_rqst *, struct svc_fh *,
 				char *name, int len, struct iattr *attrs,
 				struct svc_fh *res, int createmode,
 				u32 *verifier, int *truncp, int *created);
-- 
cgit v1.2.3-70-g09d2


From a62573dc353b255dbbc8520bfdcb1c8a8c1ada87 Mon Sep 17 00:00:00 2001
From: Mi Jinlong <mijinlong@cn.fujitsu.com>
Date: Wed, 23 Mar 2011 17:57:07 +0800
Subject: nfsd41: add flag checking for create_session

Teach the NFS server to reject invalid create_session flags.

Also do some minor formatting adjustments.

Signed-off-by: Mi Jinlong <mijinlong@cn.fujitsu.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c  | 3 +++
 include/linux/nfs4.h | 8 +++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fbde6f79922..6dbaa379c86 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1515,6 +1515,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	bool confirm_me = false;
 	int status = 0;
 
+	if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
+		return nfserr_inval;
+
 	nfs4_lock_state();
 	unconf = find_unconfirmed_client(&cr_ses->clientid);
 	conf = find_confirmed_client(&cr_ses->clientid);
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index b528f6d4b86..8937727e703 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -570,9 +570,11 @@ struct nfs4_sessionid {
 };
 
 /* Create Session Flags */
-#define SESSION4_PERSIST	 0x001
-#define SESSION4_BACK_CHAN 	 0x002
-#define SESSION4_RDMA		 0x004
+#define SESSION4_PERSIST	0x001
+#define SESSION4_BACK_CHAN	0x002
+#define SESSION4_RDMA		0x004
+
+#define SESSION4_FLAG_MASK_A	0x007
 
 enum state_protect_how4 {
 	SP4_NONE	= 0,
-- 
cgit v1.2.3-70-g09d2


From b7c66360dc34e64742edabf4dc410070aa883119 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 22 Apr 2011 12:45:59 -0400
Subject: nfsd v4.1 lOCKT clientid field must be ignored

RFC 5661 Section 18.11.3

   The clientid field of the owner MAY be set to any value by the client
   and MUST be ignored by the server.  The reason the server MUST ignore
   the clientid field is that the server MUST derive the client ID from
   the session ID from the SEQUENCE operation of the COMPOUND request.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c6766af00d9..6e7f10a8935 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -588,8 +588,6 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
 	READ_BUF(lockt->lt_owner.len);
 	READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
 
-	if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
-		return nfserr_inval;
 	DECODE_TAIL;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 868b89c3dc25eec03985b31ff070e8f73c818980 Mon Sep 17 00:00:00 2001
From: Mi Jinlong <mijinlong@cn.fujitsu.com>
Date: Wed, 27 Apr 2011 09:09:58 +0800
Subject: nfsd41: compare request's opcnt with session's maxops at
 nfsd4_sequence

Make sure nfs server errors out if request contains more ops
than channel allows.

Signed-off-by: Mi Jinlong <mijinlong@cn.fujitsu.com>
[bfields@redhat.com: use helper function]
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6dbaa379c86..3196dc38857 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1724,6 +1724,13 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
 	return;
 }
 
+static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
+{
+	struct nfsd4_compoundargs *args = rqstp->rq_argp;
+
+	return args->opcnt > session->se_fchannel.maxops;
+}
+
 __be32
 nfsd4_sequence(struct svc_rqst *rqstp,
 	       struct nfsd4_compound_state *cstate,
@@ -1752,6 +1759,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (!session)
 		goto out;
 
+	status = nfserr_too_many_ops;
+	if (nfsd4_session_too_many_ops(rqstp, session))
+		goto out;
+
 	status = nfserr_badslot;
 	if (seq->slotid >= session->se_fchannel.maxreqs)
 		goto out;
-- 
cgit v1.2.3-70-g09d2


From bcecf1ccc336200ee488e8eb68acdafc4b0dbd1a Mon Sep 17 00:00:00 2001
From: Mi Jinlong <mijinlong@cn.fujitsu.com>
Date: Wed, 27 Apr 2011 09:14:30 +0800
Subject: nfsd41: error out on repeated RECLAIM_COMPLETE

Servers are supposed to return nfserr_complete_already to clients that
attempt to send multiple RECLAIM_COMPLETEs.

Signed-off-by: Mi Jinlong <mijinlong@cn.fujitsu.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3196dc38857..2bb03f86a03 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1818,6 +1818,8 @@ out:
 __be32
 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
 {
+	int status = 0;
+
 	if (rc->rca_one_fs) {
 		if (!cstate->current_fh.fh_dentry)
 			return nfserr_nofilehandle;
@@ -1827,9 +1829,14 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
 		 */
 		 return nfs_ok;
 	}
+
 	nfs4_lock_state();
-	if (is_client_expired(cstate->session->se_client)) {
-		nfs4_unlock_state();
+	status = nfserr_complete_already;
+	if (cstate->session->se_client->cl_firststate)
+		goto out;
+
+	status = nfserr_stale_clientid;
+	if (is_client_expired(cstate->session->se_client))
 		/*
 		 * The following error isn't really legal.
 		 * But we only get here if the client just explicitly
@@ -1837,11 +1844,13 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
 		 * error it gets back on an operation for the dead
 		 * client.
 		 */
-		return nfserr_stale_clientid;
-	}
+		goto out;
+
+	status = nfs_ok;
 	nfsd4_create_clid_dir(cstate->session->se_client);
+out:
 	nfs4_unlock_state();
-	return nfs_ok;
+	return status;
 }
 
 __be32
-- 
cgit v1.2.3-70-g09d2


From fccb13c947de83a368e1f3c2216bbf4d8d41efa1 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Wed, 27 Apr 2011 15:47:14 -0400
Subject: NFSD: Remove setting unused variable in nfsd_vfs_read()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Compiling gave me this warning:
fs/nfsd/vfs.c: In function ‘nfsd_vfs_read’:
fs/nfsd/vfs.c:880:16: warning: variable ‘inode’ set but not used
[-Wunused-but-set-variable]

I discovered that a local variable "inode" was being set towards the
beginning of nfsd_vfs_read() and then ignored for the rest of the
function.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 389e45ad200..f4e056ae53e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -874,13 +874,11 @@ static __be32
 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
               loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
 {
-	struct inode *inode;
 	mm_segment_t	oldfs;
 	__be32		err;
 	int		host_err;
 
 	err = nfserr_perm;
-	inode = file->f_path.dentry->d_inode;
 
 	if (file->f_op->splice_read && rqstp->rq_splice_ok) {
 		struct splice_desc sd = {
-- 
cgit v1.2.3-70-g09d2


From 1db2b9dde3317e181f76860410cb0e7433896f28 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Wed, 27 Apr 2011 15:47:15 -0400
Subject: NFSD: Check status from nfsd4_map_bcts_dir()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Compiling gave me this warning:
fs/nfsd/nfs4state.c: In function ‘nfsd4_bind_conn_to_session’:
fs/nfsd/nfs4state.c:1623:9: warning: variable ‘status’ set but not used
[-Wunused-but-set-variable]

The local variable "status" was being set by nfsd4_map_bcts_dir() and
then ignored before calling nfsd4_new_conn().

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2bb03f86a03..a2ea14f40b4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1636,8 +1636,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
 		return nfserr_badsession;
 
 	status = nfsd4_map_bcts_dir(&bcts->dir);
-	nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
-	return nfs_ok;
+	if (!status)
+		nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
+	return status;
 }
 
 static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
-- 
cgit v1.2.3-70-g09d2


From 6ce2357f1e73e0da8ebeace6ec829f48a646bb8c Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Wed, 27 Apr 2011 15:47:16 -0400
Subject: NFSD: Remove unused variable from nfsd4_decode_bind_conn_to_session()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Compiling gave me this warning:
fs/nfsd/nfs4xdr.c: In function ‘nfsd4_decode_bind_conn_to_session’:
fs/nfsd/nfs4xdr.c:427:6: warning: variable ‘dummy’ set but not used
[-Wunused-but-set-variable]

The local variable "dummy" wasn't being used past the READ32() macro that
set it.  READ_BUF() should ensure that the xdr buffer is pushed past the
data read into dummy already, so nothing needs to be read in.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
[bfields@redhat.com: minor comment fixup.]
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 6e7f10a8935..195a91d9405 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -424,15 +424,12 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
 static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
 {
 	DECODE_HEAD;
-	u32 dummy;
 
 	READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
 	COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
 	READ32(bcts->dir);
-	/* XXX: Perhaps Tom Tucker could help us figure out how we
-	 * should be using ctsa_use_conn_in_rdma_mode: */
-	READ32(dummy);
-
+	/* XXX: skipping ctsa_use_conn_in_rdma_mode.  Perhaps Tom Tucker
+	 * could help us figure out we should be using it. */
 	DECODE_TAIL;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 9b3aa589eaa1366200062ce1f9cc7ddca8d1d578 Mon Sep 17 00:00:00 2001
From: Nicolas Ferre <nicolas.ferre@atmel.com>
Date: Sat, 30 Apr 2011 16:57:45 +0200
Subject: dmaengine: at_hdmac: modify way to use interrupts

Now we use Buffer Transfer Completed interrupts. If we
want a chained buffer completed information, we setup the
ATC_IEN bit in CTRLB register in the lli.
This is done by set_desc_eol() function and used by
memcpy/slave_sg functions.

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/at_hdmac.c      |  4 ++--
 drivers/dma/at_hdmac_regs.h | 11 ++++++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index 3d7d705f026..13050e646f8 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -464,7 +464,7 @@ static irqreturn_t at_dma_interrupt(int irq, void *dev_id)
 
 		for (i = 0; i < atdma->dma_common.chancnt; i++) {
 			atchan = &atdma->chan[i];
-			if (pending & (AT_DMA_CBTC(i) | AT_DMA_ERR(i))) {
+			if (pending & (AT_DMA_BTC(i) | AT_DMA_ERR(i))) {
 				if (pending & AT_DMA_ERR(i)) {
 					/* Disable channel on AHB error */
 					dma_writel(atdma, CHDR, atchan->mask);
@@ -549,7 +549,7 @@ atc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 	}
 
 	ctrla =   ATC_DEFAULT_CTRLA;
-	ctrlb =   ATC_DEFAULT_CTRLB
+	ctrlb =   ATC_DEFAULT_CTRLB | ATC_IEN
 		| ATC_SRC_ADDR_MODE_INCR
 		| ATC_DST_ADDR_MODE_INCR
 		| ATC_FC_MEM2MEM;
diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h
index 495457e3dc4..8303306ea82 100644
--- a/drivers/dma/at_hdmac_regs.h
+++ b/drivers/dma/at_hdmac_regs.h
@@ -309,8 +309,8 @@ static void atc_setup_irq(struct at_dma_chan *atchan, int on)
 	struct at_dma	*atdma = to_at_dma(atchan->chan_common.device);
 	u32		ebci;
 
-	/* enable interrupts on buffer chain completion & error */
-	ebci =    AT_DMA_CBTC(atchan->chan_common.chan_id)
+	/* enable interrupts on buffer transfer completion & error */
+	ebci =    AT_DMA_BTC(atchan->chan_common.chan_id)
 		| AT_DMA_ERR(atchan->chan_common.chan_id);
 	if (on)
 		dma_writel(atdma, EBCIER, ebci);
@@ -347,7 +347,12 @@ static inline int atc_chan_is_enabled(struct at_dma_chan *atchan)
  */
 static void set_desc_eol(struct at_desc *desc)
 {
-	desc->lli.ctrlb |= ATC_SRC_DSCR_DIS | ATC_DST_DSCR_DIS;
+	u32 ctrlb = desc->lli.ctrlb;
+
+	ctrlb &= ~ATC_IEN;
+	ctrlb |= ATC_SRC_DSCR_DIS | ATC_DST_DSCR_DIS;
+
+	desc->lli.ctrlb = ctrlb;
 	desc->lli.dscr = 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 53830cc75974a199b6b654c062ff8c54c58caa0b Mon Sep 17 00:00:00 2001
From: Nicolas Ferre <nicolas.ferre@atmel.com>
Date: Sat, 30 Apr 2011 16:57:46 +0200
Subject: dmaengine: at_hdmac: add cyclic DMA operation support

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/at_hdmac.c      | 231 +++++++++++++++++++++++++++++++++++++++++---
 drivers/dma/at_hdmac_regs.h |  14 ++-
 2 files changed, 229 insertions(+), 16 deletions(-)

diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index 13050e646f8..ed9d92429de 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -164,6 +164,29 @@ static void atc_desc_put(struct at_dma_chan *atchan, struct at_desc *desc)
 	}
 }
 
+/**
+ * atc_desc_chain - build chain adding a descripor
+ * @first: address of first descripor of the chain
+ * @prev: address of previous descripor of the chain
+ * @desc: descriptor to queue
+ *
+ * Called from prep_* functions
+ */
+static void atc_desc_chain(struct at_desc **first, struct at_desc **prev,
+			   struct at_desc *desc)
+{
+	if (!(*first)) {
+		*first = desc;
+	} else {
+		/* inform the HW lli about chaining */
+		(*prev)->lli.dscr = desc->txd.phys;
+		/* insert the link descriptor to the LD ring */
+		list_add_tail(&desc->desc_node,
+				&(*first)->tx_list);
+	}
+	*prev = desc;
+}
+
 /**
  * atc_assign_cookie - compute and assign new cookie
  * @atchan: channel we work on
@@ -237,16 +260,12 @@ static void atc_dostart(struct at_dma_chan *atchan, struct at_desc *first)
 static void
 atc_chain_complete(struct at_dma_chan *atchan, struct at_desc *desc)
 {
-	dma_async_tx_callback		callback;
-	void				*param;
 	struct dma_async_tx_descriptor	*txd = &desc->txd;
 
 	dev_vdbg(chan2dev(&atchan->chan_common),
 		"descriptor %u complete\n", txd->cookie);
 
 	atchan->completed_cookie = txd->cookie;
-	callback = txd->callback;
-	param = txd->callback_param;
 
 	/* move children to free_list */
 	list_splice_init(&desc->tx_list, &atchan->free_list);
@@ -278,12 +297,19 @@ atc_chain_complete(struct at_dma_chan *atchan, struct at_desc *desc)
 		}
 	}
 
-	/*
-	 * The API requires that no submissions are done from a
-	 * callback, so we don't need to drop the lock here
-	 */
-	if (callback)
-		callback(param);
+	/* for cyclic transfers,
+	 * no need to replay callback function while stopping */
+	if (!test_bit(ATC_IS_CYCLIC, &atchan->status)) {
+		dma_async_tx_callback	callback = txd->callback;
+		void			*param = txd->callback_param;
+
+		/*
+		 * The API requires that no submissions are done from a
+		 * callback, so we don't need to drop the lock here
+		 */
+		if (callback)
+			callback(param);
+	}
 
 	dma_run_dependencies(txd);
 }
@@ -419,6 +445,26 @@ static void atc_handle_error(struct at_dma_chan *atchan)
 	atc_chain_complete(atchan, bad_desc);
 }
 
+/**
+ * atc_handle_cyclic - at the end of a period, run callback function
+ * @atchan: channel used for cyclic operations
+ *
+ * Called with atchan->lock held and bh disabled
+ */
+static void atc_handle_cyclic(struct at_dma_chan *atchan)
+{
+	struct at_desc			*first = atc_first_active(atchan);
+	struct dma_async_tx_descriptor	*txd = &first->txd;
+	dma_async_tx_callback		callback = txd->callback;
+	void				*param = txd->callback_param;
+
+	dev_vdbg(chan2dev(&atchan->chan_common),
+			"new cyclic period llp 0x%08x\n",
+			channel_readl(atchan, DSCR));
+
+	if (callback)
+		callback(param);
+}
 
 /*--  IRQ & Tasklet  ---------------------------------------------------*/
 
@@ -434,8 +480,10 @@ static void atc_tasklet(unsigned long data)
 	}
 
 	spin_lock(&atchan->lock);
-	if (test_and_clear_bit(0, &atchan->error_status))
+	if (test_and_clear_bit(ATC_IS_ERROR, &atchan->status))
 		atc_handle_error(atchan);
+	else if (test_bit(ATC_IS_CYCLIC, &atchan->status))
+		atc_handle_cyclic(atchan);
 	else
 		atc_advance_work(atchan);
 
@@ -469,7 +517,7 @@ static irqreturn_t at_dma_interrupt(int irq, void *dev_id)
 					/* Disable channel on AHB error */
 					dma_writel(atdma, CHDR, atchan->mask);
 					/* Give information to tasklet */
-					set_bit(0, &atchan->error_status);
+					set_bit(ATC_IS_ERROR, &atchan->status);
 				}
 				tasklet_schedule(&atchan->tasklet);
 				ret = IRQ_HANDLED;
@@ -759,6 +807,148 @@ err_desc_get:
 	return NULL;
 }
 
+/**
+ * atc_dma_cyclic_check_values
+ * Check for too big/unaligned periods and unaligned DMA buffer
+ */
+static int
+atc_dma_cyclic_check_values(unsigned int reg_width, dma_addr_t buf_addr,
+		size_t period_len, enum dma_data_direction direction)
+{
+	if (period_len > (ATC_BTSIZE_MAX << reg_width))
+		goto err_out;
+	if (unlikely(period_len & ((1 << reg_width) - 1)))
+		goto err_out;
+	if (unlikely(buf_addr & ((1 << reg_width) - 1)))
+		goto err_out;
+	if (unlikely(!(direction & (DMA_TO_DEVICE | DMA_FROM_DEVICE))))
+		goto err_out;
+
+	return 0;
+
+err_out:
+	return -EINVAL;
+}
+
+/**
+ * atc_dma_cyclic_fill_desc - Fill one period decriptor
+ */
+static int
+atc_dma_cyclic_fill_desc(struct at_dma_slave *atslave, struct at_desc *desc,
+		unsigned int period_index, dma_addr_t buf_addr,
+		size_t period_len, enum dma_data_direction direction)
+{
+	u32		ctrla;
+	unsigned int	reg_width = atslave->reg_width;
+
+	/* prepare common CRTLA value */
+	ctrla =   ATC_DEFAULT_CTRLA | atslave->ctrla
+		| ATC_DST_WIDTH(reg_width)
+		| ATC_SRC_WIDTH(reg_width)
+		| period_len >> reg_width;
+
+	switch (direction) {
+	case DMA_TO_DEVICE:
+		desc->lli.saddr = buf_addr + (period_len * period_index);
+		desc->lli.daddr = atslave->tx_reg;
+		desc->lli.ctrla = ctrla;
+		desc->lli.ctrlb = ATC_DEFAULT_CTRLB
+				| ATC_DST_ADDR_MODE_FIXED
+				| ATC_SRC_ADDR_MODE_INCR
+				| ATC_FC_MEM2PER;
+		break;
+
+	case DMA_FROM_DEVICE:
+		desc->lli.saddr = atslave->rx_reg;
+		desc->lli.daddr = buf_addr + (period_len * period_index);
+		desc->lli.ctrla = ctrla;
+		desc->lli.ctrlb = ATC_DEFAULT_CTRLB
+				| ATC_DST_ADDR_MODE_INCR
+				| ATC_SRC_ADDR_MODE_FIXED
+				| ATC_FC_PER2MEM;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * atc_prep_dma_cyclic - prepare the cyclic DMA transfer
+ * @chan: the DMA channel to prepare
+ * @buf_addr: physical DMA address where the buffer starts
+ * @buf_len: total number of bytes for the entire buffer
+ * @period_len: number of bytes for each period
+ * @direction: transfer direction, to or from device
+ */
+static struct dma_async_tx_descriptor *
+atc_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
+		size_t period_len, enum dma_data_direction direction)
+{
+	struct at_dma_chan	*atchan = to_at_dma_chan(chan);
+	struct at_dma_slave	*atslave = chan->private;
+	struct at_desc		*first = NULL;
+	struct at_desc		*prev = NULL;
+	unsigned long		was_cyclic;
+	unsigned int		periods = buf_len / period_len;
+	unsigned int		i;
+
+	dev_vdbg(chan2dev(chan), "prep_dma_cyclic: %s buf@0x%08x - %d (%d/%d)\n",
+			direction == DMA_TO_DEVICE ? "TO DEVICE" : "FROM DEVICE",
+			buf_addr,
+			periods, buf_len, period_len);
+
+	if (unlikely(!atslave || !buf_len || !period_len)) {
+		dev_dbg(chan2dev(chan), "prep_dma_cyclic: length is zero!\n");
+		return NULL;
+	}
+
+	was_cyclic = test_and_set_bit(ATC_IS_CYCLIC, &atchan->status);
+	if (was_cyclic) {
+		dev_dbg(chan2dev(chan), "prep_dma_cyclic: channel in use!\n");
+		return NULL;
+	}
+
+	/* Check for too big/unaligned periods and unaligned DMA buffer */
+	if (atc_dma_cyclic_check_values(atslave->reg_width, buf_addr,
+					period_len, direction))
+		goto err_out;
+
+	/* build cyclic linked list */
+	for (i = 0; i < periods; i++) {
+		struct at_desc	*desc;
+
+		desc = atc_desc_get(atchan);
+		if (!desc)
+			goto err_desc_get;
+
+		if (atc_dma_cyclic_fill_desc(atslave, desc, i, buf_addr,
+						period_len, direction))
+			goto err_desc_get;
+
+		atc_desc_chain(&first, &prev, desc);
+	}
+
+	/* lets make a cyclic list */
+	prev->lli.dscr = first->txd.phys;
+
+	/* First descriptor of the chain embedds additional information */
+	first->txd.cookie = -EBUSY;
+	first->len = buf_len;
+
+	return &first->txd;
+
+err_desc_get:
+	dev_err(chan2dev(chan), "not enough descriptors available\n");
+	atc_desc_put(atchan, first);
+err_out:
+	clear_bit(ATC_IS_CYCLIC, &atchan->status);
+	return NULL;
+}
+
+
 static int atc_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
 		       unsigned long arg)
 {
@@ -793,6 +983,9 @@ static int atc_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
 	list_for_each_entry_safe(desc, _desc, &list, desc_node)
 		atc_chain_complete(atchan, desc);
 
+	/* if channel dedicated to cyclic operations, free it */
+	clear_bit(ATC_IS_CYCLIC, &atchan->status);
+
 	spin_unlock_bh(&atchan->lock);
 
 	return 0;
@@ -853,6 +1046,10 @@ static void atc_issue_pending(struct dma_chan *chan)
 
 	dev_vdbg(chan2dev(chan), "issue_pending\n");
 
+	/* Not needed for cyclic transfers */
+	if (test_bit(ATC_IS_CYCLIC, &atchan->status))
+		return;
+
 	spin_lock_bh(&atchan->lock);
 	if (!atc_chan_is_enabled(atchan)) {
 		atc_advance_work(atchan);
@@ -959,6 +1156,7 @@ static void atc_free_chan_resources(struct dma_chan *chan)
 	}
 	list_splice_init(&atchan->free_list, &list);
 	atchan->descs_allocated = 0;
+	atchan->status = 0;
 
 	dev_vdbg(chan2dev(chan), "free_chan_resources: done\n");
 }
@@ -1092,10 +1290,15 @@ static int __init at_dma_probe(struct platform_device *pdev)
 	if (dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask))
 		atdma->dma_common.device_prep_dma_memcpy = atc_prep_dma_memcpy;
 
-	if (dma_has_cap(DMA_SLAVE, atdma->dma_common.cap_mask)) {
+	if (dma_has_cap(DMA_SLAVE, atdma->dma_common.cap_mask))
 		atdma->dma_common.device_prep_slave_sg = atc_prep_slave_sg;
+
+	if (dma_has_cap(DMA_CYCLIC, atdma->dma_common.cap_mask))
+		atdma->dma_common.device_prep_dma_cyclic = atc_prep_dma_cyclic;
+
+	if (dma_has_cap(DMA_SLAVE, atdma->dma_common.cap_mask) ||
+	    dma_has_cap(DMA_CYCLIC, atdma->dma_common.cap_mask))
 		atdma->dma_common.device_control = atc_control;
-	}
 
 	dma_writel(atdma, EN, AT_DMA_ENABLE);
 
diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h
index 8303306ea82..c79a9e07f7b 100644
--- a/drivers/dma/at_hdmac_regs.h
+++ b/drivers/dma/at_hdmac_regs.h
@@ -180,13 +180,23 @@ txd_to_at_desc(struct dma_async_tx_descriptor *txd)
 
 /*--  Channels  --------------------------------------------------------*/
 
+/**
+ * atc_status - information bits stored in channel status flag
+ *
+ * Manipulated with atomic operations.
+ */
+enum atc_status {
+	ATC_IS_ERROR = 0,
+	ATC_IS_CYCLIC = 24,
+};
+
 /**
  * struct at_dma_chan - internal representation of an Atmel HDMAC channel
  * @chan_common: common dmaengine channel object members
  * @device: parent device
  * @ch_regs: memory mapped register base
  * @mask: channel index in a mask
- * @error_status: transmit error status information from irq handler
+ * @status: transmit status information from irq/prep* functions
  *                to tasklet (use atomic operations)
  * @tasklet: bottom half to finish transaction work
  * @lock: serializes enqueue/dequeue operations to descriptors lists
@@ -201,7 +211,7 @@ struct at_dma_chan {
 	struct at_dma		*device;
 	void __iomem		*ch_regs;
 	u8			mask;
-	unsigned long		error_status;
+	unsigned long		status;
 	struct tasklet_struct	tasklet;
 
 	spinlock_t		lock;
-- 
cgit v1.2.3-70-g09d2


From cc52a10a048fc1fbe4ffba58c2f0afc79ae0f56f Mon Sep 17 00:00:00 2001
From: Nicolas Ferre <nicolas.ferre@atmel.com>
Date: Sat, 30 Apr 2011 16:57:47 +0200
Subject: dmaengine: at_hdmac: debug information sg_len for prep_slave_sg

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/at_hdmac.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index ed9d92429de..8f50a0fb79e 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -687,7 +687,8 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 	struct scatterlist	*sg;
 	size_t			total_len = 0;
 
-	dev_vdbg(chan2dev(chan), "prep_slave_sg: %s f0x%lx\n",
+	dev_vdbg(chan2dev(chan), "prep_slave_sg (%d): %s f0x%lx\n",
+			sg_len,
 			direction == DMA_TO_DEVICE ? "TO DEVICE" : "FROM DEVICE",
 			flags);
 
-- 
cgit v1.2.3-70-g09d2


From 2f432823ec6e693d7b934e805ce1838f41d66ce7 Mon Sep 17 00:00:00 2001
From: Nicolas Ferre <nicolas.ferre@atmel.com>
Date: Sat, 30 Apr 2011 16:57:48 +0200
Subject: dmaengine: at_hdmac: remove channel status testing in tasklet

There is no need to test if channel is enabled in tasklet:
- in error path, channel is disabled in interrupt routine
- in normal path, this test is performed in sub functions to report
a misuse of the engine.

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/at_hdmac.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index 8f50a0fb79e..65bd52a84bc 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -472,13 +472,6 @@ static void atc_tasklet(unsigned long data)
 {
 	struct at_dma_chan *atchan = (struct at_dma_chan *)data;
 
-	/* Channel cannot be enabled here */
-	if (atc_chan_is_enabled(atchan)) {
-		dev_err(chan2dev(&atchan->chan_common),
-			"BUG: channel enabled in tasklet\n");
-		return;
-	}
-
 	spin_lock(&atchan->lock);
 	if (test_and_clear_bit(ATC_IS_ERROR, &atchan->status))
 		atc_handle_error(atchan);
-- 
cgit v1.2.3-70-g09d2


From ae14d4b5e0a4ebc4e674831cbb97b73ba66dba08 Mon Sep 17 00:00:00 2001
From: Nicolas Ferre <nicolas.ferre@atmel.com>
Date: Sat, 30 Apr 2011 16:57:49 +0200
Subject: dmaengine: at_hdmac: specialize AHB interfaces to optimize transfers

DMA controller has two AHB interfaces on the SOC internal
matrix.
It is more efficient to specialize each interface as the
access to memory can introduce latencies that are not compatible
with peripheral accesses requirements.

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/at_hdmac.c      | 26 +++++++++++++++-----------
 drivers/dma/at_hdmac_regs.h |  4 ++++
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index 65bd52a84bc..f52c9e38d88 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -37,8 +37,8 @@
 
 #define	ATC_DEFAULT_CFG		(ATC_FIFOCFG_HALFFIFO)
 #define	ATC_DEFAULT_CTRLA	(0)
-#define	ATC_DEFAULT_CTRLB	(ATC_SIF(0)	\
-				|ATC_DIF(1))
+#define	ATC_DEFAULT_CTRLB	(ATC_SIF(AT_DMA_MEM_IF) \
+				|ATC_DIF(AT_DMA_MEM_IF))
 
 /*
  * Initial number of descriptors to allocate for each channel. This could
@@ -693,14 +693,15 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 	reg_width = atslave->reg_width;
 
 	ctrla = ATC_DEFAULT_CTRLA | atslave->ctrla;
-	ctrlb = ATC_DEFAULT_CTRLB | ATC_IEN;
+	ctrlb = ATC_IEN;
 
 	switch (direction) {
 	case DMA_TO_DEVICE:
 		ctrla |=  ATC_DST_WIDTH(reg_width);
 		ctrlb |=  ATC_DST_ADDR_MODE_FIXED
 			| ATC_SRC_ADDR_MODE_INCR
-			| ATC_FC_MEM2PER;
+			| ATC_FC_MEM2PER
+			| ATC_SIF(AT_DMA_MEM_IF) | ATC_DIF(AT_DMA_PER_IF);
 		reg = atslave->tx_reg;
 		for_each_sg(sgl, sg, sg_len, i) {
 			struct at_desc	*desc;
@@ -741,7 +742,8 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 		ctrla |=  ATC_SRC_WIDTH(reg_width);
 		ctrlb |=  ATC_DST_ADDR_MODE_INCR
 			| ATC_SRC_ADDR_MODE_FIXED
-			| ATC_FC_PER2MEM;
+			| ATC_FC_PER2MEM
+			| ATC_SIF(AT_DMA_PER_IF) | ATC_DIF(AT_DMA_MEM_IF);
 
 		reg = atslave->rx_reg;
 		for_each_sg(sgl, sg, sg_len, i) {
@@ -846,20 +848,22 @@ atc_dma_cyclic_fill_desc(struct at_dma_slave *atslave, struct at_desc *desc,
 		desc->lli.saddr = buf_addr + (period_len * period_index);
 		desc->lli.daddr = atslave->tx_reg;
 		desc->lli.ctrla = ctrla;
-		desc->lli.ctrlb = ATC_DEFAULT_CTRLB
-				| ATC_DST_ADDR_MODE_FIXED
+		desc->lli.ctrlb = ATC_DST_ADDR_MODE_FIXED
 				| ATC_SRC_ADDR_MODE_INCR
-				| ATC_FC_MEM2PER;
+				| ATC_FC_MEM2PER
+				| ATC_SIF(AT_DMA_MEM_IF)
+				| ATC_DIF(AT_DMA_PER_IF);
 		break;
 
 	case DMA_FROM_DEVICE:
 		desc->lli.saddr = atslave->rx_reg;
 		desc->lli.daddr = buf_addr + (period_len * period_index);
 		desc->lli.ctrla = ctrla;
-		desc->lli.ctrlb = ATC_DEFAULT_CTRLB
-				| ATC_DST_ADDR_MODE_INCR
+		desc->lli.ctrlb = ATC_DST_ADDR_MODE_INCR
 				| ATC_SRC_ADDR_MODE_FIXED
-				| ATC_FC_PER2MEM;
+				| ATC_FC_PER2MEM
+				| ATC_SIF(AT_DMA_PER_IF)
+				| ATC_DIF(AT_DMA_MEM_IF);
 		break;
 
 	default:
diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h
index c79a9e07f7b..ae3056df4f4 100644
--- a/drivers/dma/at_hdmac_regs.h
+++ b/drivers/dma/at_hdmac_regs.h
@@ -103,6 +103,10 @@
 /* Bitfields in CTRLB */
 #define	ATC_SIF(i)		(0x3 & (i))	/* Src tx done via AHB-Lite Interface i */
 #define	ATC_DIF(i)		((0x3 & (i)) <<  4)	/* Dst tx done via AHB-Lite Interface i */
+				  /* Specify AHB interfaces */
+#define AT_DMA_MEM_IF		0 /* interface 0 as memory interface */
+#define AT_DMA_PER_IF		1 /* interface 1 as peripheral interface */
+
 #define	ATC_SRC_PIP		(0x1 <<  8)	/* Source Picture-in-Picture enabled */
 #define	ATC_DST_PIP		(0x1 << 12)	/* Destination Picture-in-Picture enabled */
 #define	ATC_SRC_DSCR_DIS	(0x1 << 16)	/* Src Descriptor fetch disable */
-- 
cgit v1.2.3-70-g09d2


From 711b9cea92554be6bd44f04f2485582d762fc441 Mon Sep 17 00:00:00 2001
From: Philippe Langlais <philippe.langlais@linaro.org>
Date: Sat, 7 May 2011 17:09:43 +0200
Subject: dmaengine/ste_dma40: fix introduced warnings

The compiler nowadays moans about possibly non-assigned variable.
Fix this by default-assigning 0.

Signed-off-by: Philippe Langlais <philippe.langlais@linaro.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/ste_dma40.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c
index af955de035f..65d2535d12e 100644
--- a/drivers/dma/ste_dma40.c
+++ b/drivers/dma/ste_dma40.c
@@ -1829,7 +1829,7 @@ d40_get_dev_addr(struct d40_chan *chan, enum dma_data_direction direction)
 {
 	struct stedma40_platform_data *plat = chan->base->plat_data;
 	struct stedma40_chan_cfg *cfg = &chan->dma_cfg;
-	dma_addr_t addr;
+	dma_addr_t addr = 0;
 
 	if (chan->runtime_addr)
 		return chan->runtime_addr;
-- 
cgit v1.2.3-70-g09d2


From 543aabc7d295bfe2489f184259395e3467520d48 Mon Sep 17 00:00:00 2001
From: Nicolas Ferre <nicolas.ferre@atmel.com>
Date: Fri, 6 May 2011 19:56:51 +0200
Subject: dmaengine: at_hdmac: set residue as total len in atc_tx_status

If transfer status is !=DMA_SUCCESS, return total transfer len as residue,
instead of zero.

Idea from dw_dmac patch by Viresh Kumar.

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/at_hdmac.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index f52c9e38d88..ba0b5ec4e4c 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -1026,7 +1026,12 @@ atc_tx_status(struct dma_chan *chan,
 
 	spin_unlock_bh(&atchan->lock);
 
-	dma_set_tx_state(txstate, last_complete, last_used, 0);
+	if (ret != DMA_SUCCESS)
+		dma_set_tx_state(txstate, last_complete, last_used,
+			atc_first_active(atchan)->len);
+	else
+		dma_set_tx_state(txstate, last_complete, last_used, 0);
+
 	dev_vdbg(chan2dev(chan), "tx_status: %d (d%d, u%d)\n",
 		 cookie, last_complete ? last_complete : 0,
 		 last_used ? last_used : 0);
-- 
cgit v1.2.3-70-g09d2


From 23b5e3ad68a3c26a6a36039ea907997664aedcab Mon Sep 17 00:00:00 2001
From: Nicolas Ferre <nicolas.ferre@atmel.com>
Date: Fri, 6 May 2011 19:56:52 +0200
Subject: dmaengine: at_hdmac: implement pause and resume in atc_control

Pause and resume controls are useful for audio devices. This also returns
correct status from atc_tx_status() in case chan is paused.

Idea from dw_dmac patch by Linus Walleij.

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/at_hdmac.c      | 97 ++++++++++++++++++++++++++++++++-------------
 drivers/dma/at_hdmac_regs.h |  1 +
 2 files changed, 71 insertions(+), 27 deletions(-)

diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index ba0b5ec4e4c..5968245e1e6 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -508,7 +508,8 @@ static irqreturn_t at_dma_interrupt(int irq, void *dev_id)
 			if (pending & (AT_DMA_BTC(i) | AT_DMA_ERR(i))) {
 				if (pending & AT_DMA_ERR(i)) {
 					/* Disable channel on AHB error */
-					dma_writel(atdma, CHDR, atchan->mask);
+					dma_writel(atdma, CHDR,
+						AT_DMA_RES(i) | atchan->mask);
 					/* Give information to tasklet */
 					set_bit(ATC_IS_ERROR, &atchan->status);
 				}
@@ -952,39 +953,78 @@ static int atc_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
 {
 	struct at_dma_chan	*atchan = to_at_dma_chan(chan);
 	struct at_dma		*atdma = to_at_dma(chan->device);
-	struct at_desc		*desc, *_desc;
+	int			chan_id = atchan->chan_common.chan_id;
+
 	LIST_HEAD(list);
 
-	/* Only supports DMA_TERMINATE_ALL */
-	if (cmd != DMA_TERMINATE_ALL)
-		return -ENXIO;
+	dev_vdbg(chan2dev(chan), "atc_control (%d)\n", cmd);
 
-	/*
-	 * This is only called when something went wrong elsewhere, so
-	 * we don't really care about the data. Just disable the
-	 * channel. We still have to poll the channel enable bit due
-	 * to AHB/HSB limitations.
-	 */
-	spin_lock_bh(&atchan->lock);
+	if (cmd == DMA_PAUSE) {
+		int pause_timeout = 1000;
 
-	dma_writel(atdma, CHDR, atchan->mask);
+		spin_lock_bh(&atchan->lock);
 
-	/* confirm that this channel is disabled */
-	while (dma_readl(atdma, CHSR) & atchan->mask)
-		cpu_relax();
+		dma_writel(atdma, CHER, AT_DMA_SUSP(chan_id));
+
+		/* wait for FIFO to be empty */
+		while (!(dma_readl(atdma, CHSR) & AT_DMA_EMPT(chan_id))) {
+			if (pause_timeout-- > 0) {
+				/* the FIFO can only drain if the peripheral
+				 * is still requesting data:
+				 * -> timeout if it is not the case. */
+				dma_writel(atdma, CHDR, AT_DMA_RES(chan_id));
+				spin_unlock_bh(&atchan->lock);
+				return -ETIMEDOUT;
+			}
+			cpu_relax();
+		}
 
-	/* active_list entries will end up before queued entries */
-	list_splice_init(&atchan->queue, &list);
-	list_splice_init(&atchan->active_list, &list);
+		set_bit(ATC_IS_PAUSED, &atchan->status);
 
-	/* Flush all pending and queued descriptors */
-	list_for_each_entry_safe(desc, _desc, &list, desc_node)
-		atc_chain_complete(atchan, desc);
+		spin_unlock_bh(&atchan->lock);
+	} else if (cmd == DMA_RESUME) {
+		if (!test_bit(ATC_IS_PAUSED, &atchan->status))
+			return 0;
 
-	/* if channel dedicated to cyclic operations, free it */
-	clear_bit(ATC_IS_CYCLIC, &atchan->status);
+		spin_lock_bh(&atchan->lock);
 
-	spin_unlock_bh(&atchan->lock);
+		dma_writel(atdma, CHDR, AT_DMA_RES(chan_id));
+		clear_bit(ATC_IS_PAUSED, &atchan->status);
+
+		spin_unlock_bh(&atchan->lock);
+	} else if (cmd == DMA_TERMINATE_ALL) {
+		struct at_desc	*desc, *_desc;
+		/*
+		 * This is only called when something went wrong elsewhere, so
+		 * we don't really care about the data. Just disable the
+		 * channel. We still have to poll the channel enable bit due
+		 * to AHB/HSB limitations.
+		 */
+		spin_lock_bh(&atchan->lock);
+
+		/* disabling channel: must also remove suspend state */
+		dma_writel(atdma, CHDR, AT_DMA_RES(chan_id) | atchan->mask);
+
+		/* confirm that this channel is disabled */
+		while (dma_readl(atdma, CHSR) & atchan->mask)
+			cpu_relax();
+
+		/* active_list entries will end up before queued entries */
+		list_splice_init(&atchan->queue, &list);
+		list_splice_init(&atchan->active_list, &list);
+
+		/* Flush all pending and queued descriptors */
+		list_for_each_entry_safe(desc, _desc, &list, desc_node)
+			atc_chain_complete(atchan, desc);
+
+		clear_bit(ATC_IS_PAUSED, &atchan->status);
+		/* if channel dedicated to cyclic operations, free it */
+		clear_bit(ATC_IS_CYCLIC, &atchan->status);
+
+		spin_unlock_bh(&atchan->lock);
+	} else {
+		return -ENXIO;
+	}
 
 	return 0;
 }
@@ -1032,8 +1072,11 @@ atc_tx_status(struct dma_chan *chan,
 	else
 		dma_set_tx_state(txstate, last_complete, last_used, 0);
 
-	dev_vdbg(chan2dev(chan), "tx_status: %d (d%d, u%d)\n",
-		 cookie, last_complete ? last_complete : 0,
+	if (test_bit(ATC_IS_PAUSED, &atchan->status))
+		ret = DMA_PAUSED;
+
+	dev_vdbg(chan2dev(chan), "tx_status %d: cookie = %d (d%d, u%d)\n",
+		 ret, cookie, last_complete ? last_complete : 0,
 		 last_used ? last_used : 0);
 
 	return ret;
diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h
index ae3056df4f4..087dbf1dd39 100644
--- a/drivers/dma/at_hdmac_regs.h
+++ b/drivers/dma/at_hdmac_regs.h
@@ -191,6 +191,7 @@ txd_to_at_desc(struct dma_async_tx_descriptor *txd)
  */
 enum atc_status {
 	ATC_IS_ERROR = 0,
+	ATC_IS_PAUSED = 1,
 	ATC_IS_CYCLIC = 24,
 };
 
-- 
cgit v1.2.3-70-g09d2


From e257e1563f28890f54b5f82861373bb4b32dd770 Mon Sep 17 00:00:00 2001
From: Nicolas Ferre <nicolas.ferre@atmel.com>
Date: Fri, 6 May 2011 19:56:53 +0200
Subject: dmaengine: at_hdmac: use descriptor chaining help function

A little function helps to chain descriptors:
it is already used in cyclic dma operations, now use it in memcpy and slave_sg
preparation functions.

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/at_hdmac.c | 33 +++------------------------------
 1 file changed, 3 insertions(+), 30 deletions(-)

diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index 5968245e1e6..2c6bc3aa3dd 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -626,16 +626,7 @@ atc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 
 		desc->txd.cookie = 0;
 
-		if (!first) {
-			first = desc;
-		} else {
-			/* inform the HW lli about chaining */
-			prev->lli.dscr = desc->txd.phys;
-			/* insert the link descriptor to the LD ring */
-			list_add_tail(&desc->desc_node,
-					&first->tx_list);
-		}
-		prev = desc;
+		atc_desc_chain(&first, &prev, desc);
 	}
 
 	/* First descriptor of the chain embedds additional information */
@@ -726,16 +717,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 					| len >> mem_width;
 			desc->lli.ctrlb = ctrlb;
 
-			if (!first) {
-				first = desc;
-			} else {
-				/* inform the HW lli about chaining */
-				prev->lli.dscr = desc->txd.phys;
-				/* insert the link descriptor to the LD ring */
-				list_add_tail(&desc->desc_node,
-						&first->tx_list);
-			}
-			prev = desc;
+			atc_desc_chain(&first, &prev, desc);
 			total_len += len;
 		}
 		break;
@@ -769,16 +751,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 					| len >> reg_width;
 			desc->lli.ctrlb = ctrlb;
 
-			if (!first) {
-				first = desc;
-			} else {
-				/* inform the HW lli about chaining */
-				prev->lli.dscr = desc->txd.phys;
-				/* insert the link descriptor to the LD ring */
-				list_add_tail(&desc->desc_node,
-						&first->tx_list);
-			}
-			prev = desc;
+			atc_desc_chain(&first, &prev, desc);
 			total_len += len;
 		}
 		break;
-- 
cgit v1.2.3-70-g09d2


From c8fcba600c46c5d7667ec230b1d9ce3ce5859f9c Mon Sep 17 00:00:00 2001
From: Tomoya MORINAGA <tomoya-linux@dsn.okisemi.com>
Date: Mon, 9 May 2011 16:09:35 +0900
Subject: pch_dma: fix dma direction issue for ML7213 IOH video-in

Currently, even-channel number is set as tx direction and odd is set as rx.
However, though video-in uses ch6, the direction is not tx but rx.
This patch sets video-in's DMA direction correctly.

Signed-off-by: Tomoya MORINAGA <tomoya-linux@dsn.okisemi.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/pch_dma.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/pch_dma.c b/drivers/dma/pch_dma.c
index 6eebc6205c6..d28c47a42f1 100644
--- a/drivers/dma/pch_dma.c
+++ b/drivers/dma/pch_dma.c
@@ -478,7 +478,6 @@ static int pd_alloc_chan_resources(struct dma_chan *chan)
 	spin_unlock_bh(&pd_chan->lock);
 
 	pdc_enable_irq(chan, 1);
-	pdc_set_dir(chan);
 
 	return pd_chan->descs_allocated;
 }
@@ -561,6 +560,9 @@ static struct dma_async_tx_descriptor *pd_prep_slave_sg(struct dma_chan *chan,
 	else
 		return NULL;
 
+	pd_chan->dir = direction;
+	pdc_set_dir(chan);
+
 	for_each_sg(sgl, sg, sg_len, i) {
 		desc = pdc_desc_get(pd_chan);
 
@@ -850,8 +852,6 @@ static int __devinit pch_dma_probe(struct pci_dev *pdev,
 
 		pd_chan->membase = &regs->desc[i];
 
-		pd_chan->dir = (i % 2) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
-
 		spin_lock_init(&pd_chan->lock);
 
 		INIT_LIST_HEAD(&pd_chan->active_list);
-- 
cgit v1.2.3-70-g09d2


From 08645fdc7bab4564f7dfd07525da8a1761f8f106 Mon Sep 17 00:00:00 2001
From: Tomoya MORINAGA <tomoya-linux@dsn.okisemi.com>
Date: Mon, 9 May 2011 16:09:36 +0900
Subject: pch_dma: modify for checkpatch

Fix checkpatch warnings.

Signed-off-by: Tomoya MORINAGA <tomoya-linux@dsn.okisemi.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/pch_dma.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/pch_dma.c b/drivers/dma/pch_dma.c
index d28c47a42f1..afd6fba08a6 100644
--- a/drivers/dma/pch_dma.c
+++ b/drivers/dma/pch_dma.c
@@ -138,7 +138,8 @@ struct pch_dma {
 #define dma_writel(pd, name, val) \
 	writel((val), (pd)->membase + PCH_DMA_##name)
 
-static inline struct pch_dma_desc *to_pd_desc(struct dma_async_tx_descriptor *txd)
+static inline
+struct pch_dma_desc *to_pd_desc(struct dma_async_tx_descriptor *txd)
 {
 	return container_of(txd, struct pch_dma_desc, txd);
 }
@@ -163,13 +164,15 @@ static inline struct device *chan2parent(struct dma_chan *chan)
 	return chan->dev->device.parent;
 }
 
-static inline struct pch_dma_desc *pdc_first_active(struct pch_dma_chan *pd_chan)
+static inline
+struct pch_dma_desc *pdc_first_active(struct pch_dma_chan *pd_chan)
 {
 	return list_first_entry(&pd_chan->active_list,
 				struct pch_dma_desc, desc_node);
 }
 
-static inline struct pch_dma_desc *pdc_first_queued(struct pch_dma_chan *pd_chan)
+static inline
+struct pch_dma_desc *pdc_first_queued(struct pch_dma_chan *pd_chan)
 {
 	return list_first_entry(&pd_chan->queue,
 				struct pch_dma_desc, desc_node);
-- 
cgit v1.2.3-70-g09d2


From 60092d0bde4c8741198da4a69b693d3709385bf1 Mon Sep 17 00:00:00 2001
From: Tomoya MORINAGA <tomoya-linux@dsn.okisemi.com>
Date: Mon, 9 May 2011 16:09:37 +0900
Subject: pch_dma: Fix DMA setting issue

Currently, Direct-Start mode(*) is enabled.
Our IOH's devices must not use this mode.
This causes unexpected behavior.
This patch deletes Direct-Start setting.
(*) This mode is used in order for CPU to generate the DMA request.

Signed-off-by: Tomoya MORINAGA <tomoya-linux@dsn.okisemi.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/pch_dma.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/dma/pch_dma.c b/drivers/dma/pch_dma.c
index afd6fba08a6..4e466720406 100644
--- a/drivers/dma/pch_dma.c
+++ b/drivers/dma/pch_dma.c
@@ -254,9 +254,6 @@ static bool pdc_is_idle(struct pch_dma_chan *pd_chan)
 
 static void pdc_dostart(struct pch_dma_chan *pd_chan, struct pch_dma_desc* desc)
 {
-	struct pch_dma *pd = to_pd(pd_chan->chan.device);
-	u32 val;
-
 	if (!pdc_is_idle(pd_chan)) {
 		dev_err(chan2dev(&pd_chan->chan),
 			"BUG: Attempt to start non-idle channel\n");
@@ -282,10 +279,6 @@ static void pdc_dostart(struct pch_dma_chan *pd_chan, struct pch_dma_desc* desc)
 		channel_writel(pd_chan, NEXT, desc->txd.phys);
 		pdc_set_mode(&pd_chan->chan, DMA_CTL0_SG);
 	}
-
-	val = dma_readl(pd, CTL2);
-	val |= 1 << (DMA_CTL2_START_SHIFT_BITS + pd_chan->chan.chan_id);
-	dma_writel(pd, CTL2, val);
 }
 
 static void pdc_chain_complete(struct pch_dma_chan *pd_chan,
-- 
cgit v1.2.3-70-g09d2


From 194f5f2706c7472f9c6bb2d17fa788993606581f Mon Sep 17 00:00:00 2001
From: Tomoya MORINAGA <tomoya-linux@dsn.okisemi.com>
Date: Mon, 9 May 2011 16:09:38 +0900
Subject: pch_dma: Support I2S for ML7213 IOH

Support I2S device for ML7213 IOH

Signed-off-by: Tomoya MORINAGA <tomoya-linux@dsn.okisemi.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/pch_dma.c | 62 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 47 insertions(+), 15 deletions(-)

diff --git a/drivers/dma/pch_dma.c b/drivers/dma/pch_dma.c
index 4e466720406..2edcc9c1029 100644
--- a/drivers/dma/pch_dma.c
+++ b/drivers/dma/pch_dma.c
@@ -77,10 +77,10 @@ struct pch_dma_regs {
 	u32	dma_ctl0;
 	u32	dma_ctl1;
 	u32	dma_ctl2;
-	u32	reserved1;
+	u32	dma_ctl3;
 	u32	dma_sts0;
 	u32	dma_sts1;
-	u32	reserved2;
+	u32	dma_sts2;
 	u32	reserved3;
 	struct pch_dma_desc_regs desc[MAX_CHAN_NR];
 };
@@ -130,6 +130,7 @@ struct pch_dma {
 #define PCH_DMA_CTL0	0x00
 #define PCH_DMA_CTL1	0x04
 #define PCH_DMA_CTL2	0x08
+#define PCH_DMA_CTL3	0x0C
 #define PCH_DMA_STS0	0x10
 #define PCH_DMA_STS1	0x14
 
@@ -202,16 +203,30 @@ static void pdc_set_dir(struct dma_chan *chan)
 	struct pch_dma *pd = to_pd(chan->device);
 	u32 val;
 
-	val = dma_readl(pd, CTL0);
+	if (chan->chan_id < 8) {
+		val = dma_readl(pd, CTL0);
 
-	if (pd_chan->dir == DMA_TO_DEVICE)
-		val |= 0x1 << (DMA_CTL0_BITS_PER_CH * chan->chan_id +
-			       DMA_CTL0_DIR_SHIFT_BITS);
-	else
-		val &= ~(0x1 << (DMA_CTL0_BITS_PER_CH * chan->chan_id +
-				 DMA_CTL0_DIR_SHIFT_BITS));
+		if (pd_chan->dir == DMA_TO_DEVICE)
+			val |= 0x1 << (DMA_CTL0_BITS_PER_CH * chan->chan_id +
+				       DMA_CTL0_DIR_SHIFT_BITS);
+		else
+			val &= ~(0x1 << (DMA_CTL0_BITS_PER_CH * chan->chan_id +
+					 DMA_CTL0_DIR_SHIFT_BITS));
+
+		dma_writel(pd, CTL0, val);
+	} else {
+		int ch = chan->chan_id - 8; /* ch8-->0 ch9-->1 ... ch11->3 */
+		val = dma_readl(pd, CTL3);
 
-	dma_writel(pd, CTL0, val);
+		if (pd_chan->dir == DMA_TO_DEVICE)
+			val |= 0x1 << (DMA_CTL0_BITS_PER_CH * ch +
+				       DMA_CTL0_DIR_SHIFT_BITS);
+		else
+			val &= ~(0x1 << (DMA_CTL0_BITS_PER_CH * ch +
+					 DMA_CTL0_DIR_SHIFT_BITS));
+
+		dma_writel(pd, CTL3, val);
+	}
 
 	dev_dbg(chan2dev(chan), "pdc_set_dir: chan %d -> %x\n",
 		chan->chan_id, val);
@@ -222,13 +237,26 @@ static void pdc_set_mode(struct dma_chan *chan, u32 mode)
 	struct pch_dma *pd = to_pd(chan->device);
 	u32 val;
 
-	val = dma_readl(pd, CTL0);
+	if (chan->chan_id < 8) {
+		val = dma_readl(pd, CTL0);
 
-	val &= ~(DMA_CTL0_MODE_MASK_BITS <<
-		(DMA_CTL0_BITS_PER_CH * chan->chan_id));
-	val |= mode << (DMA_CTL0_BITS_PER_CH * chan->chan_id);
+		val &= ~(DMA_CTL0_MODE_MASK_BITS <<
+			(DMA_CTL0_BITS_PER_CH * chan->chan_id));
+		val |= mode << (DMA_CTL0_BITS_PER_CH * chan->chan_id);
 
-	dma_writel(pd, CTL0, val);
+		dma_writel(pd, CTL0, val);
+	} else {
+		int ch = chan->chan_id - 8; /* ch8-->0 ch9-->1 ... ch11->3 */
+
+		val = dma_readl(pd, CTL3);
+
+		val &= ~(DMA_CTL0_MODE_MASK_BITS <<
+			(DMA_CTL0_BITS_PER_CH * ch));
+		val |= mode << (DMA_CTL0_BITS_PER_CH * ch);
+
+		dma_writel(pd, CTL3, val);
+
+	}
 
 	dev_dbg(chan2dev(chan), "pdc_set_mode: chan %d -> %x\n",
 		chan->chan_id, val);
@@ -701,6 +729,7 @@ static void pch_dma_save_regs(struct pch_dma *pd)
 	pd->regs.dma_ctl0 = dma_readl(pd, CTL0);
 	pd->regs.dma_ctl1 = dma_readl(pd, CTL1);
 	pd->regs.dma_ctl2 = dma_readl(pd, CTL2);
+	pd->regs.dma_ctl3 = dma_readl(pd, CTL3);
 
 	list_for_each_entry_safe(chan, _c, &pd->dma.channels, device_node) {
 		pd_chan = to_pd_chan(chan);
@@ -723,6 +752,7 @@ static void pch_dma_restore_regs(struct pch_dma *pd)
 	dma_writel(pd, CTL0, pd->regs.dma_ctl0);
 	dma_writel(pd, CTL1, pd->regs.dma_ctl1);
 	dma_writel(pd, CTL2, pd->regs.dma_ctl2);
+	dma_writel(pd, CTL3, pd->regs.dma_ctl3);
 
 	list_for_each_entry_safe(chan, _c, &pd->dma.channels, device_node) {
 		pd_chan = to_pd_chan(chan);
@@ -925,6 +955,7 @@ static void __devexit pch_dma_remove(struct pci_dev *pdev)
 #define PCI_DEVICE_ID_ML7213_DMA1_8CH	0x8026
 #define PCI_DEVICE_ID_ML7213_DMA2_8CH	0x802B
 #define PCI_DEVICE_ID_ML7213_DMA3_4CH	0x8034
+#define PCI_DEVICE_ID_ML7213_DMA4_12CH	0x8032
 
 static const struct pci_device_id pch_dma_id_table[] = {
 	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_EG20T_PCH_DMA_8CH), 8 },
@@ -932,6 +963,7 @@ static const struct pci_device_id pch_dma_id_table[] = {
 	{ PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7213_DMA1_8CH), 8}, /* UART Video */
 	{ PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7213_DMA2_8CH), 8}, /* PCMIF SPI */
 	{ PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7213_DMA3_4CH), 4}, /* FPGA */
+	{ PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7213_DMA4_12CH), 12}, /* I2S */
 	{ 0, },
 };
 
-- 
cgit v1.2.3-70-g09d2


From c0dfc04ac96847913a791f5459f4ac83a81a4745 Mon Sep 17 00:00:00 2001
From: Tomoya MORINAGA <tomoya-linux@dsn.okisemi.com>
Date: Mon, 9 May 2011 16:09:39 +0900
Subject: pch_dma: Support new device ML7223 IOH

Support new device OKI SEMICONDUCTOR ML7223 IOH(Input/Output Hub).
The ML7223 IOH is for MP(Media Phone) use.
The ML7223 is companion chip for Intel Atom E6xx series.
The ML7223 is completely compatible for Intel EG20T PCH.

Signed-off-by: Tomoya MORINAGA <tomoya-linux@dsn.okisemi.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/Kconfig   | 12 +++++++-----
 drivers/dma/pch_dma.c |  8 ++++++++
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index a572600e44e..25cf327cd1c 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -200,16 +200,18 @@ config PL330_DMA
 	  platform_data for a dma-pl330 device.
 
 config PCH_DMA
-	tristate "Intel EG20T PCH / OKI SEMICONDUCTOR ML7213 IOH DMA support"
+	tristate "Intel EG20T PCH / OKI Semi IOH(ML7213/ML7223) DMA support"
 	depends on PCI && X86
 	select DMA_ENGINE
 	help
 	  Enable support for Intel EG20T PCH DMA engine.
 
-	  This driver also can be used for OKI SEMICONDUCTOR ML7213 IOH(Input/
-	  Output Hub) which is for IVI(In-Vehicle Infotainment) use.
-	  ML7213 is companion chip for Intel Atom E6xx series.
-	  ML7213 is completely compatible for Intel EG20T PCH.
+	  This driver also can be used for OKI SEMICONDUCTOR IOH(Input/
+	  Output Hub), ML7213 and ML7223.
+	  ML7213 IOH is for IVI(In-Vehicle Infotainment) use and ML7223 IOH is
+	  for MP(Media Phone) use.
+	  ML7213/ML7223 is companion chip for Intel Atom E6xx series.
+	  ML7213/ML7223 is completely compatible for Intel EG20T PCH.
 
 config IMX_SDMA
 	tristate "i.MX SDMA support"
diff --git a/drivers/dma/pch_dma.c b/drivers/dma/pch_dma.c
index 2edcc9c1029..083e698da53 100644
--- a/drivers/dma/pch_dma.c
+++ b/drivers/dma/pch_dma.c
@@ -956,6 +956,10 @@ static void __devexit pch_dma_remove(struct pci_dev *pdev)
 #define PCI_DEVICE_ID_ML7213_DMA2_8CH	0x802B
 #define PCI_DEVICE_ID_ML7213_DMA3_4CH	0x8034
 #define PCI_DEVICE_ID_ML7213_DMA4_12CH	0x8032
+#define PCI_DEVICE_ID_ML7223_DMA1_4CH	0x800B
+#define PCI_DEVICE_ID_ML7223_DMA2_4CH	0x800E
+#define PCI_DEVICE_ID_ML7223_DMA3_4CH	0x8017
+#define PCI_DEVICE_ID_ML7223_DMA4_4CH	0x803B
 
 static const struct pci_device_id pch_dma_id_table[] = {
 	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_EG20T_PCH_DMA_8CH), 8 },
@@ -964,6 +968,10 @@ static const struct pci_device_id pch_dma_id_table[] = {
 	{ PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7213_DMA2_8CH), 8}, /* PCMIF SPI */
 	{ PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7213_DMA3_4CH), 4}, /* FPGA */
 	{ PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7213_DMA4_12CH), 12}, /* I2S */
+	{ PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7223_DMA1_4CH), 4}, /* UART */
+	{ PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7223_DMA2_4CH), 4}, /* Video SPI */
+	{ PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7223_DMA3_4CH), 4}, /* Security */
+	{ PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7223_DMA4_4CH), 4}, /* FPGA */
 	{ 0, },
 };
 
-- 
cgit v1.2.3-70-g09d2


From eb8590b504caacb029dea4540e0b0dcc98da4381 Mon Sep 17 00:00:00 2001
From: Tomoya MORINAGA <tomoya-linux@dsn.okisemi.com>
Date: Mon, 9 May 2011 16:09:40 +0900
Subject: pch_dma: modify pci device table definition

Signed-off-by: Tomoya MORINAGA <tomoya-linux@dsn.okisemi.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/pch_dma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma/pch_dma.c b/drivers/dma/pch_dma.c
index 083e698da53..ff5b38f9d45 100644
--- a/drivers/dma/pch_dma.c
+++ b/drivers/dma/pch_dma.c
@@ -961,7 +961,7 @@ static void __devexit pch_dma_remove(struct pci_dev *pdev)
 #define PCI_DEVICE_ID_ML7223_DMA3_4CH	0x8017
 #define PCI_DEVICE_ID_ML7223_DMA4_4CH	0x803B
 
-static const struct pci_device_id pch_dma_id_table[] = {
+DEFINE_PCI_DEVICE_TABLE(pch_dma_id_table) = {
 	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_EG20T_PCH_DMA_8CH), 8 },
 	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_EG20T_PCH_DMA_4CH), 4 },
 	{ PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7213_DMA1_8CH), 8}, /* UART Video */
-- 
cgit v1.2.3-70-g09d2


From 0f6896f195df014064014493c8287a8e2d18f938 Mon Sep 17 00:00:00 2001
From: Bob Moore <robert.moore@intel.com>
Date: Wed, 13 Apr 2011 11:33:17 +0800
Subject: ACPICA: Split all internal Global Lock functions to new file -
 evglock

These functions were moved from evmisc.c

Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/acpica/Makefile   |   2 +-
 drivers/acpi/acpica/acevents.h |  17 ++-
 drivers/acpi/acpica/evglock.c  | 335 +++++++++++++++++++++++++++++++++++++++++
 drivers/acpi/acpica/evmisc.c   | 309 -------------------------------------
 4 files changed, 347 insertions(+), 316 deletions(-)
 create mode 100644 drivers/acpi/acpica/evglock.c

diff --git a/drivers/acpi/acpica/Makefile b/drivers/acpi/acpica/Makefile
index a1224712fd0..301bd2d388a 100644
--- a/drivers/acpi/acpica/Makefile
+++ b/drivers/acpi/acpica/Makefile
@@ -14,7 +14,7 @@ acpi-y := dsfield.o   dsmthdat.o  dsopcode.o  dswexec.o  dswscope.o \
 
 acpi-y += evevent.o  evregion.o  evsci.o    evxfevnt.o \
 	 evmisc.o   evrgnini.o  evxface.o  evxfregn.o \
-	 evgpe.o    evgpeblk.o evgpeinit.o  evgpeutil.o evxfgpe.o
+	 evgpe.o    evgpeblk.o evgpeinit.o  evgpeutil.o evxfgpe.o evglock.o
 
 acpi-y += exconfig.o  exfield.o  exnames.o   exoparg6.o  exresolv.o  exstorob.o\
 	 exconvrt.o  exfldio.o  exoparg1.o  exprep.o    exresop.o   exsystem.o\
diff --git a/drivers/acpi/acpica/acevents.h b/drivers/acpi/acpica/acevents.h
index 41d247daf46..bea3b489918 100644
--- a/drivers/acpi/acpica/acevents.h
+++ b/drivers/acpi/acpica/acevents.h
@@ -58,18 +58,23 @@ u32 acpi_ev_fixed_event_detect(void);
  */
 u8 acpi_ev_is_notify_object(struct acpi_namespace_node *node);
 
-acpi_status acpi_ev_acquire_global_lock(u16 timeout);
-
-acpi_status acpi_ev_release_global_lock(void);
-
-acpi_status acpi_ev_init_global_lock_handler(void);
-
 u32 acpi_ev_get_gpe_number_index(u32 gpe_number);
 
 acpi_status
 acpi_ev_queue_notify_request(struct acpi_namespace_node *node,
 			     u32 notify_value);
 
+/*
+ * evglock - Global Lock support
+ */
+acpi_status acpi_ev_init_global_lock_handler(void);
+
+acpi_status acpi_ev_acquire_global_lock(u16 timeout);
+
+acpi_status acpi_ev_release_global_lock(void);
+
+acpi_status acpi_ev_remove_global_lock_handler(void);
+
 /*
  * evgpe - Low-level GPE support
  */
diff --git a/drivers/acpi/acpica/evglock.c b/drivers/acpi/acpica/evglock.c
new file mode 100644
index 00000000000..56a562a1e5d
--- /dev/null
+++ b/drivers/acpi/acpica/evglock.c
@@ -0,0 +1,335 @@
+/******************************************************************************
+ *
+ * Module Name: evglock - Global Lock support
+ *
+ *****************************************************************************/
+
+/*
+ * Copyright (C) 2000 - 2011, Intel Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *    substantially similar to the "NO WARRANTY" disclaimer below
+ *    ("Disclaimer") and any redistribution must be conditioned upon
+ *    including a substantially similar Disclaimer requirement for further
+ *    binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *    of any contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ */
+
+#include <acpi/acpi.h>
+#include "accommon.h"
+#include "acevents.h"
+#include "acinterp.h"
+
+#define _COMPONENT          ACPI_EVENTS
+ACPI_MODULE_NAME("evglock")
+
+/* Local prototypes */
+static u32 acpi_ev_global_lock_handler(void *context);
+
+/*******************************************************************************
+ *
+ * FUNCTION:    acpi_ev_init_global_lock_handler
+ *
+ * PARAMETERS:  None
+ *
+ * RETURN:      Status
+ *
+ * DESCRIPTION: Install a handler for the global lock release event
+ *
+ ******************************************************************************/
+
+acpi_status acpi_ev_init_global_lock_handler(void)
+{
+	acpi_status status;
+
+	ACPI_FUNCTION_TRACE(ev_init_global_lock_handler);
+
+	/* Attempt installation of the global lock handler */
+
+	status = acpi_install_fixed_event_handler(ACPI_EVENT_GLOBAL,
+						  acpi_ev_global_lock_handler,
+						  NULL);
+
+	/*
+	 * If the global lock does not exist on this platform, the attempt to
+	 * enable GBL_STATUS will fail (the GBL_ENABLE bit will not stick).
+	 * Map to AE_OK, but mark global lock as not present. Any attempt to
+	 * actually use the global lock will be flagged with an error.
+	 */
+	acpi_gbl_global_lock_present = FALSE;
+	if (status == AE_NO_HARDWARE_RESPONSE) {
+		ACPI_ERROR((AE_INFO,
+			    "No response from Global Lock hardware, disabling lock"));
+
+		return_ACPI_STATUS(AE_OK);
+	}
+
+	status = acpi_os_create_lock(&acpi_gbl_global_lock_pending_lock);
+	if (ACPI_FAILURE(status)) {
+		return_ACPI_STATUS(status);
+	}
+
+	acpi_gbl_global_lock_pending = FALSE;
+	acpi_gbl_global_lock_present = TRUE;
+	return_ACPI_STATUS(status);
+}
+
+/*******************************************************************************
+ *
+ * FUNCTION:    acpi_ev_remove_global_lock_handler
+ *
+ * PARAMETERS:  None
+ *
+ * RETURN:      Status
+ *
+ * DESCRIPTION: Remove the handler for the Global Lock
+ *
+ ******************************************************************************/
+
+acpi_status acpi_ev_remove_global_lock_handler(void)
+{
+	acpi_status status;
+
+	ACPI_FUNCTION_TRACE(ev_remove_global_lock_handler);
+
+	acpi_gbl_global_lock_present = FALSE;
+	status = acpi_remove_fixed_event_handler(ACPI_EVENT_GLOBAL,
+						 acpi_ev_global_lock_handler);
+
+	return_ACPI_STATUS(status);
+}
+
+/*******************************************************************************
+ *
+ * FUNCTION:    acpi_ev_global_lock_handler
+ *
+ * PARAMETERS:  Context         - From thread interface, not used
+ *
+ * RETURN:      ACPI_INTERRUPT_HANDLED
+ *
+ * DESCRIPTION: Invoked directly from the SCI handler when a global lock
+ *              release interrupt occurs. If there is actually a pending
+ *              request for the lock, signal the waiting thread.
+ *
+ ******************************************************************************/
+
+static u32 acpi_ev_global_lock_handler(void *context)
+{
+	acpi_status status;
+	acpi_cpu_flags flags;
+
+	flags = acpi_os_acquire_lock(acpi_gbl_global_lock_pending_lock);
+
+	/*
+	 * If a request for the global lock is not actually pending,
+	 * we are done. This handles "spurious" global lock interrupts
+	 * which are possible (and have been seen) with bad BIOSs.
+	 */
+	if (!acpi_gbl_global_lock_pending) {
+		goto cleanup_and_exit;
+	}
+
+	/*
+	 * Send a unit to the global lock semaphore. The actual acquisition
+	 * of the global lock will be performed by the waiting thread.
+	 */
+	status = acpi_os_signal_semaphore(acpi_gbl_global_lock_semaphore, 1);
+	if (ACPI_FAILURE(status)) {
+		ACPI_ERROR((AE_INFO, "Could not signal Global Lock semaphore"));
+	}
+
+	acpi_gbl_global_lock_pending = FALSE;
+
+      cleanup_and_exit:
+
+	acpi_os_release_lock(acpi_gbl_global_lock_pending_lock, flags);
+	return (ACPI_INTERRUPT_HANDLED);
+}
+
+/******************************************************************************
+ *
+ * FUNCTION:    acpi_ev_acquire_global_lock
+ *
+ * PARAMETERS:  Timeout         - Max time to wait for the lock, in millisec.
+ *
+ * RETURN:      Status
+ *
+ * DESCRIPTION: Attempt to gain ownership of the Global Lock.
+ *
+ * MUTEX:       Interpreter must be locked
+ *
+ * Note: The original implementation allowed multiple threads to "acquire" the
+ * Global Lock, and the OS would hold the lock until the last thread had
+ * released it. However, this could potentially starve the BIOS out of the
+ * lock, especially in the case where there is a tight handshake between the
+ * Embedded Controller driver and the BIOS. Therefore, this implementation
+ * allows only one thread to acquire the HW Global Lock at a time, and makes
+ * the global lock appear as a standard mutex on the OS side.
+ *
+ *****************************************************************************/
+
+acpi_status acpi_ev_acquire_global_lock(u16 timeout)
+{
+	acpi_cpu_flags flags;
+	acpi_status status;
+	u8 acquired = FALSE;
+
+	ACPI_FUNCTION_TRACE(ev_acquire_global_lock);
+
+	/*
+	 * Only one thread can acquire the GL at a time, the global_lock_mutex
+	 * enforces this. This interface releases the interpreter if we must wait.
+	 */
+	status =
+	    acpi_ex_system_wait_mutex(acpi_gbl_global_lock_mutex->mutex.
+				      os_mutex, timeout);
+	if (ACPI_FAILURE(status)) {
+		return_ACPI_STATUS(status);
+	}
+
+	/*
+	 * Update the global lock handle and check for wraparound. The handle is
+	 * only used for the external global lock interfaces, but it is updated
+	 * here to properly handle the case where a single thread may acquire the
+	 * lock via both the AML and the acpi_acquire_global_lock interfaces. The
+	 * handle is therefore updated on the first acquire from a given thread
+	 * regardless of where the acquisition request originated.
+	 */
+	acpi_gbl_global_lock_handle++;
+	if (acpi_gbl_global_lock_handle == 0) {
+		acpi_gbl_global_lock_handle = 1;
+	}
+
+	/*
+	 * Make sure that a global lock actually exists. If not, just
+	 * treat the lock as a standard mutex.
+	 */
+	if (!acpi_gbl_global_lock_present) {
+		acpi_gbl_global_lock_acquired = TRUE;
+		return_ACPI_STATUS(AE_OK);
+	}
+
+	flags = acpi_os_acquire_lock(acpi_gbl_global_lock_pending_lock);
+
+	do {
+
+		/* Attempt to acquire the actual hardware lock */
+
+		ACPI_ACQUIRE_GLOBAL_LOCK(acpi_gbl_FACS, acquired);
+		if (acquired) {
+			acpi_gbl_global_lock_acquired = TRUE;
+			ACPI_DEBUG_PRINT((ACPI_DB_EXEC,
+					  "Acquired hardware Global Lock\n"));
+			break;
+		}
+
+		/*
+		 * Did not get the lock. The pending bit was set above, and
+		 * we must now wait until we receive the global lock
+		 * released interrupt.
+		 */
+		acpi_gbl_global_lock_pending = TRUE;
+		acpi_os_release_lock(acpi_gbl_global_lock_pending_lock, flags);
+
+		ACPI_DEBUG_PRINT((ACPI_DB_EXEC,
+				  "Waiting for hardware Global Lock\n"));
+
+		/*
+		 * Wait for handshake with the global lock interrupt handler.
+		 * This interface releases the interpreter if we must wait.
+		 */
+		status =
+		    acpi_ex_system_wait_semaphore
+		    (acpi_gbl_global_lock_semaphore, ACPI_WAIT_FOREVER);
+
+		flags = acpi_os_acquire_lock(acpi_gbl_global_lock_pending_lock);
+
+	} while (ACPI_SUCCESS(status));
+
+	acpi_gbl_global_lock_pending = FALSE;
+	acpi_os_release_lock(acpi_gbl_global_lock_pending_lock, flags);
+
+	return_ACPI_STATUS(status);
+}
+
+/*******************************************************************************
+ *
+ * FUNCTION:    acpi_ev_release_global_lock
+ *
+ * PARAMETERS:  None
+ *
+ * RETURN:      Status
+ *
+ * DESCRIPTION: Releases ownership of the Global Lock.
+ *
+ ******************************************************************************/
+
+acpi_status acpi_ev_release_global_lock(void)
+{
+	u8 pending = FALSE;
+	acpi_status status = AE_OK;
+
+	ACPI_FUNCTION_TRACE(ev_release_global_lock);
+
+	/* Lock must be already acquired */
+
+	if (!acpi_gbl_global_lock_acquired) {
+		ACPI_WARNING((AE_INFO,
+			      "Cannot release the ACPI Global Lock, it has not been acquired"));
+		return_ACPI_STATUS(AE_NOT_ACQUIRED);
+	}
+
+	if (acpi_gbl_global_lock_present) {
+
+		/* Allow any thread to release the lock */
+
+		ACPI_RELEASE_GLOBAL_LOCK(acpi_gbl_FACS, pending);
+
+		/*
+		 * If the pending bit was set, we must write GBL_RLS to the control
+		 * register
+		 */
+		if (pending) {
+			status =
+			    acpi_write_bit_register
+			    (ACPI_BITREG_GLOBAL_LOCK_RELEASE,
+			     ACPI_ENABLE_EVENT);
+		}
+
+		ACPI_DEBUG_PRINT((ACPI_DB_EXEC,
+				  "Released hardware Global Lock\n"));
+	}
+
+	acpi_gbl_global_lock_acquired = FALSE;
+
+	/* Release the local GL mutex */
+
+	acpi_os_release_mutex(acpi_gbl_global_lock_mutex->mutex.os_mutex);
+	return_ACPI_STATUS(status);
+}
diff --git a/drivers/acpi/acpica/evmisc.c b/drivers/acpi/acpica/evmisc.c
index 69a3b4aa862..d0b33184442 100644
--- a/drivers/acpi/acpica/evmisc.c
+++ b/drivers/acpi/acpica/evmisc.c
@@ -45,7 +45,6 @@
 #include "accommon.h"
 #include "acevents.h"
 #include "acnamesp.h"
-#include "acinterp.h"
 
 #define _COMPONENT          ACPI_EVENTS
 ACPI_MODULE_NAME("evmisc")
@@ -53,10 +52,6 @@ ACPI_MODULE_NAME("evmisc")
 /* Local prototypes */
 static void ACPI_SYSTEM_XFACE acpi_ev_notify_dispatch(void *context);
 
-static u32 acpi_ev_global_lock_handler(void *context);
-
-static acpi_status acpi_ev_remove_global_lock_handler(void);
-
 /*******************************************************************************
  *
  * FUNCTION:    acpi_ev_is_notify_object
@@ -275,310 +270,6 @@ static void ACPI_SYSTEM_XFACE acpi_ev_notify_dispatch(void *context)
 	acpi_ut_delete_generic_state(notify_info);
 }
 
-/*******************************************************************************
- *
- * FUNCTION:    acpi_ev_global_lock_handler
- *
- * PARAMETERS:  Context         - From thread interface, not used
- *
- * RETURN:      ACPI_INTERRUPT_HANDLED
- *
- * DESCRIPTION: Invoked directly from the SCI handler when a global lock
- *              release interrupt occurs. If there is actually a pending
- *              request for the lock, signal the waiting thread.
- *
- ******************************************************************************/
-
-static u32 acpi_ev_global_lock_handler(void *context)
-{
-	acpi_status status;
-	acpi_cpu_flags flags;
-
-	flags = acpi_os_acquire_lock(acpi_gbl_global_lock_pending_lock);
-
-	/*
-	 * If a request for the global lock is not actually pending,
-	 * we are done. This handles "spurious" global lock interrupts
-	 * which are possible (and have been seen) with bad BIOSs.
-	 */
-	if (!acpi_gbl_global_lock_pending) {
-		goto cleanup_and_exit;
-	}
-
-	/*
-	 * Send a unit to the global lock semaphore. The actual acquisition
-	 * of the global lock will be performed by the waiting thread.
-	 */
-	status = acpi_os_signal_semaphore(acpi_gbl_global_lock_semaphore, 1);
-	if (ACPI_FAILURE(status)) {
-		ACPI_ERROR((AE_INFO, "Could not signal Global Lock semaphore"));
-	}
-
-	acpi_gbl_global_lock_pending = FALSE;
-
-cleanup_and_exit:
-
-	acpi_os_release_lock(acpi_gbl_global_lock_pending_lock, flags);
-	return (ACPI_INTERRUPT_HANDLED);
-}
-
-/*******************************************************************************
- *
- * FUNCTION:    acpi_ev_init_global_lock_handler
- *
- * PARAMETERS:  None
- *
- * RETURN:      Status
- *
- * DESCRIPTION: Install a handler for the global lock release event
- *
- ******************************************************************************/
-
-acpi_status acpi_ev_init_global_lock_handler(void)
-{
-	acpi_status status;
-
-	ACPI_FUNCTION_TRACE(ev_init_global_lock_handler);
-
-	/* Attempt installation of the global lock handler */
-
-	status = acpi_install_fixed_event_handler(ACPI_EVENT_GLOBAL,
-						  acpi_ev_global_lock_handler,
-						  NULL);
-
-	/*
-	 * If the global lock does not exist on this platform, the attempt to
-	 * enable GBL_STATUS will fail (the GBL_ENABLE bit will not stick).
-	 * Map to AE_OK, but mark global lock as not present. Any attempt to
-	 * actually use the global lock will be flagged with an error.
-	 */
-	acpi_gbl_global_lock_present = FALSE;
-	if (status == AE_NO_HARDWARE_RESPONSE) {
-		ACPI_ERROR((AE_INFO,
-			    "No response from Global Lock hardware, disabling lock"));
-
-		return_ACPI_STATUS(AE_OK);
-	}
-
-	status = acpi_os_create_lock(&acpi_gbl_global_lock_pending_lock);
-	if (ACPI_FAILURE(status)) {
-		return_ACPI_STATUS(status);
-	}
-
-	acpi_gbl_global_lock_pending = FALSE;
-	acpi_gbl_global_lock_present = TRUE;
-	return_ACPI_STATUS(status);
-}
-
-/*******************************************************************************
- *
- * FUNCTION:    acpi_ev_remove_global_lock_handler
- *
- * PARAMETERS:  None
- *
- * RETURN:      Status
- *
- * DESCRIPTION: Remove the handler for the Global Lock
- *
- ******************************************************************************/
-
-static acpi_status acpi_ev_remove_global_lock_handler(void)
-{
-	acpi_status status;
-
-	ACPI_FUNCTION_TRACE(ev_remove_global_lock_handler);
-
-	acpi_gbl_global_lock_present = FALSE;
-	status = acpi_remove_fixed_event_handler(ACPI_EVENT_GLOBAL,
-						 acpi_ev_global_lock_handler);
-
-	return_ACPI_STATUS(status);
-}
-
-/******************************************************************************
- *
- * FUNCTION:    acpi_ev_acquire_global_lock
- *
- * PARAMETERS:  Timeout         - Max time to wait for the lock, in millisec.
- *
- * RETURN:      Status
- *
- * DESCRIPTION: Attempt to gain ownership of the Global Lock.
- *
- * MUTEX:       Interpreter must be locked
- *
- * Note: The original implementation allowed multiple threads to "acquire" the
- * Global Lock, and the OS would hold the lock until the last thread had
- * released it. However, this could potentially starve the BIOS out of the
- * lock, especially in the case where there is a tight handshake between the
- * Embedded Controller driver and the BIOS. Therefore, this implementation
- * allows only one thread to acquire the HW Global Lock at a time, and makes
- * the global lock appear as a standard mutex on the OS side.
- *
- *****************************************************************************/
-static acpi_thread_id acpi_ev_global_lock_thread_id;
-static int acpi_ev_global_lock_acquired;
-
-acpi_status acpi_ev_acquire_global_lock(u16 timeout)
-{
-	acpi_cpu_flags flags;
-	acpi_status status;
-	u8 acquired = FALSE;
-
-	ACPI_FUNCTION_TRACE(ev_acquire_global_lock);
-
-	/*
-	 * Only one thread can acquire the GL at a time, the global_lock_mutex
-	 * enforces this. This interface releases the interpreter if we must wait.
-	 */
-	status = acpi_ex_system_wait_mutex(
-			acpi_gbl_global_lock_mutex->mutex.os_mutex, 0);
-	if (status == AE_TIME) {
-		if (acpi_ev_global_lock_thread_id == acpi_os_get_thread_id()) {
-			acpi_ev_global_lock_acquired++;
-			return AE_OK;
-		}
-	}
-
-	if (ACPI_FAILURE(status)) {
-		status = acpi_ex_system_wait_mutex(
-				acpi_gbl_global_lock_mutex->mutex.os_mutex,
-				timeout);
-	}
-	if (ACPI_FAILURE(status)) {
-		return_ACPI_STATUS(status);
-	}
-
-	acpi_ev_global_lock_thread_id = acpi_os_get_thread_id();
-	acpi_ev_global_lock_acquired++;
-
-	/*
-	 * Update the global lock handle and check for wraparound. The handle is
-	 * only used for the external global lock interfaces, but it is updated
-	 * here to properly handle the case where a single thread may acquire the
-	 * lock via both the AML and the acpi_acquire_global_lock interfaces. The
-	 * handle is therefore updated on the first acquire from a given thread
-	 * regardless of where the acquisition request originated.
-	 */
-	acpi_gbl_global_lock_handle++;
-	if (acpi_gbl_global_lock_handle == 0) {
-		acpi_gbl_global_lock_handle = 1;
-	}
-
-	/*
-	 * Make sure that a global lock actually exists. If not, just
-	 * treat the lock as a standard mutex.
-	 */
-	if (!acpi_gbl_global_lock_present) {
-		acpi_gbl_global_lock_acquired = TRUE;
-		return_ACPI_STATUS(AE_OK);
-	}
-
-	flags = acpi_os_acquire_lock(acpi_gbl_global_lock_pending_lock);
-
-	do {
-
-		/* Attempt to acquire the actual hardware lock */
-
-		ACPI_ACQUIRE_GLOBAL_LOCK(acpi_gbl_FACS, acquired);
-		if (acquired) {
-			acpi_gbl_global_lock_acquired = TRUE;
-			ACPI_DEBUG_PRINT((ACPI_DB_EXEC,
-					  "Acquired hardware Global Lock\n"));
-			break;
-		}
-
-		/*
-		 * Did not get the lock. The pending bit was set above, and
-		 * we must now wait until we receive the global lock
-		 * released interrupt.
-		 */
-		acpi_gbl_global_lock_pending = TRUE;
-		acpi_os_release_lock(acpi_gbl_global_lock_pending_lock, flags);
-
-		ACPI_DEBUG_PRINT((ACPI_DB_EXEC,
-				  "Waiting for hardware Global Lock\n"));
-
-		/*
-		 * Wait for handshake with the global lock interrupt handler.
-		 * This interface releases the interpreter if we must wait.
-		 */
-		status =
-		    acpi_ex_system_wait_semaphore
-		    (acpi_gbl_global_lock_semaphore, ACPI_WAIT_FOREVER);
-
-		flags = acpi_os_acquire_lock(acpi_gbl_global_lock_pending_lock);
-
-	} while (ACPI_SUCCESS(status));
-
-	acpi_gbl_global_lock_pending = FALSE;
-	acpi_os_release_lock(acpi_gbl_global_lock_pending_lock, flags);
-
-	return_ACPI_STATUS(status);
-}
-
-/*******************************************************************************
- *
- * FUNCTION:    acpi_ev_release_global_lock
- *
- * PARAMETERS:  None
- *
- * RETURN:      Status
- *
- * DESCRIPTION: Releases ownership of the Global Lock.
- *
- ******************************************************************************/
-
-acpi_status acpi_ev_release_global_lock(void)
-{
-	u8 pending = FALSE;
-	acpi_status status = AE_OK;
-
-	ACPI_FUNCTION_TRACE(ev_release_global_lock);
-
-	/* Lock must be already acquired */
-
-	if (!acpi_gbl_global_lock_acquired) {
-		ACPI_WARNING((AE_INFO,
-			      "Cannot release the ACPI Global Lock, it has not been acquired"));
-		return_ACPI_STATUS(AE_NOT_ACQUIRED);
-	}
-
-	acpi_ev_global_lock_acquired--;
-	if (acpi_ev_global_lock_acquired > 0) {
-		return AE_OK;
-	}
-
-	if (acpi_gbl_global_lock_present) {
-
-		/* Allow any thread to release the lock */
-
-		ACPI_RELEASE_GLOBAL_LOCK(acpi_gbl_FACS, pending);
-
-		/*
-		 * If the pending bit was set, we must write GBL_RLS to the control
-		 * register
-		 */
-		if (pending) {
-			status =
-			    acpi_write_bit_register
-			    (ACPI_BITREG_GLOBAL_LOCK_RELEASE,
-			     ACPI_ENABLE_EVENT);
-		}
-
-		ACPI_DEBUG_PRINT((ACPI_DB_EXEC,
-				  "Released hardware Global Lock\n"));
-	}
-
-	acpi_gbl_global_lock_acquired = FALSE;
-
-	/* Release the local GL mutex */
-	acpi_ev_global_lock_thread_id = 0;
-	acpi_ev_global_lock_acquired = 0;
-	acpi_os_release_mutex(acpi_gbl_global_lock_mutex->mutex.os_mutex);
-	return_ACPI_STATUS(status);
-}
-
 /******************************************************************************
  *
  * FUNCTION:    acpi_ev_terminate
-- 
cgit v1.2.3-70-g09d2


From 945488b9c5fc3f85e3f6a3580235b5c73febc8a6 Mon Sep 17 00:00:00 2001
From: Bob Moore <robert.moore@intel.com>
Date: Wed, 13 Apr 2011 13:15:58 +0800
Subject: ACPICA: Add more methods eligible for NULL package element removal

This change adds another group of predefined names to the list
of names eligible to have NULL package elements dynamically
removed. This group is the names that return a single
variable-length package containing simple data types such
as integers, buffers, strings. This includes: _ALx,_BCL,_CID,
_DOD,_EDL,_FIX,_PCL,_PLD,_PMD,_PRx,_PSL,_Sx, and _TZD.

http://www.acpica.org/bugzilla/show_bug.cgi?id=914

Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/acpica/nsrepair.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/acpi/acpica/nsrepair.c b/drivers/acpi/acpica/nsrepair.c
index 1d76ac85b5e..ac7b854b0bd 100644
--- a/drivers/acpi/acpica/nsrepair.c
+++ b/drivers/acpi/acpica/nsrepair.c
@@ -74,7 +74,6 @@ ACPI_MODULE_NAME("nsrepair")
  *
  * Additional possible repairs:
  *
- * Optional/unnecessary NULL package elements removed
  * Required package elements that are NULL replaced by Integer/String/Buffer
  * Incorrect standalone package wrapped with required outer package
  *
@@ -623,16 +622,12 @@ acpi_ns_remove_null_elements(struct acpi_predefined_data *data,
 	ACPI_FUNCTION_NAME(ns_remove_null_elements);
 
 	/*
-	 * PTYPE1 packages contain no subpackages.
-	 * PTYPE2 packages contain a variable number of sub-packages. We can
-	 * safely remove all NULL elements from the PTYPE2 packages.
+	 * We can safely remove all NULL elements from these package types:
+	 * PTYPE1_VAR packages contain a variable number of simple data types.
+	 * PTYPE2 packages contain a variable number of sub-packages.
 	 */
 	switch (package_type) {
-	case ACPI_PTYPE1_FIXED:
 	case ACPI_PTYPE1_VAR:
-	case ACPI_PTYPE1_OPTION:
-		return;
-
 	case ACPI_PTYPE2:
 	case ACPI_PTYPE2_COUNT:
 	case ACPI_PTYPE2_PKG_COUNT:
@@ -642,6 +637,8 @@ acpi_ns_remove_null_elements(struct acpi_predefined_data *data,
 		break;
 
 	default:
+	case ACPI_PTYPE1_FIXED:
+	case ACPI_PTYPE1_OPTION:
 		return;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 82a1b7cb83b6bd7d2fe3972e847c8ccbff55cb9c Mon Sep 17 00:00:00 2001
From: Bob Moore <robert.moore@intel.com>
Date: Wed, 13 Apr 2011 13:19:35 +0800
Subject: ACPICA: Update internal address SpaceID for DataTable regions

Moved this internal space id in preparation for ACPI 5.0 changes
that will include some new space IDs.

Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/acpica/acconfig.h |  2 +-
 drivers/acpi/acpica/amlcode.h  | 15 ---------------
 drivers/acpi/acpica/dswload.c  |  2 +-
 drivers/acpi/acpica/dswload2.c |  2 +-
 drivers/acpi/acpica/excreate.c |  3 ++-
 drivers/acpi/acpica/utdecode.c |  5 +++--
 include/acpi/actypes.h         | 13 +++++++++++--
 7 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/drivers/acpi/acpica/acconfig.h b/drivers/acpi/acpica/acconfig.h
index ab87396c2c0..c115bb9bd4d 100644
--- a/drivers/acpi/acpica/acconfig.h
+++ b/drivers/acpi/acpica/acconfig.h
@@ -187,7 +187,7 @@
 
 /* Operation regions */
 
-#define ACPI_NUM_PREDEFINED_REGIONS     9
+#define ACPI_NUM_PREDEFINED_REGIONS     8
 #define ACPI_USER_REGION_BEGIN          0x80
 
 /* Maximum space_ids for Operation Regions */
diff --git a/drivers/acpi/acpica/amlcode.h b/drivers/acpi/acpica/amlcode.h
index f4f0998d396..1077f17859e 100644
--- a/drivers/acpi/acpica/amlcode.h
+++ b/drivers/acpi/acpica/amlcode.h
@@ -394,21 +394,6 @@
 #define AML_CLASS_METHOD_CALL       0x09
 #define AML_CLASS_UNKNOWN           0x0A
 
-/* Predefined Operation Region space_iDs */
-
-typedef enum {
-	REGION_MEMORY = 0,
-	REGION_IO,
-	REGION_PCI_CONFIG,
-	REGION_EC,
-	REGION_SMBUS,
-	REGION_CMOS,
-	REGION_PCI_BAR,
-	REGION_IPMI,
-	REGION_DATA_TABLE,	/* Internal use only */
-	REGION_FIXED_HW = 0x7F
-} AML_REGION_TYPES;
-
 /* Comparison operation codes for match_op operator */
 
 typedef enum {
diff --git a/drivers/acpi/acpica/dswload.c b/drivers/acpi/acpica/dswload.c
index 23a3b1ab20c..324acec1179 100644
--- a/drivers/acpi/acpica/dswload.c
+++ b/drivers/acpi/acpica/dswload.c
@@ -450,7 +450,7 @@ acpi_status acpi_ds_load1_end_op(struct acpi_walk_state *walk_state)
 			status =
 			    acpi_ex_create_region(op->named.data,
 						  op->named.length,
-						  REGION_DATA_TABLE,
+						  ACPI_ADR_SPACE_DATA_TABLE,
 						  walk_state);
 			if (ACPI_FAILURE(status)) {
 				return_ACPI_STATUS(status);
diff --git a/drivers/acpi/acpica/dswload2.c b/drivers/acpi/acpica/dswload2.c
index 4be4e921dfe..976318138c5 100644
--- a/drivers/acpi/acpica/dswload2.c
+++ b/drivers/acpi/acpica/dswload2.c
@@ -562,7 +562,7 @@ acpi_status acpi_ds_load2_end_op(struct acpi_walk_state *walk_state)
 				    ((op->common.value.arg)->common.value.
 				     integer);
 			} else {
-				region_space = REGION_DATA_TABLE;
+				region_space = ACPI_ADR_SPACE_DATA_TABLE;
 			}
 
 			/*
diff --git a/drivers/acpi/acpica/excreate.c b/drivers/acpi/acpica/excreate.c
index e7b372d1766..110711afada 100644
--- a/drivers/acpi/acpica/excreate.c
+++ b/drivers/acpi/acpica/excreate.c
@@ -305,7 +305,8 @@ acpi_ex_create_region(u8 * aml_start,
 	 * range
 	 */
 	if ((region_space >= ACPI_NUM_PREDEFINED_REGIONS) &&
-	    (region_space < ACPI_USER_REGION_BEGIN)) {
+	    (region_space < ACPI_USER_REGION_BEGIN) &&
+	    (region_space != ACPI_ADR_SPACE_DATA_TABLE)) {
 		ACPI_ERROR((AE_INFO, "Invalid AddressSpace type 0x%X",
 			    region_space));
 		return_ACPI_STATUS(AE_AML_INVALID_SPACE_ID);
diff --git a/drivers/acpi/acpica/utdecode.c b/drivers/acpi/acpica/utdecode.c
index 136a814cec6..97cb36f85ce 100644
--- a/drivers/acpi/acpica/utdecode.c
+++ b/drivers/acpi/acpica/utdecode.c
@@ -170,8 +170,7 @@ const char *acpi_gbl_region_types[ACPI_NUM_PREDEFINED_REGIONS] = {
 	"SMBus",
 	"SystemCMOS",
 	"PCIBARTarget",
-	"IPMI",
-	"DataTable"
+	"IPMI"
 };
 
 char *acpi_ut_get_region_name(u8 space_id)
@@ -179,6 +178,8 @@ char *acpi_ut_get_region_name(u8 space_id)
 
 	if (space_id >= ACPI_USER_REGION_BEGIN) {
 		return ("UserDefinedRegion");
+	} else if (space_id == ACPI_ADR_SPACE_DATA_TABLE) {
+		return ("DataTable");
 	} else if (space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
 		return ("FunctionalFixedHW");
 	} else if (space_id >= ACPI_NUM_PREDEFINED_REGIONS) {
diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h
index 64f838beaab..ad77c613c74 100644
--- a/include/acpi/actypes.h
+++ b/include/acpi/actypes.h
@@ -712,8 +712,17 @@ typedef u8 acpi_adr_space_type;
 #define ACPI_ADR_SPACE_CMOS             (acpi_adr_space_type) 5
 #define ACPI_ADR_SPACE_PCI_BAR_TARGET   (acpi_adr_space_type) 6
 #define ACPI_ADR_SPACE_IPMI             (acpi_adr_space_type) 7
-#define ACPI_ADR_SPACE_DATA_TABLE       (acpi_adr_space_type) 8
-#define ACPI_ADR_SPACE_FIXED_HARDWARE   (acpi_adr_space_type) 127
+
+/*
+ * Special region types
+ *
+ * Note: A Data Table region is a special type of operation region
+ * that has its own AML opcode. However, internally, the AML
+ * interpreter simply creates an operation region with an an address
+ * space type of ACPI_ADR_SPACE_DATA_TABLE.
+ */
+#define ACPI_ADR_SPACE_DATA_TABLE       (acpi_adr_space_type) 0x7E	/* Internal to ACPICA only */
+#define ACPI_ADR_SPACE_FIXED_HARDWARE   (acpi_adr_space_type) 0x7F
 
 /*
  * bit_register IDs
-- 
cgit v1.2.3-70-g09d2


From 07aa99e9df2184e78068f7d5414e29e4a5a1b452 Mon Sep 17 00:00:00 2001
From: Bob Moore <robert.moore@intel.com>
Date: Wed, 13 Apr 2011 13:20:49 +0800
Subject: ACPICA: Move ACPI_NUM_PREDEFINED_REGIONS to a more appropriate place

Moved to where the predefined regions are actually defined.

Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/acpica/acconfig.h | 1 -
 include/acpi/actypes.h         | 4 +++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/acpica/acconfig.h b/drivers/acpi/acpica/acconfig.h
index c115bb9bd4d..bc533dde16c 100644
--- a/drivers/acpi/acpica/acconfig.h
+++ b/drivers/acpi/acpica/acconfig.h
@@ -187,7 +187,6 @@
 
 /* Operation regions */
 
-#define ACPI_NUM_PREDEFINED_REGIONS     8
 #define ACPI_USER_REGION_BEGIN          0x80
 
 /* Maximum space_ids for Operation Regions */
diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h
index ad77c613c74..f3b29fa5654 100644
--- a/include/acpi/actypes.h
+++ b/include/acpi/actypes.h
@@ -713,8 +713,10 @@ typedef u8 acpi_adr_space_type;
 #define ACPI_ADR_SPACE_PCI_BAR_TARGET   (acpi_adr_space_type) 6
 #define ACPI_ADR_SPACE_IPMI             (acpi_adr_space_type) 7
 
+#define ACPI_NUM_PREDEFINED_REGIONS     8
+
 /*
- * Special region types
+ * Special Address Spaces
  *
  * Note: A Data Table region is a special type of operation region
  * that has its own AML opcode. However, internally, the AML
-- 
cgit v1.2.3-70-g09d2


From e2066ca1b211ff08325c98be9fb8ad95affbaba8 Mon Sep 17 00:00:00 2001
From: Bob Moore <robert.moore@intel.com>
Date: Wed, 13 Apr 2011 13:22:04 +0800
Subject: ACPICA: Execute an orphan _REG method under the EC device

This change will force the execution of a _REG method underneath
the EC device even if there is no corresponding operation region
of type EmbeddedControl. Fixes a problem seen on some machines
and apparently is compatible with Windows behavior.

http://www.acpica.org/bugzilla/show_bug.cgi?id=875

Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/acpica/evregion.c | 121 ++++++++++++++++++++++++++++++++++++++++-
 drivers/acpi/acpica/evrgnini.c |   2 +-
 drivers/acpi/acpica/evxfregn.c |  13 +++--
 include/acpi/actypes.h         |   5 ++
 4 files changed, 132 insertions(+), 9 deletions(-)

diff --git a/drivers/acpi/acpica/evregion.c b/drivers/acpi/acpica/evregion.c
index bea7223d7a7..f0edf5c43c0 100644
--- a/drivers/acpi/acpica/evregion.c
+++ b/drivers/acpi/acpica/evregion.c
@@ -55,6 +55,8 @@ static u8
 acpi_ev_has_default_handler(struct acpi_namespace_node *node,
 			    acpi_adr_space_type space_id);
 
+static void acpi_ev_orphan_ec_reg_method(void);
+
 static acpi_status
 acpi_ev_reg_run(acpi_handle obj_handle,
 		u32 level, void *context, void **return_value);
@@ -561,7 +563,9 @@ acpi_ev_detach_region(union acpi_operand_object *region_obj,
 
 			/* Now stop region accesses by executing the _REG method */
 
-			status = acpi_ev_execute_reg_method(region_obj, 0);
+			status =
+			    acpi_ev_execute_reg_method(region_obj,
+						       ACPI_REG_DISCONNECT);
 			if (ACPI_FAILURE(status)) {
 				ACPI_EXCEPTION((AE_INFO, status,
 						"from region _REG, [%s]",
@@ -1062,6 +1066,12 @@ acpi_ev_execute_reg_methods(struct acpi_namespace_node *node,
 					ACPI_NS_WALK_UNLOCK, acpi_ev_reg_run,
 					NULL, &space_id, NULL);
 
+	/* Special case for EC: handle "orphan" _REG methods with no region */
+
+	if (space_id == ACPI_ADR_SPACE_EC) {
+		acpi_ev_orphan_ec_reg_method();
+	}
+
 	return_ACPI_STATUS(status);
 }
 
@@ -1120,6 +1130,113 @@ acpi_ev_reg_run(acpi_handle obj_handle,
 		return (AE_OK);
 	}
 
-	status = acpi_ev_execute_reg_method(obj_desc, 1);
+	status = acpi_ev_execute_reg_method(obj_desc, ACPI_REG_CONNECT);
 	return (status);
 }
+
+/*******************************************************************************
+ *
+ * FUNCTION:    acpi_ev_orphan_ec_reg_method
+ *
+ * PARAMETERS:  None
+ *
+ * RETURN:      None
+ *
+ * DESCRIPTION: Execute an "orphan" _REG method that appears under the EC
+ *              device. This is a _REG method that has no corresponding region
+ *              within the EC device scope. The orphan _REG method appears to
+ *              have been enabled by the description of the ECDT in the ACPI
+ *              specification: "The availability of the region space can be
+ *              detected by providing a _REG method object underneath the
+ *              Embedded Controller device."
+ *
+ *              To quickly access the EC device, we use the EC_ID that appears
+ *              within the ECDT. Otherwise, we would need to perform a time-
+ *              consuming namespace walk, executing _HID methods to find the
+ *              EC device.
+ *
+ ******************************************************************************/
+
+static void acpi_ev_orphan_ec_reg_method(void)
+{
+	struct acpi_table_ecdt *table;
+	acpi_status status;
+	struct acpi_object_list args;
+	union acpi_object objects[2];
+	struct acpi_namespace_node *ec_device_node;
+	struct acpi_namespace_node *reg_method;
+	struct acpi_namespace_node *next_node;
+
+	ACPI_FUNCTION_TRACE(ev_orphan_ec_reg_method);
+
+	/* Get the ECDT (if present in system) */
+
+	status = acpi_get_table(ACPI_SIG_ECDT, 0,
+				ACPI_CAST_INDIRECT_PTR(struct acpi_table_header,
+						       &table));
+	if (ACPI_FAILURE(status)) {
+		return_VOID;
+	}
+
+	/* We need a valid EC_ID string */
+
+	if (!(*table->id)) {
+		return_VOID;
+	}
+
+	/* Namespace is currently locked, must release */
+
+	(void)acpi_ut_release_mutex(ACPI_MTX_NAMESPACE);
+
+	/* Get a handle to the EC device referenced in the ECDT */
+
+	status = acpi_get_handle(NULL,
+				 ACPI_CAST_PTR(char, table->id),
+				 ACPI_CAST_PTR(acpi_handle, &ec_device_node));
+	if (ACPI_FAILURE(status)) {
+		goto exit;
+	}
+
+	/* Get a handle to a _REG method immediately under the EC device */
+
+	status = acpi_get_handle(ec_device_node,
+				 METHOD_NAME__REG, ACPI_CAST_PTR(acpi_handle,
+								 &reg_method));
+	if (ACPI_FAILURE(status)) {
+		goto exit;
+	}
+
+	/*
+	 * Execute the _REG method only if there is no Operation Region in
+	 * this scope with the Embedded Controller space ID. Otherwise, it
+	 * will already have been executed. Note, this allows for Regions
+	 * with other space IDs to be present; but the code below will then
+	 * execute the _REG method with the EC space ID argument.
+	 */
+	next_node = acpi_ns_get_next_node(ec_device_node, NULL);
+	while (next_node) {
+		if ((next_node->type == ACPI_TYPE_REGION) &&
+		    (next_node->object) &&
+		    (next_node->object->region.space_id == ACPI_ADR_SPACE_EC)) {
+			goto exit;	/* Do not execute _REG */
+		}
+		next_node = acpi_ns_get_next_node(ec_device_node, next_node);
+	}
+
+	/* Evaluate the _REG(EC,Connect) method */
+
+	args.count = 2;
+	args.pointer = objects;
+	objects[0].type = ACPI_TYPE_INTEGER;
+	objects[0].integer.value = ACPI_ADR_SPACE_EC;
+	objects[1].type = ACPI_TYPE_INTEGER;
+	objects[1].integer.value = ACPI_REG_CONNECT;
+
+	status = acpi_evaluate_object(reg_method, NULL, &args, NULL);
+
+      exit:
+	/* We ignore all errors from above, don't care */
+
+	status = acpi_ut_acquire_mutex(ACPI_MTX_NAMESPACE);
+	return_VOID;
+}
diff --git a/drivers/acpi/acpica/evrgnini.c b/drivers/acpi/acpica/evrgnini.c
index 9659cee6093..55a5d35ef34 100644
--- a/drivers/acpi/acpica/evrgnini.c
+++ b/drivers/acpi/acpica/evrgnini.c
@@ -637,7 +637,7 @@ acpi_ev_initialize_region(union acpi_operand_object *region_obj,
 
 					status =
 					    acpi_ev_execute_reg_method
-					    (region_obj, 1);
+					    (region_obj, ACPI_REG_CONNECT);
 
 					if (acpi_ns_locked) {
 						status =
diff --git a/drivers/acpi/acpica/evxfregn.c b/drivers/acpi/acpica/evxfregn.c
index c85c8c45599..00cd95692a9 100644
--- a/drivers/acpi/acpica/evxfregn.c
+++ b/drivers/acpi/acpica/evxfregn.c
@@ -130,20 +130,21 @@ acpi_install_address_space_handler(acpi_handle device,
 	case ACPI_ADR_SPACE_PCI_CONFIG:
 	case ACPI_ADR_SPACE_DATA_TABLE:
 
-		if (acpi_gbl_reg_methods_executed) {
+		if (!acpi_gbl_reg_methods_executed) {
 
-			/* Run all _REG methods for this address space */
-
-			status = acpi_ev_execute_reg_methods(node, space_id);
+			/* We will defer execution of the _REG methods for this space */
+			goto unlock_and_exit;
 		}
 		break;
 
 	default:
-
-		status = acpi_ev_execute_reg_methods(node, space_id);
 		break;
 	}
 
+	/* Run all _REG methods for this address space */
+
+	status = acpi_ev_execute_reg_methods(node, space_id);
+
       unlock_and_exit:
 	(void)acpi_ut_release_mutex(ACPI_MTX_NAMESPACE);
 	return_ACPI_STATUS(status);
diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h
index f3b29fa5654..a6412b82a57 100644
--- a/include/acpi/actypes.h
+++ b/include/acpi/actypes.h
@@ -726,6 +726,11 @@ typedef u8 acpi_adr_space_type;
 #define ACPI_ADR_SPACE_DATA_TABLE       (acpi_adr_space_type) 0x7E	/* Internal to ACPICA only */
 #define ACPI_ADR_SPACE_FIXED_HARDWARE   (acpi_adr_space_type) 0x7F
 
+/* Values for _REG connection code */
+
+#define ACPI_REG_DISCONNECT             0
+#define ACPI_REG_CONNECT                1
+
 /*
  * bit_register IDs
  *
-- 
cgit v1.2.3-70-g09d2


From 0a63e2308cbbdc7e2f5645769afaf53785bcb9fa Mon Sep 17 00:00:00 2001
From: Bob Moore <robert.moore@intel.com>
Date: Fri, 15 Apr 2011 09:12:47 +0800
Subject: ACPICA: Update to version 20110413

Version 20110413

Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 include/acpi/acpixf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h
index f6ad63d25b7..2ed0a8486c1 100644
--- a/include/acpi/acpixf.h
+++ b/include/acpi/acpixf.h
@@ -47,7 +47,7 @@
 
 /* Current ACPICA subsystem version in YYYYMMDD format */
 
-#define ACPI_CA_VERSION                 0x20110316
+#define ACPI_CA_VERSION                 0x20110413
 
 #include "actypes.h"
 #include "actbl.h"
-- 
cgit v1.2.3-70-g09d2


From de7a2f9f7b6f5b48d8531ff4c9c9b95cab8a8ce8 Mon Sep 17 00:00:00 2001
From: Nicolas Ferre <nicolas.ferre@atmel.com>
Date: Mon, 9 May 2011 18:11:37 +0200
Subject: dmaengine: at_hdmac: pause: no need to wait for FIFO empty

With the addition of the "pause" feature, an active wait was introduced
to check the "FIFO empty" event. This event was not always happening and
a timout contition was needed.
But, in some cases, this event depend on the peripheral connected to the
channel that is paused: FIFO becomes empty if the peripheral consumes data.
The timeout is pretty difficult to evaluate. Moreover, this check is not
needed.
In conclusion, it seems sensible to entirely remove the checking of
"FIFO empty" status when pausing.

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
[commit msg edited for grammer]
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/at_hdmac.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index 2c6bc3aa3dd..3134003eec8 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -933,25 +933,9 @@ static int atc_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
 	dev_vdbg(chan2dev(chan), "atc_control (%d)\n", cmd);
 
 	if (cmd == DMA_PAUSE) {
-		int pause_timeout = 1000;
-
 		spin_lock_bh(&atchan->lock);
 
 		dma_writel(atdma, CHER, AT_DMA_SUSP(chan_id));
-
-		/* wait for FIFO to be empty */
-		while (!(dma_readl(atdma, CHSR) & AT_DMA_EMPT(chan_id))) {
-			if (pause_timeout-- > 0) {
-				/* the FIFO can only drain if the peripheral
-				 * is still requesting data:
-				 * -> timeout if it is not the case. */
-				dma_writel(atdma, CHDR, AT_DMA_RES(chan_id));
-				spin_unlock_bh(&atchan->lock);
-				return -ETIMEDOUT;
-			}
-			cpu_relax();
-		}
-
 		set_bit(ATC_IS_PAUSED, &atchan->status);
 
 		spin_unlock_bh(&atchan->lock);
-- 
cgit v1.2.3-70-g09d2


From 5fedefb87bd0a64281d28edd295f29e3b989d78c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@st.com>
Date: Fri, 15 Apr 2011 16:03:35 +0530
Subject: dmaengine/dw_dmac: don't call callback routine in case
 dmaengine_terminate_all() is called

If dmaengine_terminate_all() is called for dma channel, then it doesn't make
much sense to call registered callback routine. While in case of success or
failure it must be called.

Signed-off-by: Viresh Kumar <viresh.kumar@st.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/dw_dmac.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c
index b15c32ca0ef..265d43c8b35 100644
--- a/drivers/dma/dw_dmac.c
+++ b/drivers/dma/dw_dmac.c
@@ -195,18 +195,21 @@ static void dwc_dostart(struct dw_dma_chan *dwc, struct dw_desc *first)
 /*----------------------------------------------------------------------*/
 
 static void
-dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc)
+dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc,
+		bool callback_required)
 {
-	dma_async_tx_callback		callback;
-	void				*param;
+	dma_async_tx_callback		callback = NULL;
+	void				*param = NULL;
 	struct dma_async_tx_descriptor	*txd = &desc->txd;
 	struct dw_desc			*child;
 
 	dev_vdbg(chan2dev(&dwc->chan), "descriptor %u complete\n", txd->cookie);
 
 	dwc->completed = txd->cookie;
-	callback = txd->callback;
-	param = txd->callback_param;
+	if (callback_required) {
+		callback = txd->callback;
+		param = txd->callback_param;
+	}
 
 	dwc_sync_desc_for_cpu(dwc, desc);
 
@@ -238,11 +241,7 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc)
 		}
 	}
 
-	/*
-	 * The API requires that no submissions are done from a
-	 * callback, so we don't need to drop the lock here
-	 */
-	if (callback)
+	if (callback_required && callback)
 		callback(param);
 }
 
@@ -272,7 +271,7 @@ static void dwc_complete_all(struct dw_dma *dw, struct dw_dma_chan *dwc)
 	}
 
 	list_for_each_entry_safe(desc, _desc, &list, desc_node)
-		dwc_descriptor_complete(dwc, desc);
+		dwc_descriptor_complete(dwc, desc, true);
 }
 
 static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc)
@@ -322,7 +321,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc)
 		 * No descriptors so far seem to be in progress, i.e.
 		 * this one must be done.
 		 */
-		dwc_descriptor_complete(dwc, desc);
+		dwc_descriptor_complete(dwc, desc, true);
 	}
 
 	dev_err(chan2dev(&dwc->chan),
@@ -384,7 +383,7 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc)
 		dwc_dump_lli(dwc, &child->lli);
 
 	/* Pretend the descriptor completed successfully */
-	dwc_descriptor_complete(dwc, bad_desc);
+	dwc_descriptor_complete(dwc, bad_desc, true);
 }
 
 /* --------------------- Cyclic DMA API extensions -------------------- */
@@ -831,7 +830,7 @@ static int dwc_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
 
 	/* Flush all pending and queued descriptors */
 	list_for_each_entry_safe(desc, _desc, &list, desc_node)
-		dwc_descriptor_complete(dwc, desc);
+		dwc_descriptor_complete(dwc, desc, false);
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From abf53902dcc6d44d2e06b09817fa67857aa686fe Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@st.com>
Date: Fri, 15 Apr 2011 16:03:35 +0530
Subject: dmaengine/dw_dmac: set residue as total len in dwc_tx_status if
 status is !DMA_SUCCESS

If transfer status is !=DMA_SUCCESS, return total transfer len as residue,
instead of zero.

Signed-off-by: Viresh Kumar <viresh.kumar@st.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/dw_dmac.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c
index 265d43c8b35..f209ff8835e 100644
--- a/drivers/dma/dw_dmac.c
+++ b/drivers/dma/dw_dmac.c
@@ -860,7 +860,11 @@ dwc_tx_status(struct dma_chan *chan,
 		ret = dma_async_is_complete(cookie, last_complete, last_used);
 	}
 
-	dma_set_tx_state(txstate, last_complete, last_used, 0);
+	if (ret != DMA_SUCCESS)
+		dma_set_tx_state(txstate, last_complete, last_used,
+				dwc_first_active(dwc)->len);
+	else
+		dma_set_tx_state(txstate, last_complete, last_used, 0);
 
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From 69dc14b51c1aad9d82afd8f96bf4e4835089bffc Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@st.com>
Date: Mon, 18 Apr 2011 14:54:56 +0530
Subject: dmaengine/dw_dmac: Divide one sg to many desc, if sg len is greater
 than DWC_MAX_COUNT

If len passed in sg for slave_sg transfers is greater than DWC_MAX_COUNT, then
driver programmes controller incorrectly.  This patch adds code to handle this
situation by allocation more than one desc for same sg.

Signed-off-by: Viresh Kumar <viresh.kumar@st.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/dw_dmac.c | 65 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 44 insertions(+), 21 deletions(-)

diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c
index f209ff8835e..a9755e3dd60 100644
--- a/drivers/dma/dw_dmac.c
+++ b/drivers/dma/dw_dmac.c
@@ -693,9 +693,15 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 		reg = dws->tx_reg;
 		for_each_sg(sgl, sg, sg_len, i) {
 			struct dw_desc	*desc;
-			u32		len;
-			u32		mem;
+			u32		len, dlen, mem;
 
+			mem = sg_phys(sg);
+			len = sg_dma_len(sg);
+			mem_width = 2;
+			if (unlikely(mem & 3 || len & 3))
+				mem_width = 0;
+
+slave_sg_todev_fill_desc:
 			desc = dwc_desc_get(dwc);
 			if (!desc) {
 				dev_err(chan2dev(chan),
@@ -703,16 +709,19 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 				goto err_desc_get;
 			}
 
-			mem = sg_phys(sg);
-			len = sg_dma_len(sg);
-			mem_width = 2;
-			if (unlikely(mem & 3 || len & 3))
-				mem_width = 0;
-
 			desc->lli.sar = mem;
 			desc->lli.dar = reg;
 			desc->lli.ctllo = ctllo | DWC_CTLL_SRC_WIDTH(mem_width);
-			desc->lli.ctlhi = len >> mem_width;
+			if ((len >> mem_width) > DWC_MAX_COUNT) {
+				dlen = DWC_MAX_COUNT << mem_width;
+				mem += dlen;
+				len -= dlen;
+			} else {
+				dlen = len;
+				len = 0;
+			}
+
+			desc->lli.ctlhi = dlen >> mem_width;
 
 			if (!first) {
 				first = desc;
@@ -726,7 +735,10 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 						&first->tx_list);
 			}
 			prev = desc;
-			total_len += len;
+			total_len += dlen;
+
+			if (len)
+				goto slave_sg_todev_fill_desc;
 		}
 		break;
 	case DMA_FROM_DEVICE:
@@ -739,15 +751,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 		reg = dws->rx_reg;
 		for_each_sg(sgl, sg, sg_len, i) {
 			struct dw_desc	*desc;
-			u32		len;
-			u32		mem;
-
-			desc = dwc_desc_get(dwc);
-			if (!desc) {
-				dev_err(chan2dev(chan),
-					"not enough descriptors available\n");
-				goto err_desc_get;
-			}
+			u32		len, dlen, mem;
 
 			mem = sg_phys(sg);
 			len = sg_dma_len(sg);
@@ -755,10 +759,26 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 			if (unlikely(mem & 3 || len & 3))
 				mem_width = 0;
 
+slave_sg_fromdev_fill_desc:
+			desc = dwc_desc_get(dwc);
+			if (!desc) {
+				dev_err(chan2dev(chan),
+						"not enough descriptors available\n");
+				goto err_desc_get;
+			}
+
 			desc->lli.sar = reg;
 			desc->lli.dar = mem;
 			desc->lli.ctllo = ctllo | DWC_CTLL_DST_WIDTH(mem_width);
-			desc->lli.ctlhi = len >> reg_width;
+			if ((len >> reg_width) > DWC_MAX_COUNT) {
+				dlen = DWC_MAX_COUNT << reg_width;
+				mem += dlen;
+				len -= dlen;
+			} else {
+				dlen = len;
+				len = 0;
+			}
+			desc->lli.ctlhi = dlen >> reg_width;
 
 			if (!first) {
 				first = desc;
@@ -772,7 +792,10 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 						&first->tx_list);
 			}
 			prev = desc;
-			total_len += len;
+			total_len += dlen;
+
+			if (len)
+				goto slave_sg_fromdev_fill_desc;
 		}
 		break;
 	default:
-- 
cgit v1.2.3-70-g09d2


From 69cea5a00d3135677939fce1fefe54ed522055a0 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@st.com>
Date: Fri, 15 Apr 2011 16:03:35 +0530
Subject: dmaengine/dw_dmac: Replace spin_lock* with irqsave variants and
 enable submission from callback

dmaengine routines can be called from interrupt context and with interrupts
disabled.  Whereas spin_unlock_bh can't be called from such contexts. So this
patch converts all spin_*_bh routines to irqsave variants.

Also, spin_lock() used in tasklet is converted to irqsave variants, as tasklet
can be interrupted, and dma requests from such interruptions may also call
spin_lock.

Now, submission from callbacks are permitted as per dmaengine framework. So we
shouldn't hold any locks while calling callbacks. As locks were taken by parent
routines, so releasing them before calling callbacks doesn't look clean enough.
So, locks are taken inside all routine now, whereever they are required. And
dwc_descriptor_complete is always called without taking locks.

Signed-off-by: Viresh Kumar <viresh.kumar@st.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/dw_dmac.c | 116 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 77 insertions(+), 39 deletions(-)

diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c
index a9755e3dd60..442b98b81e7 100644
--- a/drivers/dma/dw_dmac.c
+++ b/drivers/dma/dw_dmac.c
@@ -93,8 +93,9 @@ static struct dw_desc *dwc_desc_get(struct dw_dma_chan *dwc)
 	struct dw_desc *desc, *_desc;
 	struct dw_desc *ret = NULL;
 	unsigned int i = 0;
+	unsigned long flags;
 
-	spin_lock_bh(&dwc->lock);
+	spin_lock_irqsave(&dwc->lock, flags);
 	list_for_each_entry_safe(desc, _desc, &dwc->free_list, desc_node) {
 		if (async_tx_test_ack(&desc->txd)) {
 			list_del(&desc->desc_node);
@@ -104,7 +105,7 @@ static struct dw_desc *dwc_desc_get(struct dw_dma_chan *dwc)
 		dev_dbg(chan2dev(&dwc->chan), "desc %p not ACKed\n", desc);
 		i++;
 	}
-	spin_unlock_bh(&dwc->lock);
+	spin_unlock_irqrestore(&dwc->lock, flags);
 
 	dev_vdbg(chan2dev(&dwc->chan), "scanned %u descriptors on freelist\n", i);
 
@@ -130,12 +131,14 @@ static void dwc_sync_desc_for_cpu(struct dw_dma_chan *dwc, struct dw_desc *desc)
  */
 static void dwc_desc_put(struct dw_dma_chan *dwc, struct dw_desc *desc)
 {
+	unsigned long flags;
+
 	if (desc) {
 		struct dw_desc *child;
 
 		dwc_sync_desc_for_cpu(dwc, desc);
 
-		spin_lock_bh(&dwc->lock);
+		spin_lock_irqsave(&dwc->lock, flags);
 		list_for_each_entry(child, &desc->tx_list, desc_node)
 			dev_vdbg(chan2dev(&dwc->chan),
 					"moving child desc %p to freelist\n",
@@ -143,7 +146,7 @@ static void dwc_desc_put(struct dw_dma_chan *dwc, struct dw_desc *desc)
 		list_splice_init(&desc->tx_list, &dwc->free_list);
 		dev_vdbg(chan2dev(&dwc->chan), "moving desc %p to freelist\n", desc);
 		list_add(&desc->desc_node, &dwc->free_list);
-		spin_unlock_bh(&dwc->lock);
+		spin_unlock_irqrestore(&dwc->lock, flags);
 	}
 }
 
@@ -202,9 +205,11 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc,
 	void				*param = NULL;
 	struct dma_async_tx_descriptor	*txd = &desc->txd;
 	struct dw_desc			*child;
+	unsigned long			flags;
 
 	dev_vdbg(chan2dev(&dwc->chan), "descriptor %u complete\n", txd->cookie);
 
+	spin_lock_irqsave(&dwc->lock, flags);
 	dwc->completed = txd->cookie;
 	if (callback_required) {
 		callback = txd->callback;
@@ -241,6 +246,8 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc,
 		}
 	}
 
+	spin_unlock_irqrestore(&dwc->lock, flags);
+
 	if (callback_required && callback)
 		callback(param);
 }
@@ -249,7 +256,9 @@ static void dwc_complete_all(struct dw_dma *dw, struct dw_dma_chan *dwc)
 {
 	struct dw_desc *desc, *_desc;
 	LIST_HEAD(list);
+	unsigned long flags;
 
+	spin_lock_irqsave(&dwc->lock, flags);
 	if (dma_readl(dw, CH_EN) & dwc->mask) {
 		dev_err(chan2dev(&dwc->chan),
 			"BUG: XFER bit set, but channel not idle!\n");
@@ -270,6 +279,8 @@ static void dwc_complete_all(struct dw_dma *dw, struct dw_dma_chan *dwc)
 		dwc_dostart(dwc, dwc_first_active(dwc));
 	}
 
+	spin_unlock_irqrestore(&dwc->lock, flags);
+
 	list_for_each_entry_safe(desc, _desc, &list, desc_node)
 		dwc_descriptor_complete(dwc, desc, true);
 }
@@ -280,7 +291,9 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc)
 	struct dw_desc *desc, *_desc;
 	struct dw_desc *child;
 	u32 status_xfer;
+	unsigned long flags;
 
+	spin_lock_irqsave(&dwc->lock, flags);
 	/*
 	 * Clear block interrupt flag before scanning so that we don't
 	 * miss any, and read LLP before RAW_XFER to ensure it is
@@ -293,35 +306,47 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc)
 	if (status_xfer & dwc->mask) {
 		/* Everything we've submitted is done */
 		dma_writel(dw, CLEAR.XFER, dwc->mask);
+		spin_unlock_irqrestore(&dwc->lock, flags);
+
 		dwc_complete_all(dw, dwc);
 		return;
 	}
 
-	if (list_empty(&dwc->active_list))
+	if (list_empty(&dwc->active_list)) {
+		spin_unlock_irqrestore(&dwc->lock, flags);
 		return;
+	}
 
 	dev_vdbg(chan2dev(&dwc->chan), "scan_descriptors: llp=0x%x\n", llp);
 
 	list_for_each_entry_safe(desc, _desc, &dwc->active_list, desc_node) {
 		/* check first descriptors addr */
-		if (desc->txd.phys == llp)
+		if (desc->txd.phys == llp) {
+			spin_unlock_irqrestore(&dwc->lock, flags);
 			return;
+		}
 
 		/* check first descriptors llp */
-		if (desc->lli.llp == llp)
+		if (desc->lli.llp == llp) {
 			/* This one is currently in progress */
+			spin_unlock_irqrestore(&dwc->lock, flags);
 			return;
+		}
 
 		list_for_each_entry(child, &desc->tx_list, desc_node)
-			if (child->lli.llp == llp)
+			if (child->lli.llp == llp) {
 				/* Currently in progress */
+				spin_unlock_irqrestore(&dwc->lock, flags);
 				return;
+			}
 
 		/*
 		 * No descriptors so far seem to be in progress, i.e.
 		 * this one must be done.
 		 */
+		spin_unlock_irqrestore(&dwc->lock, flags);
 		dwc_descriptor_complete(dwc, desc, true);
+		spin_lock_irqsave(&dwc->lock, flags);
 	}
 
 	dev_err(chan2dev(&dwc->chan),
@@ -336,6 +361,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc)
 		list_move(dwc->queue.next, &dwc->active_list);
 		dwc_dostart(dwc, dwc_first_active(dwc));
 	}
+	spin_unlock_irqrestore(&dwc->lock, flags);
 }
 
 static void dwc_dump_lli(struct dw_dma_chan *dwc, struct dw_lli *lli)
@@ -350,9 +376,12 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc)
 {
 	struct dw_desc *bad_desc;
 	struct dw_desc *child;
+	unsigned long flags;
 
 	dwc_scan_descriptors(dw, dwc);
 
+	spin_lock_irqsave(&dwc->lock, flags);
+
 	/*
 	 * The descriptor currently at the head of the active list is
 	 * borked. Since we don't have any way to report errors, we'll
@@ -382,6 +411,8 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc)
 	list_for_each_entry(child, &bad_desc->tx_list, desc_node)
 		dwc_dump_lli(dwc, &child->lli);
 
+	spin_unlock_irqrestore(&dwc->lock, flags);
+
 	/* Pretend the descriptor completed successfully */
 	dwc_descriptor_complete(dwc, bad_desc, true);
 }
@@ -406,6 +437,8 @@ EXPORT_SYMBOL(dw_dma_get_dst_addr);
 static void dwc_handle_cyclic(struct dw_dma *dw, struct dw_dma_chan *dwc,
 		u32 status_block, u32 status_err, u32 status_xfer)
 {
+	unsigned long flags;
+
 	if (status_block & dwc->mask) {
 		void (*callback)(void *param);
 		void *callback_param;
@@ -416,11 +449,9 @@ static void dwc_handle_cyclic(struct dw_dma *dw, struct dw_dma_chan *dwc,
 
 		callback = dwc->cdesc->period_callback;
 		callback_param = dwc->cdesc->period_callback_param;
-		if (callback) {
-			spin_unlock(&dwc->lock);
+
+		if (callback)
 			callback(callback_param);
-			spin_lock(&dwc->lock);
-		}
 	}
 
 	/*
@@ -434,6 +465,9 @@ static void dwc_handle_cyclic(struct dw_dma *dw, struct dw_dma_chan *dwc,
 		dev_err(chan2dev(&dwc->chan), "cyclic DMA unexpected %s "
 				"interrupt, stopping DMA transfer\n",
 				status_xfer ? "xfer" : "error");
+
+		spin_lock_irqsave(&dwc->lock, flags);
+
 		dev_err(chan2dev(&dwc->chan),
 			"  SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n",
 			channel_readl(dwc, SAR),
@@ -457,6 +491,8 @@ static void dwc_handle_cyclic(struct dw_dma *dw, struct dw_dma_chan *dwc,
 
 		for (i = 0; i < dwc->cdesc->periods; i++)
 			dwc_dump_lli(dwc, &dwc->cdesc->desc[i]->lli);
+
+		spin_unlock_irqrestore(&dwc->lock, flags);
 	}
 }
 
@@ -480,7 +516,6 @@ static void dw_dma_tasklet(unsigned long data)
 
 	for (i = 0; i < dw->dma.chancnt; i++) {
 		dwc = &dw->chan[i];
-		spin_lock(&dwc->lock);
 		if (test_bit(DW_DMA_IS_CYCLIC, &dwc->flags))
 			dwc_handle_cyclic(dw, dwc, status_block, status_err,
 					status_xfer);
@@ -488,7 +523,6 @@ static void dw_dma_tasklet(unsigned long data)
 			dwc_handle_error(dw, dwc);
 		else if ((status_block | status_xfer) & (1 << i))
 			dwc_scan_descriptors(dw, dwc);
-		spin_unlock(&dwc->lock);
 	}
 
 	/*
@@ -543,8 +577,9 @@ static dma_cookie_t dwc_tx_submit(struct dma_async_tx_descriptor *tx)
 	struct dw_desc		*desc = txd_to_dw_desc(tx);
 	struct dw_dma_chan	*dwc = to_dw_dma_chan(tx->chan);
 	dma_cookie_t		cookie;
+	unsigned long		flags;
 
-	spin_lock_bh(&dwc->lock);
+	spin_lock_irqsave(&dwc->lock, flags);
 	cookie = dwc_assign_cookie(dwc, desc);
 
 	/*
@@ -564,7 +599,7 @@ static dma_cookie_t dwc_tx_submit(struct dma_async_tx_descriptor *tx)
 		list_add_tail(&desc->desc_node, &dwc->queue);
 	}
 
-	spin_unlock_bh(&dwc->lock);
+	spin_unlock_irqrestore(&dwc->lock, flags);
 
 	return cookie;
 }
@@ -826,6 +861,7 @@ static int dwc_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
 	struct dw_dma		*dw = to_dw_dma(chan->device);
 	struct dw_desc		*desc, *_desc;
+	unsigned long		flags;
 	LIST_HEAD(list);
 
 	/* Only supports DMA_TERMINATE_ALL */
@@ -838,7 +874,7 @@ static int dwc_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
 	 * channel. We still have to poll the channel enable bit due
 	 * to AHB/HSB limitations.
 	 */
-	spin_lock_bh(&dwc->lock);
+	spin_lock_irqsave(&dwc->lock, flags);
 
 	channel_clear_bit(dw, CH_EN, dwc->mask);
 
@@ -849,7 +885,7 @@ static int dwc_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
 	list_splice_init(&dwc->queue, &list);
 	list_splice_init(&dwc->active_list, &list);
 
-	spin_unlock_bh(&dwc->lock);
+	spin_unlock_irqrestore(&dwc->lock, flags);
 
 	/* Flush all pending and queued descriptors */
 	list_for_each_entry_safe(desc, _desc, &list, desc_node)
@@ -873,9 +909,7 @@ dwc_tx_status(struct dma_chan *chan,
 
 	ret = dma_async_is_complete(cookie, last_complete, last_used);
 	if (ret != DMA_SUCCESS) {
-		spin_lock_bh(&dwc->lock);
 		dwc_scan_descriptors(to_dw_dma(chan->device), dwc);
-		spin_unlock_bh(&dwc->lock);
 
 		last_complete = dwc->completed;
 		last_used = chan->cookie;
@@ -896,10 +930,8 @@ static void dwc_issue_pending(struct dma_chan *chan)
 {
 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
 
-	spin_lock_bh(&dwc->lock);
 	if (!list_empty(&dwc->queue))
 		dwc_scan_descriptors(to_dw_dma(chan->device), dwc);
-	spin_unlock_bh(&dwc->lock);
 }
 
 static int dwc_alloc_chan_resources(struct dma_chan *chan)
@@ -911,6 +943,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
 	int			i;
 	u32			cfghi;
 	u32			cfglo;
+	unsigned long		flags;
 
 	dev_vdbg(chan2dev(chan), "alloc_chan_resources\n");
 
@@ -948,16 +981,16 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
 	 * doesn't mean what you think it means), and status writeback.
 	 */
 
-	spin_lock_bh(&dwc->lock);
+	spin_lock_irqsave(&dwc->lock, flags);
 	i = dwc->descs_allocated;
 	while (dwc->descs_allocated < NR_DESCS_PER_CHANNEL) {
-		spin_unlock_bh(&dwc->lock);
+		spin_unlock_irqrestore(&dwc->lock, flags);
 
 		desc = kzalloc(sizeof(struct dw_desc), GFP_KERNEL);
 		if (!desc) {
 			dev_info(chan2dev(chan),
 				"only allocated %d descriptors\n", i);
-			spin_lock_bh(&dwc->lock);
+			spin_lock_irqsave(&dwc->lock, flags);
 			break;
 		}
 
@@ -969,7 +1002,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
 				sizeof(desc->lli), DMA_TO_DEVICE);
 		dwc_desc_put(dwc, desc);
 
-		spin_lock_bh(&dwc->lock);
+		spin_lock_irqsave(&dwc->lock, flags);
 		i = ++dwc->descs_allocated;
 	}
 
@@ -978,7 +1011,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
 	channel_set_bit(dw, MASK.BLOCK, dwc->mask);
 	channel_set_bit(dw, MASK.ERROR, dwc->mask);
 
-	spin_unlock_bh(&dwc->lock);
+	spin_unlock_irqrestore(&dwc->lock, flags);
 
 	dev_dbg(chan2dev(chan),
 		"alloc_chan_resources allocated %d descriptors\n", i);
@@ -991,6 +1024,7 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
 	struct dw_dma		*dw = to_dw_dma(chan->device);
 	struct dw_desc		*desc, *_desc;
+	unsigned long		flags;
 	LIST_HEAD(list);
 
 	dev_dbg(chan2dev(chan), "free_chan_resources (descs allocated=%u)\n",
@@ -1001,7 +1035,7 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
 	BUG_ON(!list_empty(&dwc->queue));
 	BUG_ON(dma_readl(to_dw_dma(chan->device), CH_EN) & dwc->mask);
 
-	spin_lock_bh(&dwc->lock);
+	spin_lock_irqsave(&dwc->lock, flags);
 	list_splice_init(&dwc->free_list, &list);
 	dwc->descs_allocated = 0;
 
@@ -1010,7 +1044,7 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
 	channel_clear_bit(dw, MASK.BLOCK, dwc->mask);
 	channel_clear_bit(dw, MASK.ERROR, dwc->mask);
 
-	spin_unlock_bh(&dwc->lock);
+	spin_unlock_irqrestore(&dwc->lock, flags);
 
 	list_for_each_entry_safe(desc, _desc, &list, desc_node) {
 		dev_vdbg(chan2dev(chan), "  freeing descriptor %p\n", desc);
@@ -1035,13 +1069,14 @@ int dw_dma_cyclic_start(struct dma_chan *chan)
 {
 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
 	struct dw_dma		*dw = to_dw_dma(dwc->chan.device);
+	unsigned long		flags;
 
 	if (!test_bit(DW_DMA_IS_CYCLIC, &dwc->flags)) {
 		dev_err(chan2dev(&dwc->chan), "missing prep for cyclic DMA\n");
 		return -ENODEV;
 	}
 
-	spin_lock(&dwc->lock);
+	spin_lock_irqsave(&dwc->lock, flags);
 
 	/* assert channel is idle */
 	if (dma_readl(dw, CH_EN) & dwc->mask) {
@@ -1054,7 +1089,7 @@ int dw_dma_cyclic_start(struct dma_chan *chan)
 			channel_readl(dwc, LLP),
 			channel_readl(dwc, CTL_HI),
 			channel_readl(dwc, CTL_LO));
-		spin_unlock(&dwc->lock);
+		spin_unlock_irqrestore(&dwc->lock, flags);
 		return -EBUSY;
 	}
 
@@ -1069,7 +1104,7 @@ int dw_dma_cyclic_start(struct dma_chan *chan)
 
 	channel_set_bit(dw, CH_EN, dwc->mask);
 
-	spin_unlock(&dwc->lock);
+	spin_unlock_irqrestore(&dwc->lock, flags);
 
 	return 0;
 }
@@ -1085,14 +1120,15 @@ void dw_dma_cyclic_stop(struct dma_chan *chan)
 {
 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
 	struct dw_dma		*dw = to_dw_dma(dwc->chan.device);
+	unsigned long		flags;
 
-	spin_lock(&dwc->lock);
+	spin_lock_irqsave(&dwc->lock, flags);
 
 	channel_clear_bit(dw, CH_EN, dwc->mask);
 	while (dma_readl(dw, CH_EN) & dwc->mask)
 		cpu_relax();
 
-	spin_unlock(&dwc->lock);
+	spin_unlock_irqrestore(&dwc->lock, flags);
 }
 EXPORT_SYMBOL(dw_dma_cyclic_stop);
 
@@ -1121,17 +1157,18 @@ struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan,
 	unsigned int			reg_width;
 	unsigned int			periods;
 	unsigned int			i;
+	unsigned long			flags;
 
-	spin_lock_bh(&dwc->lock);
+	spin_lock_irqsave(&dwc->lock, flags);
 	if (!list_empty(&dwc->queue) || !list_empty(&dwc->active_list)) {
-		spin_unlock_bh(&dwc->lock);
+		spin_unlock_irqrestore(&dwc->lock, flags);
 		dev_dbg(chan2dev(&dwc->chan),
 				"queue and/or active list are not empty\n");
 		return ERR_PTR(-EBUSY);
 	}
 
 	was_cyclic = test_and_set_bit(DW_DMA_IS_CYCLIC, &dwc->flags);
-	spin_unlock_bh(&dwc->lock);
+	spin_unlock_irqrestore(&dwc->lock, flags);
 	if (was_cyclic) {
 		dev_dbg(chan2dev(&dwc->chan),
 				"channel already prepared for cyclic DMA\n");
@@ -1245,13 +1282,14 @@ void dw_dma_cyclic_free(struct dma_chan *chan)
 	struct dw_dma		*dw = to_dw_dma(dwc->chan.device);
 	struct dw_cyclic_desc	*cdesc = dwc->cdesc;
 	int			i;
+	unsigned long		flags;
 
 	dev_dbg(chan2dev(&dwc->chan), "cyclic free\n");
 
 	if (!cdesc)
 		return;
 
-	spin_lock_bh(&dwc->lock);
+	spin_lock_irqsave(&dwc->lock, flags);
 
 	channel_clear_bit(dw, CH_EN, dwc->mask);
 	while (dma_readl(dw, CH_EN) & dwc->mask)
@@ -1261,7 +1299,7 @@ void dw_dma_cyclic_free(struct dma_chan *chan)
 	dma_writel(dw, CLEAR.ERROR, dwc->mask);
 	dma_writel(dw, CLEAR.XFER, dwc->mask);
 
-	spin_unlock_bh(&dwc->lock);
+	spin_unlock_irqrestore(&dwc->lock, flags);
 
 	for (i = 0; i < cdesc->periods; i++)
 		dwc_desc_put(dwc, cdesc->desc[i]);
-- 
cgit v1.2.3-70-g09d2


From a7c57cf7d4327c41510f8cbf45b1b970e02c34f8 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 19 Apr 2011 08:31:32 +0800
Subject: dmaengine/dw_dmac: implement pause and resume in dwc_control

Some peripherals like amba-pl011 needs pause to be implemented in DMA controller
drivers. This also returns correct status from dwc_tx_status() in case chan is
paused.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@st.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/dw_dmac.c      | 59 ++++++++++++++++++++++++++++++----------------
 drivers/dma/dw_dmac_regs.h |  1 +
 2 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c
index 442b98b81e7..eec675bf4f9 100644
--- a/drivers/dma/dw_dmac.c
+++ b/drivers/dma/dw_dmac.c
@@ -862,34 +862,50 @@ static int dwc_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
 	struct dw_dma		*dw = to_dw_dma(chan->device);
 	struct dw_desc		*desc, *_desc;
 	unsigned long		flags;
+	u32			cfglo;
 	LIST_HEAD(list);
 
-	/* Only supports DMA_TERMINATE_ALL */
-	if (cmd != DMA_TERMINATE_ALL)
-		return -ENXIO;
+	if (cmd == DMA_PAUSE) {
+		spin_lock_irqsave(&dwc->lock, flags);
 
-	/*
-	 * This is only called when something went wrong elsewhere, so
-	 * we don't really care about the data. Just disable the
-	 * channel. We still have to poll the channel enable bit due
-	 * to AHB/HSB limitations.
-	 */
-	spin_lock_irqsave(&dwc->lock, flags);
+		cfglo = channel_readl(dwc, CFG_LO);
+		channel_writel(dwc, CFG_LO, cfglo | DWC_CFGL_CH_SUSP);
+		while (!(channel_readl(dwc, CFG_LO) & DWC_CFGL_FIFO_EMPTY))
+			cpu_relax();
 
-	channel_clear_bit(dw, CH_EN, dwc->mask);
+		dwc->paused = true;
+		spin_unlock_irqrestore(&dwc->lock, flags);
+	} else if (cmd == DMA_RESUME) {
+		if (!dwc->paused)
+			return 0;
 
-	while (dma_readl(dw, CH_EN) & dwc->mask)
-		cpu_relax();
+		spin_lock_irqsave(&dwc->lock, flags);
 
-	/* active_list entries will end up before queued entries */
-	list_splice_init(&dwc->queue, &list);
-	list_splice_init(&dwc->active_list, &list);
+		cfglo = channel_readl(dwc, CFG_LO);
+		channel_writel(dwc, CFG_LO, cfglo & ~DWC_CFGL_CH_SUSP);
+		dwc->paused = false;
 
-	spin_unlock_irqrestore(&dwc->lock, flags);
+		spin_unlock_irqrestore(&dwc->lock, flags);
+	} else if (cmd == DMA_TERMINATE_ALL) {
+		spin_lock_irqsave(&dwc->lock, flags);
 
-	/* Flush all pending and queued descriptors */
-	list_for_each_entry_safe(desc, _desc, &list, desc_node)
-		dwc_descriptor_complete(dwc, desc, false);
+		channel_clear_bit(dw, CH_EN, dwc->mask);
+		while (dma_readl(dw, CH_EN) & dwc->mask)
+			cpu_relax();
+
+		dwc->paused = false;
+
+		/* active_list entries will end up before queued entries */
+		list_splice_init(&dwc->queue, &list);
+		list_splice_init(&dwc->active_list, &list);
+
+		spin_unlock_irqrestore(&dwc->lock, flags);
+
+		/* Flush all pending and queued descriptors */
+		list_for_each_entry_safe(desc, _desc, &list, desc_node)
+			dwc_descriptor_complete(dwc, desc, false);
+	} else
+		return -ENXIO;
 
 	return 0;
 }
@@ -923,6 +939,9 @@ dwc_tx_status(struct dma_chan *chan,
 	else
 		dma_set_tx_state(txstate, last_complete, last_used, 0);
 
+	if (dwc->paused)
+		return DMA_PAUSED;
+
 	return ret;
 }
 
diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h
index 720f821527f..c968597c32a 100644
--- a/drivers/dma/dw_dmac_regs.h
+++ b/drivers/dma/dw_dmac_regs.h
@@ -138,6 +138,7 @@ struct dw_dma_chan {
 	void __iomem		*ch_regs;
 	u8			mask;
 	u8			priority;
+	bool			paused;
 
 	spinlock_t		lock;
 
-- 
cgit v1.2.3-70-g09d2


From c47d832bc0155153920e507f075647519bad09a2 Mon Sep 17 00:00:00 2001
From: Daniel Mack <zonque@gmail.com>
Date: Mon, 16 May 2011 16:38:14 +0200
Subject: nfsd: make local functions static

This also fixes a number of sparse warnings.

Signed-off-by: Daniel Mack <zonque@gmail.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: J. Bruce Fields <bfields@fieldses.org>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs3xdr.c   | 2 +-
 fs/nfsd/nfs4state.c | 4 ++--
 fs/nfsd/nfs4xdr.c   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 7e84a852cda..291f2d2551b 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -842,7 +842,7 @@ out:
 	return rv;
 }
 
-__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
+static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
 {
 	struct svc_fh	fh;
 	int err;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a2ea14f40b4..a8e4e91587a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2482,7 +2482,7 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
 	return NULL;
 }
 
-int share_access_to_flags(u32 share_access)
+static int share_access_to_flags(u32 share_access)
 {
 	share_access &= ~NFS4_SHARE_WANT_MASK;
 
@@ -2902,7 +2902,7 @@ out:
 	return status;
 }
 
-struct lock_manager nfsd4_manager = {
+static struct lock_manager nfsd4_manager = {
 };
 
 static void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 195a91d9405..99018110321 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3115,7 +3115,7 @@ nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
 	return nfserr;
 }
 
-__be32
+static __be32
 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
 		      struct nfsd4_sequence *seq)
 {
-- 
cgit v1.2.3-70-g09d2


From a0eb221a446f2f6c988430f0b0a13f74b7c2b799 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@stericsson.com>
Date: Wed, 18 May 2011 14:18:57 +0200
Subject: dmaengine: move link order

Move the dmaengine subsystem up early in the drivers Makefile so
DMA is made available early to all drivers, just like e.g.
regulators. Now even regulators can use DMA on the same initlevel.
As a result we can bump the ste_dma40 and coh901318 dmaengine
drivers down one initlevel to subsys_init().

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/Makefile        | 4 +++-
 drivers/dma/coh901318.c | 2 +-
 drivers/dma/ste_dma40.c | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/Makefile b/drivers/Makefile
index 3f135b6fb01..e68423da897 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -17,6 +17,9 @@ obj-$(CONFIG_SFI)		+= sfi/
 # was used and do nothing if so
 obj-$(CONFIG_PNP)		+= pnp/
 obj-$(CONFIG_ARM_AMBA)		+= amba/
+# Many drivers will want to use DMA so this has to be made available
+# really early.
+obj-$(CONFIG_DMA_ENGINE)	+= dma/
 
 obj-$(CONFIG_VIRTIO)		+= virtio/
 obj-$(CONFIG_XEN)		+= xen/
@@ -92,7 +95,6 @@ obj-$(CONFIG_EISA)		+= eisa/
 obj-y				+= lguest/
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_CPU_IDLE)		+= cpuidle/
-obj-$(CONFIG_DMA_ENGINE)	+= dma/
 obj-$(CONFIG_MMC)		+= mmc/
 obj-$(CONFIG_MEMSTICK)		+= memstick/
 obj-$(CONFIG_NEW_LEDS)		+= leds/
diff --git a/drivers/dma/coh901318.c b/drivers/dma/coh901318.c
index 00deabd9a04..b5a318916d0 100644
--- a/drivers/dma/coh901318.c
+++ b/drivers/dma/coh901318.c
@@ -1610,7 +1610,7 @@ int __init coh901318_init(void)
 {
 	return platform_driver_probe(&coh901318_driver, coh901318_probe);
 }
-arch_initcall(coh901318_init);
+subsys_initcall(coh901318_init);
 
 void __exit coh901318_exit(void)
 {
diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c
index 65d2535d12e..5d054bb908e 100644
--- a/drivers/dma/ste_dma40.c
+++ b/drivers/dma/ste_dma40.c
@@ -2962,4 +2962,4 @@ static int __init stedma40_init(void)
 {
 	return platform_driver_probe(&d40_driver, d40_probe);
 }
-arch_initcall(stedma40_init);
+subsys_initcall(stedma40_init);
-- 
cgit v1.2.3-70-g09d2


From 3d2606f42984613d324ad3047cf503bcddc3880a Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Fri, 20 May 2011 09:46:54 +0200
Subject: oprofile, x86: Enable preemption during pci device setup in IBS init

IBS initialization is a mix of per-core register access and per-node
pci device setup. Register access should be pinned to the cpu, but pci
setup must run with preemption enabled.

This patch better separates the code into non-/preemptible sections
and fixes sleeping with preemption disabled. See bug message below.

Fixes also freeing the eilvt entry by introducing put_eilvt().

BUG: sleeping function called from invalid context at mm/slub.c:824
in_atomic(): 1, irqs_disabled(): 0, pid: 32357, name: modprobe
INFO: lockdep is turned off.
Pid: 32357, comm: modprobe Not tainted 2.6.39-rc7+ #14
Call Trace:
 [<ffffffff8104bdc8>] __might_sleep+0x112/0x117
 [<ffffffff81129693>] kmem_cache_alloc_trace+0x4b/0xe7
 [<ffffffff81278f14>] kzalloc.constprop.0+0x29/0x2b
 [<ffffffff81278f4c>] pci_get_subsys+0x36/0x78
 [<ffffffff81022689>] ? setup_APIC_eilvt+0xfb/0x139
 [<ffffffff81278fa4>] pci_get_device+0x16/0x18
 [<ffffffffa06c8b5d>] op_amd_init+0xd3/0x211 [oprofile]
 [<ffffffffa064d000>] ? 0xffffffffa064cfff
 [<ffffffffa064d298>] op_nmi_init+0x21e/0x26a [oprofile]
 [<ffffffffa064d062>] oprofile_arch_init+0xe/0x26 [oprofile]
 [<ffffffffa064d010>] oprofile_init+0x10/0x42 [oprofile]
 [<ffffffff81002099>] do_one_initcall+0x7f/0x13a
 [<ffffffff81096524>] sys_init_module+0x132/0x281
 [<ffffffff814cc682>] system_call_fastpath+0x16/0x1b

Reported-by: Dave Jones <davej@redhat.com>
Cc: <stable@kernel.org>         [2.6.37.x]
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/op_model_amd.c | 95 +++++++++++++++++++++++-----------------
 1 file changed, 54 insertions(+), 41 deletions(-)

diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index c3b8e24f2b1..9fd8a567fe1 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -316,16 +316,23 @@ static void op_amd_stop_ibs(void)
 		wrmsrl(MSR_AMD64_IBSOPCTL, 0);
 }
 
-static inline int eilvt_is_available(int offset)
+static inline int get_eilvt(int offset)
 {
-	/* check if we may assign a vector */
 	return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
 }
 
+static inline int put_eilvt(int offset)
+{
+	return !setup_APIC_eilvt(offset, 0, 0, 1);
+}
+
 static inline int ibs_eilvt_valid(void)
 {
 	int offset;
 	u64 val;
+	int valid = 0;
+
+	preempt_disable();
 
 	rdmsrl(MSR_AMD64_IBSCTL, val);
 	offset = val & IBSCTL_LVT_OFFSET_MASK;
@@ -333,16 +340,20 @@ static inline int ibs_eilvt_valid(void)
 	if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
 		pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
 		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
-		return 0;
+		goto out;
 	}
 
-	if (!eilvt_is_available(offset)) {
+	if (!get_eilvt(offset)) {
 		pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
 		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
-		return 0;
+		goto out;
 	}
 
-	return 1;
+	valid = 1;
+out:
+	preempt_enable();
+
+	return valid;
 }
 
 static inline int get_ibs_offset(void)
@@ -600,67 +611,69 @@ static int setup_ibs_ctl(int ibs_eilvt_off)
 
 static int force_ibs_eilvt_setup(void)
 {
-	int i;
+	int offset;
 	int ret;
 
-	/* find the next free available EILVT entry */
-	for (i = 1; i < 4; i++) {
-		if (!eilvt_is_available(i))
-			continue;
-		ret = setup_ibs_ctl(i);
-		if (ret)
-			return ret;
-		pr_err(FW_BUG "using offset %d for IBS interrupts\n", i);
-		return 0;
+	/*
+	 * find the next free available EILVT entry, skip offset 0,
+	 * pin search to this cpu
+	 */
+	preempt_disable();
+	for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
+		if (get_eilvt(offset))
+			break;
 	}
+	preempt_enable();
 
-	printk(KERN_DEBUG "No EILVT entry available\n");
-
-	return -EBUSY;
-}
-
-static int __init_ibs_nmi(void)
-{
-	int ret;
-
-	if (ibs_eilvt_valid())
-		return 0;
+	if (offset == APIC_EILVT_NR_MAX) {
+		printk(KERN_DEBUG "No EILVT entry available\n");
+		return -EBUSY;
+	}
 
-	ret = force_ibs_eilvt_setup();
+	ret = setup_ibs_ctl(offset);
 	if (ret)
-		return ret;
+		goto out;
 
-	if (!ibs_eilvt_valid())
-		return -EFAULT;
+	if (!ibs_eilvt_valid()) {
+		ret = -EFAULT;
+		goto out;
+	}
 
+	pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset);
 	pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
 
 	return 0;
+out:
+	preempt_disable();
+	put_eilvt(offset);
+	preempt_enable();
+	return ret;
 }
 
 /*
  * check and reserve APIC extended interrupt LVT offset for IBS if
  * available
- *
- * init_ibs() preforms implicitly cpu-local operations, so pin this
- * thread to its current CPU
  */
 
 static void init_ibs(void)
 {
-	preempt_disable();
-
 	ibs_caps = get_ibs_caps();
+
 	if (!ibs_caps)
+		return;
+
+	if (ibs_eilvt_valid())
 		goto out;
 
-	if (__init_ibs_nmi() < 0)
-		ibs_caps = 0;
-	else
-		printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
+	if (!force_ibs_eilvt_setup())
+		goto out;
+
+	/* Failed to setup ibs */
+	ibs_caps = 0;
+	return;
 
 out:
-	preempt_enable();
+	printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
 }
 
 static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
-- 
cgit v1.2.3-70-g09d2


From 74b2107543da4ed9607ec484f63c42362dc9fca6 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 13 Apr 2011 12:02:53 -0400
Subject: Btrfs: make sure to use the delalloc reserve when filling delalloc

In the prealloc filling code and compressed code we don't set trans->block_rsv
to the delalloc block reserve properly, which is going to make us use metadata
from the wrong pool, this patch fixes that.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7cd8ab0ef04..3b9f1643aa5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -619,6 +619,7 @@ retry:
 
 		trans = btrfs_join_transaction(root, 1);
 		BUG_ON(IS_ERR(trans));
+		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 		ret = btrfs_reserve_extent(trans, root,
 					   async_extent->compressed_size,
 					   async_extent->compressed_size,
@@ -1060,6 +1061,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 		trans = btrfs_join_transaction(root, 1);
 	}
 	BUG_ON(IS_ERR(trans));
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	cow_start = (u64)-1;
 	cur_offset = start;
-- 
cgit v1.2.3-70-g09d2


From 7a7eaa40a39bde4eefc91aadeb1ce3dc4e6a1252 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 13 Apr 2011 12:54:33 -0400
Subject: Btrfs: take away the num_items argument from btrfs_join_transaction

I keep forgetting that btrfs_join_transaction() just ignores the num_items
argument, which leads me to sending pointless patches and looking stupid :).  So
just kill the num_items argument from btrfs_join_transaction and
btrfs_start_ioctl_transaction, since neither of them use it.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/disk-io.c     |  6 +++---
 fs/btrfs/extent-tree.c | 12 ++++++------
 fs/btrfs/inode.c       | 34 +++++++++++++++++-----------------
 fs/btrfs/ioctl.c       |  4 ++--
 fs/btrfs/relocation.c  | 12 ++++++------
 fs/btrfs/transaction.c | 13 +++++--------
 fs/btrfs/transaction.h |  9 +++------
 7 files changed, 42 insertions(+), 48 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 228cf36ece8..9d6c9e332ca 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1568,7 +1568,7 @@ static int transaction_kthread(void *arg)
 		transid = cur->transid;
 		spin_unlock(&root->fs_info->new_trans_lock);
 
-		trans = btrfs_join_transaction(root, 1);
+		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));
 		if (transid == trans->transid) {
 			ret = btrfs_commit_transaction(trans, root);
@@ -2495,13 +2495,13 @@ int btrfs_commit_super(struct btrfs_root *root)
 	down_write(&root->fs_info->cleanup_work_sem);
 	up_write(&root->fs_info->cleanup_work_sem);
 
-	trans = btrfs_join_transaction(root, 1);
+	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
 	/* run commit again to drop the original snapshot */
-	trans = btrfs_join_transaction(root, 1);
+	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 	btrfs_commit_transaction(trans, root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9ee6bd55e16..941b28e7893 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3174,7 +3174,7 @@ again:
 			spin_unlock(&data_sinfo->lock);
 alloc:
 			alloc_target = btrfs_get_alloc_profile(root, 1);
-			trans = btrfs_join_transaction(root, 1);
+			trans = btrfs_join_transaction(root);
 			if (IS_ERR(trans))
 				return PTR_ERR(trans);
 
@@ -3202,7 +3202,7 @@ alloc:
 commit_trans:
 		if (!committed && !root->fs_info->open_ioctl_trans) {
 			committed = 1;
-			trans = btrfs_join_transaction(root, 1);
+			trans = btrfs_join_transaction(root);
 			if (IS_ERR(trans))
 				return PTR_ERR(trans);
 			ret = btrfs_commit_transaction(trans, root);
@@ -3589,7 +3589,7 @@ again:
 		goto out;
 
 	ret = -ENOSPC;
-	trans = btrfs_join_transaction(root, 1);
+	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		goto out;
 	ret = btrfs_commit_transaction(trans, root);
@@ -3816,7 +3816,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 		if (trans)
 			return -EAGAIN;
 
-		trans = btrfs_join_transaction(root, 1);
+		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));
 		ret = btrfs_commit_transaction(trans, root);
 		return 0;
@@ -7649,7 +7649,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
 
 		BUG_ON(reloc_root->commit_root != NULL);
 		while (1) {
-			trans = btrfs_join_transaction(root, 1);
+			trans = btrfs_join_transaction(root);
 			BUG_ON(IS_ERR(trans));
 
 			mutex_lock(&root->fs_info->drop_mutex);
@@ -8176,7 +8176,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 
 	BUG_ON(cache->ro);
 
-	trans = btrfs_join_transaction(root, 1);
+	trans = btrfs_join_transaction(root);
 	BUG_ON(IS_ERR(trans));
 
 	alloc_flags = update_block_group_flags(root, cache->flags);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b9f1643aa5..e47bdf0fb75 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -420,7 +420,7 @@ again:
 		}
 	}
 	if (start == 0) {
-		trans = btrfs_join_transaction(root, 1);
+		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));
 		btrfs_set_trans_block_group(trans, inode);
 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -617,7 +617,7 @@ retry:
 			    async_extent->start + async_extent->ram_size - 1,
 			    GFP_NOFS);
 
-		trans = btrfs_join_transaction(root, 1);
+		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));
 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 		ret = btrfs_reserve_extent(trans, root,
@@ -779,7 +779,7 @@ static noinline int cow_file_range(struct inode *inode,
 	int ret = 0;
 
 	BUG_ON(root == root->fs_info->tree_root);
-	trans = btrfs_join_transaction(root, 1);
+	trans = btrfs_join_transaction(root);
 	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -1056,9 +1056,9 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	BUG_ON(!path);
 	if (root == root->fs_info->tree_root) {
 		nolock = true;
-		trans = btrfs_join_transaction_nolock(root, 1);
+		trans = btrfs_join_transaction_nolock(root);
 	} else {
-		trans = btrfs_join_transaction(root, 1);
+		trans = btrfs_join_transaction(root);
 	}
 	BUG_ON(IS_ERR(trans));
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -1718,9 +1718,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 		if (!ret) {
 			if (nolock)
-				trans = btrfs_join_transaction_nolock(root, 1);
+				trans = btrfs_join_transaction_nolock(root);
 			else
-				trans = btrfs_join_transaction(root, 1);
+				trans = btrfs_join_transaction(root);
 			BUG_ON(IS_ERR(trans));
 			btrfs_set_trans_block_group(trans, inode);
 			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -1735,9 +1735,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 			 0, &cached_state, GFP_NOFS);
 
 	if (nolock)
-		trans = btrfs_join_transaction_nolock(root, 1);
+		trans = btrfs_join_transaction_nolock(root);
 	else
-		trans = btrfs_join_transaction(root, 1);
+		trans = btrfs_join_transaction(root);
 	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -2415,7 +2415,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 					(u64)-1);
 
 	if (root->orphan_block_rsv || root->orphan_item_inserted) {
-		trans = btrfs_join_transaction(root, 1);
+		trans = btrfs_join_transaction(root);
 		if (!IS_ERR(trans))
 			btrfs_end_transaction(trans, root);
 	}
@@ -4378,9 +4378,9 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 
 	if (wbc->sync_mode == WB_SYNC_ALL) {
 		if (nolock)
-			trans = btrfs_join_transaction_nolock(root, 1);
+			trans = btrfs_join_transaction_nolock(root);
 		else
-			trans = btrfs_join_transaction(root, 1);
+			trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
 		btrfs_set_trans_block_group(trans, inode);
@@ -4407,7 +4407,7 @@ void btrfs_dirty_inode(struct inode *inode)
 	if (BTRFS_I(inode)->dummy_inode)
 		return;
 
-	trans = btrfs_join_transaction(root, 1);
+	trans = btrfs_join_transaction(root);
 	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
 
@@ -5226,7 +5226,7 @@ again:
 				free_extent_map(em);
 				em = NULL;
 				btrfs_release_path(root, path);
-				trans = btrfs_join_transaction(root, 1);
+				trans = btrfs_join_transaction(root);
 				if (IS_ERR(trans))
 					return ERR_CAST(trans);
 				goto again;
@@ -5470,7 +5470,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 		btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
 	}
 
-	trans = btrfs_join_transaction(root, 0);
+	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		return ERR_CAST(trans);
 
@@ -5703,7 +5703,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 		 * to make sure the current transaction stays open
 		 * while we look for nocow cross refs
 		 */
-		trans = btrfs_join_transaction(root, 0);
+		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans))
 			goto must_cow;
 
@@ -5841,7 +5841,7 @@ again:
 
 	BUG_ON(!ordered);
 
-	trans = btrfs_join_transaction(root, 1);
+	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans)) {
 		err = -ENOMEM;
 		goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2616f7ed479..908c3d4b48c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -242,7 +242,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 		ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
 	}
 
-	trans = btrfs_join_transaction(root, 1);
+	trans = btrfs_join_transaction(root);
 	BUG_ON(IS_ERR(trans));
 
 	ret = btrfs_update_inode(trans, root, inode);
@@ -2182,7 +2182,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
 	mutex_unlock(&root->fs_info->trans_mutex);
 
 	ret = -ENOMEM;
-	trans = btrfs_start_ioctl_transaction(root, 0);
+	trans = btrfs_start_ioctl_transaction(root);
 	if (IS_ERR(trans))
 		goto out_drop;
 
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 199a8013431..8bb256667f2 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2149,7 +2149,7 @@ again:
 			err = ret;
 	}
 
-	trans = btrfs_join_transaction(rc->extent_root, 1);
+	trans = btrfs_join_transaction(rc->extent_root);
 	if (IS_ERR(trans)) {
 		if (!err)
 			btrfs_block_rsv_release(rc->extent_root,
@@ -3233,7 +3233,7 @@ truncate:
 		goto out;
 	}
 
-	trans = btrfs_join_transaction(root, 0);
+	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans)) {
 		btrfs_free_path(path);
 		ret = PTR_ERR(trans);
@@ -3642,7 +3642,7 @@ int prepare_to_relocate(struct reloc_control *rc)
 	rc->create_reloc_tree = 1;
 	set_reloc_control(rc);
 
-	trans = btrfs_join_transaction(rc->extent_root, 1);
+	trans = btrfs_join_transaction(rc->extent_root);
 	BUG_ON(IS_ERR(trans));
 	btrfs_commit_transaction(trans, rc->extent_root);
 	return 0;
@@ -3831,7 +3831,7 @@ restart:
 	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
 
 	/* get rid of pinned extents */
-	trans = btrfs_join_transaction(rc->extent_root, 1);
+	trans = btrfs_join_transaction(rc->extent_root);
 	if (IS_ERR(trans))
 		err = PTR_ERR(trans);
 	else
@@ -4156,7 +4156,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 
 	set_reloc_control(rc);
 
-	trans = btrfs_join_transaction(rc->extent_root, 1);
+	trans = btrfs_join_transaction(rc->extent_root);
 	if (IS_ERR(trans)) {
 		unset_reloc_control(rc);
 		err = PTR_ERR(trans);
@@ -4190,7 +4190,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 
 	unset_reloc_control(rc);
 
-	trans = btrfs_join_transaction(rc->extent_root, 1);
+	trans = btrfs_join_transaction(rc->extent_root);
 	if (IS_ERR(trans))
 		err = PTR_ERR(trans);
 	else
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c571734d5e5..70bfb26df96 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -257,22 +257,19 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 {
 	return start_transaction(root, num_items, TRANS_START);
 }
-struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
-						   int num_blocks)
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
 {
 	return start_transaction(root, 0, TRANS_JOIN);
 }
 
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
-							  int num_blocks)
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
 {
 	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
 }
 
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
-							 int num_blocks)
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
 {
-	return start_transaction(r, 0, TRANS_USERSPACE);
+	return start_transaction(root, 0, TRANS_USERSPACE);
 }
 
 /* wait for a transaction commit to be fully complete */
@@ -1171,7 +1168,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 
 	INIT_DELAYED_WORK(&ac->work, do_async_commit);
 	ac->root = root;
-	ac->newtrans = btrfs_join_transaction(root, 0);
+	ac->newtrans = btrfs_join_transaction(root);
 	if (IS_ERR(ac->newtrans)) {
 		int err = PTR_ERR(ac->newtrans);
 		kfree(ac);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e441acc6c58..1f573f09dba 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -92,12 +92,9 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_items);
-struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
-						  int num_blocks);
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
-							  int num_blocks);
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
-							 int num_blocks);
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root);
-- 
cgit v1.2.3-70-g09d2


From 2a1eb4614d984d5cd4c928784e9afcf5c07f93be Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 13 Apr 2011 15:15:59 -0400
Subject: Btrfs: if we've already started a trans handle, use that one

We currently track trans handles in current->journal_info, but we don't actually
use it.  This patch fixes it.  This will cover the case where we have multiple
people starting transactions down the call chain.  This keeps us from having to
allocate a new handle and all of that, we just increase the use count of the
current handle, save the old block_rsv, and return.  I tested this with xfstests
and it worked out fine.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/transaction.c | 17 +++++++++++++++++
 fs/btrfs/transaction.h |  2 ++
 2 files changed, 19 insertions(+)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 70bfb26df96..46f40564c16 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -184,6 +184,15 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 
 	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
 		return ERR_PTR(-EROFS);
+
+	if (current->journal_info) {
+		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
+		h = current->journal_info;
+		h->use_count++;
+		h->orig_rsv = h->block_rsv;
+		h->block_rsv = NULL;
+		goto got_it;
+	}
 again:
 	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	if (!h)
@@ -213,7 +222,9 @@ again:
 	h->block_group = 0;
 	h->bytes_reserved = 0;
 	h->delayed_ref_updates = 0;
+	h->use_count = 1;
 	h->block_rsv = NULL;
+	h->orig_rsv = NULL;
 
 	smp_mb();
 	if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -241,6 +252,7 @@ again:
 		}
 	}
 
+got_it:
 	if (type != TRANS_JOIN_NOLOCK)
 		mutex_lock(&root->fs_info->trans_mutex);
 	record_root_in_trans(h, root);
@@ -428,6 +440,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *info = root->fs_info;
 	int count = 0;
 
+	if (--trans->use_count) {
+		trans->block_rsv = trans->orig_rsv;
+		return 0;
+	}
+
 	while (count < 4) {
 		unsigned long cur = trans->delayed_ref_updates;
 		trans->delayed_ref_updates = 0;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 1f573f09dba..154314f80f8 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,11 +47,13 @@ struct btrfs_trans_handle {
 	u64 transid;
 	u64 block_group;
 	u64 bytes_reserved;
+	unsigned long use_count;
 	unsigned long blocks_reserved;
 	unsigned long blocks_used;
 	unsigned long delayed_ref_updates;
 	struct btrfs_transaction *transaction;
 	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_block_rsv *orig_rsv;
 };
 
 struct btrfs_pending_snapshot {
-- 
cgit v1.2.3-70-g09d2


From a4abeea41adfa3c143c289045f4625dfaeba2212 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 11 Apr 2011 17:25:13 -0400
Subject: Btrfs: kill trans_mutex

We use trans_mutex for lots of things, here's a basic list

1) To serialize trans_handles joining the currently running transaction
2) To make sure that no new trans handles are started while we are committing
3) To protect the dead_roots list and the transaction lists

Really the serializing trans_handles joining is not too hard, and can really get
bogged down in acquiring a reference to the transaction.  So replace the
trans_mutex with a trans_lock spinlock and use it to do the following

1) Protect fs_info->running_transaction.  All trans handles have to do is check
this, and then take a reference of the transaction and keep on going.
2) Protect the fs_info->trans_list.  This doesn't get used too much, basically
it just holds the current transactions, which will usually just be the currently
committing transaction and the currently running transaction at most.
3) Protect the dead roots list.  This is only ever processed by splicing the
list so this is relatively simple.
4) Protect the fs_info->reloc_ctl stuff.  This is very lightweight and was using
the trans_mutex before, so this is a pretty straightforward change.
5) Protect fs_info->no_trans_join.  Because we don't hold the trans_lock over
the entirety of the commit we need to have a way to block new people from
creating a new transaction while we're doing our work.  So we set no_trans_join
and in join_transaction we test to see if that is set, and if it is we do a
wait_on_commit.
6) Make the transaction use count atomic so we don't need to take locks to
modify it when we're dropping references.
7) Add a commit_lock to the transaction to make sure multiple people trying to
commit the same transaction don't race and commit at the same time.
8) Make open_ioctl_trans an atomic so we don't have to take any locks for ioctl
trans.

I have tested this with xfstests, but obviously it is a pretty hairy change so
lots of testing is greatly appreciated.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |   6 +-
 fs/btrfs/disk-io.c     |  30 +++---
 fs/btrfs/extent-tree.c |   3 +-
 fs/btrfs/file.c        |   4 +-
 fs/btrfs/ioctl.c       |  12 +--
 fs/btrfs/relocation.c  |  16 +--
 fs/btrfs/transaction.c | 271 ++++++++++++++++++++++++++-----------------------
 fs/btrfs/transaction.h |   4 +-
 8 files changed, 177 insertions(+), 169 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8f4b81de3ae..522a39b0033 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -919,7 +919,6 @@ struct btrfs_fs_info {
 	 * is required instead of the faster short fsync log commits
 	 */
 	u64 last_trans_log_full_commit;
-	u64 open_ioctl_trans;
 	unsigned long mount_opt:20;
 	unsigned long compress_type:4;
 	u64 max_inline;
@@ -936,7 +935,6 @@ struct btrfs_fs_info {
 	struct super_block *sb;
 	struct inode *btree_inode;
 	struct backing_dev_info bdi;
-	struct mutex trans_mutex;
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
@@ -957,6 +955,7 @@ struct btrfs_fs_info {
 	struct rw_semaphore subvol_sem;
 	struct srcu_struct subvol_srcu;
 
+	spinlock_t trans_lock;
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
@@ -969,6 +968,7 @@ struct btrfs_fs_info {
 	atomic_t async_submit_draining;
 	atomic_t nr_async_bios;
 	atomic_t async_delalloc_pages;
+	atomic_t open_ioctl_trans;
 
 	/*
 	 * this is used by the balancing code to wait for all the pending
@@ -1032,6 +1032,7 @@ struct btrfs_fs_info {
 	int closing;
 	int log_root_recovering;
 	int enospc_unlink;
+	int trans_no_join;
 
 	u64 total_pinned;
 
@@ -1053,7 +1054,6 @@ struct btrfs_fs_info {
 	struct reloc_control *reloc_ctl;
 
 	spinlock_t delalloc_lock;
-	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
 
 	/* data_alloc_cluster is only used in ssd mode */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9d6c9e332ca..93ef254ec43 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1551,22 +1551,22 @@ static int transaction_kthread(void *arg)
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-		spin_lock(&root->fs_info->new_trans_lock);
+		spin_lock(&root->fs_info->trans_lock);
 		cur = root->fs_info->running_transaction;
 		if (!cur) {
-			spin_unlock(&root->fs_info->new_trans_lock);
+			spin_unlock(&root->fs_info->trans_lock);
 			goto sleep;
 		}
 
 		now = get_seconds();
 		if (!cur->blocked &&
 		    (now < cur->start_time || now - cur->start_time < 30)) {
-			spin_unlock(&root->fs_info->new_trans_lock);
+			spin_unlock(&root->fs_info->trans_lock);
 			delay = HZ * 5;
 			goto sleep;
 		}
 		transid = cur->transid;
-		spin_unlock(&root->fs_info->new_trans_lock);
+		spin_unlock(&root->fs_info->trans_lock);
 
 		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));
@@ -1658,7 +1658,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->ordered_operations);
 	INIT_LIST_HEAD(&fs_info->caching_block_groups);
 	spin_lock_init(&fs_info->delalloc_lock);
-	spin_lock_init(&fs_info->new_trans_lock);
+	spin_lock_init(&fs_info->trans_lock);
 	spin_lock_init(&fs_info->ref_cache_lock);
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
@@ -1687,6 +1687,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->sb = sb;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->metadata_ratio = 0;
+	fs_info->trans_no_join = 0;
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
@@ -1735,7 +1736,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->do_barriers = 1;
 
 
-	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->ordered_operations_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->chunk_mutex);
@@ -3006,10 +3006,13 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
 
 	WARN_ON(1);
 
-	mutex_lock(&root->fs_info->trans_mutex);
 	mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
+	spin_lock(&root->fs_info->trans_lock);
 	list_splice_init(&root->fs_info->trans_list, &list);
+	root->fs_info->trans_no_join = 1;
+	spin_unlock(&root->fs_info->trans_lock);
+
 	while (!list_empty(&list)) {
 		t = list_entry(list.next, struct btrfs_transaction, list);
 		if (!t)
@@ -3034,23 +3037,18 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
 		t->blocked = 0;
 		if (waitqueue_active(&root->fs_info->transaction_wait))
 			wake_up(&root->fs_info->transaction_wait);
-		mutex_unlock(&root->fs_info->trans_mutex);
 
-		mutex_lock(&root->fs_info->trans_mutex);
 		t->commit_done = 1;
 		if (waitqueue_active(&t->commit_wait))
 			wake_up(&t->commit_wait);
-		mutex_unlock(&root->fs_info->trans_mutex);
-
-		mutex_lock(&root->fs_info->trans_mutex);
 
 		btrfs_destroy_pending_snapshots(t);
 
 		btrfs_destroy_delalloc_inodes(root);
 
-		spin_lock(&root->fs_info->new_trans_lock);
+		spin_lock(&root->fs_info->trans_lock);
 		root->fs_info->running_transaction = NULL;
-		spin_unlock(&root->fs_info->new_trans_lock);
+		spin_unlock(&root->fs_info->trans_lock);
 
 		btrfs_destroy_marked_extents(root, &t->dirty_pages,
 					     EXTENT_DIRTY);
@@ -3064,8 +3062,10 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
 		kmem_cache_free(btrfs_transaction_cachep, t);
 	}
 
+	spin_lock(&root->fs_info->trans_lock);
+	root->fs_info->trans_no_join = 0;
+	spin_unlock(&root->fs_info->trans_lock);
 	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
-	mutex_unlock(&root->fs_info->trans_mutex);
 
 	return 0;
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 941b28e7893..ca599654ce1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3200,7 +3200,8 @@ alloc:
 
 		/* commit the current transaction and try again */
 commit_trans:
-		if (!committed && !root->fs_info->open_ioctl_trans) {
+		if (!committed &&
+		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
 			committed = 1;
 			trans = btrfs_join_transaction(root);
 			if (IS_ERR(trans))
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 75899a01dde..cd5e82e500c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1222,14 +1222,12 @@ int btrfs_sync_file(struct file *file, int datasync)
 	 * the current transaction, we can bail out now without any
 	 * syncing
 	 */
-	mutex_lock(&root->fs_info->trans_mutex);
+	smp_mb();
 	if (BTRFS_I(inode)->last_trans <=
 	    root->fs_info->last_trans_committed) {
 		BTRFS_I(inode)->last_trans = 0;
-		mutex_unlock(&root->fs_info->trans_mutex);
 		goto out;
 	}
-	mutex_unlock(&root->fs_info->trans_mutex);
 
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 908c3d4b48c..a578620e06a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2177,9 +2177,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
 	if (ret)
 		goto out;
 
-	mutex_lock(&root->fs_info->trans_mutex);
-	root->fs_info->open_ioctl_trans++;
-	mutex_unlock(&root->fs_info->trans_mutex);
+	atomic_inc(&root->fs_info->open_ioctl_trans);
 
 	ret = -ENOMEM;
 	trans = btrfs_start_ioctl_transaction(root);
@@ -2190,9 +2188,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
 	return 0;
 
 out_drop:
-	mutex_lock(&root->fs_info->trans_mutex);
-	root->fs_info->open_ioctl_trans--;
-	mutex_unlock(&root->fs_info->trans_mutex);
+	atomic_dec(&root->fs_info->open_ioctl_trans);
 	mnt_drop_write(file->f_path.mnt);
 out:
 	return ret;
@@ -2426,9 +2422,7 @@ long btrfs_ioctl_trans_end(struct file *file)
 
 	btrfs_end_transaction(trans, root);
 
-	mutex_lock(&root->fs_info->trans_mutex);
-	root->fs_info->open_ioctl_trans--;
-	mutex_unlock(&root->fs_info->trans_mutex);
+	atomic_dec(&root->fs_info->open_ioctl_trans);
 
 	mnt_drop_write(file->f_path.mnt);
 	return 0;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 8bb256667f2..09c30d37d43 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2136,10 +2136,10 @@ int prepare_to_merge(struct reloc_control *rc, int err)
 	u64 num_bytes = 0;
 	int ret;
 
-	mutex_lock(&root->fs_info->trans_mutex);
+	spin_lock(&root->fs_info->trans_lock);
 	rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
 	rc->merging_rsv_size += rc->nodes_relocated * 2;
-	mutex_unlock(&root->fs_info->trans_mutex);
+	spin_unlock(&root->fs_info->trans_lock);
 again:
 	if (!err) {
 		num_bytes = rc->merging_rsv_size;
@@ -2208,9 +2208,9 @@ int merge_reloc_roots(struct reloc_control *rc)
 	int ret;
 again:
 	root = rc->extent_root;
-	mutex_lock(&root->fs_info->trans_mutex);
+	spin_lock(&root->fs_info->trans_lock);
 	list_splice_init(&rc->reloc_roots, &reloc_roots);
-	mutex_unlock(&root->fs_info->trans_mutex);
+	spin_unlock(&root->fs_info->trans_lock);
 
 	while (!list_empty(&reloc_roots)) {
 		found = 1;
@@ -3583,17 +3583,17 @@ next:
 static void set_reloc_control(struct reloc_control *rc)
 {
 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
-	mutex_lock(&fs_info->trans_mutex);
+	spin_lock(&fs_info->trans_lock);
 	fs_info->reloc_ctl = rc;
-	mutex_unlock(&fs_info->trans_mutex);
+	spin_unlock(&fs_info->trans_lock);
 }
 
 static void unset_reloc_control(struct reloc_control *rc)
 {
 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
-	mutex_lock(&fs_info->trans_mutex);
+	spin_lock(&fs_info->trans_lock);
 	fs_info->reloc_ctl = NULL;
-	mutex_unlock(&fs_info->trans_mutex);
+	spin_unlock(&fs_info->trans_lock);
 }
 
 static int check_extent_flags(u64 flags)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 46f40564c16..43816f8b23e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -34,6 +34,7 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
 {
 	WARN_ON(atomic_read(&transaction->use_count) == 0);
 	if (atomic_dec_and_test(&transaction->use_count)) {
+		BUG_ON(!list_empty(&transaction->list));
 		memset(transaction, 0, sizeof(*transaction));
 		kmem_cache_free(btrfs_transaction_cachep, transaction);
 	}
@@ -48,47 +49,73 @@ static noinline void switch_commit_root(struct btrfs_root *root)
 /*
  * either allocate a new transaction or hop into the existing one
  */
-static noinline int join_transaction(struct btrfs_root *root)
+static noinline int join_transaction(struct btrfs_root *root, int nofail)
 {
 	struct btrfs_transaction *cur_trans;
+
+	spin_lock(&root->fs_info->trans_lock);
+	if (root->fs_info->trans_no_join) {
+		if (!nofail) {
+			spin_unlock(&root->fs_info->trans_lock);
+			return -EBUSY;
+		}
+	}
+
 	cur_trans = root->fs_info->running_transaction;
-	if (!cur_trans) {
-		cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
-					     GFP_NOFS);
-		if (!cur_trans)
-			return -ENOMEM;
-		root->fs_info->generation++;
-		atomic_set(&cur_trans->num_writers, 1);
-		cur_trans->num_joined = 0;
-		cur_trans->transid = root->fs_info->generation;
-		init_waitqueue_head(&cur_trans->writer_wait);
-		init_waitqueue_head(&cur_trans->commit_wait);
-		cur_trans->in_commit = 0;
-		cur_trans->blocked = 0;
-		atomic_set(&cur_trans->use_count, 1);
-		cur_trans->commit_done = 0;
-		cur_trans->start_time = get_seconds();
-
-		cur_trans->delayed_refs.root = RB_ROOT;
-		cur_trans->delayed_refs.num_entries = 0;
-		cur_trans->delayed_refs.num_heads_ready = 0;
-		cur_trans->delayed_refs.num_heads = 0;
-		cur_trans->delayed_refs.flushing = 0;
-		cur_trans->delayed_refs.run_delayed_start = 0;
-		spin_lock_init(&cur_trans->delayed_refs.lock);
-
-		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
-		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
-		extent_io_tree_init(&cur_trans->dirty_pages,
-				     root->fs_info->btree_inode->i_mapping,
-				     GFP_NOFS);
-		spin_lock(&root->fs_info->new_trans_lock);
-		root->fs_info->running_transaction = cur_trans;
-		spin_unlock(&root->fs_info->new_trans_lock);
-	} else {
+	if (cur_trans) {
+		atomic_inc(&cur_trans->use_count);
+		atomic_inc(&cur_trans->num_writers);
+		cur_trans->num_joined++;
+		spin_unlock(&root->fs_info->trans_lock);
+		return 0;
+	}
+	spin_unlock(&root->fs_info->trans_lock);
+
+	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
+	if (!cur_trans)
+		return -ENOMEM;
+	spin_lock(&root->fs_info->trans_lock);
+	if (root->fs_info->running_transaction) {
+		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+		cur_trans = root->fs_info->running_transaction;
+		atomic_inc(&cur_trans->use_count);
 		atomic_inc(&cur_trans->num_writers);
 		cur_trans->num_joined++;
+		spin_unlock(&root->fs_info->trans_lock);
+		return 0;
 	}
+	atomic_set(&cur_trans->num_writers, 1);
+	cur_trans->num_joined = 0;
+	init_waitqueue_head(&cur_trans->writer_wait);
+	init_waitqueue_head(&cur_trans->commit_wait);
+	cur_trans->in_commit = 0;
+	cur_trans->blocked = 0;
+	/*
+	 * One for this trans handle, one so it will live on until we
+	 * commit the transaction.
+	 */
+	atomic_set(&cur_trans->use_count, 2);
+	cur_trans->commit_done = 0;
+	cur_trans->start_time = get_seconds();
+
+	cur_trans->delayed_refs.root = RB_ROOT;
+	cur_trans->delayed_refs.num_entries = 0;
+	cur_trans->delayed_refs.num_heads_ready = 0;
+	cur_trans->delayed_refs.num_heads = 0;
+	cur_trans->delayed_refs.flushing = 0;
+	cur_trans->delayed_refs.run_delayed_start = 0;
+	spin_lock_init(&cur_trans->commit_lock);
+	spin_lock_init(&cur_trans->delayed_refs.lock);
+
+	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+	list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+	extent_io_tree_init(&cur_trans->dirty_pages,
+			     root->fs_info->btree_inode->i_mapping,
+			     GFP_NOFS);
+	root->fs_info->generation++;
+	cur_trans->transid = root->fs_info->generation;
+	root->fs_info->running_transaction = cur_trans;
+	spin_unlock(&root->fs_info->trans_lock);
 
 	return 0;
 }
@@ -99,39 +126,28 @@ static noinline int join_transaction(struct btrfs_root *root)
  * to make sure the old root from before we joined the transaction is deleted
  * when the transaction commits
  */
-static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
-					 struct btrfs_root *root)
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
 {
 	if (root->ref_cows && root->last_trans < trans->transid) {
 		WARN_ON(root == root->fs_info->extent_root);
 		WARN_ON(root->commit_root != root->node);
 
+		spin_lock(&root->fs_info->fs_roots_radix_lock);
+		if (root->last_trans == trans->transid) {
+			spin_unlock(&root->fs_info->fs_roots_radix_lock);
+			return 0;
+		}
+		root->last_trans = trans->transid;
 		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
 			   (unsigned long)root->root_key.objectid,
 			   BTRFS_ROOT_TRANS_TAG);
-		root->last_trans = trans->transid;
+		spin_unlock(&root->fs_info->fs_roots_radix_lock);
 		btrfs_init_reloc_root(trans, root);
 	}
 	return 0;
 }
 
-int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root)
-{
-	if (!root->ref_cows)
-		return 0;
-
-	mutex_lock(&root->fs_info->trans_mutex);
-	if (root->last_trans == trans->transid) {
-		mutex_unlock(&root->fs_info->trans_mutex);
-		return 0;
-	}
-
-	record_root_in_trans(trans, root);
-	mutex_unlock(&root->fs_info->trans_mutex);
-	return 0;
-}
-
 /* wait for commit against the current transaction to become unblocked
  * when this is done, it is safe to start a new transaction, but the current
  * transaction might not be fully on disk.
@@ -140,21 +156,23 @@ static void wait_current_trans(struct btrfs_root *root)
 {
 	struct btrfs_transaction *cur_trans;
 
+	spin_lock(&root->fs_info->trans_lock);
 	cur_trans = root->fs_info->running_transaction;
 	if (cur_trans && cur_trans->blocked) {
 		DEFINE_WAIT(wait);
 		atomic_inc(&cur_trans->use_count);
+		spin_unlock(&root->fs_info->trans_lock);
 		while (1) {
 			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
 					TASK_UNINTERRUPTIBLE);
 			if (!cur_trans->blocked)
 				break;
-			mutex_unlock(&root->fs_info->trans_mutex);
 			schedule();
-			mutex_lock(&root->fs_info->trans_mutex);
 		}
 		finish_wait(&root->fs_info->transaction_wait, &wait);
 		put_transaction(cur_trans);
+	} else {
+		spin_unlock(&root->fs_info->trans_lock);
 	}
 }
 
@@ -167,10 +185,16 @@ enum btrfs_trans_type {
 
 static int may_wait_transaction(struct btrfs_root *root, int type)
 {
-	if (!root->fs_info->log_root_recovering &&
-	    ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
-	     type == TRANS_USERSPACE))
+	if (root->fs_info->log_root_recovering)
+		return 0;
+
+	if (type == TRANS_USERSPACE)
+		return 1;
+
+	if (type == TRANS_START &&
+	    !atomic_read(&root->fs_info->open_ioctl_trans))
 		return 1;
+
 	return 0;
 }
 
@@ -198,23 +222,21 @@ again:
 	if (!h)
 		return ERR_PTR(-ENOMEM);
 
-	if (type != TRANS_JOIN_NOLOCK)
-		mutex_lock(&root->fs_info->trans_mutex);
 	if (may_wait_transaction(root, type))
 		wait_current_trans(root);
 
-	ret = join_transaction(root);
+	do {
+		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
+		if (ret == -EBUSY)
+			wait_current_trans(root);
+	} while (ret == -EBUSY);
+
 	if (ret < 0) {
 		kmem_cache_free(btrfs_trans_handle_cachep, h);
-		if (type != TRANS_JOIN_NOLOCK)
-			mutex_unlock(&root->fs_info->trans_mutex);
 		return ERR_PTR(ret);
 	}
 
 	cur_trans = root->fs_info->running_transaction;
-	atomic_inc(&cur_trans->use_count);
-	if (type != TRANS_JOIN_NOLOCK)
-		mutex_unlock(&root->fs_info->trans_mutex);
 
 	h->transid = cur_trans->transid;
 	h->transaction = cur_trans;
@@ -253,11 +275,7 @@ again:
 	}
 
 got_it:
-	if (type != TRANS_JOIN_NOLOCK)
-		mutex_lock(&root->fs_info->trans_mutex);
-	record_root_in_trans(h, root);
-	if (type != TRANS_JOIN_NOLOCK)
-		mutex_unlock(&root->fs_info->trans_mutex);
+	btrfs_record_root_in_trans(h, root);
 
 	if (!current->journal_info && type != TRANS_USERSPACE)
 		current->journal_info = h;
@@ -289,17 +307,13 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 				    struct btrfs_transaction *commit)
 {
 	DEFINE_WAIT(wait);
-	mutex_lock(&root->fs_info->trans_mutex);
 	while (!commit->commit_done) {
 		prepare_to_wait(&commit->commit_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		if (commit->commit_done)
 			break;
-		mutex_unlock(&root->fs_info->trans_mutex);
 		schedule();
-		mutex_lock(&root->fs_info->trans_mutex);
 	}
-	mutex_unlock(&root->fs_info->trans_mutex);
 	finish_wait(&commit->commit_wait, &wait);
 	return 0;
 }
@@ -309,50 +323,49 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
 	struct btrfs_transaction *cur_trans = NULL, *t;
 	int ret;
 
-	mutex_lock(&root->fs_info->trans_mutex);
-
 	ret = 0;
 	if (transid) {
 		if (transid <= root->fs_info->last_trans_committed)
-			goto out_unlock;
+			goto out;
 
 		/* find specified transaction */
+		spin_lock(&root->fs_info->trans_lock);
 		list_for_each_entry(t, &root->fs_info->trans_list, list) {
 			if (t->transid == transid) {
 				cur_trans = t;
+				atomic_inc(&cur_trans->use_count);
 				break;
 			}
 			if (t->transid > transid)
 				break;
 		}
+		spin_unlock(&root->fs_info->trans_lock);
 		ret = -EINVAL;
 		if (!cur_trans)
-			goto out_unlock;  /* bad transid */
+			goto out;  /* bad transid */
 	} else {
 		/* find newest transaction that is committing | committed */
+		spin_lock(&root->fs_info->trans_lock);
 		list_for_each_entry_reverse(t, &root->fs_info->trans_list,
 					    list) {
 			if (t->in_commit) {
 				if (t->commit_done)
-					goto out_unlock;
+					goto out;
 				cur_trans = t;
+				atomic_inc(&cur_trans->use_count);
 				break;
 			}
 		}
+		spin_unlock(&root->fs_info->trans_lock);
 		if (!cur_trans)
-			goto out_unlock;  /* nothing committing|committed */
+			goto out;  /* nothing committing|committed */
 	}
 
-	atomic_inc(&cur_trans->use_count);
-	mutex_unlock(&root->fs_info->trans_mutex);
-
 	wait_for_commit(root, cur_trans);
 
-	mutex_lock(&root->fs_info->trans_mutex);
 	put_transaction(cur_trans);
 	ret = 0;
-out_unlock:
-	mutex_unlock(&root->fs_info->trans_mutex);
+out:
 	return ret;
 }
 
@@ -401,10 +414,8 @@ harder:
 
 void btrfs_throttle(struct btrfs_root *root)
 {
-	mutex_lock(&root->fs_info->trans_mutex);
-	if (!root->fs_info->open_ioctl_trans)
+	if (!atomic_read(&root->fs_info->open_ioctl_trans))
 		wait_current_trans(root);
-	mutex_unlock(&root->fs_info->trans_mutex);
 }
 
 static int should_end_transaction(struct btrfs_trans_handle *trans,
@@ -422,6 +433,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	int updates;
 
+	smp_mb();
 	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
 		return 1;
 
@@ -467,9 +479,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_trans_release_metadata(trans, root);
 
-	if (lock && !root->fs_info->open_ioctl_trans &&
-	    should_end_transaction(trans, root))
+	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
+	    should_end_transaction(trans, root)) {
 		trans->transaction->blocked = 1;
+		smp_wmb();
+	}
 
 	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
 		if (throttle)
@@ -739,9 +753,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
  */
 int btrfs_add_dead_root(struct btrfs_root *root)
 {
-	mutex_lock(&root->fs_info->trans_mutex);
+	spin_lock(&root->fs_info->trans_lock);
 	list_add(&root->root_list, &root->fs_info->dead_roots);
-	mutex_unlock(&root->fs_info->trans_mutex);
+	spin_unlock(&root->fs_info->trans_lock);
 	return 0;
 }
 
@@ -757,6 +771,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
 	int ret;
 	int err = 0;
 
+	spin_lock(&fs_info->fs_roots_radix_lock);
 	while (1) {
 		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
 						 (void **)gang, 0,
@@ -769,6 +784,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
 			radix_tree_tag_clear(&fs_info->fs_roots_radix,
 					(unsigned long)root->root_key.objectid,
 					BTRFS_ROOT_TRANS_TAG);
+			spin_unlock(&fs_info->fs_roots_radix_lock);
 
 			btrfs_free_log(trans, root);
 			btrfs_update_reloc_root(trans, root);
@@ -783,10 +799,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
 			err = btrfs_update_root(trans, fs_info->tree_root,
 						&root->root_key,
 						&root->root_item);
+			spin_lock(&fs_info->fs_roots_radix_lock);
 			if (err)
 				break;
 		}
 	}
+	spin_unlock(&fs_info->fs_roots_radix_lock);
 	return err;
 }
 
@@ -972,7 +990,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	parent = dget_parent(dentry);
 	parent_inode = parent->d_inode;
 	parent_root = BTRFS_I(parent_inode)->root;
-	record_root_in_trans(trans, parent_root);
+	btrfs_record_root_in_trans(trans, parent_root);
 
 	/*
 	 * insert the directory item
@@ -990,7 +1008,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	ret = btrfs_update_inode(trans, parent_root, parent_inode);
 	BUG_ON(ret);
 
-	record_root_in_trans(trans, root);
+	btrfs_record_root_in_trans(trans, root);
 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
 	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 	btrfs_check_and_init_root_item(new_root_item);
@@ -1080,20 +1098,20 @@ static void update_super_roots(struct btrfs_root *root)
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
 {
 	int ret = 0;
-	spin_lock(&info->new_trans_lock);
+	spin_lock(&info->trans_lock);
 	if (info->running_transaction)
 		ret = info->running_transaction->in_commit;
-	spin_unlock(&info->new_trans_lock);
+	spin_unlock(&info->trans_lock);
 	return ret;
 }
 
 int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 {
 	int ret = 0;
-	spin_lock(&info->new_trans_lock);
+	spin_lock(&info->trans_lock);
 	if (info->running_transaction)
 		ret = info->running_transaction->blocked;
-	spin_unlock(&info->new_trans_lock);
+	spin_unlock(&info->trans_lock);
 	return ret;
 }
 
@@ -1117,9 +1135,7 @@ static void wait_current_trans_commit_start(struct btrfs_root *root,
 				    &wait);
 			break;
 		}
-		mutex_unlock(&root->fs_info->trans_mutex);
 		schedule();
-		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
 	}
 }
@@ -1145,9 +1161,7 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
 				    &wait);
 			break;
 		}
-		mutex_unlock(&root->fs_info->trans_mutex);
 		schedule();
-		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&root->fs_info->transaction_wait,
 			    &wait);
 	}
@@ -1193,22 +1207,18 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 	}
 
 	/* take transaction reference */
-	mutex_lock(&root->fs_info->trans_mutex);
 	cur_trans = trans->transaction;
 	atomic_inc(&cur_trans->use_count);
-	mutex_unlock(&root->fs_info->trans_mutex);
 
 	btrfs_end_transaction(trans, root);
 	schedule_delayed_work(&ac->work, 0);
 
 	/* wait for transaction to start and unblock */
-	mutex_lock(&root->fs_info->trans_mutex);
 	if (wait_for_unblock)
 		wait_current_trans_commit_start_and_unblock(root, cur_trans);
 	else
 		wait_current_trans_commit_start(root, cur_trans);
 	put_transaction(cur_trans);
-	mutex_unlock(&root->fs_info->trans_mutex);
 
 	return 0;
 }
@@ -1252,38 +1262,41 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = btrfs_run_delayed_refs(trans, root, 0);
 	BUG_ON(ret);
 
-	mutex_lock(&root->fs_info->trans_mutex);
+	spin_lock(&cur_trans->commit_lock);
 	if (cur_trans->in_commit) {
+		spin_unlock(&cur_trans->commit_lock);
 		atomic_inc(&cur_trans->use_count);
-		mutex_unlock(&root->fs_info->trans_mutex);
 		btrfs_end_transaction(trans, root);
 
 		ret = wait_for_commit(root, cur_trans);
 		BUG_ON(ret);
 
-		mutex_lock(&root->fs_info->trans_mutex);
 		put_transaction(cur_trans);
-		mutex_unlock(&root->fs_info->trans_mutex);
 
 		return 0;
 	}
 
 	trans->transaction->in_commit = 1;
 	trans->transaction->blocked = 1;
+	spin_unlock(&cur_trans->commit_lock);
 	wake_up(&root->fs_info->transaction_blocked_wait);
 
+	spin_lock(&root->fs_info->trans_lock);
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
 		prev_trans = list_entry(cur_trans->list.prev,
 					struct btrfs_transaction, list);
 		if (!prev_trans->commit_done) {
 			atomic_inc(&prev_trans->use_count);
-			mutex_unlock(&root->fs_info->trans_mutex);
+			spin_unlock(&root->fs_info->trans_lock);
 
 			wait_for_commit(root, prev_trans);
 
-			mutex_lock(&root->fs_info->trans_mutex);
 			put_transaction(prev_trans);
+		} else {
+			spin_unlock(&root->fs_info->trans_lock);
 		}
+	} else {
+		spin_unlock(&root->fs_info->trans_lock);
 	}
 
 	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
@@ -1291,12 +1304,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	do {
 		int snap_pending = 0;
+
 		joined = cur_trans->num_joined;
 		if (!list_empty(&trans->transaction->pending_snapshots))
 			snap_pending = 1;
 
 		WARN_ON(cur_trans != trans->transaction);
-		mutex_unlock(&root->fs_info->trans_mutex);
 
 		if (flush_on_commit || snap_pending) {
 			btrfs_start_delalloc_inodes(root, 1);
@@ -1316,14 +1329,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		prepare_to_wait(&cur_trans->writer_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 
-		smp_mb();
 		if (atomic_read(&cur_trans->num_writers) > 1)
 			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
 		else if (should_grow)
 			schedule_timeout(1);
 
-		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&cur_trans->writer_wait, &wait);
+		spin_lock(&root->fs_info->trans_lock);
+		root->fs_info->trans_no_join = 1;
+		spin_unlock(&root->fs_info->trans_lock);
 	} while (atomic_read(&cur_trans->num_writers) > 1 ||
 		 (should_grow && cur_trans->num_joined != joined));
 
@@ -1364,9 +1378,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_prepare_extent_commit(trans, root);
 
 	cur_trans = root->fs_info->running_transaction;
-	spin_lock(&root->fs_info->new_trans_lock);
-	root->fs_info->running_transaction = NULL;
-	spin_unlock(&root->fs_info->new_trans_lock);
 
 	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
 			    root->fs_info->tree_root->node);
@@ -1387,10 +1398,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	       sizeof(root->fs_info->super_copy));
 
 	trans->transaction->blocked = 0;
+	spin_lock(&root->fs_info->trans_lock);
+	root->fs_info->running_transaction = NULL;
+	root->fs_info->trans_no_join = 0;
+	spin_unlock(&root->fs_info->trans_lock);
 
 	wake_up(&root->fs_info->transaction_wait);
 
-	mutex_unlock(&root->fs_info->trans_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	BUG_ON(ret);
 	write_ctree_super(trans, root, 0);
@@ -1403,22 +1417,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_finish_extent_commit(trans, root);
 
-	mutex_lock(&root->fs_info->trans_mutex);
-
 	cur_trans->commit_done = 1;
 
 	root->fs_info->last_trans_committed = cur_trans->transid;
 
 	wake_up(&cur_trans->commit_wait);
 
+	spin_lock(&root->fs_info->trans_lock);
 	list_del_init(&cur_trans->list);
+	spin_unlock(&root->fs_info->trans_lock);
+
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
 	trace_btrfs_transaction_commit(root);
 
-	mutex_unlock(&root->fs_info->trans_mutex);
-
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
 
@@ -1438,9 +1451,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
 	LIST_HEAD(list);
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	mutex_lock(&fs_info->trans_mutex);
+	spin_lock(&fs_info->trans_lock);
 	list_splice_init(&fs_info->dead_roots, &list);
-	mutex_unlock(&fs_info->trans_mutex);
+	spin_unlock(&fs_info->trans_lock);
 
 	while (!list_empty(&list)) {
 		root = list_entry(list.next, struct btrfs_root, root_list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 154314f80f8..11c6efcd4ed 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -28,10 +28,12 @@ struct btrfs_transaction {
 	 * transaction can end
 	 */
 	atomic_t num_writers;
+	atomic_t use_count;
 
 	unsigned long num_joined;
+
+	spinlock_t commit_lock;
 	int in_commit;
-	atomic_t use_count;
 	int commit_done;
 	int blocked;
 	struct list_head list;
-- 
cgit v1.2.3-70-g09d2


From fcb80c2affd63237cff5b34cba5756be7c976a5a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 3 May 2011 10:40:22 -0400
Subject: Btrfs: fix how we do space reservation for truncate

The ceph guys keep running into problems where we have space reserved in our
orphan block rsv when freeing it up.  This is because they tend to do snapshots
alot, so their truncates tend to use a bunch of space, so when we go to do
things like update the inode we have to steal reservation space in order to make
the reservation happen.  This happens because truncate can use as much space as
it freaking feels like, but we still have to hold space for removing the orphan
item and updating the inode, which will definitely always happen.  So in order
to fix this we need to split all of the reservation stuf up.  So with this patch
we have

1) The orphan block reserve which only holds the space for deleting our orphan
item when everything is over.

2) The truncate block reserve which gets allocated and used specifically for the
space that the truncate will use on a per truncate basis.

3) The transaction will always have 1 item's worth of data reserved so we can
update the inode normally.

Hopefully this will make the ceph problem go away.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |   3 ++
 fs/btrfs/extent-tree.c |  46 +++++++++++++++-----
 fs/btrfs/inode.c       | 111 +++++++++++++++++++++++++++++++++++++------------
 3 files changed, 123 insertions(+), 37 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 522a39b0033..f31aed7fedd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2224,6 +2224,9 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 void btrfs_block_rsv_release(struct btrfs_root *root,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes);
+int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_block_rsv *rsv);
 int btrfs_set_block_group_ro(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 int btrfs_set_block_group_rw(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ca599654ce1..a2ca561c70f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3980,6 +3980,37 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
 		3 * num_items;
 }
 
+int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_block_rsv *rsv)
+{
+	struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
+	u64 num_bytes;
+	int ret;
+
+	/*
+	 * Truncate should be freeing data, but give us 2 items just in case it
+	 * needs to use some space.  We may want to be smarter about this in the
+	 * future.
+	 */
+	num_bytes = calc_trans_metadata_size(root, 2);
+
+	/* We already have enough bytes, just return */
+	if (rsv->reserved >= num_bytes)
+		return 0;
+
+	num_bytes -= rsv->reserved;
+
+	/*
+	 * You should have reserved enough space before hand to do this, so this
+	 * should not fail.
+	 */
+	ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
+	BUG_ON(ret);
+
+	return 0;
+}
+
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 int num_items)
@@ -4020,23 +4051,18 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
 
 	/*
-	 * one for deleting orphan item, one for updating inode and
-	 * two for calling btrfs_truncate_inode_items.
-	 *
-	 * btrfs_truncate_inode_items is a delete operation, it frees
-	 * more space than it uses in most cases. So two units of
-	 * metadata space should be enough for calling it many times.
-	 * If all of the metadata space is used, we can commit
-	 * transaction and use space it freed.
+	 * We need to hold space in order to delete our orphan item once we've
+	 * added it, so this takes the reservation so we can release it later
+	 * when we are truly done with the orphan item.
 	 */
-	u64 num_bytes = calc_trans_metadata_size(root, 4);
+	u64 num_bytes = calc_trans_metadata_size(root, 1);
 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
 
 void btrfs_orphan_release_metadata(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 num_bytes = calc_trans_metadata_size(root, 4);
+	u64 num_bytes = calc_trans_metadata_size(root, 1);
 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e47bdf0fb75..bc12ba23db5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6591,6 +6591,7 @@ out:
 static int btrfs_truncate(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_block_rsv *rsv;
 	int ret;
 	int err = 0;
 	struct btrfs_trans_handle *trans;
@@ -6604,28 +6605,83 @@ static int btrfs_truncate(struct inode *inode)
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
-	trans = btrfs_start_transaction(root, 5);
-	if (IS_ERR(trans))
-		return PTR_ERR(trans);
+	/*
+	 * Yes ladies and gentelment, this is indeed ugly.  The fact is we have
+	 * 3 things going on here
+	 *
+	 * 1) We need to reserve space for our orphan item and the space to
+	 * delete our orphan item.  Lord knows we don't want to have a dangling
+	 * orphan item because we didn't reserve space to remove it.
+	 *
+	 * 2) We need to reserve space to update our inode.
+	 *
+	 * 3) We need to have something to cache all the space that is going to
+	 * be free'd up by the truncate operation, but also have some slack
+	 * space reserved in case it uses space during the truncate (thank you
+	 * very much snapshotting).
+	 *
+	 * And we need these to all be seperate.  The fact is we can use alot of
+	 * space doing the truncate, and we have no earthly idea how much space
+	 * we will use, so we need the truncate reservation to be seperate so it
+	 * doesn't end up using space reserved for updating the inode or
+	 * removing the orphan item.  We also need to be able to stop the
+	 * transaction and start a new one, which means we need to be able to
+	 * update the inode several times, and we have no idea of knowing how
+	 * many times that will be, so we can't just reserve 1 item for the
+	 * entirety of the opration, so that has to be done seperately as well.
+	 * Then there is the orphan item, which does indeed need to be held on
+	 * to for the whole operation, and we need nobody to touch this reserved
+	 * space except the orphan code.
+	 *
+	 * So that leaves us with
+	 *
+	 * 1) root->orphan_block_rsv - for the orphan deletion.
+	 * 2) rsv - for the truncate reservation, which we will steal from the
+	 * transaction reservation.
+	 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
+	 * updating the inode.
+	 */
+	rsv = btrfs_alloc_block_rsv(root);
+	if (!rsv)
+		return -ENOMEM;
+	btrfs_add_durable_block_rsv(root->fs_info, rsv);
+
+	trans = btrfs_start_transaction(root, 4);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
 
 	btrfs_set_trans_block_group(trans, inode);
 
+	/*
+	 * Reserve space for the truncate process.  Truncate should be adding
+	 * space, but if there are snapshots it may end up using space.
+	 */
+	ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
+	BUG_ON(ret);
+
 	ret = btrfs_orphan_add(trans, inode);
 	if (ret) {
 		btrfs_end_transaction(trans, root);
-		return ret;
+		goto out;
 	}
 
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root, nr);
 
-	/* Now start a transaction for the truncate */
-	trans = btrfs_start_transaction(root, 0);
-	if (IS_ERR(trans))
-		return PTR_ERR(trans);
+	/*
+	 * Ok so we've already migrated our bytes over for the truncate, so here
+	 * just reserve the one slot we need for updating the inode.
+	 */
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
 	btrfs_set_trans_block_group(trans, inode);
-	trans->block_rsv = root->orphan_block_rsv;
+	trans->block_rsv = rsv;
 
 	/*
 	 * setattr is responsible for setting the ordered_data_close flag,
@@ -6649,24 +6705,18 @@ static int btrfs_truncate(struct inode *inode)
 
 	while (1) {
 		if (!trans) {
-			trans = btrfs_start_transaction(root, 0);
-			if (IS_ERR(trans))
-				return PTR_ERR(trans);
-			btrfs_set_trans_block_group(trans, inode);
-			trans->block_rsv = root->orphan_block_rsv;
-		}
+			trans = btrfs_start_transaction(root, 3);
+			if (IS_ERR(trans)) {
+				err = PTR_ERR(trans);
+				goto out;
+			}
 
-		ret = btrfs_block_rsv_check(trans, root,
-					    root->orphan_block_rsv, 0, 5);
-		if (ret == -EAGAIN) {
-			ret = btrfs_commit_transaction(trans, root);
-			if (ret)
-				return ret;
-			trans = NULL;
-			continue;
-		} else if (ret) {
-			err = ret;
-			break;
+			ret = btrfs_truncate_reserve_metadata(trans, root,
+							      rsv);
+			BUG_ON(ret);
+
+			btrfs_set_trans_block_group(trans, inode);
+			trans->block_rsv = rsv;
 		}
 
 		ret = btrfs_truncate_inode_items(trans, root, inode,
@@ -6677,6 +6727,7 @@ static int btrfs_truncate(struct inode *inode)
 			break;
 		}
 
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
 		ret = btrfs_update_inode(trans, root, inode);
 		if (ret) {
 			err = ret;
@@ -6690,6 +6741,7 @@ static int btrfs_truncate(struct inode *inode)
 	}
 
 	if (ret == 0 && inode->i_nlink > 0) {
+		trans->block_rsv = root->orphan_block_rsv;
 		ret = btrfs_orphan_del(trans, inode);
 		if (ret)
 			err = ret;
@@ -6701,15 +6753,20 @@ static int btrfs_truncate(struct inode *inode)
 		ret = btrfs_orphan_del(NULL, inode);
 	}
 
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
 	ret = btrfs_update_inode(trans, root, inode);
 	if (ret && !err)
 		err = ret;
 
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction_throttle(trans, root);
+	btrfs_btree_balance_dirty(root, nr);
+
+out:
+	btrfs_free_block_rsv(root, rsv);
+
 	if (ret && !err)
 		err = ret;
-	btrfs_btree_balance_dirty(root, nr);
 
 	return err;
 }
-- 
cgit v1.2.3-70-g09d2


From af60bed24eb0e3b6d93eaa6bb395a5721e6c09a8 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 4 May 2011 11:11:17 -0400
Subject: Btrfs: set range_start to the right start in count_range_bits

In count_range_bits we are adjusting total_bytes based on the range we are
searching for, but we don't adjust the range start according to the range we are
searching for, which makes for weird results.  For example, if the range

[0-8192]

is set DELALLOC, but I search for 4096-8192, I will get back 4096 for the number
of bytes found, but the range_start will be 0, which makes it look like the
range is [0-4096].  So instead set range_start = max(cur_start, state->start).
This makes everything come out right.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ba41da59e31..b5f6f227a97 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1480,7 +1480,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
 			if (total_bytes >= max_bytes)
 				break;
 			if (!found) {
-				*start = state->start;
+				*start = max(cur_start, state->start);
 				found = 1;
 			}
 			last = state->end;
-- 
cgit v1.2.3-70-g09d2


From cb25c2ea6a79702ab7895b873c6c43e0d3bc3c72 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 11 May 2011 12:17:34 -0400
Subject: Btrfs: map the node block when looking for readahead targets

If we have particularly full nodes, we could call btrfs_node_blockptr up to 32
times, which is 32 pairs of kmap/kunmap, which _sucks_.  So go ahead and map the
extent buffer while we look for readahead targets.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 84d7ca1fe0b..009bcf7f1e4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1229,6 +1229,7 @@ static void reada_for_search(struct btrfs_root *root,
 	u64 search;
 	u64 target;
 	u64 nread = 0;
+	u64 gen;
 	int direction = path->reada;
 	struct extent_buffer *eb;
 	u32 nr;
@@ -1256,6 +1257,15 @@ static void reada_for_search(struct btrfs_root *root,
 	nritems = btrfs_header_nritems(node);
 	nr = slot;
 	while (1) {
+		if (!node->map_token) {
+			unsigned long offset = btrfs_node_key_ptr_offset(nr);
+			map_private_extent_buffer(node, offset,
+						  sizeof(struct btrfs_key_ptr),
+						  &node->map_token,
+						  &node->kaddr,
+						  &node->map_start,
+						  &node->map_len, KM_USER1);
+		}
 		if (direction < 0) {
 			if (nr == 0)
 				break;
@@ -1273,14 +1283,23 @@ static void reada_for_search(struct btrfs_root *root,
 		search = btrfs_node_blockptr(node, nr);
 		if ((search <= target && target - search <= 65536) ||
 		    (search > target && search - target <= 65536)) {
-			readahead_tree_block(root, search, blocksize,
-				     btrfs_node_ptr_generation(node, nr));
+			gen = btrfs_node_ptr_generation(node, nr);
+			if (node->map_token) {
+				unmap_extent_buffer(node, node->map_token,
+						    KM_USER1);
+				node->map_token = NULL;
+			}
+			readahead_tree_block(root, search, blocksize, gen);
 			nread += blocksize;
 		}
 		nscan++;
 		if ((nread > 65536 || nscan > 32))
 			break;
 	}
+	if (node->map_token) {
+		unmap_extent_buffer(node, node->map_token, KM_USER1);
+		node->map_token = NULL;
+	}
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 7e2355ba1a11649f0b212a29fdb9f47476f1248e Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 11 May 2011 12:25:37 -0400
Subject: Btrfs: don't look at the extent buffer level 3 times in a row

We have a bit of debugging in btrfs_search_slot to make sure the level of the
cow block is the same as the original block we were cow'ing.  I don't think I've
ever seen this tripped, so kill it.  This saves us 2 kmap's per level in our
search.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 009bcf7f1e4..f7a0a64b868 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1672,9 +1672,6 @@ again:
 		}
 cow_done:
 		BUG_ON(!cow && ins_len);
-		if (level != btrfs_header_level(b))
-			WARN_ON(1);
-		level = btrfs_header_level(b);
 
 		p->nodes[level] = b;
 		if (!p->skip_locking)
-- 
cgit v1.2.3-70-g09d2


From d82a6f1d7e8b61ed5996334d0db66651bb43641d Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 11 May 2011 15:26:06 -0400
Subject: Btrfs: kill BTRFS_I(inode)->block_group

Originally this was going to be used as a way to give hints to the allocator,
but frankly we can get much better hints elsewhere and it's not even used at all
for anything usefull.  In addition to be completely useless, when we initialize
an inode we try and find a freeish block group to set as the inodes block group,
and with a completely full 40gb fs this takes _forever_, so I imagine with say
1tb fs this is just unbearable.  So just axe the thing altoghether, we don't
need it and it saves us 8 bytes in the inode and saves us 500 microseconds per
inode lookup in my testcase.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/btrfs_inode.h |  3 --
 fs/btrfs/ctree.h       |  3 +-
 fs/btrfs/extent-tree.c | 10 ++----
 fs/btrfs/inode.c       | 87 ++++++--------------------------------------------
 fs/btrfs/ioctl.c       |  3 +-
 fs/btrfs/transaction.c |  1 -
 fs/btrfs/transaction.h | 14 --------
 fs/btrfs/xattr.c       |  2 --
 8 files changed, 13 insertions(+), 110 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 57c3bb2884c..4bc852d3b83 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -120,9 +120,6 @@ struct btrfs_inode {
 	 */
 	u64 index_cnt;
 
-	/* the start of block group preferred for allocations. */
-	u64 block_group;
-
 	/* the fsync log has some corner cases that mean we have to check
 	 * directories to see if any unlinks have been done before
 	 * the directory was logged.  See tree-log.c for all the
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f31aed7fedd..0f8c489bcc0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2512,8 +2512,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *new_root,
-			     u64 new_dirid, u64 alloc_hint);
+			     struct btrfs_root *new_root, u64 new_dirid);
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio, unsigned long bio_flags);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a2ca561c70f..9f0a4e3bd8a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5319,6 +5319,7 @@ checks:
 			btrfs_add_free_space(block_group, offset,
 					     search_start - offset);
 		BUG_ON(offset > search_start);
+		btrfs_put_block_group(block_group);
 		break;
 loop:
 		failed_cluster_refill = false;
@@ -5411,14 +5412,7 @@ loop:
 		ret = -ENOSPC;
 	} else if (!ins->objectid) {
 		ret = -ENOSPC;
-	}
-
-	/* we found what we needed */
-	if (ins->objectid) {
-		if (!(data & BTRFS_BLOCK_GROUP_DATA))
-			trans->block_group = block_group->key.objectid;
-
-		btrfs_put_block_group(block_group);
+	} else if (ins->objectid) {
 		ret = 0;
 	}
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bc12ba23db5..dd5938a7de2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -136,7 +136,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	path->leave_spinning = 1;
-	btrfs_set_trans_block_group(trans, inode);
 
 	key.objectid = inode->i_ino;
 	key.offset = start;
@@ -422,7 +421,6 @@ again:
 	if (start == 0) {
 		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));
-		btrfs_set_trans_block_group(trans, inode);
 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 		/* lets try to make an inline extent */
@@ -781,7 +779,6 @@ static noinline int cow_file_range(struct inode *inode,
 	BUG_ON(root == root->fs_info->tree_root);
 	trans = btrfs_join_transaction(root);
 	BUG_ON(IS_ERR(trans));
-	btrfs_set_trans_block_group(trans, inode);
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
@@ -1502,8 +1499,6 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_ordered_sum *sum;
 
-	btrfs_set_trans_block_group(trans, inode);
-
 	list_for_each_entry(sum, list, list) {
 		btrfs_csum_file_blocks(trans,
 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
@@ -1722,7 +1717,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 			else
 				trans = btrfs_join_transaction(root);
 			BUG_ON(IS_ERR(trans));
-			btrfs_set_trans_block_group(trans, inode);
 			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 			ret = btrfs_update_inode(trans, root, inode);
 			BUG_ON(ret);
@@ -1739,7 +1733,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	else
 		trans = btrfs_join_transaction(root);
 	BUG_ON(IS_ERR(trans));
-	btrfs_set_trans_block_group(trans, inode);
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -2495,7 +2488,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key location;
 	int maybe_acls;
-	u64 alloc_group_block;
 	u32 rdev;
 	int ret;
 
@@ -2539,8 +2531,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	BTRFS_I(inode)->index_cnt = (u64)-1;
 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
 
-	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
-
 	/*
 	 * try to precache a NULL acl entry for files that don't have
 	 * any xattrs or acls
@@ -2549,8 +2539,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	if (!maybe_acls)
 		cache_no_acl(inode);
 
-	BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
-						alloc_group_block, 0);
 	btrfs_free_path(path);
 	inode_item = NULL;
 
@@ -2630,7 +2618,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_inode_transid(leaf, item, trans->transid);
 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
 	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
-	btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
+	btrfs_set_inode_block_group(leaf, item, 0);
 
 	if (leaf->map_token) {
 		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
@@ -2971,8 +2959,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	btrfs_set_trans_block_group(trans, dir);
-
 	btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
 
 	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
@@ -3068,8 +3054,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	btrfs_set_trans_block_group(trans, dir);
-
 	if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
 		err = btrfs_unlink_subvol(trans, root, dir,
 					  BTRFS_I(inode)->location.objectid,
@@ -3649,7 +3633,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 				err = PTR_ERR(trans);
 				break;
 			}
-			btrfs_set_trans_block_group(trans, inode);
 
 			err = btrfs_drop_extents(trans, inode, cur_offset,
 						 cur_offset + hole_size,
@@ -3785,7 +3768,6 @@ void btrfs_evict_inode(struct inode *inode)
 	while (1) {
 		trans = btrfs_start_transaction(root, 0);
 		BUG_ON(IS_ERR(trans));
-		btrfs_set_trans_block_group(trans, inode);
 		trans->block_rsv = root->orphan_block_rsv;
 
 		ret = btrfs_block_rsv_check(trans, root,
@@ -4383,7 +4365,6 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 			trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
-		btrfs_set_trans_block_group(trans, inode);
 		if (nolock)
 			ret = btrfs_end_transaction_nolock(trans, root);
 		else
@@ -4409,7 +4390,6 @@ void btrfs_dirty_inode(struct inode *inode)
 
 	trans = btrfs_join_transaction(root);
 	BUG_ON(IS_ERR(trans));
-	btrfs_set_trans_block_group(trans, inode);
 
 	ret = btrfs_update_inode(trans, root, inode);
 	if (ret && ret == -ENOSPC) {
@@ -4424,7 +4404,6 @@ void btrfs_dirty_inode(struct inode *inode)
 			}
 			return;
 		}
-		btrfs_set_trans_block_group(trans, inode);
 
 		ret = btrfs_update_inode(trans, root, inode);
 		if (ret) {
@@ -4519,8 +4498,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     struct inode *dir,
 				     const char *name, int name_len,
-				     u64 ref_objectid, u64 objectid,
-				     u64 alloc_hint, int mode, u64 *index)
+				     u64 ref_objectid, u64 objectid, int mode,
+				     u64 *index)
 {
 	struct inode *inode;
 	struct btrfs_inode_item *inode_item;
@@ -4567,8 +4546,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		owner = 0;
 	else
 		owner = 1;
-	BTRFS_I(inode)->block_group =
-			btrfs_find_block_group(root, 0, alloc_hint, owner);
 
 	key[0].objectid = objectid;
 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -4729,11 +4706,9 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	btrfs_set_trans_block_group(trans, dir);
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len, dir->i_ino, objectid,
-				BTRFS_I(dir)->block_group, mode, &index);
+				mode, &index);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_unlock;
@@ -4745,7 +4720,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	btrfs_set_trans_block_group(trans, inode);
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
@@ -4754,8 +4728,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		init_special_inode(inode, inode->i_mode, rdev);
 		btrfs_update_inode(trans, root, inode);
 	}
-	btrfs_update_inode_block_group(trans, inode);
-	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
@@ -4791,11 +4763,9 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	btrfs_set_trans_block_group(trans, dir);
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len, dir->i_ino, objectid,
-				BTRFS_I(dir)->block_group, mode, &index);
+				mode, &index);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_unlock;
@@ -4807,7 +4777,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	btrfs_set_trans_block_group(trans, inode);
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
@@ -4818,8 +4787,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &btrfs_file_inode_operations;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
-	btrfs_update_inode_block_group(trans, inode);
-	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
@@ -4866,8 +4833,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 
 	btrfs_inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME;
-
-	btrfs_set_trans_block_group(trans, dir);
 	ihold(inode);
 
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
@@ -4876,7 +4841,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		drop_inode = 1;
 	} else {
 		struct dentry *parent = dget_parent(dentry);
-		btrfs_update_inode_block_group(trans, dir);
 		err = btrfs_update_inode(trans, root, inode);
 		BUG_ON(err);
 		btrfs_log_new_name(trans, inode, NULL, parent);
@@ -4917,12 +4881,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	trans = btrfs_start_transaction(root, 5);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
-	btrfs_set_trans_block_group(trans, dir);
 
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len, dir->i_ino, objectid,
-				BTRFS_I(dir)->block_group, S_IFDIR | mode,
-				&index);
+				S_IFDIR | mode, &index);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_fail;
@@ -4936,7 +4898,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
-	btrfs_set_trans_block_group(trans, inode);
 
 	btrfs_i_size_write(inode, 0);
 	err = btrfs_update_inode(trans, root, inode);
@@ -4950,8 +4911,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	d_instantiate(dentry, inode);
 	drop_on_err = 0;
-	btrfs_update_inode_block_group(trans, inode);
-	btrfs_update_inode_block_group(trans, dir);
 
 out_fail:
 	nr = trans->blocks_used;
@@ -6652,8 +6611,6 @@ static int btrfs_truncate(struct inode *inode)
 		goto out;
 	}
 
-	btrfs_set_trans_block_group(trans, inode);
-
 	/*
 	 * Reserve space for the truncate process.  Truncate should be adding
 	 * space, but if there are snapshots it may end up using space.
@@ -6680,7 +6637,6 @@ static int btrfs_truncate(struct inode *inode)
 		err = PTR_ERR(trans);
 		goto out;
 	}
-	btrfs_set_trans_block_group(trans, inode);
 	trans->block_rsv = rsv;
 
 	/*
@@ -6715,7 +6671,6 @@ static int btrfs_truncate(struct inode *inode)
 							      rsv);
 			BUG_ON(ret);
 
-			btrfs_set_trans_block_group(trans, inode);
 			trans->block_rsv = rsv;
 		}
 
@@ -6775,15 +6730,14 @@ out:
  * create a new subvolume directory/inode (helper for the ioctl).
  */
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *new_root,
-			     u64 new_dirid, u64 alloc_hint)
+			     struct btrfs_root *new_root, u64 new_dirid)
 {
 	struct inode *inode;
 	int err;
 	u64 index = 0;
 
 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
-				new_dirid, alloc_hint, S_IFDIR | 0700, &index);
+				new_dirid, S_IFDIR | 0700, &index);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 	inode->i_op = &btrfs_dir_inode_operations;
@@ -6893,21 +6847,6 @@ void btrfs_destroy_inode(struct inode *inode)
 		spin_unlock(&root->fs_info->ordered_extent_lock);
 	}
 
-	if (root == root->fs_info->tree_root) {
-		struct btrfs_block_group_cache *block_group;
-
-		block_group = btrfs_lookup_block_group(root->fs_info,
-						BTRFS_I(inode)->block_group);
-		if (block_group && block_group->inode == inode) {
-			spin_lock(&block_group->lock);
-			block_group->inode = NULL;
-			spin_unlock(&block_group->lock);
-			btrfs_put_block_group(block_group);
-		} else if (block_group) {
-			btrfs_put_block_group(block_group);
-		}
-	}
-
 	spin_lock(&root->orphan_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
@@ -7091,8 +7030,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                 goto out_notrans;
         }
 
-	btrfs_set_trans_block_group(trans, new_dir);
-
 	if (dest != root)
 		btrfs_record_root_in_trans(trans, dest);
 
@@ -7331,12 +7268,9 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	btrfs_set_trans_block_group(trans, dir);
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len, dir->i_ino, objectid,
-				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
-				&index);
+				S_IFLNK|S_IRWXUGO, &index);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_unlock;
@@ -7348,7 +7282,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	btrfs_set_trans_block_group(trans, inode);
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
@@ -7359,8 +7292,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &btrfs_file_inode_operations;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
-	btrfs_update_inode_block_group(trans, inode);
-	btrfs_update_inode_block_group(trans, dir);
 	if (drop_inode)
 		goto out_unlock;
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a578620e06a..8e90ccf4b76 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -413,8 +413,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 
 	btrfs_record_root_in_trans(trans, new_root);
 
-	ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
-				       BTRFS_I(dir)->block_group);
+	ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
 	/*
 	 * insert the directory item
 	 */
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 43816f8b23e..f4ea695325b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -241,7 +241,6 @@ again:
 	h->transid = cur_trans->transid;
 	h->transaction = cur_trans;
 	h->blocks_used = 0;
-	h->block_group = 0;
 	h->bytes_reserved = 0;
 	h->delayed_ref_updates = 0;
 	h->use_count = 1;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 11c6efcd4ed..da7289e06a8 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,7 +47,6 @@ struct btrfs_transaction {
 
 struct btrfs_trans_handle {
 	u64 transid;
-	u64 block_group;
 	u64 bytes_reserved;
 	unsigned long use_count;
 	unsigned long blocks_reserved;
@@ -70,19 +69,6 @@ struct btrfs_pending_snapshot {
 	struct list_head list;
 };
 
-static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
-					       struct inode *inode)
-{
-	trans->block_group = BTRFS_I(inode)->block_group;
-}
-
-static inline void btrfs_update_inode_block_group(
-					  struct btrfs_trans_handle *trans,
-					  struct inode *inode)
-{
-	BTRFS_I(inode)->block_group = trans->block_group;
-}
-
 static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 					      struct inode *inode)
 {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index cfd660550de..72ab0295ca7 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -158,8 +158,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	btrfs_set_trans_block_group(trans, inode);
-
 	ret = do_setxattr(trans, inode, name, value, size, flags);
 	if (ret)
 		goto out;
-- 
cgit v1.2.3-70-g09d2


From 589d8ade83f07c0f11c8191c0ca309f34d7a2c14 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 11 May 2011 17:30:53 -0400
Subject: Btrfs: try not to sleep as much when doing slow caching

When the fs is super full and we unmount the fs, we could get stuck in this
thing where unmount is waiting for the caching kthread to make progress and the
caching kthread keeps scheduling because we're in the middle of a commit.  So
instead just let the caching kthread keep going and only yeild if
need_resched().  This makes my horrible umount case go from taking up to 10
minutes to taking less than 20 seconds.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9f0a4e3bd8a..96be6245031 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -378,15 +378,18 @@ again:
 			if (ret)
 				break;
 
-			caching_ctl->progress = last;
-			btrfs_release_path(extent_root, path);
-			up_read(&fs_info->extent_commit_sem);
-			mutex_unlock(&caching_ctl->mutex);
-			if (btrfs_transaction_in_commit(fs_info))
-				schedule_timeout(1);
-			else
+			if (need_resched() ||
+			    btrfs_next_leaf(extent_root, path)) {
+				caching_ctl->progress = last;
+				btrfs_release_path(extent_root, path);
+				up_read(&fs_info->extent_commit_sem);
+				mutex_unlock(&caching_ctl->mutex);
 				cond_resched();
-			goto again;
+				goto again;
+			}
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+			continue;
 		}
 
 		if (key.objectid < block_group->key.objectid) {
-- 
cgit v1.2.3-70-g09d2


From 026fd317828500524cdc7e5ff9e8e7923abb2868 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 13 May 2011 10:32:11 -0400
Subject: Btrfs: don't always do readahead

Our readahead is sort of sloppy, and really isn't always needed.  For example if
ls is doing a stating ls (which is the default) it's going to stat in non-disk
order, so if say you have a directory with a stupid amount of files, readahead
is going to do nothing but waste time in the case of doing the stat.  Taking the
unconditional readahead out made my test go from 57 minutes to 36 minutes.  This
means that everywhere we do loop through the tree we want to make sure we do set
path->reada properly, so I went through and found all of the places where we
loop through the path and set reada to 1.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.c       |  2 --
 fs/btrfs/extent-tree.c |  3 ++-
 fs/btrfs/inode.c       | 14 ++++++++++++--
 fs/btrfs/relocation.c  |  6 ++++++
 4 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f7a0a64b868..f61c16c1481 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -48,8 +48,6 @@ struct btrfs_path *btrfs_alloc_path(void)
 {
 	struct btrfs_path *path;
 	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
-	if (path)
-		path->reada = 1;
 	return path;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 96be6245031..1ba2cc58eab 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -347,7 +347,7 @@ static int caching_kthread(void *data)
 	 */
 	path->skip_locking = 1;
 	path->search_commit_root = 1;
-	path->reada = 2;
+	path->reada = 1;
 
 	key.objectid = last;
 	key.offset = 0;
@@ -8556,6 +8556,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+	path->reada = 1;
 
 	cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
 	if (cache_gen != 0 &&
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dd5938a7de2..6228a304b54 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4242,7 +4242,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 		filp->f_pos = 2;
 	}
 	path = btrfs_alloc_path();
-	path->reada = 2;
+	if (!path)
+		return -ENOMEM;
+	path->reada = 1;
 
 	btrfs_set_key_type(&key, key_type);
 	key.offset = filp->f_pos;
@@ -5043,7 +5045,15 @@ again:
 
 	if (!path) {
 		path = btrfs_alloc_path();
-		BUG_ON(!path);
+		if (!path) {
+			err = -ENOMEM;
+			goto out;
+		}
+		/*
+		 * Chances are we'll be called again, so go ahead and do
+		 * readahead
+		 */
+		path->reada = 1;
 	}
 
 	ret = btrfs_lookup_file_extent(trans, root, path,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 09c30d37d43..5872b41581f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -676,6 +676,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
 		err = -ENOMEM;
 		goto out;
 	}
+	path1->reada = 1;
+	path2->reada = 2;
 
 	node = alloc_backref_node(cache);
 	if (!node) {
@@ -1996,6 +1998,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+	path->reada = 1;
 
 	reloc_root = root->reloc_root;
 	root_item = &reloc_root->root_item;
@@ -3297,6 +3300,7 @@ static int find_data_references(struct reloc_control *rc,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+	path->reada = 1;
 
 	root = read_fs_root(rc->extent_root->fs_info, ref_root);
 	if (IS_ERR(root)) {
@@ -3665,6 +3669,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+	path->reada = 1;
 
 	ret = prepare_to_relocate(rc);
 	if (ret) {
@@ -4090,6 +4095,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+	path->reada = -1;
 
 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
 	key.type = BTRFS_ROOT_ITEM_KEY;
-- 
cgit v1.2.3-70-g09d2


From cca1c81f43e26ab60c0d1090fb90992358d69bdf Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 13 May 2011 11:07:12 -0400
Subject: Btrfs: don't try to allocate from a block group that doesn't have
 enough space

If we have a very large filesystem, we can spend a lot of time in
find_free_extent just trying to allocate from empty block groups.  So instead
check to see if the block group even has enough space for the allocation, and if
not go on to the next block group.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1ba2cc58eab..c8c318494de 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5159,6 +5159,14 @@ have_block_group:
 		if (unlikely(block_group->ro))
 			goto loop;
 
+		spin_lock(&block_group->tree_lock);
+		if (cached &&
+		    block_group->free_space < num_bytes + empty_size) {
+			spin_unlock(&block_group->tree_lock);
+			goto loop;
+		}
+		spin_unlock(&block_group->tree_lock);
+
 		/*
 		 * Ok we want to try and use the cluster allocator, so lets look
 		 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
-- 
cgit v1.2.3-70-g09d2


From 207dde8289d9b005b665cb9d8d2bb9464256101d Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 13 May 2011 14:49:23 -0400
Subject: Btrfs: check for duplicate entries in the free space cache

If there are duplicate entries in the free space cache, discard the entire cache
and load it the old fashioned way.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/free-space-cache.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 63731a1fb0a..d634a7e4220 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -420,7 +420,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
 				spin_lock(&block_group->tree_lock);
 				ret = link_free_space(block_group, e);
 				spin_unlock(&block_group->tree_lock);
-				BUG_ON(ret);
+				if (ret) {
+					printk(KERN_ERR "Duplicate entries in "
+					       "free space cache, dumping\n");
+					kunmap(page);
+					unlock_page(page);
+					page_cache_release(page);
+					goto free_cache;
+				}
 			} else {
 				e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
 				if (!e->bitmap) {
@@ -437,6 +444,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
 				recalculate_thresholds(block_group);
 				spin_unlock(&block_group->tree_lock);
 				list_add_tail(&e->list, &bitmaps);
+				if (ret) {
+					printk(KERN_ERR "Duplicate entries in "
+					       "free space cache, dumping\n");
+					kunmap(page);
+					unlock_page(page);
+					page_cache_release(page);
+					goto free_cache;
+				}
 			}
 
 			num_entries--;
@@ -909,10 +924,16 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
 			 * logically.
 			 */
 			if (bitmap) {
-				WARN_ON(info->bitmap);
+				if (info->bitmap) {
+					WARN_ON_ONCE(1);
+					return -EEXIST;
+				}
 				p = &(*p)->rb_right;
 			} else {
-				WARN_ON(!info->bitmap);
+				if (!info->bitmap) {
+					WARN_ON_ONCE(1);
+					return -EEXIST;
+				}
 				p = &(*p)->rb_left;
 			}
 		}
-- 
cgit v1.2.3-70-g09d2


From d90c732122a1f6d0efe388a8a204f67f144b2eb3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 17 May 2011 09:50:54 -0400
Subject: Btrfs: leave spinning on lookup and map the leaf

On lookup we only want to read the inode item, so leave the path spinning.  Also
we're just wholesale reading the leaf off, so map the leaf so we don't do a
bunch of kmap/kunmaps.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6228a304b54..dc8fb2b3a14 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2493,6 +2493,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
+	path->leave_spinning = 1;
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -2502,6 +2503,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	leaf = path->nodes[0];
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_inode_item);
+	if (!leaf->map_token)
+		map_private_extent_buffer(leaf, (unsigned long)inode_item,
+					  sizeof(struct btrfs_inode_item),
+					  &leaf->map_token, &leaf->kaddr,
+					  &leaf->map_start, &leaf->map_len,
+					  KM_USER1);
 
 	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
 	inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
@@ -2539,6 +2546,11 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	if (!maybe_acls)
 		cache_no_acl(inode);
 
+	if (leaf->map_token) {
+		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+		leaf->map_token = NULL;
+	}
+
 	btrfs_free_path(path);
 	inode_item = NULL;
 
-- 
cgit v1.2.3-70-g09d2


From 6e9101aeec39961308176e0f59e73ac5d37d243a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 24 May 2011 05:43:18 +0200
Subject: watchdog: Fix non-standard prototype of get_softlockup_thresh()

This build warning slipped through:

  kernel/watchdog.c:102: warning: function declaration isn't a prototype

As reported by Stephen Rothwell.

Also address an unused variable warning that GCC 4.6.0 reports:
we cannot do anything about failed watchdog ops during CPU hotplug
(it's not serious enough to return an error from the notifier),
so ignore them.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Mandeep Singh Baines <msb@chromium.org>
Cc: Marcin Slusarz <marcin.slusarz@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20110524134129.8da27016.sfr@canb.auug.org.au
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20110517071642.GF22305@elte.hu>
---
 kernel/watchdog.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6e63097fa73..3d0c56ad479 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -98,7 +98,7 @@ __setup("nosoftlockup", nosoftlockup_setup);
  * the thresholds with a factor: we make the soft threshold twice the amount of
  * time the hard threshold is.
  */
-static int get_softlockup_thresh()
+static int get_softlockup_thresh(void)
 {
 	return watchdog_thresh * 2;
 }
@@ -415,15 +415,13 @@ static void watchdog_nmi_disable(int cpu) { return; }
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 /* prepare/enable/disable routines */
-static int watchdog_prepare_cpu(int cpu)
+static void watchdog_prepare_cpu(int cpu)
 {
 	struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
 
 	WARN_ON(per_cpu(softlockup_watchdog, cpu));
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = watchdog_timer_fn;
-
-	return 0;
 }
 
 static int watchdog_enable(int cpu)
@@ -542,17 +540,16 @@ static int __cpuinit
 cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	int hotcpu = (unsigned long)hcpu;
-	int err = 0;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		err = watchdog_prepare_cpu(hotcpu);
+		watchdog_prepare_cpu(hotcpu);
 		break;
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 		if (watchdog_enabled)
-			err = watchdog_enable(hotcpu);
+			watchdog_enable(hotcpu);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
-- 
cgit v1.2.3-70-g09d2


From a3793a0d86e2d65dda6beb86fa295be25b238159 Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Date: Tue, 22 Feb 2011 09:57:44 +0000
Subject: sh: switch ap325rxa to dynamically manage the platform camera

Use soc_camera_platform helper functions to dynamically manage the
camera device.

Signed-off-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-ap325rxa/setup.c | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/arch/sh/boards/mach-ap325rxa/setup.c b/arch/sh/boards/mach-ap325rxa/setup.c
index 618bd566cf5..969421f64a1 100644
--- a/arch/sh/boards/mach-ap325rxa/setup.c
+++ b/arch/sh/boards/mach-ap325rxa/setup.c
@@ -359,37 +359,31 @@ static struct soc_camera_link camera_link = {
 	.priv		= &camera_info,
 };
 
-static void dummy_release(struct device *dev)
+static struct platform_device *camera_device;
+
+static void ap325rxa_camera_release(struct device *dev)
 {
+	soc_camera_platform_release(&camera_device);
 }
 
-static struct platform_device camera_device = {
-	.name		= "soc_camera_platform",
-	.dev		= {
-		.platform_data	= &camera_info,
-		.release	= dummy_release,
-	},
-};
-
 static int ap325rxa_camera_add(struct soc_camera_link *icl,
 			       struct device *dev)
 {
-	if (icl != &camera_link || camera_probe() <= 0)
-		return -ENODEV;
+	int ret = soc_camera_platform_add(icl, dev, &camera_device, &camera_link,
+					  ap325rxa_camera_release, 0);
+	if (ret < 0)
+		return ret;
 
-	camera_info.dev = dev;
+	ret = camera_probe();
+	if (ret < 0)
+		soc_camera_platform_del(icl, camera_device, &camera_link);
 
-	return platform_device_register(&camera_device);
+	return ret;
 }
 
 static void ap325rxa_camera_del(struct soc_camera_link *icl)
 {
-	if (icl != &camera_link)
-		return;
-
-	platform_device_unregister(&camera_device);
-	memset(&camera_device.dev.kobj, 0,
-	       sizeof(camera_device.dev.kobj));
+	soc_camera_platform_del(icl, camera_device, &camera_link);
 }
 #endif /* CONFIG_I2C */
 
-- 
cgit v1.2.3-70-g09d2


From 52b96c2592e8fdb93231c9644b68112518916a57 Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Date: Fri, 15 Apr 2011 18:30:55 +0000
Subject: sh: add MMCIF runtime PM support on ecovec

Signed-off-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Cc: Simon Horman <horms@verge.net.au>
Cc: Magnus Damm <damm@opensource.se>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-ecovec24/setup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c
index bb13d0e1b96..3a32741cc0a 100644
--- a/arch/sh/boards/mach-ecovec24/setup.c
+++ b/arch/sh/boards/mach-ecovec24/setup.c
@@ -885,6 +885,9 @@ static struct platform_device sh_mmcif_device = {
 	},
 	.num_resources	= ARRAY_SIZE(sh_mmcif_resources),
 	.resource	= sh_mmcif_resources,
+	.archdata = {
+		.hwblk_id = HWBLK_MMC,
+	},
 };
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 0f0ebd980e0e8a2fd33ab3ef0d5b0cffbcddd8a1 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 24 May 2011 17:25:23 +0900
Subject: sh: arch/sh/kernel/process_32.c needs linux/prefetch.h.

Trivial build fix for certain configurations that don't grab
linux/prefetch.h via alternate means (specifically SH-2 and SH-3 parts).

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/process_32.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 762a13984bb..b473f0c06fb 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/ftrace.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/prefetch.h>
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/system.h>
-- 
cgit v1.2.3-70-g09d2


From 4e2b1084b0fb79c963b73586815e1ef034dc2b57 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 24 May 2011 17:33:51 +0900
Subject: sh: Update shmin to reflect PIO dependency.

shmin uses __set_io_port_base() for legacy I/O mapping that ethernet and
other SuperI/O functions depend on. Ensure that PIO support is built in
until the board is updated for MMIO properly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index b44e37753b9..a164c9e3c73 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -167,7 +167,7 @@ config ARCH_HAS_CPU_IDLE_WAIT
 
 config NO_IOPORT
 	def_bool !PCI
-	depends on !SH_CAYMAN && !SH_SH4202_MICRODEV
+	depends on !SH_CAYMAN && !SH_SH4202_MICRODEV && !SH_SHMIN
 
 config IO_TRAPPED
 	bool
-- 
cgit v1.2.3-70-g09d2


From d819437156fd99da61d4e1402b2dbfc5cc472265 Mon Sep 17 00:00:00 2001
From: Eric B Munson <emunson@mgebm.net>
Date: Mon, 23 May 2011 10:22:40 -0400
Subject: oprofile, powerpc: Handle events that raise an exception without
 overflowing

Commit 0837e3242c73566fc1c0196b4ec61779c25ffc93 fixes a situation on POWER7
where events can roll back if a specualtive event doesn't actually complete.
This can raise a performance monitor exception.  We need to catch this to ensure
that we reset the PMC.  In all cases the PMC will be less than 256 cycles from
overflow.

This patch lifts Anton's fix for the problem in perf and applies it to oprofile
as well.

Signed-off-by: Eric B Munson <emunson@mgebm.net>
Cc: <stable@kernel.org> # as far back as it applies cleanly
Tested-by: Maynard Johnson <maynardj@us.ibm.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/powerpc/oprofile/op_model_power4.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/oprofile/op_model_power4.c b/arch/powerpc/oprofile/op_model_power4.c
index 8ee51a252cf..e6bec74be13 100644
--- a/arch/powerpc/oprofile/op_model_power4.c
+++ b/arch/powerpc/oprofile/op_model_power4.c
@@ -261,6 +261,28 @@ static int get_kernel(unsigned long pc, unsigned long mmcra)
 	return is_kernel;
 }
 
+static bool pmc_overflow(unsigned long val)
+{
+	if ((int)val < 0)
+		return true;
+
+	/*
+	 * Events on POWER7 can roll back if a speculative event doesn't
+	 * eventually complete. Unfortunately in some rare cases they will
+	 * raise a performance monitor exception. We need to catch this to
+	 * ensure we reset the PMC. In all cases the PMC will be 256 or less
+	 * cycles from overflow.
+	 *
+	 * We only do this if the first pass fails to find any overflowing
+	 * PMCs because a user might set a period of less than 256 and we
+	 * don't want to mistakenly reset them.
+	 */
+	if (__is_processor(PV_POWER7) && ((0x80000000 - val) <= 256))
+		return true;
+
+	return false;
+}
+
 static void power4_handle_interrupt(struct pt_regs *regs,
 				    struct op_counter_config *ctr)
 {
@@ -281,7 +303,7 @@ static void power4_handle_interrupt(struct pt_regs *regs,
 
 	for (i = 0; i < cur_cpu_spec->num_pmcs; ++i) {
 		val = classic_ctr_read(i);
-		if (val < 0) {
+		if (pmc_overflow(val)) {
 			if (oprofile_running && ctr[i].enabled) {
 				oprofile_add_ext_sample(pc, regs, i, is_kernel);
 				classic_ctr_write(i, reset_value[i]);
-- 
cgit v1.2.3-70-g09d2


From b76a06e08d94b2a63e47837dfe46bbbf0a3af6c2 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Sun, 8 May 2011 19:32:36 -0400
Subject: oprofile: Use linux/mutex.h

The oprofile code is still including asm/mutex.h instead of
linux/mutex.h.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 drivers/oprofile/event_buffer.h | 2 +-
 drivers/oprofile/oprof.c        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/oprofile/event_buffer.h b/drivers/oprofile/event_buffer.h
index 4e70749f8d1..a8d5bb3cba8 100644
--- a/drivers/oprofile/event_buffer.h
+++ b/drivers/oprofile/event_buffer.h
@@ -11,7 +11,7 @@
 #define EVENT_BUFFER_H
 
 #include <linux/types.h>
-#include <asm/mutex.h>
+#include <linux/mutex.h>
 
 int alloc_event_buffer(void);
 
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
index f9bda64fcd1..dccd8636095 100644
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -14,7 +14,7 @@
 #include <linux/moduleparam.h>
 #include <linux/workqueue.h>
 #include <linux/time.h>
-#include <asm/mutex.h>
+#include <linux/mutex.h>
 
 #include "oprof.h"
 #include "event_buffer.h"
-- 
cgit v1.2.3-70-g09d2


From b779260b097dec295e8798277ed08ec5ecd3b2c1 Mon Sep 17 00:00:00 2001
From: Joseph Cihula <joseph.cihula@intel.com>
Date: Tue, 3 May 2011 00:08:37 -0700
Subject: intel-iommu: fix VT-d PMR disable for TXT on S3 resume

This patch is a follow on to https://lkml.org/lkml/2011/3/21/239, which
was merged as commit 51a63e67da6056c13b5b597dcc9e1b3bd7ceaa55.

This patch adds support for S3, as pointed out by Chris Wright.

Signed-off-by: Joseph Cihula <joseph.cihula@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index d552d2c7784..4e4e0202e59 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -141,6 +141,12 @@ static struct intel_iommu **g_iommus;
 static void __init check_tylersburg_isoch(void);
 static int rwbf_quirk;
 
+/*
+ * set to 1 to panic kernel if can't successfully enable VT-d
+ * (used when kernel is launched w/ TXT)
+ */
+static int force_on = 0;
+
 /*
  * 0: Present
  * 1-11: Reserved
@@ -2217,7 +2223,7 @@ static int __init iommu_prepare_static_identity_mapping(int hw)
 	return 0;
 }
 
-static int __init init_dmars(int force_on)
+static int __init init_dmars(void)
 {
 	struct dmar_drhd_unit *drhd;
 	struct dmar_rmrr_unit *rmrr;
@@ -3117,7 +3123,17 @@ static int init_iommu_hw(void)
 		if (iommu->qi)
 			dmar_reenable_qi(iommu);
 
-	for_each_active_iommu(iommu, drhd) {
+	for_each_iommu(iommu, drhd) {
+		if (drhd->ignored) {
+			/*
+			 * we always have to disable PMRs or DMA may fail on
+			 * this device
+			 */
+			if (force_on)
+				iommu_disable_protect_mem_regions(iommu);
+			continue;
+		}
+	
 		iommu_flush_write_buffer(iommu);
 
 		iommu_set_root_entry(iommu);
@@ -3126,7 +3142,8 @@ static int init_iommu_hw(void)
 					   DMA_CCMD_GLOBAL_INVL);
 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
 					 DMA_TLB_GLOBAL_FLUSH);
-		iommu_enable_translation(iommu);
+		if (iommu_enable_translation(iommu))
+			return 1;
 		iommu_disable_protect_mem_regions(iommu);
 	}
 
@@ -3193,7 +3210,10 @@ static void iommu_resume(void)
 	unsigned long flag;
 
 	if (init_iommu_hw()) {
-		WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
+		if (force_on)
+			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
+		else
+			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
 		return;
 	}
 
@@ -3270,7 +3290,6 @@ static struct notifier_block device_nb = {
 int __init intel_iommu_init(void)
 {
 	int ret = 0;
-	int force_on = 0;
 
 	/* VT-d is required for a TXT/tboot launch, so enforce that */
 	force_on = tboot_force_iommu();
@@ -3308,7 +3327,7 @@ int __init intel_iommu_init(void)
 
 	init_no_remapping_devices();
 
-	ret = init_dmars(force_on);
+	ret = init_dmars();
 	if (ret) {
 		if (force_on)
 			panic("tboot: Failed to initialize DMARs\n");
-- 
cgit v1.2.3-70-g09d2


From b3a530e4e734cab7043a3ab7d135f539042443a7 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 15 May 2011 12:34:55 +0200
Subject: intel-iommu: Remove obsolete comment from detect_intel_iommu

Since cacd4213d8, this comment no longer applies.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/dmar.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 12e02bf92c4..3dc9befa5ae 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -698,12 +698,7 @@ int __init detect_intel_iommu(void)
 	{
 #ifdef CONFIG_INTR_REMAP
 		struct acpi_table_dmar *dmar;
-		/*
-		 * for now we will disable dma-remapping when interrupt
-		 * remapping is enabled.
-		 * When support for queued invalidation for IOTLB invalidation
-		 * is added, we will not need this any more.
-		 */
+
 		dmar = (struct acpi_table_dmar *) dmar_tbl;
 		if (ret && cpu_has_x2apic && dmar->flags & 0x1)
 			printk(KERN_INFO
-- 
cgit v1.2.3-70-g09d2


From 7b668357810ecb5fdda4418689d50f5d95aea6a8 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Tue, 24 May 2011 12:02:41 +0100
Subject: intel-iommu: Flush unmaps at domain_exit

We typically batch unmaps to be lazily flushed out at
regular intervals.  When we destroy a domain, we need
to force a flush of these lazy unmaps to be sure none
reference the domain we're about to free.

Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=35062
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: stable@kernel.org
---
 drivers/pci/intel-iommu.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 4e4e0202e59..395f253c049 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1422,6 +1422,10 @@ static void domain_exit(struct dmar_domain *domain)
 	if (!domain)
 		return;
 
+	/* Flush any lazy unmaps that may reference this domain */
+	if (!intel_iommu_strict)
+		flush_unmaps_timeout(0);
+
 	domain_remove_dev_info(domain);
 	/* destroy iovas */
 	put_iova_domain(&domain->iovad);
-- 
cgit v1.2.3-70-g09d2


From 7ccafc5f75c87853f3c49845d5a884f2376e03ce Mon Sep 17 00:00:00 2001
From: Kees Cook <kees.cook@canonical.com>
Date: Tue, 24 May 2011 16:29:26 -0700
Subject: x86, cpufeature: Update CPU feature RDRND to RDRAND

The Intel manual changed the name of the CPUID bit to match the
instruction name. We should follow suit for sanity's sake. (See Intel SDM
Volume 2, Table 3-20 "Feature Information Returned in the ECX Register".)

[ hpa: we can only do this at this time because there are currently no CPUs
  with this feature on the market, hence this is pre-hardware enabling.
  However, Cc:'ing stable so that stable can present a consistent ABI. ]

Signed-off-by: Kees Cook <kees.cook@canonical.com>
Link: http://lkml.kernel.org/r/20110524232926.GA27728@outflux.net
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: <stable@kernel.org> v2.6.36-39
---
 arch/x86/include/asm/cpufeature.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 5dc6acc98db..71cc3800712 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -125,7 +125,7 @@
 #define X86_FEATURE_OSXSAVE	(4*32+27) /* "" XSAVE enabled in the OS */
 #define X86_FEATURE_AVX		(4*32+28) /* Advanced Vector Extensions */
 #define X86_FEATURE_F16C	(4*32+29) /* 16-bit fp conversions */
-#define X86_FEATURE_RDRND	(4*32+30) /* The RDRAND instruction */
+#define X86_FEATURE_RDRAND	(4*32+30) /* The RDRAND instruction */
 #define X86_FEATURE_HYPERVISOR	(4*32+31) /* Running on a hypervisor */
 
 /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
-- 
cgit v1.2.3-70-g09d2


From 1c7fcbed1adee44708dafb65ac40b186c203d7e8 Mon Sep 17 00:00:00 2001
From: Damian <dhobsong@igel.co.jp>
Date: Tue, 24 May 2011 07:01:22 +0000
Subject: sh_mobile_meram: MERAM platform data for LCDC

Based on the patch by Takanari Hayama <taki@igel.co.jp>

Add the necessary platform data to add MERAM functionality to LCDC

Includes platform data for both the AP4EVB and mackerel

Signed-off-by: Damian Hobson-Garcia <dhobsong@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/arm/mach-shmobile/board-ap4evb.c   | 56 +++++++++++++++++++++++++++++++
 arch/arm/mach-shmobile/board-mackerel.c | 58 +++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+)

diff --git a/arch/arm/mach-shmobile/board-ap4evb.c b/arch/arm/mach-shmobile/board-ap4evb.c
index 1e35fa976d6..c9aad5eefcc 100644
--- a/arch/arm/mach-shmobile/board-ap4evb.c
+++ b/arch/arm/mach-shmobile/board-ap4evb.c
@@ -249,6 +249,29 @@ static int slot_cn7_get_cd(struct platform_device *pdev)
 {
 	return !gpio_get_value(GPIO_PORT41);
 }
+/* MERAM */
+static struct sh_mobile_meram_info meram_info = {
+	.addr_mode      = SH_MOBILE_MERAM_MODE1,
+};
+
+static struct resource meram_resources[] = {
+	[0] = {
+		.name   = "MERAM",
+		.start  = 0xe8000000,
+		.end    = 0xe81fffff,
+		.flags  = IORESOURCE_MEM,
+	},
+};
+
+static struct platform_device meram_device = {
+	.name           = "sh_mobile_meram",
+	.id             = 0,
+	.num_resources  = ARRAY_SIZE(meram_resources),
+	.resource       = meram_resources,
+	.dev            = {
+		.platform_data = &meram_info,
+	},
+};
 
 /* SH_MMCIF */
 static struct resource sh_mmcif_resources[] = {
@@ -431,13 +454,29 @@ const static struct fb_videomode ap4evb_lcdc_modes[] = {
 #endif
 	},
 };
+static struct sh_mobile_meram_cfg lcd_meram_cfg = {
+	.icb[0] = {
+		.marker_icb     = 28,
+		.cache_icb      = 24,
+		.meram_offset   = 0x0,
+		.meram_size     = 0x40,
+	},
+	.icb[1] = {
+		.marker_icb     = 29,
+		.cache_icb      = 25,
+		.meram_offset   = 0x40,
+		.meram_size     = 0x40,
+	},
+};
 
 static struct sh_mobile_lcdc_info lcdc_info = {
+	.meram_dev = &meram_info,
 	.ch[0] = {
 		.chan = LCDC_CHAN_MAINLCD,
 		.bpp = 16,
 		.lcd_cfg = ap4evb_lcdc_modes,
 		.num_cfg = ARRAY_SIZE(ap4evb_lcdc_modes),
+		.meram_cfg = &lcd_meram_cfg,
 	}
 };
 
@@ -708,15 +747,31 @@ static struct platform_device fsi_device = {
 static struct platform_device fsi_ak4643_device = {
 	.name		= "sh_fsi2_a_ak4643",
 };
+static struct sh_mobile_meram_cfg hdmi_meram_cfg = {
+	.icb[0] = {
+		.marker_icb     = 30,
+		.cache_icb      = 26,
+		.meram_offset   = 0x80,
+		.meram_size     = 0x100,
+	},
+	.icb[1] = {
+		.marker_icb     = 31,
+		.cache_icb      = 27,
+		.meram_offset   = 0x180,
+		.meram_size     = 0x100,
+	},
+};
 
 static struct sh_mobile_lcdc_info sh_mobile_lcdc1_info = {
 	.clock_source = LCDC_CLK_EXTERNAL,
+	.meram_dev = &meram_info,
 	.ch[0] = {
 		.chan = LCDC_CHAN_MAINLCD,
 		.bpp = 16,
 		.interface_type = RGB24,
 		.clock_divider = 1,
 		.flags = LCDC_FLAGS_DWPOL,
+		.meram_cfg = &hdmi_meram_cfg,
 	}
 };
 
@@ -945,6 +1000,7 @@ static struct platform_device *ap4evb_devices[] __initdata = {
 	&csi2_device,
 	&ceu_device,
 	&ap4evb_camera,
+	&meram_device,
 };
 
 static void __init hdmi_init_pm_clock(void)
diff --git a/arch/arm/mach-shmobile/board-mackerel.c b/arch/arm/mach-shmobile/board-mackerel.c
index 7da2ca24229..a165a9e70ee 100644
--- a/arch/arm/mach-shmobile/board-mackerel.c
+++ b/arch/arm/mach-shmobile/board-mackerel.c
@@ -279,6 +279,30 @@ static struct platform_device smc911x_device = {
 	},
 };
 
+/* MERAM */
+static struct sh_mobile_meram_info mackerel_meram_info = {
+	.addr_mode	= SH_MOBILE_MERAM_MODE1,
+};
+
+static struct resource meram_resources[] = {
+	[0] = {
+		.name	= "MERAM",
+		.start	= 0xe8000000,
+		.end	= 0xe81fffff,
+		.flags	= IORESOURCE_MEM,
+	},
+};
+
+static struct platform_device meram_device = {
+	.name		= "sh_mobile_meram",
+	.id		= 0,
+	.num_resources	= ARRAY_SIZE(meram_resources),
+	.resource	= meram_resources,
+	.dev		= {
+		.platform_data = &mackerel_meram_info,
+	},
+};
+
 /* LCDC */
 static struct fb_videomode mackerel_lcdc_modes[] = {
 	{
@@ -307,7 +331,23 @@ static int mackerel_get_brightness(void *board_data)
 	return gpio_get_value(GPIO_PORT31);
 }
 
+static struct sh_mobile_meram_cfg lcd_meram_cfg = {
+	.icb[0] = {
+		.marker_icb     = 28,
+		.cache_icb      = 24,
+		.meram_offset   = 0x0,
+		.meram_size     = 0x40,
+	},
+	.icb[1] = {
+		.marker_icb     = 29,
+		.cache_icb      = 25,
+		.meram_offset   = 0x40,
+		.meram_size     = 0x40,
+	},
+};
+
 static struct sh_mobile_lcdc_info lcdc_info = {
+	.meram_dev = &mackerel_meram_info,
 	.clock_source = LCDC_CLK_BUS,
 	.ch[0] = {
 		.chan = LCDC_CHAN_MAINLCD,
@@ -327,6 +367,7 @@ static struct sh_mobile_lcdc_info lcdc_info = {
 			.name = "sh_mobile_lcdc_bl",
 			.max_brightness = 1,
 		},
+		.meram_cfg = &lcd_meram_cfg,
 	}
 };
 
@@ -353,8 +394,23 @@ static struct platform_device lcdc_device = {
 	},
 };
 
+static struct sh_mobile_meram_cfg hdmi_meram_cfg = {
+	.icb[0] = {
+		.marker_icb     = 30,
+		.cache_icb      = 26,
+		.meram_offset   = 0x80,
+		.meram_size     = 0x100,
+	},
+	.icb[1] = {
+		.marker_icb     = 31,
+		.cache_icb      = 27,
+		.meram_offset   = 0x180,
+		.meram_size     = 0x100,
+	},
+};
 /* HDMI */
 static struct sh_mobile_lcdc_info hdmi_lcdc_info = {
+	.meram_dev = &mackerel_meram_info,
 	.clock_source = LCDC_CLK_EXTERNAL,
 	.ch[0] = {
 		.chan = LCDC_CHAN_MAINLCD,
@@ -362,6 +418,7 @@ static struct sh_mobile_lcdc_info hdmi_lcdc_info = {
 		.interface_type = RGB24,
 		.clock_divider = 1,
 		.flags = LCDC_FLAGS_DWPOL,
+		.meram_cfg = &hdmi_meram_cfg,
 	}
 };
 
@@ -949,6 +1006,7 @@ static struct platform_device *mackerel_devices[] __initdata = {
 	&mackerel_camera,
 	&hdmi_lcdc_device,
 	&hdmi_device,
+	&meram_device,
 };
 
 /* Keypad Initialization */
-- 
cgit v1.2.3-70-g09d2


From 54525552c6ccfd867e819845da14be994e303218 Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Date: Tue, 24 May 2011 10:23:59 +0000
Subject: sh: mark DMA slave ID 0 as invalid

This makes it possible to leave DMA slave IDs in the platform data
at default 0 value without hitting DMA channel allocation error paths.

Signed-off-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/cpu-sh4/cpu/sh7722.h | 1 +
 arch/sh/include/cpu-sh4/cpu/sh7724.h | 1 +
 arch/sh/include/cpu-sh4/cpu/sh7757.h | 1 +
 3 files changed, 3 insertions(+)

diff --git a/arch/sh/include/cpu-sh4/cpu/sh7722.h b/arch/sh/include/cpu-sh4/cpu/sh7722.h
index 7a5b8a331b4..bd0622788d6 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7722.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7722.h
@@ -236,6 +236,7 @@ enum {
 };
 
 enum {
+	SHDMA_SLAVE_INVALID,
 	SHDMA_SLAVE_SCIF0_TX,
 	SHDMA_SLAVE_SCIF0_RX,
 	SHDMA_SLAVE_SCIF1_TX,
diff --git a/arch/sh/include/cpu-sh4/cpu/sh7724.h b/arch/sh/include/cpu-sh4/cpu/sh7724.h
index 7eb43599942..3daef8ecbc6 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7724.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7724.h
@@ -285,6 +285,7 @@ enum {
 };
 
 enum {
+	SHDMA_SLAVE_INVALID,
 	SHDMA_SLAVE_SCIF0_TX,
 	SHDMA_SLAVE_SCIF0_RX,
 	SHDMA_SLAVE_SCIF1_TX,
diff --git a/arch/sh/include/cpu-sh4/cpu/sh7757.h b/arch/sh/include/cpu-sh4/cpu/sh7757.h
index 05b8196c775..41f9f8b9db7 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7757.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7757.h
@@ -252,6 +252,7 @@ enum {
 };
 
 enum {
+	SHDMA_SLAVE_INVALID,
 	SHDMA_SLAVE_SDHI_TX,
 	SHDMA_SLAVE_SDHI_RX,
 	SHDMA_SLAVE_MMCIF_TX,
-- 
cgit v1.2.3-70-g09d2


From 2a919596c16b4333af851ff473ebf96e289ab90c Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Wed, 11 May 2011 12:50:28 -0500
Subject: x86, UV: Add support for SGI UV2 hub chip

This patch adds support for a new version of the SGI UV hub
chip. The hub chip is the node controller that connects multiple
blades into a larger coherent SSI.

For the most part, UV2 is compatible with UV1. The majority of
the changes are in the addresses of MMRs and in a few cases, the
contents of MMRs. These changes are the result in changes in the
system topology such as node configuration, processor types,
maximum nodes, physical address sizes, etc.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Link: http://lkml.kernel.org/r/20110511175028.GA18006@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h   |   42 +-
 arch/x86/include/asm/uv/uv_hub.h   |   71 ++-
 arch/x86/include/asm/uv/uv_mmrs.h  | 1012 +++++++++++++++++++++++++++++-------
 arch/x86/kernel/apic/x2apic_uv_x.c |   40 +-
 arch/x86/platform/uv/tlb_uv.c      |  132 ++++-
 arch/x86/platform/uv/uv_time.c     |   16 +-
 6 files changed, 1075 insertions(+), 238 deletions(-)

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 130f1eeee5f..0652a5a9fd6 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -44,7 +44,10 @@
 #define UV_ACT_STATUS_SIZE		2
 #define UV_DISTRIBUTION_SIZE		256
 #define UV_SW_ACK_NPENDING		8
-#define UV_NET_ENDPOINT_INTD		0x38
+#define UV1_NET_ENDPOINT_INTD		0x38
+#define UV2_NET_ENDPOINT_INTD		0x28
+#define UV_NET_ENDPOINT_INTD		(is_uv1_hub() ?			\
+			UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD)
 #define UV_DESC_BASE_PNODE_SHIFT	49
 #define UV_PAYLOADQ_PNODE_SHIFT		49
 #define UV_PTC_BASENAME			"sgi_uv/ptc_statistics"
@@ -53,10 +56,22 @@
 #define UV_BAU_TUNABLES_FILE		"bau_tunables"
 #define WHITESPACE			" \t\n"
 #define uv_physnodeaddr(x)		((__pa((unsigned long)(x)) & uv_mmask))
-#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL
+
+
 /* [19:16] SOFT_ACK timeout period  19: 1 is urgency 7  17:16 1 is multiplier */
+/*
+ * UV2: Bit 19 selects between
+ *  (0): 10 microsecond timebase and
+ *  (1): 80 microseconds
+ *  we're using 655us, similar to UV1: 65 units of 10us
+ */
+#define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL)
+#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (65*10UL)
+
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD	(is_uv1_hub() ?			\
+		UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD :			\
+		UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD)
+
 #define BAU_MISC_CONTROL_MULT_MASK 3
 
 #define UVH_AGING_PRESCALE_SEL 0x000000b000UL
@@ -76,6 +91,16 @@
 #define DESC_STATUS_ACTIVE		1
 #define DESC_STATUS_DESTINATION_TIMEOUT	2
 #define DESC_STATUS_SOURCE_TIMEOUT	3
+/*
+ * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2
+ * values 1 and 5 will not occur
+ */
+#define UV2H_DESC_IDLE			0
+#define UV2H_DESC_DEST_TIMEOUT		2
+#define UV2H_DESC_DEST_STRONG_NACK	3
+#define UV2H_DESC_BUSY			4
+#define UV2H_DESC_SOURCE_TIMEOUT	6
+#define UV2H_DESC_DEST_PUT_ERR		7
 
 /*
  * delay for 'plugged' timeout retries, in microseconds
@@ -96,6 +121,15 @@
 
 #define UV_LB_SUBNODEID 0x10
 
+/* these two are the same for UV1 and UV2: */
+#define UV_SA_SHFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
+#define UV_SA_MASK UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK
+/* 4 bits of software ack period */
+#define UV2_ACK_MASK 0x7UL
+#define UV2_ACK_UNITS_SHFT 3
+#define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT
+#define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
+
 /*
  * number of entries in the destination side payload queue
  */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 4298002d0c8..f26544a1521 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -77,8 +77,9 @@
  *
  *		1111110000000000
  *		5432109876543210
- *		pppppppppplc0cch	Nehalem-EX
- *		ppppppppplcc0cch	Westmere-EX
+ *		pppppppppplc0cch	Nehalem-EX (12 bits in hdw reg)
+ *		ppppppppplcc0cch	Westmere-EX (12 bits in hdw reg)
+ *		pppppppppppcccch	SandyBridge (15 bits in hdw reg)
  *		sssssssssss
  *
  *			p  = pnode bits
@@ -87,7 +88,7 @@
  *			h  = hyperthread
  *			s  = bits that are in the SOCKET_ID CSR
  *
- *	Note: Processor only supports 12 bits in the APICID register. The ACPI
+ *	Note: Processor may support fewer bits in the APICID register. The ACPI
  *	      tables hold all 16 bits. Software needs to be aware of this.
  *
  *	      Unless otherwise specified, all references to APICID refer to
@@ -138,6 +139,8 @@ struct uv_hub_info_s {
 	unsigned long		global_mmr_base;
 	unsigned long		gpa_mask;
 	unsigned int		gnode_extra;
+	unsigned char		hub_revision;
+	unsigned char		apic_pnode_shift;
 	unsigned long		gnode_upper;
 	unsigned long		lowmem_remap_top;
 	unsigned long		lowmem_remap_base;
@@ -149,13 +152,31 @@ struct uv_hub_info_s {
 	unsigned char		m_val;
 	unsigned char		n_val;
 	struct uv_scir_s	scir;
-	unsigned char		apic_pnode_shift;
 };
 
 DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define uv_hub_info		(&__get_cpu_var(__uv_hub_info))
 #define uv_cpu_hub_info(cpu)	(&per_cpu(__uv_hub_info, cpu))
 
+/*
+ * Hub revisions less than UV2_HUB_REVISION_BASE are UV1 hubs. All UV2
+ * hubs have revision numbers greater than or equal to UV2_HUB_REVISION_BASE.
+ * This is a software convention - NOT the hardware revision numbers in
+ * the hub chip.
+ */
+#define UV1_HUB_REVISION_BASE		1
+#define UV2_HUB_REVISION_BASE		3
+
+static inline int is_uv1_hub(void)
+{
+	return uv_hub_info->hub_revision < UV2_HUB_REVISION_BASE;
+}
+
+static inline int is_uv2_hub(void)
+{
+	return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE;
+}
+
 union uvh_apicid {
     unsigned long       v;
     struct uvh_apicid_s {
@@ -180,11 +201,25 @@ union uvh_apicid {
 #define UV_PNODE_TO_GNODE(p)		((p) |uv_hub_info->gnode_extra)
 #define UV_PNODE_TO_NASID(p)		(UV_PNODE_TO_GNODE(p) << 1)
 
-#define UV_LOCAL_MMR_BASE		0xf4000000UL
-#define UV_GLOBAL_MMR32_BASE		0xf8000000UL
+#define UV1_LOCAL_MMR_BASE		0xf4000000UL
+#define UV1_GLOBAL_MMR32_BASE		0xf8000000UL
+#define UV1_LOCAL_MMR_SIZE		(64UL * 1024 * 1024)
+#define UV1_GLOBAL_MMR32_SIZE		(64UL * 1024 * 1024)
+
+#define UV2_LOCAL_MMR_BASE		0xfa000000UL
+#define UV2_GLOBAL_MMR32_BASE		0xfc000000UL
+#define UV2_LOCAL_MMR_SIZE		(32UL * 1024 * 1024)
+#define UV2_GLOBAL_MMR32_SIZE		(32UL * 1024 * 1024)
+
+#define UV_LOCAL_MMR_BASE		(is_uv1_hub() ? UV1_LOCAL_MMR_BASE     \
+						: UV2_LOCAL_MMR_BASE)
+#define UV_GLOBAL_MMR32_BASE		(is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE  \
+						: UV2_GLOBAL_MMR32_BASE)
+#define UV_LOCAL_MMR_SIZE		(is_uv1_hub() ? UV1_LOCAL_MMR_SIZE :   \
+						UV2_LOCAL_MMR_SIZE)
+#define UV_GLOBAL_MMR32_SIZE		(is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE :\
+						UV2_GLOBAL_MMR32_SIZE)
 #define UV_GLOBAL_MMR64_BASE		(uv_hub_info->global_mmr_base)
-#define UV_LOCAL_MMR_SIZE		(64UL * 1024 * 1024)
-#define UV_GLOBAL_MMR32_SIZE		(64UL * 1024 * 1024)
 
 #define UV_GLOBAL_GRU_MMR_BASE		0x4000000
 
@@ -300,6 +335,17 @@ static inline int uv_apicid_to_pnode(int apicid)
 	return (apicid >> uv_hub_info->apic_pnode_shift);
 }
 
+/*
+ * Convert an apicid to the socket number on the blade
+ */
+static inline int uv_apicid_to_socket(int apicid)
+{
+	if (is_uv1_hub())
+		return (apicid >> (uv_hub_info->apic_pnode_shift - 1)) & 1;
+	else
+		return 0;
+}
+
 /*
  * Access global MMRs using the low memory MMR32 space. This region supports
  * faster MMR access but not all MMRs are accessible in this space.
@@ -519,14 +565,13 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
 
 /*
  * Get the minimum revision number of the hub chips within the partition.
- *     1 - initial rev 1.0 silicon
- *     2 - rev 2.0 production silicon
+ *     1 - UV1 rev 1.0 initial silicon
+ *     2 - UV1 rev 2.0 production silicon
+ *     3 - UV2 rev 1.0 initial silicon
  */
 static inline int uv_get_min_hub_revision_id(void)
 {
-	extern int uv_min_hub_revision_id;
-
-	return uv_min_hub_revision_id;
+	return uv_hub_info->hub_revision;
 }
 
 #endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index f5bb64a823d..4be52c86344 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -11,13 +11,64 @@
 #ifndef _ASM_X86_UV_UV_MMRS_H
 #define _ASM_X86_UV_UV_MMRS_H
 
+/*
+ * This file contains MMR definitions for both UV1 & UV2 hubs.
+ *
+ * In general, MMR addresses and structures are identical on both hubs.
+ * These MMRs are identified as:
+ *	#define UVH_xxx		<address>
+ *	union uvh_xxx {
+ *		unsigned long       v;
+ *		struct uvh_int_cmpd_s {
+ *		} s;
+ *	};
+ *
+ * If the MMR exists on both hub type but has different addresses or
+ * contents, the MMR definition is similar to:
+ *	#define UV1H_xxx	<uv1 address>
+ *	#define UV2H_xxx	<uv2address>
+ *	#define UVH_xxx		(is_uv1_hub() ? UV1H_xxx : UV2H_xxx)
+ *	union uvh_xxx {
+ *		unsigned long       v;
+ *		struct uv1h_int_cmpd_s {	 (Common fields only)
+ *		} s;
+ *		struct uv1h_int_cmpd_s {	 (Full UV1 definition)
+ *		} s1;
+ *		struct uv2h_int_cmpd_s {	 (Full UV2 definition)
+ *		} s2;
+ *	};
+ *
+ * Only essential difference are enumerated. For example, if the address is
+ * the same for both UV1 & UV2, only a single #define is generated. Likewise,
+ * if the contents is the same for both hubs, only the "s" structure is
+ * generated.
+ *
+ * If the MMR exists on ONLY 1 type of hub, no generic definition is
+ * generated:
+ *	#define UVnH_xxx	<uvn address>
+ *	union uvnh_xxx {
+ *		unsigned long       v;
+ *		struct uvh_int_cmpd_s {
+ *		} sn;
+ *	};
+ */
+
 #define UV_MMR_ENABLE		(1UL << 63)
 
+#define UV1_HUB_PART_NUMBER	0x88a5
+#define UV2_HUB_PART_NUMBER	0x8eb8
+
+/* Compat: if this #define is present, UV headers support UV2 */
+#define UV2_HUB_IS_SUPPORTED	1
+
+/* KABI compat: if this #define is present, KABI hacks are present */
+#define UV2_HUB_KABI_HACKS	1
+
 /* ========================================================================= */
 /*                          UVH_BAU_DATA_BROADCAST                           */
 /* ========================================================================= */
 #define UVH_BAU_DATA_BROADCAST 0x61688UL
-#define UVH_BAU_DATA_BROADCAST_32 0x0440
+#define UVH_BAU_DATA_BROADCAST_32 0x440
 
 #define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
 #define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
@@ -34,7 +85,7 @@ union uvh_bau_data_broadcast_u {
 /*                           UVH_BAU_DATA_CONFIG                             */
 /* ========================================================================= */
 #define UVH_BAU_DATA_CONFIG 0x61680UL
-#define UVH_BAU_DATA_CONFIG_32 0x0438
+#define UVH_BAU_DATA_CONFIG_32 0x438
 
 #define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
 #define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
@@ -73,125 +124,245 @@ union uvh_bau_data_config_u {
 /*                           UVH_EVENT_OCCURRED0                             */
 /* ========================================================================= */
 #define UVH_EVENT_OCCURRED0 0x70000UL
-#define UVH_EVENT_OCCURRED0_32 0x005e8
-
-#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0
-#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
-#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
-#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
-#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
-#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
-#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3
-#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
-#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4
-#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
-#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5
-#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
-#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6
-#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
-#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
-#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
-#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
-#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
-#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
-#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
-#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
-#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
-#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
-#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
-#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
-#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
-#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
-#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
-#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
-#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
-#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
-#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
-#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
-#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
-#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
-#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
-#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
-#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
-#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
-#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
-#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
-#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
-#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
-#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
-#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
-#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
-#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
-#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
-#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
-#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
-#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
-#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
-#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
-#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
-#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43
-#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
-#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
-#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
-#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45
-#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
-#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
-#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
-#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
-#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
-#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
-#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51
-#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52
-#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53
-#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54
-#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
-#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55
-#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
-#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
-#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
+#define UVH_EVENT_OCCURRED0_32 0x5e8
+
+#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
+#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
+#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
+#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
+#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
+#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
+#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3
+#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
+#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4
+#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
+#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5
+#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
+#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6
+#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
+#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
+#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
+#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
+#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
+#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
+#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
+#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
+#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
+#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
+#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
+#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
+#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
+#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
+#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
+#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
+#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
+#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
+#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
+#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
+#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
+#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
+#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
+#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
+#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
+#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
+#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
+#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
+#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
+#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
+#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
+#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
+#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
+#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43
+#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
+#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
+#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
+#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45
+#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
+#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
+#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51
+#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52
+#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53
+#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54
+#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
+#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55
+#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
+#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
+#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
+
+#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
+#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
+#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1
+#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL
+#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT 2
+#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL
+#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT 3
+#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL
+#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT 4
+#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL
+#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT 5
+#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL
+#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT 6
+#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL
+#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT 7
+#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL
+#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT 8
+#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL
+#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT 9
+#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL
+#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10
+#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL
+#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
+#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL
+#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT 16
+#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL
+#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17
+#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL
+#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20
+#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL
+#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21
+#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL
+#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22
+#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL
+#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27
+#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL
+#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28
+#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL
+#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31
+#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL
+#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48
+#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL
+#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49
+#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL
+#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50
+#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL
+#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51
+#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL
+#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52
+#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL
+#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53
+#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL
+#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58
+#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL
+
 union uvh_event_occurred0_u {
     unsigned long	v;
-    struct uvh_event_occurred0_s {
+    struct uv1h_event_occurred0_s {
 	unsigned long	lb_hcerr             :  1;  /* RW, W1C */
 	unsigned long	gr0_hcerr            :  1;  /* RW, W1C */
 	unsigned long	gr1_hcerr            :  1;  /* RW, W1C */
@@ -250,14 +421,76 @@ union uvh_event_occurred0_u {
 	unsigned long	bau_data             :  1;  /* RW, W1C */
 	unsigned long	power_management_req :  1;  /* RW, W1C */
 	unsigned long	rsvd_57_63           :  7;  /*    */
-    } s;
+    } s1;
+    struct uv2h_event_occurred0_s {
+	unsigned long	lb_hcerr            :  1;  /* RW */
+	unsigned long	qp_hcerr            :  1;  /* RW */
+	unsigned long	rh_hcerr            :  1;  /* RW */
+	unsigned long	lh0_hcerr           :  1;  /* RW */
+	unsigned long	lh1_hcerr           :  1;  /* RW */
+	unsigned long	gr0_hcerr           :  1;  /* RW */
+	unsigned long	gr1_hcerr           :  1;  /* RW */
+	unsigned long	ni0_hcerr           :  1;  /* RW */
+	unsigned long	ni1_hcerr           :  1;  /* RW */
+	unsigned long	lb_aoerr0           :  1;  /* RW */
+	unsigned long	qp_aoerr0           :  1;  /* RW */
+	unsigned long	rh_aoerr0           :  1;  /* RW */
+	unsigned long	lh0_aoerr0          :  1;  /* RW */
+	unsigned long	lh1_aoerr0          :  1;  /* RW */
+	unsigned long	gr0_aoerr0          :  1;  /* RW */
+	unsigned long	gr1_aoerr0          :  1;  /* RW */
+	unsigned long	xb_aoerr0           :  1;  /* RW */
+	unsigned long	rt_aoerr0           :  1;  /* RW */
+	unsigned long	ni0_aoerr0          :  1;  /* RW */
+	unsigned long	ni1_aoerr0          :  1;  /* RW */
+	unsigned long	lb_aoerr1           :  1;  /* RW */
+	unsigned long	qp_aoerr1           :  1;  /* RW */
+	unsigned long	rh_aoerr1           :  1;  /* RW */
+	unsigned long	lh0_aoerr1          :  1;  /* RW */
+	unsigned long	lh1_aoerr1          :  1;  /* RW */
+	unsigned long	gr0_aoerr1          :  1;  /* RW */
+	unsigned long	gr1_aoerr1          :  1;  /* RW */
+	unsigned long	xb_aoerr1           :  1;  /* RW */
+	unsigned long	rt_aoerr1           :  1;  /* RW */
+	unsigned long	ni0_aoerr1          :  1;  /* RW */
+	unsigned long	ni1_aoerr1          :  1;  /* RW */
+	unsigned long	system_shutdown_int :  1;  /* RW */
+	unsigned long	lb_irq_int_0        :  1;  /* RW */
+	unsigned long	lb_irq_int_1        :  1;  /* RW */
+	unsigned long	lb_irq_int_2        :  1;  /* RW */
+	unsigned long	lb_irq_int_3        :  1;  /* RW */
+	unsigned long	lb_irq_int_4        :  1;  /* RW */
+	unsigned long	lb_irq_int_5        :  1;  /* RW */
+	unsigned long	lb_irq_int_6        :  1;  /* RW */
+	unsigned long	lb_irq_int_7        :  1;  /* RW */
+	unsigned long	lb_irq_int_8        :  1;  /* RW */
+	unsigned long	lb_irq_int_9        :  1;  /* RW */
+	unsigned long	lb_irq_int_10       :  1;  /* RW */
+	unsigned long	lb_irq_int_11       :  1;  /* RW */
+	unsigned long	lb_irq_int_12       :  1;  /* RW */
+	unsigned long	lb_irq_int_13       :  1;  /* RW */
+	unsigned long	lb_irq_int_14       :  1;  /* RW */
+	unsigned long	lb_irq_int_15       :  1;  /* RW */
+	unsigned long	l1_nmi_int          :  1;  /* RW */
+	unsigned long	stop_clock          :  1;  /* RW */
+	unsigned long	asic_to_l1          :  1;  /* RW */
+	unsigned long	l1_to_asic          :  1;  /* RW */
+	unsigned long	la_seq_trigger      :  1;  /* RW */
+	unsigned long	ipi_int             :  1;  /* RW */
+	unsigned long	extio_int0          :  1;  /* RW */
+	unsigned long	extio_int1          :  1;  /* RW */
+	unsigned long	extio_int2          :  1;  /* RW */
+	unsigned long	extio_int3          :  1;  /* RW */
+	unsigned long	profile_int         :  1;  /* RW */
+	unsigned long	rsvd_59_63          :  5;  /*    */
+    } s2;
 };
 
 /* ========================================================================= */
 /*                        UVH_EVENT_OCCURRED0_ALIAS                          */
 /* ========================================================================= */
 #define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
-#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0
+#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0
 
 /* ========================================================================= */
 /*                         UVH_GR0_TLB_INT0_CONFIG                           */
@@ -432,8 +665,16 @@ union uvh_int_cmpb_u {
 /* ========================================================================= */
 #define UVH_INT_CMPC 0x22100UL
 
-#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
-#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL
+#define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT	0
+#define UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT	0
+#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT	(is_uv1_hub() ?		\
+			UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT :	\
+			UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT)
+#define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK	0xffffffffffffffUL
+#define UV2H_INT_CMPC_REAL_TIME_CMPC_MASK	0xffffffffffffffUL
+#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK	(is_uv1_hub() ?		\
+			UV1H_INT_CMPC_REAL_TIME_CMPC_MASK :	\
+			UV2H_INT_CMPC_REAL_TIME_CMPC_MASK)
 
 union uvh_int_cmpc_u {
     unsigned long	v;
@@ -448,8 +689,16 @@ union uvh_int_cmpc_u {
 /* ========================================================================= */
 #define UVH_INT_CMPD 0x22180UL
 
-#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
-#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL
+#define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT	0
+#define UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT	0
+#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT	(is_uv1_hub() ?		\
+			UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT :	\
+			UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT)
+#define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK	0xffffffffffffffUL
+#define UV2H_INT_CMPD_REAL_TIME_CMPD_MASK	0xffffffffffffffUL
+#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK	(is_uv1_hub() ?		\
+			UV1H_INT_CMPD_REAL_TIME_CMPD_MASK :	\
+			UV2H_INT_CMPD_REAL_TIME_CMPD_MASK)
 
 union uvh_int_cmpd_u {
     unsigned long	v;
@@ -463,7 +712,7 @@ union uvh_int_cmpd_u {
 /*                               UVH_IPI_INT                                 */
 /* ========================================================================= */
 #define UVH_IPI_INT 0x60500UL
-#define UVH_IPI_INT_32 0x0348
+#define UVH_IPI_INT_32 0x348
 
 #define UVH_IPI_INT_VECTOR_SHFT 0
 #define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
@@ -493,7 +742,7 @@ union uvh_ipi_int_u {
 /*                   UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST                     */
 /* ========================================================================= */
 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0
 
 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
@@ -515,7 +764,7 @@ union uvh_lb_bau_intd_payload_queue_first_u {
 /*                    UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST                     */
 /* ========================================================================= */
 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8
 
 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
@@ -533,7 +782,7 @@ union uvh_lb_bau_intd_payload_queue_last_u {
 /*                    UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL                     */
 /* ========================================================================= */
 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0
 
 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
@@ -551,7 +800,7 @@ union uvh_lb_bau_intd_payload_queue_tail_u {
 /*                   UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE                    */
 /* ========================================================================= */
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68
 
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
@@ -585,6 +834,7 @@ union uvh_lb_bau_intd_payload_queue_tail_u {
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
+
 union uvh_lb_bau_intd_software_acknowledge_u {
     unsigned long	v;
     struct uvh_lb_bau_intd_software_acknowledge_s {
@@ -612,13 +862,13 @@ union uvh_lb_bau_intd_software_acknowledge_u {
 /*                UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS                 */
 /* ========================================================================= */
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70
 
 /* ========================================================================= */
 /*                         UVH_LB_BAU_MISC_CONTROL                           */
 /* ========================================================================= */
 #define UVH_LB_BAU_MISC_CONTROL 0x320170UL
-#define UVH_LB_BAU_MISC_CONTROL_32 0x00a10
+#define UVH_LB_BAU_MISC_CONTROL_32 0xa10
 
 #define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
 #define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
@@ -628,8 +878,8 @@ union uvh_lb_bau_intd_software_acknowledge_u {
 #define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
 #define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
 #define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
-#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_SHFT 11
-#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
 #define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
 #define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
 #define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
@@ -650,8 +900,86 @@ union uvh_lb_bau_intd_software_acknowledge_u {
 #define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
 #define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
 #define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
-#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48
-#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35
+#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
 
 union uvh_lb_bau_misc_control_u {
     unsigned long	v;
@@ -660,7 +988,25 @@ union uvh_lb_bau_misc_control_u {
 	unsigned long	apic_mode                          :  1;  /* RW */
 	unsigned long	force_broadcast                    :  1;  /* RW */
 	unsigned long	force_lock_nop                     :  1;  /* RW */
-	unsigned long	csi_agent_presence_vector          :  3;  /* RW */
+	unsigned long	qpi_agent_presence_vector          :  3;  /* RW */
+	unsigned long	descriptor_fetch_mode              :  1;  /* RW */
+	unsigned long	enable_intd_soft_ack_mode          :  1;  /* RW */
+	unsigned long	intd_soft_ack_timeout_period       :  4;  /* RW */
+	unsigned long	enable_dual_mapping_mode           :  1;  /* RW */
+	unsigned long	vga_io_port_decode_enable          :  1;  /* RW */
+	unsigned long	vga_io_port_16_bit_decode          :  1;  /* RW */
+	unsigned long	suppress_dest_registration         :  1;  /* RW */
+	unsigned long	programmed_initial_priority        :  3;  /* RW */
+	unsigned long	use_incoming_priority              :  1;  /* RW */
+	unsigned long	enable_programmed_initial_priority :  1;  /* RW */
+	unsigned long	rsvd_29_63    : 35;
+    } s;
+    struct uv1h_lb_bau_misc_control_s {
+	unsigned long	rejection_delay                    :  8;  /* RW */
+	unsigned long	apic_mode                          :  1;  /* RW */
+	unsigned long	force_broadcast                    :  1;  /* RW */
+	unsigned long	force_lock_nop                     :  1;  /* RW */
+	unsigned long	qpi_agent_presence_vector          :  3;  /* RW */
 	unsigned long	descriptor_fetch_mode              :  1;  /* RW */
 	unsigned long	enable_intd_soft_ack_mode          :  1;  /* RW */
 	unsigned long	intd_soft_ack_timeout_period       :  4;  /* RW */
@@ -673,14 +1019,40 @@ union uvh_lb_bau_misc_control_u {
 	unsigned long	enable_programmed_initial_priority :  1;  /* RW */
 	unsigned long	rsvd_29_47                         : 19;  /*    */
 	unsigned long	fun                                : 16;  /* RW */
-    } s;
+    } s1;
+    struct uv2h_lb_bau_misc_control_s {
+	unsigned long	rejection_delay                      :  8;  /* RW */
+	unsigned long	apic_mode                            :  1;  /* RW */
+	unsigned long	force_broadcast                      :  1;  /* RW */
+	unsigned long	force_lock_nop                       :  1;  /* RW */
+	unsigned long	qpi_agent_presence_vector            :  3;  /* RW */
+	unsigned long	descriptor_fetch_mode                :  1;  /* RW */
+	unsigned long	enable_intd_soft_ack_mode            :  1;  /* RW */
+	unsigned long	intd_soft_ack_timeout_period         :  4;  /* RW */
+	unsigned long	enable_dual_mapping_mode             :  1;  /* RW */
+	unsigned long	vga_io_port_decode_enable            :  1;  /* RW */
+	unsigned long	vga_io_port_16_bit_decode            :  1;  /* RW */
+	unsigned long	suppress_dest_registration           :  1;  /* RW */
+	unsigned long	programmed_initial_priority          :  3;  /* RW */
+	unsigned long	use_incoming_priority                :  1;  /* RW */
+	unsigned long	enable_programmed_initial_priority   :  1;  /* RW */
+	unsigned long	enable_automatic_apic_mode_selection :  1;  /* RW */
+	unsigned long	apic_mode_status                     :  1;  /* RO */
+	unsigned long	suppress_interrupts_to_self          :  1;  /* RW */
+	unsigned long	enable_lock_based_system_flush       :  1;  /* RW */
+	unsigned long	enable_extended_sb_status            :  1;  /* RW */
+	unsigned long	suppress_int_prio_udt_to_self        :  1;  /* RW */
+	unsigned long	use_legacy_descriptor_formats        :  1;  /* RW */
+	unsigned long	rsvd_36_47                           : 12;  /*    */
+	unsigned long	fun                                  : 16;  /* RW */
+    } s2;
 };
 
 /* ========================================================================= */
 /*                     UVH_LB_BAU_SB_ACTIVATION_CONTROL                      */
 /* ========================================================================= */
 #define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8
 
 #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
 #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
@@ -703,7 +1075,7 @@ union uvh_lb_bau_sb_activation_control_u {
 /*                    UVH_LB_BAU_SB_ACTIVATION_STATUS_0                      */
 /* ========================================================================= */
 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0
 
 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
@@ -719,7 +1091,7 @@ union uvh_lb_bau_sb_activation_status_0_u {
 /*                    UVH_LB_BAU_SB_ACTIVATION_STATUS_1                      */
 /* ========================================================================= */
 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8
 
 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
@@ -735,7 +1107,7 @@ union uvh_lb_bau_sb_activation_status_1_u {
 /*                      UVH_LB_BAU_SB_DESCRIPTOR_BASE                        */
 /* ========================================================================= */
 #define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0
 
 #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
 #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
@@ -753,23 +1125,6 @@ union uvh_lb_bau_sb_descriptor_base_u {
     } s;
 };
 
-/* ========================================================================= */
-/*                   UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK                     */
-/* ========================================================================= */
-#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
-#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x009f0
-
-#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
-#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
-
-union uvh_lb_target_physical_apic_id_mask_u {
-	unsigned long v;
-	struct uvh_lb_target_physical_apic_id_mask_s {
-		unsigned long bit_enables : 32;  /* RW */
-		unsigned long rsvd_32_63  : 32;  /*    */
-	} s;
-};
-
 /* ========================================================================= */
 /*                               UVH_NODE_ID                                 */
 /* ========================================================================= */
@@ -785,14 +1140,48 @@ union uvh_lb_target_physical_apic_id_mask_u {
 #define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
 #define UVH_NODE_ID_NODE_ID_SHFT 32
 #define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
-#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48
-#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
-#define UVH_NODE_ID_NI_PORT_SHFT 56
-#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
+
+#define UV1H_NODE_ID_FORCE1_SHFT 0
+#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UV1H_NODE_ID_MANUFACTURER_SHFT 1
+#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UV1H_NODE_ID_PART_NUMBER_SHFT 12
+#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UV1H_NODE_ID_REVISION_SHFT 28
+#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UV1H_NODE_ID_NODE_ID_SHFT 32
+#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48
+#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
+#define UV1H_NODE_ID_NI_PORT_SHFT 56
+#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
+
+#define UV2H_NODE_ID_FORCE1_SHFT 0
+#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UV2H_NODE_ID_MANUFACTURER_SHFT 1
+#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UV2H_NODE_ID_PART_NUMBER_SHFT 12
+#define UV2H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UV2H_NODE_ID_REVISION_SHFT 28
+#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UV2H_NODE_ID_NODE_ID_SHFT 32
+#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50
+#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
+#define UV2H_NODE_ID_NI_PORT_SHFT 57
+#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
 
 union uvh_node_id_u {
     unsigned long	v;
     struct uvh_node_id_s {
+	unsigned long	force1        :  1;  /* RO */
+	unsigned long	manufacturer  : 11;  /* RO */
+	unsigned long	part_number   : 16;  /* RO */
+	unsigned long	revision      :  4;  /* RO */
+	unsigned long	node_id       : 15;  /* RW */
+	unsigned long	rsvd_47_63    : 17;
+    } s;
+    struct uv1h_node_id_s {
 	unsigned long	force1        :  1;  /* RO */
 	unsigned long	manufacturer  : 11;  /* RO */
 	unsigned long	part_number   : 16;  /* RO */
@@ -803,7 +1192,18 @@ union uvh_node_id_u {
 	unsigned long	rsvd_55       :  1;  /*    */
 	unsigned long	ni_port       :  4;  /* RO */
 	unsigned long	rsvd_60_63    :  4;  /*    */
-    } s;
+    } s1;
+    struct uv2h_node_id_s {
+	unsigned long	force1        :  1;  /* RO */
+	unsigned long	manufacturer  : 11;  /* RO */
+	unsigned long	part_number   : 16;  /* RO */
+	unsigned long	revision      :  4;  /* RO */
+	unsigned long	node_id       : 15;  /* RW */
+	unsigned long	rsvd_47_49    :  3;  /*    */
+	unsigned long	nodes_per_bit :  7;  /* RO */
+	unsigned long	ni_port       :  5;  /* RO */
+	unsigned long	rsvd_62_63    :  2;  /*    */
+    } s2;
 };
 
 /* ========================================================================= */
@@ -954,18 +1354,38 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
 #define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
 #define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
 #define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
-#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
-#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
+
+#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
+#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
+
+#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
 
 union uvh_rh_gam_config_mmr_u {
     unsigned long	v;
     struct uvh_rh_gam_config_mmr_s {
+	unsigned long	m_skt     :  6;  /* RW */
+	unsigned long	n_skt     :  4;  /* RW */
+	unsigned long	rsvd_10_63    : 54;
+    } s;
+    struct uv1h_rh_gam_config_mmr_s {
 	unsigned long	m_skt     :  6;  /* RW */
 	unsigned long	n_skt     :  4;  /* RW */
 	unsigned long	rsvd_10_11:  2;  /*    */
 	unsigned long	mmiol_cfg :  1;  /* RW */
 	unsigned long	rsvd_13_63: 51;  /*    */
-    } s;
+    } s1;
+    struct uv2h_rh_gam_config_mmr_s {
+	unsigned long	m_skt :  6;  /* RW */
+	unsigned long	n_skt :  4;  /* RW */
+	unsigned long	rsvd_10_63: 54;  /*    */
+    } s2;
 };
 
 /* ========================================================================= */
@@ -975,16 +1395,32 @@ union uvh_rh_gam_config_mmr_u {
 
 #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
 #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
 
 union uvh_rh_gam_gru_overlay_config_mmr_u {
     unsigned long	v;
     struct uvh_rh_gam_gru_overlay_config_mmr_s {
+	unsigned long	rsvd_0_27: 28;  /*    */
+	unsigned long	base   : 18;  /* RW */
+	unsigned long	rsvd_46_62    : 17;
+	unsigned long	enable :  1;  /* RW */
+    } s;
+    struct uv1h_rh_gam_gru_overlay_config_mmr_s {
 	unsigned long	rsvd_0_27: 28;  /*    */
 	unsigned long	base   : 18;  /* RW */
 	unsigned long	rsvd_46_47:  2;  /*    */
@@ -993,7 +1429,15 @@ union uvh_rh_gam_gru_overlay_config_mmr_u {
 	unsigned long	n_gru  :  4;  /* RW */
 	unsigned long	rsvd_56_62:  7;  /*    */
 	unsigned long	enable :  1;  /* RW */
-    } s;
+    } s1;
+    struct uv2h_rh_gam_gru_overlay_config_mmr_s {
+	unsigned long	rsvd_0_27: 28;  /*    */
+	unsigned long	base   : 18;  /* RW */
+	unsigned long	rsvd_46_51:  6;  /*    */
+	unsigned long	n_gru  :  4;  /* RW */
+	unsigned long	rsvd_56_62:  7;  /*    */
+	unsigned long	enable :  1;  /* RW */
+    } s2;
 };
 
 /* ========================================================================= */
@@ -1001,25 +1445,42 @@ union uvh_rh_gam_gru_overlay_config_mmr_u {
 /* ========================================================================= */
 #define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
 
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
 
 union uvh_rh_gam_mmioh_overlay_config_mmr_u {
     unsigned long	v;
-    struct uvh_rh_gam_mmioh_overlay_config_mmr_s {
+    struct uv1h_rh_gam_mmioh_overlay_config_mmr_s {
 	unsigned long	rsvd_0_29: 30;  /*    */
 	unsigned long	base   : 16;  /* RW */
 	unsigned long	m_io   :  6;  /* RW */
 	unsigned long	n_io   :  4;  /* RW */
 	unsigned long	rsvd_56_62:  7;  /*    */
 	unsigned long	enable :  1;  /* RW */
-    } s;
+    } s1;
+    struct uv2h_rh_gam_mmioh_overlay_config_mmr_s {
+	unsigned long	rsvd_0_26: 27;  /*    */
+	unsigned long	base   : 19;  /* RW */
+	unsigned long	m_io   :  6;  /* RW */
+	unsigned long	n_io   :  4;  /* RW */
+	unsigned long	rsvd_56_62:  7;  /*    */
+	unsigned long	enable :  1;  /* RW */
+    } s2;
 };
 
 /* ========================================================================= */
@@ -1029,20 +1490,40 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u {
 
 #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
 #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
 
 union uvh_rh_gam_mmr_overlay_config_mmr_u {
     unsigned long	v;
     struct uvh_rh_gam_mmr_overlay_config_mmr_s {
+	unsigned long	rsvd_0_25: 26;  /*    */
+	unsigned long	base     : 20;  /* RW */
+	unsigned long	rsvd_46_62    : 17;
+	unsigned long	enable   :  1;  /* RW */
+    } s;
+    struct uv1h_rh_gam_mmr_overlay_config_mmr_s {
 	unsigned long	rsvd_0_25: 26;  /*    */
 	unsigned long	base     : 20;  /* RW */
 	unsigned long	dual_hub :  1;  /* RW */
 	unsigned long	rsvd_47_62: 16;  /*    */
 	unsigned long	enable   :  1;  /* RW */
-    } s;
+    } s1;
+    struct uv2h_rh_gam_mmr_overlay_config_mmr_s {
+	unsigned long	rsvd_0_25: 26;  /*    */
+	unsigned long	base   : 20;  /* RW */
+	unsigned long	rsvd_46_62: 17;  /*    */
+	unsigned long	enable :  1;  /* RW */
+    } s2;
 };
 
 /* ========================================================================= */
@@ -1103,10 +1584,11 @@ union uvh_rtc1_int_config_u {
 /*                               UVH_SCRATCH5                                */
 /* ========================================================================= */
 #define UVH_SCRATCH5 0x2d0200UL
-#define UVH_SCRATCH5_32 0x00778
+#define UVH_SCRATCH5_32 0x778
 
 #define UVH_SCRATCH5_SCRATCH5_SHFT 0
 #define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
+
 union uvh_scratch5_u {
     unsigned long	v;
     struct uvh_scratch5_s {
@@ -1114,4 +1596,154 @@ union uvh_scratch5_u {
     } s;
 };
 
+/* ========================================================================= */
+/*                           UV2H_EVENT_OCCURRED2                            */
+/* ========================================================================= */
+#define UV2H_EVENT_OCCURRED2 0x70100UL
+#define UV2H_EVENT_OCCURRED2_32 0xb68
+
+#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0
+#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL
+#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1
+#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL
+#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2
+#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL
+#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3
+#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL
+#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4
+#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL
+#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5
+#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL
+#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6
+#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL
+#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7
+#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL
+#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8
+#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL
+#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9
+#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL
+#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10
+#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL
+#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11
+#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL
+#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12
+#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL
+#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13
+#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL
+#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14
+#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL
+#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15
+#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL
+#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16
+#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL
+#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17
+#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL
+#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18
+#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL
+#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19
+#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL
+#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20
+#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL
+#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21
+#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL
+#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22
+#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL
+#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23
+#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL
+#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24
+#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25
+#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26
+#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27
+#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28
+#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29
+#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30
+#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31
+#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL
+
+union uv2h_event_occurred2_u {
+    unsigned long	v;
+    struct uv2h_event_occurred2_s {
+	unsigned long	rtc_0  :  1;  /* RW */
+	unsigned long	rtc_1  :  1;  /* RW */
+	unsigned long	rtc_2  :  1;  /* RW */
+	unsigned long	rtc_3  :  1;  /* RW */
+	unsigned long	rtc_4  :  1;  /* RW */
+	unsigned long	rtc_5  :  1;  /* RW */
+	unsigned long	rtc_6  :  1;  /* RW */
+	unsigned long	rtc_7  :  1;  /* RW */
+	unsigned long	rtc_8  :  1;  /* RW */
+	unsigned long	rtc_9  :  1;  /* RW */
+	unsigned long	rtc_10 :  1;  /* RW */
+	unsigned long	rtc_11 :  1;  /* RW */
+	unsigned long	rtc_12 :  1;  /* RW */
+	unsigned long	rtc_13 :  1;  /* RW */
+	unsigned long	rtc_14 :  1;  /* RW */
+	unsigned long	rtc_15 :  1;  /* RW */
+	unsigned long	rtc_16 :  1;  /* RW */
+	unsigned long	rtc_17 :  1;  /* RW */
+	unsigned long	rtc_18 :  1;  /* RW */
+	unsigned long	rtc_19 :  1;  /* RW */
+	unsigned long	rtc_20 :  1;  /* RW */
+	unsigned long	rtc_21 :  1;  /* RW */
+	unsigned long	rtc_22 :  1;  /* RW */
+	unsigned long	rtc_23 :  1;  /* RW */
+	unsigned long	rtc_24 :  1;  /* RW */
+	unsigned long	rtc_25 :  1;  /* RW */
+	unsigned long	rtc_26 :  1;  /* RW */
+	unsigned long	rtc_27 :  1;  /* RW */
+	unsigned long	rtc_28 :  1;  /* RW */
+	unsigned long	rtc_29 :  1;  /* RW */
+	unsigned long	rtc_30 :  1;  /* RW */
+	unsigned long	rtc_31 :  1;  /* RW */
+	unsigned long	rsvd_32_63: 32;  /*    */
+    } s1;
+};
+
+/* ========================================================================= */
+/*                        UV2H_EVENT_OCCURRED2_ALIAS                         */
+/* ========================================================================= */
+#define UV2H_EVENT_OCCURRED2_ALIAS 0x70108UL
+#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70
+
+/* ========================================================================= */
+/*                    UV2H_LB_BAU_SB_ACTIVATION_STATUS_2                     */
+/* ========================================================================= */
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0
+
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
+
+union uv2h_lb_bau_sb_activation_status_2_u {
+    unsigned long	v;
+    struct uv2h_lb_bau_sb_activation_status_2_s {
+	unsigned long	aux_error : 64;  /* RW */
+    } s1;
+};
+
+/* ========================================================================= */
+/*                   UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK                    */
+/* ========================================================================= */
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0
+
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
+
+union uv1h_lb_target_physical_apic_id_mask_u {
+    unsigned long	v;
+    struct uv1h_lb_target_physical_apic_id_mask_s {
+	unsigned long	bit_enables : 32;  /* RW */
+	unsigned long	rsvd_32_63  : 32;  /*    */
+    } s1;
+};
+
+
 #endif /* __ASM_UV_MMRS_X86_H__ */
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index f450b683dfc..b511a011b7d 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -91,6 +91,10 @@ static int __init early_get_pnodeid(void)
 	m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
 	uv_min_hub_revision_id = node_id.s.revision;
 
+	if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
+		uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
+
+	uv_hub_info->hub_revision = uv_min_hub_revision_id;
 	pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
 	return pnode;
 }
@@ -112,17 +116,25 @@ static void __init early_get_apic_pnode_shift(void)
  */
 static void __init uv_set_apicid_hibit(void)
 {
-	union uvh_lb_target_physical_apic_id_mask_u apicid_mask;
+	union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
 
-	apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK);
-	uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK;
+	if (is_uv1_hub()) {
+		apicid_mask.v =
+			uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
+		uv_apicid_hibits =
+			apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
+	}
 }
 
 static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-	int pnodeid;
+	int pnodeid, is_uv1, is_uv2;
 
-	if (!strcmp(oem_id, "SGI")) {
+	is_uv1 = !strcmp(oem_id, "SGI");
+	is_uv2 = !strcmp(oem_id, "SGI2");
+	if (is_uv1 || is_uv2) {
+		uv_hub_info->hub_revision =
+			is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE;
 		pnodeid = early_get_pnodeid();
 		early_get_apic_pnode_shift();
 		x86_platform.is_untracked_pat_range =  uv_is_untracked_pat_range;
@@ -484,12 +496,19 @@ static __init void map_mmr_high(int max_pnode)
 static __init void map_mmioh_high(int max_pnode)
 {
 	union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
-	int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+	int shift;
 
 	mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
-	if (mmioh.s.enable)
-		map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io,
+	if (is_uv1_hub() && mmioh.s1.enable) {
+		shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+		map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io,
+			max_pnode, map_uc);
+	}
+	if (is_uv2_hub() && mmioh.s2.enable) {
+		shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+		map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io,
 			max_pnode, map_uc);
+	}
 }
 
 static __init void map_low_mmrs(void)
@@ -736,13 +755,14 @@ void __init uv_system_init(void)
 	unsigned long mmr_base, present, paddr;
 	unsigned short pnode_mask, pnode_io_mask;
 
+	printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2");
 	map_low_mmrs();
 
 	m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
 	m_val = m_n_config.s.m_skt;
 	n_val = m_n_config.s.n_skt;
 	mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
-	n_io = mmioh.s.n_io;
+	n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io;
 	mmr_base =
 	    uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
 	    ~UV_MMR_ENABLE;
@@ -811,6 +831,8 @@ void __init uv_system_init(void)
 		 */
 		uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
 		uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
+		uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;
+
 		pnode = uv_apicid_to_pnode(apicid);
 		blade = boot_pnode_to_blade(pnode);
 		lcpu = uv_blade_info[blade].nr_possible_cpus;
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index c58e0ea39ef..a9856c09c42 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -397,16 +397,13 @@ end_uvhub_quiesce(struct bau_control *hmaster)
  * Wait for completion of a broadcast software ack message
  * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
  */
-static int uv_wait_completion(struct bau_desc *bau_desc,
+static int uv1_wait_completion(struct bau_desc *bau_desc,
 	unsigned long mmr_offset, int right_shift, int this_cpu,
 	struct bau_control *bcp, struct bau_control *smaster, long try)
 {
 	unsigned long descriptor_status;
 	cycles_t ttime;
 	struct ptc_stats *stat = bcp->statp;
-	struct bau_control *hmaster;
-
-	hmaster = bcp->uvhub_master;
 
 	/* spin on the status MMR, waiting for it to go idle */
 	while ((descriptor_status = (((unsigned long)
@@ -414,16 +411,76 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
 			right_shift) & UV_ACT_STATUS_MASK)) !=
 			DESC_STATUS_IDLE) {
 		/*
-		 * Our software ack messages may be blocked because there are
-		 * no swack resources available.  As long as none of them
-		 * has timed out hardware will NACK our message and its
-		 * state will stay IDLE.
+		 * Our software ack messages may be blocked because
+		 * there are no swack resources available.  As long
+		 * as none of them has timed out hardware will NACK
+		 * our message and its state will stay IDLE.
 		 */
 		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
 			stat->s_stimeout++;
 			return FLUSH_GIVEUP;
 		} else if (descriptor_status ==
-					DESC_STATUS_DESTINATION_TIMEOUT) {
+				DESC_STATUS_DESTINATION_TIMEOUT) {
+			stat->s_dtimeout++;
+			ttime = get_cycles();
+
+			/*
+			 * Our retries may be blocked by all destination
+			 * swack resources being consumed, and a timeout
+			 * pending.  In that case hardware returns the
+			 * ERROR that looks like a destination timeout.
+			 */
+			if (cycles_2_us(ttime - bcp->send_message) <
+							timeout_us) {
+				bcp->conseccompletes = 0;
+				return FLUSH_RETRY_PLUGGED;
+			}
+
+			bcp->conseccompletes = 0;
+			return FLUSH_RETRY_TIMEOUT;
+		} else {
+			/*
+			 * descriptor_status is still BUSY
+			 */
+			cpu_relax();
+		}
+	}
+	bcp->conseccompletes++;
+	return FLUSH_COMPLETE;
+}
+
+static int uv2_wait_completion(struct bau_desc *bau_desc,
+	unsigned long mmr_offset, int right_shift, int this_cpu,
+	struct bau_control *bcp, struct bau_control *smaster, long try)
+{
+	unsigned long descriptor_status;
+	unsigned long descriptor_status2;
+	int cpu;
+	cycles_t ttime;
+	struct ptc_stats *stat = bcp->statp;
+
+	/* UV2 has an extra bit of status */
+	cpu = bcp->uvhub_cpu;
+	/* spin on the status MMR, waiting for it to go idle */
+	descriptor_status = (((unsigned long)(uv_read_local_mmr
+		(mmr_offset)) >> right_shift) & UV_ACT_STATUS_MASK);
+	descriptor_status2 = (((unsigned long)uv_read_local_mmr
+		(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2) >> cpu) & 0x1UL);
+	descriptor_status = (descriptor_status << 1) |
+		descriptor_status2;
+	while (descriptor_status != UV2H_DESC_IDLE) {
+		/*
+		 * Our software ack messages may be blocked because
+		 * there are no swack resources available.  As long
+		 * as none of them has timed out hardware will NACK
+		 * our message and its state will stay IDLE.
+		 */
+		if ((descriptor_status == UV2H_DESC_SOURCE_TIMEOUT) ||
+		    (descriptor_status == UV2H_DESC_DEST_STRONG_NACK) ||
+		    (descriptor_status == UV2H_DESC_DEST_PUT_ERR)) {
+			stat->s_stimeout++;
+			return FLUSH_GIVEUP;
+		} else if (descriptor_status == UV2H_DESC_DEST_TIMEOUT) {
 			stat->s_dtimeout++;
 			ttime = get_cycles();
 
@@ -447,11 +504,31 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
 			 */
 			cpu_relax();
 		}
+		descriptor_status = (((unsigned long)(uv_read_local_mmr
+			(mmr_offset)) >> right_shift) &
+			UV_ACT_STATUS_MASK);
+		descriptor_status2 = (((unsigned long)uv_read_local_mmr
+			(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2) >> cpu) &
+			0x1UL);
+		descriptor_status = (descriptor_status << 1) |
+			descriptor_status2;
 	}
 	bcp->conseccompletes++;
 	return FLUSH_COMPLETE;
 }
 
+static int uv_wait_completion(struct bau_desc *bau_desc,
+	unsigned long mmr_offset, int right_shift, int this_cpu,
+	struct bau_control *bcp, struct bau_control *smaster, long try)
+{
+	if (is_uv1_hub())
+		return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
+				   this_cpu, bcp, smaster, try);
+	else
+		return uv2_wait_completion(bau_desc, mmr_offset, right_shift,
+				   this_cpu, bcp, smaster, try);
+}
+
 static inline cycles_t
 sec_2_cycles(unsigned long sec)
 {
@@ -585,7 +662,8 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 	struct bau_control *smaster = bcp->socket_master;
 	struct bau_control *hmaster = bcp->uvhub_master;
 
-	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+	if (is_uv1_hub()  &&
+			!atomic_inc_unless_ge(&hmaster->uvhub_lock,
 			&hmaster->active_descriptor_count,
 			hmaster->max_bau_concurrent)) {
 		stat->s_throttles++;
@@ -899,12 +977,17 @@ static void __init uv_enable_timeouts(void)
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 		/*
+		 * UV1:
 		 * Subsequent reversals of the timebase bit (3) cause an
 		 * immediate timeout of one or all INTD resources as
 		 * indicated in bits 2:0 (7 causes all of them to timeout).
 		 */
 		mmr_image |= ((unsigned long)1 <<
 		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
+		if (is_uv2_hub()) {
+			mmr_image |= ((unsigned long)1 << UV2_LEG_SHFT);
+			mmr_image |= ((unsigned long)1 << UV2_EXT_SHFT);
+		}
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 	}
@@ -1486,14 +1569,27 @@ calculate_destination_timeout(void)
 	int ret;
 	unsigned long ts_ns;
 
-	mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
-	mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
-	index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
-	mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
-	mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
-	base = timeout_base_ns[index];
-	ts_ns = base * mult1 * mult2;
-	ret = ts_ns / 1000;
+	if (is_uv1_hub()) {
+		mult1 = UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD &
+			BAU_MISC_CONTROL_MULT_MASK;
+		mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+		index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
+		mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
+		mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
+		base = timeout_base_ns[index];
+		ts_ns = base * mult1 * mult2;
+		ret = ts_ns / 1000;
+	} else {
+		/* 4 bits  0/1 for 10/80us, 3 bits of multiplier */
+		mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+		mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
+		if (mmr_image & ((unsigned long)1 << UV2_ACK_UNITS_SHFT))
+			mult1 = 80;
+		else
+			mult1 = 10;
+		base = mmr_image & UV2_ACK_MASK;
+		ret = mult1 * base;
+	}
 	return ret;
 }
 
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 0eb90184515..9f29a01ee1b 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -99,8 +99,12 @@ static void uv_rtc_send_IPI(int cpu)
 /* Check for an RTC interrupt pending */
 static int uv_intr_pending(int pnode)
 {
-	return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
-		UVH_EVENT_OCCURRED0_RTC1_MASK;
+	if (is_uv1_hub())
+		return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
+			UV1H_EVENT_OCCURRED0_RTC1_MASK;
+	else
+		return uv_read_global_mmr64(pnode, UV2H_EVENT_OCCURRED2) &
+			UV2H_EVENT_OCCURRED2_RTC_1_MASK;
 }
 
 /* Setup interrupt and return non-zero if early expiration occurred. */
@@ -114,8 +118,12 @@ static int uv_setup_intr(int cpu, u64 expires)
 		UVH_RTC1_INT_CONFIG_M_MASK);
 	uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
 
-	uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
-		UVH_EVENT_OCCURRED0_RTC1_MASK);
+	if (is_uv1_hub())
+		uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
+				UV1H_EVENT_OCCURRED0_RTC1_MASK);
+	else
+		uv_write_global_mmr64(pnode, UV2H_EVENT_OCCURRED2_ALIAS,
+				UV2H_EVENT_OCCURRED2_RTC_1_MASK);
 
 	val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
 		((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
-- 
cgit v1.2.3-70-g09d2


From f073cc8f39b48fdf4c8cd9520a6028fe69199b60 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Tue, 24 May 2011 13:07:36 -0500
Subject: x86, UV: Clean up uv_tlb.c

SGI UV's uv_tlb.c driver has become rather hard to read, with overly large
functions, non-standard coding style and (way) too long variable, constant
and function names and non-obvious code flow sequences.

This patch improves the readability and maintainability of the driver
significantly, by doing the following strict code cleanups with no side
effects:

 - Split long functions into shorter logical functions.

 - Shortened some variable and structure member names.

 - Added special functions for reads and writes of MMR regs with
   very long names.

 - Added the 'tunables' table to shortened tunables_write().

 - Added the 'stat_description' table to shorten uv_ptc_proc_write().

 - Pass fewer 'stat' arguments where it can be derived from the 'bcp'
   argument.

 - Function definitions consistent on one line, and inline in few (short) cases.

 - Moved some small structures and an atomic inline function to the header file.

 - Moved some local variables to the blocks where they are used.

 - Updated the copyright date.

 - Shortened uv_write_global_mmr64() etc. using some aliasing; no
   line breaks. Renamed many uv_.. functions that are not exported.

 - Aligned structure fields.
    [ note that not all structures are aligned the same way though; I'd like
      to keep the extensive commenting in some of them. ]

 - Shortened some long structure names.

 - Standard pass/fail exit from init_per_cpu()

 - Vertical alignment for mass initializations.

 - More separation between blocks of code.

Tested on a 16-processor Altix UV.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: penberg@kernel.org
Link: http://lkml.kernel.org/r/E1QOw12-0004MN-Lp@eag09.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h |  554 +++++++++------
 arch/x86/platform/uv/tlb_uv.c    | 1444 ++++++++++++++++++++------------------
 2 files changed, 1099 insertions(+), 899 deletions(-)

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 0652a5a9fd6..a291c40efd4 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -5,7 +5,7 @@
  *
  * SGI UV Broadcast Assist Unit definitions
  *
- * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2008-2011 Silicon Graphics, Inc. All rights reserved.
  */
 
 #ifndef _ASM_X86_UV_UV_BAU_H
@@ -35,9 +35,9 @@
 
 #define MAX_CPUS_PER_UVHUB		64
 #define MAX_CPUS_PER_SOCKET		32
-#define UV_ADP_SIZE			64 /* hardware-provided max. */
-#define UV_CPUS_PER_ACT_STATUS		32 /* hardware-provided max. */
-#define UV_ITEMS_PER_DESCRIPTOR		8
+#define ADP_SZ				64 /* hardware-provided max. */
+#define UV_CPUS_PER_AS			32 /* hardware-provided max. */
+#define ITEMS_PER_DESC			8
 /* the 'throttle' to prevent the hardware stay-busy bug */
 #define MAX_BAU_CONCURRENT		3
 #define UV_ACT_STATUS_MASK		0x3
@@ -48,7 +48,7 @@
 #define UV2_NET_ENDPOINT_INTD		0x28
 #define UV_NET_ENDPOINT_INTD		(is_uv1_hub() ?			\
 			UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD)
-#define UV_DESC_BASE_PNODE_SHIFT	49
+#define UV_DESC_PSHIFT			49
 #define UV_PAYLOADQ_PNODE_SHIFT		49
 #define UV_PTC_BASENAME			"sgi_uv/ptc_statistics"
 #define UV_BAU_BASENAME			"sgi_uv/bau_tunables"
@@ -56,7 +56,8 @@
 #define UV_BAU_TUNABLES_FILE		"bau_tunables"
 #define WHITESPACE			" \t\n"
 #define uv_physnodeaddr(x)		((__pa((unsigned long)(x)) & uv_mmask))
-
+#define cpubit_isset(cpu, bau_local_cpumask) \
+	test_bit((cpu), (bau_local_cpumask).bits)
 
 /* [19:16] SOFT_ACK timeout period  19: 1 is urgency 7  17:16 1 is multiplier */
 /*
@@ -72,25 +73,37 @@
 		UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD :			\
 		UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD)
 
-#define BAU_MISC_CONTROL_MULT_MASK 3
+#define BAU_MISC_CONTROL_MULT_MASK	3
 
-#define UVH_AGING_PRESCALE_SEL 0x000000b000UL
+#define UVH_AGING_PRESCALE_SEL		0x000000b000UL
 /* [30:28] URGENCY_7  an index into a table of times */
-#define BAU_URGENCY_7_SHIFT 28
-#define BAU_URGENCY_7_MASK 7
+#define BAU_URGENCY_7_SHIFT		28
+#define BAU_URGENCY_7_MASK		7
 
-#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL
+#define UVH_TRANSACTION_TIMEOUT		0x000000b200UL
 /* [45:40] BAU - BAU transaction timeout select - a multiplier */
-#define BAU_TRANS_SHIFT 40
-#define BAU_TRANS_MASK 0x3f
+#define BAU_TRANS_SHIFT			40
+#define BAU_TRANS_MASK			0x3f
+
+/*
+ * shorten some awkward names
+ */
+#define AS_PUSH_SHIFT UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT
+#define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT
+#define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
+#define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD
+#define write_gmmr	uv_write_global_mmr64
+#define write_lmmr	uv_write_local_mmr
+#define read_lmmr	uv_read_local_mmr
+#define read_gmmr	uv_read_global_mmr64
 
 /*
  * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
  */
-#define DESC_STATUS_IDLE		0
-#define DESC_STATUS_ACTIVE		1
-#define DESC_STATUS_DESTINATION_TIMEOUT	2
-#define DESC_STATUS_SOURCE_TIMEOUT	3
+#define DS_IDLE				0
+#define DS_ACTIVE			1
+#define DS_DESTINATION_TIMEOUT		2
+#define DS_SOURCE_TIMEOUT		3
 /*
  * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2
  * values 1 and 5 will not occur
@@ -111,22 +124,22 @@
  * threshholds at which to use IPI to free resources
  */
 /* after this # consecutive 'plugged' timeouts, use IPI to release resources */
-#define PLUGSB4RESET 100
+#define PLUGSB4RESET			100
 /* after this many consecutive timeouts, use IPI to release resources */
-#define TIMEOUTSB4RESET 1
+#define TIMEOUTSB4RESET			1
 /* at this number uses of IPI to release resources, giveup the request */
-#define IPI_RESET_LIMIT 1
+#define IPI_RESET_LIMIT			1
 /* after this # consecutive successes, bump up the throttle if it was lowered */
-#define COMPLETE_THRESHOLD 5
+#define COMPLETE_THRESHOLD		5
 
-#define UV_LB_SUBNODEID 0x10
+#define UV_LB_SUBNODEID			0x10
 
 /* these two are the same for UV1 and UV2: */
 #define UV_SA_SHFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
 #define UV_SA_MASK UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK
 /* 4 bits of software ack period */
-#define UV2_ACK_MASK 0x7UL
-#define UV2_ACK_UNITS_SHFT 3
+#define UV2_ACK_MASK			0x7UL
+#define UV2_ACK_UNITS_SHFT		3
 #define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT
 #define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
 
@@ -149,9 +162,16 @@
 /*
  * tuning the action when the numalink network is extremely delayed
  */
-#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */
-#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */
-#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */
+#define CONGESTED_RESPONSE_US		1000	/* 'long' response time, in
+						   microseconds */
+#define CONGESTED_REPS			10	/* long delays averaged over
+						   this many broadcasts */
+#define CONGESTED_PERIOD		30	/* time for the bau to be
+						   disabled, in seconds */
+/* see msg_type: */
+#define MSG_NOOP			0
+#define MSG_REGULAR			1
+#define MSG_RETRY			2
 
 /*
  * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
@@ -163,8 +183,8 @@
  * 'base_dest_nasid' field of the header corresponds to the
  * destination nodeID associated with that specified bit.
  */
-struct bau_target_uvhubmask {
-	unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
+struct bau_targ_hubmask {
+	unsigned long		bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
 };
 
 /*
@@ -173,7 +193,7 @@ struct bau_target_uvhubmask {
  *  enough bits for max. cpu's per uvhub)
  */
 struct bau_local_cpumask {
-	unsigned long bits;
+	unsigned long		bits;
 };
 
 /*
@@ -194,14 +214,14 @@ struct bau_local_cpumask {
  * The payload is software-defined for INTD transactions
  */
 struct bau_msg_payload {
-	unsigned long address;		/* signifies a page or all TLB's
-						of the cpu */
+	unsigned long	address;		/* signifies a page or all
+						   TLB's of the cpu */
 	/* 64 bits */
-	unsigned short sending_cpu;	/* filled in by sender */
+	unsigned short	sending_cpu;		/* filled in by sender */
 	/* 16 bits */
-	unsigned short acknowledge_count;/* filled in by destination */
+	unsigned short	acknowledge_count;	/* filled in by destination */
 	/* 16 bits */
-	unsigned int reserved1:32;	/* not usable */
+	unsigned int	reserved1:32;		/* not usable */
 };
 
 
@@ -210,93 +230,96 @@ struct bau_msg_payload {
  * see table 4.2.3.0.1 in broacast_assist spec.
  */
 struct bau_msg_header {
-	unsigned int dest_subnodeid:6;	/* must be 0x10, for the LB */
+	unsigned int	dest_subnodeid:6;	/* must be 0x10, for the LB */
 	/* bits 5:0 */
-	unsigned int base_dest_nasid:15; /* nasid of the */
-	/* bits 20:6 */			  /* first bit in uvhub map */
-	unsigned int command:8;	/* message type */
+	unsigned int	base_dest_nasid:15;	/* nasid of the first bit */
+	/* bits 20:6 */				/* in uvhub map */
+	unsigned int	command:8;		/* message type */
 	/* bits 28:21 */
-				/* 0x38: SN3net EndPoint Message */
-	unsigned int rsvd_1:3;	/* must be zero */
+	/* 0x38: SN3net EndPoint Message */
+	unsigned int	rsvd_1:3;		/* must be zero */
 	/* bits 31:29 */
-				/* int will align on 32 bits */
-	unsigned int rsvd_2:9;	/* must be zero */
+	/* int will align on 32 bits */
+	unsigned int	rsvd_2:9;		/* must be zero */
 	/* bits 40:32 */
-				/* Suppl_A is 56-41 */
-	unsigned int sequence:16;/* message sequence number */
-	/* bits 56:41 */	/* becomes bytes 16-17 of msg */
-				/* Address field (96:57) is never used as an
-				   address (these are address bits 42:3) */
-
-	unsigned int rsvd_3:1;	/* must be zero */
+	/* Suppl_A is 56-41 */
+	unsigned int	sequence:16;		/* message sequence number */
+	/* bits 56:41 */			/* becomes bytes 16-17 of msg */
+						/* Address field (96:57) is
+						   never used as an address
+						   (these are address bits
+						   42:3) */
+
+	unsigned int	rsvd_3:1;		/* must be zero */
 	/* bit 57 */
-				/* address bits 27:4 are payload */
+	/* address bits 27:4 are payload */
 	/* these next 24  (58-81) bits become bytes 12-14 of msg */
-
 	/* bits 65:58 land in byte 12 */
-	unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */
+	unsigned int	replied_to:1;		/* sent as 0 by the source to
+						   byte 12 */
 	/* bit 58 */
-	unsigned int msg_type:3; /* software type of the message*/
+	unsigned int	msg_type:3;		/* software type of the
+						   message */
 	/* bits 61:59 */
-	unsigned int canceled:1; /* message canceled, resource to be freed*/
+	unsigned int	canceled:1;		/* message canceled, resource
+						   is to be freed*/
 	/* bit 62 */
-	unsigned int payload_1a:1;/* not currently used */
+	unsigned int	payload_1a:1;		/* not currently used */
 	/* bit 63 */
-	unsigned int payload_1b:2;/* not currently used */
+	unsigned int	payload_1b:2;		/* not currently used */
 	/* bits 65:64 */
 
 	/* bits 73:66 land in byte 13 */
-	unsigned int payload_1ca:6;/* not currently used */
+	unsigned int	payload_1ca:6;		/* not currently used */
 	/* bits 71:66 */
-	unsigned int payload_1c:2;/* not currently used */
+	unsigned int	payload_1c:2;		/* not currently used */
 	/* bits 73:72 */
 
 	/* bits 81:74 land in byte 14 */
-	unsigned int payload_1d:6;/* not currently used */
+	unsigned int	payload_1d:6;		/* not currently used */
 	/* bits 79:74 */
-	unsigned int payload_1e:2;/* not currently used */
+	unsigned int	payload_1e:2;		/* not currently used */
 	/* bits 81:80 */
 
-	unsigned int rsvd_4:7;	/* must be zero */
+	unsigned int	rsvd_4:7;		/* must be zero */
 	/* bits 88:82 */
-	unsigned int sw_ack_flag:1;/* software acknowledge flag */
+	unsigned int	swack_flag:1;		/* software acknowledge flag */
 	/* bit 89 */
-				/* INTD trasactions at destination are to
-				   wait for software acknowledge */
-	unsigned int rsvd_5:6;	/* must be zero */
+						/* INTD trasactions at
+						   destination are to wait for
+						   software acknowledge */
+	unsigned int	rsvd_5:6;		/* must be zero */
 	/* bits 95:90 */
-	unsigned int rsvd_6:5;	/* must be zero */
+	unsigned int	rsvd_6:5;		/* must be zero */
 	/* bits 100:96 */
-	unsigned int int_both:1;/* if 1, interrupt both sockets on the uvhub */
+	unsigned int	int_both:1;		/* if 1, interrupt both sockets
+						   on the uvhub */
 	/* bit 101*/
-	unsigned int fairness:3;/* usually zero */
+	unsigned int	fairness:3;		/* usually zero */
 	/* bits 104:102 */
-	unsigned int multilevel:1;	/* multi-level multicast format */
+	unsigned int	multilevel:1;		/* multi-level multicast
+						   format */
 	/* bit 105 */
-				/* 0 for TLB: endpoint multi-unicast messages */
-	unsigned int chaining:1;/* next descriptor is part of this activation*/
+	/* 0 for TLB: endpoint multi-unicast messages */
+	unsigned int	chaining:1;		/* next descriptor is part of
+						   this activation*/
 	/* bit 106 */
-	unsigned int rsvd_7:21;	/* must be zero */
+	unsigned int	rsvd_7:21;		/* must be zero */
 	/* bits 127:107 */
 };
 
-/* see msg_type: */
-#define MSG_NOOP 0
-#define MSG_REGULAR 1
-#define MSG_RETRY 2
-
 /*
  * The activation descriptor:
  * The format of the message to send, plus all accompanying control
  * Should be 64 bytes
  */
 struct bau_desc {
-	struct bau_target_uvhubmask distribution;
+	struct bau_targ_hubmask	distribution;
 	/*
 	 * message template, consisting of header and payload:
 	 */
-	struct bau_msg_header header;
-	struct bau_msg_payload payload;
+	struct bau_msg_header		header;
+	struct bau_msg_payload		payload;
 };
 /*
  *   -payload--    ---------header------
@@ -315,59 +338,51 @@ struct bau_desc {
  * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17
  * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120)
  * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from
- *  sw_ack_vector and payload_2)
+ *  swack_vec and payload_2)
  * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software
  *  Acknowledge Processing) also selects 32 byte (17 bytes usable) payload
  *  operation."
  */
-struct bau_payload_queue_entry {
-	unsigned long address;		/* signifies a page or all TLB's
-						of the cpu */
+struct bau_pq_entry {
+	unsigned long	address;	/* signifies a page or all TLB's
+					   of the cpu */
 	/* 64 bits, bytes 0-7 */
-
-	unsigned short sending_cpu;	/* cpu that sent the message */
+	unsigned short	sending_cpu;	/* cpu that sent the message */
 	/* 16 bits, bytes 8-9 */
-
-	unsigned short acknowledge_count; /* filled in by destination */
+	unsigned short	acknowledge_count; /* filled in by destination */
 	/* 16 bits, bytes 10-11 */
-
 	/* these next 3 bytes come from bits 58-81 of the message header */
-	unsigned short replied_to:1;    /* sent as 0 by the source */
-	unsigned short msg_type:3;      /* software message type */
-	unsigned short canceled:1;      /* sent as 0 by the source */
-	unsigned short unused1:3;       /* not currently using */
+	unsigned short	replied_to:1;	/* sent as 0 by the source */
+	unsigned short	msg_type:3;	/* software message type */
+	unsigned short	canceled:1;	/* sent as 0 by the source */
+	unsigned short	unused1:3;	/* not currently using */
 	/* byte 12 */
-
-	unsigned char unused2a;		/* not currently using */
+	unsigned char	unused2a;	/* not currently using */
 	/* byte 13 */
-	unsigned char unused2;		/* not currently using */
+	unsigned char	unused2;	/* not currently using */
 	/* byte 14 */
-
-	unsigned char sw_ack_vector;	/* filled in by the hardware */
+	unsigned char	swack_vec;	/* filled in by the hardware */
 	/* byte 15 (bits 127:120) */
-
-	unsigned short sequence;	/* message sequence number */
+	unsigned short	sequence;	/* message sequence number */
 	/* bytes 16-17 */
-	unsigned char unused4[2];	/* not currently using bytes 18-19 */
+	unsigned char	unused4[2];	/* not currently using bytes 18-19 */
 	/* bytes 18-19 */
-
-	int number_of_cpus;		/* filled in at destination */
+	int		number_of_cpus;	/* filled in at destination */
 	/* 32 bits, bytes 20-23 (aligned) */
-
-	unsigned char unused5[8];       /* not using */
+	unsigned char	unused5[8];	/* not using */
 	/* bytes 24-31 */
 };
 
 struct msg_desc {
-	struct bau_payload_queue_entry *msg;
-	int msg_slot;
-	int sw_ack_slot;
-	struct bau_payload_queue_entry *va_queue_first;
-	struct bau_payload_queue_entry *va_queue_last;
+	struct bau_pq_entry	*msg;
+	int			msg_slot;
+	int			swack_slot;
+	struct bau_pq_entry	*queue_first;
+	struct bau_pq_entry	*queue_last;
 };
 
 struct reset_args {
-	int sender;
+	int			sender;
 };
 
 /*
@@ -375,112 +390,226 @@ struct reset_args {
  */
 struct ptc_stats {
 	/* sender statistics */
-	unsigned long s_giveup; /* number of fall backs to IPI-style flushes */
-	unsigned long s_requestor; /* number of shootdown requests */
-	unsigned long s_stimeout; /* source side timeouts */
-	unsigned long s_dtimeout; /* destination side timeouts */
-	unsigned long s_time; /* time spent in sending side */
-	unsigned long s_retriesok; /* successful retries */
-	unsigned long s_ntargcpu; /* total number of cpu's targeted */
-	unsigned long s_ntargself; /* times the sending cpu was targeted */
-	unsigned long s_ntarglocals; /* targets of cpus on the local blade */
-	unsigned long s_ntargremotes; /* targets of cpus on remote blades */
-	unsigned long s_ntarglocaluvhub; /* targets of the local hub */
-	unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */
-	unsigned long s_ntarguvhub; /* total number of uvhubs targeted */
-	unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/
-	unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */
-	unsigned long s_ntarguvhub4; /* number of times target hubs >= 4 */
-	unsigned long s_ntarguvhub2; /* number of times target hubs >= 2 */
-	unsigned long s_ntarguvhub1; /* number of times target hubs == 1 */
-	unsigned long s_resets_plug; /* ipi-style resets from plug state */
-	unsigned long s_resets_timeout; /* ipi-style resets from timeouts */
-	unsigned long s_busy; /* status stayed busy past s/w timer */
-	unsigned long s_throttles; /* waits in throttle */
-	unsigned long s_retry_messages; /* retry broadcasts */
-	unsigned long s_bau_reenabled; /* for bau enable/disable */
-	unsigned long s_bau_disabled; /* for bau enable/disable */
+	unsigned long	s_giveup;		/* number of fall backs to
+						   IPI-style flushes */
+	unsigned long	s_requestor;		/* number of shootdown
+						   requests */
+	unsigned long	s_stimeout;		/* source side timeouts */
+	unsigned long	s_dtimeout;		/* destination side timeouts */
+	unsigned long	s_time;			/* time spent in sending side */
+	unsigned long	s_retriesok;		/* successful retries */
+	unsigned long	s_ntargcpu;		/* total number of cpu's
+						   targeted */
+	unsigned long	s_ntargself;		/* times the sending cpu was
+						   targeted */
+	unsigned long	s_ntarglocals;		/* targets of cpus on the local
+						   blade */
+	unsigned long	s_ntargremotes;		/* targets of cpus on remote
+						   blades */
+	unsigned long	s_ntarglocaluvhub;	/* targets of the local hub */
+	unsigned long	s_ntargremoteuvhub;	/* remotes hubs targeted */
+	unsigned long	s_ntarguvhub;		/* total number of uvhubs
+						   targeted */
+	unsigned long	s_ntarguvhub16;		/* number of times target
+						   hubs >= 16*/
+	unsigned long	s_ntarguvhub8;		/* number of times target
+						   hubs >= 8 */
+	unsigned long	s_ntarguvhub4;		/* number of times target
+						   hubs >= 4 */
+	unsigned long	s_ntarguvhub2;		/* number of times target
+						   hubs >= 2 */
+	unsigned long	s_ntarguvhub1;		/* number of times target
+						   hubs == 1 */
+	unsigned long	s_resets_plug;		/* ipi-style resets from plug
+						   state */
+	unsigned long	s_resets_timeout;	/* ipi-style resets from
+						   timeouts */
+	unsigned long	s_busy;			/* status stayed busy past
+						   s/w timer */
+	unsigned long	s_throttles;		/* waits in throttle */
+	unsigned long	s_retry_messages;	/* retry broadcasts */
+	unsigned long	s_bau_reenabled;	/* for bau enable/disable */
+	unsigned long	s_bau_disabled;		/* for bau enable/disable */
 	/* destination statistics */
-	unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
-	unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
-	unsigned long d_multmsg; /* interrupts with multiple messages */
-	unsigned long d_nomsg; /* interrupts with no message */
-	unsigned long d_time; /* time spent on destination side */
-	unsigned long d_requestee; /* number of messages processed */
-	unsigned long d_retries; /* number of retry messages processed */
-	unsigned long d_canceled; /* number of messages canceled by retries */
-	unsigned long d_nocanceled; /* retries that found nothing to cancel */
-	unsigned long d_resets; /* number of ipi-style requests processed */
-	unsigned long d_rcanceled; /* number of messages canceled by resets */
+	unsigned long	d_alltlb;		/* times all tlb's on this
+						   cpu were flushed */
+	unsigned long	d_onetlb;		/* times just one tlb on this
+						   cpu was flushed */
+	unsigned long	d_multmsg;		/* interrupts with multiple
+						   messages */
+	unsigned long	d_nomsg;		/* interrupts with no message */
+	unsigned long	d_time;			/* time spent on destination
+						   side */
+	unsigned long	d_requestee;		/* number of messages
+						   processed */
+	unsigned long	d_retries;		/* number of retry messages
+						   processed */
+	unsigned long	d_canceled;		/* number of messages canceled
+						   by retries */
+	unsigned long	d_nocanceled;		/* retries that found nothing
+						   to cancel */
+	unsigned long	d_resets;		/* number of ipi-style requests
+						   processed */
+	unsigned long	d_rcanceled;		/* number of messages canceled
+						   by resets */
+};
+
+struct tunables {
+	int			*tunp;
+	int			deflt;
 };
 
 struct hub_and_pnode {
-	short uvhub;
-	short pnode;
+	short			uvhub;
+	short			pnode;
+};
+
+struct socket_desc {
+	short			num_cpus;
+	short			cpu_number[MAX_CPUS_PER_SOCKET];
+};
+
+struct uvhub_desc {
+	unsigned short		socket_mask;
+	short			num_cpus;
+	short			uvhub;
+	short			pnode;
+	struct socket_desc	socket[2];
 };
+
 /*
  * one per-cpu; to locate the software tables
  */
 struct bau_control {
-	struct bau_desc *descriptor_base;
-	struct bau_payload_queue_entry *va_queue_first;
-	struct bau_payload_queue_entry *va_queue_last;
-	struct bau_payload_queue_entry *bau_msg_head;
-	struct bau_control *uvhub_master;
-	struct bau_control *socket_master;
-	struct ptc_stats *statp;
-	unsigned long timeout_interval;
-	unsigned long set_bau_on_time;
-	atomic_t active_descriptor_count;
-	int plugged_tries;
-	int timeout_tries;
-	int ipi_attempts;
-	int conseccompletes;
-	int baudisabled;
-	int set_bau_off;
-	short cpu;
-	short osnode;
-	short uvhub_cpu;
-	short uvhub;
-	short cpus_in_socket;
-	short cpus_in_uvhub;
-	short partition_base_pnode;
-	unsigned short message_number;
-	unsigned short uvhub_quiesce;
-	short socket_acknowledge_count[DEST_Q_SIZE];
-	cycles_t send_message;
-	spinlock_t uvhub_lock;
-	spinlock_t queue_lock;
+	struct bau_desc		*descriptor_base;
+	struct bau_pq_entry	*queue_first;
+	struct bau_pq_entry	*queue_last;
+	struct bau_pq_entry	*bau_msg_head;
+	struct bau_control	*uvhub_master;
+	struct bau_control	*socket_master;
+	struct ptc_stats	*statp;
+	unsigned long		timeout_interval;
+	unsigned long		set_bau_on_time;
+	atomic_t		active_descriptor_count;
+	int			plugged_tries;
+	int			timeout_tries;
+	int			ipi_attempts;
+	int			conseccompletes;
+	int			baudisabled;
+	int			set_bau_off;
+	short			cpu;
+	short			osnode;
+	short			uvhub_cpu;
+	short			uvhub;
+	short			cpus_in_socket;
+	short			cpus_in_uvhub;
+	short			partition_base_pnode;
+	unsigned short		message_number;
+	unsigned short		uvhub_quiesce;
+	short			socket_acknowledge_count[DEST_Q_SIZE];
+	cycles_t		send_message;
+	spinlock_t		uvhub_lock;
+	spinlock_t		queue_lock;
 	/* tunables */
-	int max_bau_concurrent;
-	int max_bau_concurrent_constant;
-	int plugged_delay;
-	int plugsb4reset;
-	int timeoutsb4reset;
-	int ipi_reset_limit;
-	int complete_threshold;
-	int congested_response_us;
-	int congested_reps;
-	int congested_period;
-	cycles_t period_time;
-	long period_requests;
-	struct hub_and_pnode *target_hub_and_pnode;
+	int			max_concurr;
+	int			max_concurr_const;
+	int			plugged_delay;
+	int			plugsb4reset;
+	int			timeoutsb4reset;
+	int			ipi_reset_limit;
+	int			complete_threshold;
+	int			cong_response_us;
+	int			cong_reps;
+	int			cong_period;
+	cycles_t		period_time;
+	long			period_requests;
+	struct hub_and_pnode	*thp;
 };
 
-static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
+static unsigned long read_mmr_uv2_status(void)
+{
+	return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2);
+}
+
+static void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
+{
+	write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
+}
+
+static void write_mmr_descriptor_base(int pnode, unsigned long mmr_image)
+{
+	write_gmmr(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, mmr_image);
+}
+
+static void write_mmr_activation(unsigned long index)
+{
+	write_lmmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+}
+
+static void write_gmmr_activation(int pnode, unsigned long mmr_image)
+{
+	write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image);
+}
+
+static void write_mmr_payload_first(int pnode, unsigned long mmr_image)
+{
+	write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image);
+}
+
+static void write_mmr_payload_tail(int pnode, unsigned long mmr_image)
+{
+	write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, mmr_image);
+}
+
+static void write_mmr_payload_last(int pnode, unsigned long mmr_image)
+{
+	write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, mmr_image);
+}
+
+static void write_mmr_misc_control(int pnode, unsigned long mmr_image)
+{
+	write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+}
+
+static unsigned long read_mmr_misc_control(int pnode)
+{
+	return read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL);
+}
+
+static void write_mmr_sw_ack(unsigned long mr)
+{
+	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
+}
+
+static unsigned long read_mmr_sw_ack(void)
+{
+	return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+}
+
+static unsigned long read_gmmr_sw_ack(int pnode)
+{
+	return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+}
+
+static void write_mmr_data_config(int pnode, unsigned long mr)
+{
+	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr);
+}
+
+static inline int bau_uvhub_isset(int uvhub, struct bau_targ_hubmask *dstp)
 {
 	return constant_test_bit(uvhub, &dstp->bits[0]);
 }
-static inline void bau_uvhub_set(int pnode, struct bau_target_uvhubmask *dstp)
+static inline void bau_uvhub_set(int pnode, struct bau_targ_hubmask *dstp)
 {
 	__set_bit(pnode, &dstp->bits[0]);
 }
-static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp,
+static inline void bau_uvhubs_clear(struct bau_targ_hubmask *dstp,
 				    int nbits)
 {
 	bitmap_zero(&dstp->bits[0], nbits);
 }
-static inline int bau_uvhub_weight(struct bau_target_uvhubmask *dstp)
+static inline int bau_uvhub_weight(struct bau_targ_hubmask *dstp)
 {
 	return bitmap_weight((unsigned long *)&dstp->bits[0],
 				UV_DISTRIBUTION_SIZE);
@@ -491,9 +620,6 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
 	bitmap_zero(&dstp->bits, nbits);
 }
 
-#define cpubit_isset(cpu, bau_local_cpumask) \
-	test_bit((cpu), (bau_local_cpumask).bits)
-
 extern void uv_bau_message_intr1(void);
 extern void uv_bau_timeout_intr1(void);
 
@@ -501,7 +627,7 @@ struct atomic_short {
 	short counter;
 };
 
-/**
+/*
  * atomic_read_short - read a short atomic variable
  * @v: pointer of type atomic_short
  *
@@ -512,14 +638,14 @@ static inline int atomic_read_short(const struct atomic_short *v)
 	return v->counter;
 }
 
-/**
- * atomic_add_short_return - add and return a short int
+/*
+ * atom_asr - add and return a short int
  * @i: short value to add
  * @v: pointer of type atomic_short
  *
  * Atomically adds @i to @v and returns @i + @v
  */
-static inline int atomic_add_short_return(short i, struct atomic_short *v)
+static inline int atom_asr(short i, struct atomic_short *v)
 {
 	short __i = i;
 	asm volatile(LOCK_PREFIX "xaddw %0, %1"
@@ -528,4 +654,26 @@ static inline int atomic_add_short_return(short i, struct atomic_short *v)
 	return i + __i;
 }
 
+/*
+ * conditionally add 1 to *v, unless *v is >= u
+ * return 0 if we cannot add 1 to *v because it is >= u
+ * return 1 if we can add 1 to *v because it is < u
+ * the add is atomic
+ *
+ * This is close to atomic_add_unless(), but this allows the 'u' value
+ * to be lowered below the current 'v'.  atomic_add_unless can only stop
+ * on equal.
+ */
+static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
+{
+	spin_lock(lock);
+	if (atomic_read(v) >= u) {
+		spin_unlock(lock);
+		return 0;
+	}
+	atomic_inc(v);
+	spin_unlock(lock);
+	return 1;
+}
+
 #endif /* _ASM_X86_UV_UV_BAU_H */
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index a9856c09c42..68e467f69fe 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1,7 +1,7 @@
 /*
  *	SGI UltraViolet TLB flush routines.
  *
- *	(c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
+ *	(c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI.
  *
  *	This code is released under the GNU General Public License version 2 or
  *	later.
@@ -35,6 +35,7 @@ static int timeout_base_ns[] = {
 		5242880,
 		167772160
 };
+
 static int timeout_us;
 static int nobau;
 static int baudisabled;
@@ -42,20 +43,70 @@ static spinlock_t disable_lock;
 static cycles_t congested_cycles;
 
 /* tunables: */
-static int max_bau_concurrent = MAX_BAU_CONCURRENT;
-static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
-static int plugged_delay = PLUGGED_DELAY;
-static int plugsb4reset = PLUGSB4RESET;
-static int timeoutsb4reset = TIMEOUTSB4RESET;
-static int ipi_reset_limit = IPI_RESET_LIMIT;
-static int complete_threshold = COMPLETE_THRESHOLD;
-static int congested_response_us = CONGESTED_RESPONSE_US;
-static int congested_reps = CONGESTED_REPS;
-static int congested_period = CONGESTED_PERIOD;
+static int max_concurr		= MAX_BAU_CONCURRENT;
+static int max_concurr_const	= MAX_BAU_CONCURRENT;
+static int plugged_delay	= PLUGGED_DELAY;
+static int plugsb4reset		= PLUGSB4RESET;
+static int timeoutsb4reset	= TIMEOUTSB4RESET;
+static int ipi_reset_limit	= IPI_RESET_LIMIT;
+static int complete_threshold	= COMPLETE_THRESHOLD;
+static int congested_respns_us	= CONGESTED_RESPONSE_US;
+static int congested_reps	= CONGESTED_REPS;
+static int congested_period	= CONGESTED_PERIOD;
+
+static struct tunables tunables[] = {
+	{&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
+	{&plugged_delay, PLUGGED_DELAY},
+	{&plugsb4reset, PLUGSB4RESET},
+	{&timeoutsb4reset, TIMEOUTSB4RESET},
+	{&ipi_reset_limit, IPI_RESET_LIMIT},
+	{&complete_threshold, COMPLETE_THRESHOLD},
+	{&congested_respns_us, CONGESTED_RESPONSE_US},
+	{&congested_reps, CONGESTED_REPS},
+	{&congested_period, CONGESTED_PERIOD}
+};
+
 static struct dentry *tunables_dir;
 static struct dentry *tunables_file;
 
-static int __init setup_nobau(char *arg)
+/* these correspond to the statistics printed by ptc_seq_show() */
+static char *stat_description[] = {
+	"sent:     number of shootdown messages sent",
+	"stime:    time spent sending messages",
+	"numuvhubs: number of hubs targeted with shootdown",
+	"numuvhubs16: number times 16 or more hubs targeted",
+	"numuvhubs8: number times 8 or more hubs targeted",
+	"numuvhubs4: number times 4 or more hubs targeted",
+	"numuvhubs2: number times 2 or more hubs targeted",
+	"numuvhubs1: number times 1 hub targeted",
+	"numcpus:  number of cpus targeted with shootdown",
+	"dto:      number of destination timeouts",
+	"retries:  destination timeout retries sent",
+	"rok:   :  destination timeouts successfully retried",
+	"resetp:   ipi-style resource resets for plugs",
+	"resett:   ipi-style resource resets for timeouts",
+	"giveup:   fall-backs to ipi-style shootdowns",
+	"sto:      number of source timeouts",
+	"bz:       number of stay-busy's",
+	"throt:    number times spun in throttle",
+	"swack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE",
+	"recv:     shootdown messages received",
+	"rtime:    time spent processing messages",
+	"all:      shootdown all-tlb messages",
+	"one:      shootdown one-tlb messages",
+	"mult:     interrupts that found multiple messages",
+	"none:     interrupts that found no messages",
+	"retry:    number of retry messages processed",
+	"canc:     number messages canceled by retries",
+	"nocan:    number retries that found nothing to cancel",
+	"reset:    number of ipi-style reset requests processed",
+	"rcan:     number messages canceled by reset requests",
+	"disable:  number times use of the BAU was disabled",
+	"enable:   number times use of the BAU was re-enabled"
+};
+
+static int __init
+setup_nobau(char *arg)
 {
 	nobau = 1;
 	return 0;
@@ -63,7 +114,7 @@ static int __init setup_nobau(char *arg)
 early_param("nobau", setup_nobau);
 
 /* base pnode in this partition */
-static int uv_partition_base_pnode __read_mostly;
+static int uv_base_pnode __read_mostly;
 /* position of pnode (which is nasid>>1): */
 static int uv_nshift __read_mostly;
 static unsigned long uv_mmask __read_mostly;
@@ -109,60 +160,52 @@ static int __init uvhub_to_first_apicid(int uvhub)
  * clear of the Timeout bit (as well) will free the resource. No reply will
  * be sent (the hardware will only do one reply per message).
  */
-static inline void uv_reply_to_message(struct msg_desc *mdp,
-				       struct bau_control *bcp)
+static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp)
 {
 	unsigned long dw;
-	struct bau_payload_queue_entry *msg;
+	struct bau_pq_entry *msg;
 
 	msg = mdp->msg;
 	if (!msg->canceled) {
-		dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
-						msg->sw_ack_vector;
-		uv_write_local_mmr(
-				UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
+		dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
+		write_mmr_sw_ack(dw);
 	}
 	msg->replied_to = 1;
-	msg->sw_ack_vector = 0;
+	msg->swack_vec = 0;
 }
 
 /*
  * Process the receipt of a RETRY message
  */
-static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
-					    struct bau_control *bcp)
+static void bau_process_retry_msg(struct msg_desc *mdp,
+					struct bau_control *bcp)
 {
 	int i;
 	int cancel_count = 0;
-	int slot2;
 	unsigned long msg_res;
 	unsigned long mmr = 0;
-	struct bau_payload_queue_entry *msg;
-	struct bau_payload_queue_entry *msg2;
-	struct ptc_stats *stat;
+	struct bau_pq_entry *msg = mdp->msg;
+	struct bau_pq_entry *msg2;
+	struct ptc_stats *stat = bcp->statp;
 
-	msg = mdp->msg;
-	stat = bcp->statp;
 	stat->d_retries++;
 	/*
 	 * cancel any message from msg+1 to the retry itself
 	 */
 	for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
-		if (msg2 > mdp->va_queue_last)
-			msg2 = mdp->va_queue_first;
+		if (msg2 > mdp->queue_last)
+			msg2 = mdp->queue_first;
 		if (msg2 == msg)
 			break;
 
-		/* same conditions for cancellation as uv_do_reset */
+		/* same conditions for cancellation as do_reset */
 		if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
-		    (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
-			msg->sw_ack_vector) == 0) &&
+		    (msg2->swack_vec) && ((msg2->swack_vec &
+			msg->swack_vec) == 0) &&
 		    (msg2->sending_cpu == msg->sending_cpu) &&
 		    (msg2->msg_type != MSG_NOOP)) {
-			slot2 = msg2 - mdp->va_queue_first;
-			mmr = uv_read_local_mmr
-				(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
-			msg_res = msg2->sw_ack_vector;
+			mmr = read_mmr_sw_ack();
+			msg_res = msg2->swack_vec;
 			/*
 			 * This is a message retry; clear the resources held
 			 * by the previous message only if they timed out.
@@ -170,6 +213,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
 			 * situation to report.
 			 */
 			if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
+				unsigned long mr;
 				/*
 				 * is the resource timed out?
 				 * make everyone ignore the cancelled message.
@@ -177,10 +221,8 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
 				msg2->canceled = 1;
 				stat->d_canceled++;
 				cancel_count++;
-				uv_write_local_mmr(
-				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
-					(msg_res << UV_SW_ACK_NPENDING) |
-					 msg_res);
+				mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
+				write_mmr_sw_ack(mr);
 			}
 		}
 	}
@@ -192,20 +234,19 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
  * Do all the things a cpu should do for a TLB shootdown message.
  * Other cpu's may come here at the same time for this message.
  */
-static void uv_bau_process_message(struct msg_desc *mdp,
-				   struct bau_control *bcp)
+static void bau_process_message(struct msg_desc *mdp,
+					struct bau_control *bcp)
 {
-	int msg_ack_count;
 	short socket_ack_count = 0;
-	struct ptc_stats *stat;
-	struct bau_payload_queue_entry *msg;
+	short *sp;
+	struct atomic_short *asp;
+	struct ptc_stats *stat = bcp->statp;
+	struct bau_pq_entry *msg = mdp->msg;
 	struct bau_control *smaster = bcp->socket_master;
 
 	/*
 	 * This must be a normal message, or retry of a normal message
 	 */
-	msg = mdp->msg;
-	stat = bcp->statp;
 	if (msg->address == TLB_FLUSH_ALL) {
 		local_flush_tlb();
 		stat->d_alltlb++;
@@ -222,30 +263,32 @@ static void uv_bau_process_message(struct msg_desc *mdp,
 	 * cpu number.
 	 */
 	if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
-		uv_bau_process_retry_msg(mdp, bcp);
+		bau_process_retry_msg(mdp, bcp);
 
 	/*
-	 * This is a sw_ack message, so we have to reply to it.
+	 * This is a swack message, so we have to reply to it.
 	 * Count each responding cpu on the socket. This avoids
 	 * pinging the count's cache line back and forth between
 	 * the sockets.
 	 */
-	socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
-			&smaster->socket_acknowledge_count[mdp->msg_slot]);
+	sp = &smaster->socket_acknowledge_count[mdp->msg_slot];
+	asp = (struct atomic_short *)sp;
+	socket_ack_count = atom_asr(1, asp);
 	if (socket_ack_count == bcp->cpus_in_socket) {
+		int msg_ack_count;
 		/*
 		 * Both sockets dump their completed count total into
 		 * the message's count.
 		 */
 		smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
-		msg_ack_count = atomic_add_short_return(socket_ack_count,
-				(struct atomic_short *)&msg->acknowledge_count);
+		asp = (struct atomic_short *)&msg->acknowledge_count;
+		msg_ack_count = atom_asr(socket_ack_count, asp);
 
 		if (msg_ack_count == bcp->cpus_in_uvhub) {
 			/*
 			 * All cpus in uvhub saw it; reply
 			 */
-			uv_reply_to_message(mdp, bcp);
+			reply_to_message(mdp, bcp);
 		}
 	}
 
@@ -268,62 +311,51 @@ static int uvhub_to_first_cpu(int uvhub)
  * Last resort when we get a large number of destination timeouts is
  * to clear resources held by a given cpu.
  * Do this with IPI so that all messages in the BAU message queue
- * can be identified by their nonzero sw_ack_vector field.
+ * can be identified by their nonzero swack_vec field.
  *
  * This is entered for a single cpu on the uvhub.
  * The sender want's this uvhub to free a specific message's
- * sw_ack resources.
+ * swack resources.
  */
-static void
-uv_do_reset(void *ptr)
+static void do_reset(void *ptr)
 {
 	int i;
-	int slot;
-	int count = 0;
-	unsigned long mmr;
-	unsigned long msg_res;
-	struct bau_control *bcp;
-	struct reset_args *rap;
-	struct bau_payload_queue_entry *msg;
-	struct ptc_stats *stat;
+	struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id());
+	struct reset_args *rap = (struct reset_args *)ptr;
+	struct bau_pq_entry *msg;
+	struct ptc_stats *stat = bcp->statp;
 
-	bcp = &per_cpu(bau_control, smp_processor_id());
-	rap = (struct reset_args *)ptr;
-	stat = bcp->statp;
 	stat->d_resets++;
-
 	/*
 	 * We're looking for the given sender, and
-	 * will free its sw_ack resource.
+	 * will free its swack resource.
 	 * If all cpu's finally responded after the timeout, its
 	 * message 'replied_to' was set.
 	 */
-	for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
-		/* uv_do_reset: same conditions for cancellation as
-		   uv_bau_process_retry_msg() */
+	for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
+		unsigned long msg_res;
+		/* do_reset: same conditions for cancellation as
+		   bau_process_retry_msg() */
 		if ((msg->replied_to == 0) &&
 		    (msg->canceled == 0) &&
 		    (msg->sending_cpu == rap->sender) &&
-		    (msg->sw_ack_vector) &&
+		    (msg->swack_vec) &&
 		    (msg->msg_type != MSG_NOOP)) {
+			unsigned long mmr;
+			unsigned long mr;
 			/*
 			 * make everyone else ignore this message
 			 */
 			msg->canceled = 1;
-			slot = msg - bcp->va_queue_first;
-			count++;
 			/*
 			 * only reset the resource if it is still pending
 			 */
-			mmr = uv_read_local_mmr
-					(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
-			msg_res = msg->sw_ack_vector;
+			mmr = read_mmr_sw_ack();
+			msg_res = msg->swack_vec;
+			mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
 			if (mmr & msg_res) {
 				stat->d_rcanceled++;
-				uv_write_local_mmr(
-				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
-					(msg_res << UV_SW_ACK_NPENDING) |
-					 msg_res);
+				write_mmr_sw_ack(mr);
 			}
 		}
 	}
@@ -334,39 +366,38 @@ uv_do_reset(void *ptr)
  * Use IPI to get all target uvhubs to release resources held by
  * a given sending cpu number.
  */
-static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
-			      int sender)
+static void reset_with_ipi(struct bau_targ_hubmask *distribution, int sender)
 {
 	int uvhub;
-	int cpu;
+	int maskbits;
 	cpumask_t mask;
 	struct reset_args reset_args;
 
 	reset_args.sender = sender;
-
 	cpus_clear(mask);
 	/* find a single cpu for each uvhub in this distribution mask */
-	for (uvhub = 0;
-		    uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
-		    uvhub++) {
+	maskbits = sizeof(struct bau_targ_hubmask) * BITSPERBYTE;
+	for (uvhub = 0; uvhub < maskbits; uvhub++) {
+		int cpu;
 		if (!bau_uvhub_isset(uvhub, distribution))
 			continue;
 		/* find a cpu for this uvhub */
 		cpu = uvhub_to_first_cpu(uvhub);
 		cpu_set(cpu, mask);
 	}
-	/* IPI all cpus; Preemption is already disabled */
-	smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
+
+	/* IPI all cpus; preemption is already disabled */
+	smp_call_function_many(&mask, do_reset, (void *)&reset_args, 1);
 	return;
 }
 
-static inline unsigned long
-cycles_2_us(unsigned long long cyc)
+static inline unsigned long cycles_2_us(unsigned long long cyc)
 {
 	unsigned long long ns;
 	unsigned long us;
-	ns =  (cyc * per_cpu(cyc2ns, smp_processor_id()))
-						>> CYC2NS_SCALE_FACTOR;
+	int cpu = smp_processor_id();
+
+	ns =  (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
 	us = ns / 1000;
 	return us;
 }
@@ -376,21 +407,27 @@ cycles_2_us(unsigned long long cyc)
  * leaves uvhub_quiesce set so that no new broadcasts are started by
  * bau_flush_send_and_wait()
  */
-static inline void
-quiesce_local_uvhub(struct bau_control *hmaster)
+static inline void quiesce_local_uvhub(struct bau_control *hmaster)
 {
-	atomic_add_short_return(1, (struct atomic_short *)
-		 &hmaster->uvhub_quiesce);
+	atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce);
 }
 
 /*
  * mark this quiet-requestor as done
  */
-static inline void
-end_uvhub_quiesce(struct bau_control *hmaster)
+static inline void end_uvhub_quiesce(struct bau_control *hmaster)
 {
-	atomic_add_short_return(-1, (struct atomic_short *)
-		&hmaster->uvhub_quiesce);
+	atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce);
+}
+
+static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift)
+{
+	unsigned long descriptor_status;
+
+	descriptor_status = uv_read_local_mmr(mmr_offset);
+	descriptor_status >>= right_shift;
+	descriptor_status &= UV_ACT_STATUS_MASK;
+	return descriptor_status;
 }
 
 /*
@@ -398,31 +435,28 @@ end_uvhub_quiesce(struct bau_control *hmaster)
  * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
  */
 static int uv1_wait_completion(struct bau_desc *bau_desc,
-	unsigned long mmr_offset, int right_shift, int this_cpu,
-	struct bau_control *bcp, struct bau_control *smaster, long try)
+				unsigned long mmr_offset, int right_shift,
+				struct bau_control *bcp, long try)
 {
 	unsigned long descriptor_status;
-	cycles_t ttime;
+	cycles_t ttm;
 	struct ptc_stats *stat = bcp->statp;
 
+	descriptor_status = uv1_read_status(mmr_offset, right_shift);
 	/* spin on the status MMR, waiting for it to go idle */
-	while ((descriptor_status = (((unsigned long)
-		uv_read_local_mmr(mmr_offset) >>
-			right_shift) & UV_ACT_STATUS_MASK)) !=
-			DESC_STATUS_IDLE) {
+	while ((descriptor_status != DS_IDLE)) {
 		/*
 		 * Our software ack messages may be blocked because
 		 * there are no swack resources available.  As long
 		 * as none of them has timed out hardware will NACK
 		 * our message and its state will stay IDLE.
 		 */
-		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
+		if (descriptor_status == DS_SOURCE_TIMEOUT) {
 			stat->s_stimeout++;
 			return FLUSH_GIVEUP;
-		} else if (descriptor_status ==
-				DESC_STATUS_DESTINATION_TIMEOUT) {
+		} else if (descriptor_status == DS_DESTINATION_TIMEOUT) {
 			stat->s_dtimeout++;
-			ttime = get_cycles();
+			ttm = get_cycles();
 
 			/*
 			 * Our retries may be blocked by all destination
@@ -430,8 +464,7 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
 			 * pending.  In that case hardware returns the
 			 * ERROR that looks like a destination timeout.
 			 */
-			if (cycles_2_us(ttime - bcp->send_message) <
-							timeout_us) {
+			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
 				bcp->conseccompletes = 0;
 				return FLUSH_RETRY_PLUGGED;
 			}
@@ -444,93 +477,106 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
 			 */
 			cpu_relax();
 		}
+		descriptor_status = uv1_read_status(mmr_offset, right_shift);
 	}
 	bcp->conseccompletes++;
 	return FLUSH_COMPLETE;
 }
 
-static int uv2_wait_completion(struct bau_desc *bau_desc,
-	unsigned long mmr_offset, int right_shift, int this_cpu,
-	struct bau_control *bcp, struct bau_control *smaster, long try)
+/*
+ * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
+ */
+static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu)
 {
 	unsigned long descriptor_status;
 	unsigned long descriptor_status2;
-	int cpu;
-	cycles_t ttime;
+
+	descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
+	descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL;
+	descriptor_status = (descriptor_status << 1) | descriptor_status2;
+	return descriptor_status;
+}
+
+static int uv2_wait_completion(struct bau_desc *bau_desc,
+				unsigned long mmr_offset, int right_shift,
+				struct bau_control *bcp, long try)
+{
+	unsigned long descriptor_stat;
+	cycles_t ttm;
+	int cpu = bcp->uvhub_cpu;
 	struct ptc_stats *stat = bcp->statp;
 
-	/* UV2 has an extra bit of status */
-	cpu = bcp->uvhub_cpu;
+	descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+
 	/* spin on the status MMR, waiting for it to go idle */
-	descriptor_status = (((unsigned long)(uv_read_local_mmr
-		(mmr_offset)) >> right_shift) & UV_ACT_STATUS_MASK);
-	descriptor_status2 = (((unsigned long)uv_read_local_mmr
-		(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2) >> cpu) & 0x1UL);
-	descriptor_status = (descriptor_status << 1) |
-		descriptor_status2;
-	while (descriptor_status != UV2H_DESC_IDLE) {
+	while (descriptor_stat != UV2H_DESC_IDLE) {
 		/*
 		 * Our software ack messages may be blocked because
 		 * there are no swack resources available.  As long
 		 * as none of them has timed out hardware will NACK
 		 * our message and its state will stay IDLE.
 		 */
-		if ((descriptor_status == UV2H_DESC_SOURCE_TIMEOUT) ||
-		    (descriptor_status == UV2H_DESC_DEST_STRONG_NACK) ||
-		    (descriptor_status == UV2H_DESC_DEST_PUT_ERR)) {
+		if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) ||
+		    (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) ||
+		    (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) {
 			stat->s_stimeout++;
 			return FLUSH_GIVEUP;
-		} else if (descriptor_status == UV2H_DESC_DEST_TIMEOUT) {
+		} else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
 			stat->s_dtimeout++;
-			ttime = get_cycles();
-
+			ttm = get_cycles();
 			/*
 			 * Our retries may be blocked by all destination
 			 * swack resources being consumed, and a timeout
 			 * pending.  In that case hardware returns the
 			 * ERROR that looks like a destination timeout.
 			 */
-			if (cycles_2_us(ttime - bcp->send_message) <
-							timeout_us) {
+			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
 				bcp->conseccompletes = 0;
 				return FLUSH_RETRY_PLUGGED;
 			}
-
 			bcp->conseccompletes = 0;
 			return FLUSH_RETRY_TIMEOUT;
 		} else {
 			/*
-			 * descriptor_status is still BUSY
+			 * descriptor_stat is still BUSY
 			 */
 			cpu_relax();
 		}
-		descriptor_status = (((unsigned long)(uv_read_local_mmr
-			(mmr_offset)) >> right_shift) &
-			UV_ACT_STATUS_MASK);
-		descriptor_status2 = (((unsigned long)uv_read_local_mmr
-			(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2) >> cpu) &
-			0x1UL);
-		descriptor_status = (descriptor_status << 1) |
-			descriptor_status2;
+		descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
 	}
 	bcp->conseccompletes++;
 	return FLUSH_COMPLETE;
 }
 
-static int uv_wait_completion(struct bau_desc *bau_desc,
-	unsigned long mmr_offset, int right_shift, int this_cpu,
-	struct bau_control *bcp, struct bau_control *smaster, long try)
+/*
+ * There are 2 status registers; each and array[32] of 2 bits. Set up for
+ * which register to read and position in that register based on cpu in
+ * current hub.
+ */
+static int wait_completion(struct bau_desc *bau_desc,
+				struct bau_control *bcp, long try)
 {
+	int right_shift;
+	unsigned long mmr_offset;
+	int cpu = bcp->uvhub_cpu;
+
+	if (cpu < UV_CPUS_PER_AS) {
+		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+		right_shift = cpu * UV_ACT_STATUS_SIZE;
+	} else {
+		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+		right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
+	}
+
 	if (is_uv1_hub())
 		return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
-				   this_cpu, bcp, smaster, try);
+								bcp, try);
 	else
 		return uv2_wait_completion(bau_desc, mmr_offset, right_shift,
-				   this_cpu, bcp, smaster, try);
+								bcp, try);
 }
 
-static inline cycles_t
-sec_2_cycles(unsigned long sec)
+static inline cycles_t sec_2_cycles(unsigned long sec)
 {
 	unsigned long ns;
 	cycles_t cyc;
@@ -541,63 +587,50 @@ sec_2_cycles(unsigned long sec)
 }
 
 /*
- * conditionally add 1 to *v, unless *v is >= u
- * return 0 if we cannot add 1 to *v because it is >= u
- * return 1 if we can add 1 to *v because it is < u
- * the add is atomic
- *
- * This is close to atomic_add_unless(), but this allows the 'u' value
- * to be lowered below the current 'v'.  atomic_add_unless can only stop
- * on equal.
- */
-static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
-{
-	spin_lock(lock);
-	if (atomic_read(v) >= u) {
-		spin_unlock(lock);
-		return 0;
-	}
-	atomic_inc(v);
-	spin_unlock(lock);
-	return 1;
-}
-
-/*
- * Our retries are blocked by all destination swack resources being
+ * Our retries are blocked by all destination sw ack resources being
  * in use, and a timeout is pending. In that case hardware immediately
  * returns the ERROR that looks like a destination timeout.
  */
-static void
-destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
+static void destination_plugged(struct bau_desc *bau_desc,
+			struct bau_control *bcp,
 			struct bau_control *hmaster, struct ptc_stats *stat)
 {
 	udelay(bcp->plugged_delay);
 	bcp->plugged_tries++;
+
 	if (bcp->plugged_tries >= bcp->plugsb4reset) {
 		bcp->plugged_tries = 0;
+
 		quiesce_local_uvhub(hmaster);
+
 		spin_lock(&hmaster->queue_lock);
-		uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+		reset_with_ipi(&bau_desc->distribution, bcp->cpu);
 		spin_unlock(&hmaster->queue_lock);
+
 		end_uvhub_quiesce(hmaster);
+
 		bcp->ipi_attempts++;
 		stat->s_resets_plug++;
 	}
 }
 
-static void
-destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
-			struct bau_control *hmaster, struct ptc_stats *stat)
+static void destination_timeout(struct bau_desc *bau_desc,
+			struct bau_control *bcp, struct bau_control *hmaster,
+			struct ptc_stats *stat)
 {
-	hmaster->max_bau_concurrent = 1;
+	hmaster->max_concurr = 1;
 	bcp->timeout_tries++;
 	if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
 		bcp->timeout_tries = 0;
+
 		quiesce_local_uvhub(hmaster);
+
 		spin_lock(&hmaster->queue_lock);
-		uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+		reset_with_ipi(&bau_desc->distribution, bcp->cpu);
 		spin_unlock(&hmaster->queue_lock);
+
 		end_uvhub_quiesce(hmaster);
+
 		bcp->ipi_attempts++;
 		stat->s_resets_timeout++;
 	}
@@ -607,34 +640,104 @@ destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
  * Completions are taking a very long time due to a congested numalink
  * network.
  */
-static void
-disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
+static void disable_for_congestion(struct bau_control *bcp,
+					struct ptc_stats *stat)
 {
-	int tcpu;
-	struct bau_control *tbcp;
-
 	/* let only one cpu do this disabling */
 	spin_lock(&disable_lock);
+
 	if (!baudisabled && bcp->period_requests &&
 	    ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
+		int tcpu;
+		struct bau_control *tbcp;
 		/* it becomes this cpu's job to turn on the use of the
 		   BAU again */
 		baudisabled = 1;
 		bcp->set_bau_off = 1;
-		bcp->set_bau_on_time = get_cycles() +
-			sec_2_cycles(bcp->congested_period);
+		bcp->set_bau_on_time = get_cycles();
+		bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period);
 		stat->s_bau_disabled++;
 		for_each_present_cpu(tcpu) {
 			tbcp = &per_cpu(bau_control, tcpu);
-				tbcp->baudisabled = 1;
+			tbcp->baudisabled = 1;
 		}
 	}
+
 	spin_unlock(&disable_lock);
 }
 
-/**
- * uv_flush_send_and_wait
- *
+static void count_max_concurr(int stat, struct bau_control *bcp,
+				struct bau_control *hmaster)
+{
+	bcp->plugged_tries = 0;
+	bcp->timeout_tries = 0;
+	if (stat != FLUSH_COMPLETE)
+		return;
+	if (bcp->conseccompletes <= bcp->complete_threshold)
+		return;
+	if (hmaster->max_concurr >= hmaster->max_concurr_const)
+		return;
+	hmaster->max_concurr++;
+}
+
+static void record_send_stats(cycles_t time1, cycles_t time2,
+		struct bau_control *bcp, struct ptc_stats *stat,
+		int completion_status, int try)
+{
+	cycles_t elapsed;
+
+	if (time2 > time1) {
+		elapsed = time2 - time1;
+		stat->s_time += elapsed;
+
+		if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
+			bcp->period_requests++;
+			bcp->period_time += elapsed;
+			if ((elapsed > congested_cycles) &&
+			    (bcp->period_requests > bcp->cong_reps))
+				disable_for_congestion(bcp, stat);
+		}
+	} else
+		stat->s_requestor--;
+
+	if (completion_status == FLUSH_COMPLETE && try > 1)
+		stat->s_retriesok++;
+	else if (completion_status == FLUSH_GIVEUP)
+		stat->s_giveup++;
+}
+
+/*
+ * Because of a uv1 hardware bug only a limited number of concurrent
+ * requests can be made.
+ */
+static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
+{
+	spinlock_t *lock = &hmaster->uvhub_lock;
+	atomic_t *v;
+
+	v = &hmaster->active_descriptor_count;
+	if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) {
+		stat->s_throttles++;
+		do {
+			cpu_relax();
+		} while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr));
+	}
+}
+
+/*
+ * Handle the completion status of a message send.
+ */
+static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
+			struct bau_control *bcp, struct bau_control *hmaster,
+			struct ptc_stats *stat)
+{
+	if (completion_status == FLUSH_RETRY_PLUGGED)
+		destination_plugged(bau_desc, bcp, hmaster, stat);
+	else if (completion_status == FLUSH_RETRY_TIMEOUT)
+		destination_timeout(bau_desc, bcp, hmaster, stat);
+}
+
+/*
  * Send a broadcast and wait for it to complete.
  *
  * The flush_mask contains the cpus the broadcast is to be sent to including
@@ -645,45 +748,23 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
  * returned to the kernel.
  */
 int uv_flush_send_and_wait(struct bau_desc *bau_desc,
-			   struct cpumask *flush_mask, struct bau_control *bcp)
+			struct cpumask *flush_mask, struct bau_control *bcp)
 {
-	int right_shift;
-	int completion_status = 0;
 	int seq_number = 0;
+	int completion_stat = 0;
 	long try = 0;
-	int cpu = bcp->uvhub_cpu;
-	int this_cpu = bcp->cpu;
-	unsigned long mmr_offset;
 	unsigned long index;
 	cycles_t time1;
 	cycles_t time2;
-	cycles_t elapsed;
 	struct ptc_stats *stat = bcp->statp;
-	struct bau_control *smaster = bcp->socket_master;
 	struct bau_control *hmaster = bcp->uvhub_master;
 
-	if (is_uv1_hub()  &&
-			!atomic_inc_unless_ge(&hmaster->uvhub_lock,
-			&hmaster->active_descriptor_count,
-			hmaster->max_bau_concurrent)) {
-		stat->s_throttles++;
-		do {
-			cpu_relax();
-		} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
-			&hmaster->active_descriptor_count,
-			hmaster->max_bau_concurrent));
-	}
+	if (is_uv1_hub())
+		uv1_throttle(hmaster, stat);
+
 	while (hmaster->uvhub_quiesce)
 		cpu_relax();
 
-	if (cpu < UV_CPUS_PER_ACT_STATUS) {
-		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
-		right_shift = cpu * UV_ACT_STATUS_SIZE;
-	} else {
-		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
-		right_shift =
-		    ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
-	}
 	time1 = get_cycles();
 	do {
 		if (try == 0) {
@@ -693,64 +774,134 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 			bau_desc->header.msg_type = MSG_RETRY;
 			stat->s_retry_messages++;
 		}
+
 		bau_desc->header.sequence = seq_number;
-		index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
-			bcp->uvhub_cpu;
+		index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
 		bcp->send_message = get_cycles();
-		uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+
+		write_mmr_activation(index);
+
 		try++;
-		completion_status = uv_wait_completion(bau_desc, mmr_offset,
-			right_shift, this_cpu, bcp, smaster, try);
+		completion_stat = wait_completion(bau_desc, bcp, try);
+
+		handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
 
-		if (completion_status == FLUSH_RETRY_PLUGGED) {
-			destination_plugged(bau_desc, bcp, hmaster, stat);
-		} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
-			destination_timeout(bau_desc, bcp, hmaster, stat);
-		}
 		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
 			bcp->ipi_attempts = 0;
-			completion_status = FLUSH_GIVEUP;
+			completion_stat = FLUSH_GIVEUP;
 			break;
 		}
 		cpu_relax();
-	} while ((completion_status == FLUSH_RETRY_PLUGGED) ||
-		 (completion_status == FLUSH_RETRY_TIMEOUT));
+	} while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
+		 (completion_stat == FLUSH_RETRY_TIMEOUT));
+
 	time2 = get_cycles();
-	bcp->plugged_tries = 0;
-	bcp->timeout_tries = 0;
-	if ((completion_status == FLUSH_COMPLETE) &&
-	    (bcp->conseccompletes > bcp->complete_threshold) &&
-	    (hmaster->max_bau_concurrent <
-					hmaster->max_bau_concurrent_constant))
-			hmaster->max_bau_concurrent++;
+
+	count_max_concurr(completion_stat, bcp, hmaster);
+
 	while (hmaster->uvhub_quiesce)
 		cpu_relax();
+
 	atomic_dec(&hmaster->active_descriptor_count);
-	if (time2 > time1) {
-		elapsed = time2 - time1;
-		stat->s_time += elapsed;
-		if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
-			bcp->period_requests++;
-			bcp->period_time += elapsed;
-			if ((elapsed > congested_cycles) &&
-			    (bcp->period_requests > bcp->congested_reps)) {
-				disable_for_congestion(bcp, stat);
+
+	record_send_stats(time1, time2, bcp, stat, completion_stat, try);
+
+	if (completion_stat == FLUSH_GIVEUP)
+		return 1;
+	return 0;
+}
+
+/*
+ * The BAU is disabled. When the disabled time period has expired, the cpu
+ * that disabled it must re-enable it.
+ * Return 0 if it is re-enabled for all cpus.
+ */
+static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
+{
+	int tcpu;
+	struct bau_control *tbcp;
+
+	if (bcp->set_bau_off) {
+		if (get_cycles() >= bcp->set_bau_on_time) {
+			stat->s_bau_reenabled++;
+			baudisabled = 0;
+			for_each_present_cpu(tcpu) {
+				tbcp = &per_cpu(bau_control, tcpu);
+				tbcp->baudisabled = 0;
+				tbcp->period_requests = 0;
+				tbcp->period_time = 0;
 			}
+			return 0;
 		}
+	}
+	return -1;
+}
+
+static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs,
+				int remotes, struct bau_desc *bau_desc)
+{
+	stat->s_requestor++;
+	stat->s_ntargcpu += remotes + locals;
+	stat->s_ntargremotes += remotes;
+	stat->s_ntarglocals += locals;
+
+	/* uvhub statistics */
+	hubs = bau_uvhub_weight(&bau_desc->distribution);
+	if (locals) {
+		stat->s_ntarglocaluvhub++;
+		stat->s_ntargremoteuvhub += (hubs - 1);
 	} else
-		stat->s_requestor--;
-	if (completion_status == FLUSH_COMPLETE && try > 1)
-		stat->s_retriesok++;
-	else if (completion_status == FLUSH_GIVEUP) {
-		stat->s_giveup++;
-		return 1;
+		stat->s_ntargremoteuvhub += hubs;
+
+	stat->s_ntarguvhub += hubs;
+
+	if (hubs >= 16)
+		stat->s_ntarguvhub16++;
+	else if (hubs >= 8)
+		stat->s_ntarguvhub8++;
+	else if (hubs >= 4)
+		stat->s_ntarguvhub4++;
+	else if (hubs >= 2)
+		stat->s_ntarguvhub2++;
+	else
+		stat->s_ntarguvhub1++;
+}
+
+/*
+ * Translate a cpu mask to the uvhub distribution mask in the BAU
+ * activation descriptor.
+ */
+static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
+			struct bau_desc *bau_desc, int *localsp, int *remotesp)
+{
+	int cpu;
+	int pnode;
+	int cnt = 0;
+	struct hub_and_pnode *hpp;
+
+	for_each_cpu(cpu, flush_mask) {
+		/*
+		 * The distribution vector is a bit map of pnodes, relative
+		 * to the partition base pnode (and the partition base nasid
+		 * in the header).
+		 * Translate cpu to pnode and hub using a local memory array.
+		 */
+		hpp = &bcp->socket_master->thp[cpu];
+		pnode = hpp->pnode - bcp->partition_base_pnode;
+		bau_uvhub_set(pnode, &bau_desc->distribution);
+		cnt++;
+		if (hpp->uvhub == bcp->uvhub)
+			(*localsp)++;
+		else
+			(*remotesp)++;
 	}
+	if (!cnt)
+		return 1;
 	return 0;
 }
 
-/**
- * uv_flush_tlb_others - globally purge translation cache of a virtual
- * address or all TLB's
+/*
+ * globally purge translation cache of a virtual address or all TLB's
  * @cpumask: mask of all cpu's in which the address is to be removed
  * @mm: mm_struct containing virtual address range
  * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
@@ -774,20 +925,16 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
  * done.  The returned pointer is valid till preemption is re-enabled.
  */
 const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
-					  struct mm_struct *mm,
-					  unsigned long va, unsigned int cpu)
+				struct mm_struct *mm, unsigned long va,
+				unsigned int cpu)
 {
 	int locals = 0;
 	int remotes = 0;
 	int hubs = 0;
-	int tcpu;
-	int tpnode;
 	struct bau_desc *bau_desc;
 	struct cpumask *flush_mask;
 	struct ptc_stats *stat;
 	struct bau_control *bcp;
-	struct bau_control *tbcp;
-	struct hub_and_pnode *hpp;
 
 	/* kernel was booted 'nobau' */
 	if (nobau)
@@ -798,20 +945,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 
 	/* bau was disabled due to slow response */
 	if (bcp->baudisabled) {
-		/* the cpu that disabled it must re-enable it */
-		if (bcp->set_bau_off) {
-			if (get_cycles() >= bcp->set_bau_on_time) {
-				stat->s_bau_reenabled++;
-				baudisabled = 0;
-				for_each_present_cpu(tcpu) {
-					tbcp = &per_cpu(bau_control, tcpu);
-					tbcp->baudisabled = 0;
-					tbcp->period_requests = 0;
-					tbcp->period_time = 0;
-				}
-			}
-		}
-		return cpumask;
+		if (check_enable(bcp, stat))
+			return cpumask;
 	}
 
 	/*
@@ -822,59 +957,20 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
 	/* don't actually do a shootdown of the local cpu */
 	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+
 	if (cpu_isset(cpu, *cpumask))
 		stat->s_ntargself++;
 
 	bau_desc = bcp->descriptor_base;
-	bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
+	bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu;
 	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
-
-	for_each_cpu(tcpu, flush_mask) {
-		/*
-		 * The distribution vector is a bit map of pnodes, relative
-		 * to the partition base pnode (and the partition base nasid
-		 * in the header).
-		 * Translate cpu to pnode and hub using an array stored
-		 * in local memory.
-		 */
-		hpp = &bcp->socket_master->target_hub_and_pnode[tcpu];
-		tpnode = hpp->pnode - bcp->partition_base_pnode;
-		bau_uvhub_set(tpnode, &bau_desc->distribution);
-		if (hpp->uvhub == bcp->uvhub)
-			locals++;
-		else
-			remotes++;
-	}
-	if ((locals + remotes) == 0)
+	if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
 		return NULL;
-	stat->s_requestor++;
-	stat->s_ntargcpu += remotes + locals;
-	stat->s_ntargremotes += remotes;
-	stat->s_ntarglocals += locals;
-	remotes = bau_uvhub_weight(&bau_desc->distribution);
 
-	/* uvhub statistics */
-	hubs = bau_uvhub_weight(&bau_desc->distribution);
-	if (locals) {
-		stat->s_ntarglocaluvhub++;
-		stat->s_ntargremoteuvhub += (hubs - 1);
-	} else
-		stat->s_ntargremoteuvhub += hubs;
-	stat->s_ntarguvhub += hubs;
-	if (hubs >= 16)
-		stat->s_ntarguvhub16++;
-	else if (hubs >= 8)
-		stat->s_ntarguvhub8++;
-	else if (hubs >= 4)
-		stat->s_ntarguvhub4++;
-	else if (hubs >= 2)
-		stat->s_ntarguvhub2++;
-	else
-		stat->s_ntarguvhub1++;
+	record_send_statistics(stat, locals, hubs, remotes, bau_desc);
 
 	bau_desc->payload.address = va;
 	bau_desc->payload.sending_cpu = cpu;
-
 	/*
 	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
 	 * or 1 if it gave up and the original cpumask should be returned.
@@ -903,26 +999,31 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 {
 	int count = 0;
 	cycles_t time_start;
-	struct bau_payload_queue_entry *msg;
+	struct bau_pq_entry *msg;
 	struct bau_control *bcp;
 	struct ptc_stats *stat;
 	struct msg_desc msgdesc;
 
 	time_start = get_cycles();
+
 	bcp = &per_cpu(bau_control, smp_processor_id());
 	stat = bcp->statp;
-	msgdesc.va_queue_first = bcp->va_queue_first;
-	msgdesc.va_queue_last = bcp->va_queue_last;
+
+	msgdesc.queue_first = bcp->queue_first;
+	msgdesc.queue_last = bcp->queue_last;
+
 	msg = bcp->bau_msg_head;
-	while (msg->sw_ack_vector) {
+	while (msg->swack_vec) {
 		count++;
-		msgdesc.msg_slot = msg - msgdesc.va_queue_first;
-		msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
+
+		msgdesc.msg_slot = msg - msgdesc.queue_first;
+		msgdesc.swack_slot = ffs(msg->swack_vec) - 1;
 		msgdesc.msg = msg;
-		uv_bau_process_message(&msgdesc, bcp);
+		bau_process_message(&msgdesc, bcp);
+
 		msg++;
-		if (msg > msgdesc.va_queue_last)
-			msg = msgdesc.va_queue_first;
+		if (msg > msgdesc.queue_last)
+			msg = msgdesc.queue_first;
 		bcp->bau_msg_head = msg;
 	}
 	stat->d_time += (get_cycles() - time_start);
@@ -930,18 +1031,17 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 		stat->d_nomsg++;
 	else if (count > 1)
 		stat->d_multmsg++;
+
 	ack_APIC_irq();
 }
 
 /*
- * uv_enable_timeouts
- *
- * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have
+ * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
  * shootdown message timeouts enabled.  The timeout does not cause
  * an interrupt, but causes an error message to be returned to
  * the sender.
  */
-static void __init uv_enable_timeouts(void)
+static void __init enable_timeouts(void)
 {
 	int uvhub;
 	int nuvhubs;
@@ -955,52 +1055,44 @@ static void __init uv_enable_timeouts(void)
 			continue;
 
 		pnode = uv_blade_to_pnode(uvhub);
-		mmr_image =
-		    uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
+		mmr_image = read_mmr_misc_control(pnode);
 		/*
 		 * Set the timeout period and then lock it in, in three
 		 * steps; captures and locks in the period.
 		 *
 		 * To program the period, the SOFT_ACK_MODE must be off.
 		 */
-		mmr_image &= ~((unsigned long)1 <<
-		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
-		uv_write_global_mmr64
-		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+		mmr_image &= ~(1L << SOFTACK_MSHIFT);
+		write_mmr_misc_control(pnode, mmr_image);
 		/*
 		 * Set the 4-bit period.
 		 */
-		mmr_image &= ~((unsigned long)0xf <<
-		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
-		mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
-		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
-		uv_write_global_mmr64
-		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+		mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT);
+		mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT);
+		write_mmr_misc_control(pnode, mmr_image);
 		/*
 		 * UV1:
 		 * Subsequent reversals of the timebase bit (3) cause an
 		 * immediate timeout of one or all INTD resources as
 		 * indicated in bits 2:0 (7 causes all of them to timeout).
 		 */
-		mmr_image |= ((unsigned long)1 <<
-		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
+		mmr_image |= (1L << SOFTACK_MSHIFT);
 		if (is_uv2_hub()) {
-			mmr_image |= ((unsigned long)1 << UV2_LEG_SHFT);
-			mmr_image |= ((unsigned long)1 << UV2_EXT_SHFT);
+			mmr_image |= (1L << UV2_LEG_SHFT);
+			mmr_image |= (1L << UV2_EXT_SHFT);
 		}
-		uv_write_global_mmr64
-		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+		write_mmr_misc_control(pnode, mmr_image);
 	}
 }
 
-static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
+static void *ptc_seq_start(struct seq_file *file, loff_t *offset)
 {
 	if (*offset < num_possible_cpus())
 		return offset;
 	return NULL;
 }
 
-static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
+static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
 {
 	(*offset)++;
 	if (*offset < num_possible_cpus())
@@ -1008,12 +1100,11 @@ static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
 	return NULL;
 }
 
-static void uv_ptc_seq_stop(struct seq_file *file, void *data)
+static void ptc_seq_stop(struct seq_file *file, void *data)
 {
 }
 
-static inline unsigned long long
-microsec_2_cycles(unsigned long microsec)
+static inline unsigned long long usec_2_cycles(unsigned long microsec)
 {
 	unsigned long ns;
 	unsigned long long cyc;
@@ -1024,29 +1115,27 @@ microsec_2_cycles(unsigned long microsec)
 }
 
 /*
- * Display the statistics thru /proc.
+ * Display the statistics thru /proc/sgi_uv/ptc_statistics
  * 'data' points to the cpu number
+ * Note: see the descriptions in stat_description[].
  */
-static int uv_ptc_seq_show(struct seq_file *file, void *data)
+static int ptc_seq_show(struct seq_file *file, void *data)
 {
 	struct ptc_stats *stat;
 	int cpu;
 
 	cpu = *(loff_t *)data;
-
 	if (!cpu) {
 		seq_printf(file,
 			"# cpu sent stime self locals remotes ncpus localhub ");
 		seq_printf(file,
 			"remotehub numuvhubs numuvhubs16 numuvhubs8 ");
 		seq_printf(file,
-			"numuvhubs4 numuvhubs2 numuvhubs1 dto ");
+			"numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok ");
 		seq_printf(file,
-			"retries rok resetp resett giveup sto bz throt ");
+			"resetp resett giveup sto bz throt swack recv rtime ");
 		seq_printf(file,
-			"sw_ack recv rtime all ");
-		seq_printf(file,
-			"one mult none retry canc nocan reset rcan ");
+			"all one mult none retry canc nocan reset rcan ");
 		seq_printf(file,
 			"disable enable\n");
 	}
@@ -1073,8 +1162,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 		/* destination side statistics */
 		seq_printf(file,
 			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
-			   uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
-					UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
+			   read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
 			   stat->d_requestee, cycles_2_us(stat->d_time),
 			   stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
 			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
@@ -1083,7 +1171,6 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 		seq_printf(file, "%ld %ld\n",
 			stat->s_bau_disabled, stat->s_bau_reenabled);
 	}
-
 	return 0;
 }
 
@@ -1091,18 +1178,18 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
  * Display the tunables thru debugfs
  */
 static ssize_t tunables_read(struct file *file, char __user *userbuf,
-						size_t count, loff_t *ppos)
+				size_t count, loff_t *ppos)
 {
 	char *buf;
 	int ret;
 
 	buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
-		"max_bau_concurrent plugged_delay plugsb4reset",
+		"max_concur plugged_delay plugsb4reset",
 		"timeoutsb4reset ipi_reset_limit complete_threshold",
 		"congested_response_us congested_reps congested_period",
-		max_bau_concurrent, plugged_delay, plugsb4reset,
+		max_concurr, plugged_delay, plugsb4reset,
 		timeoutsb4reset, ipi_reset_limit, complete_threshold,
-		congested_response_us, congested_reps, congested_period);
+		congested_respns_us, congested_reps, congested_period);
 
 	if (!buf)
 		return -ENOMEM;
@@ -1113,13 +1200,16 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf,
 }
 
 /*
- * -1: resetf the statistics
+ * handle a write to /proc/sgi_uv/ptc_statistics
+ * -1: reset the statistics
  *  0: display meaning of the statistics
  */
-static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
-				 size_t count, loff_t *data)
+static ssize_t ptc_proc_write(struct file *file, const char __user *user,
+				size_t count, loff_t *data)
 {
 	int cpu;
+	int i;
+	int elements;
 	long input_arg;
 	char optstr[64];
 	struct ptc_stats *stat;
@@ -1129,79 +1219,18 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 	if (copy_from_user(optstr, user, count))
 		return -EFAULT;
 	optstr[count - 1] = '\0';
+
 	if (strict_strtol(optstr, 10, &input_arg) < 0) {
 		printk(KERN_DEBUG "%s is invalid\n", optstr);
 		return -EINVAL;
 	}
 
 	if (input_arg == 0) {
+		elements = sizeof(stat_description)/sizeof(*stat_description);
 		printk(KERN_DEBUG "# cpu:      cpu number\n");
 		printk(KERN_DEBUG "Sender statistics:\n");
-		printk(KERN_DEBUG
-		"sent:     number of shootdown messages sent\n");
-		printk(KERN_DEBUG
-		"stime:    time spent sending messages\n");
-		printk(KERN_DEBUG
-		"numuvhubs: number of hubs targeted with shootdown\n");
-		printk(KERN_DEBUG
-		"numuvhubs16: number times 16 or more hubs targeted\n");
-		printk(KERN_DEBUG
-		"numuvhubs8: number times 8 or more hubs targeted\n");
-		printk(KERN_DEBUG
-		"numuvhubs4: number times 4 or more hubs targeted\n");
-		printk(KERN_DEBUG
-		"numuvhubs2: number times 2 or more hubs targeted\n");
-		printk(KERN_DEBUG
-		"numuvhubs1: number times 1 hub targeted\n");
-		printk(KERN_DEBUG
-		"numcpus:  number of cpus targeted with shootdown\n");
-		printk(KERN_DEBUG
-		"dto:      number of destination timeouts\n");
-		printk(KERN_DEBUG
-		"retries:  destination timeout retries sent\n");
-		printk(KERN_DEBUG
-		"rok:   :  destination timeouts successfully retried\n");
-		printk(KERN_DEBUG
-		"resetp:   ipi-style resource resets for plugs\n");
-		printk(KERN_DEBUG
-		"resett:   ipi-style resource resets for timeouts\n");
-		printk(KERN_DEBUG
-		"giveup:   fall-backs to ipi-style shootdowns\n");
-		printk(KERN_DEBUG
-		"sto:      number of source timeouts\n");
-		printk(KERN_DEBUG
-		"bz:       number of stay-busy's\n");
-		printk(KERN_DEBUG
-		"throt:    number times spun in throttle\n");
-		printk(KERN_DEBUG "Destination side statistics:\n");
-		printk(KERN_DEBUG
-		"sw_ack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
-		printk(KERN_DEBUG
-		"recv:     shootdown messages received\n");
-		printk(KERN_DEBUG
-		"rtime:    time spent processing messages\n");
-		printk(KERN_DEBUG
-		"all:      shootdown all-tlb messages\n");
-		printk(KERN_DEBUG
-		"one:      shootdown one-tlb messages\n");
-		printk(KERN_DEBUG
-		"mult:     interrupts that found multiple messages\n");
-		printk(KERN_DEBUG
-		"none:     interrupts that found no messages\n");
-		printk(KERN_DEBUG
-		"retry:    number of retry messages processed\n");
-		printk(KERN_DEBUG
-		"canc:     number messages canceled by retries\n");
-		printk(KERN_DEBUG
-		"nocan:    number retries that found nothing to cancel\n");
-		printk(KERN_DEBUG
-		"reset:    number of ipi-style reset requests processed\n");
-		printk(KERN_DEBUG
-		"rcan:     number messages canceled by reset requests\n");
-		printk(KERN_DEBUG
-		"disable:  number times use of the BAU was disabled\n");
-		printk(KERN_DEBUG
-		"enable:   number times use of the BAU was re-enabled\n");
+		for (i = 0; i < elements; i++)
+			printk(KERN_DEBUG "%s\n", stat_description[i]);
 	} else if (input_arg == -1) {
 		for_each_present_cpu(cpu) {
 			stat = &per_cpu(ptcstats, cpu);
@@ -1228,27 +1257,18 @@ static int local_atoi(const char *name)
 }
 
 /*
- * set the tunables
- * 0 values reset them to defaults
+ * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables.
+ * Zero values reset them to defaults.
  */
-static ssize_t tunables_write(struct file *file, const char __user *user,
-				 size_t count, loff_t *data)
+static int parse_tunables_write(struct bau_control *bcp, char *instr,
+				int count)
 {
-	int cpu;
-	int cnt = 0;
-	int val;
 	char *p;
 	char *q;
-	char instr[64];
-	struct bau_control *bcp;
-
-	if (count == 0 || count > sizeof(instr)-1)
-		return -EINVAL;
-	if (copy_from_user(instr, user, count))
-		return -EFAULT;
+	int cnt = 0;
+	int val;
+	int e = sizeof(tunables) / sizeof(*tunables);
 
-	instr[count] = '\0';
-	/* count the fields */
 	p = instr + strspn(instr, WHITESPACE);
 	q = p;
 	for (; *p; p = q + strspn(q, WHITESPACE)) {
@@ -1257,8 +1277,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
 		if (q == p)
 			break;
 	}
-	if (cnt != 9) {
-		printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
+	if (cnt != e) {
+		printk(KERN_INFO "bau tunable error: should be %d values\n", e);
 		return -EINVAL;
 	}
 
@@ -1270,97 +1290,80 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
 		switch (cnt) {
 		case 0:
 			if (val == 0) {
-				max_bau_concurrent = MAX_BAU_CONCURRENT;
-				max_bau_concurrent_constant =
-							MAX_BAU_CONCURRENT;
+				max_concurr = MAX_BAU_CONCURRENT;
+				max_concurr_const = MAX_BAU_CONCURRENT;
 				continue;
 			}
-			bcp = &per_cpu(bau_control, smp_processor_id());
 			if (val < 1 || val > bcp->cpus_in_uvhub) {
 				printk(KERN_DEBUG
 				"Error: BAU max concurrent %d is invalid\n",
 				val);
 				return -EINVAL;
 			}
-			max_bau_concurrent = val;
-			max_bau_concurrent_constant = val;
-			continue;
-		case 1:
-			if (val == 0)
-				plugged_delay = PLUGGED_DELAY;
-			else
-				plugged_delay = val;
+			max_concurr = val;
+			max_concurr_const = val;
 			continue;
-		case 2:
-			if (val == 0)
-				plugsb4reset = PLUGSB4RESET;
-			else
-				plugsb4reset = val;
-			continue;
-		case 3:
-			if (val == 0)
-				timeoutsb4reset = TIMEOUTSB4RESET;
-			else
-				timeoutsb4reset = val;
-			continue;
-		case 4:
-			if (val == 0)
-				ipi_reset_limit = IPI_RESET_LIMIT;
-			else
-				ipi_reset_limit = val;
-			continue;
-		case 5:
-			if (val == 0)
-				complete_threshold = COMPLETE_THRESHOLD;
-			else
-				complete_threshold = val;
-			continue;
-		case 6:
-			if (val == 0)
-				congested_response_us = CONGESTED_RESPONSE_US;
-			else
-				congested_response_us = val;
-			continue;
-		case 7:
-			if (val == 0)
-				congested_reps = CONGESTED_REPS;
-			else
-				congested_reps = val;
-			continue;
-		case 8:
+		default:
 			if (val == 0)
-				congested_period = CONGESTED_PERIOD;
+				*tunables[cnt].tunp = tunables[cnt].deflt;
 			else
-				congested_period = val;
+				*tunables[cnt].tunp = val;
 			continue;
 		}
 		if (q == p)
 			break;
 	}
+	return 0;
+}
+
+/*
+ * Handle a write to debugfs. (/sys/kernel/debug/sgi_uv/bau_tunables)
+ */
+static ssize_t tunables_write(struct file *file, const char __user *user,
+				size_t count, loff_t *data)
+{
+	int cpu;
+	int ret;
+	char instr[100];
+	struct bau_control *bcp;
+
+	if (count == 0 || count > sizeof(instr)-1)
+		return -EINVAL;
+	if (copy_from_user(instr, user, count))
+		return -EFAULT;
+
+	instr[count] = '\0';
+
+	bcp = &per_cpu(bau_control, smp_processor_id());
+
+	ret = parse_tunables_write(bcp, instr, count);
+	if (ret)
+		return ret;
+
 	for_each_present_cpu(cpu) {
 		bcp = &per_cpu(bau_control, cpu);
-		bcp->max_bau_concurrent = max_bau_concurrent;
-		bcp->max_bau_concurrent_constant = max_bau_concurrent;
-		bcp->plugged_delay = plugged_delay;
-		bcp->plugsb4reset = plugsb4reset;
-		bcp->timeoutsb4reset = timeoutsb4reset;
-		bcp->ipi_reset_limit = ipi_reset_limit;
-		bcp->complete_threshold = complete_threshold;
-		bcp->congested_response_us = congested_response_us;
-		bcp->congested_reps = congested_reps;
-		bcp->congested_period = congested_period;
+		bcp->max_concurr =		max_concurr;
+		bcp->max_concurr_const =	max_concurr;
+		bcp->plugged_delay =		plugged_delay;
+		bcp->plugsb4reset =		plugsb4reset;
+		bcp->timeoutsb4reset =		timeoutsb4reset;
+		bcp->ipi_reset_limit =		ipi_reset_limit;
+		bcp->complete_threshold =	complete_threshold;
+		bcp->cong_response_us =		congested_respns_us;
+		bcp->cong_reps =		congested_reps;
+		bcp->cong_period =		congested_period;
 	}
 	return count;
 }
 
 static const struct seq_operations uv_ptc_seq_ops = {
-	.start		= uv_ptc_seq_start,
-	.next		= uv_ptc_seq_next,
-	.stop		= uv_ptc_seq_stop,
-	.show		= uv_ptc_seq_show
+	.start		= ptc_seq_start,
+	.next		= ptc_seq_next,
+	.stop		= ptc_seq_stop,
+	.show		= ptc_seq_show
 };
 
-static int uv_ptc_proc_open(struct inode *inode, struct file *file)
+static int ptc_proc_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &uv_ptc_seq_ops);
 }
@@ -1371,9 +1374,9 @@ static int tunables_open(struct inode *inode, struct file *file)
 }
 
 static const struct file_operations proc_uv_ptc_operations = {
-	.open		= uv_ptc_proc_open,
+	.open		= ptc_proc_open,
 	.read		= seq_read,
-	.write		= uv_ptc_proc_write,
+	.write		= ptc_proc_write,
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
@@ -1407,7 +1410,7 @@ static int __init uv_ptc_init(void)
 		return -EINVAL;
 	}
 	tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
-			tunables_dir, NULL, &tunables_fops);
+					tunables_dir, NULL, &tunables_fops);
 	if (!tunables_file) {
 		printk(KERN_ERR "unable to create debugfs file %s\n",
 		       UV_BAU_TUNABLES_FILE);
@@ -1419,24 +1422,24 @@ static int __init uv_ptc_init(void)
 /*
  * Initialize the sending side's sending buffers.
  */
-static void
-uv_activation_descriptor_init(int node, int pnode, int base_pnode)
+static void activation_descriptor_init(int node, int pnode, int base_pnode)
 {
 	int i;
 	int cpu;
 	unsigned long pa;
 	unsigned long m;
 	unsigned long n;
+	size_t dsize;
 	struct bau_desc *bau_desc;
 	struct bau_desc *bd2;
 	struct bau_control *bcp;
 
 	/*
-	 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
-	 * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE)
+	 * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC)
+	 * per cpu; and one per cpu on the uvhub (ADP_SZ)
 	 */
-	bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE
-				* UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
+	dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC;
+	bau_desc = kmalloc_node(dsize, GFP_KERNEL, node);
 	BUG_ON(!bau_desc);
 
 	pa = uv_gpa(bau_desc); /* need the real nasid*/
@@ -1444,27 +1447,25 @@ uv_activation_descriptor_init(int node, int pnode, int base_pnode)
 	m = pa & uv_mmask;
 
 	/* the 14-bit pnode */
-	uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
-			      (n << UV_DESC_BASE_PNODE_SHIFT | m));
+	write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
 	/*
-	 * Initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+	 * Initializing all 8 (ITEMS_PER_DESC) descriptors for each
 	 * cpu even though we only use the first one; one descriptor can
 	 * describe a broadcast to 256 uv hubs.
 	 */
-	for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
-		i++, bd2++) {
+	for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
 		memset(bd2, 0, sizeof(struct bau_desc));
-		bd2->header.sw_ack_flag = 1;
+		bd2->header.swack_flag =	1;
 		/*
 		 * The base_dest_nasid set in the message header is the nasid
 		 * of the first uvhub in the partition. The bit map will
 		 * indicate destination pnode numbers relative to that base.
 		 * They may not be consecutive if nasid striding is being used.
 		 */
-		bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode);
-		bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
-		bd2->header.command = UV_NET_ENDPOINT_INTD;
-		bd2->header.int_both = 1;
+		bd2->header.base_dest_nasid =	UV_PNODE_TO_NASID(base_pnode);
+		bd2->header.dest_subnodeid =	UV_LB_SUBNODEID;
+		bd2->header.command =		UV_NET_ENDPOINT_INTD;
+		bd2->header.int_both =		1;
 		/*
 		 * all others need to be set to zero:
 		 *   fairness chaining multilevel count replied_to
@@ -1484,57 +1485,55 @@ uv_activation_descriptor_init(int node, int pnode, int base_pnode)
  * - node is first node (kernel memory notion) on the uvhub
  * - pnode is the uvhub's physical identifier
  */
-static void
-uv_payload_queue_init(int node, int pnode)
+static void pq_init(int node, int pnode)
 {
-	int pn;
 	int cpu;
+	size_t plsize;
 	char *cp;
-	unsigned long pa;
-	struct bau_payload_queue_entry *pqp;
-	struct bau_payload_queue_entry *pqp_malloc;
+	void *vp;
+	unsigned long pn;
+	unsigned long first;
+	unsigned long pn_first;
+	unsigned long last;
+	struct bau_pq_entry *pqp;
 	struct bau_control *bcp;
 
-	pqp = kmalloc_node((DEST_Q_SIZE + 1)
-			   * sizeof(struct bau_payload_queue_entry),
-			   GFP_KERNEL, node);
+	plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry);
+	vp = kmalloc_node(plsize, GFP_KERNEL, node);
+	pqp = (struct bau_pq_entry *)vp;
 	BUG_ON(!pqp);
-	pqp_malloc = pqp;
 
 	cp = (char *)pqp + 31;
-	pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
+	pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5);
 
 	for_each_present_cpu(cpu) {
 		if (pnode != uv_cpu_to_pnode(cpu))
 			continue;
 		/* for every cpu on this pnode: */
 		bcp = &per_cpu(bau_control, cpu);
-		bcp->va_queue_first = pqp;
-		bcp->bau_msg_head = pqp;
-		bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
+		bcp->queue_first	= pqp;
+		bcp->bau_msg_head	= pqp;
+		bcp->queue_last		= pqp + (DEST_Q_SIZE - 1);
 	}
 	/*
 	 * need the pnode of where the memory was really allocated
 	 */
-	pa = uv_gpa(pqp);
-	pn = pa >> uv_nshift;
-	uv_write_global_mmr64(pnode,
-			      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
-			      ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
-			      uv_physnodeaddr(pqp));
-	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
-			      uv_physnodeaddr(pqp));
-	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
-			      (unsigned long)
-			      uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
+	pn = uv_gpa(pqp) >> uv_nshift;
+	first = uv_physnodeaddr(pqp);
+	pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first;
+	last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1));
+	write_mmr_payload_first(pnode, pn_first);
+	write_mmr_payload_tail(pnode, first);
+	write_mmr_payload_last(pnode, last);
+
 	/* in effect, all msg_type's are set to MSG_NOOP */
-	memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
+	memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
 }
 
 /*
  * Initialization of each UV hub's structures
  */
-static void __init uv_init_uvhub(int uvhub, int vector, int base_pnode)
+static void __init init_uvhub(int uvhub, int vector, int base_pnode)
 {
 	int node;
 	int pnode;
@@ -1542,24 +1541,24 @@ static void __init uv_init_uvhub(int uvhub, int vector, int base_pnode)
 
 	node = uvhub_to_first_node(uvhub);
 	pnode = uv_blade_to_pnode(uvhub);
-	uv_activation_descriptor_init(node, pnode, base_pnode);
-	uv_payload_queue_init(node, pnode);
+
+	activation_descriptor_init(node, pnode, base_pnode);
+
+	pq_init(node, pnode);
 	/*
 	 * The below initialization can't be in firmware because the
 	 * messaging IRQ will be determined by the OS.
 	 */
 	apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
-	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
-				      ((apicid << 32) | vector));
+	write_mmr_data_config(pnode, ((apicid << 32) | vector));
 }
 
 /*
  * We will set BAU_MISC_CONTROL with a timeout period.
  * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
- * So the destination timeout period has be be calculated from them.
+ * So the destination timeout period has to be calculated from them.
  */
-static int
-calculate_destination_timeout(void)
+static int calculate_destination_timeout(void)
 {
 	unsigned long mmr_image;
 	int mult1;
@@ -1570,8 +1569,7 @@ calculate_destination_timeout(void)
 	unsigned long ts_ns;
 
 	if (is_uv1_hub()) {
-		mult1 = UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD &
-			BAU_MISC_CONTROL_MULT_MASK;
+		mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
 		mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
 		index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
 		mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
@@ -1583,7 +1581,7 @@ calculate_destination_timeout(void)
 		/* 4 bits  0/1 for 10/80us, 3 bits of multiplier */
 		mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
 		mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
-		if (mmr_image & ((unsigned long)1 << UV2_ACK_UNITS_SHFT))
+		if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
 			mult1 = 80;
 		else
 			mult1 = 10;
@@ -1593,62 +1591,69 @@ calculate_destination_timeout(void)
 	return ret;
 }
 
+static void __init init_per_cpu_tunables(void)
+{
+	int cpu;
+	struct bau_control *bcp;
+
+	for_each_present_cpu(cpu) {
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->baudisabled		= 0;
+		bcp->statp			= &per_cpu(ptcstats, cpu);
+		/* time interval to catch a hardware stay-busy bug */
+		bcp->timeout_interval		= usec_2_cycles(2*timeout_us);
+		bcp->max_concurr		= max_concurr;
+		bcp->max_concurr_const		= max_concurr;
+		bcp->plugged_delay		= plugged_delay;
+		bcp->plugsb4reset		= plugsb4reset;
+		bcp->timeoutsb4reset		= timeoutsb4reset;
+		bcp->ipi_reset_limit		= ipi_reset_limit;
+		bcp->complete_threshold		= complete_threshold;
+		bcp->cong_response_us		= congested_respns_us;
+		bcp->cong_reps			= congested_reps;
+		bcp->cong_period		= congested_period;
+	}
+}
+
 /*
- * initialize the bau_control structure for each cpu
+ * Scan all cpus to collect blade and socket summaries.
  */
-static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode)
+static int __init get_cpu_topology(int base_pnode,
+					struct uvhub_desc *uvhub_descs,
+					unsigned char *uvhub_mask)
 {
-	int i;
 	int cpu;
-	int tcpu;
 	int pnode;
 	int uvhub;
-	int have_hmaster;
-	short socket = 0;
-	unsigned short socket_mask;
-	unsigned char *uvhub_mask;
+	int socket;
 	struct bau_control *bcp;
 	struct uvhub_desc *bdp;
 	struct socket_desc *sdp;
-	struct bau_control *hmaster = NULL;
-	struct bau_control *smaster = NULL;
-	struct socket_desc {
-		short num_cpus;
-		short cpu_number[MAX_CPUS_PER_SOCKET];
-	};
-	struct uvhub_desc {
-		unsigned short socket_mask;
-		short num_cpus;
-		short uvhub;
-		short pnode;
-		struct socket_desc socket[2];
-	};
-	struct uvhub_desc *uvhub_descs;
 
-	timeout_us = calculate_destination_timeout();
-
-	uvhub_descs = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
-	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
-	uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
 	for_each_present_cpu(cpu) {
 		bcp = &per_cpu(bau_control, cpu);
+
 		memset(bcp, 0, sizeof(struct bau_control));
+
 		pnode = uv_cpu_hub_info(cpu)->pnode;
-		if ((pnode - base_part_pnode) >= UV_DISTRIBUTION_SIZE) {
+		if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) {
 			printk(KERN_EMERG
 				"cpu %d pnode %d-%d beyond %d; BAU disabled\n",
-				cpu, pnode, base_part_pnode,
-				UV_DISTRIBUTION_SIZE);
+				cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE);
 			return 1;
 		}
+
 		bcp->osnode = cpu_to_node(cpu);
-		bcp->partition_base_pnode = uv_partition_base_pnode;
+		bcp->partition_base_pnode = base_pnode;
+
 		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
 		*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
 		bdp = &uvhub_descs[uvhub];
+
 		bdp->num_cpus++;
 		bdp->uvhub = uvhub;
 		bdp->pnode = pnode;
+
 		/* kludge: 'assuming' one node per socket, and assuming that
 		   disabling a socket just leaves a gap in node numbers */
 		socket = bcp->osnode & 1;
@@ -1657,84 +1662,129 @@ static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode)
 		sdp->cpu_number[sdp->num_cpus] = cpu;
 		sdp->num_cpus++;
 		if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
-			printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus);
+			printk(KERN_EMERG "%d cpus per socket invalid\n",
+				sdp->num_cpus);
 			return 1;
 		}
 	}
+	return 0;
+}
+
+/*
+ * Each socket is to get a local array of pnodes/hubs.
+ */
+static void make_per_cpu_thp(struct bau_control *smaster)
+{
+	int cpu;
+	size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus();
+
+	smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
+	memset(smaster->thp, 0, hpsz);
+	for_each_present_cpu(cpu) {
+		smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode;
+		smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+	}
+}
+
+/*
+ * Initialize all the per_cpu information for the cpu's on a given socket,
+ * given what has been gathered into the socket_desc struct.
+ * And reports the chosen hub and socket masters back to the caller.
+ */
+static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
+			struct bau_control **smasterp,
+			struct bau_control **hmasterp)
+{
+	int i;
+	int cpu;
+	struct bau_control *bcp;
+
+	for (i = 0; i < sdp->num_cpus; i++) {
+		cpu = sdp->cpu_number[i];
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->cpu = cpu;
+		if (i == 0) {
+			*smasterp = bcp;
+			if (!(*hmasterp))
+				*hmasterp = bcp;
+		}
+		bcp->cpus_in_uvhub = bdp->num_cpus;
+		bcp->cpus_in_socket = sdp->num_cpus;
+		bcp->socket_master = *smasterp;
+		bcp->uvhub = bdp->uvhub;
+		bcp->uvhub_master = *hmasterp;
+		bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+		if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
+			printk(KERN_EMERG "%d cpus per uvhub invalid\n",
+				bcp->uvhub_cpu);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Summarize the blade and socket topology into the per_cpu structures.
+ */
+static int __init summarize_uvhub_sockets(int nuvhubs,
+			struct uvhub_desc *uvhub_descs,
+			unsigned char *uvhub_mask)
+{
+	int socket;
+	int uvhub;
+	unsigned short socket_mask;
+
 	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+		struct uvhub_desc *bdp;
+		struct bau_control *smaster = NULL;
+		struct bau_control *hmaster = NULL;
+
 		if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
 			continue;
-		have_hmaster = 0;
+
 		bdp = &uvhub_descs[uvhub];
 		socket_mask = bdp->socket_mask;
 		socket = 0;
 		while (socket_mask) {
-			if (!(socket_mask & 1))
-				goto nextsocket;
-			sdp = &bdp->socket[socket];
-			for (i = 0; i < sdp->num_cpus; i++) {
-				cpu = sdp->cpu_number[i];
-				bcp = &per_cpu(bau_control, cpu);
-				bcp->cpu = cpu;
-				if (i == 0) {
-					smaster = bcp;
-					if (!have_hmaster) {
-						have_hmaster++;
-						hmaster = bcp;
-					}
-				}
-				bcp->cpus_in_uvhub = bdp->num_cpus;
-				bcp->cpus_in_socket = sdp->num_cpus;
-				bcp->socket_master = smaster;
-				bcp->uvhub = bdp->uvhub;
-				bcp->uvhub_master = hmaster;
-				bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
-						blade_processor_id;
-				if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
-					printk(KERN_EMERG
-						"%d cpus per uvhub invalid\n",
-						bcp->uvhub_cpu);
+			struct socket_desc *sdp;
+			if ((socket_mask & 1)) {
+				sdp = &bdp->socket[socket];
+				if (scan_sock(sdp, bdp, &smaster, &hmaster))
 					return 1;
-				}
 			}
-nextsocket:
 			socket++;
 			socket_mask = (socket_mask >> 1);
-			/* each socket gets a local array of pnodes/hubs */
-			bcp = smaster;
-			bcp->target_hub_and_pnode = kmalloc_node(
-				sizeof(struct hub_and_pnode) *
-				num_possible_cpus(), GFP_KERNEL, bcp->osnode);
-			memset(bcp->target_hub_and_pnode, 0,
-				sizeof(struct hub_and_pnode) *
-				num_possible_cpus());
-			for_each_present_cpu(tcpu) {
-				bcp->target_hub_and_pnode[tcpu].pnode =
-					uv_cpu_hub_info(tcpu)->pnode;
-				bcp->target_hub_and_pnode[tcpu].uvhub =
-					uv_cpu_hub_info(tcpu)->numa_blade_id;
-			}
+			make_per_cpu_thp(smaster);
 		}
 	}
+	return 0;
+}
+
+/*
+ * initialize the bau_control structure for each cpu
+ */
+static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
+{
+	unsigned char *uvhub_mask;
+	void *vp;
+	struct uvhub_desc *uvhub_descs;
+
+	timeout_us = calculate_destination_timeout();
+
+	vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
+	uvhub_descs = (struct uvhub_desc *)vp;
+	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+	uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
+
+	if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
+		return 1;
+
+	if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask))
+		return 1;
+
 	kfree(uvhub_descs);
 	kfree(uvhub_mask);
-	for_each_present_cpu(cpu) {
-		bcp = &per_cpu(bau_control, cpu);
-		bcp->baudisabled = 0;
-		bcp->statp = &per_cpu(ptcstats, cpu);
-		/* time interval to catch a hardware stay-busy bug */
-		bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
-		bcp->max_bau_concurrent = max_bau_concurrent;
-		bcp->max_bau_concurrent_constant = max_bau_concurrent;
-		bcp->plugged_delay = plugged_delay;
-		bcp->plugsb4reset = plugsb4reset;
-		bcp->timeoutsb4reset = timeoutsb4reset;
-		bcp->ipi_reset_limit = ipi_reset_limit;
-		bcp->complete_threshold = complete_threshold;
-		bcp->congested_response_us = congested_response_us;
-		bcp->congested_reps = congested_reps;
-		bcp->congested_period = congested_period;
-	}
+	init_per_cpu_tunables();
 	return 0;
 }
 
@@ -1747,8 +1797,9 @@ static int __init uv_bau_init(void)
 	int pnode;
 	int nuvhubs;
 	int cur_cpu;
+	int cpus;
 	int vector;
-	unsigned long mmr;
+	cpumask_var_t *mask;
 
 	if (!is_uv_system())
 		return 0;
@@ -1756,24 +1807,25 @@ static int __init uv_bau_init(void)
 	if (nobau)
 		return 0;
 
-	for_each_possible_cpu(cur_cpu)
-		zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
-				       GFP_KERNEL, cpu_to_node(cur_cpu));
+	for_each_possible_cpu(cur_cpu) {
+		mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
+		zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
+	}
 
 	uv_nshift = uv_hub_info->m_val;
 	uv_mmask = (1UL << uv_hub_info->m_val) - 1;
 	nuvhubs = uv_num_possible_blades();
 	spin_lock_init(&disable_lock);
-	congested_cycles = microsec_2_cycles(congested_response_us);
+	congested_cycles = usec_2_cycles(congested_respns_us);
 
-	uv_partition_base_pnode = 0x7fffffff;
+	uv_base_pnode = 0x7fffffff;
 	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
-		if (uv_blade_nr_possible_cpus(uvhub) &&
-			(uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
-			uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
+		cpus = uv_blade_nr_possible_cpus(uvhub);
+		if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode))
+			uv_base_pnode = uv_blade_to_pnode(uvhub);
 	}
 
-	if (uv_init_per_cpu(nuvhubs, uv_partition_base_pnode)) {
+	if (init_per_cpu(nuvhubs, uv_base_pnode)) {
 		nobau = 1;
 		return 0;
 	}
@@ -1781,21 +1833,21 @@ static int __init uv_bau_init(void)
 	vector = UV_BAU_MESSAGE;
 	for_each_possible_blade(uvhub)
 		if (uv_blade_nr_possible_cpus(uvhub))
-			uv_init_uvhub(uvhub, vector, uv_partition_base_pnode);
+			init_uvhub(uvhub, vector, uv_base_pnode);
 
-	uv_enable_timeouts();
+	enable_timeouts();
 	alloc_intr_gate(vector, uv_bau_message_intr1);
 
 	for_each_possible_blade(uvhub) {
 		if (uv_blade_nr_possible_cpus(uvhub)) {
+			unsigned long val;
+			unsigned long mmr;
 			pnode = uv_blade_to_pnode(uvhub);
 			/* INIT the bau */
-			uv_write_global_mmr64(pnode,
-					UVH_LB_BAU_SB_ACTIVATION_CONTROL,
-					((unsigned long)1 << 63));
+			val = 1L << 63;
+			write_gmmr_activation(pnode, val);
 			mmr = 1; /* should be 1 to broadcast to both sockets */
-			uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
-						mmr);
+			write_mmr_data_broadcast(pnode, mmr);
 		}
 	}
 
-- 
cgit v1.2.3-70-g09d2


From aecb7b64dd9e2512c7a4c7e61dd781415d3dac5a Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@st.com>
Date: Tue, 24 May 2011 14:04:09 +0530
Subject: dmaengine/dw_dmac: Update maintainer-ship

Nobody is currently maintaining dw_dmac. We are using dw_dmac for SPEAr13xx and
are currently maintaining it. After discussing with Vinod, sending this patch to
update maintainer-ship of dw_dmac.

Signed-off-by: Viresh Kumar <viresh.kumar@st.com>
Acked-by: Havard Skinnemoen <hskinnemoen@gmail.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 MAINTAINERS                | 7 +++++++
 drivers/dma/dw_dmac.c      | 2 ++
 drivers/dma/dw_dmac_regs.h | 1 +
 include/linux/dw_dmac.h    | 1 +
 4 files changed, 11 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 6b4b9cdec37..8f4413014b2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5401,6 +5401,13 @@ L:	linux-serial@vger.kernel.org
 S:	Maintained
 F:	drivers/tty/serial
 
+SYNOPSYS DESIGNWARE DMAC DRIVER
+M:	Viresh Kumar <viresh.kumar@st.com>
+S:	Maintained
+F:	include/linux/dw_dmac.h
+F:	drivers/dma/dw_dmac_regs.h
+F:	drivers/dma/dw_dmac.c
+
 TIMEKEEPING, NTP
 M:	John Stultz <johnstul@us.ibm.com>
 M:	Thomas Gleixner <tglx@linutronix.de>
diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c
index eec675bf4f9..efd836dfb65 100644
--- a/drivers/dma/dw_dmac.c
+++ b/drivers/dma/dw_dmac.c
@@ -3,6 +3,7 @@
  * AVR32 systems.)
  *
  * Copyright (C) 2007-2008 Atmel Corporation
+ * Copyright (C) 2010-2011 ST Microelectronics
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -1575,3 +1576,4 @@ module_exit(dw_exit);
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("Synopsys DesignWare DMA Controller driver");
 MODULE_AUTHOR("Haavard Skinnemoen <haavard.skinnemoen@atmel.com>");
+MODULE_AUTHOR("Viresh Kumar <viresh.kumar@st.com>");
diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h
index c968597c32a..c3419518d70 100644
--- a/drivers/dma/dw_dmac_regs.h
+++ b/drivers/dma/dw_dmac_regs.h
@@ -2,6 +2,7 @@
  * Driver for the Synopsys DesignWare AHB DMA Controller
  *
  * Copyright (C) 2005-2007 Atmel Corporation
+ * Copyright (C) 2010-2011 ST Microelectronics
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/include/linux/dw_dmac.h b/include/linux/dw_dmac.h
index 6998d9376ef..4bfe0a2f7d5 100644
--- a/include/linux/dw_dmac.h
+++ b/include/linux/dw_dmac.h
@@ -3,6 +3,7 @@
  * AVR32 systems.)
  *
  * Copyright (C) 2007 Atmel Corporation
+ * Copyright (C) 2010-2011 ST Microelectronics
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
-- 
cgit v1.2.3-70-g09d2


From af6a25f0e1ec0265c267e6ee4513925eaba6d0ed Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Tue, 24 May 2011 14:49:59 +0100
Subject: x86: Reorder mm_context_t to remove x86_64 alignment padding and thus
 shrink mm_struct

Reorder mm_context_t to remove alignment padding on 64 bit
builds shrinking its size from 64 to 56 bytes.

This allows mm_struct to shrink from 840 to 832 bytes, so using
one fewer cache lines, and getting more objects per slab when
using slub.

slabinfo mm_struct reports
before :-

    Sizes (bytes)     Slabs
    -----------------------------------
    Object :     840  Total  :       7
    SlabObj:     896  Full   :       1
    SlabSiz:   16384  Partial:       4
    Loss   :      56  CpuSlab:       2
    Align  :      64  Objects:      18

after :-

    Sizes (bytes)     Slabs
    ----------------------------------
    Object :     832  Total  :       7
    SlabObj:     832  Full   :       1
    SlabSiz:   16384  Partial:       4
    Loss   :       0  CpuSlab:       2
    Align  :      64  Objects:      19

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
Cc: wilsons@start.ca
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1306244999.1999.5.camel@castor.rsk
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/mmu.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index aeff3e89b22..5f55e696276 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -11,14 +11,14 @@
 typedef struct {
 	void *ldt;
 	int size;
-	struct mutex lock;
-	void *vdso;
 
 #ifdef CONFIG_X86_64
 	/* True if mm supports a task running in 32 bit compatibility mode. */
 	unsigned short ia32_compat;
 #endif
 
+	struct mutex lock;
+	void *vdso;
 } mm_context_t;
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3-70-g09d2


From 8b27f2ff7a24f0735c96055e676872f05398d99b Mon Sep 17 00:00:00 2001
From: Nikhil P Rao <nikhil.rao@intel.com>
Date: Wed, 25 May 2011 10:18:41 -0700
Subject: x86: Remove unnecessary check in detect_ht()

This patch removes a check that causes incorrect scheduler
domain setup (SMP instead of SMT) and bootlog warning messages
when cpuid extensions for topology enumeration are not supported
and the number of processors reported to the OS is smaller than
smp_num_siblings.

Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Nikhil P Rao <nikhil.rao@intel.com>
Link: http://lkml.kernel.org/r/1306343921.19325.1.camel@fedora13
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/common.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c8b41623377..53f02f5d8cc 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -477,13 +477,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 	if (smp_num_siblings <= 1)
 		goto out;
 
-	if (smp_num_siblings > nr_cpu_ids) {
-		pr_warning("CPU: Unsupported number of siblings %d",
-			   smp_num_siblings);
-		smp_num_siblings = 1;
-		return;
-	}
-
 	index_msb = get_count_order(smp_num_siblings);
 	c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
 
-- 
cgit v1.2.3-70-g09d2


From 46b2903c05b248ed78304113ecfba368b4c55def Mon Sep 17 00:00:00 2001
From: Vinod Koul <vinod.koul@intel.com>
Date: Wed, 25 May 2011 14:49:20 -0700
Subject: dmaengine: Add API documentation for slave dma usage

Signed-off-by: Vinod Koul <vinod.koul@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/dmaengine.txt | 97 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 96 insertions(+), 1 deletion(-)

diff --git a/Documentation/dmaengine.txt b/Documentation/dmaengine.txt
index 0c1c2f63c0a..5a0cb1ef616 100644
--- a/Documentation/dmaengine.txt
+++ b/Documentation/dmaengine.txt
@@ -1 +1,96 @@
-See Documentation/crypto/async-tx-api.txt
+			DMA Engine API Guide
+			====================
+
+		 Vinod Koul <vinod dot koul at intel.com>
+
+NOTE: For DMA Engine usage in async_tx please see:
+	Documentation/crypto/async-tx-api.txt
+
+
+Below is a guide to device driver writers on how to use the Slave-DMA API of the
+DMA Engine. This is applicable only for slave DMA usage only.
+
+The slave DMA usage consists of following steps
+1. Allocate a DMA slave channel
+2. Set slave and controller specific parameters
+3. Get a descriptor for transaction
+4. Submit the transaction and wait for callback notification
+
+1. Allocate a DMA slave channel
+Channel allocation is slightly different in the slave DMA context, client
+drivers typically need a channel from a particular DMA controller only and even
+in some cases a specific channel is desired. To request a channel
+dma_request_channel() API is used.
+
+Interface:
+struct dma_chan *dma_request_channel(dma_cap_mask_t mask,
+		dma_filter_fn filter_fn,
+		void *filter_param);
+where dma_filter_fn is defined as:
+typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param);
+
+When the optional 'filter_fn' parameter is set to NULL dma_request_channel
+simply returns the first channel that satisfies the capability mask.  Otherwise,
+when the mask parameter is insufficient for specifying the necessary channel,
+the filter_fn routine can be used to disposition the available channels in the
+system. The filter_fn routine is called once for each free channel in the
+system.  Upon seeing a suitable channel filter_fn returns DMA_ACK which flags
+that channel to be the return value from dma_request_channel.  A channel
+allocated via this interface is exclusive to the caller, until
+dma_release_channel() is called.
+
+2. Set slave and controller specific parameters
+Next step is always to pass some specific information to the DMA driver. Most of
+the generic information which a slave DMA can use is in struct dma_slave_config.
+It allows the clients to specify DMA direction, DMA addresses, bus widths, DMA
+burst lengths etc. If some DMA controllers have more parameters to be sent then
+they should try to embed struct dma_slave_config in their controller specific
+structure. That gives flexibility to client to pass more parameters, if
+required.
+
+Interface:
+int dmaengine_slave_config(struct dma_chan *chan,
+					  struct dma_slave_config *config)
+
+3. Get a descriptor for transaction
+For slave usage the various modes of slave transfers supported by the
+DMA-engine are:
+slave_sg	- DMA a list of scatter gather buffers from/to a peripheral
+dma_cyclic	- Perform a cyclic DMA operation from/to a peripheral till the
+		  operation is explicitly stopped.
+The non NULL return of this transfer API represents a "descriptor" for the given
+transaction.
+
+Interface:
+struct dma_async_tx_descriptor *(*chan->device->device_prep_dma_sg)(
+		struct dma_chan *chan,
+		struct scatterlist *dst_sg, unsigned int dst_nents,
+		struct scatterlist *src_sg, unsigned int src_nents,
+		unsigned long flags);
+struct dma_async_tx_descriptor *(*chan->device->device_prep_dma_cyclic)(
+		struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
+		size_t period_len, enum dma_data_direction direction);
+
+4. Submit the transaction and wait for callback notification
+To schedule the transaction to be scheduled by dma device, the "descriptor"
+returned in above (3) needs to be submitted.
+To tell the dma driver that a transaction is ready to be serviced, the
+descriptor->submit() callback needs to be invoked. This chains the descriptor to
+the pending queue.
+The transactions in the pending queue can be activated by calling the
+issue_pending API. If channel is idle then the first transaction in queue is
+started and subsequent ones queued up.
+On completion of the DMA operation the next in queue is submitted and a tasklet
+triggered. The tasklet would then call the client driver completion callback
+routine for notification, if set.
+Interface:
+void dma_async_issue_pending(struct dma_chan *chan);
+
+==============================================================================
+
+Additional usage notes for dma driver writers
+1/ Although DMA engine specifies that completion callback routines cannot submit
+any new operations, but typically for slave DMA subsequent transaction may not
+be available for submit prior to callback routine being called. This requirement
+is not a requirement for DMA-slave devices. But they should take care to drop
+the spin-lock they might be holding before calling the callback routine
-- 
cgit v1.2.3-70-g09d2


From 0714a57c6865f31ed3366e5abdfae7da580d41d2 Mon Sep 17 00:00:00 2001
From: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Date: Tue, 24 May 2011 11:53:29 -0700
Subject: xhci: Clear stopped_td when Stop Endpoint command completes.

When an URB is cancelled, the xHCI driver issues a Stop Endpoint command
so that it can manipulate the ring and remove the transfer.  The xHC
hardware then places a transfer event with the completion code "Stopped"
or "Stopped Invalid" to let the driver know what TD it was in the middle
of processing.  This TD and TRB is stored in ep->stopped_td and
ep->stopped_trb.  These pointers are also used in handling stalled
endpoints.

By design, the Stop Endpoint command can race with URB completion.  By
the time the Stop Endpoint command is handled, the URBs to be cancelled
may have been given back to the driver.  Unfortunately, the stopped_td
and stopped_trb pointers were not getting cleared in this case.

The USB core unconditionally tries to reset the toggle bits on any
endpoints when a new alternate interface setting is installed.  When the
xHCI driver saw that ep->stopped_td was still set from the Stop Endpoint
command, xhci_reset_endpoint assumed the endpoint was actually stalled,
and attempted to clean up the endpoint rings.  This would manifest
itself in a failed Reset Endpoint command and failed Set TR dequeue
Pointer command after a successful Configure Endpoint command.  It may
have also been causing driver oops when the stopped_td was accessed.

This patch should be backported to stable kernels since 2.6.31.  Before
2.6.33, stopped_td was found in the xhci_endpoint_ring, not the
xhci_virt_ep.

Signed-off-by: Sarah Sharp <sarah.a.sharp@linux.intel.com>
---
 drivers/usb/host/xhci-ring.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 237a765f8d1..ba551239d28 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -692,6 +692,8 @@ static void handle_stopped_endpoint(struct xhci_hcd *xhci,
 
 	if (list_empty(&ep->cancelled_td_list)) {
 		xhci_stop_watchdog_timer_in_irq(xhci, ep);
+		ep->stopped_td = NULL;
+		ep->stopped_trb = NULL;
 		ring_doorbell_for_active_rings(xhci, slot_id, ep_index);
 		return;
 	}
-- 
cgit v1.2.3-70-g09d2


From fe6c6c13d8dcb32398eef143e2e8efc3fb4019bf Mon Sep 17 00:00:00 2001
From: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Date: Mon, 23 May 2011 16:41:17 -0700
Subject: xhci: Don't submit commands when the host is dead.

When the xHCI host controller dies, the USB core may attempt to reset the
devices to their default configuration before disconnecting them.  This
causes calls into the xHCI bandwidth allocation functions.  Don't allow
those functions to submit commands or work on xHCI structures if the host
controller is marked as dying.

Signed-off-by: Sarah Sharp <sarah.a.sharp@linux.intel.com>
---
 drivers/usb/host/xhci.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 8f2a56ece44..58183d2a808 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -1314,8 +1314,10 @@ int xhci_drop_endpoint(struct usb_hcd *hcd, struct usb_device *udev,
 	if (ret <= 0)
 		return ret;
 	xhci = hcd_to_xhci(hcd);
-	xhci_dbg(xhci, "%s called for udev %p\n", __func__, udev);
+	if (xhci->xhc_state & XHCI_STATE_DYING)
+		return -ENODEV;
 
+	xhci_dbg(xhci, "%s called for udev %p\n", __func__, udev);
 	drop_flag = xhci_get_endpoint_flag(&ep->desc);
 	if (drop_flag == SLOT_FLAG || drop_flag == EP0_FLAG) {
 		xhci_dbg(xhci, "xHCI %s - can't drop slot or ep 0 %#x\n",
@@ -1401,6 +1403,8 @@ int xhci_add_endpoint(struct usb_hcd *hcd, struct usb_device *udev,
 		return ret;
 	}
 	xhci = hcd_to_xhci(hcd);
+	if (xhci->xhc_state & XHCI_STATE_DYING)
+		return -ENODEV;
 
 	added_ctxs = xhci_get_endpoint_flag(&ep->desc);
 	last_ctx = xhci_last_valid_endpoint(added_ctxs);
@@ -1676,6 +1680,8 @@ int xhci_check_bandwidth(struct usb_hcd *hcd, struct usb_device *udev)
 	if (ret <= 0)
 		return ret;
 	xhci = hcd_to_xhci(hcd);
+	if (xhci->xhc_state & XHCI_STATE_DYING)
+		return -ENODEV;
 
 	xhci_dbg(xhci, "%s called for udev %p\n", __func__, udev);
 	virt_dev = xhci->devs[udev->slot_id];
-- 
cgit v1.2.3-70-g09d2


From 380032c3c855ded74c43252a0adb8e8d7afc9f45 Mon Sep 17 00:00:00 2001
From: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Date: Tue, 5 Apr 2011 13:33:56 -0700
Subject: xhci: STFU: Remove function tracing.

Remove unnecessary debugging from the xHCI driver.  We don't need to
know what function we're calling or returning from.  Now I know how to
use markup-oops.pl to de-mystify stack dumps of crashes.

Signed-off-by: Sarah Sharp <sarah.a.sharp@linux.intel.com>
---
 drivers/usb/host/xhci-ring.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index ba551239d28..4794905d38d 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -2182,7 +2182,6 @@ static int xhci_handle_event(struct xhci_hcd *xhci)
 	int update_ptrs = 1;
 	int ret;
 
-	xhci_dbg(xhci, "In %s\n", __func__);
 	if (!xhci->event_ring || !xhci->event_ring->dequeue) {
 		xhci->error_bitmask |= 1 << 1;
 		return 0;
@@ -2195,7 +2194,6 @@ static int xhci_handle_event(struct xhci_hcd *xhci)
 		xhci->error_bitmask |= 1 << 2;
 		return 0;
 	}
-	xhci_dbg(xhci, "%s - OS owns TRB\n", __func__);
 
 	/*
 	 * Barrier between reading the TRB_CYCLE (valid) flag above and any
@@ -2205,20 +2203,14 @@ static int xhci_handle_event(struct xhci_hcd *xhci)
 	/* FIXME: Handle more event types. */
 	switch ((le32_to_cpu(event->event_cmd.flags) & TRB_TYPE_BITMASK)) {
 	case TRB_TYPE(TRB_COMPLETION):
-		xhci_dbg(xhci, "%s - calling handle_cmd_completion\n", __func__);
 		handle_cmd_completion(xhci, &event->event_cmd);
-		xhci_dbg(xhci, "%s - returned from handle_cmd_completion\n", __func__);
 		break;
 	case TRB_TYPE(TRB_PORT_STATUS):
-		xhci_dbg(xhci, "%s - calling handle_port_status\n", __func__);
 		handle_port_status(xhci, event);
-		xhci_dbg(xhci, "%s - returned from handle_port_status\n", __func__);
 		update_ptrs = 0;
 		break;
 	case TRB_TYPE(TRB_TRANSFER):
-		xhci_dbg(xhci, "%s - calling handle_tx_event\n", __func__);
 		ret = handle_tx_event(xhci, &event->trans_event);
-		xhci_dbg(xhci, "%s - returned from handle_tx_event\n", __func__);
 		if (ret < 0)
 			xhci->error_bitmask |= 1 << 9;
 		else
-- 
cgit v1.2.3-70-g09d2


From 5153b7b39105d8beb38e1c3f26ab4b877960d8e1 Mon Sep 17 00:00:00 2001
From: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Date: Tue, 5 Apr 2011 13:33:56 -0700
Subject: xhci: STFU: Don't print event ring dequeue pointer.

Stop printing out the event ring dequeue pointer and status register in
the operational register set.  The host will report an OK status 99% of
the time the interrupt handler is called, and usually when it's really
hosed, a host controller won't even call the interrupt handler.  So the
line is really useless.

Signed-off-by: Sarah Sharp <sarah.a.sharp@linux.intel.com>
---
 drivers/usb/host/xhci-ring.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 4794905d38d..f1ae1723403 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -167,9 +167,7 @@ static void inc_deq(struct xhci_hcd *xhci, struct xhci_ring *ring, bool consumer
 		next = ring->dequeue;
 	}
 	addr = (unsigned long long) xhci_trb_virt_to_dma(ring->deq_seg, ring->dequeue);
-	if (ring == xhci->event_ring)
-		xhci_dbg(xhci, "Event ring deq = 0x%llx (DMA)\n", addr);
-	else if (ring == xhci->cmd_ring)
+	if (ring == xhci->cmd_ring)
 		xhci_dbg(xhci, "Command ring deq = 0x%llx (DMA)\n", addr);
 	else
 		xhci_dbg(xhci, "Ring deq = 0x%llx (DMA)\n", addr);
@@ -2267,16 +2265,6 @@ irqreturn_t xhci_irq(struct usb_hcd *hcd)
 		spin_unlock(&xhci->lock);
 		return IRQ_NONE;
 	}
-	xhci_dbg(xhci, "op reg status = %08x\n", status);
-	xhci_dbg(xhci, "Event ring dequeue ptr:\n");
-	xhci_dbg(xhci, "@%llx %08x %08x %08x %08x\n",
-		 (unsigned long long)
-		 xhci_trb_virt_to_dma(xhci->event_ring->deq_seg, trb),
-		 lower_32_bits(le64_to_cpu(trb->link.segment_ptr)),
-		 upper_32_bits(le64_to_cpu(trb->link.segment_ptr)),
-		 (unsigned int) le32_to_cpu(trb->link.intr_target),
-		 (unsigned int) le32_to_cpu(trb->link.control));
-
 	if (status & STS_FATAL) {
 		xhci_warn(xhci, "WARNING: Host System Error\n");
 		xhci_halt(xhci);
-- 
cgit v1.2.3-70-g09d2


From f444ff27e9b8c953eef49da65c649fdcd202165a Mon Sep 17 00:00:00 2001
From: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Date: Tue, 5 Apr 2011 15:53:47 -0700
Subject: xhci: STFU: Be quieter during URB submission and completion.

Unsurprisingly, URBs get submitted and completed a lot in the xHCI
driver.  If we have to print 10 lines of debug for every URB submitted
or completed, then that can cause the whole system to stay in the
interrupt handler too long, and can cause Missed Service completion
codes for isochronous transfers.

Cut down the debugging in the URB submission and completion paths:
 - Don't squawk about successful transfers, only unsuccessful ones.
 - Only print the number of bytes transferred if this was a short
   transfer.
 - Don't print the endpoint index for successful transfers (will add
   more debug to failed transfers to show endpoint index there later).
 - Stop printing MMIO writes.  This debugging shows up when the endpoint
   doorbell is rung a to start a transfer (basically for every URB).
 - Don't print out the ring enqueue and dequeue pointers
 - Stop printing when we're pointing to a link TRB.

Signed-off-by: Sarah Sharp <sarah.a.sharp@linux.intel.com>
---
 drivers/usb/host/xhci-ring.c | 46 +++++++++++++++-----------------------------
 drivers/usb/host/xhci.h      |  6 ------
 2 files changed, 15 insertions(+), 37 deletions(-)

diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index f1ae1723403..d1498c03c4c 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -167,10 +167,6 @@ static void inc_deq(struct xhci_hcd *xhci, struct xhci_ring *ring, bool consumer
 		next = ring->dequeue;
 	}
 	addr = (unsigned long long) xhci_trb_virt_to_dma(ring->deq_seg, ring->dequeue);
-	if (ring == xhci->cmd_ring)
-		xhci_dbg(xhci, "Command ring deq = 0x%llx (DMA)\n", addr);
-	else
-		xhci_dbg(xhci, "Ring deq = 0x%llx (DMA)\n", addr);
 }
 
 /*
@@ -246,12 +242,6 @@ static void inc_enq(struct xhci_hcd *xhci, struct xhci_ring *ring,
 		next = ring->enqueue;
 	}
 	addr = (unsigned long long) xhci_trb_virt_to_dma(ring->enq_seg, ring->enqueue);
-	if (ring == xhci->event_ring)
-		xhci_dbg(xhci, "Event ring enq = 0x%llx (DMA)\n", addr);
-	else if (ring == xhci->cmd_ring)
-		xhci_dbg(xhci, "Command ring enq = 0x%llx (DMA)\n", addr);
-	else
-		xhci_dbg(xhci, "Ring enq = 0x%llx (DMA)\n", addr);
 }
 
 /*
@@ -634,13 +624,11 @@ static void xhci_giveback_urb_in_irq(struct xhci_hcd *xhci,
 			}
 		}
 		usb_hcd_unlink_urb_from_ep(hcd, urb);
-		xhci_dbg(xhci, "Giveback %s URB %p\n", adjective, urb);
 
 		spin_unlock(&xhci->lock);
 		usb_hcd_giveback_urb(hcd, urb, status);
 		xhci_urb_free_priv(xhci, urb_priv);
 		spin_lock(&xhci->lock);
-		xhci_dbg(xhci, "%s URB given back\n", adjective);
 	}
 }
 
@@ -1630,7 +1618,6 @@ static int process_ctrl_td(struct xhci_hcd *xhci, struct xhci_td *td,
 					"without IOC set??\n");
 			*status = -ESHUTDOWN;
 		} else {
-			xhci_dbg(xhci, "Successful control transfer!\n");
 			*status = 0;
 		}
 		break;
@@ -1727,7 +1714,6 @@ static int process_isoc_td(struct xhci_hcd *xhci, struct xhci_td *td,
 	switch (trb_comp_code) {
 	case COMP_SUCCESS:
 		frame->status = 0;
-		xhci_dbg(xhci, "Successful isoc transfer!\n");
 		break;
 	case COMP_SHORT_TX:
 		frame->status = td->urb->transfer_flags & URB_SHORT_NOT_OK ?
@@ -1837,12 +1823,6 @@ static int process_bulk_intr_td(struct xhci_hcd *xhci, struct xhci_td *td,
 			else
 				*status = 0;
 		} else {
-			if (usb_endpoint_xfer_bulk(&td->urb->ep->desc))
-				xhci_dbg(xhci, "Successful bulk "
-						"transfer!\n");
-			else
-				xhci_dbg(xhci, "Successful interrupt "
-						"transfer!\n");
 			*status = 0;
 		}
 		break;
@@ -1856,11 +1836,12 @@ static int process_bulk_intr_td(struct xhci_hcd *xhci, struct xhci_td *td,
 		/* Others already handled above */
 		break;
 	}
-	xhci_dbg(xhci, "ep %#x - asked for %d bytes, "
-			"%d bytes untransferred\n",
-			td->urb->ep->desc.bEndpointAddress,
-			td->urb->transfer_buffer_length,
-		 TRB_LEN(le32_to_cpu(event->transfer_len)));
+	if (trb_comp_code == COMP_SHORT_TX)
+		xhci_dbg(xhci, "ep %#x - asked for %d bytes, "
+				"%d bytes untransferred\n",
+				td->urb->ep->desc.bEndpointAddress,
+				td->urb->transfer_buffer_length,
+				TRB_LEN(le32_to_cpu(event->transfer_len)));
 	/* Fast path - was this the last TRB in the TD for this URB? */
 	if (event_trb == td->last_trb) {
 		if (TRB_LEN(le32_to_cpu(event->transfer_len)) != 0) {
@@ -1954,7 +1935,6 @@ static int handle_tx_event(struct xhci_hcd *xhci,
 
 	/* Endpoint ID is 1 based, our index is zero based */
 	ep_index = TRB_TO_EP_ID(le32_to_cpu(event->flags)) - 1;
-	xhci_dbg(xhci, "%s - ep index = %d\n", __func__, ep_index);
 	ep = &xdev->eps[ep_index];
 	ep_ring = xhci_dma_to_transfer_ring(ep, le64_to_cpu(event->buffer));
 	ep_ctx = xhci_get_ep_ctx(xhci, xdev->out_ctx, ep_index);
@@ -2149,9 +2129,15 @@ cleanup:
 				xhci_urb_free_priv(xhci, urb_priv);
 
 			usb_hcd_unlink_urb_from_ep(bus_to_hcd(urb->dev->bus), urb);
-			xhci_dbg(xhci, "Giveback URB %p, len = %d, "
-					"status = %d\n",
-					urb, urb->actual_length, status);
+			if ((urb->actual_length != urb->transfer_buffer_length &&
+						(urb->transfer_flags &
+						 URB_SHORT_NOT_OK)) ||
+					status != 0)
+				xhci_dbg(xhci, "Giveback URB %p, len = %d, "
+						"expected = %x, status = %d\n",
+						urb, urb->actual_length,
+						urb->transfer_buffer_length,
+						status);
 			spin_unlock(&xhci->lock);
 			usb_hcd_giveback_urb(bus_to_hcd(urb->dev->bus), urb, status);
 			spin_lock(&xhci->lock);
@@ -2379,7 +2365,6 @@ static int prepare_ring(struct xhci_hcd *xhci, struct xhci_ring *ep_ring,
 		u32 ep_state, unsigned int num_trbs, gfp_t mem_flags)
 {
 	/* Make sure the endpoint has been added to xHC schedule */
-	xhci_dbg(xhci, "Endpoint state = 0x%x\n", ep_state);
 	switch (ep_state) {
 	case EP_STATE_DISABLED:
 		/*
@@ -2416,7 +2401,6 @@ static int prepare_ring(struct xhci_hcd *xhci, struct xhci_ring *ep_ring,
 		struct xhci_ring *ring = ep_ring;
 		union xhci_trb *next;
 
-		xhci_dbg(xhci, "prepare_ring: pointing to link trb\n");
 		next = ring->enqueue;
 
 		while (last_trb(xhci, ring, ring->enq_seg, next)) {
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index e12db7cfb9b..0772a8cfea2 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1338,9 +1338,6 @@ static inline unsigned int xhci_readl(const struct xhci_hcd *xhci,
 static inline void xhci_writel(struct xhci_hcd *xhci,
 		const unsigned int val, __le32 __iomem *regs)
 {
-	xhci_dbg(xhci,
-			"`MEM_WRITE_DWORD(3'b000, 32'h%p, 32'h%0x, 4'hf);\n",
-			regs, val);
 	writel(val, regs);
 }
 
@@ -1368,9 +1365,6 @@ static inline void xhci_write_64(struct xhci_hcd *xhci,
 	u32 val_lo = lower_32_bits(val);
 	u32 val_hi = upper_32_bits(val);
 
-	xhci_dbg(xhci,
-			"`MEM_WRITE_DWORD(3'b000, 64'h%p, 64'h%0lx, 4'hf);\n",
-			regs, (long unsigned int) val);
 	writel(val_lo, ptr);
 	writel(val_hi, ptr + 1);
 }
-- 
cgit v1.2.3-70-g09d2


From f29c50419c8d1998edd759f1990c4243a248f469 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 May 2011 14:35:33 -0400
Subject: maccess,probe_kernel: Make write/read src const void *

The functions probe_kernel_write() and probe_kernel_read() do not modify
the src pointer. Allow const pointers to be passed in without the need
of a typecast.

Acked-by: Mike Frysinger <vapier@gentoo.org>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/1305824936.1465.4.camel@gandalf.stny.rr.com
---
 arch/blackfin/mm/maccess.c | 4 ++--
 arch/s390/mm/maccess.c     | 4 ++--
 include/linux/uaccess.h    | 8 ++++----
 mm/maccess.c               | 8 ++++----
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/blackfin/mm/maccess.c b/arch/blackfin/mm/maccess.c
index b71cebc1f8a..e2532114c5f 100644
--- a/arch/blackfin/mm/maccess.c
+++ b/arch/blackfin/mm/maccess.c
@@ -16,7 +16,7 @@ static int validate_memory_access_address(unsigned long addr, int size)
 	return bfin_mem_access_type(addr, size);
 }
 
-long probe_kernel_read(void *dst, void *src, size_t size)
+long probe_kernel_read(void *dst, const void *src, size_t size)
 {
 	unsigned long lsrc = (unsigned long)src;
 	int mem_type;
@@ -55,7 +55,7 @@ long probe_kernel_read(void *dst, void *src, size_t size)
 	return -EFAULT;
 }
 
-long probe_kernel_write(void *dst, void *src, size_t size)
+long probe_kernel_write(void *dst, const void *src, size_t size)
 {
 	unsigned long ldst = (unsigned long)dst;
 	int mem_type;
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index 71a4b0d34be..51e5cd9b906 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -19,7 +19,7 @@
  * using the stura instruction.
  * Returns the number of bytes copied or -EFAULT.
  */
-static long probe_kernel_write_odd(void *dst, void *src, size_t size)
+static long probe_kernel_write_odd(void *dst, const void *src, size_t size)
 {
 	unsigned long count, aligned;
 	int offset, mask;
@@ -45,7 +45,7 @@ static long probe_kernel_write_odd(void *dst, void *src, size_t size)
 	return rc ? rc : count;
 }
 
-long probe_kernel_write(void *dst, void *src, size_t size)
+long probe_kernel_write(void *dst, const void *src, size_t size)
 {
 	long copied = 0;
 
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index d512d98dfb7..5ca0951e185 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -93,8 +93,8 @@ static inline unsigned long __copy_from_user_nocache(void *to,
  * Safely read from address @src to the buffer at @dst.  If a kernel fault
  * happens, handle that and return -EFAULT.
  */
-extern long probe_kernel_read(void *dst, void *src, size_t size);
-extern long __probe_kernel_read(void *dst, void *src, size_t size);
+extern long probe_kernel_read(void *dst, const void *src, size_t size);
+extern long __probe_kernel_read(void *dst, const void *src, size_t size);
 
 /*
  * probe_kernel_write(): safely attempt to write to a location
@@ -105,7 +105,7 @@ extern long __probe_kernel_read(void *dst, void *src, size_t size);
  * Safely write to address @dst from the buffer at @src.  If a kernel fault
  * happens, handle that and return -EFAULT.
  */
-extern long notrace probe_kernel_write(void *dst, void *src, size_t size);
-extern long notrace __probe_kernel_write(void *dst, void *src, size_t size);
+extern long notrace probe_kernel_write(void *dst, const void *src, size_t size);
+extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size);
 
 #endif		/* __LINUX_UACCESS_H__ */
diff --git a/mm/maccess.c b/mm/maccess.c
index e2b6f5634e0..4cee182ab5f 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -15,10 +15,10 @@
  * happens, handle that and return -EFAULT.
  */
 
-long __weak probe_kernel_read(void *dst, void *src, size_t size)
+long __weak probe_kernel_read(void *dst, const void *src, size_t size)
     __attribute__((alias("__probe_kernel_read")));
 
-long __probe_kernel_read(void *dst, void *src, size_t size)
+long __probe_kernel_read(void *dst, const void *src, size_t size)
 {
 	long ret;
 	mm_segment_t old_fs = get_fs();
@@ -43,10 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
  * Safely write to address @dst from the buffer at @src.  If a kernel fault
  * happens, handle that and return -EFAULT.
  */
-long __weak probe_kernel_write(void *dst, void *src, size_t size)
+long __weak probe_kernel_write(void *dst, const void *src, size_t size)
     __attribute__((alias("__probe_kernel_write")));
 
-long __probe_kernel_write(void *dst, void *src, size_t size)
+long __probe_kernel_write(void *dst, const void *src, size_t size)
 {
 	long ret;
 	mm_segment_t old_fs = get_fs();
-- 
cgit v1.2.3-70-g09d2


From 0d098a7d1e39553e8a3f638b923551edec4868a7 Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Thu, 12 May 2011 23:33:40 +0600
Subject: x86/ftrace: Fix compiler warning in ftrace.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

 Due to commit dc326fca2b64 (x86, cpu: Clean up and unify the NOP selection infrastructure), we get the following warning:

arch/x86/kernel/ftrace.c: In function ‘ftrace_make_nop’:
arch/x86/kernel/ftrace.c:308:6: warning: assignment discards qualifiers from pointer target type
arch/x86/kernel/ftrace.c: In function ‘ftrace_make_call’:
arch/x86/kernel/ftrace.c:318:6: warning: assignment discards qualifiers from pointer target type

ftrace_nop_replace() now returns const unsigned char *, so change its associated function/variable to its compatible type to keep compiler clam.

Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
Link: http://lkml.kernel.org/r/1305221620.7986.4.camel@localhost.localdomain

[ updated for change of const void *src in probe_kernel_write() ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/ftrace.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 0ba15a6cc57..c9a281f272f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -123,7 +123,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 static atomic_t nmi_running = ATOMIC_INIT(0);
 static int mod_code_status;		/* holds return value of text write */
 static void *mod_code_ip;		/* holds the IP to write to */
-static void *mod_code_newcode;		/* holds the text to write to the IP */
+static const void *mod_code_newcode;	/* holds the text to write to the IP */
 
 static unsigned nmi_wait_count;
 static atomic_t nmi_update_count = ATOMIC_INIT(0);
@@ -225,7 +225,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)
 }
 
 static int
-do_ftrace_mod_code(unsigned long ip, void *new_code)
+do_ftrace_mod_code(unsigned long ip, const void *new_code)
 {
 	/*
 	 * On x86_64, kernel text mappings are mapped read-only with
@@ -266,8 +266,8 @@ static const unsigned char *ftrace_nop_replace(void)
 }
 
 static int
-ftrace_modify_code(unsigned long ip, unsigned char *old_code,
-		   unsigned char *new_code)
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+		   unsigned const char *new_code)
 {
 	unsigned char replaced[MCOUNT_INSN_SIZE];
 
@@ -301,7 +301,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 int ftrace_make_nop(struct module *mod,
 		    struct dyn_ftrace *rec, unsigned long addr)
 {
-	unsigned char *new, *old;
+	unsigned const char *new, *old;
 	unsigned long ip = rec->ip;
 
 	old = ftrace_call_replace(ip, addr);
@@ -312,7 +312,7 @@ int ftrace_make_nop(struct module *mod,
 
 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 {
-	unsigned char *new, *old;
+	unsigned const char *new, *old;
 	unsigned long ip = rec->ip;
 
 	old = ftrace_nop_replace();
-- 
cgit v1.2.3-70-g09d2


From 50d6828e898590fc5d038810334695380baa1c78 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 May 2011 14:41:17 -0400
Subject: scripts/tags.sh: Fix ctags for DEFINE_EVENT()

The regex to handle DEFINE_EVENT() should not be the same as
the TRACE_EVENT() as the first parameter in DEFINE_EVENT is the
template name, not the event name. We need the second parameter
as that is what the trace_... will use.

Tested-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 scripts/tags.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/tags.sh b/scripts/tags.sh
index bd6185d529c..33b53cab5db 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -132,7 +132,7 @@ exuberant()
 	--regex-asm='/^ENTRY\(([^)]*)\).*/\1/'                  \
 	--regex-c='/^SYSCALL_DEFINE[[:digit:]]?\(([^,)]*).*/sys_\1/' \
 	--regex-c++='/^TRACE_EVENT\(([^,)]*).*/trace_\1/'		\
-	--regex-c++='/^DEFINE_EVENT\(([^,)]*).*/trace_\1/'
+	--regex-c++='/^DEFINE_EVENT\([^,)]*, *([^,)]*).*/trace_\1/'
 
 	all_kconfigs | xargs $1 -a                              \
 	--langdef=kconfig --language-force=kconfig              \
-- 
cgit v1.2.3-70-g09d2


From 4d7a2fa876d1a615649761dc465708d0a062249a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 May 2011 14:43:57 -0400
Subject: scripts/tags.sh: Add magic for trace-events for etags too

Seems that Peter Zijlstra treats us emacs users as second class
citizens and the commit:

 commit 15664125f7cadcb6d725cb2d9b90f9715397848d
 Author: Peter Zijlstra <peterz@infradead.org>
 scripts/tags.sh: Add magic for trace-events

only updated ctags (for vim) and did not do the work to let us
lowly emacs users benefit from such a change.

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 scripts/tags.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/tags.sh b/scripts/tags.sh
index 33b53cab5db..75c5d24f199 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -152,7 +152,9 @@ emacs()
 {
 	all_sources | xargs $1 -a                               \
 	--regex='/^ENTRY(\([^)]*\)).*/\1/'                      \
-	--regex='/^SYSCALL_DEFINE[0-9]?(\([^,)]*\).*/sys_\1/'
+	--regex='/^SYSCALL_DEFINE[0-9]?(\([^,)]*\).*/sys_\1/'   \
+	--regex='/^TRACE_EVENT(\([^,)]*\).*/trace_\1/'		\
+	--regex='/^DEFINE_EVENT([^,)]*, *\([^,)]*\).*/trace_\1/'
 
 	all_kconfigs | xargs $1 -a                              \
 	--regex='/^[ \t]*\(\(menu\)*config\)[ \t]+\([a-zA-Z0-9_]+\)/\3/'
-- 
cgit v1.2.3-70-g09d2


From 9905ce8ad7b79dddd23c7b4753d0b2cdb65bde3c Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin@rab.in>
Date: Wed, 11 May 2011 22:53:51 +0530
Subject: ftrace/recordmcount: Avoid STT_FUNC symbols as base on ARM

While find_secsym_ndx often finds the unamed local STT_SECTION, if a
section has only one function in it, the ARM toolchain generates the
STT_FUNC symbol before the STT_SECTION, and recordmcount finds this
instead.

This is problematic on ARM because in ARM ELFs, "if a [STT_FUNC] symbol
addresses a Thumb instruction, its value is the address of the
instruction with bit zero set (in a relocatable object, the section
offset with bit zero set)".  This leads to incorrect mcount addresses
being recorded.

Fix this by not using STT_FUNC symbols as the base on ARM.

Signed-off-by: Rabin Vincent <rabin@rab.in>
Link: http://lkml.kernel.org/r/1305134631-31617-1-git-send-email-rabin@rab.in
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 scripts/recordmcount.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h
index 4be60364a40..f40a6af6bf4 100644
--- a/scripts/recordmcount.h
+++ b/scripts/recordmcount.h
@@ -43,6 +43,7 @@
 #undef ELF_R_INFO
 #undef Elf_r_info
 #undef ELF_ST_BIND
+#undef ELF_ST_TYPE
 #undef fn_ELF_R_SYM
 #undef fn_ELF_R_INFO
 #undef uint_t
@@ -76,6 +77,7 @@
 # define ELF_R_INFO		ELF64_R_INFO
 # define Elf_r_info		Elf64_r_info
 # define ELF_ST_BIND		ELF64_ST_BIND
+# define ELF_ST_TYPE		ELF64_ST_TYPE
 # define fn_ELF_R_SYM		fn_ELF64_R_SYM
 # define fn_ELF_R_INFO		fn_ELF64_R_INFO
 # define uint_t			uint64_t
@@ -108,6 +110,7 @@
 # define ELF_R_INFO		ELF32_R_INFO
 # define Elf_r_info		Elf32_r_info
 # define ELF_ST_BIND		ELF32_ST_BIND
+# define ELF_ST_TYPE		ELF32_ST_TYPE
 # define fn_ELF_R_SYM		fn_ELF32_R_SYM
 # define fn_ELF_R_INFO		fn_ELF32_R_INFO
 # define uint_t			uint32_t
@@ -427,6 +430,11 @@ static unsigned find_secsym_ndx(unsigned const txtndx,
 		if (txtndx == w2(symp->st_shndx)
 			/* avoid STB_WEAK */
 		    && (STB_LOCAL == st_bind || STB_GLOBAL == st_bind)) {
+			/* function symbols on ARM have quirks, avoid them */
+			if (w2(ehdr->e_machine) == EM_ARM
+			    && ELF_ST_TYPE(symp->st_info) == STT_FUNC)
+				continue;
+
 			*recvalp = _w(symp->st_value);
 			return symp - sym0;
 		}
-- 
cgit v1.2.3-70-g09d2


From 7cbc5b8d4a775a43875a09e29c49a2a8195b5b2d Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Tue, 10 May 2011 12:43:46 +0200
Subject: jump_label: Check entries limit in __jump_label_update

When iterating the jump_label entries array (core or modules),
the __jump_label_update function peeks over the last entry.

The reason is that the end of the for loop depends on the key
value of the processed entry. Thus when going through the
last array entry, we will touch the memory behind the array
limit.

This bug probably will never be triggered, since most likely the
memory behind the jump_label entries will be accesable and the
entry->key will be different than the expected value.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Link: http://lkml.kernel.org/r/20110510104346.GC1899@jolsa.brq.redhat.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/jump_label.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 74d1c099fbd..fa27e750dbc 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -105,9 +105,12 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
 }
 
 static void __jump_label_update(struct jump_label_key *key,
-		struct jump_entry *entry, int enable)
+				struct jump_entry *entry,
+				struct jump_entry *stop, int enable)
 {
-	for (; entry->key == (jump_label_t)(unsigned long)key; entry++) {
+	for (; (entry < stop) &&
+	      (entry->key == (jump_label_t)(unsigned long)key);
+	      entry++) {
 		/*
 		 * entry->code set to 0 invalidates module init text sections
 		 * kernel_text_address() verifies we are not in core kernel
@@ -181,7 +184,11 @@ static void __jump_label_mod_update(struct jump_label_key *key, int enable)
 	struct jump_label_mod *mod = key->next;
 
 	while (mod) {
-		__jump_label_update(key, mod->entries, enable);
+		struct module *m = mod->mod;
+
+		__jump_label_update(key, mod->entries,
+				    m->jump_entries + m->num_jump_entries,
+				    enable);
 		mod = mod->next;
 	}
 }
@@ -245,7 +252,8 @@ static int jump_label_add_module(struct module *mod)
 		key->next = jlm;
 
 		if (jump_label_enabled(key))
-			__jump_label_update(key, iter, JUMP_LABEL_ENABLE);
+			__jump_label_update(key, iter, iter_stop,
+					    JUMP_LABEL_ENABLE);
 	}
 
 	return 0;
@@ -371,7 +379,7 @@ static void jump_label_update(struct jump_label_key *key, int enable)
 
 	/* if there are no users, entry can be NULL */
 	if (entry)
-		__jump_label_update(key, entry, enable);
+		__jump_label_update(key, entry, __stop___jump_table, enable);
 
 #ifdef CONFIG_MODULES
 	__jump_label_mod_update(key, enable);
-- 
cgit v1.2.3-70-g09d2


From 916f676f8dc016103f983c7ec54c18ecdbb6e349 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg@redhat.com>
Date: Wed, 25 May 2011 09:53:13 -0400
Subject: x86, efi: Retain boot service code until after switching to virtual
 mode

UEFI stands for "Unified Extensible Firmware Interface", where "Firmware"
is an ancient African word meaning "Why do something right when you can
do it so wrong that children will weep and brave adults will cower before
you", and "UEI" is Celtic for "We missed DOS so we burned it into your
ROMs". The UEFI specification provides for runtime services (ie, another
way for the operating system to be forced to depend on the firmware) and
we rely on these for certain trivial tasks such as setting up the
bootloader. But some hardware fails to work if we attempt to use these
runtime services from physical mode, and so we have to switch into virtual
mode. So far so dreadful.

The specification makes it clear that the operating system is free to do
whatever it wants with boot services code after ExitBootServices() has been
called. SetVirtualAddressMap() can't be called until ExitBootServices() has
been. So, obviously, a whole bunch of EFI implementations call into boot
services code when we do that. Since we've been charmingly naive and
trusted that the specification may be somehow relevant to the real world,
we've already stuffed a picture of a penguin or something in that address
space. And just to make things more entertaining, we've also marked it
non-executable.

This patch allocates the boot services regions during EFI init and makes
sure that they're executable. Then, after SetVirtualAddressMap(), it
discards them and everyone lives happily ever after. Except for the ones
who have to work on EFI, who live sad lives haunted by the knowledge that
someone's eventually going to write yet another firmware specification.

[ hpa: adding this to urgent with a stable tag since it fixes currently-broken
  hardware.  However, I do not know what the dependencies are and so I do
  not know which -stable versions this may be a candidate for. ]

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Link: http://lkml.kernel.org/r/1306331593-28715-1-git-send-email-mjg@redhat.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: <stable@kernel.org>
---
 arch/x86/kernel/setup.c        |  7 +++++++
 arch/x86/platform/efi/efi.c    | 45 +++++++++++++++++++++++++++++++++++++++++-
 arch/x86/platform/efi/efi_64.c |  5 +++--
 include/linux/efi.h            |  1 +
 4 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 605e5ae19c7..b9ca498f560 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -910,6 +910,13 @@ void __init setup_arch(char **cmdline_p)
 	memblock.current_limit = get_max_mapped();
 	memblock_x86_fill();
 
+	/*
+	 * The EFI specification says that boot service code won't be called
+	 * after ExitBootServices(). This is, in fact, a lie.
+	 */
+	if (efi_enabled)
+		efi_reserve_boot_services();
+
 	/* preallocate 4k for mptable mpc */
 	early_reserve_e820_mpc_new();
 
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index b30aa26a8df..0d3a4fa3456 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -304,6 +304,40 @@ static void __init print_efi_memmap(void)
 }
 #endif  /*  EFI_DEBUG  */
 
+void __init efi_reserve_boot_services(void)
+{
+	void *p;
+
+	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+		efi_memory_desc_t *md = p;
+		unsigned long long start = md->phys_addr;
+		unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+		if (md->type != EFI_BOOT_SERVICES_CODE &&
+		    md->type != EFI_BOOT_SERVICES_DATA)
+			continue;
+
+		memblock_x86_reserve_range(start, start + size, "EFI Boot");
+	}
+}
+
+static void __init efi_free_boot_services(void)
+{
+	void *p;
+
+	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+		efi_memory_desc_t *md = p;
+		unsigned long long start = md->phys_addr;
+		unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+		if (md->type != EFI_BOOT_SERVICES_CODE &&
+		    md->type != EFI_BOOT_SERVICES_DATA)
+			continue;
+
+		free_bootmem_late(start, size);
+	}
+}
+
 void __init efi_init(void)
 {
 	efi_config_table_t *config_tables;
@@ -536,7 +570,9 @@ void __init efi_enter_virtual_mode(void)
 
 	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
 		md = p;
-		if (!(md->attribute & EFI_MEMORY_RUNTIME))
+		if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
+		    md->type != EFI_BOOT_SERVICES_CODE &&
+		    md->type != EFI_BOOT_SERVICES_DATA)
 			continue;
 
 		size = md->num_pages << EFI_PAGE_SHIFT;
@@ -592,6 +628,13 @@ void __init efi_enter_virtual_mode(void)
 		panic("EFI call to SetVirtualAddressMap() failed!");
 	}
 
+	/*
+	 * Thankfully, it does seem that no runtime services other than
+	 * SetVirtualAddressMap() will touch boot services code, so we can
+	 * get rid of it all at this point
+	 */
+	efi_free_boot_services();
+
 	/*
 	 * Now that EFI is in virtual mode, update the function
 	 * pointers in the runtime service table to the new virtual addresses.
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 2649426a790..ac3aa54e265 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -49,10 +49,11 @@ static void __init early_code_mapping_set_exec(int executable)
 	if (!(__supported_pte_mask & _PAGE_NX))
 		return;
 
-	/* Make EFI runtime service code area executable */
+	/* Make EFI service code area executable */
 	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
 		md = p;
-		if (md->type == EFI_RUNTIME_SERVICES_CODE)
+		if (md->type == EFI_RUNTIME_SERVICES_CODE ||
+		    md->type == EFI_BOOT_SERVICES_CODE)
 			efi_set_executable(md, executable);
 	}
 }
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 33fa1203024..e376270cd26 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -299,6 +299,7 @@ extern void efi_initialize_iomem_resources(struct resource *code_resource,
 		struct resource *data_resource, struct resource *bss_resource);
 extern unsigned long efi_get_time(void);
 extern int efi_set_rtc_mmss(unsigned long nowtime);
+extern void efi_reserve_boot_services(void);
 extern struct efi_memory_map memmap;
 
 /**
-- 
cgit v1.2.3-70-g09d2


From a1cd6173596c6f7d1f0b41ac7d33ecf03c581edc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 23 May 2011 15:24:25 -0400
Subject: ftrace: Have ftrace_startup() return failure code

The register_ftrace_function() returns an error code on failure
except if the call to ftrace_startup() fails. Add a error return to
ftrace_startup() if it fails to start, allowing register_ftrace_funtion()
to return a proper error value.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d017c2c82c4..bebbc959ee8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1638,12 +1638,12 @@ static void ftrace_startup_enable(int command)
 	ftrace_run_update_code(command);
 }
 
-static void ftrace_startup(struct ftrace_ops *ops, int command)
+static int ftrace_startup(struct ftrace_ops *ops, int command)
 {
 	bool hash_enable = true;
 
 	if (unlikely(ftrace_disabled))
-		return;
+		return -ENODEV;
 
 	ftrace_start_up++;
 	command |= FTRACE_ENABLE_CALLS;
@@ -1662,6 +1662,8 @@ static void ftrace_startup(struct ftrace_ops *ops, int command)
 		ftrace_hash_rec_enable(ops, 1);
 
 	ftrace_startup_enable(command);
+
+	return 0;
 }
 
 static void ftrace_shutdown(struct ftrace_ops *ops, int command)
@@ -2501,7 +2503,7 @@ static void __enable_ftrace_function_probe(void)
 
 	ret = __register_ftrace_function(&trace_probe_ops);
 	if (!ret)
-		ftrace_startup(&trace_probe_ops, 0);
+		ret = ftrace_startup(&trace_probe_ops, 0);
 
 	ftrace_probe_registered = 1;
 }
@@ -3466,7 +3468,7 @@ device_initcall(ftrace_nodyn_init);
 static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
 static inline void ftrace_startup_enable(int command) { }
 /* Keep as macros so we do not need to define the commands */
-# define ftrace_startup(ops, command)	do { } while (0)
+# define ftrace_startup(ops, command)	({0;})
 # define ftrace_shutdown(ops, command)	do { } while (0)
 # define ftrace_startup_sysctl()	do { } while (0)
 # define ftrace_shutdown_sysctl()	do { } while (0)
@@ -3799,7 +3801,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
 
 	ret = __register_ftrace_function(ops);
 	if (!ret)
-		ftrace_startup(ops, 0);
+		ret = ftrace_startup(ops, 0);
 
 
  out_unlock:
@@ -4045,7 +4047,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	ftrace_graph_return = retfunc;
 	ftrace_graph_entry = entryfunc;
 
-	ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
+	ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
 
 out:
 	mutex_unlock(&ftrace_lock);
-- 
cgit v1.2.3-70-g09d2


From 17bb615ad4f8d2d2c0f02794d27d7f83e0009ef4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 23 May 2011 15:27:46 -0400
Subject: tracing: Have event with function tracer check error return

The self tests for event tracer does not check if the function
tracing was successfully activated. It needs to before it continues
the tests, otherwise the wrong errors may be reported.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2fe11034135..686ec399f2a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1657,7 +1657,12 @@ static struct ftrace_ops trace_ops __initdata  =
 
 static __init void event_trace_self_test_with_function(void)
 {
-	register_ftrace_function(&trace_ops);
+	int ret;
+	ret = register_ftrace_function(&trace_ops);
+	if (WARN_ON(ret < 0)) {
+		pr_info("Failed to enable function tracer for event tests\n");
+		return;
+	}
 	pr_info("Running tests again, along with the function tracer\n");
 	event_trace_self_tests();
 	unregister_ftrace_function(&trace_ops);
-- 
cgit v1.2.3-70-g09d2


From 3b6cfdb1714a33ae4d2ca9fbc818a42cf7adee69 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 23 May 2011 15:33:49 -0400
Subject: ftrace: Set ops->flag to enabled even on static function tracing

When dynamic ftrace is not configured, the ops->flags still needs
to have its FTRACE_OPS_FL_ENABLED bit set in ftrace_startup().

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bebbc959ee8..25949b33057 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3468,7 +3468,11 @@ device_initcall(ftrace_nodyn_init);
 static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
 static inline void ftrace_startup_enable(int command) { }
 /* Keep as macros so we do not need to define the commands */
-# define ftrace_startup(ops, command)	({0;})
+# define ftrace_startup(ops, command)			\
+	({						\
+		(ops)->flags |= FTRACE_OPS_FL_ENABLED;	\
+		0;					\
+	})
 # define ftrace_shutdown(ops, command)	do { } while (0)
 # define ftrace_startup_sysctl()	do { } while (0)
 # define ftrace_shutdown_sysctl()	do { } while (0)
-- 
cgit v1.2.3-70-g09d2


From 2fc1b6f0d0a719e1e2a30bf076a3a799feaf6af2 Mon Sep 17 00:00:00 2001
From: liubo <liubo2009@cn.fujitsu.com>
Date: Tue, 19 Apr 2011 09:35:28 +0800
Subject: tracing: Add __print_symbolic_u64 to avoid warnings on 32bit machine

Filesystem, like Btrfs, has some "ULL" macros, and when these macros are passed
to tracepoints'__print_symbolic(), there will be 64->32 truncate WARNINGS during
compiling on 32bit box.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Link: http://lkml.kernel.org/r/4DACE6E0.7000507@cn.fujitsu.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h | 12 ++++++++++++
 include/trace/ftrace.h       | 13 +++++++++++++
 kernel/trace/trace_output.c  | 27 +++++++++++++++++++++++++++
 3 files changed, 52 insertions(+)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index b5a550a39a7..59d3ef100eb 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -16,6 +16,11 @@ struct trace_print_flags {
 	const char		*name;
 };
 
+struct trace_print_flags_u64 {
+	unsigned long long	mask;
+	const char		*name;
+};
+
 const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 				   unsigned long flags,
 				   const struct trace_print_flags *flag_array);
@@ -23,6 +28,13 @@ const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 				     const struct trace_print_flags *symbol_array);
 
+#if BITS_PER_LONG == 32
+const char *ftrace_print_symbols_seq_u64(struct trace_seq *p,
+					 unsigned long long val,
+					 const struct trace_print_flags_u64
+								 *symbol_array);
+#endif
+
 const char *ftrace_print_hex_seq(struct trace_seq *p,
 				 const unsigned char *buf, int len);
 
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 3e68366d485..533c49f4804 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -205,6 +205,19 @@
 		ftrace_print_symbols_seq(p, value, symbols);		\
 	})
 
+#undef __print_symbolic_u64
+#if BITS_PER_LONG == 32
+#define __print_symbolic_u64(value, symbol_array...)			\
+	({								\
+		static const struct trace_print_flags_u64 symbols[] =	\
+			{ symbol_array, { -1, NULL } };			\
+		ftrace_print_symbols_seq_u64(p, value, symbols);	\
+	})
+#else
+#define __print_symbolic_u64(value, symbol_array...)			\
+			__print_symbolic(value, symbol_array)
+#endif
+
 #undef __print_hex
 #define __print_hex(buf, buf_len) ftrace_print_hex_seq(p, buf, buf_len)
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index cf535ccedc8..e37de492a9e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 }
 EXPORT_SYMBOL(ftrace_print_symbols_seq);
 
+#if BITS_PER_LONG == 32
+const char *
+ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
+			 const struct trace_print_flags_u64 *symbol_array)
+{
+	int i;
+	const char *ret = p->buffer + p->len;
+
+	for (i = 0;  symbol_array[i].name; i++) {
+
+		if (val != symbol_array[i].mask)
+			continue;
+
+		trace_seq_puts(p, symbol_array[i].name);
+		break;
+	}
+
+	if (!p->len)
+		trace_seq_printf(p, "0x%llx", val);
+
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
+#endif
+
 const char *
 ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
 {
-- 
cgit v1.2.3-70-g09d2


From 7f34b746f79c1e1f8fd6d09799d133263ae7a504 Mon Sep 17 00:00:00 2001
From: liubo <liubo2009@cn.fujitsu.com>
Date: Tue, 19 Apr 2011 09:35:31 +0800
Subject: tracing: Update btrfs's tracepoints to use u64 interface

To avoid 64->32 truncating WARNING, update btrfs's tracepoints.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Link: http://lkml.kernel.org/r/4DACE6E3.8080200@cn.fujitsu.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/btrfs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index f445cff66ab..4114129f079 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -28,7 +28,7 @@ struct extent_buffer;
 		{ BTRFS_SHARED_DATA_REF_KEY, 	"SHARED_DATA_REF" })
 
 #define __show_root_type(obj)						\
-	__print_symbolic(obj,						\
+	__print_symbolic_u64(obj,					\
 		{ BTRFS_ROOT_TREE_OBJECTID, 	"ROOT_TREE"	},	\
 		{ BTRFS_EXTENT_TREE_OBJECTID, 	"EXTENT_TREE"	},	\
 		{ BTRFS_CHUNK_TREE_OBJECTID, 	"CHUNK_TREE"	},	\
@@ -125,7 +125,7 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict,
 );
 
 #define __show_map_type(type)						\
-	__print_symbolic(type,						\
+	__print_symbolic_u64(type,					\
 		{ EXTENT_MAP_LAST_BYTE, "LAST_BYTE" 	},		\
 		{ EXTENT_MAP_HOLE, 	"HOLE" 		},		\
 		{ EXTENT_MAP_INLINE, 	"INLINE" 	},		\
-- 
cgit v1.2.3-70-g09d2


From b1cff0ad1062621ae63cb6c5dc4165191fe2e9f1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 25 May 2011 14:27:43 -0400
Subject: ftrace: Add internal recursive checks

Witold reported a reboot caused by the selftests of the dynamic function
tracer. He sent me a config and I used ktest to do a config_bisect on it
(as my config did not cause the crash). It pointed out that the problem
config was CONFIG_PROVE_RCU.

What happened was that if multiple callbacks are attached to the
function tracer, we iterate a list of callbacks. Because the list is
managed by synchronize_sched() and preempt_disable, the access to the
pointers uses rcu_dereference_raw().

When PROVE_RCU is enabled, the rcu_dereference_raw() calls some
debugging functions, which happen to be traced. The tracing of the debug
function would then call rcu_dereference_raw() which would then call the
debug function and then... well you get the idea.

I first wrote two different patches to solve this bug.

1) add a __rcu_dereference_raw() that would not do any checks.
2) add notrace to the offending debug functions.

Both of these patches worked.

Talking with Paul McKenney on IRC, he suggested to add recursion
detection instead. This seemed to be a better solution, so I decided to
implement it. As the task_struct already has a trace_recursion to detect
recursion in the ring buffer, and that has a very small number it
allows, I decided to use that same variable to add flags that can detect
the recursion inside the infrastructure of the function tracer.

I plan to change it so that the task struct bit can be checked in
mcount, but as that requires changes to all archs, I will hold that off
to the next merge window.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/1306348063.1465.116.camel@gandalf.stny.rr.com
Reported-by: Witold Baryluk <baryluk@smp.if.uj.edu.pl>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/sched.h      |  2 +-
 kernel/trace/ftrace.c      | 13 ++++++++++++-
 kernel/trace/ring_buffer.c | 10 +++++-----
 kernel/trace/trace.h       | 15 +++++++++++++++
 4 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d8b2d0bec0d..7b78d9cad47 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1513,7 +1513,7 @@ struct task_struct {
 #ifdef CONFIG_TRACING
 	/* state flags for use by tracers */
 	unsigned long trace;
-	/* bitmask of trace recursion */
+	/* bitmask and counter of trace recursion */
 	unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 25949b33057..1ee417fcbfa 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -109,12 +109,18 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
 static void ftrace_global_list_func(unsigned long ip,
 				    unsigned long parent_ip)
 {
-	struct ftrace_ops *op = rcu_dereference_raw(ftrace_global_list); /*see above*/
+	struct ftrace_ops *op;
+
+	if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
+		return;
 
+	trace_recursion_set(TRACE_GLOBAL_BIT);
+	op = rcu_dereference_raw(ftrace_global_list); /*see above*/
 	while (op != &ftrace_list_end) {
 		op->func(ip, parent_ip);
 		op = rcu_dereference_raw(op->next); /*see above*/
 	};
+	trace_recursion_clear(TRACE_GLOBAL_BIT);
 }
 
 static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
@@ -3490,6 +3496,10 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
 {
 	struct ftrace_ops *op;
 
+	if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
+		return;
+
+	trace_recursion_set(TRACE_INTERNAL_BIT);
 	/*
 	 * Some of the ops may be dynamically allocated,
 	 * they must be freed after a synchronize_sched().
@@ -3502,6 +3512,7 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
 		op = rcu_dereference_raw(op->next);
 	};
 	preempt_enable_notrace();
+	trace_recursion_clear(TRACE_INTERNAL_BIT);
 }
 
 static void clear_ftrace_swapper(void)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0ef7b4b2a1f..b0c7aa40794 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2216,7 +2216,7 @@ static noinline void trace_recursive_fail(void)
 
 	printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
 		    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
-		    current->trace_recursion,
+		    trace_recursion_buffer(),
 		    hardirq_count() >> HARDIRQ_SHIFT,
 		    softirq_count() >> SOFTIRQ_SHIFT,
 		    in_nmi());
@@ -2226,9 +2226,9 @@ static noinline void trace_recursive_fail(void)
 
 static inline int trace_recursive_lock(void)
 {
-	current->trace_recursion++;
+	trace_recursion_inc();
 
-	if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+	if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
 		return 0;
 
 	trace_recursive_fail();
@@ -2238,9 +2238,9 @@ static inline int trace_recursive_lock(void)
 
 static inline void trace_recursive_unlock(void)
 {
-	WARN_ON_ONCE(!current->trace_recursion);
+	WARN_ON_ONCE(!trace_recursion_buffer());
 
-	current->trace_recursion--;
+	trace_recursion_dec();
 }
 
 #else
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6b69c4bd306..229f8591f61 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -784,4 +784,19 @@ extern const char *__stop___trace_bprintk_fmt[];
 	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
 #include "trace_entries.h"
 
+/* Only current can touch trace_recursion */
+#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
+#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
+
+/* Ring buffer has the 10 LSB bits to count */
+#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
+
+/* for function tracing recursion */
+#define TRACE_INTERNAL_BIT		(1<<11)
+#define TRACE_GLOBAL_BIT		(1<<12)
+
+#define trace_recursion_set(bit)	do { (current)->trace_recursion |= (bit); } while (0)
+#define trace_recursion_clear(bit)	do { (current)->trace_recursion &= ~(bit); } while (0)
+#define trace_recursion_test(bit)	((current)->trace_recursion & (bit))
+
 #endif /* _LINUX_KERNEL_TRACE_H */
-- 
cgit v1.2.3-70-g09d2


From 4bff4a7ee4b9e487a1bc1d31082e77b6a381418a Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Date: Wed, 11 May 2011 16:51:20 +0000
Subject: ARM: arch-shmobile: support SDHI card detection on mackerel, using a
 GPIO

On sh7372 the card-detection pin of SDHI0 can also produce interrupts,
when configured as GPIO. Use this feature to power down SDHI0, when no
card is plugged in.

Signed-off-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/arm/mach-shmobile/board-mackerel.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/arch/arm/mach-shmobile/board-mackerel.c b/arch/arm/mach-shmobile/board-mackerel.c
index d4fe74067d6..776f20560e7 100644
--- a/arch/arm/mach-shmobile/board-mackerel.c
+++ b/arch/arm/mach-shmobile/board-mackerel.c
@@ -39,6 +39,7 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
 #include <linux/mtd/physmap.h>
+#include <linux/pm_runtime.h>
 #include <linux/smsc911x.h>
 #include <linux/sh_intc.h>
 #include <linux/tca6416_keypad.h>
@@ -913,6 +914,17 @@ static int slot_cn7_get_cd(struct platform_device *pdev)
 }
 
 /* SDHI0 */
+static irqreturn_t mackerel_sdhi0_gpio_cd(int irq, void *arg)
+{
+	struct device *dev = arg;
+	struct sh_mobile_sdhi_info *info = dev->platform_data;
+	struct tmio_mmc_data *pdata = info->pdata;
+
+	tmio_mmc_cd_wakeup(pdata);
+
+	return IRQ_HANDLED;
+}
+
 static struct sh_mobile_sdhi_info sdhi0_info = {
 	.dma_slave_tx	= SHDMA_SLAVE_SDHI0_TX,
 	.dma_slave_rx	= SHDMA_SLAVE_SDHI0_RX,
@@ -1296,6 +1308,7 @@ static void __init mackerel_init(void)
 {
 	u32 srcr4;
 	struct clk *clk;
+	int ret;
 
 	sh7372_pinmux_init();
 
@@ -1401,6 +1414,13 @@ static void __init mackerel_init(void)
 	gpio_request(GPIO_FN_SDHID0_1, NULL);
 	gpio_request(GPIO_FN_SDHID0_0, NULL);
 
+	ret = request_irq(evt2irq(0x3340), mackerel_sdhi0_gpio_cd,
+			  IRQF_TRIGGER_FALLING, "sdhi0 cd", &sdhi0_device.dev);
+	if (!ret)
+		sdhi0_info.tmio_flags |= TMIO_MMC_HAS_COLD_CD;
+	else
+		pr_err("Cannot get IRQ #%d: %d\n", evt2irq(0x3340), ret);
+
 #if !defined(CONFIG_MMC_SH_MMCIF) && !defined(CONFIG_MMC_SH_MMCIF_MODULE)
 	/* enable SDHI1 */
 	gpio_request(GPIO_FN_SDHICMD1, NULL);
-- 
cgit v1.2.3-70-g09d2


From ea7659fb2b876337aee719d9d5ddb05531dfb334 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Thu, 26 May 2011 10:21:05 +0200
Subject: perf: Remove duplicate headers

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Cc: Tom Zanussi <tom.zanussi@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: trivial@kernel.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1105261011290.17400@swampdragon.chaosbits.net
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-annotate.c | 2 --
 tools/perf/builtin-script.c   | 1 -
 2 files changed, 3 deletions(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index e18eb7ed30a..7b139e1e7e8 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -7,8 +7,6 @@
  */
 #include "builtin.h"
 
-#include "util/util.h"
-
 #include "util/util.h"
 #include "util/color.h"
 #include <linux/list.h>
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 974f6d3f4e5..22747de7234 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -10,7 +10,6 @@
 #include "util/symbol.h"
 #include "util/thread.h"
 #include "util/trace-event.h"
-#include "util/parse-options.h"
 #include "util/util.h"
 #include "util/evlist.h"
 #include "util/evsel.h"
-- 
cgit v1.2.3-70-g09d2


From b80ef10e84d85a06bcd0b3a24a752ec32d0e0e40 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Thu, 26 May 2011 17:12:12 +0900
Subject: x86: Move do_page_fault()'s error path under unlikely()

Ingo suggested SIGKILL check should be moved into slowpath
function. This will reduce the page fault fastpath impact
of this recent commit:

  37b23e0525d3: x86,mm: make pagefault killable

Suggested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: kamezawa.hiroyu@jp.fujitsu.com
Cc: minchan.kim@gmail.com
Cc: willy@linux.intel.com
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/4DDE0B5C.9050907@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/fault.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f7a2a054a3c..2dbf6bf4c7e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -823,16 +823,30 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	force_sig_info_fault(SIGBUS, code, address, tsk, fault);
 }
 
-static noinline void
+static noinline int
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	       unsigned long address, unsigned int fault)
 {
+	/*
+	 * Pagefault was interrupted by SIGKILL. We have no reason to
+	 * continue pagefault.
+	 */
+	if (fatal_signal_pending(current)) {
+		if (!(fault & VM_FAULT_RETRY))
+			up_read(&current->mm->mmap_sem);
+		if (!(error_code & PF_USER))
+			no_context(regs, error_code, address);
+		return 1;
+	}
+	if (!(fault & VM_FAULT_ERROR))
+		return 0;
+
 	if (fault & VM_FAULT_OOM) {
 		/* Kernel mode? Handle exceptions or die: */
 		if (!(error_code & PF_USER)) {
 			up_read(&current->mm->mmap_sem);
 			no_context(regs, error_code, address);
-			return;
+			return 1;
 		}
 
 		out_of_memory(regs, error_code, address);
@@ -843,6 +857,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 		else
 			BUG();
 	}
+	return 1;
 }
 
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -1133,19 +1148,9 @@ good_area:
 	 */
 	fault = handle_mm_fault(mm, vma, address, flags);
 
-	if (unlikely(fault & VM_FAULT_ERROR)) {
-		mm_fault_error(regs, error_code, address, fault);
-		return;
-	}
-
-	/*
-	 * Pagefault was interrupted by SIGKILL. We have no reason to
-	 * continue pagefault.
-	 */
-	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
-		if (!(error_code & PF_USER))
-			no_context(regs, error_code, address);
-		return;
+	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
+		if (mm_fault_error(regs, error_code, address, fault))
+			return;
 	}
 
 	/*
-- 
cgit v1.2.3-70-g09d2


From ec80fde746e3ccf93895d25ae1a7071c9af52585 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 26 May 2011 09:53:51 -0300
Subject: perf symbols: Handle /proc/sys/kernel/kptr_restrict

Perf uses /proc/modules to figure out where kernel modules are loaded.

With the advent of kptr_restrict, non root users get zeroes for all module
start addresses.

So check if kptr_restrict is non zero and don't generate the syntethic
PERF_RECORD_MMAP events for them.

Warn the user about it in perf record and in perf report.

In perf report the reference relocation symbol being zero means that
kptr_restrict was set, thus /proc/kallsyms has only zeroed addresses, so don't
use it to fixup symbol addresses when using a valid kallsyms (in the buildid
cache) or vmlinux (in the vmlinux path) build-id located automatically or
specified by the user.

Provide an explanation about it in 'perf report' if kernel samples were taken,
checking if a suitable vmlinux or kallsyms was found/specified.

Restricted /proc/kallsyms don't go to the buildid cache anymore.

Example:

 [acme@emilia ~]$ perf record -F 100000 sleep 1

 WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted, check
 /proc/sys/kernel/kptr_restrict.

 Samples in kernel functions may not be resolved if a suitable vmlinux file is
 not found in the buildid cache or in the vmlinux path.

 Samples in kernel modules won't be resolved at all.

 If some relocation was applied (e.g. kexec) symbols may be misresolved even
 with a suitable vmlinux or kallsyms file.

 [ perf record: Woken up 1 times to write data ]
 [ perf record: Captured and wrote 0.005 MB perf.data (~231 samples) ]
 [acme@emilia ~]$

 [acme@emilia ~]$ perf report --stdio
 Kernel address maps (/proc/{kallsyms,modules}) were restricted,
 check /proc/sys/kernel/kptr_restrict before running 'perf record'.

 If some relocation was applied (e.g. kexec) symbols may be misresolved.

 Samples in kernel modules can't be resolved as well.

 # Events: 13  cycles
 #
 # Overhead  Command      Shared Object                 Symbol
 # ........  .......  .................  .....................
 #
    20.24%    sleep  [kernel.kallsyms]  [k] page_fault
    20.04%    sleep  [kernel.kallsyms]  [k] filemap_fault
    19.78%    sleep  [kernel.kallsyms]  [k] __lru_cache_add
    19.69%    sleep  ld-2.12.so         [.] memcpy
    14.71%    sleep  [kernel.kallsyms]  [k] dput
     4.70%    sleep  [kernel.kallsyms]  [k] flush_signal_handlers
     0.73%    sleep  [kernel.kallsyms]  [k] perf_event_comm
     0.11%    sleep  [kernel.kallsyms]  [k] native_write_msr_safe

 #
 # (For a higher level overview, try: perf report --sort comm,dso)
 #
 [acme@emilia ~]$

This is because it found a suitable vmlinux (build-id checked) in
/lib/modules/2.6.39-rc7+/build/vmlinux (use -v in perf report to see the long
file name).

If we remove that file from the vmlinux path:

 [root@emilia ~]# mv /lib/modules/2.6.39-rc7+/build/vmlinux \
		     /lib/modules/2.6.39-rc7+/build/vmlinux.OFF
 [acme@emilia ~]$ perf report --stdio
 [kernel.kallsyms] with build id 57298cdbe0131f6871667ec0eaab4804dcf6f562
 not found, continuing without symbols

 Kernel address maps (/proc/{kallsyms,modules}) were restricted, check
 /proc/sys/kernel/kptr_restrict before running 'perf record'.

 As no suitable kallsyms nor vmlinux was found, kernel samples can't be
 resolved.

 Samples in kernel modules can't be resolved as well.

 # Events: 13  cycles
 #
 # Overhead  Command      Shared Object  Symbol
 # ........  .......  .................  ......
 #
    80.31%    sleep  [kernel.kallsyms]  [k] 0xffffffff8103425a
    19.69%    sleep  ld-2.12.so         [.] memcpy

 #
 # (For a higher level overview, try: perf report --sort comm,dso)
 #
 [acme@emilia ~]$

Reported-by: Stephane Eranian <eranian@google.com>
Suggested-by: David Miller <davem@davemloft.net>
Cc: Dave Jones <davej@redhat.com>
Cc: David Miller <davem@davemloft.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Kees Cook <kees.cook@canonical.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Link: http://lkml.kernel.org/n/tip-mt512joaxxbhhp1odop04yit@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c | 13 ++++++++++++
 tools/perf/builtin-report.c | 26 ++++++++++++++++++++++++
 tools/perf/util/event.c     | 15 +++++++++++---
 tools/perf/util/header.c    |  8 ++++++--
 tools/perf/util/symbol.c    | 48 +++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/symbol.h    |  3 ++-
 6 files changed, 107 insertions(+), 6 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 0974f957b8f..2ca107f3efd 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -823,6 +823,19 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 
 	symbol__init();
 
+	if (symbol_conf.kptr_restrict)
+		pr_warning("WARNING: Kernel address maps "
+			   "(/proc/{kallsyms,modules}) are restricted, "
+			   "check /proc/sys/kernel/kptr_restrict.\n\n"
+			   "Samples in kernel functions may not be resolved "
+			   "if a suitable vmlinux file is not found in the "
+			   "buildid cache or in the vmlinux path.\n\n"
+			   "Samples in kernel modules won't be resolved "
+			   "at all.\n\n"
+			   "If some relocation was applied (e.g. kexec) "
+			   "symbols may be misresolved even with a suitable "
+			   "vmlinux or kallsyms file.\n\n");
+
 	if (no_buildid_cache || no_buildid)
 		disable_buildid_cache();
 
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 498c6f70a74..99156c35bc6 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -116,6 +116,9 @@ static int process_sample_event(union perf_event *event,
 	if (al.filtered || (hide_unresolved && al.sym == NULL))
 		return 0;
 
+	if (al.map != NULL)
+		al.map->dso->hit = 1;
+
 	if (perf_session__add_hist_entry(session, &al, sample, evsel)) {
 		pr_debug("problem incrementing symbol period, skipping event\n");
 		return -1;
@@ -249,6 +252,8 @@ static int __cmd_report(void)
 	u64 nr_samples;
 	struct perf_session *session;
 	struct perf_evsel *pos;
+	struct map *kernel_map;
+	struct kmap *kernel_kmap;
 	const char *help = "For a higher level overview, try: perf report --sort comm,dso";
 
 	signal(SIGINT, sig_handler);
@@ -268,6 +273,27 @@ static int __cmd_report(void)
 	if (ret)
 		goto out_delete;
 
+	kernel_map = session->host_machine.vmlinux_maps[MAP__FUNCTION];
+	kernel_kmap = map__kmap(kernel_map);
+	if (kernel_map == NULL ||
+	    (kernel_map->dso->hit &&
+	     (kernel_kmap->ref_reloc_sym == NULL ||
+	      kernel_kmap->ref_reloc_sym->addr == 0))) {
+		const struct dso *kdso = kernel_map->dso;
+
+		ui__warning("Kernel address maps "
+			    "(/proc/{kallsyms,modules}) were restricted, "
+			    "check /proc/sys/kernel/kptr_restrict before "
+			    "running 'perf record'.\n\n%s\n\n"
+			    "Samples in kernel modules can't be resolved "
+			    "as well.\n\n",
+			    RB_EMPTY_ROOT(&kdso->symbols[MAP__FUNCTION]) ?
+			    "As no suitable kallsyms nor vmlinux was found, "
+			    "kernel samples can't be resolved." :
+			    "If some relocation was applied (e.g. kexec) "
+			    "symbols may be misresolved.");
+	}
+
 	if (dump_trace) {
 		perf_session__fprintf_nr_events(session, stdout);
 		goto out_delete;
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 6635fcd11ca..0fe9adf7637 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -553,9 +553,18 @@ static int perf_event__process_kernel_mmap(union perf_event *event,
 			goto out_problem;
 
 		perf_event__set_kernel_mmap_len(event, machine->vmlinux_maps);
-		perf_session__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps,
-							 symbol_name,
-							 event->mmap.pgoff);
+
+		/*
+		 * Avoid using a zero address (kptr_restrict) for the ref reloc
+		 * symbol. Effectively having zero here means that at record
+		 * time /proc/sys/kernel/kptr_restrict was non zero.
+		 */
+		if (event->mmap.pgoff != 0) {
+			perf_session__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps,
+								 symbol_name,
+								 event->mmap.pgoff);
+		}
+
 		if (machine__is_default_guest(machine)) {
 			/*
 			 * preload dso of guest kernel and modules
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 0717bebc764..afb0849fe53 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -193,9 +193,13 @@ int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
 	     *linkname = malloc(size), *targetname;
 	int len, err = -1;
 
-	if (is_kallsyms)
+	if (is_kallsyms) {
+		if (symbol_conf.kptr_restrict) {
+			pr_debug("Not caching a kptr_restrict'ed /proc/kallsyms\n");
+			return 0;
+		}
 		realname = (char *)name;
-	else
+	} else
 		realname = realpath(name, NULL);
 
 	if (realname == NULL || filename == NULL || linkname == NULL)
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 516876dfbe5..eec196329fd 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -676,9 +676,30 @@ discard_symbol:		rb_erase(&pos->rb_node, root);
 	return count + moved;
 }
 
+static bool symbol__restricted_filename(const char *filename,
+					const char *restricted_filename)
+{
+	bool restricted = false;
+
+	if (symbol_conf.kptr_restrict) {
+		char *r = realpath(filename, NULL);
+
+		if (r != NULL) {
+			restricted = strcmp(r, restricted_filename) == 0;
+			free(r);
+			return restricted;
+		}
+	}
+
+	return restricted;
+}
+
 int dso__load_kallsyms(struct dso *dso, const char *filename,
 		       struct map *map, symbol_filter_t filter)
 {
+	if (symbol__restricted_filename(filename, "/proc/kallsyms"))
+		return -1;
+
 	if (dso__load_all_kallsyms(dso, filename, map) < 0)
 		return -1;
 
@@ -1790,6 +1811,9 @@ static int machine__create_modules(struct machine *machine)
 		modules = path;
 	}
 
+	if (symbol__restricted_filename(path, "/proc/modules"))
+		return -1;
+
 	file = fopen(modules, "r");
 	if (file == NULL)
 		return -1;
@@ -2239,6 +2263,9 @@ static u64 machine__get_kernel_start_addr(struct machine *machine)
 		}
 	}
 
+	if (symbol__restricted_filename(filename, "/proc/kallsyms"))
+		return 0;
+
 	if (kallsyms__parse(filename, &args, symbol__in_kernel) <= 0)
 		return 0;
 
@@ -2410,6 +2437,25 @@ static int setup_list(struct strlist **list, const char *list_str,
 	return 0;
 }
 
+static bool symbol__read_kptr_restrict(void)
+{
+	bool value = false;
+
+	if (geteuid() != 0) {
+		FILE *fp = fopen("/proc/sys/kernel/kptr_restrict", "r");
+		if (fp != NULL) {
+			char line[8];
+
+			if (fgets(line, sizeof(line), fp) != NULL)
+				value = atoi(line) != 0;
+
+			fclose(fp);
+		}
+	}
+
+	return value;
+}
+
 int symbol__init(void)
 {
 	const char *symfs;
@@ -2456,6 +2502,8 @@ int symbol__init(void)
 	if (symfs != symbol_conf.symfs)
 		free((void *)symfs);
 
+	symbol_conf.kptr_restrict = symbol__read_kptr_restrict();
+
 	symbol_conf.initialized = true;
 	return 0;
 
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 242de0101a8..325ee36a9d2 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -75,7 +75,8 @@ struct symbol_conf {
 			use_callchain,
 			exclude_other,
 			show_cpu_utilization,
-			initialized;
+			initialized,
+			kptr_restrict;
 	const char	*vmlinux_name,
 			*kallsyms_name,
 			*source_prefix,
-- 
cgit v1.2.3-70-g09d2


From 75911c9bd1134f8c0b682aa1e8a8dbefec3ca07a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 26 May 2011 10:13:38 -0300
Subject: perf tools: Fix build on older systems

Where /usr/include/linux/const.h is not present, e.g. RHEL5.

Reported-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Link: http://lkml.kernel.org/n/tip-ypcw2mu0w7dl1rrc6ncz3pee@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile                   | 2 ++
 tools/perf/util/include/linux/const.h | 1 +
 2 files changed, 3 insertions(+)
 create mode 100644 tools/perf/util/include/linux/const.h

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 1455413ec7a..032ba6398a5 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -215,11 +215,13 @@ LIB_FILE=$(OUTPUT)libperf.a
 LIB_H += ../../include/linux/perf_event.h
 LIB_H += ../../include/linux/rbtree.h
 LIB_H += ../../include/linux/list.h
+LIB_H += ../../include/linux/const.h
 LIB_H += ../../include/linux/hash.h
 LIB_H += ../../include/linux/stringify.h
 LIB_H += util/include/linux/bitmap.h
 LIB_H += util/include/linux/bitops.h
 LIB_H += util/include/linux/compiler.h
+LIB_H += util/include/linux/const.h
 LIB_H += util/include/linux/ctype.h
 LIB_H += util/include/linux/kernel.h
 LIB_H += util/include/linux/list.h
diff --git a/tools/perf/util/include/linux/const.h b/tools/perf/util/include/linux/const.h
new file mode 100644
index 00000000000..1b476c9ae64
--- /dev/null
+++ b/tools/perf/util/include/linux/const.h
@@ -0,0 +1 @@
+#include "../../../../include/linux/const.h"
-- 
cgit v1.2.3-70-g09d2


From ba9f207c9f82115aba4ce04b22e0081af0ae300f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 20 May 2011 02:09:54 +0200
Subject: rcu: Fix unpaired rcu_irq_enter() from locking selftests

HARDIRQ_ENTER() maps to irq_enter() which calls rcu_irq_enter().
But HARDIRQ_EXIT() maps to __irq_exit() which doesn't call
rcu_irq_exit().

So for every locking selftest that simulates hardirq disabled,
we create an imbalance in the rcu extended quiescent state
internal state.

As a result, after the first missing rcu_irq_exit(), subsequent
irqs won't exit dyntick-idle mode after leaving the interrupt
handler.  This means that RCU won't see the affected CPU as being
in an extended quiescent state, resulting in long grace-period
delays (as in grace periods extending for hours).

To fix this, just use __irq_enter() to simulate the hardirq
context. This is sufficient for the locking selftests as we
don't need to exit any extended quiescent state or perform
any check that irqs normally do when they wake up from idle.

As a side effect, this patch makes it possible to restore
"rcu: Decrease memory-barrier usage based on semi-formal proof",
which eventually helped finding this bug.

Reported-and-tested-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stable <stable@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 lib/locking-selftest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 619313ed6c4..507a22fab73 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -144,7 +144,7 @@ static void init_shared_classes(void)
 
 #define HARDIRQ_ENTER()				\
 	local_irq_disable();			\
-	irq_enter();				\
+	__irq_enter();				\
 	WARN_ON(!in_irq());
 
 #define HARDIRQ_EXIT()				\
-- 
cgit v1.2.3-70-g09d2


From 0bbcc529fcea9c7de5e2e7243f9913b8f7302a8c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 16 May 2011 02:24:04 -0700
Subject: rcu: Add memory barriers

Add the memory barriers added by e59fb3120b.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e486f7c3ffb..3731141d8ad 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -907,6 +907,12 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	unsigned long gp_duration;
 
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+
+	/*
+	 * Ensure that all grace-period and pre-grace-period activity
+	 * is seen before the assignment to rsp->completed.
+	 */
+	smp_mb(); /* See above block comment. */
 	gp_duration = jiffies - rsp->gp_start;
 	if (gp_duration > rsp->gp_max)
 		rsp->gp_max = gp_duration;
-- 
cgit v1.2.3-70-g09d2


From 1135633bddcf7a819a1490c18d04965c490bcc1e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 16 May 2011 02:44:06 -0700
Subject: rcu: Remove old memory barriers from rcu_process_callbacks()

Second step of partitioning of commit e59fb3120b.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3731141d8ad..011bf6f261a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1460,25 +1460,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
  */
 static void rcu_process_callbacks(void)
 {
-	/*
-	 * Memory references from any prior RCU read-side critical sections
-	 * executed by the interrupted code must be seen before any RCU
-	 * grace-period manipulations below.
-	 */
-	smp_mb(); /* See above block comment. */
-
 	__rcu_process_callbacks(&rcu_sched_state,
 				&__get_cpu_var(rcu_sched_data));
 	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
 	rcu_preempt_process_callbacks();
 
-	/*
-	 * Memory references from any later RCU read-side critical sections
-	 * executed by the interrupted code must be seen after any RCU
-	 * grace-period manipulations above.
-	 */
-	smp_mb(); /* See above block comment. */
-
 	/* If we are last CPU on way to dyntick-idle mode, accelerate it. */
 	rcu_needs_cpu_flush();
 }
-- 
cgit v1.2.3-70-g09d2


From b5904090c754327ed6c2ecaefed4f7d473df393f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 16 May 2011 02:52:04 -0700
Subject: rcu: Don't do reschedule unless in irq

Condition the set_need_resched() in rcu_irq_exit() on in_irq().  This
should be a no-op, because rcu_irq_exit() should only be called from irq.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 011bf6f261a..195b3a3313e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -421,8 +421,9 @@ void rcu_irq_exit(void)
 	WARN_ON_ONCE(rdtp->dynticks & 0x1);
 
 	/* If the interrupt queued a callback, get out of dyntick mode. */
-	if (__this_cpu_read(rcu_sched_data.nxtlist) ||
-	    __this_cpu_read(rcu_bh_data.nxtlist))
+	if (in_irq() &&
+	    (__this_cpu_read(rcu_sched_data.nxtlist) ||
+	     __this_cpu_read(rcu_bh_data.nxtlist)))
 		set_need_resched();
 }
 
-- 
cgit v1.2.3-70-g09d2


From 4305ce7894dd38b0633bfc8978437320119223bd Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 16 May 2011 14:27:31 -0700
Subject: rcu: Make rcu_enter_nohz() pay attention to nesting

The old version of rcu_enter_nohz() forced RCU into nohz mode even if
the nesting count was non-zero.  This change causes rcu_enter_nohz()
to hold off for non-zero nesting counts.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 195b3a3313e..99c6038ad04 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -324,8 +324,8 @@ void rcu_enter_nohz(void)
 	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
 	local_irq_save(flags);
 	rdtp = &__get_cpu_var(rcu_dynticks);
-	rdtp->dynticks++;
-	rdtp->dynticks_nesting--;
+	if (--rdtp->dynticks_nesting == 0)
+		rdtp->dynticks++;
 	WARN_ON_ONCE(rdtp->dynticks & 0x1);
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3-70-g09d2


From 23b5c8fa01b723c70a20d6e4ef4ff54c7656d6e1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 7 Sep 2010 10:38:22 -0700
Subject: rcu: Decrease memory-barrier usage based on semi-formal proof

(Note: this was reverted, and is now being re-applied in pieces, with
this being the fifth and final piece.  See below for the reason that
it is now felt to be safe to re-apply this.)

Commit d09b62d fixed grace-period synchronization, but left some smp_mb()
invocations in rcu_process_callbacks() that are no longer needed, but
sheer paranoia prevented them from being removed.  This commit removes
them and provides a proof of correctness in their absence.  It also adds
a memory barrier to rcu_report_qs_rsp() immediately before the update to
rsp->completed in order to handle the theoretical possibility that the
compiler or CPU might move massive quantities of code into a lock-based
critical section.  This also proves that the sheer paranoia was not
entirely unjustified, at least from a theoretical point of view.

In addition, the old dyntick-idle synchronization depended on the fact
that grace periods were many milliseconds in duration, so that it could
be assumed that no dyntick-idle CPU could reorder a memory reference
across an entire grace period.  Unfortunately for this design, the
addition of expedited grace periods breaks this assumption, which has
the unfortunate side-effect of requiring atomic operations in the
functions that track dyntick-idle state for RCU.  (There is some hope
that the algorithms used in user-level RCU might be applied here, but
some work is required to handle the NMIs that user-space applications
can happily ignore.  For the short term, better safe than sorry.)

This proof assumes that neither compiler nor CPU will allow a lock
acquisition and release to be reordered, as doing so can result in
deadlock.  The proof is as follows:

1.	A given CPU declares a quiescent state under the protection of
	its leaf rcu_node's lock.

2.	If there is more than one level of rcu_node hierarchy, the
	last CPU to declare a quiescent state will also acquire the
	->lock of the next rcu_node up in the hierarchy,  but only
	after releasing the lower level's lock.  The acquisition of this
	lock clearly cannot occur prior to the acquisition of the leaf
	node's lock.

3.	Step 2 repeats until we reach the root rcu_node structure.
	Please note again that only one lock is held at a time through
	this process.  The acquisition of the root rcu_node's ->lock
	must occur after the release of that of the leaf rcu_node.

4.	At this point, we set the ->completed field in the rcu_state
	structure in rcu_report_qs_rsp().  However, if the rcu_node
	hierarchy contains only one rcu_node, then in theory the code
	preceding the quiescent state could leak into the critical
	section.  We therefore precede the update of ->completed with a
	memory barrier.  All CPUs will therefore agree that any updates
	preceding any report of a quiescent state will have happened
	before the update of ->completed.

5.	Regardless of whether a new grace period is needed, rcu_start_gp()
	will propagate the new value of ->completed to all of the leaf
	rcu_node structures, under the protection of each rcu_node's ->lock.
	If a new grace period is needed immediately, this propagation
	will occur in the same critical section that ->completed was
	set in, but courtesy of the memory barrier in #4 above, is still
	seen to follow any pre-quiescent-state activity.

6.	When a given CPU invokes __rcu_process_gp_end(), it becomes
	aware of the end of the old grace period and therefore makes
	any RCU callbacks that were waiting on that grace period eligible
	for invocation.

	If this CPU is the same one that detected the end of the grace
	period, and if there is but a single rcu_node in the hierarchy,
	we will still be in the single critical section.  In this case,
	the memory barrier in step #4 guarantees that all callbacks will
	be seen to execute after each CPU's quiescent state.

	On the other hand, if this is a different CPU, it will acquire
	the leaf rcu_node's ->lock, and will again be serialized after
	each CPU's quiescent state for the old grace period.

On the strength of this proof, this commit therefore removes the memory
barriers from rcu_process_callbacks() and adds one to rcu_report_qs_rsp().
The effect is to reduce the number of memory barriers by one and to
reduce the frequency of execution from about once per scheduling tick
per CPU to once per grace period.

This was reverted do to hangs found during testing by Yinghai Lu and
Ingo Molnar.  Frederic Weisbecker supplied Yinghai with tracing that
located the underlying problem, and Frederic also provided the fix.

The underlying problem was that the HARDIRQ_ENTER() macro from
lib/locking-selftest.c invoked irq_enter(), which in turn invokes
rcu_irq_enter(), but HARDIRQ_EXIT() invoked __irq_exit(), which
does not invoke rcu_irq_exit().  This situation resulted in calls
to rcu_irq_enter() that were not balanced by the required calls to
rcu_irq_exit().  Therefore, after these locking selftests completed,
RCU's dyntick-idle nesting count was a large number (for example,
72), which caused RCU to to conclude that the affected CPU was not in
dyntick-idle mode when in fact it was.

RCU would therefore incorrectly wait for this dyntick-idle CPU, resulting
in hangs.

In contrast, with Frederic's patch, which replaces the irq_enter()
in HARDIRQ_ENTER() with an __irq_enter(), these tests don't ever call
either rcu_irq_enter() or rcu_irq_exit(), which works because the CPU
running the test is already marked as not being in dyntick-idle mode.
This means that the rcu_irq_enter() and rcu_irq_exit() calls and RCU
then has no problem working out which CPUs are in dyntick-idle mode and
which are not.

The reason that the imbalance was not noticed before the barrier patch
was applied is that the old implementation of rcu_enter_nohz() ignored
the nesting depth.  This could still result in delays, but much shorter
ones.  Whenever there was a delay, RCU would IPI the CPU with the
unbalanced nesting level, which would eventually result in rcu_enter_nohz()
being called, which in turn would force RCU to see that the CPU was in
dyntick-idle mode.

The reason that very few people noticed the problem is that the mismatched
irq_enter() vs. __irq_exit() occured only when the kernel was built with
CONFIG_DEBUG_LOCKING_API_SELFTESTS.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 Documentation/RCU/trace.txt |  17 ++-----
 kernel/rcutree.c            | 111 ++++++++++++++++++++------------------------
 kernel/rcutree.h            |   9 ++--
 kernel/rcutree_plugin.h     |   7 ++-
 kernel/rcutree_trace.c      |  12 ++---
 5 files changed, 67 insertions(+), 89 deletions(-)

diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index c078ad48f7a..8173cec473a 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -99,18 +99,11 @@ o	"qp" indicates that RCU still expects a quiescent state from
 
 o	"dt" is the current value of the dyntick counter that is incremented
 	when entering or leaving dynticks idle state, either by the
-	scheduler or by irq.  The number after the "/" is the interrupt
-	nesting depth when in dyntick-idle state, or one greater than
-	the interrupt-nesting depth otherwise.
-
-	This field is displayed only for CONFIG_NO_HZ kernels.
-
-o	"dn" is the current value of the dyntick counter that is incremented
-	when entering or leaving dynticks idle state via NMI.  If both
-	the "dt" and "dn" values are even, then this CPU is in dynticks
-	idle mode and may be ignored by RCU.  If either of these two
-	counters is odd, then RCU must be alert to the possibility of
-	an RCU read-side critical section running on this CPU.
+	scheduler or by irq.  This number is even if the CPU is in
+	dyntick idle mode and odd otherwise.  The number after the first
+	"/" is the interrupt nesting depth when in dyntick-idle state,
+	or one greater than the interrupt-nesting depth otherwise.
+	The number after the second "/" is the NMI nesting depth.
 
 	This field is displayed only for CONFIG_NO_HZ kernels.
 
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 99c6038ad04..5616b17e4a2 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -162,7 +162,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 #ifdef CONFIG_NO_HZ
 DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 	.dynticks_nesting = 1,
-	.dynticks = 1,
+	.dynticks = ATOMIC_INIT(1),
 };
 #endif /* #ifdef CONFIG_NO_HZ */
 
@@ -321,13 +321,25 @@ void rcu_enter_nohz(void)
 	unsigned long flags;
 	struct rcu_dynticks *rdtp;
 
-	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
 	local_irq_save(flags);
 	rdtp = &__get_cpu_var(rcu_dynticks);
-	if (--rdtp->dynticks_nesting == 0)
-		rdtp->dynticks++;
-	WARN_ON_ONCE(rdtp->dynticks & 0x1);
+	if (--rdtp->dynticks_nesting) {
+		local_irq_restore(flags);
+		return;
+	}
+	/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+	smp_mb__before_atomic_inc();  /* See above. */
+	atomic_inc(&rdtp->dynticks);
+	smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */
+	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
 	local_irq_restore(flags);
+
+	/* If the interrupt queued a callback, get out of dyntick mode. */
+	if (in_irq() &&
+	    (__get_cpu_var(rcu_sched_data).nxtlist ||
+	     __get_cpu_var(rcu_bh_data).nxtlist ||
+	     rcu_preempt_needs_cpu(smp_processor_id())))
+		set_need_resched();
 }
 
 /*
@@ -343,11 +355,16 @@ void rcu_exit_nohz(void)
 
 	local_irq_save(flags);
 	rdtp = &__get_cpu_var(rcu_dynticks);
-	rdtp->dynticks++;
-	rdtp->dynticks_nesting++;
-	WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
+	if (rdtp->dynticks_nesting++) {
+		local_irq_restore(flags);
+		return;
+	}
+	smp_mb__before_atomic_inc();  /* Force ordering w/previous sojourn. */
+	atomic_inc(&rdtp->dynticks);
+	/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+	smp_mb__after_atomic_inc();  /* See above. */
+	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
 	local_irq_restore(flags);
-	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
 }
 
 /**
@@ -361,11 +378,15 @@ void rcu_nmi_enter(void)
 {
 	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
 
-	if (rdtp->dynticks & 0x1)
+	if (rdtp->dynticks_nmi_nesting == 0 &&
+	    (atomic_read(&rdtp->dynticks) & 0x1))
 		return;
-	rdtp->dynticks_nmi++;
-	WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
-	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+	rdtp->dynticks_nmi_nesting++;
+	smp_mb__before_atomic_inc();  /* Force delay from prior write. */
+	atomic_inc(&rdtp->dynticks);
+	/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+	smp_mb__after_atomic_inc();  /* See above. */
+	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
 }
 
 /**
@@ -379,11 +400,14 @@ void rcu_nmi_exit(void)
 {
 	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
 
-	if (rdtp->dynticks & 0x1)
+	if (rdtp->dynticks_nmi_nesting == 0 ||
+	    --rdtp->dynticks_nmi_nesting != 0)
 		return;
-	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
-	rdtp->dynticks_nmi++;
-	WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
+	/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+	smp_mb__before_atomic_inc();  /* See above. */
+	atomic_inc(&rdtp->dynticks);
+	smp_mb__after_atomic_inc();  /* Force delay to next write. */
+	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
 }
 
 /**
@@ -394,13 +418,7 @@ void rcu_nmi_exit(void)
  */
 void rcu_irq_enter(void)
 {
-	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
-
-	if (rdtp->dynticks_nesting++)
-		return;
-	rdtp->dynticks++;
-	WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
-	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+	rcu_exit_nohz();
 }
 
 /**
@@ -412,19 +430,7 @@ void rcu_irq_enter(void)
  */
 void rcu_irq_exit(void)
 {
-	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
-
-	if (--rdtp->dynticks_nesting)
-		return;
-	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
-	rdtp->dynticks++;
-	WARN_ON_ONCE(rdtp->dynticks & 0x1);
-
-	/* If the interrupt queued a callback, get out of dyntick mode. */
-	if (in_irq() &&
-	    (__this_cpu_read(rcu_sched_data.nxtlist) ||
-	     __this_cpu_read(rcu_bh_data.nxtlist)))
-		set_need_resched();
+	rcu_enter_nohz();
 }
 
 #ifdef CONFIG_SMP
@@ -436,19 +442,8 @@ void rcu_irq_exit(void)
  */
 static int dyntick_save_progress_counter(struct rcu_data *rdp)
 {
-	int ret;
-	int snap;
-	int snap_nmi;
-
-	snap = rdp->dynticks->dynticks;
-	snap_nmi = rdp->dynticks->dynticks_nmi;
-	smp_mb();	/* Order sampling of snap with end of grace period. */
-	rdp->dynticks_snap = snap;
-	rdp->dynticks_nmi_snap = snap_nmi;
-	ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
-	if (ret)
-		rdp->dynticks_fqs++;
-	return ret;
+	rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+	return 0;
 }
 
 /*
@@ -459,16 +454,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
  */
 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 {
-	long curr;
-	long curr_nmi;
-	long snap;
-	long snap_nmi;
+	unsigned long curr;
+	unsigned long snap;
 
-	curr = rdp->dynticks->dynticks;
-	snap = rdp->dynticks_snap;
-	curr_nmi = rdp->dynticks->dynticks_nmi;
-	snap_nmi = rdp->dynticks_nmi_snap;
-	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+	curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
+	snap = (unsigned long)rdp->dynticks_snap;
 
 	/*
 	 * If the CPU passed through or entered a dynticks idle phase with
@@ -478,8 +468,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	 * read-side critical section that started before the beginning
 	 * of the current RCU grace period.
 	 */
-	if ((curr != snap || (curr & 0x1) == 0) &&
-	    (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
+	if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
 		rdp->dynticks_fqs++;
 		return 1;
 	}
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 257664815d5..93d4a1c2e88 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,11 +84,9 @@
  * Dynticks per-CPU state.
  */
 struct rcu_dynticks {
-	int dynticks_nesting;	/* Track nesting level, sort of. */
-	int dynticks;		/* Even value for dynticks-idle, else odd. */
-	int dynticks_nmi;	/* Even value for either dynticks-idle or */
-				/*  not in nmi handler, else odd.  So this */
-				/*  remains even for nmi from irq handler. */
+	int dynticks_nesting;	/* Track irq/process nesting level. */
+	int dynticks_nmi_nesting; /* Track NMI nesting level. */
+	atomic_t dynticks;	/* Even value for dynticks-idle, else odd. */
 };
 
 /* RCU's kthread states for tracing. */
@@ -284,7 +282,6 @@ struct rcu_data {
 	/* 3) dynticks interface. */
 	struct rcu_dynticks *dynticks;	/* Shared per-CPU dynticks state. */
 	int dynticks_snap;		/* Per-GP tracking for dynticks. */
-	int dynticks_nmi_snap;		/* Per-GP tracking for dynticks_nmi. */
 #endif /* #ifdef CONFIG_NO_HZ */
 
 	/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3f6559a5f5c..ed339702481 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1520,7 +1520,6 @@ int rcu_needs_cpu(int cpu)
 {
 	int c = 0;
 	int snap;
-	int snap_nmi;
 	int thatcpu;
 
 	/* Check for being in the holdoff period. */
@@ -1531,10 +1530,10 @@ int rcu_needs_cpu(int cpu)
 	for_each_online_cpu(thatcpu) {
 		if (thatcpu == cpu)
 			continue;
-		snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
-		snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
+		snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
+						     thatcpu).dynticks);
 		smp_mb(); /* Order sampling of snap with end of grace period. */
-		if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
+		if ((snap & 0x1) != 0) {
 			per_cpu(rcu_dyntick_drain, cpu) = 0;
 			per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
 			return rcu_needs_cpu_quick_check(cpu);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index aa0fd72b4bc..9678cc3650f 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -69,10 +69,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
 		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
 		   rdp->qs_pending);
 #ifdef CONFIG_NO_HZ
-	seq_printf(m, " dt=%d/%d dn=%d df=%lu",
-		   rdp->dynticks->dynticks,
+	seq_printf(m, " dt=%d/%d/%d df=%lu",
+		   atomic_read(&rdp->dynticks->dynticks),
 		   rdp->dynticks->dynticks_nesting,
-		   rdp->dynticks->dynticks_nmi,
+		   rdp->dynticks->dynticks_nmi_nesting,
 		   rdp->dynticks_fqs);
 #endif /* #ifdef CONFIG_NO_HZ */
 	seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
@@ -141,9 +141,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
 		   rdp->qs_pending);
 #ifdef CONFIG_NO_HZ
 	seq_printf(m, ",%d,%d,%d,%lu",
-		   rdp->dynticks->dynticks,
+		   atomic_read(&rdp->dynticks->dynticks),
 		   rdp->dynticks->dynticks_nesting,
-		   rdp->dynticks->dynticks_nmi,
+		   rdp->dynticks->dynticks_nmi_nesting,
 		   rdp->dynticks_fqs);
 #endif /* #ifdef CONFIG_NO_HZ */
 	seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
@@ -167,7 +167,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
 {
 	seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
 #ifdef CONFIG_NO_HZ
-	seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
+	seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
 #endif /* #ifdef CONFIG_NO_HZ */
 	seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
 #ifdef CONFIG_TREE_PREEMPT_RCU
-- 
cgit v1.2.3-70-g09d2


From e9cdd343a5e42c43bcda01e609fa23089e026470 Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky <ostr@amd64.org>
Date: Thu, 26 May 2011 11:19:52 -0400
Subject: x86, amd: Do not enable ARAT feature on AMD processors below family
 0x12

Commit b87cf80af3ba4b4c008b4face3c68d604e1715c6 added support for
ARAT (Always Running APIC timer) on AMD processors that are not
affected by erratum 400. This erratum is present on certain processor
families and prevents APIC timer from waking up the CPU when it
is in a deep C state, including C1E state.

Determining whether a processor is affected by this erratum may
have some corner cases and handling these cases is somewhat
complicated. In the interest of simplicity we won't claim ARAT
support on processor families below 0x12 and will go back to
broadcasting timer when going idle.

Signed-off-by: Boris Ostrovsky <ostr@amd64.org>
Link: http://lkml.kernel.org/r/1306423192-19774-1-git-send-email-ostr@amd64.org
Tested-by: Boris Petkov <borislav.petkov@amd.com>
Cc: Hans Rosenfeld <Hans.Rosenfeld@amd.com>
Cc: Andreas Herrmann <Andreas.Herrmann3@amd.com>
Cc: Chuck Ebbert <cebbert@redhat.com>
Cc: stable@kernel.org # 32.x, 38.x, 39.x
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/amd.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 8f5cabb3c5b..b13ed393dfc 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -612,8 +612,11 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	}
 #endif
 
-	/* As a rule processors have APIC timer running in deep C states */
-	if (c->x86 > 0xf && !cpu_has_amd_erratum(amd_erratum_400))
+	/*
+	 * Family 0x12 and above processors have APIC timer
+	 * running in deep C states.
+	 */
+	if (c->x86 > 0x11)
 		set_cpu_cap(c, X86_FEATURE_ARAT);
 
 	/*
-- 
cgit v1.2.3-70-g09d2


From 21bc7af6e5e684b44725b20f679e701e38ceef15 Mon Sep 17 00:00:00 2001
From: Yogesh Ashok Powar <yogeshp@marvell.com>
Date: Wed, 18 May 2011 12:02:03 -0700
Subject: mwifiex: correct event header length

While decoding received event packet from firmware, 4 bytes
of interface header are already removed unconditionally.
So for handling event only 4 more bytes needs to be pulled.
This is achieved by changing event header length to 4.

Almost all the events, except BA stream related and AMSDU
aggregation control events, do not have the payload in their
event skb. Such events handling depends only on the event ID.
This event ID is the first four bytes of the event skb, which
is copied to a separate variable before pulling the skb header.
Hence event handling worked only for those events that didn't
have payload in event skb.

This patch fixes the broken event path of the events with
payload in their event skb without harming existing working
event path for the events without payload.

Signed-off-by: Yogesh Ashok Powar <yogeshp@marvell.com>
Signed-off-by: Kiran Divekar <dkiran@marvell.com>
Signed-off-by: Bing Zhao <bzhao@marvell.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/mwifiex/sdio.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mwifiex/sdio.h b/drivers/net/wireless/mwifiex/sdio.h
index a0e9bc5253e..4e97e90aa39 100644
--- a/drivers/net/wireless/mwifiex/sdio.h
+++ b/drivers/net/wireless/mwifiex/sdio.h
@@ -167,8 +167,8 @@
 /* Rx unit register */
 #define CARD_RX_UNIT_REG		0x63
 
-/* Event header Len*/
-#define MWIFIEX_EVENT_HEADER_LEN           8
+/* Event header len w/o 4 bytes of interface header */
+#define MWIFIEX_EVENT_HEADER_LEN           4
 
 /* Max retry number of CMD53 write */
 #define MAX_WRITE_IOMEM_RETRY		2
-- 
cgit v1.2.3-70-g09d2


From 208c72f4fe44fe09577e7975ba0e7fa0278f3d03 Mon Sep 17 00:00:00 2001
From: Luciano Coelho <coelho@ti.com>
Date: Thu, 19 May 2011 00:43:38 +0300
Subject: nl80211: fix check for valid SSID size in scan operations

In both trigger_scan and sched_scan operations, we were checking for
the SSID length before assigning the value correctly.  Since the
memory was just kzalloc'ed, the check was always failing and SSID with
over 32 characters were allowed to go through.

This was causing a buffer overflow when copying the actual SSID to the
proper place.

This bug has been there since 2.6.29-rc4.

Cc: stable@kernel.org
Signed-off-by: Luciano Coelho <coelho@ti.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/wireless/nl80211.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ec83f413a7e..88a565f130a 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3406,12 +3406,12 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 	i = 0;
 	if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) {
 		nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) {
+			request->ssids[i].ssid_len = nla_len(attr);
 			if (request->ssids[i].ssid_len > IEEE80211_MAX_SSID_LEN) {
 				err = -EINVAL;
 				goto out_free;
 			}
 			memcpy(request->ssids[i].ssid, nla_data(attr), nla_len(attr));
-			request->ssids[i].ssid_len = nla_len(attr);
 			i++;
 		}
 	}
@@ -3572,6 +3572,7 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,
 	if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) {
 		nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS],
 				    tmp) {
+			request->ssids[i].ssid_len = nla_len(attr);
 			if (request->ssids[i].ssid_len >
 			    IEEE80211_MAX_SSID_LEN) {
 				err = -EINVAL;
@@ -3579,7 +3580,6 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,
 			}
 			memcpy(request->ssids[i].ssid, nla_data(attr),
 			       nla_len(attr));
-			request->ssids[i].ssid_len = nla_len(attr);
 			i++;
 		}
 	}
-- 
cgit v1.2.3-70-g09d2


From a9e12869758430424804dd4332e0d2afdfdf00b0 Mon Sep 17 00:00:00 2001
From: Larry Finger <Larry.Finger@lwfinger.net>
Date: Thu, 19 May 2011 10:17:04 -0500
Subject: rtlwifi: Fix kernel panic resulting from RX buffer allocation failure

To handle amsdu_8k capability, the PCI routine of this driver must
allocate receive buffers of order 2. Under heavy load, this causes
fragmentation of memory. The present code releases the current buffer
before checking to see if a new one is availble. Recovery from
allocation failures is not possible, which results in kernel panics.

The fix is to reorder the code to check that a new buffer can be
allocated before the old one is released. If not possible, the
received frame is dropped and the old one is reused. Without this
change, it is impossible to transfer a 2 GB file without a kernel panic.

Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
Cc: Stable <stable@vger.kernel.org>              [2.6.{37,38,39}]
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/rtlwifi/pci.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/net/wireless/rtlwifi/pci.c b/drivers/net/wireless/rtlwifi/pci.c
index a4095284543..c2b83a57c58 100644
--- a/drivers/net/wireless/rtlwifi/pci.c
+++ b/drivers/net/wireless/rtlwifi/pci.c
@@ -669,11 +669,6 @@ static void _rtl_pci_rx_interrupt(struct ieee80211_hw *hw)
 							 &rx_status,
 							 (u8 *) pdesc, skb);
 
-			pci_unmap_single(rtlpci->pdev,
-					 *((dma_addr_t *) skb->cb),
-					 rtlpci->rxbuffersize,
-					 PCI_DMA_FROMDEVICE);
-
 			skb_put(skb, rtlpriv->cfg->ops->get_desc((u8 *) pdesc,
 							 false,
 							 HW_DESC_RXPKT_LEN));
@@ -690,6 +685,21 @@ static void _rtl_pci_rx_interrupt(struct ieee80211_hw *hw)
 			hdr = rtl_get_hdr(skb);
 			fc = rtl_get_fc(skb);
 
+			/* try for new buffer - if allocation fails, drop
+			 * frame and reuse old buffer
+			 */
+			new_skb = dev_alloc_skb(rtlpci->rxbuffersize);
+			if (unlikely(!new_skb)) {
+				RT_TRACE(rtlpriv, (COMP_INTR | COMP_RECV),
+					 DBG_DMESG,
+					 ("can't alloc skb for rx\n"));
+				goto done;
+			}
+			pci_unmap_single(rtlpci->pdev,
+					 *((dma_addr_t *) skb->cb),
+					 rtlpci->rxbuffersize,
+					 PCI_DMA_FROMDEVICE);
+
 			if (!stats.crc || !stats.hwerror) {
 				memcpy(IEEE80211_SKB_RXCB(skb), &rx_status,
 				       sizeof(rx_status));
@@ -758,15 +768,7 @@ static void _rtl_pci_rx_interrupt(struct ieee80211_hw *hw)
 				rtl_lps_leave(hw);
 			}
 
-			new_skb = dev_alloc_skb(rtlpci->rxbuffersize);
-			if (unlikely(!new_skb)) {
-				RT_TRACE(rtlpriv, (COMP_INTR | COMP_RECV),
-					 DBG_DMESG,
-					 ("can't alloc skb for rx\n"));
-				goto done;
-			}
 			skb = new_skb;
-			/*skb->dev = dev; */
 
 			rtlpci->rx_ring[rx_queue_idx].rx_buf[rtlpci->
 							     rx_ring
-- 
cgit v1.2.3-70-g09d2


From 0019a2c9277bf6d083032a5a9857249e75407a8c Mon Sep 17 00:00:00 2001
From: Larry Finger <Larry.Finger@lwfinger.net>
Date: Thu, 19 May 2011 11:48:45 -0500
Subject: rtlwifi: Use order 2 RX buffer allocation only if necessary

Although a previous fix handles the kernel panics that result from
failure to allocate a new RX buffer, memory fragmentation can be
reduced if the amsdu_8k capability is disabled as new buffers need only
be of O(0), not O(2).

Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/rtlwifi/pci.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/wireless/rtlwifi/pci.c b/drivers/net/wireless/rtlwifi/pci.c
index c2b83a57c58..89100e7c553 100644
--- a/drivers/net/wireless/rtlwifi/pci.c
+++ b/drivers/net/wireless/rtlwifi/pci.c
@@ -1115,6 +1115,13 @@ static int _rtl_pci_init_rx_ring(struct ieee80211_hw *hw)
 
 		rtlpci->rx_ring[rx_queue_idx].idx = 0;
 
+		/* If amsdu_8k is disabled, set buffersize to 4096. This
+		 * change will reduce memory fragmentation.
+		 */
+		if (rtlpci->rxbuffersize > 4096 &&
+		    rtlpriv->rtlhal.disable_amsdu_8k)
+			rtlpci->rxbuffersize = 4096;
+
 		for (i = 0; i < rtlpci->rxringcount; i++) {
 			struct sk_buff *skb =
 			    dev_alloc_skb(rtlpci->rxbuffersize);
-- 
cgit v1.2.3-70-g09d2


From fb23d86382a088d50020fd05024d40af5b00f885 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Fri, 20 May 2011 01:04:46 +0200
Subject: b43: N-PHY: initialize last var in calibration function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported-by: Larry Finger <larry.finger@lwfinger.net>
Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/b43/phy_n.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/b43/phy_n.c b/drivers/net/wireless/b43/phy_n.c
index 9ed65157bef..05960ddde24 100644
--- a/drivers/net/wireless/b43/phy_n.c
+++ b/drivers/net/wireless/b43/phy_n.c
@@ -3093,7 +3093,7 @@ static int b43_nphy_cal_tx_iq_lo(struct b43_wldev *dev,
 	int freq;
 	bool avoid = false;
 	u8 length;
-	u16 tmp, core, type, count, max, numb, last, cmd;
+	u16 tmp, core, type, count, max, numb, last = 0, cmd;
 	const u16 *table;
 	bool phy6or5x;
 
-- 
cgit v1.2.3-70-g09d2


From a4d86d953b8593791cb29cf2acffd48f9ee6c4f9 Mon Sep 17 00:00:00 2001
From: Rajkumar Manoharan <rmanoharan@atheros.com>
Date: Fri, 20 May 2011 17:52:10 +0530
Subject: ath9k: Reset chip on baseband hang

Resetting hardware helps to recover from baseband
hang/panic for AR9003 based chips.

Cc: stable@kernel.org
Signed-off-by: Rajkumar Manoharan <rmanoharan@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath/ath9k/main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index bd18b1d01be..409ef5885a1 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -670,7 +670,8 @@ void ath9k_tasklet(unsigned long data)
 	u32 status = sc->intrstatus;
 	u32 rxmask;
 
-	if (status & ATH9K_INT_FATAL) {
+	if ((status & ATH9K_INT_FATAL) ||
+	    (status & ATH9K_INT_BB_WATCHDOG)) {
 		ath_reset(sc, true);
 		return;
 	}
@@ -737,6 +738,7 @@ irqreturn_t ath_isr(int irq, void *dev)
 {
 #define SCHED_INTR (				\
 		ATH9K_INT_FATAL |		\
+		ATH9K_INT_BB_WATCHDOG |		\
 		ATH9K_INT_RXORN |		\
 		ATH9K_INT_RXEOL |		\
 		ATH9K_INT_RX |			\
-- 
cgit v1.2.3-70-g09d2


From 51ac8cbb2176dc159ee910d7074c6796079c3068 Mon Sep 17 00:00:00 2001
From: Rajkumar Manoharan <rmanoharan@atheros.com>
Date: Fri, 20 May 2011 17:52:13 +0530
Subject: ath9k_hw: disable phy restart on baseband panic caused by RXSM

While receiving unsupported rate frame rx state machine
gets into a state 0xb and if phy_restart happens in that
state, BB would go hang. If RXSM is in 0xb state after
first bb panic, ensure to disable the phy_restart.

Signed-off-by: Rajkumar Manoharan <rmanoharan@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath/ath9k/ar9003_phy.c | 22 ++++++++++++++++++++++
 drivers/net/wireless/ath/ath9k/hw.c         |  5 ++++-
 drivers/net/wireless/ath/ath9k/hw.h         |  2 ++
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath9k/ar9003_phy.c b/drivers/net/wireless/ath/ath9k/ar9003_phy.c
index eee23ecd118..892c48b1543 100644
--- a/drivers/net/wireless/ath/ath9k/ar9003_phy.c
+++ b/drivers/net/wireless/ath/ath9k/ar9003_phy.c
@@ -1381,3 +1381,25 @@ void ar9003_hw_bb_watchdog_dbg_info(struct ath_hw *ah)
 		"==== BB update: done ====\n\n");
 }
 EXPORT_SYMBOL(ar9003_hw_bb_watchdog_dbg_info);
+
+void ar9003_hw_disable_phy_restart(struct ath_hw *ah)
+{
+	u32 val;
+
+	/* While receiving unsupported rate frame rx state machine
+	 * gets into a state 0xb and if phy_restart happens in that
+	 * state, BB would go hang. If RXSM is in 0xb state after
+	 * first bb panic, ensure to disable the phy_restart.
+	 */
+	if (!((MS(ah->bb_watchdog_last_status,
+		  AR_PHY_WATCHDOG_RX_OFDM_SM) == 0xb) ||
+	    ah->bb_hang_rx_ofdm))
+		return;
+
+	ah->bb_hang_rx_ofdm = true;
+	val = REG_READ(ah, AR_PHY_RESTART);
+	val &= ~AR_PHY_RESTART_ENA;
+
+	REG_WRITE(ah, AR_PHY_RESTART, val);
+}
+EXPORT_SYMBOL(ar9003_hw_disable_phy_restart);
diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c
index 72543ce8f61..1be7c8bbef8 100644
--- a/drivers/net/wireless/ath/ath9k/hw.c
+++ b/drivers/net/wireless/ath/ath9k/hw.c
@@ -1555,9 +1555,12 @@ int ath9k_hw_reset(struct ath_hw *ah, struct ath9k_channel *chan,
 	if (ah->btcoex_hw.enabled)
 		ath9k_hw_btcoex_enable(ah);
 
-	if (AR_SREV_9300_20_OR_LATER(ah))
+	if (AR_SREV_9300_20_OR_LATER(ah)) {
 		ar9003_hw_bb_watchdog_config(ah);
 
+		ar9003_hw_disable_phy_restart(ah);
+	}
+
 	ath9k_hw_apply_gpio_override(ah);
 
 	return 0;
diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h
index 57435ce6279..4b157c53d1a 100644
--- a/drivers/net/wireless/ath/ath9k/hw.h
+++ b/drivers/net/wireless/ath/ath9k/hw.h
@@ -842,6 +842,7 @@ struct ath_hw {
 
 	u32 bb_watchdog_last_status;
 	u32 bb_watchdog_timeout_ms; /* in ms, 0 to disable */
+	u8 bb_hang_rx_ofdm; /* true if bb hang due to rx_ofdm */
 
 	unsigned int paprd_target_power;
 	unsigned int paprd_training_power;
@@ -990,6 +991,7 @@ void ar9002_hw_enable_wep_aggregation(struct ath_hw *ah);
 void ar9003_hw_bb_watchdog_config(struct ath_hw *ah);
 void ar9003_hw_bb_watchdog_read(struct ath_hw *ah);
 void ar9003_hw_bb_watchdog_dbg_info(struct ath_hw *ah);
+void ar9003_hw_disable_phy_restart(struct ath_hw *ah);
 void ar9003_paprd_enable(struct ath_hw *ah, bool val);
 void ar9003_paprd_populate_single_table(struct ath_hw *ah,
 					struct ath9k_hw_cal_data *caldata,
-- 
cgit v1.2.3-70-g09d2


From 41e2b05b9598d6bdf91fc20280bfc538d853f769 Mon Sep 17 00:00:00 2001
From: Rajkumar Manoharan <rmanoharan@atheros.com>
Date: Fri, 20 May 2011 17:52:14 +0530
Subject: ath9k: set 40 Mhz rate only if hw is configured in ht40

Whenever there is a channel width change from 40 Mhz to 20 Mhz,
the hardware is reconfigured to ht20. Meantime before doing
the rate control updation, the packets are being transmitted are
selected rate with IEEE80211_TX_RC_40_MHZ_WIDTH.

While transmitting ht40 rate packets in ht20 mode is causing
baseband panic with AR9003 based chips.

==== BB update: BB status=0x02001109 ====
ath: ** BB state: wd=1 det=1 rdar=0 rOFDM=1 rCCK=1 tOFDM=0 tCCK=0 agc=2
src=0 **
ath: ** BB WD cntl: cntl1=0xffff0085 cntl2=0x00000004 **
ath: ** BB mode: BB_gen_controls=0x000033c0 **
ath: ** BB busy times: rx_clear=99%, rx_frame=0%, tx_frame=0% **
ath: ==== BB update: done ====

Cc: stable@kernel.org
Signed-off-by: Rajkumar Manoharan <rmanoharan@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath/ath9k/rc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath9k/rc.c b/drivers/net/wireless/ath/ath9k/rc.c
index 133502b9c83..c0d0773baa2 100644
--- a/drivers/net/wireless/ath/ath9k/rc.c
+++ b/drivers/net/wireless/ath/ath9k/rc.c
@@ -689,7 +689,8 @@ static void ath_rc_rate_set_series(const struct ath_rate_table *rate_table,
 
 	if (WLAN_RC_PHY_HT(rate_table->info[rix].phy)) {
 		rate->flags |= IEEE80211_TX_RC_MCS;
-		if (WLAN_RC_PHY_40(rate_table->info[rix].phy))
+		if (WLAN_RC_PHY_40(rate_table->info[rix].phy) &&
+		    conf_is_ht40(&txrc->hw->conf))
 			rate->flags |= IEEE80211_TX_RC_40_MHZ_WIDTH;
 		if (WLAN_RC_PHY_SGI(rate_table->info[rix].phy))
 			rate->flags |= IEEE80211_TX_RC_SHORT_GI;
-- 
cgit v1.2.3-70-g09d2


From 1d38c16ce4156f63b45abbd09dd28ca2ef5172b4 Mon Sep 17 00:00:00 2001
From: Rajkumar Manoharan <rmanoharan@atheros.com>
Date: Fri, 20 May 2011 17:52:15 +0530
Subject: mac80211: stop queues before rate control updation

Stop tx queues before updating rate control to ensure
proper rate selection. Otherwise packets can be transmitted
in 40 Mhz whereas hw is configured in HT20.

Signed-off-by: Rajkumar Manoharan <rmanoharan@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 4f6b2675e41..746595597b6 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -232,6 +232,9 @@ static u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata,
 		WARN_ON(!ieee80211_set_channel_type(local, sdata, channel_type));
 	}
 
+	ieee80211_stop_queues_by_reason(&sdata->local->hw,
+					IEEE80211_QUEUE_STOP_REASON_CSA);
+
 	/* channel_type change automatically detected */
 	ieee80211_hw_config(local, 0);
 
@@ -245,6 +248,9 @@ static u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata,
 		rcu_read_unlock();
 	}
 
+	ieee80211_wake_queues_by_reason(&sdata->local->hw,
+					IEEE80211_QUEUE_STOP_REASON_CSA);
+
 	ht_opmode = le16_to_cpu(hti->operation_mode);
 
 	/* if bss configuration changed store the new one */
-- 
cgit v1.2.3-70-g09d2


From bc9af76b1e87e4f925f97368dae6c22266922e8b Mon Sep 17 00:00:00 2001
From: Vinod Koul <vinod.koul@intel.com>
Date: Wed, 25 May 2011 16:56:34 +0530
Subject: dmaengine: add TODO items for future work on dma drivers

Signed-off-by: Vinod Koul <vinod.koul@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dma/TODO | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 drivers/dma/TODO

diff --git a/drivers/dma/TODO b/drivers/dma/TODO
new file mode 100644
index 00000000000..a4af8589330
--- /dev/null
+++ b/drivers/dma/TODO
@@ -0,0 +1,14 @@
+TODO for slave dma
+
+1. Move remaining drivers to use new slave interface
+2. Remove old slave pointer machansim
+3. Make issue_pending to start the transaction in below drivers
+	- mpc512x_dma
+	- imx-dma
+	- imx-sdma
+	- mxs-dma.c
+	- dw_dmac
+	- intel_mid_dma
+	- ste_dma40
+4. Check other subsystems for dma drivers and merge/move to dmaengine
+5. Remove dma_slave_config's dma direction.
-- 
cgit v1.2.3-70-g09d2


From 7fe4fc881d1340f17396f52c836e597dadadea42 Mon Sep 17 00:00:00 2001
From: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Date: Wed, 25 May 2011 10:33:17 -0700
Subject: Intel xhci: Add PCI id for Panther Point xHCI host.

This adds the PCI ID for the xHCI (USB 3.0) host controller in the Intel
Panther Point chipset.  It will be used by both the EHCI and xHCI driver
in the following patches.

Signed-off-by: Sarah Sharp <sarah.a.sharp@linux.intel.com>
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 24787b75128..a311008af5e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2483,6 +2483,7 @@
 #define PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX	0x1c5f
 #define PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0	0x1d40
 #define PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1	0x1d41
+#define PCI_DEVICE_ID_INTEL_PANTHERPOINT_XHCI	0x1e31
 #define PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MIN	0x1e40
 #define PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MAX	0x1e5f
 #define PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN	0x2310
-- 
cgit v1.2.3-70-g09d2


From 5dbd05d46fb7d849570c8fa09d48591aa7ce1766 Mon Sep 17 00:00:00 2001
From: Vinod Koul <vinod.koul@intel.com>
Date: Wed, 25 May 2011 23:36:30 +0530
Subject: maintainers: add dma engine tree details

Signed-off-by: Vinod Koul <vinod.koul@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8f4413014b2..44358930608 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2169,6 +2169,8 @@ M:	Dan Williams <dan.j.williams@intel.com>
 S:	Supported
 F:	drivers/dma/
 F:	include/linux/dma*
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/djbw/async_tx.git
+T:	git git://git.infradead.org/users/vkoul/slave-dma.git (slave-dma)
 
 DME1737 HARDWARE MONITOR DRIVER
 M:	Juerg Haefliger <juergh@gmail.com>
-- 
cgit v1.2.3-70-g09d2


From 19d78a61be6dd707dcec298c486303d4ba2c840a Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Fri, 6 May 2011 10:33:44 -0500
Subject: x86: poll waiting for I/OAT DMA channel status

For certain system configurations a 5 usec udelay before checking I/OAT DMA
channel status is sometimes not sufficient, resulting in a false failure
status and unnecessary freeing of channel resources.  Conversely, for many
configurations 5 usec is longer than necessary.

Loop for up to 20 usec waiting for successful status before failing.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dma/ioat/dma_v2.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c
index effd140fc04..54d1a3c24e9 100644
--- a/drivers/dma/ioat/dma_v2.c
+++ b/drivers/dma/ioat/dma_v2.c
@@ -507,6 +507,7 @@ int ioat2_alloc_chan_resources(struct dma_chan *c)
 	struct ioat_ring_ent **ring;
 	u64 status;
 	int order;
+	int i = 0;
 
 	/* have we already been set up? */
 	if (ioat->ring)
@@ -547,8 +548,11 @@ int ioat2_alloc_chan_resources(struct dma_chan *c)
 	ioat2_start_null_desc(ioat);
 
 	/* check that we got off the ground */
-	udelay(5);
-	status = ioat_chansts(chan);
+	do {
+		udelay(1);
+		status = ioat_chansts(chan);
+	} while (i++ < 20 && !is_ioat_active(status) && !is_ioat_idle(status));
+
 	if (is_ioat_active(status) || is_ioat_idle(status)) {
 		set_bit(IOAT_RUN, &chan->state);
 		return 1 << ioat->alloc_order;
-- 
cgit v1.2.3-70-g09d2


From 80b4037033c2dae31e73810d506ce93b3783be05 Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@gmail.com>
Date: Wed, 11 May 2011 20:39:05 +0800
Subject: spi/tle620x: add missing device_remove_file()

This patch includes below fixes:
1. Add missing device_remove_file for dev_attr_status_show in tle62x0_remove.
2. Fix tle62x0_probe error handling:
   Currently, if the error happens when ptr > 0, gpio_attrs[0] is not
   properly remove.

Signed-off-by: Axel Lin <axel.lin@gmail.com>
Acked-by: Ben Dooks <ben-linux@fluff.org>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/spi/tle62x0.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/spi/tle62x0.c b/drivers/spi/tle62x0.c
index a3938958147..32a40876532 100644
--- a/drivers/spi/tle62x0.c
+++ b/drivers/spi/tle62x0.c
@@ -283,7 +283,7 @@ static int __devinit tle62x0_probe(struct spi_device *spi)
 	return 0;
 
  err_gpios:
-	for (; ptr > 0; ptr--)
+	while (--ptr >= 0)
 		device_remove_file(&spi->dev, gpio_attrs[ptr]);
 
 	device_remove_file(&spi->dev, &dev_attr_status_show);
@@ -301,6 +301,7 @@ static int __devexit tle62x0_remove(struct spi_device *spi)
 	for (ptr = 0; ptr < st->nr_gpio; ptr++)
 		device_remove_file(&spi->dev, gpio_attrs[ptr]);
 
+	device_remove_file(&spi->dev, &dev_attr_status_show);
 	kfree(st);
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 9c3e737561b2473ea1fc5b83a0c1d51e1ff3d294 Mon Sep 17 00:00:00 2001
From: Cliff Cai <cliff.cai@analog.com>
Date: Mon, 28 Mar 2011 04:57:11 -0400
Subject: spi/spi_bfin_sport: new driver for a SPI bus via the Blackfin SPORT
 peripheral

The Blackfin SPORT peripheral is a pretty flexible device.  With enough
coaching, we can make it generate SPI compatible waveforms.  This is
desirable as the SPORT can run at much higher clock frequencies than the
dedicated on-chip SPI peripheral, and it can do full duplex DMA.  It also
opens up the possibility of multiple SPI buses in case someone wants to
dedicate a whole bus to a specific part that does not play well with
others.

Signed-off-by: Cliff Cai <cliff.cai@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/spi/Kconfig          |   9 +
 drivers/spi/Makefile         |   1 +
 drivers/spi/spi_bfin_sport.c | 952 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 962 insertions(+)
 create mode 100644 drivers/spi/spi_bfin_sport.c

diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig
index fbd96b29530..de35c3ad8a6 100644
--- a/drivers/spi/Kconfig
+++ b/drivers/spi/Kconfig
@@ -80,6 +80,15 @@ config SPI_BFIN
 	help
 	  This is the SPI controller master driver for Blackfin 5xx processor.
 
+config SPI_BFIN_SPORT
+	tristate "SPI bus via Blackfin SPORT"
+	depends on BLACKFIN
+	help
+	  Enable support for a SPI bus via the Blackfin SPORT peripheral.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called spi_bfin_sport.
+
 config SPI_AU1550
 	tristate "Au1550/Au12x0 SPI Controller"
 	depends on (SOC_AU1550 || SOC_AU1200) && EXPERIMENTAL
diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile
index fd2fc5f6505..0f8c69b6b19 100644
--- a/drivers/spi/Makefile
+++ b/drivers/spi/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_SPI_ALTERA)		+= spi_altera.o
 obj-$(CONFIG_SPI_ATMEL)			+= atmel_spi.o
 obj-$(CONFIG_SPI_ATH79)			+= ath79_spi.o
 obj-$(CONFIG_SPI_BFIN)			+= spi_bfin5xx.o
+obj-$(CONFIG_SPI_BFIN_SPORT)		+= spi_bfin_sport.o
 obj-$(CONFIG_SPI_BITBANG)		+= spi_bitbang.o
 obj-$(CONFIG_SPI_AU1550)		+= au1550_spi.o
 obj-$(CONFIG_SPI_BUTTERFLY)		+= spi_butterfly.o
diff --git a/drivers/spi/spi_bfin_sport.c b/drivers/spi/spi_bfin_sport.c
new file mode 100644
index 00000000000..e557ff617b1
--- /dev/null
+++ b/drivers/spi/spi_bfin_sport.c
@@ -0,0 +1,952 @@
+/*
+ * SPI bus via the Blackfin SPORT peripheral
+ *
+ * Enter bugs at http://blackfin.uclinux.org/
+ *
+ * Copyright 2009-2011 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/gpio.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/irq.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/spi/spi.h>
+#include <linux/workqueue.h>
+
+#include <asm/portmux.h>
+#include <asm/bfin5xx_spi.h>
+#include <asm/blackfin.h>
+#include <asm/bfin_sport.h>
+#include <asm/cacheflush.h>
+
+#define DRV_NAME	"bfin-sport-spi"
+#define DRV_DESC	"SPI bus via the Blackfin SPORT"
+
+MODULE_AUTHOR("Cliff Cai");
+MODULE_DESCRIPTION(DRV_DESC);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:bfin-sport-spi");
+
+enum bfin_sport_spi_state {
+	START_STATE,
+	RUNNING_STATE,
+	DONE_STATE,
+	ERROR_STATE,
+};
+
+struct bfin_sport_spi_master_data;
+
+struct bfin_sport_transfer_ops {
+	void (*write) (struct bfin_sport_spi_master_data *);
+	void (*read) (struct bfin_sport_spi_master_data *);
+	void (*duplex) (struct bfin_sport_spi_master_data *);
+};
+
+struct bfin_sport_spi_master_data {
+	/* Driver model hookup */
+	struct device *dev;
+
+	/* SPI framework hookup */
+	struct spi_master *master;
+
+	/* Regs base of SPI controller */
+	struct sport_register __iomem *regs;
+	int err_irq;
+
+	/* Pin request list */
+	u16 *pin_req;
+
+	/* Driver message queue */
+	struct workqueue_struct *workqueue;
+	struct work_struct pump_messages;
+	spinlock_t lock;
+	struct list_head queue;
+	int busy;
+	bool run;
+
+	/* Message Transfer pump */
+	struct tasklet_struct pump_transfers;
+
+	/* Current message transfer state info */
+	enum bfin_sport_spi_state state;
+	struct spi_message *cur_msg;
+	struct spi_transfer *cur_transfer;
+	struct bfin_sport_spi_slave_data *cur_chip;
+	union {
+		void *tx;
+		u8 *tx8;
+		u16 *tx16;
+	};
+	void *tx_end;
+	union {
+		void *rx;
+		u8 *rx8;
+		u16 *rx16;
+	};
+	void *rx_end;
+
+	int cs_change;
+	struct bfin_sport_transfer_ops *ops;
+};
+
+struct bfin_sport_spi_slave_data {
+	u16 ctl_reg;
+	u16 baud;
+	u16 cs_chg_udelay;	/* Some devices require > 255usec delay */
+	u32 cs_gpio;
+	u16 idle_tx_val;
+	struct bfin_sport_transfer_ops *ops;
+};
+
+static void
+bfin_sport_spi_enable(struct bfin_sport_spi_master_data *drv_data)
+{
+	bfin_write_or(&drv_data->regs->tcr1, TSPEN);
+	bfin_write_or(&drv_data->regs->rcr1, TSPEN);
+	SSYNC();
+}
+
+static void
+bfin_sport_spi_disable(struct bfin_sport_spi_master_data *drv_data)
+{
+	bfin_write_and(&drv_data->regs->tcr1, ~TSPEN);
+	bfin_write_and(&drv_data->regs->rcr1, ~TSPEN);
+	SSYNC();
+}
+
+/* Caculate the SPI_BAUD register value based on input HZ */
+static u16
+bfin_sport_hz_to_spi_baud(u32 speed_hz)
+{
+	u_long clk, sclk = get_sclk();
+	int div = (sclk / (2 * speed_hz)) - 1;
+
+	if (div < 0)
+		div = 0;
+
+	clk = sclk / (2 * (div + 1));
+
+	if (clk > speed_hz)
+		div++;
+
+	return div;
+}
+
+/* Chip select operation functions for cs_change flag */
+static void
+bfin_sport_spi_cs_active(struct bfin_sport_spi_slave_data *chip)
+{
+	gpio_direction_output(chip->cs_gpio, 0);
+}
+
+static void
+bfin_sport_spi_cs_deactive(struct bfin_sport_spi_slave_data *chip)
+{
+	gpio_direction_output(chip->cs_gpio, 1);
+	/* Move delay here for consistency */
+	if (chip->cs_chg_udelay)
+		udelay(chip->cs_chg_udelay);
+}
+
+static void
+bfin_sport_spi_stat_poll_complete(struct bfin_sport_spi_master_data *drv_data)
+{
+	unsigned long timeout = jiffies + HZ;
+	while (!(bfin_read(&drv_data->regs->stat) & RXNE)) {
+		if (!time_before(jiffies, timeout))
+			break;
+	}
+}
+
+static void
+bfin_sport_spi_u8_writer(struct bfin_sport_spi_master_data *drv_data)
+{
+	u16 dummy;
+
+	while (drv_data->tx < drv_data->tx_end) {
+		bfin_write(&drv_data->regs->tx16, *drv_data->tx8++);
+		bfin_sport_spi_stat_poll_complete(drv_data);
+		dummy = bfin_read(&drv_data->regs->rx16);
+	}
+}
+
+static void
+bfin_sport_spi_u8_reader(struct bfin_sport_spi_master_data *drv_data)
+{
+	u16 tx_val = drv_data->cur_chip->idle_tx_val;
+
+	while (drv_data->rx < drv_data->rx_end) {
+		bfin_write(&drv_data->regs->tx16, tx_val);
+		bfin_sport_spi_stat_poll_complete(drv_data);
+		*drv_data->rx8++ = bfin_read(&drv_data->regs->rx16);
+	}
+}
+
+static void
+bfin_sport_spi_u8_duplex(struct bfin_sport_spi_master_data *drv_data)
+{
+	while (drv_data->rx < drv_data->rx_end) {
+		bfin_write(&drv_data->regs->tx16, *drv_data->tx8++);
+		bfin_sport_spi_stat_poll_complete(drv_data);
+		*drv_data->rx8++ = bfin_read(&drv_data->regs->rx16);
+	}
+}
+
+static struct bfin_sport_transfer_ops bfin_sport_transfer_ops_u8 = {
+	.write  = bfin_sport_spi_u8_writer,
+	.read   = bfin_sport_spi_u8_reader,
+	.duplex = bfin_sport_spi_u8_duplex,
+};
+
+static void
+bfin_sport_spi_u16_writer(struct bfin_sport_spi_master_data *drv_data)
+{
+	u16 dummy;
+
+	while (drv_data->tx < drv_data->tx_end) {
+		bfin_write(&drv_data->regs->tx16, *drv_data->tx16++);
+		bfin_sport_spi_stat_poll_complete(drv_data);
+		dummy = bfin_read(&drv_data->regs->rx16);
+	}
+}
+
+static void
+bfin_sport_spi_u16_reader(struct bfin_sport_spi_master_data *drv_data)
+{
+	u16 tx_val = drv_data->cur_chip->idle_tx_val;
+
+	while (drv_data->rx < drv_data->rx_end) {
+		bfin_write(&drv_data->regs->tx16, tx_val);
+		bfin_sport_spi_stat_poll_complete(drv_data);
+		*drv_data->rx16++ = bfin_read(&drv_data->regs->rx16);
+	}
+}
+
+static void
+bfin_sport_spi_u16_duplex(struct bfin_sport_spi_master_data *drv_data)
+{
+	while (drv_data->rx < drv_data->rx_end) {
+		bfin_write(&drv_data->regs->tx16, *drv_data->tx16++);
+		bfin_sport_spi_stat_poll_complete(drv_data);
+		*drv_data->rx16++ = bfin_read(&drv_data->regs->rx16);
+	}
+}
+
+static struct bfin_sport_transfer_ops bfin_sport_transfer_ops_u16 = {
+	.write  = bfin_sport_spi_u16_writer,
+	.read   = bfin_sport_spi_u16_reader,
+	.duplex = bfin_sport_spi_u16_duplex,
+};
+
+/* stop controller and re-config current chip */
+static void
+bfin_sport_spi_restore_state(struct bfin_sport_spi_master_data *drv_data)
+{
+	struct bfin_sport_spi_slave_data *chip = drv_data->cur_chip;
+	unsigned int bits = (drv_data->ops == &bfin_sport_transfer_ops_u8 ? 7 : 15);
+
+	bfin_sport_spi_disable(drv_data);
+	dev_dbg(drv_data->dev, "restoring spi ctl state\n");
+
+	bfin_write(&drv_data->regs->tcr1, chip->ctl_reg);
+	bfin_write(&drv_data->regs->tcr2, bits);
+	bfin_write(&drv_data->regs->tclkdiv, chip->baud);
+	bfin_write(&drv_data->regs->tfsdiv, bits);
+	SSYNC();
+
+	bfin_write(&drv_data->regs->rcr1, chip->ctl_reg & ~(ITCLK | ITFS));
+	bfin_write(&drv_data->regs->rcr2, bits);
+	SSYNC();
+
+	bfin_sport_spi_cs_active(chip);
+}
+
+/* test if there is more transfer to be done */
+static enum bfin_sport_spi_state
+bfin_sport_spi_next_transfer(struct bfin_sport_spi_master_data *drv_data)
+{
+	struct spi_message *msg = drv_data->cur_msg;
+	struct spi_transfer *trans = drv_data->cur_transfer;
+
+	/* Move to next transfer */
+	if (trans->transfer_list.next != &msg->transfers) {
+		drv_data->cur_transfer =
+		    list_entry(trans->transfer_list.next,
+			       struct spi_transfer, transfer_list);
+		return RUNNING_STATE;
+	}
+
+	return DONE_STATE;
+}
+
+/*
+ * caller already set message->status;
+ * dma and pio irqs are blocked give finished message back
+ */
+static void
+bfin_sport_spi_giveback(struct bfin_sport_spi_master_data *drv_data)
+{
+	struct bfin_sport_spi_slave_data *chip = drv_data->cur_chip;
+	unsigned long flags;
+	struct spi_message *msg;
+
+	spin_lock_irqsave(&drv_data->lock, flags);
+	msg = drv_data->cur_msg;
+	drv_data->state = START_STATE;
+	drv_data->cur_msg = NULL;
+	drv_data->cur_transfer = NULL;
+	drv_data->cur_chip = NULL;
+	queue_work(drv_data->workqueue, &drv_data->pump_messages);
+	spin_unlock_irqrestore(&drv_data->lock, flags);
+
+	if (!drv_data->cs_change)
+		bfin_sport_spi_cs_deactive(chip);
+
+	if (msg->complete)
+		msg->complete(msg->context);
+}
+
+static irqreturn_t
+sport_err_handler(int irq, void *dev_id)
+{
+	struct bfin_sport_spi_master_data *drv_data = dev_id;
+	u16 status;
+
+	dev_dbg(drv_data->dev, "%s enter\n", __func__);
+	status = bfin_read(&drv_data->regs->stat) & (TOVF | TUVF | ROVF | RUVF);
+
+	if (status) {
+		bfin_write(&drv_data->regs->stat, status);
+		SSYNC();
+
+		bfin_sport_spi_disable(drv_data);
+		dev_err(drv_data->dev, "status error:%s%s%s%s\n",
+			status & TOVF ? " TOVF" : "",
+			status & TUVF ? " TUVF" : "",
+			status & ROVF ? " ROVF" : "",
+			status & RUVF ? " RUVF" : "");
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void
+bfin_sport_spi_pump_transfers(unsigned long data)
+{
+	struct bfin_sport_spi_master_data *drv_data = (void *)data;
+	struct spi_message *message = NULL;
+	struct spi_transfer *transfer = NULL;
+	struct spi_transfer *previous = NULL;
+	struct bfin_sport_spi_slave_data *chip = NULL;
+	unsigned int bits_per_word;
+	u32 tranf_success = 1;
+	u32 transfer_speed;
+	u8 full_duplex = 0;
+
+	/* Get current state information */
+	message = drv_data->cur_msg;
+	transfer = drv_data->cur_transfer;
+	chip = drv_data->cur_chip;
+
+	if (transfer->speed_hz)
+		transfer_speed = bfin_sport_hz_to_spi_baud(transfer->speed_hz);
+	else
+		transfer_speed = chip->baud;
+	bfin_write(&drv_data->regs->tclkdiv, transfer_speed);
+	SSYNC();
+
+	/*
+	 * if msg is error or done, report it back using complete() callback
+	 */
+
+	 /* Handle for abort */
+	if (drv_data->state == ERROR_STATE) {
+		dev_dbg(drv_data->dev, "transfer: we've hit an error\n");
+		message->status = -EIO;
+		bfin_sport_spi_giveback(drv_data);
+		return;
+	}
+
+	/* Handle end of message */
+	if (drv_data->state == DONE_STATE) {
+		dev_dbg(drv_data->dev, "transfer: all done!\n");
+		message->status = 0;
+		bfin_sport_spi_giveback(drv_data);
+		return;
+	}
+
+	/* Delay if requested at end of transfer */
+	if (drv_data->state == RUNNING_STATE) {
+		dev_dbg(drv_data->dev, "transfer: still running ...\n");
+		previous = list_entry(transfer->transfer_list.prev,
+				      struct spi_transfer, transfer_list);
+		if (previous->delay_usecs)
+			udelay(previous->delay_usecs);
+	}
+
+	if (transfer->len == 0) {
+		/* Move to next transfer of this msg */
+		drv_data->state = bfin_sport_spi_next_transfer(drv_data);
+		/* Schedule next transfer tasklet */
+		tasklet_schedule(&drv_data->pump_transfers);
+	}
+
+	if (transfer->tx_buf != NULL) {
+		drv_data->tx = (void *)transfer->tx_buf;
+		drv_data->tx_end = drv_data->tx + transfer->len;
+		dev_dbg(drv_data->dev, "tx_buf is %p, tx_end is %p\n",
+			transfer->tx_buf, drv_data->tx_end);
+	} else
+		drv_data->tx = NULL;
+
+	if (transfer->rx_buf != NULL) {
+		full_duplex = transfer->tx_buf != NULL;
+		drv_data->rx = transfer->rx_buf;
+		drv_data->rx_end = drv_data->rx + transfer->len;
+		dev_dbg(drv_data->dev, "rx_buf is %p, rx_end is %p\n",
+			transfer->rx_buf, drv_data->rx_end);
+	} else
+		drv_data->rx = NULL;
+
+	drv_data->cs_change = transfer->cs_change;
+
+	/* Bits per word setup */
+	bits_per_word = transfer->bits_per_word ? : message->spi->bits_per_word;
+	if (bits_per_word == 8)
+		drv_data->ops = &bfin_sport_transfer_ops_u8;
+	else
+		drv_data->ops = &bfin_sport_transfer_ops_u16;
+
+	drv_data->state = RUNNING_STATE;
+
+	if (drv_data->cs_change)
+		bfin_sport_spi_cs_active(chip);
+
+	dev_dbg(drv_data->dev,
+		"now pumping a transfer: width is %d, len is %d\n",
+		bits_per_word, transfer->len);
+
+	/* PIO mode write then read */
+	dev_dbg(drv_data->dev, "doing IO transfer\n");
+
+	bfin_sport_spi_enable(drv_data);
+	if (full_duplex) {
+		/* full duplex mode */
+		BUG_ON((drv_data->tx_end - drv_data->tx) !=
+		       (drv_data->rx_end - drv_data->rx));
+		drv_data->ops->duplex(drv_data);
+
+		if (drv_data->tx != drv_data->tx_end)
+			tranf_success = 0;
+	} else if (drv_data->tx != NULL) {
+		/* write only half duplex */
+
+		drv_data->ops->write(drv_data);
+
+		if (drv_data->tx != drv_data->tx_end)
+			tranf_success = 0;
+	} else if (drv_data->rx != NULL) {
+		/* read only half duplex */
+
+		drv_data->ops->read(drv_data);
+		if (drv_data->rx != drv_data->rx_end)
+			tranf_success = 0;
+	}
+	bfin_sport_spi_disable(drv_data);
+
+	if (!tranf_success) {
+		dev_dbg(drv_data->dev, "IO write error!\n");
+		drv_data->state = ERROR_STATE;
+	} else {
+		/* Update total byte transfered */
+		message->actual_length += transfer->len;
+		/* Move to next transfer of this msg */
+		drv_data->state = bfin_sport_spi_next_transfer(drv_data);
+		if (drv_data->cs_change)
+			bfin_sport_spi_cs_deactive(chip);
+	}
+
+	/* Schedule next transfer tasklet */
+	tasklet_schedule(&drv_data->pump_transfers);
+}
+
+/* pop a msg from queue and kick off real transfer */
+static void
+bfin_sport_spi_pump_messages(struct work_struct *work)
+{
+	struct bfin_sport_spi_master_data *drv_data;
+	unsigned long flags;
+	struct spi_message *next_msg;
+
+	drv_data = container_of(work, struct bfin_sport_spi_master_data, pump_messages);
+
+	/* Lock queue and check for queue work */
+	spin_lock_irqsave(&drv_data->lock, flags);
+	if (list_empty(&drv_data->queue) || !drv_data->run) {
+		/* pumper kicked off but no work to do */
+		drv_data->busy = 0;
+		spin_unlock_irqrestore(&drv_data->lock, flags);
+		return;
+	}
+
+	/* Make sure we are not already running a message */
+	if (drv_data->cur_msg) {
+		spin_unlock_irqrestore(&drv_data->lock, flags);
+		return;
+	}
+
+	/* Extract head of queue */
+	next_msg = list_entry(drv_data->queue.next,
+		struct spi_message, queue);
+
+	drv_data->cur_msg = next_msg;
+
+	/* Setup the SSP using the per chip configuration */
+	drv_data->cur_chip = spi_get_ctldata(drv_data->cur_msg->spi);
+
+	list_del_init(&drv_data->cur_msg->queue);
+
+	/* Initialize message state */
+	drv_data->cur_msg->state = START_STATE;
+	drv_data->cur_transfer = list_entry(drv_data->cur_msg->transfers.next,
+					    struct spi_transfer, transfer_list);
+	bfin_sport_spi_restore_state(drv_data);
+	dev_dbg(drv_data->dev, "got a message to pump, "
+		"state is set to: baud %d, cs_gpio %i, ctl 0x%x\n",
+		drv_data->cur_chip->baud, drv_data->cur_chip->cs_gpio,
+		drv_data->cur_chip->ctl_reg);
+
+	dev_dbg(drv_data->dev,
+		"the first transfer len is %d\n",
+		drv_data->cur_transfer->len);
+
+	/* Mark as busy and launch transfers */
+	tasklet_schedule(&drv_data->pump_transfers);
+
+	drv_data->busy = 1;
+	spin_unlock_irqrestore(&drv_data->lock, flags);
+}
+
+/*
+ * got a msg to transfer, queue it in drv_data->queue.
+ * And kick off message pumper
+ */
+static int
+bfin_sport_spi_transfer(struct spi_device *spi, struct spi_message *msg)
+{
+	struct bfin_sport_spi_master_data *drv_data = spi_master_get_devdata(spi->master);
+	unsigned long flags;
+
+	spin_lock_irqsave(&drv_data->lock, flags);
+
+	if (!drv_data->run) {
+		spin_unlock_irqrestore(&drv_data->lock, flags);
+		return -ESHUTDOWN;
+	}
+
+	msg->actual_length = 0;
+	msg->status = -EINPROGRESS;
+	msg->state = START_STATE;
+
+	dev_dbg(&spi->dev, "adding an msg in transfer()\n");
+	list_add_tail(&msg->queue, &drv_data->queue);
+
+	if (drv_data->run && !drv_data->busy)
+		queue_work(drv_data->workqueue, &drv_data->pump_messages);
+
+	spin_unlock_irqrestore(&drv_data->lock, flags);
+
+	return 0;
+}
+
+/* Called every time common spi devices change state */
+static int
+bfin_sport_spi_setup(struct spi_device *spi)
+{
+	struct bfin_sport_spi_slave_data *chip, *first = NULL;
+	int ret;
+
+	/* Only alloc (or use chip_info) on first setup */
+	chip = spi_get_ctldata(spi);
+	if (chip == NULL) {
+		struct bfin5xx_spi_chip *chip_info;
+
+		chip = first = kzalloc(sizeof(*chip), GFP_KERNEL);
+		if (!chip)
+			return -ENOMEM;
+
+		/* platform chip_info isn't required */
+		chip_info = spi->controller_data;
+		if (chip_info) {
+			/*
+			 * DITFS and TDTYPE are only thing we don't set, but
+			 * they probably shouldn't be changed by people.
+			 */
+			if (chip_info->ctl_reg || chip_info->enable_dma) {
+				ret = -EINVAL;
+				dev_err(&spi->dev, "don't set ctl_reg/enable_dma fields");
+				goto error;
+			}
+			chip->cs_chg_udelay = chip_info->cs_chg_udelay;
+			chip->idle_tx_val = chip_info->idle_tx_val;
+			spi->bits_per_word = chip_info->bits_per_word;
+		}
+	}
+
+	if (spi->bits_per_word != 8 && spi->bits_per_word != 16) {
+		ret = -EINVAL;
+		goto error;
+	}
+
+	/* translate common spi framework into our register
+	 * following configure contents are same for tx and rx.
+	 */
+
+	if (spi->mode & SPI_CPHA)
+		chip->ctl_reg &= ~TCKFE;
+	else
+		chip->ctl_reg |= TCKFE;
+
+	if (spi->mode & SPI_LSB_FIRST)
+		chip->ctl_reg |= TLSBIT;
+	else
+		chip->ctl_reg &= ~TLSBIT;
+
+	/* Sport in master mode */
+	chip->ctl_reg |= ITCLK | ITFS | TFSR | LATFS | LTFS;
+
+	chip->baud = bfin_sport_hz_to_spi_baud(spi->max_speed_hz);
+
+	chip->cs_gpio = spi->chip_select;
+	ret = gpio_request(chip->cs_gpio, spi->modalias);
+	if (ret)
+		goto error;
+
+	dev_dbg(&spi->dev, "setup spi chip %s, width is %d\n",
+			spi->modalias, spi->bits_per_word);
+	dev_dbg(&spi->dev, "ctl_reg is 0x%x, GPIO is %i\n",
+			chip->ctl_reg, spi->chip_select);
+
+	spi_set_ctldata(spi, chip);
+
+	bfin_sport_spi_cs_deactive(chip);
+
+	return ret;
+
+ error:
+	kfree(first);
+	return ret;
+}
+
+/*
+ * callback for spi framework.
+ * clean driver specific data
+ */
+static void
+bfin_sport_spi_cleanup(struct spi_device *spi)
+{
+	struct bfin_sport_spi_slave_data *chip = spi_get_ctldata(spi);
+
+	if (!chip)
+		return;
+
+	gpio_free(chip->cs_gpio);
+
+	kfree(chip);
+}
+
+static int
+bfin_sport_spi_init_queue(struct bfin_sport_spi_master_data *drv_data)
+{
+	INIT_LIST_HEAD(&drv_data->queue);
+	spin_lock_init(&drv_data->lock);
+
+	drv_data->run = false;
+	drv_data->busy = 0;
+
+	/* init transfer tasklet */
+	tasklet_init(&drv_data->pump_transfers,
+		     bfin_sport_spi_pump_transfers, (unsigned long)drv_data);
+
+	/* init messages workqueue */
+	INIT_WORK(&drv_data->pump_messages, bfin_sport_spi_pump_messages);
+	drv_data->workqueue =
+	    create_singlethread_workqueue(dev_name(drv_data->master->dev.parent));
+	if (drv_data->workqueue == NULL)
+		return -EBUSY;
+
+	return 0;
+}
+
+static int
+bfin_sport_spi_start_queue(struct bfin_sport_spi_master_data *drv_data)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&drv_data->lock, flags);
+
+	if (drv_data->run || drv_data->busy) {
+		spin_unlock_irqrestore(&drv_data->lock, flags);
+		return -EBUSY;
+	}
+
+	drv_data->run = true;
+	drv_data->cur_msg = NULL;
+	drv_data->cur_transfer = NULL;
+	drv_data->cur_chip = NULL;
+	spin_unlock_irqrestore(&drv_data->lock, flags);
+
+	queue_work(drv_data->workqueue, &drv_data->pump_messages);
+
+	return 0;
+}
+
+static inline int
+bfin_sport_spi_stop_queue(struct bfin_sport_spi_master_data *drv_data)
+{
+	unsigned long flags;
+	unsigned limit = 500;
+	int status = 0;
+
+	spin_lock_irqsave(&drv_data->lock, flags);
+
+	/*
+	 * This is a bit lame, but is optimized for the common execution path.
+	 * A wait_queue on the drv_data->busy could be used, but then the common
+	 * execution path (pump_messages) would be required to call wake_up or
+	 * friends on every SPI message. Do this instead
+	 */
+	drv_data->run = false;
+	while (!list_empty(&drv_data->queue) && drv_data->busy && limit--) {
+		spin_unlock_irqrestore(&drv_data->lock, flags);
+		msleep(10);
+		spin_lock_irqsave(&drv_data->lock, flags);
+	}
+
+	if (!list_empty(&drv_data->queue) || drv_data->busy)
+		status = -EBUSY;
+
+	spin_unlock_irqrestore(&drv_data->lock, flags);
+
+	return status;
+}
+
+static inline int
+bfin_sport_spi_destroy_queue(struct bfin_sport_spi_master_data *drv_data)
+{
+	int status;
+
+	status = bfin_sport_spi_stop_queue(drv_data);
+	if (status)
+		return status;
+
+	destroy_workqueue(drv_data->workqueue);
+
+	return 0;
+}
+
+static int __devinit
+bfin_sport_spi_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct bfin5xx_spi_master *platform_info;
+	struct spi_master *master;
+	struct resource *res, *ires;
+	struct bfin_sport_spi_master_data *drv_data;
+	int status;
+
+	platform_info = dev->platform_data;
+
+	/* Allocate master with space for drv_data */
+	master = spi_alloc_master(dev, sizeof(*master) + 16);
+	if (!master) {
+		dev_err(dev, "cannot alloc spi_master\n");
+		return -ENOMEM;
+	}
+
+	drv_data = spi_master_get_devdata(master);
+	drv_data->master = master;
+	drv_data->dev = dev;
+	drv_data->pin_req = platform_info->pin_req;
+
+	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST;
+	master->bus_num = pdev->id;
+	master->num_chipselect = platform_info->num_chipselect;
+	master->cleanup = bfin_sport_spi_cleanup;
+	master->setup = bfin_sport_spi_setup;
+	master->transfer = bfin_sport_spi_transfer;
+
+	/* Find and map our resources */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (res == NULL) {
+		dev_err(dev, "cannot get IORESOURCE_MEM\n");
+		status = -ENOENT;
+		goto out_error_get_res;
+	}
+
+	drv_data->regs = ioremap(res->start, resource_size(res));
+	if (drv_data->regs == NULL) {
+		dev_err(dev, "cannot map registers\n");
+		status = -ENXIO;
+		goto out_error_ioremap;
+	}
+
+	ires = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!ires) {
+		dev_err(dev, "cannot get IORESOURCE_IRQ\n");
+		status = -ENODEV;
+		goto out_error_get_ires;
+	}
+	drv_data->err_irq = ires->start;
+
+	/* Initial and start queue */
+	status = bfin_sport_spi_init_queue(drv_data);
+	if (status) {
+		dev_err(dev, "problem initializing queue\n");
+		goto out_error_queue_alloc;
+	}
+
+	status = bfin_sport_spi_start_queue(drv_data);
+	if (status) {
+		dev_err(dev, "problem starting queue\n");
+		goto out_error_queue_alloc;
+	}
+
+	status = request_irq(drv_data->err_irq, sport_err_handler,
+		0, "sport_spi_err", drv_data);
+	if (status) {
+		dev_err(dev, "unable to request sport err irq\n");
+		goto out_error_irq;
+	}
+
+	status = peripheral_request_list(drv_data->pin_req, DRV_NAME);
+	if (status) {
+		dev_err(dev, "requesting peripherals failed\n");
+		goto out_error_peripheral;
+	}
+
+	/* Register with the SPI framework */
+	platform_set_drvdata(pdev, drv_data);
+	status = spi_register_master(master);
+	if (status) {
+		dev_err(dev, "problem registering spi master\n");
+		goto out_error_master;
+	}
+
+	dev_info(dev, "%s, regs_base@%p\n", DRV_DESC, drv_data->regs);
+	return 0;
+
+ out_error_master:
+	peripheral_free_list(drv_data->pin_req);
+ out_error_peripheral:
+	free_irq(drv_data->err_irq, drv_data);
+ out_error_irq:
+ out_error_queue_alloc:
+	bfin_sport_spi_destroy_queue(drv_data);
+ out_error_get_ires:
+	iounmap(drv_data->regs);
+ out_error_ioremap:
+ out_error_get_res:
+	spi_master_put(master);
+
+	return status;
+}
+
+/* stop hardware and remove the driver */
+static int __devexit
+bfin_sport_spi_remove(struct platform_device *pdev)
+{
+	struct bfin_sport_spi_master_data *drv_data = platform_get_drvdata(pdev);
+	int status = 0;
+
+	if (!drv_data)
+		return 0;
+
+	/* Remove the queue */
+	status = bfin_sport_spi_destroy_queue(drv_data);
+	if (status)
+		return status;
+
+	/* Disable the SSP at the peripheral and SOC level */
+	bfin_sport_spi_disable(drv_data);
+
+	/* Disconnect from the SPI framework */
+	spi_unregister_master(drv_data->master);
+
+	peripheral_free_list(drv_data->pin_req);
+
+	/* Prevent double remove */
+	platform_set_drvdata(pdev, NULL);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int
+bfin_sport_spi_suspend(struct platform_device *pdev, pm_message_t state)
+{
+	struct bfin_sport_spi_master_data *drv_data = platform_get_drvdata(pdev);
+	int status;
+
+	status = bfin_sport_spi_stop_queue(drv_data);
+	if (status)
+		return status;
+
+	/* stop hardware */
+	bfin_sport_spi_disable(drv_data);
+
+	return status;
+}
+
+static int
+bfin_sport_spi_resume(struct platform_device *pdev)
+{
+	struct bfin_sport_spi_master_data *drv_data = platform_get_drvdata(pdev);
+	int status;
+
+	/* Enable the SPI interface */
+	bfin_sport_spi_enable(drv_data);
+
+	/* Start the queue running */
+	status = bfin_sport_spi_start_queue(drv_data);
+	if (status)
+		dev_err(drv_data->dev, "problem resuming queue\n");
+
+	return status;
+}
+#else
+# define bfin_sport_spi_suspend NULL
+# define bfin_sport_spi_resume  NULL
+#endif
+
+static struct platform_driver bfin_sport_spi_driver = {
+	.driver	= {
+		.name = DRV_NAME,
+		.owner = THIS_MODULE,
+	},
+	.probe   = bfin_sport_spi_probe,
+	.remove  = __devexit_p(bfin_sport_spi_remove),
+	.suspend = bfin_sport_spi_suspend,
+	.resume  = bfin_sport_spi_resume,
+};
+
+static int __init bfin_sport_spi_init(void)
+{
+	return platform_driver_register(&bfin_sport_spi_driver);
+}
+module_init(bfin_sport_spi_init);
+
+static void __exit bfin_sport_spi_exit(void)
+{
+	platform_driver_unregister(&bfin_sport_spi_driver);
+}
+module_exit(bfin_sport_spi_exit);
-- 
cgit v1.2.3-70-g09d2


From 9a3865b185e77d1a4ca2d8356e37c19b78168961 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 27 May 2011 09:29:32 +0200
Subject: x86, asm: Clean up desc.h a bit

I have looked at this file and found it rather ugly - improve
readability a bit. No change in functionality.

Link: http://lkml.kernel.org/n/tip-incpt6y26yd8586idx65t9ll@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/desc.h | 152 ++++++++++++++++++++++----------------------
 1 file changed, 76 insertions(+), 76 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 617bd56b307..7b439d9aea2 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -4,30 +4,33 @@
 #include <asm/desc_defs.h>
 #include <asm/ldt.h>
 #include <asm/mmu.h>
+
 #include <linux/smp.h>
 
-static inline void fill_ldt(struct desc_struct *desc,
-			    const struct user_desc *info)
-{
-	desc->limit0 = info->limit & 0x0ffff;
-	desc->base0 = info->base_addr & 0x0000ffff;
-
-	desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
-	desc->type = (info->read_exec_only ^ 1) << 1;
-	desc->type |= info->contents << 2;
-	desc->s = 1;
-	desc->dpl = 0x3;
-	desc->p = info->seg_not_present ^ 1;
-	desc->limit = (info->limit & 0xf0000) >> 16;
-	desc->avl = info->useable;
-	desc->d = info->seg_32bit;
-	desc->g = info->limit_in_pages;
-	desc->base2 = (info->base_addr & 0xff000000) >> 24;
+static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
+{
+	desc->limit0		= info->limit & 0x0ffff;
+
+	desc->base0		= (info->base_addr & 0x0000ffff);
+	desc->base1		= (info->base_addr & 0x00ff0000) >> 16;
+
+	desc->type		= (info->read_exec_only ^ 1) << 1;
+	desc->type	       |= info->contents << 2;
+
+	desc->s			= 1;
+	desc->dpl		= 0x3;
+	desc->p			= info->seg_not_present ^ 1;
+	desc->limit		= (info->limit & 0xf0000) >> 16;
+	desc->avl		= info->useable;
+	desc->d			= info->seg_32bit;
+	desc->g			= info->limit_in_pages;
+
+	desc->base2		= (info->base_addr & 0xff000000) >> 24;
 	/*
 	 * Don't allow setting of the lm bit. It is useless anyway
 	 * because 64bit system calls require __USER_CS:
 	 */
-	desc->l = 0;
+	desc->l			= 0;
 }
 
 extern struct desc_ptr idt_descr;
@@ -36,6 +39,7 @@ extern gate_desc idt_table[];
 struct gdt_page {
 	struct desc_struct gdt[GDT_ENTRIES];
 } __attribute__((aligned(PAGE_SIZE)));
+
 DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
@@ -48,16 +52,16 @@ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
 			     unsigned dpl, unsigned ist, unsigned seg)
 {
-	gate->offset_low = PTR_LOW(func);
-	gate->segment = __KERNEL_CS;
-	gate->ist = ist;
-	gate->p = 1;
-	gate->dpl = dpl;
-	gate->zero0 = 0;
-	gate->zero1 = 0;
-	gate->type = type;
-	gate->offset_middle = PTR_MIDDLE(func);
-	gate->offset_high = PTR_HIGH(func);
+	gate->offset_low	= PTR_LOW(func);
+	gate->segment		= __KERNEL_CS;
+	gate->ist		= ist;
+	gate->p			= 1;
+	gate->dpl		= dpl;
+	gate->zero0		= 0;
+	gate->zero1		= 0;
+	gate->type		= type;
+	gate->offset_middle	= PTR_MIDDLE(func);
+	gate->offset_high	= PTR_HIGH(func);
 }
 
 #else
@@ -66,8 +70,7 @@ static inline void pack_gate(gate_desc *gate, unsigned char type,
 			     unsigned short seg)
 {
 	gate->a = (seg << 16) | (base & 0xffff);
-	gate->b = (base & 0xffff0000) |
-		  (((0x80 | type | (dpl << 5)) & 0xff) << 8);
+	gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8);
 }
 
 #endif
@@ -75,31 +78,29 @@ static inline void pack_gate(gate_desc *gate, unsigned char type,
 static inline int desc_empty(const void *ptr)
 {
 	const u32 *desc = ptr;
+
 	return !(desc[0] | desc[1]);
 }
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
-#define load_TR_desc() native_load_tr_desc()
-#define load_gdt(dtr) native_load_gdt(dtr)
-#define load_idt(dtr) native_load_idt(dtr)
-#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
-#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
-
-#define store_gdt(dtr) native_store_gdt(dtr)
-#define store_idt(dtr) native_store_idt(dtr)
-#define store_tr(tr) (tr = native_store_tr())
-
-#define load_TLS(t, cpu) native_load_tls(t, cpu)
-#define set_ldt native_set_ldt
-
-#define write_ldt_entry(dt, entry, desc)	\
-	native_write_ldt_entry(dt, entry, desc)
-#define write_gdt_entry(dt, entry, desc, type)		\
-	native_write_gdt_entry(dt, entry, desc, type)
-#define write_idt_entry(dt, entry, g)		\
-	native_write_idt_entry(dt, entry, g)
+#define load_TR_desc()				native_load_tr_desc()
+#define load_gdt(dtr)				native_load_gdt(dtr)
+#define load_idt(dtr)				native_load_idt(dtr)
+#define load_tr(tr)				asm volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt)				asm volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr)				native_store_gdt(dtr)
+#define store_idt(dtr)				native_store_idt(dtr)
+#define store_tr(tr)				(tr = native_store_tr())
+
+#define load_TLS(t, cpu)			native_load_tls(t, cpu)
+#define set_ldt					native_set_ldt
+
+#define write_ldt_entry(dt, entry, desc)	native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type)	native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g)		native_write_idt_entry(dt, entry, g)
 
 static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 {
@@ -112,33 +113,27 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
 
 #define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
 
-static inline void native_write_idt_entry(gate_desc *idt, int entry,
-					  const gate_desc *gate)
+static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
 {
 	memcpy(&idt[entry], gate, sizeof(*gate));
 }
 
-static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
-					  const void *desc)
+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
 {
 	memcpy(&ldt[entry], desc, 8);
 }
 
-static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
-					  const void *desc, int type)
+static inline void
+native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type)
 {
 	unsigned int size;
+
 	switch (type) {
-	case DESC_TSS:
-		size = sizeof(tss_desc);
-		break;
-	case DESC_LDT:
-		size = sizeof(ldt_desc);
-		break;
-	default:
-		size = sizeof(struct desc_struct);
-		break;
+	case DESC_TSS:	size = sizeof(tss_desc);	break;
+	case DESC_LDT:	size = sizeof(ldt_desc);	break;
+	default:	size = sizeof(*gdt);		break;
 	}
+
 	memcpy(&gdt[entry], desc, size);
 }
 
@@ -154,20 +149,21 @@ static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
 }
 
 
-static inline void set_tssldt_descriptor(void *d, unsigned long addr,
-					 unsigned type, unsigned size)
+static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size)
 {
 #ifdef CONFIG_X86_64
 	struct ldttss_desc64 *desc = d;
+
 	memset(desc, 0, sizeof(*desc));
-	desc->limit0 = size & 0xFFFF;
-	desc->base0 = PTR_LOW(addr);
-	desc->base1 = PTR_MIDDLE(addr) & 0xFF;
-	desc->type = type;
-	desc->p = 1;
-	desc->limit1 = (size >> 16) & 0xF;
-	desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
-	desc->base3 = PTR_HIGH(addr);
+
+	desc->limit0		= size & 0xFFFF;
+	desc->base0		= PTR_LOW(addr);
+	desc->base1		= PTR_MIDDLE(addr) & 0xFF;
+	desc->type		= type;
+	desc->p			= 1;
+	desc->limit1		= (size >> 16) & 0xF;
+	desc->base2		= (PTR_MIDDLE(addr) >> 8) & 0xFF;
+	desc->base3		= PTR_HIGH(addr);
 #else
 	pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
 #endif
@@ -237,14 +233,16 @@ static inline void native_store_idt(struct desc_ptr *dtr)
 static inline unsigned long native_store_tr(void)
 {
 	unsigned long tr;
+
 	asm volatile("str %0":"=r" (tr));
+
 	return tr;
 }
 
 static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
 {
-	unsigned int i;
 	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+	unsigned int i;
 
 	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
 		gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
@@ -313,6 +311,7 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
 			     unsigned dpl, unsigned ist, unsigned seg)
 {
 	gate_desc s;
+
 	pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
 	/*
 	 * does not need to be atomic because it is only done once at
@@ -343,8 +342,9 @@ static inline void alloc_system_vector(int vector)
 		set_bit(vector, used_vectors);
 		if (first_system_vector > vector)
 			first_system_vector = vector;
-	} else
+	} else {
 		BUG();
+	}
 }
 
 static inline void alloc_intr_gate(unsigned int n, void *addr)
-- 
cgit v1.2.3-70-g09d2


From 287548e46aa752ce9bb87fcff46f8aa794cc5037 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 May 2011 06:50:06 -0400
Subject: split __follow_mount_rcu() into normal and .. cases

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 2358b326b22..da9c2657866 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -919,12 +919,11 @@ static inline bool managed_dentry_might_block(struct dentry *dentry)
 }
 
 /*
- * Skip to top of mountpoint pile in rcuwalk mode.  We abort the rcu-walk if we
- * meet a managed dentry and we're not walking to "..".  True is returned to
- * continue, false to abort.
+ * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
+ * we meet a managed dentry that would need blocking.
  */
 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
-			       struct inode **inode, bool reverse_transit)
+			       struct inode **inode)
 {
 	for (;;) {
 		struct vfsmount *mounted;
@@ -933,8 +932,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		 * that wants to block transit.
 		 */
 		*inode = path->dentry->d_inode;
-		if (!reverse_transit &&
-		     unlikely(managed_dentry_might_block(path->dentry)))
+		if (unlikely(managed_dentry_might_block(path->dentry)))
 			return false;
 
 		if (!d_mountpoint(path->dentry))
@@ -949,10 +947,29 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 	}
 
 	if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-		return reverse_transit;
+		return false;
 	return true;
 }
 
+static void follow_mount_rcu(struct nameidata *nd, struct path *path,
+			       struct inode **inode)
+{
+	for (;;) {
+		struct vfsmount *mounted;
+		*inode = path->dentry->d_inode;
+
+		if (!d_mountpoint(path->dentry))
+			break;
+
+		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
+		if (!mounted)
+			break;
+		path->mnt = mounted;
+		path->dentry = mounted->mnt_root;
+		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+	}
+}
+
 static int follow_dotdot_rcu(struct nameidata *nd)
 {
 	struct inode *inode = nd->inode;
@@ -982,7 +999,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
 		inode = nd->path.dentry->d_inode;
 	}
-	__follow_mount_rcu(nd, &nd->path, &inode, true);
+	follow_mount_rcu(nd, &nd->path, &inode);
 	nd->inode = inode;
 	return 0;
 
@@ -1157,7 +1174,7 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 		}
 		path->mnt = mnt;
 		path->dentry = dentry;
-		if (likely(__follow_mount_rcu(nd, path, inode, false)))
+		if (likely(__follow_mount_rcu(nd, path, inode)))
 			return 0;
 unlazy:
 		if (unlazy_walk(nd, dentry))
-- 
cgit v1.2.3-70-g09d2


From dea3937619cb67d2ad08e2d29ae923875b1eeee9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 May 2011 06:53:39 -0400
Subject: Trim excessive arguments of follow_mount_rcu()

... and kill a useless local variable in follow_dotdot_rcu(), while
we are at it - follow_mount_rcu(nd, path, inode) *always* assigned
value to *inode, and always it had been path->dentry->d_inode (aka
nd->path.dentry->d_inode, since it always got &nd->path as the second
argument).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index da9c2657866..98808142409 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -951,29 +951,21 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 	return true;
 }
 
-static void follow_mount_rcu(struct nameidata *nd, struct path *path,
-			       struct inode **inode)
+static void follow_mount_rcu(struct nameidata *nd)
 {
-	for (;;) {
+	while (d_mountpoint(nd->path.dentry)) {
 		struct vfsmount *mounted;
-		*inode = path->dentry->d_inode;
-
-		if (!d_mountpoint(path->dentry))
-			break;
-
-		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
+		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
 		if (!mounted)
 			break;
-		path->mnt = mounted;
-		path->dentry = mounted->mnt_root;
-		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+		nd->path.mnt = mounted;
+		nd->path.dentry = mounted->mnt_root;
+		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
 	}
 }
 
 static int follow_dotdot_rcu(struct nameidata *nd)
 {
-	struct inode *inode = nd->inode;
-
 	set_root_rcu(nd);
 
 	while (1) {
@@ -989,7 +981,6 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 			seq = read_seqcount_begin(&parent->d_seq);
 			if (read_seqcount_retry(&old->d_seq, nd->seq))
 				goto failed;
-			inode = parent->d_inode;
 			nd->path.dentry = parent;
 			nd->seq = seq;
 			break;
@@ -997,10 +988,9 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 		if (!follow_up_rcu(&nd->path))
 			break;
 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-		inode = nd->path.dentry->d_inode;
 	}
-	follow_mount_rcu(nd, &nd->path, &inode);
-	nd->inode = inode;
+	follow_mount_rcu(nd);
+	nd->inode = nd->path.dentry->d_inode;
 	return 0;
 
 failed:
-- 
cgit v1.2.3-70-g09d2


From d6e9bd256c88ce5f4b668249e363a74f51393daa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 May 2011 07:03:15 -0400
Subject: Lift the check for automount points into do_lookup()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 98808142409..1ab641f2e78 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -945,9 +945,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		path->dentry = mounted->mnt_root;
 		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
 	}
-
-	if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-		return false;
 	return true;
 }
 
@@ -1164,8 +1161,11 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 		}
 		path->mnt = mnt;
 		path->dentry = dentry;
-		if (likely(__follow_mount_rcu(nd, path, inode)))
-			return 0;
+		if (unlikely(!__follow_mount_rcu(nd, path, inode)))
+			goto unlazy;
+		if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+			goto unlazy;
+		return 0;
 unlazy:
 		if (unlazy_walk(nd, dentry))
 			return -ECHILD;
-- 
cgit v1.2.3-70-g09d2


From aa38572954ade525817fe88c54faebf85e5a61c0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 27 May 2011 06:53:02 -0400
Subject: fs: pass exact type of data dirties to ->dirty_inode

Tell the filesystem if we just updated timestamp (I_DIRTY_SYNC) or
anything else, so that the filesystem can track internally if it
needs to push out a transaction for fdatasync or not.

This is just the prototype change with no user for it yet.  I plan
to push large XFS changes for the next merge window, and getting
this trivial infrastructure in this window would help a lot to avoid
tree interdependencies.

Also remove incorrect comments that ->dirty_inode can't block.  That
has been changed a long time ago, and many implementations rely on it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking | 4 ++--
 Documentation/filesystems/vfs.txt | 2 +-
 fs/btrfs/ctree.h                  | 2 +-
 fs/btrfs/inode.c                  | 2 +-
 fs/ext3/inode.c                   | 2 +-
 fs/ext4/ext4.h                    | 2 +-
 fs/ext4/inode.c                   | 2 +-
 fs/fs-writeback.c                 | 5 +----
 fs/jffs2/fs.c                     | 2 +-
 fs/jffs2/os-linux.h               | 2 +-
 fs/jfs/inode.c                    | 2 +-
 fs/jfs/jfs_inode.h                | 2 +-
 fs/nilfs2/inode.c                 | 2 +-
 fs/nilfs2/nilfs.h                 | 2 +-
 fs/reiserfs/super.c               | 2 +-
 fs/ubifs/super.c                  | 2 +-
 fs/xfs/linux-2.6/xfs_super.c      | 3 ++-
 include/linux/ext3_fs.h           | 2 +-
 include/linux/fs.h                | 2 +-
 19 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 61b31acb917..57d827d6071 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -104,7 +104,7 @@ of the locking scheme for directory operations.
 prototypes:
 	struct inode *(*alloc_inode)(struct super_block *sb);
 	void (*destroy_inode)(struct inode *);
-	void (*dirty_inode) (struct inode *);
+	void (*dirty_inode) (struct inode *, int flags);
 	int (*write_inode) (struct inode *, struct writeback_control *wbc);
 	int (*drop_inode) (struct inode *);
 	void (*evict_inode) (struct inode *);
@@ -126,7 +126,7 @@ locking rules:
 			s_umount
 alloc_inode:
 destroy_inode:
-dirty_inode:				(must not sleep)
+dirty_inode:
 write_inode:
 drop_inode:				!!!inode->i_lock!!!
 evict_inode:
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 21a7dc467bb..88b9f5519af 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -211,7 +211,7 @@ struct super_operations {
         struct inode *(*alloc_inode)(struct super_block *sb);
         void (*destroy_inode)(struct inode *);
 
-        void (*dirty_inode) (struct inode *);
+        void (*dirty_inode) (struct inode *, int flags);
         int (*write_inode) (struct inode *, int);
         void (*drop_inode) (struct inode *);
         void (*delete_inode) (struct inode *);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8f4b81de3ae..d2177e7ad64 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2522,7 +2522,7 @@ int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-void btrfs_dirty_inode(struct inode *inode);
+void btrfs_dirty_inode(struct inode *inode, int flags);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7cd8ab0ef04..ecff7d7a505 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4396,7 +4396,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
  * FIXME, needs more benchmarking...there are no reasons other than performance
  * to keep or drop this code.
  */
-void btrfs_dirty_inode(struct inode *inode)
+void btrfs_dirty_inode(struct inode *inode, int flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 68b2e43d7c3..3451d23c3ba 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3392,7 +3392,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
  * so would cause a commit on atime updates, which we don't bother doing.
  * We handle synchronous inodes at the highest possible level.
  */
-void ext3_dirty_inode(struct inode *inode)
+void ext3_dirty_inode(struct inode *inode, int flags)
 {
 	handle_t *current_handle = ext3_journal_current_handle();
 	handle_t *handle;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a74b89c09f9..1921392cd70 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1813,7 +1813,7 @@ extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 extern void ext4_evict_inode(struct inode *);
 extern void ext4_clear_inode(struct inode *);
 extern int  ext4_sync_inode(handle_t *, struct inode *);
-extern void ext4_dirty_inode(struct inode *);
+extern void ext4_dirty_inode(struct inode *, int);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 50d0e9c6458..a5763e3505b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5733,7 +5733,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
  * so would cause a commit on atime updates, which we don't bother doing.
  * We handle synchronous inodes at the highest possible level.
  */
-void ext4_dirty_inode(struct inode *inode)
+void ext4_dirty_inode(struct inode *inode, int flags)
 {
 	handle_t *handle;
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 34591ee804b..0f015a0468d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1007,9 +1007,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
  * In short, make sure you hash any inodes _before_ you start marking
  * them dirty.
  *
- * This function *must* be atomic for the I_DIRTY_PAGES case -
- * set_page_dirty() is called under spinlock in several places.
- *
  * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
  * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
  * the kernel-internal blockdev inode represents the dirtying time of the
@@ -1028,7 +1025,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	 */
 	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 		if (sb->s_op->dirty_inode)
-			sb->s_op->dirty_inode(inode);
+			sb->s_op->dirty_inode(inode, flags);
 	}
 
 	/*
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index e896e67767e..46ad619b612 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -357,7 +357,7 @@ error:
 	return ERR_PTR(ret);
 }
 
-void jffs2_dirty_inode(struct inode *inode)
+void jffs2_dirty_inode(struct inode *inode, int flags)
 {
 	struct iattr iattr;
 
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 00bae7cc2e4..65c6c43ca48 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -172,7 +172,7 @@ int jffs2_setattr (struct dentry *, struct iattr *);
 int jffs2_do_setattr (struct inode *, struct iattr *);
 struct inode *jffs2_iget(struct super_block *, unsigned long);
 void jffs2_evict_inode (struct inode *);
-void jffs2_dirty_inode(struct inode *inode);
+void jffs2_dirty_inode(struct inode *inode, int flags);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
 			       struct jffs2_raw_inode *ri);
 int jffs2_statfs (struct dentry *, struct kstatfs *);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index eddbb373209..109655904bb 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -173,7 +173,7 @@ void jfs_evict_inode(struct inode *inode)
 	dquot_drop(inode);
 }
 
-void jfs_dirty_inode(struct inode *inode)
+void jfs_dirty_inode(struct inode *inode, int flags)
 {
 	static int noisy = 5;
 
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 155e91eff07..ec2fb8b945f 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -28,7 +28,7 @@ extern struct inode *jfs_iget(struct super_block *, unsigned long);
 extern int jfs_commit_inode(struct inode *, int);
 extern int jfs_write_inode(struct inode *, struct writeback_control *);
 extern void jfs_evict_inode(struct inode *);
-extern void jfs_dirty_inode(struct inode *);
+extern void jfs_dirty_inode(struct inode *, int);
 extern void jfs_truncate(struct inode *);
 extern void jfs_truncate_nolock(struct inode *, loff_t);
 extern void jfs_free_zero_link(struct inode *);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 587f1843283..b954878ad6c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -917,7 +917,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
  * construction. This function can be called both as a single operation
  * and as a part of indivisible file operations.
  */
-void nilfs_dirty_inode(struct inode *inode)
+void nilfs_dirty_inode(struct inode *inode, int flags)
 {
 	struct nilfs_transaction_info ti;
 	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index a9c6a531f80..f02b9ad43a2 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -269,7 +269,7 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
 extern int nilfs_inode_dirty(struct inode *);
 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
 extern int nilfs_mark_inode_dirty(struct inode *);
-extern void nilfs_dirty_inode(struct inode *);
+extern void nilfs_dirty_inode(struct inode *, int flags);
 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		 __u64 start, __u64 len);
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b216ff6be1c..aa91089162c 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -568,7 +568,7 @@ static void destroy_inodecache(void)
 }
 
 /* we don't mark inodes dirty, we just log them */
-static void reiserfs_dirty_inode(struct inode *inode)
+static void reiserfs_dirty_inode(struct inode *inode, int flags)
 {
 	struct reiserfs_transaction_handle th;
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6db0bdaa9f7..1ab0d22e4c9 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -382,7 +382,7 @@ done:
 	end_writeback(inode);
 }
 
-static void ubifs_dirty_inode(struct inode *inode)
+static void ubifs_dirty_inode(struct inode *inode, int flags)
 {
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 98b9c91fcdf..1e3a7ce804d 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -925,7 +925,8 @@ xfs_fs_inode_init_once(
  */
 STATIC void
 xfs_fs_dirty_inode(
-	struct inode	*inode)
+	struct inode	*inode,
+	int		flags)
 {
 	barrier();
 	XFS_I(inode)->i_update_core = 1;
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 85c1d302c12..5e06acf95d0 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -909,7 +909,7 @@ extern int  ext3_setattr (struct dentry *, struct iattr *);
 extern void ext3_evict_inode (struct inode *);
 extern int  ext3_sync_inode (handle_t *, struct inode *);
 extern void ext3_discard_reservation (struct inode *);
-extern void ext3_dirty_inode(struct inode *);
+extern void ext3_dirty_inode(struct inode *, int);
 extern int ext3_change_inode_journal_flag(struct inode *, int);
 extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *);
 extern int ext3_can_truncate(struct inode *inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 241609346df..573028df050 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1618,7 +1618,7 @@ struct super_operations {
    	struct inode *(*alloc_inode)(struct super_block *sb);
 	void (*destroy_inode)(struct inode *);
 
-   	void (*dirty_inode) (struct inode *);
+   	void (*dirty_inode) (struct inode *, int flags);
 	int (*write_inode) (struct inode *, struct writeback_control *wbc);
 	int (*drop_inode) (struct inode *);
 	void (*evict_inode) (struct inode *);
-- 
cgit v1.2.3-70-g09d2


From 55b23bde19c08f14127a27d461a4e079942c7258 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@kernel.org>
Date: Fri, 27 May 2011 14:50:36 +0200
Subject: xattr: Fix error results for non-existent / invisible attributes

Return -ENODATA when trying to read a user.* attribute which cannot
exist: user space otherwise does not have a reasonable way to
distinguish between non-existent and inaccessible attributes.

Likewise, return -ENODATA when an unprivileged process tries to read a
trusted.* attribute: to unprivileged processes, those attributes are
invisible (listxattr() won't include them).

Related to this bug report: https://bugzilla.redhat.com/660613

Signed-off-by: Andreas Gruenbacher <agruen@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/xattr.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/fs/xattr.c b/fs/xattr.c
index f1ef94974de..4be2e7666d0 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -46,18 +46,22 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 		return 0;
 
 	/*
-	 * The trusted.* namespace can only be accessed by a privileged user.
+	 * The trusted.* namespace can only be accessed by privileged users.
 	 */
-	if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
-		return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM);
+	if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
+		if (!capable(CAP_SYS_ADMIN))
+			return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
+		return 0;
+	}
 
-	/* In user.* namespace, only regular files and directories can have
+	/*
+	 * In the user.* namespace, only regular files and directories can have
 	 * extended attributes. For sticky directories, only the owner and
-	 * privileged user can write attributes.
+	 * privileged users can write attributes.
 	 */
 	if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
 		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
-			return -EPERM;
+			return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
 		if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
 		    (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
 			return -EPERM;
-- 
cgit v1.2.3-70-g09d2


From c642808454ac81d6a6701da6022e93bbe47bb38b Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@kernel.org>
Date: Fri, 27 May 2011 14:52:09 +0200
Subject: vfs: Improve the bio_add_page() and bio_add_pc_page() descriptions

The descriptions of bio_add_page() and bio_add_pc_page() are slightly
inconsistent; improve them.

Signed-off-by: Andreas Gruenbacher <agruen@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/bio.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/bio.c b/fs/bio.c
index 840a0d75524..9bfade8a609 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -638,10 +638,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
  *	@offset: vec entry offset
  *
  *	Attempt to add a page to the bio_vec maplist. This can fail for a
- *	number of reasons, such as the bio being full or target block
- *	device limitations. The target block device must allow bio's
- *      smaller than PAGE_SIZE, so it is always possible to add a single
- *      page to an empty bio. This should only be used by REQ_PC bios.
+ *	number of reasons, such as the bio being full or target block device
+ *	limitations. The target block device must allow bio's up to PAGE_SIZE,
+ *	so it is always possible to add a single page to an empty bio.
+ *
+ *	This should only be used by REQ_PC bios.
  */
 int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
 		    unsigned int len, unsigned int offset)
@@ -659,10 +660,9 @@ EXPORT_SYMBOL(bio_add_pc_page);
  *	@offset: vec entry offset
  *
  *	Attempt to add a page to the bio_vec maplist. This can fail for a
- *	number of reasons, such as the bio being full or target block
- *	device limitations. The target block device must allow bio's
- *      smaller than PAGE_SIZE, so it is always possible to add a single
- *      page to an empty bio.
+ *	number of reasons, such as the bio being full or target block device
+ *	limitations. The target block device must allow bio's up to PAGE_SIZE,
+ *	so it is always possible to add a single page to an empty bio.
  */
 int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 		 unsigned int offset)
-- 
cgit v1.2.3-70-g09d2


From 4b4563dc80594c6a2580aa52d9fcf0177a27074e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 27 May 2011 09:28:01 -0400
Subject: fs: cosmetic inode.c cleanups

Move the lock order description after all the includes, remove several
fairly outdated and/or incorrect comments, move Andrea's
copyright/changelog to the top where it belongs, remove the pointless
filename in the top of the file comment, and remove to useless macros.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 54 +++++-------------------------------------------------
 1 file changed, 5 insertions(+), 49 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index 990d284877a..0f7e88a7803 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1,9 +1,7 @@
 /*
- * linux/fs/inode.c
- *
  * (C) 1997 Linus Torvalds
+ * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
  */
-
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/dcache.h>
@@ -27,10 +25,11 @@
 #include <linux/prefetch.h>
 #include <linux/ima.h>
 #include <linux/cred.h>
+#include <linux/buffer_head.h> /* for inode_has_buffers */
 #include "internal.h"
 
 /*
- * inode locking rules.
+ * Inode locking rules:
  *
  * inode->i_lock protects:
  *   inode->i_state, inode->i_hash, __iget()
@@ -60,54 +59,11 @@
  *   inode_hash_lock
  */
 
-/*
- * This is needed for the following functions:
- *  - inode_has_buffers
- *  - invalidate_bdev
- *
- * FIXME: remove all knowledge of the buffer layer from this file
- */
-#include <linux/buffer_head.h>
-
-/*
- * New inode.c implementation.
- *
- * This implementation has the basic premise of trying
- * to be extremely low-overhead and SMP-safe, yet be
- * simple enough to be "obviously correct".
- *
- * Famous last words.
- */
-
-/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
-
-/* #define INODE_PARANOIA 1 */
-/* #define INODE_DEBUG 1 */
-
-/*
- * Inode lookup is no longer as critical as it used to be:
- * most of the lookups are going to be through the dcache.
- */
-#define I_HASHBITS	i_hash_shift
-#define I_HASHMASK	i_hash_mask
-
 static unsigned int i_hash_mask __read_mostly;
 static unsigned int i_hash_shift __read_mostly;
 static struct hlist_head *inode_hashtable __read_mostly;
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
 
-/*
- * Each inode can be on two separate lists. One is
- * the hash list of the inode, used for lookups. The
- * other linked list is the "type" list:
- *  "in_use" - valid inode, i_count > 0, i_nlink > 0
- *  "dirty"  - as "in_use" but also dirty
- *  "unused" - valid inode, i_count = 0
- *
- * A "dirty" list is maintained for each super block,
- * allowing for low-overhead inode sync() operations.
- */
-
 static LIST_HEAD(inode_lru);
 static DEFINE_SPINLOCK(inode_lru_lock);
 
@@ -424,8 +380,8 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval)
 
 	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
 			L1_CACHE_BYTES;
-	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
-	return tmp & I_HASHMASK;
+	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
+	return tmp & i_hash_mask;
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 1007da0604b1d2f064bfecece0f131d57237b03f Mon Sep 17 00:00:00 2001
From: Stephen Warren <swarren@nvidia.com>
Date: Thu, 26 May 2011 09:57:33 -0600
Subject: ASoC: Fix dapm_is_shared_kcontrol so everything isn't shared

Commit af46800 ("ASoC: Implement mux control sharing") introduced
function dapm_is_shared_kcontrol.

When this function returns true, the naming of DAPM controls is derived
from the kcontrol_new. Otherwise, the name comes from the widget (and
possibly a widget's naming prefix).

A bug in the implementation of dapm_is_shared_kcontrol made it return 1
in all cases. Hence, that commit caused a change in control naming for
all controls instead of just shared controls.

Specifically, a control is always considered shared because it is always
compared against itself. Solve this by never comparing against the widget
containing the control being created.

Equally, controls should never be shared between DAPM contexts; when the
same codec is instantiated multiple times, the same kcontrol_new will be
used. However, the control should no be shared between the multiple
instances.

I tested that with the Tegra WM8903 driver:
* Shared is now mostly 0 as expected, and sometimes 1.
* The expected controls are still generated after this change.

However, I don't have any systems that have a widget/control naming
prefix, so I can't test that aspect.

Thanks for Jarkko Nikula for pointing out how to fix this.

Reported-by: Liam Girdwood <lrg@ti.com>
Tested-by: Jarkko Nikula <jhnikula@gmail.com>
Signed-off-by: Stephen Warren <swarren@nvidia.com>
Acked-by: Liam Girdwood <lrg@ti.com>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 sound/soc/soc-dapm.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index 999bb08cdfb..776e6f41830 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -325,6 +325,7 @@ static int dapm_connect_mixer(struct snd_soc_dapm_context *dapm,
 }
 
 static int dapm_is_shared_kcontrol(struct snd_soc_dapm_context *dapm,
+	struct snd_soc_dapm_widget *kcontrolw,
 	const struct snd_kcontrol_new *kcontrol_new,
 	struct snd_kcontrol **kcontrol)
 {
@@ -334,6 +335,8 @@ static int dapm_is_shared_kcontrol(struct snd_soc_dapm_context *dapm,
 	*kcontrol = NULL;
 
 	list_for_each_entry(w, &dapm->card->widgets, list) {
+		if (w == kcontrolw || w->dapm != kcontrolw->dapm)
+			continue;
 		for (i = 0; i < w->num_kcontrols; i++) {
 			if (&w->kcontrol_news[i] == kcontrol_new) {
 				if (w->kcontrols)
@@ -468,7 +471,7 @@ static int dapm_new_mux(struct snd_soc_dapm_context *dapm,
 		return -EINVAL;
 	}
 
-	shared = dapm_is_shared_kcontrol(dapm, &w->kcontrol_news[0],
+	shared = dapm_is_shared_kcontrol(dapm, w, &w->kcontrol_news[0],
 					 &kcontrol);
 	if (kcontrol) {
 		wlist = kcontrol->private_data;
-- 
cgit v1.2.3-70-g09d2


From ea02c63d57d7ec099f66ddb2942b4022e865cd5f Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Fri, 27 May 2011 21:56:16 +0800
Subject: ASoC: Fix wm_hubs input PGA ZC bits

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Liam Girdwood <lrg@ti.com>
---
 sound/soc/codecs/wm_hubs.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sound/soc/codecs/wm_hubs.c b/sound/soc/codecs/wm_hubs.c
index e55b298c14a..9e370d14ad8 100644
--- a/sound/soc/codecs/wm_hubs.c
+++ b/sound/soc/codecs/wm_hubs.c
@@ -215,23 +215,23 @@ static const struct snd_kcontrol_new analogue_snd_controls[] = {
 SOC_SINGLE_TLV("IN1L Volume", WM8993_LEFT_LINE_INPUT_1_2_VOLUME, 0, 31, 0,
 	       inpga_tlv),
 SOC_SINGLE("IN1L Switch", WM8993_LEFT_LINE_INPUT_1_2_VOLUME, 7, 1, 1),
-SOC_SINGLE("IN1L ZC Switch", WM8993_LEFT_LINE_INPUT_1_2_VOLUME, 7, 1, 0),
+SOC_SINGLE("IN1L ZC Switch", WM8993_LEFT_LINE_INPUT_1_2_VOLUME, 6, 1, 0),
 
 SOC_SINGLE_TLV("IN1R Volume", WM8993_RIGHT_LINE_INPUT_1_2_VOLUME, 0, 31, 0,
 	       inpga_tlv),
 SOC_SINGLE("IN1R Switch", WM8993_RIGHT_LINE_INPUT_1_2_VOLUME, 7, 1, 1),
-SOC_SINGLE("IN1R ZC Switch", WM8993_RIGHT_LINE_INPUT_1_2_VOLUME, 7, 1, 0),
+SOC_SINGLE("IN1R ZC Switch", WM8993_RIGHT_LINE_INPUT_1_2_VOLUME, 6, 1, 0),
 
 
 SOC_SINGLE_TLV("IN2L Volume", WM8993_LEFT_LINE_INPUT_3_4_VOLUME, 0, 31, 0,
 	       inpga_tlv),
 SOC_SINGLE("IN2L Switch", WM8993_LEFT_LINE_INPUT_3_4_VOLUME, 7, 1, 1),
-SOC_SINGLE("IN2L ZC Switch", WM8993_LEFT_LINE_INPUT_3_4_VOLUME, 7, 1, 0),
+SOC_SINGLE("IN2L ZC Switch", WM8993_LEFT_LINE_INPUT_3_4_VOLUME, 6, 1, 0),
 
 SOC_SINGLE_TLV("IN2R Volume", WM8993_RIGHT_LINE_INPUT_3_4_VOLUME, 0, 31, 0,
 	       inpga_tlv),
 SOC_SINGLE("IN2R Switch", WM8993_RIGHT_LINE_INPUT_3_4_VOLUME, 7, 1, 1),
-SOC_SINGLE("IN2R ZC Switch", WM8993_RIGHT_LINE_INPUT_3_4_VOLUME, 7, 1, 0),
+SOC_SINGLE("IN2R ZC Switch", WM8993_RIGHT_LINE_INPUT_3_4_VOLUME, 6, 1, 0),
 
 SOC_SINGLE_TLV("MIXINL IN2L Volume", WM8993_INPUT_MIXER3, 7, 1, 0,
 	       inmix_sw_tlv),
-- 
cgit v1.2.3-70-g09d2


From f133ecca9cbb31b5e6e9bda27cbe3034fbf656df Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Thu, 26 May 2011 12:40:09 -0400
Subject: arch/tile: more /proc and /sys file support

This change introduces a few of the less controversial /proc and
/proc/sys interfaces for tile, along with sysfs attributes for
various things that were originally proposed as /proc/tile files.
It also adjusts the "hardwall" proc API.

Arnd Bergmann reviewed the initial arch/tile submission, which
included a complete set of all the /proc/tile and /proc/sys/tile
knobs that we had added in a somewhat ad hoc way during initial
development, and provided feedback on where most of them should go.

One knob turned out to be similar enough to the existing
/proc/sys/debug/exception-trace that it was re-implemented to use
that model instead.

Another knob was /proc/tile/grid, which reported the "grid" dimensions
of a tile chip (e.g. 8x8 processors = 64-core chip).  Arnd suggested
looking at sysfs for that, so this change moves that information
to a pair of sysfs attributes (chip_width and chip_height) in the
/sys/devices/system/cpu directory.  We also put the "chip_serial"
and "chip_revision" information from our old /proc/tile/board file
as attributes in /sys/devices/system/cpu.

Other information collected via hypervisor APIs is now placed in
/sys/hypervisor.  We create a /sys/hypervisor/type file (holding the
constant string "tilera") to be parallel with the Xen use of
/sys/hypervisor/type holding "xen".  We create three top-level files,
"version" (the hypervisor's own version), "config_version" (the
version of the configuration file), and "hvconfig" (the contents of
the configuration file).  The remaining information from our old
/proc/tile/board and /proc/tile/switch files becomes an attribute
group appearing under /sys/hypervisor/board/.

Finally, after some feedback from Arnd Bergmann for the previous
version of this patch, the /proc/tile/hardwall file is split up into
two conceptual parts.  First, a directory /proc/tile/hardwall/ which
contains one file per active hardwall, each file named after the
hardwall's ID and holding a cpulist that says which cpus are enclosed by
the hardwall.  Second, a /proc/PID file "hardwall" that is either
empty (for non-hardwall-using processes) or contains the hardwall ID.

Finally, this change pushes the /proc/sys/tile/unaligned_fixup/
directory, with knobs controlling the kernel code for handling the
fixup of unaligned exceptions.

Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/Kconfig                |   1 +
 arch/tile/include/asm/hardwall.h |  15 +++-
 arch/tile/kernel/Makefile        |   2 +-
 arch/tile/kernel/hardwall.c      |  90 ++++++++++++++-----
 arch/tile/kernel/proc.c          |  73 +++++++++++++++
 arch/tile/kernel/sysfs.c         | 185 +++++++++++++++++++++++++++++++++++++++
 fs/proc/base.c                   |   9 ++
 7 files changed, 347 insertions(+), 28 deletions(-)
 create mode 100644 arch/tile/kernel/sysfs.c

diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 635e1bfb1c5..3f7d63c7ae8 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -12,6 +12,7 @@ config TILE
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
 	select GENERIC_IRQ_SHOW
+	select SYS_HYPERVISOR
 
 # FIXME: investigate whether we need/want these options.
 #	select HAVE_IOREMAP_PROT
diff --git a/arch/tile/include/asm/hardwall.h b/arch/tile/include/asm/hardwall.h
index 0bed3ec7b42..2ac422848c7 100644
--- a/arch/tile/include/asm/hardwall.h
+++ b/arch/tile/include/asm/hardwall.h
@@ -40,6 +40,10 @@
 #define HARDWALL_DEACTIVATE \
  _IO(HARDWALL_IOCTL_BASE, _HARDWALL_DEACTIVATE)
 
+#define _HARDWALL_GET_ID 4
+#define HARDWALL_GET_ID \
+ _IO(HARDWALL_IOCTL_BASE, _HARDWALL_GET_ID)
+
 #ifndef __KERNEL__
 
 /* This is the canonical name expected by userspace. */
@@ -47,9 +51,14 @@
 
 #else
 
-/* Hook for /proc/tile/hardwall. */
-struct seq_file;
-int proc_tile_hardwall_show(struct seq_file *sf, void *v);
+/* /proc hooks for hardwall. */
+struct proc_dir_entry;
+#ifdef CONFIG_HARDWALL
+void proc_tile_hardwall_init(struct proc_dir_entry *root);
+int proc_pid_hardwall(struct task_struct *task, char *buffer);
+#else
+static inline void proc_tile_hardwall_init(struct proc_dir_entry *root) {}
+#endif
 
 #endif
 
diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile
index b4c8e8ec45d..b4dbc057baa 100644
--- a/arch/tile/kernel/Makefile
+++ b/arch/tile/kernel/Makefile
@@ -5,7 +5,7 @@
 extra-y := vmlinux.lds head_$(BITS).o
 obj-y := backtrace.o entry.o init_task.o irq.o messaging.o \
 	pci-dma.o proc.o process.o ptrace.o reboot.o \
-	setup.o signal.o single_step.o stack.o sys.o time.o traps.o \
+	setup.o signal.o single_step.o stack.o sys.o sysfs.o time.o traps.o \
 	intvec_$(BITS).o regs_$(BITS).o tile-desc_$(BITS).o
 
 obj-$(CONFIG_HARDWALL)		+= hardwall.o
diff --git a/arch/tile/kernel/hardwall.c b/arch/tile/kernel/hardwall.c
index 3bddef710de..8c41891aab3 100644
--- a/arch/tile/kernel/hardwall.c
+++ b/arch/tile/kernel/hardwall.c
@@ -40,16 +40,25 @@
 struct hardwall_info {
 	struct list_head list;             /* "rectangles" list */
 	struct list_head task_head;        /* head of tasks in this hardwall */
+	struct cpumask cpumask;            /* cpus in the rectangle */
 	int ulhc_x;                        /* upper left hand corner x coord */
 	int ulhc_y;                        /* upper left hand corner y coord */
 	int width;                         /* rectangle width */
 	int height;                        /* rectangle height */
+	int id;                            /* integer id for this hardwall */
 	int teardown_in_progress;          /* are we tearing this one down? */
 };
 
 /* Currently allocated hardwall rectangles */
 static LIST_HEAD(rectangles);
 
+/* /proc/tile/hardwall */
+static struct proc_dir_entry *hardwall_proc_dir;
+
+/* Functions to manage files in /proc/tile/hardwall. */
+static void hardwall_add_proc(struct hardwall_info *rect);
+static void hardwall_remove_proc(struct hardwall_info *rect);
+
 /*
  * Guard changes to the hardwall data structures.
  * This could be finer grained (e.g. one lock for the list of hardwall
@@ -105,6 +114,8 @@ static int setup_rectangle(struct hardwall_info *r, struct cpumask *mask)
 	r->ulhc_y = cpu_y(ulhc);
 	r->width = cpu_x(lrhc) - r->ulhc_x + 1;
 	r->height = cpu_y(lrhc) - r->ulhc_y + 1;
+	cpumask_copy(&r->cpumask, mask);
+	r->id = ulhc;   /* The ulhc cpu id can be the hardwall id. */
 
 	/* Width and height must be positive */
 	if (r->width <= 0 || r->height <= 0)
@@ -388,6 +399,9 @@ static struct hardwall_info *hardwall_create(
 	/* Set up appropriate hardwalling on all affected cpus. */
 	hardwall_setup(rect);
 
+	/* Create a /proc/tile/hardwall entry. */
+	hardwall_add_proc(rect);
+
 	return rect;
 }
 
@@ -645,6 +659,9 @@ static void hardwall_destroy(struct hardwall_info *rect)
 	/* Restart switch and disable firewall. */
 	on_each_cpu_mask(&mask, restart_udn_switch, NULL, 1);
 
+	/* Remove the /proc/tile/hardwall entry. */
+	hardwall_remove_proc(rect);
+
 	/* Now free the rectangle from the list. */
 	spin_lock_irqsave(&hardwall_lock, flags);
 	BUG_ON(!list_empty(&rect->task_head));
@@ -654,35 +671,57 @@ static void hardwall_destroy(struct hardwall_info *rect)
 }
 
 
-/*
- * Dump hardwall state via /proc; initialized in arch/tile/sys/proc.c.
- */
-int proc_tile_hardwall_show(struct seq_file *sf, void *v)
+static int hardwall_proc_show(struct seq_file *sf, void *v)
 {
-	struct hardwall_info *r;
+	struct hardwall_info *rect = sf->private;
+	char buf[256];
 
-	if (udn_disabled) {
-		seq_printf(sf, "%dx%d 0,0 pids:\n", smp_width, smp_height);
-		return 0;
-	}
-
-	spin_lock_irq(&hardwall_lock);
-	list_for_each_entry(r, &rectangles, list) {
-		struct task_struct *p;
-		seq_printf(sf, "%dx%d %d,%d pids:",
-			   r->width, r->height, r->ulhc_x, r->ulhc_y);
-		list_for_each_entry(p, &r->task_head, thread.hardwall_list) {
-			unsigned int cpu = cpumask_first(&p->cpus_allowed);
-			unsigned int x = cpu % smp_width;
-			unsigned int y = cpu / smp_width;
-			seq_printf(sf, " %d@%d,%d", p->pid, x, y);
-		}
-		seq_printf(sf, "\n");
-	}
-	spin_unlock_irq(&hardwall_lock);
+	int rc = cpulist_scnprintf(buf, sizeof(buf), &rect->cpumask);
+	buf[rc++] = '\n';
+	seq_write(sf, buf, rc);
 	return 0;
 }
 
+static int hardwall_proc_open(struct inode *inode,
+			      struct file *file)
+{
+	return single_open(file, hardwall_proc_show, PDE(inode)->data);
+}
+
+static const struct file_operations hardwall_proc_fops = {
+	.open		= hardwall_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static void hardwall_add_proc(struct hardwall_info *rect)
+{
+	char buf[64];
+	snprintf(buf, sizeof(buf), "%d", rect->id);
+	proc_create_data(buf, 0444, hardwall_proc_dir,
+			 &hardwall_proc_fops, rect);
+}
+
+static void hardwall_remove_proc(struct hardwall_info *rect)
+{
+	char buf[64];
+	snprintf(buf, sizeof(buf), "%d", rect->id);
+	remove_proc_entry(buf, hardwall_proc_dir);
+}
+
+int proc_pid_hardwall(struct task_struct *task, char *buffer)
+{
+	struct hardwall_info *rect = task->thread.hardwall;
+	return rect ? sprintf(buffer, "%d\n", rect->id) : 0;
+}
+
+void proc_tile_hardwall_init(struct proc_dir_entry *root)
+{
+	if (!udn_disabled)
+		hardwall_proc_dir = proc_mkdir("hardwall", root);
+}
+
 
 /*
  * Character device support via ioctl/close.
@@ -716,6 +755,9 @@ static long hardwall_ioctl(struct file *file, unsigned int a, unsigned long b)
 			return -EINVAL;
 		return hardwall_deactivate(current);
 
+	case _HARDWALL_GET_ID:
+		return rect ? rect->id : -EINVAL;
+
 	default:
 		return -EINVAL;
 	}
diff --git a/arch/tile/kernel/proc.c b/arch/tile/kernel/proc.c
index 2e02c41ddf3..62d820833c6 100644
--- a/arch/tile/kernel/proc.c
+++ b/arch/tile/kernel/proc.c
@@ -27,6 +27,7 @@
 #include <asm/processor.h>
 #include <asm/sections.h>
 #include <asm/homecache.h>
+#include <asm/hardwall.h>
 #include <arch/chip.h>
 
 
@@ -88,3 +89,75 @@ const struct seq_operations cpuinfo_op = {
 	.stop	= c_stop,
 	.show	= show_cpuinfo,
 };
+
+/*
+ * Support /proc/tile directory
+ */
+
+static int __init proc_tile_init(void)
+{
+	struct proc_dir_entry *root = proc_mkdir("tile", NULL);
+	if (root == NULL)
+		return 0;
+
+	proc_tile_hardwall_init(root);
+
+	return 0;
+}
+
+arch_initcall(proc_tile_init);
+
+/*
+ * Support /proc/sys/tile directory
+ */
+
+#ifndef __tilegx__  /* FIXME: GX: no support for unaligned access yet */
+static ctl_table unaligned_subtable[] = {
+	{
+		.procname	= "enabled",
+		.data		= &unaligned_fixup,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.procname	= "printk",
+		.data		= &unaligned_printk,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.procname	= "count",
+		.data		= &unaligned_fixup_count,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{}
+};
+
+static ctl_table unaligned_table[] = {
+	{
+		.procname	= "unaligned_fixup",
+		.mode		= 0555,
+		.child		= unaligned_subtable
+	},
+	{}
+};
+#endif
+
+static struct ctl_path tile_path[] = {
+	{ .procname = "tile" },
+	{ }
+};
+
+static int __init proc_sys_tile_init(void)
+{
+#ifndef __tilegx__  /* FIXME: GX: no support for unaligned access yet */
+	register_sysctl_paths(tile_path, unaligned_table);
+#endif
+	return 0;
+}
+
+arch_initcall(proc_sys_tile_init);
diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
new file mode 100644
index 00000000000..b671a86f451
--- /dev/null
+++ b/arch/tile/kernel/sysfs.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * /sys entry support.
+ */
+
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <hv/hypervisor.h>
+
+/* Return a string queried from the hypervisor, truncated to page size. */
+static ssize_t get_hv_confstr(char *page, int query)
+{
+	ssize_t n = hv_confstr(query, (unsigned long)page, PAGE_SIZE - 1);
+	n = n < 0 ? 0 : min(n, (ssize_t)PAGE_SIZE - 1) - 1;
+	if (n)
+		page[n++] = '\n';
+	page[n] = '\0';
+	return n;
+}
+
+static ssize_t chip_width_show(struct sysdev_class *dev,
+			       struct sysdev_class_attribute *attr,
+			       char *page)
+{
+	return sprintf(page, "%u\n", smp_width);
+}
+static SYSDEV_CLASS_ATTR(chip_width, 0444, chip_width_show, NULL);
+
+static ssize_t chip_height_show(struct sysdev_class *dev,
+				struct sysdev_class_attribute *attr,
+				char *page)
+{
+	return sprintf(page, "%u\n", smp_height);
+}
+static SYSDEV_CLASS_ATTR(chip_height, 0444, chip_height_show, NULL);
+
+static ssize_t chip_serial_show(struct sysdev_class *dev,
+				struct sysdev_class_attribute *attr,
+				char *page)
+{
+	return get_hv_confstr(page, HV_CONFSTR_CHIP_SERIAL_NUM);
+}
+static SYSDEV_CLASS_ATTR(chip_serial, 0444, chip_serial_show, NULL);
+
+static ssize_t chip_revision_show(struct sysdev_class *dev,
+				  struct sysdev_class_attribute *attr,
+				  char *page)
+{
+	return get_hv_confstr(page, HV_CONFSTR_CHIP_REV);
+}
+static SYSDEV_CLASS_ATTR(chip_revision, 0444, chip_revision_show, NULL);
+
+
+static ssize_t type_show(struct sysdev_class *dev,
+			    struct sysdev_class_attribute *attr,
+			    char *page)
+{
+	return sprintf(page, "tilera\n");
+}
+static SYSDEV_CLASS_ATTR(type, 0444, type_show, NULL);
+
+#define HV_CONF_ATTR(name, conf)					\
+	static ssize_t name ## _show(struct sysdev_class *dev,		\
+				     struct sysdev_class_attribute *attr, \
+				     char *page)			\
+	{								\
+		return get_hv_confstr(page, conf);			\
+	}								\
+	static SYSDEV_CLASS_ATTR(name, 0444, name ## _show, NULL);
+
+HV_CONF_ATTR(version,		HV_CONFSTR_HV_SW_VER)
+HV_CONF_ATTR(config_version,	HV_CONFSTR_HV_CONFIG_VER)
+
+HV_CONF_ATTR(board_part,	HV_CONFSTR_BOARD_PART_NUM)
+HV_CONF_ATTR(board_serial,	HV_CONFSTR_BOARD_SERIAL_NUM)
+HV_CONF_ATTR(board_revision,	HV_CONFSTR_BOARD_REV)
+HV_CONF_ATTR(board_description,	HV_CONFSTR_BOARD_DESC)
+HV_CONF_ATTR(mezz_part,		HV_CONFSTR_MEZZ_PART_NUM)
+HV_CONF_ATTR(mezz_serial,	HV_CONFSTR_MEZZ_SERIAL_NUM)
+HV_CONF_ATTR(mezz_revision,	HV_CONFSTR_MEZZ_REV)
+HV_CONF_ATTR(mezz_description,	HV_CONFSTR_MEZZ_DESC)
+HV_CONF_ATTR(switch_control,	HV_CONFSTR_SWITCH_CONTROL)
+
+static struct attribute *board_attrs[] = {
+	&attr_board_part.attr,
+	&attr_board_serial.attr,
+	&attr_board_revision.attr,
+	&attr_board_description.attr,
+	&attr_mezz_part.attr,
+	&attr_mezz_serial.attr,
+	&attr_mezz_revision.attr,
+	&attr_mezz_description.attr,
+	&attr_switch_control.attr,
+	NULL
+};
+
+static struct attribute_group board_attr_group = {
+	.name   = "board",
+	.attrs  = board_attrs,
+};
+
+
+static struct bin_attribute hvconfig_bin;
+
+static ssize_t
+hvconfig_bin_read(struct file *filp, struct kobject *kobj,
+		  struct bin_attribute *bin_attr,
+		  char *buf, loff_t off, size_t count)
+{
+	static size_t size;
+
+	/* Lazily learn the true size (minus the trailing NUL). */
+	if (size == 0)
+		size = hv_confstr(HV_CONFSTR_HV_CONFIG, 0, 0) - 1;
+
+	/* Check and adjust input parameters. */
+	if (off > size)
+		return -EINVAL;
+	if (count > size - off)
+		count = size - off;
+
+	if (count) {
+		/* Get a copy of the hvc and copy out the relevant portion. */
+		char *hvc;
+
+		size = off + count;
+		hvc = kmalloc(size, GFP_KERNEL);
+		if (hvc == NULL)
+			return -ENOMEM;
+		hv_confstr(HV_CONFSTR_HV_CONFIG, (unsigned long)hvc, size);
+		memcpy(buf, hvc + off, count);
+		kfree(hvc);
+	}
+
+	return count;
+}
+
+static int __init create_sysfs_entries(void)
+{
+	struct sysdev_class *cls = &cpu_sysdev_class;
+	int err = 0;
+
+#define create_cpu_attr(name)						\
+	if (!err)							\
+		err = sysfs_create_file(&cls->kset.kobj, &attr_##name.attr);
+	create_cpu_attr(chip_width);
+	create_cpu_attr(chip_height);
+	create_cpu_attr(chip_serial);
+	create_cpu_attr(chip_revision);
+
+#define create_hv_attr(name)						\
+	if (!err)							\
+		err = sysfs_create_file(hypervisor_kobj, &attr_##name.attr);
+	create_hv_attr(type);
+	create_hv_attr(version);
+	create_hv_attr(config_version);
+
+	if (!err)
+		err = sysfs_create_group(hypervisor_kobj, &board_attr_group);
+
+	if (!err) {
+		sysfs_bin_attr_init(&hvconfig_bin);
+		hvconfig_bin.attr.name = "hvconfig";
+		hvconfig_bin.attr.mode = S_IRUGO;
+		hvconfig_bin.read = hvconfig_bin_read;
+		hvconfig_bin.size = PAGE_SIZE;
+		err = sysfs_create_bin_file(hypervisor_kobj, &hvconfig_bin);
+	}
+
+	return err;
+}
+subsys_initcall(create_sysfs_entries);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dfa532730e5..3ad615fb865 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -83,6 +83,9 @@
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#ifdef CONFIG_HARDWALL
+#include <asm/hardwall.h>
+#endif
 #include "internal.h"
 
 /* NOTE:
@@ -2894,6 +2897,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	INF("io",	S_IRUGO, proc_tgid_io_accounting),
 #endif
+#ifdef CONFIG_HARDWALL
+	INF("hardwall",   S_IRUGO, proc_pid_hardwall),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file * filp,
@@ -3232,6 +3238,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	INF("io",	S_IRUGO, proc_tid_io_accounting),
 #endif
+#ifdef CONFIG_HARDWALL
+	INF("hardwall",   S_IRUGO, proc_pid_hardwall),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file * filp,
-- 
cgit v1.2.3-70-g09d2


From 39ddf3bf6463bc86041e806b43e014d50a144aa4 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:32 -0700
Subject: asus-wmi: Remove __init from asus_wmi_platform_init

It's used by a non-init function.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/asus-wmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 832a3fd7c1c..c8c9842753d 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -1223,7 +1223,7 @@ static int asus_wmi_sysfs_init(struct platform_device *device)
 /*
  * Platform device
  */
-static int __init asus_wmi_platform_init(struct asus_wmi *asus)
+static int asus_wmi_platform_init(struct asus_wmi *asus)
 {
 	int rv;
 
-- 
cgit v1.2.3-70-g09d2


From 3a35125f97be0658305c9509c51077959c19b244 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:33 -0700
Subject: ibm_rtl: Remove warnings from casts of pointer to int

Just print them as %p.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/ibm_rtl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/platform/x86/ibm_rtl.c b/drivers/platform/x86/ibm_rtl.c
index b1396e5b295..10563458408 100644
--- a/drivers/platform/x86/ibm_rtl.c
+++ b/drivers/platform/x86/ibm_rtl.c
@@ -288,7 +288,7 @@ static int __init ibm_rtl_init(void) {
 		if ((readq(&tmp->signature) & RTL_MASK) == RTL_SIGNATURE) {
 			phys_addr_t addr;
 			unsigned int plen;
-			RTL_DEBUG("found RTL_SIGNATURE at %#llx\n", (u64)tmp);
+			RTL_DEBUG("found RTL_SIGNATURE at %p\n", tmp);
 			rtl_table = tmp;
 			/* The address, value, width and offset are platform
 			 * dependent and found in the ibm_rtl_table */
@@ -300,7 +300,7 @@ static int __init ibm_rtl_init(void) {
 			RTL_DEBUG("addr = %#llx\n", (unsigned long long)addr);
 			plen = rtl_cmd_width/sizeof(char);
 			rtl_cmd_addr = rtl_port_map(addr, plen);
-			RTL_DEBUG("rtl_cmd_addr = %#llx\n", (u64)rtl_cmd_addr);
+			RTL_DEBUG("rtl_cmd_addr = %p\n", rtl_cmd_addr);
 			if (!rtl_cmd_addr) {
 				ret = -ENOMEM;
 				break;
-- 
cgit v1.2.3-70-g09d2


From 249c720d88fa325522cb763dda09f8c9e8e964cb Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:34 -0700
Subject: acer-wmi: pr_<level> cleanups

Convert pr_warning to pr_warn.
Add some missing newlines to pr_<level> uses.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/acer-wmi.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c
index ac4e7f83ce6..fa8fa73f543 100644
--- a/drivers/platform/x86/acer-wmi.c
+++ b/drivers/platform/x86/acer-wmi.c
@@ -1081,7 +1081,7 @@ static acpi_status wmid3_get_device_status(u32 *value, u16 device)
 		return AE_ERROR;
 	}
 	if (obj->buffer.length != 8) {
-		pr_warning("Unknown buffer length %d\n", obj->buffer.length);
+		pr_warn("Unknown buffer length %d\n", obj->buffer.length);
 		kfree(obj);
 		return AE_ERROR;
 	}
@@ -1090,8 +1090,8 @@ static acpi_status wmid3_get_device_status(u32 *value, u16 device)
 	kfree(obj);
 
 	if (return_value.error_code || return_value.ec_return_value)
-		pr_warning("Get Device Status failed: "
-			"0x%x - 0x%x\n", return_value.error_code,
+		pr_warn("Get Device Status failed: 0x%x - 0x%x\n",
+			return_value.error_code,
 			return_value.ec_return_value);
 	else
 		*value = !!(return_value.devices & device);
@@ -1317,7 +1317,7 @@ static void acer_wmi_notify(u32 value, void *context)
 
 	status = wmi_get_event_data(value, &response);
 	if (status != AE_OK) {
-		pr_warning("bad event status 0x%x\n", status);
+		pr_warn("bad event status 0x%x\n", status);
 		return;
 	}
 
@@ -1326,12 +1326,12 @@ static void acer_wmi_notify(u32 value, void *context)
 	if (!obj)
 		return;
 	if (obj->type != ACPI_TYPE_BUFFER) {
-		pr_warning("Unknown response received %d\n", obj->type);
+		pr_warn("Unknown response received %d\n", obj->type);
 		kfree(obj);
 		return;
 	}
 	if (obj->buffer.length != 8) {
-		pr_warning("Unknown buffer length %d\n", obj->buffer.length);
+		pr_warn("Unknown buffer length %d\n", obj->buffer.length);
 		kfree(obj);
 		return;
 	}
@@ -1356,11 +1356,11 @@ static void acer_wmi_notify(u32 value, void *context)
 		}
 		if (!sparse_keymap_report_event(acer_wmi_input_dev,
 				return_value.key_num, 1, true))
-			pr_warning("Unknown key number - 0x%x\n",
+			pr_warn("Unknown key number - 0x%x\n",
 				return_value.key_num);
 		break;
 	default:
-		pr_warning("Unknown function number - %d - %d\n",
+		pr_warn("Unknown function number - %d - %d\n",
 			return_value.function, return_value.key_num);
 		break;
 	}
@@ -1389,7 +1389,7 @@ wmid3_set_lm_mode(struct lm_input_params *params,
 		return AE_ERROR;
 	}
 	if (obj->buffer.length != 4) {
-		pr_warning("Unknown buffer length %d\n", obj->buffer.length);
+		pr_warn("Unknown buffer length %d\n", obj->buffer.length);
 		kfree(obj);
 		return AE_ERROR;
 	}
@@ -1414,11 +1414,11 @@ static int acer_wmi_enable_ec_raw(void)
 	status = wmid3_set_lm_mode(&params, &return_value);
 
 	if (return_value.error_code || return_value.ec_return_value)
-		pr_warning("Enabling EC raw mode failed: "
-		       "0x%x - 0x%x\n", return_value.error_code,
-		       return_value.ec_return_value);
+		pr_warn("Enabling EC raw mode failed: 0x%x - 0x%x\n",
+			return_value.error_code,
+			return_value.ec_return_value);
 	else
-		pr_info("Enabled EC raw mode");
+		pr_info("Enabled EC raw mode\n");
 
 	return status;
 }
@@ -1437,9 +1437,9 @@ static int acer_wmi_enable_lm(void)
 	status = wmid3_set_lm_mode(&params, &return_value);
 
 	if (return_value.error_code || return_value.ec_return_value)
-		pr_warning("Enabling Launch Manager failed: "
-		       "0x%x - 0x%x\n", return_value.error_code,
-		       return_value.ec_return_value);
+		pr_warn("Enabling Launch Manager failed: 0x%x - 0x%x\n",
+			return_value.error_code,
+			return_value.ec_return_value);
 
 	return status;
 }
-- 
cgit v1.2.3-70-g09d2


From 5ad77dcfb49a85715bcb9174a133cba140eda1bc Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:35 -0700
Subject: asus: Add pr_fmt and convert printks to pr_<level>

Add pr_fmt, prefixes each log message.
Convert printks to pr_<level>.
Convert pr_warning to pr_warn.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/asus-laptop.c | 34 ++++++++---------
 drivers/platform/x86/asus-wmi.c    | 20 +++++-----
 drivers/platform/x86/asus_acpi.c   | 77 ++++++++++++++++----------------------
 3 files changed, 59 insertions(+), 72 deletions(-)

diff --git a/drivers/platform/x86/asus-laptop.c b/drivers/platform/x86/asus-laptop.c
index c53b3ff7978..d65df92e2ac 100644
--- a/drivers/platform/x86/asus-laptop.c
+++ b/drivers/platform/x86/asus-laptop.c
@@ -318,7 +318,7 @@ static int acpi_check_handle(acpi_handle handle, const char *method,
 
 	if (status != AE_OK) {
 		if (ret)
-			pr_warning("Error finding %s\n", method);
+			pr_warn("Error finding %s\n", method);
 		return -ENODEV;
 	}
 	return 0;
@@ -383,7 +383,7 @@ static int asus_kled_lvl(struct asus_laptop *asus)
 	rv = acpi_evaluate_integer(asus->handle, METHOD_KBD_LIGHT_GET,
 				   &params, &kblv);
 	if (ACPI_FAILURE(rv)) {
-		pr_warning("Error reading kled level\n");
+		pr_warn("Error reading kled level\n");
 		return -ENODEV;
 	}
 	return kblv;
@@ -397,7 +397,7 @@ static int asus_kled_set(struct asus_laptop *asus, int kblv)
 		kblv = 0;
 
 	if (write_acpi_int(asus->handle, METHOD_KBD_LIGHT_SET, kblv)) {
-		pr_warning("Keyboard LED display write failed\n");
+		pr_warn("Keyboard LED display write failed\n");
 		return -EINVAL;
 	}
 	return 0;
@@ -531,7 +531,7 @@ static int asus_read_brightness(struct backlight_device *bd)
 	rv = acpi_evaluate_integer(asus->handle, METHOD_BRIGHTNESS_GET,
 				   NULL, &value);
 	if (ACPI_FAILURE(rv))
-		pr_warning("Error reading brightness\n");
+		pr_warn("Error reading brightness\n");
 
 	return value;
 }
@@ -541,7 +541,7 @@ static int asus_set_brightness(struct backlight_device *bd, int value)
 	struct asus_laptop *asus = bl_get_data(bd);
 
 	if (write_acpi_int(asus->handle, METHOD_BRIGHTNESS_SET, value)) {
-		pr_warning("Error changing brightness\n");
+		pr_warn("Error changing brightness\n");
 		return -EIO;
 	}
 	return 0;
@@ -730,7 +730,7 @@ static ssize_t store_ledd(struct device *dev, struct device_attribute *attr,
 	rv = parse_arg(buf, count, &value);
 	if (rv > 0) {
 		if (write_acpi_int(asus->handle, METHOD_LEDD, value)) {
-			pr_warning("LED display write failed\n");
+			pr_warn("LED display write failed\n");
 			return -ENODEV;
 		}
 		asus->ledd_status = (u32) value;
@@ -752,7 +752,7 @@ static int asus_wireless_status(struct asus_laptop *asus, int mask)
 	rv = acpi_evaluate_integer(asus->handle, METHOD_WL_STATUS,
 				   NULL, &status);
 	if (ACPI_FAILURE(rv)) {
-		pr_warning("Error reading Wireless status\n");
+		pr_warn("Error reading Wireless status\n");
 		return -EINVAL;
 	}
 	return !!(status & mask);
@@ -764,7 +764,7 @@ static int asus_wireless_status(struct asus_laptop *asus, int mask)
 static int asus_wlan_set(struct asus_laptop *asus, int status)
 {
 	if (write_acpi_int(asus->handle, METHOD_WLAN, !!status)) {
-		pr_warning("Error setting wlan status to %d", status);
+		pr_warn("Error setting wlan status to %d\n", status);
 		return -EIO;
 	}
 	return 0;
@@ -792,7 +792,7 @@ static ssize_t store_wlan(struct device *dev, struct device_attribute *attr,
 static int asus_bluetooth_set(struct asus_laptop *asus, int status)
 {
 	if (write_acpi_int(asus->handle, METHOD_BLUETOOTH, !!status)) {
-		pr_warning("Error setting bluetooth status to %d", status);
+		pr_warn("Error setting bluetooth status to %d\n", status);
 		return -EIO;
 	}
 	return 0;
@@ -821,7 +821,7 @@ static ssize_t store_bluetooth(struct device *dev,
 static int asus_wimax_set(struct asus_laptop *asus, int status)
 {
 	if (write_acpi_int(asus->handle, METHOD_WIMAX, !!status)) {
-		pr_warning("Error setting wimax status to %d", status);
+		pr_warn("Error setting wimax status to %d\n", status);
 		return -EIO;
 	}
 	return 0;
@@ -850,7 +850,7 @@ static ssize_t store_wimax(struct device *dev,
 static int asus_wwan_set(struct asus_laptop *asus, int status)
 {
 	if (write_acpi_int(asus->handle, METHOD_WWAN, !!status)) {
-		pr_warning("Error setting wwan status to %d", status);
+		pr_warn("Error setting wwan status to %d\n", status);
 		return -EIO;
 	}
 	return 0;
@@ -880,7 +880,7 @@ static void asus_set_display(struct asus_laptop *asus, int value)
 {
 	/* no sanity check needed for now */
 	if (write_acpi_int(asus->handle, METHOD_SWITCH_DISPLAY, value))
-		pr_warning("Error setting display\n");
+		pr_warn("Error setting display\n");
 	return;
 }
 
@@ -909,7 +909,7 @@ static ssize_t store_disp(struct device *dev, struct device_attribute *attr,
 static void asus_als_switch(struct asus_laptop *asus, int value)
 {
 	if (write_acpi_int(asus->handle, METHOD_ALS_CONTROL, value))
-		pr_warning("Error setting light sensor switch\n");
+		pr_warn("Error setting light sensor switch\n");
 	asus->light_switch = value;
 }
 
@@ -937,7 +937,7 @@ static ssize_t store_lssw(struct device *dev, struct device_attribute *attr,
 static void asus_als_level(struct asus_laptop *asus, int value)
 {
 	if (write_acpi_int(asus->handle, METHOD_ALS_LEVEL, value))
-		pr_warning("Error setting light sensor level\n");
+		pr_warn("Error setting light sensor level\n");
 	asus->light_level = value;
 }
 
@@ -976,7 +976,7 @@ static int asus_gps_status(struct asus_laptop *asus)
 	rv = acpi_evaluate_integer(asus->handle, METHOD_GPS_STATUS,
 				   NULL, &status);
 	if (ACPI_FAILURE(rv)) {
-		pr_warning("Error reading GPS status\n");
+		pr_warn("Error reading GPS status\n");
 		return -ENODEV;
 	}
 	return !!status;
@@ -1284,7 +1284,7 @@ static int asus_laptop_get_info(struct asus_laptop *asus)
 	 */
 	status = acpi_get_table(ACPI_SIG_DSDT, 1, &asus->dsdt_info);
 	if (ACPI_FAILURE(status))
-		pr_warning("Couldn't get the DSDT table header\n");
+		pr_warn("Couldn't get the DSDT table header\n");
 
 	/* We have to write 0 on init this far for all ASUS models */
 	if (write_acpi_int_ret(asus->handle, "INIT", 0, &buffer)) {
@@ -1296,7 +1296,7 @@ static int asus_laptop_get_info(struct asus_laptop *asus)
 	status =
 	    acpi_evaluate_integer(asus->handle, "BSTS", NULL, &bsts_result);
 	if (ACPI_FAILURE(status))
-		pr_warning("Error calling BSTS\n");
+		pr_warn("Error calling BSTS\n");
 	else if (bsts_result)
 		pr_notice("BSTS called, 0x%02x returned\n",
 		       (uint) bsts_result);
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index c8c9842753d..00460cb9587 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -425,7 +425,7 @@ static void asus_rfkill_hotplug(struct asus_wmi *asus)
 	if (asus->hotplug_slot) {
 		bus = pci_find_bus(0, 1);
 		if (!bus) {
-			pr_warning("Unable to find PCI bus 1?\n");
+			pr_warn("Unable to find PCI bus 1?\n");
 			goto out_unlock;
 		}
 
@@ -436,12 +436,12 @@ static void asus_rfkill_hotplug(struct asus_wmi *asus)
 		absent = (l == 0xffffffff);
 
 		if (blocked != absent) {
-			pr_warning("BIOS says wireless lan is %s, "
-				   "but the pci device is %s\n",
-				   blocked ? "blocked" : "unblocked",
-				   absent ? "absent" : "present");
-			pr_warning("skipped wireless hotplug as probably "
-				   "inappropriate for this model\n");
+			pr_warn("BIOS says wireless lan is %s, "
+				"but the pci device is %s\n",
+				blocked ? "blocked" : "unblocked",
+				absent ? "absent" : "present");
+			pr_warn("skipped wireless hotplug as probably "
+				"inappropriate for this model\n");
 			goto out_unlock;
 		}
 
@@ -500,7 +500,7 @@ static int asus_register_rfkill_notifier(struct asus_wmi *asus, char *node)
 						     ACPI_SYSTEM_NOTIFY,
 						     asus_rfkill_notify, asus);
 		if (ACPI_FAILURE(status))
-			pr_warning("Failed to register notify on %s\n", node);
+			pr_warn("Failed to register notify on %s\n", node);
 	} else
 		return -ENODEV;
 
@@ -1583,12 +1583,12 @@ static int asus_wmi_probe(struct platform_device *pdev)
 	int ret;
 
 	if (!wmi_has_guid(ASUS_WMI_MGMT_GUID)) {
-		pr_warning("Management GUID not found\n");
+		pr_warn("Management GUID not found\n");
 		return -ENODEV;
 	}
 
 	if (wdrv->event_guid && !wmi_has_guid(wdrv->event_guid)) {
-		pr_warning("Event GUID not found\n");
+		pr_warn("Event GUID not found\n");
 		return -ENODEV;
 	}
 
diff --git a/drivers/platform/x86/asus_acpi.c b/drivers/platform/x86/asus_acpi.c
index f503607c064..d9312b3073e 100644
--- a/drivers/platform/x86/asus_acpi.c
+++ b/drivers/platform/x86/asus_acpi.c
@@ -30,6 +30,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -581,8 +583,7 @@ static int read_led(const char *ledname, int ledmask)
 		if (read_acpi_int(NULL, ledname, &led_status))
 			return led_status;
 		else
-			printk(KERN_WARNING "Asus ACPI: Error reading LED "
-			       "status\n");
+			pr_warn("Error reading LED status\n");
 	}
 	return (hotk->status & ledmask) ? 1 : 0;
 }
@@ -621,8 +622,7 @@ write_led(const char __user *buffer, unsigned long count,
 		led_out = !led_out;
 
 	if (!write_acpi_int(hotk->handle, ledname, led_out, NULL))
-		printk(KERN_WARNING "Asus ACPI: LED (%s) write failed\n",
-		       ledname);
+		pr_warn("LED (%s) write failed\n", ledname);
 
 	return rv;
 }
@@ -679,8 +679,7 @@ static ssize_t ledd_proc_write(struct file *file, const char __user *buffer,
 	if (rv > 0) {
 		if (!write_acpi_int
 		    (hotk->handle, hotk->methods->mt_ledd, value, NULL))
-			printk(KERN_WARNING
-			       "Asus ACPI: LED display write failed\n");
+			pr_warn("LED display write failed\n");
 		else
 			hotk->ledd_status = (u32) value;
 	}
@@ -838,8 +837,7 @@ static int get_lcd_state(void)
 	} else {
 		/* We don't have to check anything if we are here */
 		if (!read_acpi_int(NULL, hotk->methods->lcd_status, &lcd))
-			printk(KERN_WARNING
-			       "Asus ACPI: Error reading LCD status\n");
+			pr_warn("Error reading LCD status\n");
 
 		if (hotk->model == L2D)
 			lcd = ~lcd;
@@ -871,7 +869,7 @@ static int set_lcd_state(int value)
 			   the exact behaviour is simulated here */
 		}
 		if (ACPI_FAILURE(status))
-			printk(KERN_WARNING "Asus ACPI: Error switching LCD\n");
+			pr_warn("Error switching LCD\n");
 	}
 	return 0;
 
@@ -915,13 +913,11 @@ static int read_brightness(struct backlight_device *bd)
 	if (hotk->methods->brightness_get) {	/* SPLV/GPLV laptop */
 		if (!read_acpi_int(hotk->handle, hotk->methods->brightness_get,
 				   &value))
-			printk(KERN_WARNING
-			       "Asus ACPI: Error reading brightness\n");
+			pr_warn("Error reading brightness\n");
 	} else if (hotk->methods->brightness_status) {	/* For D1 for example */
 		if (!read_acpi_int(NULL, hotk->methods->brightness_status,
 				   &value))
-			printk(KERN_WARNING
-			       "Asus ACPI: Error reading brightness\n");
+			pr_warn("Error reading brightness\n");
 	} else			/* No GPLV method */
 		value = hotk->brightness;
 	return value;
@@ -939,8 +935,7 @@ static int set_brightness(int value)
 	if (hotk->methods->brightness_set) {
 		if (!write_acpi_int(hotk->handle, hotk->methods->brightness_set,
 				    value, NULL)) {
-			printk(KERN_WARNING
-			       "Asus ACPI: Error changing brightness\n");
+			pr_warn("Error changing brightness\n");
 			ret = -EIO;
 		}
 		goto out;
@@ -955,8 +950,7 @@ static int set_brightness(int value)
 					      NULL, NULL);
 		(value > 0) ? value-- : value++;
 		if (ACPI_FAILURE(status)) {
-			printk(KERN_WARNING
-			       "Asus ACPI: Error changing brightness\n");
+			pr_warn("Error changing brightness\n");
 			ret = -EIO;
 		}
 	}
@@ -1008,7 +1002,7 @@ static void set_display(int value)
 	/* no sanity check needed for now */
 	if (!write_acpi_int(hotk->handle, hotk->methods->display_set,
 			    value, NULL))
-		printk(KERN_WARNING "Asus ACPI: Error setting display\n");
+		pr_warn("Error setting display\n");
 	return;
 }
 
@@ -1021,8 +1015,7 @@ static int disp_proc_show(struct seq_file *m, void *v)
 	int value = 0;
 
 	if (!read_acpi_int(hotk->handle, hotk->methods->display_get, &value))
-		printk(KERN_WARNING
-		       "Asus ACPI: Error reading display status\n");
+		pr_warn("Error reading display status\n");
 	value &= 0x07;	/* needed for some models, shouldn't hurt others */
 	seq_printf(m, "%d\n", value);
 	return 0;
@@ -1068,7 +1061,7 @@ asus_proc_add(char *name, const struct file_operations *proc_fops, mode_t mode,
 	proc = proc_create_data(name, mode, acpi_device_dir(device),
 				proc_fops, acpi_driver_data(device));
 	if (!proc) {
-		printk(KERN_WARNING "  Unable to create %s fs entry\n", name);
+		pr_warn("  Unable to create %s fs entry\n", name);
 		return -1;
 	}
 	proc->uid = asus_uid;
@@ -1085,8 +1078,8 @@ static int asus_hotk_add_fs(struct acpi_device *device)
 		mode = S_IFREG | S_IRUGO | S_IWUSR | S_IWGRP;
 	} else {
 		mode = S_IFREG | S_IRUSR | S_IRGRP | S_IWUSR | S_IWGRP;
-		printk(KERN_WARNING "  asus_uid and asus_gid parameters are "
-		       "deprecated, use chown and chmod instead!\n");
+		pr_warn("  asus_uid and asus_gid parameters are "
+			"deprecated, use chown and chmod instead!\n");
 	}
 
 	acpi_device_dir(device) = asus_proc_dir;
@@ -1099,8 +1092,7 @@ static int asus_hotk_add_fs(struct acpi_device *device)
 		proc->uid = asus_uid;
 		proc->gid = asus_gid;
 	} else {
-		printk(KERN_WARNING "  Unable to create " PROC_INFO
-		       " fs entry\n");
+		pr_warn("  Unable to create " PROC_INFO " fs entry\n");
 	}
 
 	if (hotk->methods->mt_wled) {
@@ -1283,20 +1275,19 @@ static int asus_hotk_get_info(void)
 	 */
 	status = acpi_get_table(ACPI_SIG_DSDT, 1, &asus_info);
 	if (ACPI_FAILURE(status))
-		printk(KERN_WARNING "  Couldn't get the DSDT table header\n");
+		pr_warn("  Couldn't get the DSDT table header\n");
 
 	/* We have to write 0 on init this far for all ASUS models */
 	if (!write_acpi_int(hotk->handle, "INIT", 0, &buffer)) {
-		printk(KERN_ERR "  Hotkey initialization failed\n");
+		pr_err("  Hotkey initialization failed\n");
 		return -ENODEV;
 	}
 
 	/* This needs to be called for some laptops to init properly */
 	if (!read_acpi_int(hotk->handle, "BSTS", &bsts_result))
-		printk(KERN_WARNING "  Error calling BSTS\n");
+		pr_warn("  Error calling BSTS\n");
 	else if (bsts_result)
-		printk(KERN_NOTICE "  BSTS called, 0x%02x returned\n",
-		       bsts_result);
+		pr_notice("  BSTS called, 0x%02x returned\n", bsts_result);
 
 	/*
 	 * Try to match the object returned by INIT to the specific model.
@@ -1324,23 +1315,21 @@ static int asus_hotk_get_info(void)
 		if (asus_info &&
 		    strncmp(asus_info->oem_table_id, "ODEM", 4) == 0) {
 			hotk->model = P30;
-			printk(KERN_NOTICE
-			       "  Samsung P30 detected, supported\n");
+			pr_notice("  Samsung P30 detected, supported\n");
 			hotk->methods = &model_conf[hotk->model];
 			kfree(model);
 			return 0;
 		} else {
 			hotk->model = M2E;
-			printk(KERN_NOTICE "  unsupported model %s, trying "
-			       "default values\n", string);
-			printk(KERN_NOTICE
-			       "  send /proc/acpi/dsdt to the developers\n");
+			pr_notice("  unsupported model %s, trying default values\n",
+				  string);
+			pr_notice("  send /proc/acpi/dsdt to the developers\n");
 			kfree(model);
 			return -ENODEV;
 		}
 	}
 	hotk->methods = &model_conf[hotk->model];
-	printk(KERN_NOTICE "  %s model detected, supported\n", string);
+	pr_notice("  %s model detected, supported\n", string);
 
 	/* Sort of per-model blacklist */
 	if (strncmp(string, "L2B", 3) == 0)
@@ -1385,7 +1374,7 @@ static int asus_hotk_check(void)
 	if (hotk->device->status.present) {
 		result = asus_hotk_get_info();
 	} else {
-		printk(KERN_ERR "  Hotkey device not present, aborting\n");
+		pr_err("  Hotkey device not present, aborting\n");
 		return -EINVAL;
 	}
 
@@ -1399,8 +1388,7 @@ static int asus_hotk_add(struct acpi_device *device)
 	acpi_status status = AE_OK;
 	int result;
 
-	printk(KERN_NOTICE "Asus Laptop ACPI Extras version %s\n",
-	       ASUS_ACPI_VERSION);
+	pr_notice("Asus Laptop ACPI Extras version %s\n", ASUS_ACPI_VERSION);
 
 	hotk = kzalloc(sizeof(struct asus_hotk), GFP_KERNEL);
 	if (!hotk)
@@ -1428,15 +1416,14 @@ static int asus_hotk_add(struct acpi_device *device)
 		    acpi_evaluate_object(NULL, hotk->methods->brightness_down,
 					 NULL, NULL);
 		if (ACPI_FAILURE(status))
-			printk(KERN_WARNING "  Error changing brightness\n");
+			pr_warn("  Error changing brightness\n");
 		else {
 			status =
 			    acpi_evaluate_object(NULL,
 						 hotk->methods->brightness_up,
 						 NULL, NULL);
 			if (ACPI_FAILURE(status))
-				printk(KERN_WARNING "  Strange, error changing"
-				       " brightness\n");
+				pr_warn("  Strange, error changing brightness\n");
 		}
 	}
 
@@ -1488,7 +1475,7 @@ static int __init asus_acpi_init(void)
 
 	asus_proc_dir = proc_mkdir(PROC_ASUS, acpi_root_dir);
 	if (!asus_proc_dir) {
-		printk(KERN_ERR "Asus ACPI: Unable to create /proc entry\n");
+		pr_err("Unable to create /proc entry\n");
 		acpi_bus_unregister_driver(&asus_hotk_driver);
 		return -ENODEV;
 	}
@@ -1513,7 +1500,7 @@ static int __init asus_acpi_init(void)
 							  &asus_backlight_data,
 							  &props);
 	if (IS_ERR(asus_backlight_device)) {
-		printk(KERN_ERR "Could not register asus backlight device\n");
+		pr_err("Could not register asus backlight device\n");
 		asus_backlight_device = NULL;
 		asus_acpi_exit();
 		return -ENODEV;
-- 
cgit v1.2.3-70-g09d2


From b4a4bc0bd1328b587a905e346d8494f9b9423cc5 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:36 -0700
Subject: compal-laptop: Convert printks to pr_<level>

Add pr_fmt.
Convert printks to pr_<level> removing DRIVER_NAME prefix.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/compal-laptop.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/drivers/platform/x86/compal-laptop.c b/drivers/platform/x86/compal-laptop.c
index c16a27641ce..4c4a7422c5e 100644
--- a/drivers/platform/x86/compal-laptop.c
+++ b/drivers/platform/x86/compal-laptop.c
@@ -68,6 +68,8 @@
  * only enabled on a JHL90 board until it is verified that they work on the
  * other boards too.  See the extra_features variable. */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -760,16 +762,14 @@ static struct rfkill *bt_rfkill;
 
 static int dmi_check_cb(const struct dmi_system_id *id)
 {
-	printk(KERN_INFO DRIVER_NAME": Identified laptop model '%s'\n",
-		id->ident);
+	pr_info("Identified laptop model '%s'\n", id->ident);
 	extra_features = false;
 	return 1;
 }
 
 static int dmi_check_cb_extra(const struct dmi_system_id *id)
 {
-	printk(KERN_INFO DRIVER_NAME": Identified laptop model '%s', "
-		"enabling extra features\n",
+	pr_info("Identified laptop model '%s', enabling extra features\n",
 		id->ident);
 	extra_features = true;
 	return 1;
@@ -956,14 +956,12 @@ static int __init compal_init(void)
 	int ret;
 
 	if (acpi_disabled) {
-		printk(KERN_ERR DRIVER_NAME": ACPI needs to be enabled for "
-						"this driver to work!\n");
+		pr_err("ACPI needs to be enabled for this driver to work!\n");
 		return -ENODEV;
 	}
 
 	if (!force && !dmi_check_system(compal_dmi_table)) {
-		printk(KERN_ERR DRIVER_NAME": Motherboard not recognized (You "
-				"could try the module's force-parameter)");
+		pr_err("Motherboard not recognized (You could try the module's force-parameter)\n");
 		return -ENODEV;
 	}
 
@@ -998,8 +996,7 @@ static int __init compal_init(void)
 	if (ret)
 		goto err_rfkill;
 
-	printk(KERN_INFO DRIVER_NAME": Driver "DRIVER_VERSION
-						" successfully loaded\n");
+	pr_info("Driver " DRIVER_VERSION " successfully loaded\n");
 	return 0;
 
 err_rfkill:
@@ -1064,7 +1061,7 @@ static void __exit compal_cleanup(void)
 	rfkill_destroy(wifi_rfkill);
 	rfkill_destroy(bt_rfkill);
 
-	printk(KERN_INFO DRIVER_NAME": Driver unloaded\n");
+	pr_info("Driver unloaded\n");
 }
 
 static int __devexit compal_remove(struct platform_device *pdev)
@@ -1074,8 +1071,7 @@ static int __devexit compal_remove(struct platform_device *pdev)
 	if (!extra_features)
 		return 0;
 
-	printk(KERN_INFO DRIVER_NAME": Unloading: resetting fan control "
-							"to motherboard\n");
+	pr_info("Unloading: resetting fan control to motherboard\n");
 	pwm_disable_control();
 
 	data = platform_get_drvdata(pdev);
-- 
cgit v1.2.3-70-g09d2


From eb8895241dfb6c26114928b186cc1810cbd57f1b Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:37 -0700
Subject: dell: Convert printks to pr_<level>

Add pr_fmt.
Remove hard coded prefixes and use pr_<level>.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/dell-laptop.c  | 12 ++++++------
 drivers/platform/x86/dell-wmi-aio.c |  3 ++-
 drivers/platform/x86/dell-wmi.c     | 17 ++++++++---------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/platform/x86/dell-laptop.c b/drivers/platform/x86/dell-laptop.c
index de301aa8e5c..d3841de6a8c 100644
--- a/drivers/platform/x86/dell-laptop.c
+++ b/drivers/platform/x86/dell-laptop.c
@@ -11,6 +11,8 @@
  *  published by the Free Software Foundation.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -434,8 +436,7 @@ static int __init dell_setup_rfkill(void)
 	int ret;
 
 	if (dmi_check_system(dell_blacklist)) {
-		printk(KERN_INFO "dell-laptop: Blacklisted hardware detected - "
-				"not enabling rfkill\n");
+		pr_info("Blacklisted hardware detected - not enabling rfkill\n");
 		return 0;
 	}
 
@@ -606,7 +607,7 @@ static int __init dell_init(void)
 	dmi_walk(find_tokens, NULL);
 
 	if (!da_tokens)  {
-		printk(KERN_INFO "dell-laptop: Unable to find dmi tokens\n");
+		pr_info("Unable to find dmi tokens\n");
 		return -ENODEV;
 	}
 
@@ -636,14 +637,13 @@ static int __init dell_init(void)
 	ret = dell_setup_rfkill();
 
 	if (ret) {
-		printk(KERN_WARNING "dell-laptop: Unable to setup rfkill\n");
+		pr_warn("Unable to setup rfkill\n");
 		goto fail_rfkill;
 	}
 
 	ret = i8042_install_filter(dell_laptop_i8042_filter);
 	if (ret) {
-		printk(KERN_WARNING
-		       "dell-laptop: Unable to install key filter\n");
+		pr_warn("Unable to install key filter\n");
 		goto fail_filter;
 	}
 
diff --git a/drivers/platform/x86/dell-wmi-aio.c b/drivers/platform/x86/dell-wmi-aio.c
index 0ed84573ae1..3f945457f71 100644
--- a/drivers/platform/x86/dell-wmi-aio.c
+++ b/drivers/platform/x86/dell-wmi-aio.c
@@ -15,6 +15,7 @@
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
+
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/kernel.h>
@@ -138,7 +139,7 @@ static int __init dell_wmi_aio_init(void)
 
 	guid = dell_wmi_aio_find();
 	if (!guid) {
-		pr_warning("No known WMI GUID found\n");
+		pr_warn("No known WMI GUID found\n");
 		return -ENXIO;
 	}
 
diff --git a/drivers/platform/x86/dell-wmi.c b/drivers/platform/x86/dell-wmi.c
index 77f1d55414c..ce790827e19 100644
--- a/drivers/platform/x86/dell-wmi.c
+++ b/drivers/platform/x86/dell-wmi.c
@@ -23,6 +23,8 @@
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -141,7 +143,7 @@ static void dell_wmi_notify(u32 value, void *context)
 
 	status = wmi_get_event_data(value, &response);
 	if (status != AE_OK) {
-		printk(KERN_INFO "dell-wmi: bad event status 0x%x\n", status);
+		pr_info("bad event status 0x%x\n", status);
 		return;
 	}
 
@@ -153,8 +155,8 @@ static void dell_wmi_notify(u32 value, void *context)
 		u16 *buffer_entry = (u16 *)obj->buffer.pointer;
 
 		if (dell_new_hk_type && (buffer_entry[1] != 0x10)) {
-			printk(KERN_INFO "dell-wmi: Received unknown WMI event"
-					 " (0x%x)\n", buffer_entry[1]);
+			pr_info("Received unknown WMI event (0x%x)\n",
+				buffer_entry[1]);
 			kfree(obj);
 			return;
 		}
@@ -167,8 +169,7 @@ static void dell_wmi_notify(u32 value, void *context)
 		key = sparse_keymap_entry_from_scancode(dell_wmi_input_dev,
 							reported_key);
 		if (!key) {
-			printk(KERN_INFO "dell-wmi: Unknown key %x pressed\n",
-				reported_key);
+			pr_info("Unknown key %x pressed\n", reported_key);
 		} else if ((key->keycode == KEY_BRIGHTNESSUP ||
 			    key->keycode == KEY_BRIGHTNESSDOWN) && acpi_video) {
 			/* Don't report brightness notifications that will also
@@ -275,7 +276,7 @@ static int __init dell_wmi_init(void)
 	acpi_status status;
 
 	if (!wmi_has_guid(DELL_EVENT_GUID)) {
-		printk(KERN_WARNING "dell-wmi: No known WMI GUID found\n");
+		pr_warn("No known WMI GUID found\n");
 		return -ENODEV;
 	}
 
@@ -290,9 +291,7 @@ static int __init dell_wmi_init(void)
 					 dell_wmi_notify, NULL);
 	if (ACPI_FAILURE(status)) {
 		dell_wmi_input_destroy();
-		printk(KERN_ERR
-			"dell-wmi: Unable to register notify handler - %d\n",
-			status);
+		pr_err("Unable to register notify handler - %d\n", status);
 		return -ENODEV;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 22441ffeed17b94b28bd7c609f2cdb24d11e81f3 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:38 -0700
Subject: eeepc: Use pr_warn

Just a trivial pr_warning to pr_warn conversion
while adding a few missing newlines.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/eeepc-laptop.c | 21 +++++++++++----------
 drivers/platform/x86/eeepc-wmi.c    | 14 +++++++-------
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/drivers/platform/x86/eeepc-laptop.c b/drivers/platform/x86/eeepc-laptop.c
index 2c1abf63957..1c45d92e216 100644
--- a/drivers/platform/x86/eeepc-laptop.c
+++ b/drivers/platform/x86/eeepc-laptop.c
@@ -228,7 +228,7 @@ static int set_acpi(struct eeepc_laptop *eeepc, int cm, int value)
 		return -ENODEV;
 
 	if (write_acpi_int(eeepc->handle, method, value))
-		pr_warning("Error writing %s\n", method);
+		pr_warn("Error writing %s\n", method);
 	return 0;
 }
 
@@ -243,7 +243,7 @@ static int get_acpi(struct eeepc_laptop *eeepc, int cm)
 		return -ENODEV;
 
 	if (read_acpi_int(eeepc->handle, method, &value))
-		pr_warning("Error reading %s\n", method);
+		pr_warn("Error reading %s\n", method);
 	return value;
 }
 
@@ -261,7 +261,7 @@ static int acpi_setter_handle(struct eeepc_laptop *eeepc, int cm,
 	status = acpi_get_handle(eeepc->handle, (char *)method,
 				 handle);
 	if (status != AE_OK) {
-		pr_warning("Error finding %s\n", method);
+		pr_warn("Error finding %s\n", method);
 		return -ENODEV;
 	}
 	return 0;
@@ -417,7 +417,7 @@ static ssize_t store_cpufv_disabled(struct device *dev,
 	switch (value) {
 	case 0:
 		if (eeepc->cpufv_disabled)
-			pr_warning("cpufv enabled (not officially supported "
+			pr_warn("cpufv enabled (not officially supported "
 				"on this model)\n");
 		eeepc->cpufv_disabled = false;
 		return rv;
@@ -609,7 +609,7 @@ static void eeepc_rfkill_hotplug(struct eeepc_laptop *eeepc, acpi_handle handle)
 		bus = port->subordinate;
 
 		if (!bus) {
-			pr_warning("Unable to find PCI bus?\n");
+			pr_warn("Unable to find PCI bus 1?\n");
 			goto out_unlock;
 		}
 
@@ -621,12 +621,12 @@ static void eeepc_rfkill_hotplug(struct eeepc_laptop *eeepc, acpi_handle handle)
 		absent = (l == 0xffffffff);
 
 		if (blocked != absent) {
-			pr_warning("BIOS says wireless lan is %s, "
-					"but the pci device is %s\n",
+			pr_warn("BIOS says wireless lan is %s, "
+				"but the pci device is %s\n",
 				blocked ? "blocked" : "unblocked",
 				absent ? "absent" : "present");
-			pr_warning("skipped wireless hotplug as probably "
-					"inappropriate for this model\n");
+			pr_warn("skipped wireless hotplug as probably "
+				"inappropriate for this model\n");
 			goto out_unlock;
 		}
 
@@ -691,7 +691,8 @@ static int eeepc_register_rfkill_notifier(struct eeepc_laptop *eeepc,
 						     eeepc_rfkill_notify,
 						     eeepc);
 		if (ACPI_FAILURE(status))
-			pr_warning("Failed to register notify on %s\n", node);
+			pr_warn("Failed to register notify on %s\n", node);
+
 		/*
 		 * Refresh pci hotplug in case the rfkill state was
 		 * changed during setup.
diff --git a/drivers/platform/x86/eeepc-wmi.c b/drivers/platform/x86/eeepc-wmi.c
index 649dcadd8ea..4aa867a9b88 100644
--- a/drivers/platform/x86/eeepc-wmi.c
+++ b/drivers/platform/x86/eeepc-wmi.c
@@ -84,7 +84,7 @@ static const struct key_entry eeepc_wmi_keymap[] = {
 static acpi_status eeepc_wmi_parse_device(acpi_handle handle, u32 level,
 						 void *context, void **retval)
 {
-	pr_warning("Found legacy ATKD device (%s)", EEEPC_ACPI_HID);
+	pr_warn("Found legacy ATKD device (%s)\n", EEEPC_ACPI_HID);
 	*(bool *)context = true;
 	return AE_CTRL_TERMINATE;
 }
@@ -105,12 +105,12 @@ static int eeepc_wmi_check_atkd(void)
 static int eeepc_wmi_probe(struct platform_device *pdev)
 {
 	if (eeepc_wmi_check_atkd()) {
-		pr_warning("WMI device present, but legacy ATKD device is also "
-			   "present and enabled.");
-		pr_warning("You probably booted with acpi_osi=\"Linux\" or "
-			   "acpi_osi=\"!Windows 2009\"");
-		pr_warning("Can't load eeepc-wmi, use default acpi_osi "
-			   "(preferred) or eeepc-laptop");
+		pr_warn("WMI device present, but legacy ATKD device is also "
+			"present and enabled\n");
+		pr_warn("You probably booted with acpi_osi=\"Linux\" or "
+			"acpi_osi=\"!Windows 2009\"\n");
+		pr_warn("Can't load eeepc-wmi, use default acpi_osi "
+			"(preferred) or eeepc-laptop\n");
 		return -EBUSY;
 	}
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 77bad7c830e20085deba6a760ce85c13ecb12f4d Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:39 -0700
Subject: fujitsu-laptop: Convert printks to pr_<level>

Added pr_fmt, converted printks and removed
hard coded prefixes.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/fujitsu-laptop.c | 39 +++++++++++++++++------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/drivers/platform/x86/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c
index 493054c2dbe..6b26666b37f 100644
--- a/drivers/platform/x86/fujitsu-laptop.c
+++ b/drivers/platform/x86/fujitsu-laptop.c
@@ -56,6 +56,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -585,8 +587,7 @@ static struct platform_driver fujitsupf_driver = {
 static void dmi_check_cb_common(const struct dmi_system_id *id)
 {
 	acpi_handle handle;
-	printk(KERN_INFO "fujitsu-laptop: Identified laptop model '%s'.\n",
-	       id->ident);
+	pr_info("Identified laptop model '%s'\n", id->ident);
 	if (use_alt_lcd_levels == -1) {
 		if (ACPI_SUCCESS(acpi_get_handle(NULL,
 				"\\_SB.PCI0.LPCB.FJEX.SBL2", &handle)))
@@ -691,11 +692,11 @@ static int acpi_fujitsu_add(struct acpi_device *device)
 
 	result = acpi_bus_update_power(fujitsu->acpi_handle, &state);
 	if (result) {
-		printk(KERN_ERR "Error reading power state\n");
+		pr_err("Error reading power state\n");
 		goto err_unregister_input_dev;
 	}
 
-	printk(KERN_INFO "ACPI: %s [%s] (%s)\n",
+	pr_info("ACPI: %s [%s] (%s)\n",
 	       acpi_device_name(device), acpi_device_bid(device),
 	       !device->power.state ? "on" : "off");
 
@@ -707,7 +708,7 @@ static int acpi_fujitsu_add(struct acpi_device *device)
 		if (ACPI_FAILURE
 		    (acpi_evaluate_object
 		     (device->handle, METHOD_NAME__INI, NULL, NULL)))
-			printk(KERN_ERR "_INI Method failed\n");
+			pr_err("_INI Method failed\n");
 	}
 
 	/* do config (detect defaults) */
@@ -827,7 +828,7 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device)
 	error = kfifo_alloc(&fujitsu_hotkey->fifo, RINGBUFFERSIZE * sizeof(int),
 			GFP_KERNEL);
 	if (error) {
-		printk(KERN_ERR "kfifo_alloc failed\n");
+		pr_err("kfifo_alloc failed\n");
 		goto err_stop;
 	}
 
@@ -859,13 +860,13 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device)
 
 	result = acpi_bus_update_power(fujitsu_hotkey->acpi_handle, &state);
 	if (result) {
-		printk(KERN_ERR "Error reading power state\n");
+		pr_err("Error reading power state\n");
 		goto err_unregister_input_dev;
 	}
 
-	printk(KERN_INFO "ACPI: %s [%s] (%s)\n",
-	       acpi_device_name(device), acpi_device_bid(device),
-	       !device->power.state ? "on" : "off");
+	pr_info("ACPI: %s [%s] (%s)\n",
+		acpi_device_name(device), acpi_device_bid(device),
+		!device->power.state ? "on" : "off");
 
 	fujitsu_hotkey->dev = device;
 
@@ -875,7 +876,7 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device)
 		if (ACPI_FAILURE
 		    (acpi_evaluate_object
 		     (device->handle, METHOD_NAME__INI, NULL, NULL)))
-			printk(KERN_ERR "_INI Method failed\n");
+			pr_err("_INI Method failed\n");
 	}
 
 	i = 0;
@@ -897,8 +898,7 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device)
 			call_fext_func(FUNC_RFKILL, 0x4, 0x0, 0x0);
 
 	/* Suspect this is a keymap of the application panel, print it */
-	printk(KERN_INFO "fujitsu-laptop: BTNI: [0x%x]\n",
-		call_fext_func(FUNC_BUTTONS, 0x0, 0x0, 0x0));
+	pr_info("BTNI: [0x%x]\n", call_fext_func(FUNC_BUTTONS, 0x0, 0x0, 0x0));
 
 #if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
 	if (call_fext_func(FUNC_LEDS, 0x0, 0x0, 0x0) & LOGOLAMP_POWERON) {
@@ -907,8 +907,8 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device)
 		if (result == 0) {
 			fujitsu_hotkey->logolamp_registered = 1;
 		} else {
-			printk(KERN_ERR "fujitsu-laptop: Could not register "
-			"LED handler for logo lamp, error %i\n", result);
+			pr_err("Could not register LED handler for logo lamp, error %i\n",
+			       result);
 		}
 	}
 
@@ -919,8 +919,8 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device)
 		if (result == 0) {
 			fujitsu_hotkey->kblamps_registered = 1;
 		} else {
-			printk(KERN_ERR "fujitsu-laptop: Could not register "
-			"LED handler for keyboard lamps, error %i\n", result);
+			pr_err("Could not register LED handler for keyboard lamps, error %i\n",
+			       result);
 		}
 	}
 #endif
@@ -1169,8 +1169,7 @@ static int __init fujitsu_init(void)
 			fujitsu->bl_device->props.power = 0;
 	}
 
-	printk(KERN_INFO "fujitsu-laptop: driver " FUJITSU_DRIVER_VERSION
-	       " successfully loaded.\n");
+	pr_info("driver " FUJITSU_DRIVER_VERSION " successfully loaded\n");
 
 	return 0;
 
@@ -1216,7 +1215,7 @@ static void __exit fujitsu_cleanup(void)
 
 	kfree(fujitsu);
 
-	printk(KERN_INFO "fujitsu-laptop: driver unloaded.\n");
+	pr_info("driver unloaded\n");
 }
 
 module_init(fujitsu_init);
-- 
cgit v1.2.3-70-g09d2


From 611f5763fd0e38f896aa38346d4bc3f5c388b608 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:40 -0700
Subject: hdaps: Convert printks to pr_<level>

Added pr_fmt, converted printks and removed
hard coded prefixes.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/hdaps.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/platform/x86/hdaps.c b/drivers/platform/x86/hdaps.c
index 067bf36d32f..5a34973dc16 100644
--- a/drivers/platform/x86/hdaps.c
+++ b/drivers/platform/x86/hdaps.c
@@ -26,6 +26,8 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/delay.h>
 #include <linux/platform_device.h>
 #include <linux/input-polldev.h>
@@ -238,7 +240,7 @@ static int hdaps_device_init(void)
 		     __check_latch(0x1611, 0x01))
 		goto out;
 
-	printk(KERN_DEBUG "hdaps: initial latch check good (0x%02x).\n",
+	printk(KERN_DEBUG "hdaps: initial latch check good (0x%02x)\n",
 	       __get_latch(0x1611));
 
 	outb(0x17, 0x1610);
@@ -299,7 +301,7 @@ static int hdaps_probe(struct platform_device *dev)
 	if (ret)
 		return ret;
 
-	printk(KERN_INFO "hdaps: device successfully initialized.\n");
+	pr_info("device successfully initialized\n");
 	return 0;
 }
 
@@ -480,7 +482,7 @@ static struct attribute_group hdaps_attribute_group = {
 /* hdaps_dmi_match - found a match.  return one, short-circuiting the hunt. */
 static int __init hdaps_dmi_match(const struct dmi_system_id *id)
 {
-	printk(KERN_INFO "hdaps: %s detected.\n", id->ident);
+	pr_info("%s detected\n", id->ident);
 	return 1;
 }
 
@@ -488,8 +490,7 @@ static int __init hdaps_dmi_match(const struct dmi_system_id *id)
 static int __init hdaps_dmi_match_invert(const struct dmi_system_id *id)
 {
 	hdaps_invert = (unsigned long)id->driver_data;
-	printk(KERN_INFO "hdaps: inverting axis (%u) readings.\n",
-	       hdaps_invert);
+	pr_info("inverting axis (%u) readings\n", hdaps_invert);
 	return hdaps_dmi_match(id);
 }
 
@@ -543,7 +544,7 @@ static int __init hdaps_init(void)
 	int ret;
 
 	if (!dmi_check_system(hdaps_whitelist)) {
-		printk(KERN_WARNING "hdaps: supported laptop not found!\n");
+		pr_warn("supported laptop not found!\n");
 		ret = -ENODEV;
 		goto out;
 	}
@@ -595,7 +596,7 @@ static int __init hdaps_init(void)
 	if (ret)
 		goto out_idev;
 
-	printk(KERN_INFO "hdaps: driver successfully loaded.\n");
+	pr_info("driver successfully loaded\n");
 	return 0;
 
 out_idev:
@@ -609,7 +610,7 @@ out_driver:
 out_region:
 	release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS);
 out:
-	printk(KERN_WARNING "hdaps: driver init failed (ret=%d)!\n", ret);
+	pr_warn("driver init failed (ret=%d)!\n", ret);
 	return ret;
 }
 
@@ -622,7 +623,7 @@ static void __exit hdaps_exit(void)
 	platform_driver_unregister(&hdaps_driver);
 	release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS);
 
-	printk(KERN_INFO "hdaps: driver unloaded.\n");
+	pr_info("driver unloaded\n");
 }
 
 module_init(hdaps_init);
-- 
cgit v1.2.3-70-g09d2


From b5a4223c1ce5edbaae3db9494de8dcb2d08a755b Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:41 -0700
Subject: hp-wmi: Convert printks to pr_<level>

Added pr_fmt and converted printks to pr_<level>.
Removed now unused PREFIX and UNIMPL #defines.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/hp-wmi.c | 43 +++++++++++++++++--------------------------
 1 file changed, 17 insertions(+), 26 deletions(-)

diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c
index 1bc4a7539ba..f94017bcdd6 100644
--- a/drivers/platform/x86/hp-wmi.c
+++ b/drivers/platform/x86/hp-wmi.c
@@ -24,6 +24,8 @@
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -54,9 +56,6 @@ MODULE_ALIAS("wmi:5FB7F034-2C63-45e9-BE91-3D44E2C707E4");
 #define HPWMI_HOTKEY_QUERY 0xc
 #define HPWMI_WIRELESS2_QUERY 0x1b
 
-#define PREFIX "HP WMI: "
-#define UNIMP "Unimplemented "
-
 enum hp_wmi_radio {
 	HPWMI_WIFI = 0,
 	HPWMI_BLUETOOTH = 1,
@@ -228,9 +227,8 @@ static int hp_wmi_perform_query(int query, int write, void *buffer,
 
 	if (bios_return->return_code) {
 		if (bios_return->return_code != HPWMI_RET_UNKNOWN_CMDTYPE)
-			printk(KERN_WARNING PREFIX "query 0x%x returned "
-						   "error 0x%x\n",
-			       query, bios_return->return_code);
+			pr_warn("query 0x%x returned error 0x%x\n",
+				query, bios_return->return_code);
 		kfree(obj);
 		return bios_return->return_code;
 	}
@@ -384,8 +382,7 @@ static int hp_wmi_rfkill2_refresh(void)
 
 		if (num >= state.count ||
 		    devstate->rfkill_id != rfkill2[i].id) {
-			printk(KERN_WARNING PREFIX "power configuration of "
-			       "the wireless devices unexpectedly changed\n");
+			pr_warn("power configuration of the wireless devices unexpectedly changed\n");
 			continue;
 		}
 
@@ -471,7 +468,7 @@ static void hp_wmi_notify(u32 value, void *context)
 
 	status = wmi_get_event_data(value, &response);
 	if (status != AE_OK) {
-		printk(KERN_INFO PREFIX "bad event status 0x%x\n", status);
+		pr_info("bad event status 0x%x\n", status);
 		return;
 	}
 
@@ -480,8 +477,7 @@ static void hp_wmi_notify(u32 value, void *context)
 	if (!obj)
 		return;
 	if (obj->type != ACPI_TYPE_BUFFER) {
-		printk(KERN_INFO "hp-wmi: Unknown response received %d\n",
-		       obj->type);
+		pr_info("Unknown response received %d\n", obj->type);
 		kfree(obj);
 		return;
 	}
@@ -498,8 +494,7 @@ static void hp_wmi_notify(u32 value, void *context)
 		event_id = *location;
 		event_data = *(location + 2);
 	} else {
-		printk(KERN_INFO "hp-wmi: Unknown buffer length %d\n",
-		       obj->buffer.length);
+		pr_info("Unknown buffer length %d\n", obj->buffer.length);
 		kfree(obj);
 		return;
 	}
@@ -527,8 +522,7 @@ static void hp_wmi_notify(u32 value, void *context)
 
 		if (!sparse_keymap_report_event(hp_wmi_input_dev,
 						key_code, 1, true))
-			printk(KERN_INFO PREFIX "Unknown key code - 0x%x\n",
-			       key_code);
+			pr_info("Unknown key code - 0x%x\n", key_code);
 		break;
 	case HPWMI_WIRELESS:
 		if (rfkill2_count) {
@@ -550,14 +544,12 @@ static void hp_wmi_notify(u32 value, void *context)
 					  hp_wmi_get_hw_state(HPWMI_WWAN));
 		break;
 	case HPWMI_CPU_BATTERY_THROTTLE:
-		printk(KERN_INFO PREFIX UNIMP "CPU throttle because of 3 Cell"
-		       " battery event detected\n");
+		pr_info("Unimplemented CPU throttle because of 3 Cell battery event detected\n");
 		break;
 	case HPWMI_LOCK_SWITCH:
 		break;
 	default:
-		printk(KERN_INFO PREFIX "Unknown event_id - %d - 0x%x\n",
-		       event_id, event_data);
+		pr_info("Unknown event_id - %d - 0x%x\n", event_id, event_data);
 		break;
 	}
 }
@@ -705,7 +697,7 @@ static int __devinit hp_wmi_rfkill2_setup(struct platform_device *device)
 		return err;
 
 	if (state.count > HPWMI_MAX_RFKILL2_DEVICES) {
-		printk(KERN_WARNING PREFIX "unable to parse 0x1b query output\n");
+		pr_warn("unable to parse 0x1b query output\n");
 		return -EINVAL;
 	}
 
@@ -727,14 +719,14 @@ static int __devinit hp_wmi_rfkill2_setup(struct platform_device *device)
 			name = "hp-wwan";
 			break;
 		default:
-			printk(KERN_WARNING PREFIX "unknown device type 0x%x\n",
-				 state.device[i].radio_type);
+			pr_warn("unknown device type 0x%x\n",
+				state.device[i].radio_type);
 			continue;
 		}
 
 		if (!state.device[i].vendor_id) {
-			printk(KERN_WARNING PREFIX "zero device %d while %d "
-			       "reported\n", i, state.count);
+			pr_warn("zero device %d while %d reported\n",
+				i, state.count);
 			continue;
 		}
 
@@ -755,8 +747,7 @@ static int __devinit hp_wmi_rfkill2_setup(struct platform_device *device)
 				    IS_HWBLOCKED(state.device[i].power));
 
 		if (!(state.device[i].power & HPWMI_POWER_BIOS))
-			printk(KERN_INFO PREFIX "device %s blocked by BIOS\n",
-			       name);
+			pr_info("device %s blocked by BIOS\n", name);
 
 		err = rfkill_register(rfkill);
 		if (err) {
-- 
cgit v1.2.3-70-g09d2


From 323623a717c7feec3f36099f4caa18b3319c5cca Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:42 -0700
Subject: ibm_rtl: Use pr_fmt and pr_<level>

Remove hard coded prefixes from logging messages.
Neaten RTL_DEBUG macro and uses.
Convert __FUNCTION__ to __func__.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/ibm_rtl.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/platform/x86/ibm_rtl.c b/drivers/platform/x86/ibm_rtl.c
index 10563458408..811d436cd67 100644
--- a/drivers/platform/x86/ibm_rtl.c
+++ b/drivers/platform/x86/ibm_rtl.c
@@ -22,6 +22,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/module.h>
@@ -69,9 +71,10 @@ struct ibm_rtl_table {
 #define RTL_SIGNATURE 0x0000005f4c54525fULL
 #define RTL_MASK      0x000000ffffffffffULL
 
-#define RTL_DEBUG(A, ...) do { \
-	if (debug) \
-		pr_info("ibm-rtl: " A, ##__VA_ARGS__ ); \
+#define RTL_DEBUG(fmt, ...)				\
+do {							\
+	if (debug)					\
+		pr_info(fmt, ##__VA_ARGS__);		\
 } while (0)
 
 static DEFINE_MUTEX(rtl_lock);
@@ -114,7 +117,7 @@ static int ibm_rtl_write(u8 value)
 	int ret = 0, count = 0;
 	static u32 cmd_port_val;
 
-	RTL_DEBUG("%s(%d)\n", __FUNCTION__, value);
+	RTL_DEBUG("%s(%d)\n", __func__, value);
 
 	value = value == 1 ? RTL_CMD_ENTER_PRTM : RTL_CMD_EXIT_PRTM;
 
@@ -144,8 +147,8 @@ static int ibm_rtl_write(u8 value)
 		while (ioread8(&rtl_table->command)) {
 			msleep(10);
 			if (count++ > 500) {
-				pr_err("ibm-rtl: Hardware not responding to "
-					"mode switch request\n");
+				pr_err("Hardware not responding to "
+				       "mode switch request\n");
 				ret = -EIO;
 				break;
 			}
@@ -250,7 +253,7 @@ static int __init ibm_rtl_init(void) {
 	int ret = -ENODEV, i;
 
 	if (force)
-		pr_warning("ibm-rtl: module loaded by force\n");
+		pr_warn("module loaded by force\n");
 	/* first ensure that we are running on IBM HW */
 	else if (efi_enabled || !dmi_check_system(ibm_rtl_dmi_table))
 		return -ENODEV;
@@ -295,7 +298,7 @@ static int __init ibm_rtl_init(void) {
 			rtl_cmd_width = ioread8(&rtl_table->cmd_granularity);
 			rtl_cmd_type = ioread8(&rtl_table->cmd_address_type);
 			RTL_DEBUG("rtl_cmd_width = %u, rtl_cmd_type = %u\n",
-			      rtl_cmd_width, rtl_cmd_type);
+				  rtl_cmd_width, rtl_cmd_type);
 			addr = ioread32(&rtl_table->cmd_port_address);
 			RTL_DEBUG("addr = %#llx\n", (unsigned long long)addr);
 			plen = rtl_cmd_width/sizeof(char);
-- 
cgit v1.2.3-70-g09d2


From 9ab23989d3e774f4545c83eeab0137e35f7115be Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:43 -0700
Subject: ideapad-laptop: Add pr_fmt

Add pr_fmt to prefix logging messages.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/ideapad-laptop.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c
index 21b101899ba..bfdda33feb2 100644
--- a/drivers/platform/x86/ideapad-laptop.c
+++ b/drivers/platform/x86/ideapad-laptop.c
@@ -20,6 +20,8 @@
  *  02110-1301, USA.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-- 
cgit v1.2.3-70-g09d2


From 4686f6df69c5bca5ea795e01048d32f6eed07ba7 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:44 -0700
Subject: intel_menlow: Add pr_fmt and use pr_<level>

Add pr_fmt to prefix the logging messages.
Convert printk to pr_<level>.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/intel_menlow.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/platform/x86/intel_menlow.c b/drivers/platform/x86/intel_menlow.c
index eacd5da7dd2..809adea4965 100644
--- a/drivers/platform/x86/intel_menlow.c
+++ b/drivers/platform/x86/intel_menlow.c
@@ -27,6 +27,8 @@
  *  to get/set bandwidth.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -135,8 +137,7 @@ static int memory_set_cur_bandwidth(struct thermal_cooling_device *cdev,
 	    acpi_evaluate_integer(handle, MEMORY_SET_BANDWIDTH, &arg_list,
 				  &temp);
 
-	printk(KERN_INFO
-	       "Bandwidth value was %ld: status is %d\n", state, status);
+	pr_info("Bandwidth value was %ld: status is %d\n", state, status);
 	if (ACPI_FAILURE(status))
 		return -EFAULT;
 
-- 
cgit v1.2.3-70-g09d2


From 9a2ffd168ef8ec3705060ffb21d6dcb20eeb9506 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:45 -0700
Subject: intel_pmic_gpio: Convert printks to pr_<level>

Add #define pr_fmt(fmt) "%s: " fmt, __func__
to prefix function name to each output message.
Convert printks to pr_<level>.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/intel_pmic_gpio.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/platform/x86/intel_pmic_gpio.c b/drivers/platform/x86/intel_pmic_gpio.c
index 464bb3fc4d8..1686c1e07d5 100644
--- a/drivers/platform/x86/intel_pmic_gpio.c
+++ b/drivers/platform/x86/intel_pmic_gpio.c
@@ -19,6 +19,8 @@
  * Moorestown platform PMIC chip
  */
 
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
@@ -90,8 +92,7 @@ static void pmic_program_irqtype(int gpio, int type)
 static int pmic_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
 {
 	if (offset > 8) {
-		printk(KERN_ERR
-			"%s: only pin 0-7 support input\n", __func__);
+		pr_err("only pin 0-7 support input\n");
 		return -1;/* we only have 8 GPIO can use as input */
 	}
 	return intel_scu_ipc_update_register(GPIO0 + offset,
@@ -116,8 +117,7 @@ static int pmic_gpio_direction_output(struct gpio_chip *chip,
 				value ? 1 << (offset - 16) : 0,
 				1 << (offset - 16));
 	else {
-		printk(KERN_ERR
-			"%s: invalid PMIC GPIO pin %d!\n", __func__, offset);
+		pr_err("invalid PMIC GPIO pin %d!\n", offset);
 		WARN_ON(1);
 	}
 
@@ -260,7 +260,7 @@ static int __devinit platform_pmic_gpio_probe(struct platform_device *pdev)
 	/* setting up SRAM mapping for GPIOINT register */
 	pg->gpiointr = ioremap_nocache(pdata->gpiointr, 8);
 	if (!pg->gpiointr) {
-		printk(KERN_ERR "%s: Can not map GPIOINT.\n", __func__);
+		pr_err("Can not map GPIOINT\n");
 		retval = -EINVAL;
 		goto err2;
 	}
@@ -281,13 +281,13 @@ static int __devinit platform_pmic_gpio_probe(struct platform_device *pdev)
 	pg->chip.dev = dev;
 	retval = gpiochip_add(&pg->chip);
 	if (retval) {
-		printk(KERN_ERR "%s: Can not add pmic gpio chip.\n", __func__);
+		pr_err("Can not add pmic gpio chip\n");
 		goto err;
 	}
 
 	retval = request_irq(pg->irq, pmic_irq_handler, 0, "pmic", pg);
 	if (retval) {
-		printk(KERN_WARNING "pmic: Interrupt request failed\n");
+		pr_warn("Interrupt request failed\n");
 		goto err;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From f9dcf192ed527fbf2f3a8a3574006d1ae855100b Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:46 -0700
Subject: msi-laptop: pr_<level> neatening

Just making it a bit more like other logging message uses.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/msi-laptop.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/platform/x86/msi-laptop.c b/drivers/platform/x86/msi-laptop.c
index 23fb2afda00..f89c0b6a9ae 100644
--- a/drivers/platform/x86/msi-laptop.c
+++ b/drivers/platform/x86/msi-laptop.c
@@ -447,7 +447,7 @@ static struct platform_device *msipf_device;
 
 static int dmi_check_cb(const struct dmi_system_id *id)
 {
-	pr_info("Identified laptop model '%s'.\n", id->ident);
+	pr_info("Identified laptop model '%s'\n", id->ident);
 	return 1;
 }
 
@@ -875,8 +875,7 @@ static int __init msi_init(void)
 	/* Register backlight stuff */
 
 	if (acpi_video_backlight_support()) {
-		pr_info("Brightness ignored, must be controlled "
-		       "by ACPI video driver\n");
+		pr_info("Brightness ignored, must be controlled by ACPI video driver\n");
 	} else {
 		struct backlight_properties props;
 		memset(&props, 0, sizeof(struct backlight_properties));
@@ -930,7 +929,7 @@ static int __init msi_init(void)
 	if (auto_brightness != 2)
 		set_auto_brightness(auto_brightness);
 
-	pr_info("driver "MSI_DRIVER_VERSION" successfully loaded.\n");
+	pr_info("driver " MSI_DRIVER_VERSION " successfully loaded\n");
 
 	return 0;
 
@@ -978,7 +977,7 @@ static void __exit msi_cleanup(void)
 	if (auto_brightness != 2)
 		set_auto_brightness(1);
 
-	pr_info("driver unloaded.\n");
+	pr_info("driver unloaded\n");
 }
 
 module_init(msi_init);
-- 
cgit v1.2.3-70-g09d2


From dd3c7f2308e91d6ecbe5cd8a3310bb49e47e1ae7 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:47 -0700
Subject: msi-wmi: Use pr_fmt and pr_<level>

Added pr_fmt.
Removed now unused #define DRV_PFX
Convert dprintk to pr_debug.
Convert printks to pr_<level>.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/msi-wmi.c | 45 +++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/drivers/platform/x86/msi-wmi.c b/drivers/platform/x86/msi-wmi.c
index d5419c9ec07..c832e3356cd 100644
--- a/drivers/platform/x86/msi-wmi.c
+++ b/drivers/platform/x86/msi-wmi.c
@@ -20,6 +20,7 @@
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/kernel.h>
 #include <linux/input.h>
@@ -36,13 +37,10 @@ MODULE_ALIAS("wmi:551A1F84-FBDD-4125-91DB-3EA8F44F1D45");
 MODULE_ALIAS("wmi:B6F3EEF2-3D2F-49DC-9DE3-85BCE18C62F2");
 
 #define DRV_NAME "msi-wmi"
-#define DRV_PFX DRV_NAME ": "
 
 #define MSIWMI_BIOS_GUID "551A1F84-FBDD-4125-91DB-3EA8F44F1D45"
 #define MSIWMI_EVENT_GUID "B6F3EEF2-3D2F-49DC-9DE3-85BCE18C62F2"
 
-#define dprintk(msg...) pr_debug(DRV_PFX msg)
-
 #define SCANCODE_BASE 0xD0
 #define MSI_WMI_BRIGHTNESSUP   SCANCODE_BASE
 #define MSI_WMI_BRIGHTNESSDOWN (SCANCODE_BASE + 1)
@@ -78,7 +76,7 @@ static int msi_wmi_query_block(int instance, int *ret)
 
 	if (!obj || obj->type != ACPI_TYPE_INTEGER) {
 		if (obj) {
-			printk(KERN_ERR DRV_PFX "query block returned object "
+			pr_err("query block returned object "
 			       "type: %d - buffer length:%d\n", obj->type,
 			       obj->type == ACPI_TYPE_BUFFER ?
 			       obj->buffer.length : 0);
@@ -97,8 +95,8 @@ static int msi_wmi_set_block(int instance, int value)
 
 	struct acpi_buffer input = { sizeof(int), &value };
 
-	dprintk("Going to set block of instance: %d - value: %d\n",
-		instance, value);
+	pr_debug("Going to set block of instance: %d - value: %d\n",
+		 instance, value);
 
 	status = wmi_set_block(MSIWMI_BIOS_GUID, instance, &input);
 
@@ -112,20 +110,19 @@ static int bl_get(struct backlight_device *bd)
 	/* Instance 1 is "get backlight", cmp with DSDT */
 	err = msi_wmi_query_block(1, &ret);
 	if (err) {
-		printk(KERN_ERR DRV_PFX "Could not query backlight: %d\n", err);
+		pr_err("Could not query backlight: %d\n", err);
 		return -EINVAL;
 	}
-	dprintk("Get: Query block returned: %d\n", ret);
+	pr_debug("Get: Query block returned: %d\n", ret);
 	for (level = 0; level < ARRAY_SIZE(backlight_map); level++) {
 		if (backlight_map[level] == ret) {
-			dprintk("Current backlight level: 0x%X - index: %d\n",
-				backlight_map[level], level);
+			pr_debug("Current backlight level: 0x%X - index: %d\n",
+				 backlight_map[level], level);
 			break;
 		}
 	}
 	if (level == ARRAY_SIZE(backlight_map)) {
-		printk(KERN_ERR DRV_PFX "get: Invalid brightness value: 0x%X\n",
-		       ret);
+		pr_err("get: Invalid brightness value: 0x%X\n", ret);
 		return -EINVAL;
 	}
 	return level;
@@ -156,7 +153,7 @@ static void msi_wmi_notify(u32 value, void *context)
 
 	status = wmi_get_event_data(value, &response);
 	if (status != AE_OK) {
-		printk(KERN_INFO DRV_PFX "bad event status 0x%x\n", status);
+		pr_info("bad event status 0x%x\n", status);
 		return;
 	}
 
@@ -164,7 +161,7 @@ static void msi_wmi_notify(u32 value, void *context)
 
 	if (obj && obj->type == ACPI_TYPE_INTEGER) {
 		int eventcode = obj->integer.value;
-		dprintk("Eventcode: 0x%x\n", eventcode);
+		pr_debug("Eventcode: 0x%x\n", eventcode);
 		key = sparse_keymap_entry_from_scancode(msi_wmi_input_dev,
 				eventcode);
 		if (key) {
@@ -175,8 +172,8 @@ static void msi_wmi_notify(u32 value, void *context)
 			/* Ignore event if the same event happened in a 50 ms
 			   timeframe -> Key press may result in 10-20 GPEs */
 			if (ktime_to_us(diff) < 1000 * 50) {
-				dprintk("Suppressed key event 0x%X - "
-					"Last press was %lld us ago\n",
+				pr_debug("Suppressed key event 0x%X - "
+					 "Last press was %lld us ago\n",
 					 key->code, ktime_to_us(diff));
 				return;
 			}
@@ -187,17 +184,16 @@ static void msi_wmi_notify(u32 value, void *context)
 			(!acpi_video_backlight_support() ||
 			(key->code != MSI_WMI_BRIGHTNESSUP &&
 			key->code != MSI_WMI_BRIGHTNESSDOWN))) {
-				dprintk("Send key: 0x%X - "
-					"Input layer keycode: %d\n", key->code,
-					 key->keycode);
+				pr_debug("Send key: 0x%X - "
+					 "Input layer keycode: %d\n",
+					 key->code, key->keycode);
 				sparse_keymap_report_entry(msi_wmi_input_dev,
 						key, 1, true);
 			}
 		} else
-			printk(KERN_INFO "Unknown key pressed - %x\n",
-			       eventcode);
+			pr_info("Unknown key pressed - %x\n", eventcode);
 	} else
-		printk(KERN_INFO DRV_PFX "Unknown event received\n");
+		pr_info("Unknown event received\n");
 	kfree(response.pointer);
 }
 
@@ -238,8 +234,7 @@ static int __init msi_wmi_init(void)
 	int err;
 
 	if (!wmi_has_guid(MSIWMI_EVENT_GUID)) {
-		printk(KERN_ERR
-		       "This machine doesn't have MSI-hotkeys through WMI\n");
+		pr_err("This machine doesn't have MSI-hotkeys through WMI\n");
 		return -ENODEV;
 	}
 	err = wmi_install_notify_handler(MSIWMI_EVENT_GUID,
@@ -270,7 +265,7 @@ static int __init msi_wmi_init(void)
 
 		backlight->props.brightness = err;
 	}
-	dprintk("Event handler installed\n");
+	pr_debug("Event handler installed\n");
 
 	return 0;
 
-- 
cgit v1.2.3-70-g09d2


From 50f581a4f09f5b7f78a1c791e09b2c8d24a1e20c Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:48 -0700
Subject: sony-laptop: Add and use #define pr_fmt

Add pr_fmt.
Remove now unused #define DRV_PRX.
Neaten dprintk macro.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/sony-laptop.c | 104 +++++++++++++++++--------------------
 1 file changed, 49 insertions(+), 55 deletions(-)

diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c
index 6fe8cd6e23b..12dd0a62f30 100644
--- a/drivers/platform/x86/sony-laptop.c
+++ b/drivers/platform/x86/sony-laptop.c
@@ -42,6 +42,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
@@ -70,10 +72,10 @@
 #include <linux/miscdevice.h>
 #endif
 
-#define DRV_PFX			"sony-laptop: "
-#define dprintk(msg...)		do {	\
-	if (debug)			\
-		pr_warn(DRV_PFX msg);	\
+#define dprintk(fmt, ...)			\
+do {						\
+	if (debug)				\
+		pr_warn(fmt, ##__VA_ARGS__);	\
 } while (0)
 
 #define SONY_LAPTOP_DRIVER_VERSION	"0.6"
@@ -418,7 +420,7 @@ static int sony_laptop_setup_input(struct acpi_device *acpi_device)
 	error = kfifo_alloc(&sony_laptop_input.fifo,
 			    SONY_LAPTOP_BUF_SIZE, GFP_KERNEL);
 	if (error) {
-		pr_err(DRV_PFX "kfifo_alloc failed\n");
+		pr_err("kfifo_alloc failed\n");
 		goto err_dec_users;
 	}
 
@@ -702,7 +704,7 @@ static int acpi_callgetfunc(acpi_handle handle, char *name, int *result)
 		return 0;
 	}
 
-	pr_warn(DRV_PFX "acpi_callreadfunc failed\n");
+	pr_warn("acpi_callreadfunc failed\n");
 
 	return -1;
 }
@@ -728,8 +730,7 @@ static int acpi_callsetfunc(acpi_handle handle, char *name, int value,
 	if (status == AE_OK) {
 		if (result != NULL) {
 			if (out_obj.type != ACPI_TYPE_INTEGER) {
-				pr_warn(DRV_PFX "acpi_evaluate_object bad "
-				       "return type\n");
+				pr_warn("acpi_evaluate_object bad return type\n");
 				return -1;
 			}
 			*result = out_obj.integer.value;
@@ -737,7 +738,7 @@ static int acpi_callsetfunc(acpi_handle handle, char *name, int value,
 		return 0;
 	}
 
-	pr_warn(DRV_PFX "acpi_evaluate_object failed\n");
+	pr_warn("acpi_evaluate_object failed\n");
 
 	return -1;
 }
@@ -1104,10 +1105,8 @@ static void sony_nc_notify(struct acpi_device *device, u32 event)
 				}
 
 				if (!key_event->data)
-					pr_info(DRV_PFX
-							"Unknown event: 0x%x 0x%x\n",
-							key_handle,
-							ev);
+					pr_info("Unknown event: 0x%x 0x%x\n",
+						key_handle, ev);
 				else
 					sony_laptop_report_input_event(ev);
 			}
@@ -1128,7 +1127,7 @@ static acpi_status sony_walk_callback(acpi_handle handle, u32 level,
 	struct acpi_device_info *info;
 
 	if (ACPI_SUCCESS(acpi_get_object_info(handle, &info))) {
-		pr_warn(DRV_PFX "method: name: %4.4s, args %X\n",
+		pr_warn("method: name: %4.4s, args %X\n",
 			(char *)&info->name, info->param_count);
 
 		kfree(info);
@@ -1169,7 +1168,7 @@ static int sony_nc_resume(struct acpi_device *device)
 		ret = acpi_callsetfunc(sony_nc_acpi_handle, *item->acpiset,
 				       item->value, NULL);
 		if (ret < 0) {
-			pr_err(DRV_PFX "%s: %d\n", __func__, ret);
+			pr_err("%s: %d\n", __func__, ret);
 			break;
 		}
 	}
@@ -1336,12 +1335,12 @@ static void sony_nc_rfkill_setup(struct acpi_device *device)
 
 	device_enum = (union acpi_object *) buffer.pointer;
 	if (!device_enum) {
-		pr_err(DRV_PFX "No SN06 return object.");
+		pr_err("No SN06 return object\n");
 		goto out_no_enum;
 	}
 	if (device_enum->type != ACPI_TYPE_BUFFER) {
-		pr_err(DRV_PFX "Invalid SN06 return object 0x%.2x\n",
-				device_enum->type);
+		pr_err("Invalid SN06 return object 0x%.2x\n",
+		       device_enum->type);
 		goto out_no_enum;
 	}
 
@@ -1662,7 +1661,7 @@ static void sony_nc_backlight_setup(void)
 						      ops, &props);
 
 	if (IS_ERR(sony_bl_props.dev)) {
-		pr_warn(DRV_PFX "unable to register backlight device\n");
+		pr_warn("unable to register backlight device\n");
 		sony_bl_props.dev = NULL;
 	} else
 		sony_bl_props.dev->props.brightness =
@@ -1682,8 +1681,7 @@ static int sony_nc_add(struct acpi_device *device)
 	acpi_handle handle;
 	struct sony_nc_value *item;
 
-	pr_info(DRV_PFX "%s v%s.\n", SONY_NC_DRIVER_NAME,
-			SONY_LAPTOP_DRIVER_VERSION);
+	pr_info("%s v%s\n", SONY_NC_DRIVER_NAME, SONY_LAPTOP_DRIVER_VERSION);
 
 	sony_nc_acpi_device = device;
 	strcpy(acpi_device_class(device), "sony/hotkey");
@@ -1708,7 +1706,7 @@ static int sony_nc_add(struct acpi_device *device)
 				sony_nc_acpi_handle, 1, sony_walk_callback,
 				NULL, NULL, NULL);
 		if (ACPI_FAILURE(status)) {
-			pr_warn(DRV_PFX "unable to walk acpi resources\n");
+			pr_warn("unable to walk acpi resources\n");
 			result = -ENODEV;
 			goto outpresent;
 		}
@@ -1736,13 +1734,12 @@ static int sony_nc_add(struct acpi_device *device)
 	/* setup input devices and helper fifo */
 	result = sony_laptop_setup_input(device);
 	if (result) {
-		pr_err(DRV_PFX "Unable to create input devices.\n");
+		pr_err("Unable to create input devices\n");
 		goto outkbdbacklight;
 	}
 
 	if (acpi_video_backlight_support()) {
-		pr_info(DRV_PFX "brightness ignored, must be "
-		       "controlled by ACPI video driver\n");
+		pr_info("brightness ignored, must be controlled by ACPI video driver\n");
 	} else {
 		sony_nc_backlight_setup();
 	}
@@ -2265,9 +2262,9 @@ out:
 	if (pcidev)
 		pci_dev_put(pcidev);
 
-	pr_info(DRV_PFX "detected Type%d model\n",
-			dev->model == SONYPI_DEVICE_TYPE1 ? 1 :
-			dev->model == SONYPI_DEVICE_TYPE2 ? 2 : 3);
+	pr_info("detected Type%d model\n",
+		dev->model == SONYPI_DEVICE_TYPE1 ? 1 :
+		dev->model == SONYPI_DEVICE_TYPE2 ? 2 : 3);
 }
 
 /* camera tests and poweron/poweroff */
@@ -2313,7 +2310,7 @@ static int __sony_pic_camera_ready(void)
 static int __sony_pic_camera_off(void)
 {
 	if (!camera) {
-		pr_warn(DRV_PFX "camera control not enabled\n");
+		pr_warn("camera control not enabled\n");
 		return -ENODEV;
 	}
 
@@ -2333,7 +2330,7 @@ static int __sony_pic_camera_on(void)
 	int i, j, x;
 
 	if (!camera) {
-		pr_warn(DRV_PFX "camera control not enabled\n");
+		pr_warn("camera control not enabled\n");
 		return -ENODEV;
 	}
 
@@ -2356,7 +2353,7 @@ static int __sony_pic_camera_on(void)
 	}
 
 	if (j == 0) {
-		pr_warn(DRV_PFX "failed to power on camera\n");
+		pr_warn("failed to power on camera\n");
 		return -ENODEV;
 	}
 
@@ -2412,8 +2409,7 @@ int sony_pic_camera_command(int command, u8 value)
 				ITERATIONS_SHORT);
 		break;
 	default:
-		pr_err(DRV_PFX "sony_pic_camera_command invalid: %d\n",
-		       command);
+		pr_err("sony_pic_camera_command invalid: %d\n", command);
 		break;
 	}
 	mutex_unlock(&spic_dev.lock);
@@ -2819,7 +2815,7 @@ static int sonypi_compat_init(void)
 	error =
 	 kfifo_alloc(&sonypi_compat.fifo, SONY_LAPTOP_BUF_SIZE, GFP_KERNEL);
 	if (error) {
-		pr_err(DRV_PFX "kfifo_alloc failed\n");
+		pr_err("kfifo_alloc failed\n");
 		return error;
 	}
 
@@ -2829,12 +2825,12 @@ static int sonypi_compat_init(void)
 		sonypi_misc_device.minor = minor;
 	error = misc_register(&sonypi_misc_device);
 	if (error) {
-		pr_err(DRV_PFX "misc_register failed\n");
+		pr_err("misc_register failed\n");
 		goto err_free_kfifo;
 	}
 	if (minor == -1)
-		pr_info(DRV_PFX "device allocated minor is %d\n",
-		       sonypi_misc_device.minor);
+		pr_info("device allocated minor is %d\n",
+			sonypi_misc_device.minor);
 
 	return 0;
 
@@ -2893,8 +2889,8 @@ sony_pic_read_possible_resource(struct acpi_resource *resource, void *context)
 			}
 			for (i = 0; i < p->interrupt_count; i++) {
 				if (!p->interrupts[i]) {
-					pr_warn(DRV_PFX "Invalid IRQ %d\n",
-							p->interrupts[i]);
+					pr_warn("Invalid IRQ %d\n",
+						p->interrupts[i]);
 					continue;
 				}
 				interrupt = kzalloc(sizeof(*interrupt),
@@ -2932,14 +2928,14 @@ sony_pic_read_possible_resource(struct acpi_resource *resource, void *context)
 						ioport->io2.address_length);
 			}
 			else {
-				pr_err(DRV_PFX "Unknown SPIC Type, more than 2 IO Ports\n");
+				pr_err("Unknown SPIC Type, more than 2 IO Ports\n");
 				return AE_ERROR;
 			}
 			return AE_OK;
 		}
 	default:
 		dprintk("Resource %d isn't an IRQ nor an IO port\n",
-				resource->type);
+			resource->type);
 
 	case ACPI_RESOURCE_TYPE_END_TAG:
 		return AE_OK;
@@ -2960,7 +2956,7 @@ static int sony_pic_possible_resources(struct acpi_device *device)
 	dprintk("Evaluating _STA\n");
 	result = acpi_bus_get_status(device);
 	if (result) {
-		pr_warn(DRV_PFX "Unable to read status\n");
+		pr_warn("Unable to read status\n");
 		goto end;
 	}
 
@@ -2976,8 +2972,7 @@ static int sony_pic_possible_resources(struct acpi_device *device)
 	status = acpi_walk_resources(device->handle, METHOD_NAME__PRS,
 			sony_pic_read_possible_resource, &spic_dev);
 	if (ACPI_FAILURE(status)) {
-		pr_warn(DRV_PFX "Failure evaluating %s\n",
-				METHOD_NAME__PRS);
+		pr_warn("Failure evaluating %s\n", METHOD_NAME__PRS);
 		result = -ENODEV;
 	}
 end:
@@ -3090,7 +3085,7 @@ static int sony_pic_enable(struct acpi_device *device,
 
 	/* check for total failure */
 	if (ACPI_FAILURE(status)) {
-		pr_err(DRV_PFX "Error evaluating _SRS\n");
+		pr_err("Error evaluating _SRS\n");
 		result = -ENODEV;
 		goto end;
 	}
@@ -3182,7 +3177,7 @@ static int sony_pic_remove(struct acpi_device *device, int type)
 	struct sony_pic_irq *irq, *tmp_irq;
 
 	if (sony_pic_disable(device)) {
-		pr_err(DRV_PFX "Couldn't disable device.\n");
+		pr_err("Couldn't disable device\n");
 		return -ENXIO;
 	}
 
@@ -3222,8 +3217,7 @@ static int sony_pic_add(struct acpi_device *device)
 	struct sony_pic_ioport *io, *tmp_io;
 	struct sony_pic_irq *irq, *tmp_irq;
 
-	pr_info(DRV_PFX "%s v%s.\n", SONY_PIC_DRIVER_NAME,
-			SONY_LAPTOP_DRIVER_VERSION);
+	pr_info("%s v%s\n", SONY_PIC_DRIVER_NAME, SONY_LAPTOP_DRIVER_VERSION);
 
 	spic_dev.acpi_dev = device;
 	strcpy(acpi_device_class(device), "sony/hotkey");
@@ -3233,14 +3227,14 @@ static int sony_pic_add(struct acpi_device *device)
 	/* read _PRS resources */
 	result = sony_pic_possible_resources(device);
 	if (result) {
-		pr_err(DRV_PFX "Unable to read possible resources.\n");
+		pr_err("Unable to read possible resources\n");
 		goto err_free_resources;
 	}
 
 	/* setup input devices and helper fifo */
 	result = sony_laptop_setup_input(device);
 	if (result) {
-		pr_err(DRV_PFX "Unable to create input devices.\n");
+		pr_err("Unable to create input devices\n");
 		goto err_free_resources;
 	}
 
@@ -3281,7 +3275,7 @@ static int sony_pic_add(struct acpi_device *device)
 		}
 	}
 	if (!spic_dev.cur_ioport) {
-		pr_err(DRV_PFX "Failed to request_region.\n");
+		pr_err("Failed to request_region\n");
 		result = -ENODEV;
 		goto err_remove_compat;
 	}
@@ -3301,7 +3295,7 @@ static int sony_pic_add(struct acpi_device *device)
 		}
 	}
 	if (!spic_dev.cur_irq) {
-		pr_err(DRV_PFX "Failed to request_irq.\n");
+		pr_err("Failed to request_irq\n");
 		result = -ENODEV;
 		goto err_release_region;
 	}
@@ -3309,7 +3303,7 @@ static int sony_pic_add(struct acpi_device *device)
 	/* set resource status _SRS */
 	result = sony_pic_enable(device, spic_dev.cur_ioport, spic_dev.cur_irq);
 	if (result) {
-		pr_err(DRV_PFX "Couldn't enable device.\n");
+		pr_err("Couldn't enable device\n");
 		goto err_free_irq;
 	}
 
@@ -3418,7 +3412,7 @@ static int __init sony_laptop_init(void)
 	if (!no_spic && dmi_check_system(sonypi_dmi_table)) {
 		result = acpi_bus_register_driver(&sony_pic_driver);
 		if (result) {
-			pr_err(DRV_PFX "Unable to register SPIC driver.");
+			pr_err("Unable to register SPIC driver\n");
 			goto out;
 		}
 		spic_drv_registered = 1;
@@ -3426,7 +3420,7 @@ static int __init sony_laptop_init(void)
 
 	result = acpi_bus_register_driver(&sony_nc_driver);
 	if (result) {
-		pr_err(DRV_PFX "Unable to register SNC driver.");
+		pr_err("Unable to register SNC driver\n");
 		goto out_unregister_pic;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 33cab1b71ec1bd02d55cede277bbc91485a68068 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:49 -0700
Subject: tc1100-wmi: Add pr_fmt, use pr_<level>

Use the more normal logging styles.
Removed now unused local logging #defines.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/tc1100-wmi.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/platform/x86/tc1100-wmi.c b/drivers/platform/x86/tc1100-wmi.c
index 865ef78d6f1..e24f5ae475a 100644
--- a/drivers/platform/x86/tc1100-wmi.c
+++ b/drivers/platform/x86/tc1100-wmi.c
@@ -25,6 +25,8 @@
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -40,9 +42,6 @@
 #define TC1100_INSTANCE_WIRELESS		1
 #define TC1100_INSTANCE_JOGDIAL		2
 
-#define TC1100_LOGPREFIX "tc1100-wmi: "
-#define TC1100_INFO KERN_INFO TC1100_LOGPREFIX
-
 MODULE_AUTHOR("Jamey Hicks, Carlos Corbacho");
 MODULE_DESCRIPTION("HP Compaq TC1100 Tablet WMI Extras");
 MODULE_LICENSE("GPL");
@@ -264,7 +263,7 @@ static int __init tc1100_init(void)
 	if (error)
 		goto err_device_del;
 
-	printk(TC1100_INFO "HP Compaq TC1100 Tablet WMI Extras loaded\n");
+	pr_info("HP Compaq TC1100 Tablet WMI Extras loaded\n");
 	return 0;
 
  err_device_del:
-- 
cgit v1.2.3-70-g09d2


From 93c1d05b5fa83ac771b5d1a5b4eb0fe6f54b3b46 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:51 -0700
Subject: topstar-laptop: Convert remaining printk to pr_info

To be similar to all other uses.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/topstar-laptop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/topstar-laptop.c b/drivers/platform/x86/topstar-laptop.c
index 1d07d6d09f2..4c20447ddbb 100644
--- a/drivers/platform/x86/topstar-laptop.c
+++ b/drivers/platform/x86/topstar-laptop.c
@@ -194,7 +194,7 @@ static int __init topstar_laptop_init(void)
 	if (ret < 0)
 		return ret;
 
-	printk(KERN_INFO "Topstar Laptop ACPI extras driver loaded\n");
+	pr_info("ACPI extras driver loaded\n");
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 7e33460d8d991843a5821d667b55c75a092cf6e3 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:52 -0700
Subject: toshiba: Convert printks to pr_<level>

Add pr_fmt.
Remove local MY_<foo> #defines.
Convert printks to pr_<level>.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/toshiba_acpi.c      | 59 +++++++++++++++-----------------
 drivers/platform/x86/toshiba_bluetooth.c | 11 +++---
 2 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c
index 63f42a22e10..cb009b2629e 100644
--- a/drivers/platform/x86/toshiba_acpi.c
+++ b/drivers/platform/x86/toshiba_acpi.c
@@ -35,6 +35,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #define TOSHIBA_ACPI_VERSION	"0.19"
 #define PROC_INTERFACE_VERSION	1
 
@@ -60,11 +62,6 @@ MODULE_AUTHOR("John Belmonte");
 MODULE_DESCRIPTION("Toshiba Laptop ACPI Extras Driver");
 MODULE_LICENSE("GPL");
 
-#define MY_LOGPREFIX "toshiba_acpi: "
-#define MY_ERR KERN_ERR MY_LOGPREFIX
-#define MY_NOTICE KERN_NOTICE MY_LOGPREFIX
-#define MY_INFO KERN_INFO MY_LOGPREFIX
-
 /* Toshiba ACPI method paths */
 #define METHOD_LCD_BRIGHTNESS	"\\_SB_.PCI0.VGA_.LCD_._BCM"
 #define TOSH_INTERFACE_1	"\\_SB_.VALD"
@@ -301,7 +298,7 @@ static int toshiba_illumination_available(void)
 	in[0] = 0xf100;
 	status = hci_raw(in, out);
 	if (ACPI_FAILURE(status)) {
-		printk(MY_INFO "Illumination device not available\n");
+		pr_info("Illumination device not available\n");
 		return 0;
 	}
 	in[0] = 0xf400;
@@ -320,7 +317,7 @@ static void toshiba_illumination_set(struct led_classdev *cdev,
 	in[0] = 0xf100;
 	status = hci_raw(in, out);
 	if (ACPI_FAILURE(status)) {
-		printk(MY_INFO "Illumination device not available\n");
+		pr_info("Illumination device not available\n");
 		return;
 	}
 
@@ -331,7 +328,7 @@ static void toshiba_illumination_set(struct led_classdev *cdev,
 		in[2] = 1;
 		status = hci_raw(in, out);
 		if (ACPI_FAILURE(status)) {
-			printk(MY_INFO "ACPI call for illumination failed.\n");
+			pr_info("ACPI call for illumination failed\n");
 			return;
 		}
 	} else {
@@ -341,7 +338,7 @@ static void toshiba_illumination_set(struct led_classdev *cdev,
 		in[2] = 0;
 		status = hci_raw(in, out);
 		if (ACPI_FAILURE(status)) {
-			printk(MY_INFO "ACPI call for illumination failed.\n");
+			pr_info("ACPI call for illumination failed.\n");
 			return;
 		}
 	}
@@ -364,7 +361,7 @@ static enum led_brightness toshiba_illumination_get(struct led_classdev *cdev)
 	in[0] = 0xf100;
 	status = hci_raw(in, out);
 	if (ACPI_FAILURE(status)) {
-		printk(MY_INFO "Illumination device not available\n");
+		pr_info("Illumination device not available\n");
 		return LED_OFF;
 	}
 
@@ -373,7 +370,7 @@ static enum led_brightness toshiba_illumination_get(struct led_classdev *cdev)
 	in[1] = 0x14e;
 	status = hci_raw(in, out);
 	if (ACPI_FAILURE(status)) {
-		printk(MY_INFO "ACPI call for illumination failed.\n");
+		pr_info("ACPI call for illumination failed.\n");
 		return LED_OFF;
 	}
 
@@ -517,7 +514,7 @@ static int lcd_proc_show(struct seq_file *m, void *v)
 		seq_printf(m, "brightness_levels:       %d\n",
 			     HCI_LCD_BRIGHTNESS_LEVELS);
 	} else {
-		printk(MY_ERR "Error reading LCD brightness\n");
+		pr_err("Error reading LCD brightness\n");
 	}
 
 	return 0;
@@ -592,7 +589,7 @@ static int video_proc_show(struct seq_file *m, void *v)
 		seq_printf(m, "crt_out:                 %d\n", is_crt);
 		seq_printf(m, "tv_out:                  %d\n", is_tv);
 	} else {
-		printk(MY_ERR "Error reading video out status\n");
+		pr_err("Error reading video out status\n");
 	}
 
 	return 0;
@@ -686,7 +683,7 @@ static int fan_proc_show(struct seq_file *m, void *v)
 		seq_printf(m, "running:                 %d\n", (value > 0));
 		seq_printf(m, "force_on:                %d\n", force_fan);
 	} else {
-		printk(MY_ERR "Error reading fan status\n");
+		pr_err("Error reading fan status\n");
 	}
 
 	return 0;
@@ -750,9 +747,9 @@ static int keys_proc_show(struct seq_file *m, void *v)
 			 * some machines where system events sporadically
 			 * become disabled. */
 			hci_write1(HCI_SYSTEM_EVENT, 1, &hci_result);
-			printk(MY_NOTICE "Re-enabled hotkeys\n");
+			pr_notice("Re-enabled hotkeys\n");
 		} else {
-			printk(MY_ERR "Error reading hotkey status\n");
+			pr_err("Error reading hotkey status\n");
 			goto end;
 		}
 	}
@@ -863,7 +860,7 @@ static void toshiba_acpi_notify(acpi_handle handle, u32 event, void *context)
 
 			if (!sparse_keymap_report_event(toshiba_acpi.hotkey_dev,
 							value, 1, true)) {
-				printk(MY_INFO "Unknown key %x\n",
+				pr_info("Unknown key %x\n",
 				       value);
 			}
 		} else if (hci_result == HCI_NOT_SUPPORTED) {
@@ -871,7 +868,7 @@ static void toshiba_acpi_notify(acpi_handle handle, u32 event, void *context)
 			 * some machines where system events sporadically
 			 * become disabled. */
 			hci_write1(HCI_SYSTEM_EVENT, 1, &hci_result);
-			printk(MY_NOTICE "Re-enabled hotkeys\n");
+			pr_notice("Re-enabled hotkeys\n");
 		}
 	} while (hci_result != HCI_EMPTY);
 }
@@ -883,13 +880,13 @@ static int __init toshiba_acpi_setup_keyboard(char *device)
 
 	status = acpi_get_handle(NULL, device, &toshiba_acpi.handle);
 	if (ACPI_FAILURE(status)) {
-		printk(MY_INFO "Unable to get notification device\n");
+		pr_info("Unable to get notification device\n");
 		return -ENODEV;
 	}
 
 	toshiba_acpi.hotkey_dev = input_allocate_device();
 	if (!toshiba_acpi.hotkey_dev) {
-		printk(MY_INFO "Unable to register input device\n");
+		pr_info("Unable to register input device\n");
 		return -ENOMEM;
 	}
 
@@ -905,21 +902,21 @@ static int __init toshiba_acpi_setup_keyboard(char *device)
 	status = acpi_install_notify_handler(toshiba_acpi.handle,
 				ACPI_DEVICE_NOTIFY, toshiba_acpi_notify, NULL);
 	if (ACPI_FAILURE(status)) {
-		printk(MY_INFO "Unable to install hotkey notification\n");
+		pr_info("Unable to install hotkey notification\n");
 		error = -ENODEV;
 		goto err_free_keymap;
 	}
 
 	status = acpi_evaluate_object(toshiba_acpi.handle, "ENAB", NULL, NULL);
 	if (ACPI_FAILURE(status)) {
-		printk(MY_INFO "Unable to enable hotkeys\n");
+		pr_info("Unable to enable hotkeys\n");
 		error = -ENODEV;
 		goto err_remove_notify;
 	}
 
 	error = input_register_device(toshiba_acpi.hotkey_dev);
 	if (error) {
-		printk(MY_INFO "Unable to register input device\n");
+		pr_info("Unable to register input device\n");
 		goto err_remove_notify;
 	}
 
@@ -980,17 +977,17 @@ static int __init toshiba_acpi_init(void)
 	if (is_valid_acpi_path(TOSH_INTERFACE_1 GHCI_METHOD)) {
 		method_hci = TOSH_INTERFACE_1 GHCI_METHOD;
 		if (toshiba_acpi_setup_keyboard(TOSH_INTERFACE_1))
-			printk(MY_INFO "Unable to activate hotkeys\n");
+			pr_info("Unable to activate hotkeys\n");
 	} else if (is_valid_acpi_path(TOSH_INTERFACE_2 GHCI_METHOD)) {
 		method_hci = TOSH_INTERFACE_2 GHCI_METHOD;
 		if (toshiba_acpi_setup_keyboard(TOSH_INTERFACE_2))
-			printk(MY_INFO "Unable to activate hotkeys\n");
+			pr_info("Unable to activate hotkeys\n");
 	} else
 		return -ENODEV;
 
-	printk(MY_INFO "Toshiba Laptop ACPI Extras version %s\n",
+	pr_info("Toshiba Laptop ACPI Extras version %s\n",
 	       TOSHIBA_ACPI_VERSION);
-	printk(MY_INFO "    HCI method: %s\n", method_hci);
+	pr_info("    HCI method: %s\n", method_hci);
 
 	mutex_init(&toshiba_acpi.mutex);
 
@@ -998,7 +995,7 @@ static int __init toshiba_acpi_init(void)
 							      -1, NULL, 0);
 	if (IS_ERR(toshiba_acpi.p_dev)) {
 		ret = PTR_ERR(toshiba_acpi.p_dev);
-		printk(MY_ERR "unable to register platform device\n");
+		pr_err("unable to register platform device\n");
 		toshiba_acpi.p_dev = NULL;
 		toshiba_acpi_exit();
 		return ret;
@@ -1028,7 +1025,7 @@ static int __init toshiba_acpi_init(void)
         if (IS_ERR(toshiba_backlight_device)) {
 		ret = PTR_ERR(toshiba_backlight_device);
 
-		printk(KERN_ERR "Could not register toshiba backlight device\n");
+		pr_err("Could not register toshiba backlight device\n");
 		toshiba_backlight_device = NULL;
 		toshiba_acpi_exit();
 		return ret;
@@ -1042,14 +1039,14 @@ static int __init toshiba_acpi_init(void)
 						   &toshiba_rfk_ops,
 						   &toshiba_acpi);
 		if (!toshiba_acpi.bt_rfk) {
-			printk(MY_ERR "unable to allocate rfkill device\n");
+			pr_err("unable to allocate rfkill device\n");
 			toshiba_acpi_exit();
 			return -ENOMEM;
 		}
 
 		ret = rfkill_register(toshiba_acpi.bt_rfk);
 		if (ret) {
-			printk(MY_ERR "unable to register rfkill device\n");
+			pr_err("unable to register rfkill device\n");
 			rfkill_destroy(toshiba_acpi.bt_rfk);
 			toshiba_acpi_exit();
 			return ret;
diff --git a/drivers/platform/x86/toshiba_bluetooth.c b/drivers/platform/x86/toshiba_bluetooth.c
index 94406861191..5fb7186694d 100644
--- a/drivers/platform/x86/toshiba_bluetooth.c
+++ b/drivers/platform/x86/toshiba_bluetooth.c
@@ -17,6 +17,8 @@
  * delivered.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -70,14 +72,13 @@ static int toshiba_bluetooth_enable(acpi_handle handle)
 	if (!(result & 0x01))
 		return 0;
 
-	printk(KERN_INFO "toshiba_bluetooth: Re-enabling Toshiba Bluetooth\n");
+	pr_info("Re-enabling Toshiba Bluetooth\n");
 	res1 = acpi_evaluate_object(handle, "AUSB", NULL, NULL);
 	res2 = acpi_evaluate_object(handle, "BTPO", NULL, NULL);
 	if (!ACPI_FAILURE(res1) || !ACPI_FAILURE(res2))
 		return 0;
 
-	printk(KERN_WARNING "toshiba_bluetooth: Failed to re-enable "
-	       "Toshiba Bluetooth\n");
+	pr_warn("Failed to re-enable Toshiba Bluetooth\n");
 
 	return -ENODEV;
 }
@@ -107,8 +108,8 @@ static int toshiba_bt_rfkill_add(struct acpi_device *device)
 				       &bt_present);
 
 	if (!ACPI_FAILURE(status) && bt_present) {
-		printk(KERN_INFO "Detected Toshiba ACPI Bluetooth device - "
-		      "installing RFKill handler\n");
+		pr_info("Detected Toshiba ACPI Bluetooth device - "
+			"installing RFKill handler\n");
 		result = toshiba_bluetooth_enable(device->handle);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From dd8e908e82964b8b493ef241707fb6109aa41cfd Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:53 -0700
Subject: wmi: Removed trailing whitespace from logging message.

Just neatening.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/wmi.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index 05cc79672a8..f23d5a84e7b 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -486,16 +486,16 @@ static void wmi_dump_wdg(const struct guid_block *g)
 	pr_info("\tnotify_id: %02X\n", g->notify_id);
 	pr_info("\treserved: %02X\n", g->reserved);
 	pr_info("\tinstance_count: %d\n", g->instance_count);
-	pr_info("\tflags: %#x ", g->flags);
+	pr_info("\tflags: %#x", g->flags);
 	if (g->flags) {
 		if (g->flags & ACPI_WMI_EXPENSIVE)
-			pr_cont("ACPI_WMI_EXPENSIVE ");
+			pr_cont(" ACPI_WMI_EXPENSIVE");
 		if (g->flags & ACPI_WMI_METHOD)
-			pr_cont("ACPI_WMI_METHOD ");
+			pr_cont(" ACPI_WMI_METHOD");
 		if (g->flags & ACPI_WMI_STRING)
-			pr_cont("ACPI_WMI_STRING ");
+			pr_cont(" ACPI_WMI_STRING");
 		if (g->flags & ACPI_WMI_EVENT)
-			pr_cont("ACPI_WMI_EVENT ");
+			pr_cont(" ACPI_WMI_EVENT");
 	}
 	pr_cont("\n");
 
-- 
cgit v1.2.3-70-g09d2


From ad3f2f038fbd67f3341f760507aed90381114145 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 29 Mar 2011 15:21:54 -0700
Subject: xo15-ebook: Use pr_<level>

Use the current logging styles.

Remove local #define PREFIX.
Add pr_fmt.
Convert printk to pr_<level>.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/xo15-ebook.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/platform/x86/xo15-ebook.c b/drivers/platform/x86/xo15-ebook.c
index c1372ed9d2e..fad153dc035 100644
--- a/drivers/platform/x86/xo15-ebook.c
+++ b/drivers/platform/x86/xo15-ebook.c
@@ -11,6 +11,8 @@
  *  your option) any later version.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -20,7 +22,6 @@
 #include <acpi/acpi_drivers.h>
 
 #define MODULE_NAME "xo15-ebook"
-#define PREFIX MODULE_NAME ": "
 
 #define XO15_EBOOK_CLASS		MODULE_NAME
 #define XO15_EBOOK_TYPE_UNKNOWN	0x00
@@ -105,7 +106,7 @@ static int ebook_switch_add(struct acpi_device *device)
 	class = acpi_device_class(device);
 
 	if (strcmp(hid, XO15_EBOOK_HID)) {
-		printk(KERN_ERR PREFIX "Unsupported hid [%s]\n", hid);
+		pr_err("Unsupported hid [%s]\n", hid);
 		error = -ENODEV;
 		goto err_free_input;
 	}
-- 
cgit v1.2.3-70-g09d2


From 253a0069d7a222f7ad55f2c40f541a7648920c77 Mon Sep 17 00:00:00 2001
From: Ameya Palande <ameya.palande@nokia.com>
Date: Fri, 1 Apr 2011 16:54:11 +0300
Subject: platform-x86: intel_mid_thermal: Fix coding style

Before fixing checkpatch.pl reported 74 errors and 234 warnings

Signed-off-by: Ameya Palande <ameya.palande@nokia.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/intel_mid_thermal.c | 606 +++++++++++++++----------------
 1 file changed, 300 insertions(+), 306 deletions(-)

diff --git a/drivers/platform/x86/intel_mid_thermal.c b/drivers/platform/x86/intel_mid_thermal.c
index c2f4bd8013b..f0776504521 100644
--- a/drivers/platform/x86/intel_mid_thermal.c
+++ b/drivers/platform/x86/intel_mid_thermal.c
@@ -37,49 +37,50 @@
 #include <asm/intel_scu_ipc.h>
 
 /* Number of thermal sensors */
-#define MSIC_THERMAL_SENSORS   4
+#define MSIC_THERMAL_SENSORS	4
 
 /* ADC1 - thermal registers */
-#define MSIC_THERM_ADC1CNTL1   0x1C0
-#define MSIC_ADC_ENBL          0x10
-#define MSIC_ADC_START         0x08
+#define MSIC_THERM_ADC1CNTL1	0x1C0
+#define MSIC_ADC_ENBL		0x10
+#define MSIC_ADC_START		0x08
 
-#define MSIC_THERM_ADC1CNTL3   0x1C2
-#define MSIC_ADCTHERM_ENBL     0x04
-#define MSIC_ADCRRDATA_ENBL    0x05
-#define MSIC_CHANL_MASK_VAL    0x0F
+#define MSIC_THERM_ADC1CNTL3	0x1C2
+#define MSIC_ADCTHERM_ENBL	0x04
+#define MSIC_ADCRRDATA_ENBL	0x05
+#define MSIC_CHANL_MASK_VAL	0x0F
 
-#define MSIC_STOPBIT_MASK      16
-#define MSIC_ADCTHERM_MASK     4
-#define ADC_CHANLS_MAX         15 /* Number of ADC channels */
-#define ADC_LOOP_MAX           (ADC_CHANLS_MAX - MSIC_THERMAL_SENSORS)
+#define MSIC_STOPBIT_MASK	16
+#define MSIC_ADCTHERM_MASK	4
+/* Number of ADC channels */
+#define ADC_CHANLS_MAX		15
+#define ADC_LOOP_MAX		(ADC_CHANLS_MAX - MSIC_THERMAL_SENSORS)
 
 /* ADC channel code values */
-#define SKIN_SENSOR0_CODE      0x08
-#define SKIN_SENSOR1_CODE      0x09
-#define SYS_SENSOR_CODE                0x0A
-#define MSIC_DIE_SENSOR_CODE   0x03
+#define SKIN_SENSOR0_CODE	0x08
+#define SKIN_SENSOR1_CODE	0x09
+#define SYS_SENSOR_CODE		0x0A
+#define MSIC_DIE_SENSOR_CODE	0x03
 
-#define SKIN_THERM_SENSOR0     0
-#define SKIN_THERM_SENSOR1     1
-#define SYS_THERM_SENSOR2      2
-#define MSIC_DIE_THERM_SENSOR3 3
+#define SKIN_THERM_SENSOR0	0
+#define SKIN_THERM_SENSOR1	1
+#define SYS_THERM_SENSOR2	2
+#define MSIC_DIE_THERM_SENSOR3	3
 
 /* ADC code range */
-#define ADC_MAX                        977
-#define ADC_MIN                        162
-#define ADC_VAL0C              887
-#define ADC_VAL20C             720
-#define ADC_VAL40C             508
-#define ADC_VAL60C             315
+#define ADC_MAX			977
+#define ADC_MIN			162
+#define ADC_VAL0C		887
+#define ADC_VAL20C		720
+#define ADC_VAL40C		508
+#define ADC_VAL60C		315
 
 /* ADC base addresses */
-#define ADC_CHNL_START_ADDR    0x1C5   /* increments by 1 */
-#define ADC_DATA_START_ADDR     0x1D4   /* increments by 2 */
+#define ADC_CHNL_START_ADDR	0x1C5	/* increments by 1 */
+#define ADC_DATA_START_ADDR	0x1D4	/* increments by 2 */
 
 /* MSIC die attributes */
-#define MSIC_DIE_ADC_MIN       488
-#define MSIC_DIE_ADC_MAX       1004
+#define MSIC_DIE_ADC_MIN	488
+#define MSIC_DIE_ADC_MAX	1004
 
 /* This holds the address of the first free ADC channel,
  * among the 15 channels
@@ -87,15 +88,15 @@
 static int channel_index;
 
 struct platform_info {
-       struct platform_device *pdev;
-       struct thermal_zone_device *tzd[MSIC_THERMAL_SENSORS];
+	struct platform_device *pdev;
+	struct thermal_zone_device *tzd[MSIC_THERMAL_SENSORS];
 };
 
 struct thermal_device_info {
-       unsigned int chnl_addr;
-       int direct;
-       /* This holds the current temperature in millidegree celsius */
-       long curr_temp;
+	unsigned int chnl_addr;
+	int direct;
+	/* This holds the current temperature in millidegree celsius */
+	long curr_temp;
 };
 
 /**
@@ -106,7 +107,7 @@ struct thermal_device_info {
  */
 static int to_msic_die_temp(uint16_t adc_val)
 {
-       return (368 * (adc_val) / 1000) - 220;
+	return (368 * (adc_val) / 1000) - 220;
 }
 
 /**
@@ -118,7 +119,7 @@ static int to_msic_die_temp(uint16_t adc_val)
  */
 static int is_valid_adc(uint16_t adc_val, uint16_t min, uint16_t max)
 {
-       return (adc_val >= min) && (adc_val <= max);
+	return (adc_val >= min) && (adc_val <= max);
 }
 
 /**
@@ -136,35 +137,35 @@ static int is_valid_adc(uint16_t adc_val, uint16_t min, uint16_t max)
  */
 static int adc_to_temp(int direct, uint16_t adc_val, unsigned long *tp)
 {
-       int temp;
-
-       /* Direct conversion for die temperature */
-       if (direct) {
-               if (is_valid_adc(adc_val, MSIC_DIE_ADC_MIN, MSIC_DIE_ADC_MAX)) {
-                       *tp = to_msic_die_temp(adc_val) * 1000;
-                       return 0;
-               }
-               return -ERANGE;
-       }
-
-       if (!is_valid_adc(adc_val, ADC_MIN, ADC_MAX))
-               return -ERANGE;
-
-       /* Linear approximation for skin temperature */
-       if (adc_val > ADC_VAL0C)
-               temp = 177 - (adc_val/5);
-       else if ((adc_val <= ADC_VAL0C) && (adc_val > ADC_VAL20C))
-               temp = 111 - (adc_val/8);
-       else if ((adc_val <= ADC_VAL20C) && (adc_val > ADC_VAL40C))
-               temp = 92 - (adc_val/10);
-       else if ((adc_val <= ADC_VAL40C) && (adc_val > ADC_VAL60C))
-               temp = 91 - (adc_val/10);
-       else
-               temp = 112 - (adc_val/6);
-
-       /* Convert temperature in celsius to milli degree celsius */
-       *tp = temp * 1000;
-       return 0;
+	int temp;
+
+	/* Direct conversion for die temperature */
+	if (direct) {
+		if (is_valid_adc(adc_val, MSIC_DIE_ADC_MIN, MSIC_DIE_ADC_MAX)) {
+			*tp = to_msic_die_temp(adc_val) * 1000;
+			return 0;
+		}
+		return -ERANGE;
+	}
+
+	if (!is_valid_adc(adc_val, ADC_MIN, ADC_MAX))
+		return -ERANGE;
+
+	/* Linear approximation for skin temperature */
+	if (adc_val > ADC_VAL0C)
+		temp = 177 - (adc_val/5);
+	else if ((adc_val <= ADC_VAL0C) && (adc_val > ADC_VAL20C))
+		temp = 111 - (adc_val/8);
+	else if ((adc_val <= ADC_VAL20C) && (adc_val > ADC_VAL40C))
+		temp = 92 - (adc_val/10);
+	else if ((adc_val <= ADC_VAL40C) && (adc_val > ADC_VAL60C))
+		temp = 91 - (adc_val/10);
+	else
+		temp = 112 - (adc_val/6);
+
+	/* Convert temperature in celsius to milli degree celsius */
+	*tp = temp * 1000;
+	return 0;
 }
 
 /**
@@ -178,47 +179,47 @@ static int adc_to_temp(int direct, uint16_t adc_val, unsigned long *tp)
  */
 static int mid_read_temp(struct thermal_zone_device *tzd, unsigned long *temp)
 {
-       struct thermal_device_info *td_info = tzd->devdata;
-       uint16_t adc_val, addr;
-       uint8_t data = 0;
-       int ret;
-       unsigned long curr_temp;
-
-
-       addr = td_info->chnl_addr;
-
-       /* Enable the msic for conversion before reading */
-       ret = intel_scu_ipc_iowrite8(MSIC_THERM_ADC1CNTL3, MSIC_ADCRRDATA_ENBL);
-       if (ret)
-               return ret;
-
-       /* Re-toggle the RRDATARD bit (temporary workaround) */
-       ret = intel_scu_ipc_iowrite8(MSIC_THERM_ADC1CNTL3, MSIC_ADCTHERM_ENBL);
-       if (ret)
-               return ret;
-
-       /* Read the higher bits of data */
-       ret = intel_scu_ipc_ioread8(addr, &data);
-       if (ret)
-               return ret;
-
-       /* Shift bits to accommodate the lower two data bits */
-       adc_val = (data << 2);
-       addr++;
-
-       ret = intel_scu_ipc_ioread8(addr, &data);/* Read lower bits */
-       if (ret)
-               return ret;
-
-       /* Adding lower two bits to the higher bits */
-       data &= 03;
-       adc_val += data;
-
-       /* Convert ADC value to temperature */
-       ret = adc_to_temp(td_info->direct, adc_val, &curr_temp);
-       if (ret == 0)
-               *temp = td_info->curr_temp = curr_temp;
-       return ret;
+	struct thermal_device_info *td_info = tzd->devdata;
+	uint16_t adc_val, addr;
+	uint8_t data = 0;
+	int ret;
+	unsigned long curr_temp;
+
+
+	addr = td_info->chnl_addr;
+
+	/* Enable the msic for conversion before reading */
+	ret = intel_scu_ipc_iowrite8(MSIC_THERM_ADC1CNTL3, MSIC_ADCRRDATA_ENBL);
+	if (ret)
+		return ret;
+
+	/* Re-toggle the RRDATARD bit (temporary workaround) */
+	ret = intel_scu_ipc_iowrite8(MSIC_THERM_ADC1CNTL3, MSIC_ADCTHERM_ENBL);
+	if (ret)
+		return ret;
+
+	/* Read the higher bits of data */
+	ret = intel_scu_ipc_ioread8(addr, &data);
+	if (ret)
+		return ret;
+
+	/* Shift bits to accommodate the lower two data bits */
+	adc_val = (data << 2);
+	addr++;
+
+	ret = intel_scu_ipc_ioread8(addr, &data);/* Read lower bits */
+	if (ret)
+		return ret;
+
+	/* Adding lower two bits to the higher bits */
+	data &= 03;
+	adc_val += data;
+
+	/* Convert ADC value to temperature */
+	ret = adc_to_temp(td_info->direct, adc_val, &curr_temp);
+	if (ret == 0)
+		*temp = td_info->curr_temp = curr_temp;
+	return ret;
 }
 
 /**
@@ -231,22 +232,21 @@ static int mid_read_temp(struct thermal_zone_device *tzd, unsigned long *temp)
  */
 static int configure_adc(int val)
 {
-       int ret;
-       uint8_t data;
-
-       ret = intel_scu_ipc_ioread8(MSIC_THERM_ADC1CNTL1, &data);
-       if (ret)
-               return ret;
-
-       if (val) {
-               /* Enable and start the ADC */
-               data |= (MSIC_ADC_ENBL | MSIC_ADC_START);
-       } else {
-               /* Just stop the ADC */
-               data &= (~MSIC_ADC_START);
-       }
-
-       return intel_scu_ipc_iowrite8(MSIC_THERM_ADC1CNTL1, data);
+	int ret;
+	uint8_t data;
+
+	ret = intel_scu_ipc_ioread8(MSIC_THERM_ADC1CNTL1, &data);
+	if (ret)
+		return ret;
+
+	if (val) {
+		/* Enable and start the ADC */
+		data |= (MSIC_ADC_ENBL | MSIC_ADC_START);
+	} else {
+		/* Just stop the ADC */
+		data &= (~MSIC_ADC_START);
+	}
+	return intel_scu_ipc_iowrite8(MSIC_THERM_ADC1CNTL1, data);
 }
 
 /**
@@ -259,30 +259,30 @@ static int configure_adc(int val)
  */
 static int set_up_therm_channel(u16 base_addr)
 {
-       int ret;
-
-       /* Enable all the sensor channels */
-       ret = intel_scu_ipc_iowrite8(base_addr, SKIN_SENSOR0_CODE);
-       if (ret)
-               return ret;
-
-       ret = intel_scu_ipc_iowrite8(base_addr + 1, SKIN_SENSOR1_CODE);
-       if (ret)
-               return ret;
-
-       ret = intel_scu_ipc_iowrite8(base_addr + 2, SYS_SENSOR_CODE);
-       if (ret)
-               return ret;
-
-       /* Since this is the last channel, set the stop bit
-          to 1 by ORing the DIE_SENSOR_CODE with 0x10 */
-       ret = intel_scu_ipc_iowrite8(base_addr + 3,
-                                       (MSIC_DIE_SENSOR_CODE | 0x10));
-       if (ret)
-               return ret;
-
-       /* Enable ADC and start it */
-       return configure_adc(1);
+	int ret;
+
+	/* Enable all the sensor channels */
+	ret = intel_scu_ipc_iowrite8(base_addr, SKIN_SENSOR0_CODE);
+	if (ret)
+		return ret;
+
+	ret = intel_scu_ipc_iowrite8(base_addr + 1, SKIN_SENSOR1_CODE);
+	if (ret)
+		return ret;
+
+	ret = intel_scu_ipc_iowrite8(base_addr + 2, SYS_SENSOR_CODE);
+	if (ret)
+		return ret;
+
+	/* Since this is the last channel, set the stop bit
+	 * to 1 by ORing the DIE_SENSOR_CODE with 0x10 */
+	ret = intel_scu_ipc_iowrite8(base_addr + 3,
+			(MSIC_DIE_SENSOR_CODE | 0x10));
+	if (ret)
+		return ret;
+
+	/* Enable ADC and start it */
+	return configure_adc(1);
 }
 
 /**
@@ -293,13 +293,13 @@ static int set_up_therm_channel(u16 base_addr)
  */
 static int reset_stopbit(uint16_t addr)
 {
-       int ret;
-       uint8_t data;
-       ret = intel_scu_ipc_ioread8(addr, &data);
-       if (ret)
-               return ret;
-       /* Set the stop bit to zero */
-       return intel_scu_ipc_iowrite8(addr, (data & 0xEF));
+	int ret;
+	uint8_t data;
+	ret = intel_scu_ipc_ioread8(addr, &data);
+	if (ret)
+		return ret;
+	/* Set the stop bit to zero */
+	return intel_scu_ipc_iowrite8(addr, (data & 0xEF));
 }
 
 /**
@@ -317,30 +317,30 @@ static int reset_stopbit(uint16_t addr)
  */
 static int find_free_channel(void)
 {
-       int ret;
-       int i;
-       uint8_t data;
-
-       /* check whether ADC is enabled */
-       ret = intel_scu_ipc_ioread8(MSIC_THERM_ADC1CNTL1, &data);
-       if (ret)
-               return ret;
-
-       if ((data & MSIC_ADC_ENBL) == 0)
-               return 0;
-
-       /* ADC is already enabled; Looking for an empty channel */
-       for (i = 0; i < ADC_CHANLS_MAX; i++) {
-               ret = intel_scu_ipc_ioread8(ADC_CHNL_START_ADDR + i, &data);
-               if (ret)
-                       return ret;
-
-               if (data & MSIC_STOPBIT_MASK) {
-                       ret = i;
-                       break;
-               }
-       }
-       return (ret > ADC_LOOP_MAX) ? (-EINVAL) : ret;
+	int ret;
+	int i;
+	uint8_t data;
+
+	/* check whether ADC is enabled */
+	ret = intel_scu_ipc_ioread8(MSIC_THERM_ADC1CNTL1, &data);
+	if (ret)
+		return ret;
+
+	if ((data & MSIC_ADC_ENBL) == 0)
+		return 0;
+
+	/* ADC is already enabled; Looking for an empty channel */
+	for (i = 0; i < ADC_CHANLS_MAX; i++) {
+		ret = intel_scu_ipc_ioread8(ADC_CHNL_START_ADDR + i, &data);
+		if (ret)
+			return ret;
+
+		if (data & MSIC_STOPBIT_MASK) {
+			ret = i;
+			break;
+		}
+	}
+	return (ret > ADC_LOOP_MAX) ? (-EINVAL) : ret;
 }
 
 /**
@@ -351,48 +351,48 @@ static int find_free_channel(void)
  */
 static int mid_initialize_adc(struct device *dev)
 {
-       u8  data;
-       u16 base_addr;
-       int ret;
-
-       /*
-        * Ensure that adctherm is disabled before we
-        * initialize the ADC
-        */
-       ret = intel_scu_ipc_ioread8(MSIC_THERM_ADC1CNTL3, &data);
-       if (ret)
-               return ret;
-
-       if (data & MSIC_ADCTHERM_MASK)
-               dev_warn(dev, "ADCTHERM already set");
-
-       /* Index of the first channel in which the stop bit is set */
-       channel_index = find_free_channel();
-       if (channel_index < 0) {
-               dev_err(dev, "No free ADC channels");
-               return channel_index;
-       }
-
-       base_addr = ADC_CHNL_START_ADDR + channel_index;
-
-       if (!(channel_index == 0 || channel_index == ADC_LOOP_MAX)) {
-               /* Reset stop bit for channels other than 0 and 12 */
-               ret = reset_stopbit(base_addr);
-               if (ret)
-                       return ret;
-
-               /* Index of the first free channel */
-               base_addr++;
-               channel_index++;
-       }
-
-       ret = set_up_therm_channel(base_addr);
-       if (ret) {
-               dev_err(dev, "unable to enable ADC");
-               return ret;
-       }
-       dev_dbg(dev, "ADC initialization successful");
-       return ret;
+	u8  data;
+	u16 base_addr;
+	int ret;
+
+	/*
+	 * Ensure that adctherm is disabled before we
+	 * initialize the ADC
+	 */
+	ret = intel_scu_ipc_ioread8(MSIC_THERM_ADC1CNTL3, &data);
+	if (ret)
+		return ret;
+
+	if (data & MSIC_ADCTHERM_MASK)
+		dev_warn(dev, "ADCTHERM already set");
+
+	/* Index of the first channel in which the stop bit is set */
+	channel_index = find_free_channel();
+	if (channel_index < 0) {
+		dev_err(dev, "No free ADC channels");
+		return channel_index;
+	}
+
+	base_addr = ADC_CHNL_START_ADDR + channel_index;
+
+	if (!(channel_index == 0 || channel_index == ADC_LOOP_MAX)) {
+		/* Reset stop bit for channels other than 0 and 12 */
+		ret = reset_stopbit(base_addr);
+		if (ret)
+			return ret;
+
+		/* Index of the first free channel */
+		base_addr++;
+		channel_index++;
+	}
+
+	ret = set_up_therm_channel(base_addr);
+	if (ret) {
+		dev_err(dev, "unable to enable ADC");
+		return ret;
+	}
+	dev_dbg(dev, "ADC initialization successful");
+	return ret;
 }
 
 /**
@@ -403,18 +403,18 @@ static int mid_initialize_adc(struct device *dev)
  */
 static struct thermal_device_info *initialize_sensor(int index)
 {
-       struct thermal_device_info *td_info =
-               kzalloc(sizeof(struct thermal_device_info), GFP_KERNEL);
-
-       if (!td_info)
-               return NULL;
-
-       /* Set the base addr of the channel for this sensor */
-       td_info->chnl_addr = ADC_DATA_START_ADDR + 2 * (channel_index + index);
-       /* Sensor 3 is direct conversion */
-       if (index == 3)
-               td_info->direct = 1;
-       return td_info;
+	struct thermal_device_info *td_info =
+		kzalloc(sizeof(struct thermal_device_info), GFP_KERNEL);
+
+	if (!td_info)
+		return NULL;
+
+	/* Set the base addr of the channel for this sensor */
+	td_info->chnl_addr = ADC_DATA_START_ADDR + 2 * (channel_index + index);
+	/* Sensor 3 is direct conversion */
+	if (index == 3)
+		td_info->direct = 1;
+	return td_info;
 }
 
 /**
@@ -425,7 +425,7 @@ static struct thermal_device_info *initialize_sensor(int index)
  */
 static int mid_thermal_resume(struct platform_device *pdev)
 {
-       return mid_initialize_adc(&pdev->dev);
+	return mid_initialize_adc(&pdev->dev);
 }
 
 /**
@@ -437,12 +437,12 @@ static int mid_thermal_resume(struct platform_device *pdev)
  */
 static int mid_thermal_suspend(struct platform_device *pdev, pm_message_t mesg)
 {
-       /*
-        * This just stops the ADC and does not disable it.
-        * temporary workaround until we have a generic ADC driver.
-        * If 0 is passed, it disables the ADC.
-        */
-       return configure_adc(0);
+	/*
+	 * This just stops the ADC and does not disable it.
+	 * temporary workaround until we have a generic ADC driver.
+	 * If 0 is passed, it disables the ADC.
+	 */
+	return configure_adc(0);
 }
 
 /**
@@ -453,16 +453,15 @@ static int mid_thermal_suspend(struct platform_device *pdev, pm_message_t mesg)
  */
 static int read_curr_temp(struct thermal_zone_device *tzd, unsigned long *temp)
 {
-       WARN_ON(tzd == NULL);
-       return mid_read_temp(tzd, temp);
+	WARN_ON(tzd == NULL);
+	return mid_read_temp(tzd, temp);
 }
 
 /* Can't be const */
 static struct thermal_zone_device_ops tzd_ops = {
-       .get_temp = read_curr_temp,
+	.get_temp = read_curr_temp,
 };
 
-
 /**
  * mid_thermal_probe - mfld thermal initialize
  * @pdev: platform device structure
@@ -472,46 +471,45 @@ static struct thermal_zone_device_ops tzd_ops = {
  */
 static int mid_thermal_probe(struct platform_device *pdev)
 {
-       static char *name[MSIC_THERMAL_SENSORS] = {
-               "skin0", "skin1", "sys", "msicdie"
-       };
-
-       int ret;
-       int i;
-       struct platform_info *pinfo;
-
-       pinfo = kzalloc(sizeof(struct platform_info), GFP_KERNEL);
-       if (!pinfo)
-               return -ENOMEM;
-
-       /* Initializing the hardware */
-       ret = mid_initialize_adc(&pdev->dev);
-       if (ret) {
-               dev_err(&pdev->dev, "ADC init failed");
-               kfree(pinfo);
-               return ret;
-       }
-
-       /* Register each sensor with the generic thermal framework*/
-       for (i = 0; i < MSIC_THERMAL_SENSORS; i++) {
-               pinfo->tzd[i] = thermal_zone_device_register(name[i],
-                                       0, initialize_sensor(i),
-                                       &tzd_ops, 0, 0, 0, 0);
-               if (IS_ERR(pinfo->tzd[i]))
-                       goto reg_fail;
-       }
-
-       pinfo->pdev = pdev;
-       platform_set_drvdata(pdev, pinfo);
-       return 0;
+	static char *name[MSIC_THERMAL_SENSORS] = {
+		"skin0", "skin1", "sys", "msicdie"
+	};
+
+	int ret;
+	int i;
+	struct platform_info *pinfo;
+
+	pinfo = kzalloc(sizeof(struct platform_info), GFP_KERNEL);
+	if (!pinfo)
+		return -ENOMEM;
+
+	/* Initializing the hardware */
+	ret = mid_initialize_adc(&pdev->dev);
+	if (ret) {
+		dev_err(&pdev->dev, "ADC init failed");
+		kfree(pinfo);
+		return ret;
+	}
+
+	/* Register each sensor with the generic thermal framework*/
+	for (i = 0; i < MSIC_THERMAL_SENSORS; i++) {
+		pinfo->tzd[i] = thermal_zone_device_register(name[i],
+				0, initialize_sensor(i), &tzd_ops, 0, 0, 0, 0);
+		if (IS_ERR(pinfo->tzd[i]))
+			goto reg_fail;
+	}
+
+	pinfo->pdev = pdev;
+	platform_set_drvdata(pdev, pinfo);
+	return 0;
 
 reg_fail:
-       ret = PTR_ERR(pinfo->tzd[i]);
-       while (--i >= 0)
-               thermal_zone_device_unregister(pinfo->tzd[i]);
-       configure_adc(0);
-       kfree(pinfo);
-       return ret;
+	ret = PTR_ERR(pinfo->tzd[i]);
+	while (--i >= 0)
+		thermal_zone_device_unregister(pinfo->tzd[i]);
+	configure_adc(0);
+	kfree(pinfo);
+	return ret;
 }
 
 /**
@@ -523,49 +521,45 @@ reg_fail:
  */
 static int mid_thermal_remove(struct platform_device *pdev)
 {
-       int i;
-       struct platform_info *pinfo = platform_get_drvdata(pdev);
+	int i;
+	struct platform_info *pinfo = platform_get_drvdata(pdev);
 
-       for (i = 0; i < MSIC_THERMAL_SENSORS; i++)
-               thermal_zone_device_unregister(pinfo->tzd[i]);
+	for (i = 0; i < MSIC_THERMAL_SENSORS; i++)
+		thermal_zone_device_unregister(pinfo->tzd[i]);
 
-       platform_set_drvdata(pdev, NULL);
+	platform_set_drvdata(pdev, NULL);
 
-       /* Stop the ADC */
-       return configure_adc(0);
+	/* Stop the ADC */
+	return configure_adc(0);
 }
 
-/*********************************************************************
- *             Driver initialisation and finalization
- *********************************************************************/
-
 #define DRIVER_NAME "msic_sensor"
 
 static const struct platform_device_id therm_id_table[] = {
-       { DRIVER_NAME, 1 },
-       { }
+	{ DRIVER_NAME, 1 },
+	{ }
 };
 
 static struct platform_driver mid_thermal_driver = {
-       .driver = {
-               .name = DRIVER_NAME,
-               .owner = THIS_MODULE,
-       },
-       .probe = mid_thermal_probe,
-       .suspend = mid_thermal_suspend,
-       .resume = mid_thermal_resume,
-       .remove = __devexit_p(mid_thermal_remove),
-       .id_table = therm_id_table,
+	.driver = {
+		.name = DRIVER_NAME,
+		.owner = THIS_MODULE,
+	},
+	.probe = mid_thermal_probe,
+	.suspend = mid_thermal_suspend,
+	.resume = mid_thermal_resume,
+	.remove = __devexit_p(mid_thermal_remove),
+	.id_table = therm_id_table,
 };
 
 static int __init mid_thermal_module_init(void)
 {
-       return platform_driver_register(&mid_thermal_driver);
+	return platform_driver_register(&mid_thermal_driver);
 }
 
 static void __exit mid_thermal_module_exit(void)
 {
-       platform_driver_unregister(&mid_thermal_driver);
+	platform_driver_unregister(&mid_thermal_driver);
 }
 
 module_init(mid_thermal_module_init);
-- 
cgit v1.2.3-70-g09d2


From 112a6ee053f9e9f014ab64f2549d3a25551aa349 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 4 Apr 2011 10:06:24 -0700
Subject: thinkpad_acpi: Correct !CONFIG_THINKPAD_ACPI_VIDEO warning

Move TPACPI_HANDLE declaration into #ifdef block
and neaten it a bit.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/thinkpad_acpi.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c
index 562fcf0dd2b..7693e89459f 100644
--- a/drivers/platform/x86/thinkpad_acpi.c
+++ b/drivers/platform/x86/thinkpad_acpi.c
@@ -535,15 +535,6 @@ TPACPI_HANDLE(hkey, ec, "\\_SB.HKEY",	/* 600e/x, 770e, 770x */
 	   "HKEY",		/* all others */
 	   );			/* 570 */
 
-TPACPI_HANDLE(vid, root, "\\_SB.PCI.AGP.VGA",	/* 570 */
-	   "\\_SB.PCI0.AGP0.VID0",	/* 600e/x, 770x */
-	   "\\_SB.PCI0.VID0",	/* 770e */
-	   "\\_SB.PCI0.VID",	/* A21e, G4x, R50e, X30, X40 */
-	   "\\_SB.PCI0.AGP.VGA",	/* X100e and a few others */
-	   "\\_SB.PCI0.AGP.VID",	/* all others */
-	   );				/* R30, R31 */
-
-
 /*************************************************************************
  * ACPI helpers
  */
@@ -4444,6 +4435,15 @@ static int video_orig_autosw;
 static int video_autosw_get(void);
 static int video_autosw_set(int enable);
 
+TPACPI_HANDLE(vid, root,
+	      "\\_SB.PCI.AGP.VGA",	/* 570 */
+	      "\\_SB.PCI0.AGP0.VID0",	/* 600e/x, 770x */
+	      "\\_SB.PCI0.VID0",	/* 770e */
+	      "\\_SB.PCI0.VID",		/* A21e, G4x, R50e, X30, X40 */
+	      "\\_SB.PCI0.AGP.VGA",	/* X100e and a few others */
+	      "\\_SB.PCI0.AGP.VID",	/* all others */
+	);				/* R30, R31 */
+
 TPACPI_HANDLE(vid2, root, "\\_SB.PCI0.AGPB.VID");	/* G41 */
 
 static int __init video_init(struct ibm_init_struct *iibm)
-- 
cgit v1.2.3-70-g09d2


From 0978e012cfbaca8bd312933e98cdea2d11778e11 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 4 Apr 2011 10:06:25 -0700
Subject: thinkpad_acpi: Convert printks to pr_<level>

Add pr_fmt.
Removed local TPACPI_<level> #defines, convert to pr_<level>.
Neaten dbg_<foo> macros.
Added a few missing newlines to logging messages.
Added static inline str_supported for !CONFIG_THINKPAD_ACPI_DEBUG vdbg_printk
defect reported by Sedat Dilek <sedat.dilek@googlemail.com>.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/thinkpad_acpi.c | 472 ++++++++++++++---------------------
 1 file changed, 188 insertions(+), 284 deletions(-)

diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c
index 7693e89459f..77f6e707a2a 100644
--- a/drivers/platform/x86/thinkpad_acpi.c
+++ b/drivers/platform/x86/thinkpad_acpi.c
@@ -21,6 +21,8 @@
  *  02110-1301, USA.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #define TPACPI_VERSION "0.24"
 #define TPACPI_SYSFS_VERSION 0x020700
 
@@ -224,17 +226,6 @@ enum tpacpi_hkey_event_t {
 
 #define TPACPI_MAX_ACPI_ARGS 3
 
-/* printk headers */
-#define TPACPI_LOG TPACPI_FILE ": "
-#define TPACPI_EMERG	KERN_EMERG	TPACPI_LOG
-#define TPACPI_ALERT	KERN_ALERT	TPACPI_LOG
-#define TPACPI_CRIT	KERN_CRIT	TPACPI_LOG
-#define TPACPI_ERR	KERN_ERR	TPACPI_LOG
-#define TPACPI_WARN	KERN_WARNING	TPACPI_LOG
-#define TPACPI_NOTICE	KERN_NOTICE	TPACPI_LOG
-#define TPACPI_INFO	KERN_INFO	TPACPI_LOG
-#define TPACPI_DEBUG	KERN_DEBUG	TPACPI_LOG
-
 /* Debugging printk groups */
 #define TPACPI_DBG_ALL		0xffff
 #define TPACPI_DBG_DISCLOSETASK	0x8000
@@ -389,34 +380,36 @@ static int tpacpi_uwb_emulstate;
  *  Debugging helpers
  */
 
-#define dbg_printk(a_dbg_level, format, arg...) \
-	do { if (dbg_level & (a_dbg_level)) \
-		printk(TPACPI_DEBUG "%s: " format, __func__ , ## arg); \
-	} while (0)
+#define dbg_printk(a_dbg_level, format, arg...)				\
+do {									\
+	if (dbg_level & (a_dbg_level))					\
+		printk(KERN_DEBUG pr_fmt("%s: " format),		\
+		       __func__, ##arg);				\
+} while (0)
 
 #ifdef CONFIG_THINKPAD_ACPI_DEBUG
 #define vdbg_printk dbg_printk
 static const char *str_supported(int is_supported);
 #else
-#define vdbg_printk(a_dbg_level, format, arg...) \
-	do { } while (0)
+static inline const char *str_supported(int is_supported) { return ""; }
+#define vdbg_printk(a_dbg_level, format, arg...)	\
+	no_printk(format, ##arg)
 #endif
 
 static void tpacpi_log_usertask(const char * const what)
 {
-	printk(TPACPI_DEBUG "%s: access by process with PID %d\n",
-		what, task_tgid_vnr(current));
+	printk(KERN_DEBUG pr_fmt("%s: access by process with PID %d\n"),
+	       what, task_tgid_vnr(current));
 }
 
-#define tpacpi_disclose_usertask(what, format, arg...) \
-	do { \
-		if (unlikely( \
-		    (dbg_level & TPACPI_DBG_DISCLOSETASK) && \
-		    (tpacpi_lifecycle == TPACPI_LIFE_RUNNING))) { \
-			printk(TPACPI_DEBUG "%s: PID %d: " format, \
-				what, task_tgid_vnr(current), ## arg); \
-		} \
-	} while (0)
+#define tpacpi_disclose_usertask(what, format, arg...)			\
+do {									\
+	if (unlikely((dbg_level & TPACPI_DBG_DISCLOSETASK) &&		\
+		     (tpacpi_lifecycle == TPACPI_LIFE_RUNNING))) {	\
+		printk(KERN_DEBUG pr_fmt("%s: PID %d: " format),	\
+		       what, task_tgid_vnr(current), ## arg);		\
+	}								\
+} while (0)
 
 /*
  * Quirk handling helpers
@@ -554,7 +547,7 @@ static int acpi_evalf(acpi_handle handle,
 	int quiet;
 
 	if (!*fmt) {
-		printk(TPACPI_ERR "acpi_evalf() called with empty format\n");
+		pr_err("acpi_evalf() called with empty format\n");
 		return 0;
 	}
 
@@ -579,7 +572,7 @@ static int acpi_evalf(acpi_handle handle,
 			break;
 			/* add more types as needed */
 		default:
-			printk(TPACPI_ERR "acpi_evalf() called "
+			pr_err("acpi_evalf() called "
 			       "with invalid format character '%c'\n", c);
 			va_end(ap);
 			return 0;
@@ -608,13 +601,13 @@ static int acpi_evalf(acpi_handle handle,
 		break;
 		/* add more types as needed */
 	default:
-		printk(TPACPI_ERR "acpi_evalf() called "
+		pr_err("acpi_evalf() called "
 		       "with invalid format character '%c'\n", res_type);
 		return 0;
 	}
 
 	if (!success && !quiet)
-		printk(TPACPI_ERR "acpi_evalf(%s, %s, ...) failed: %s\n",
+		pr_err("acpi_evalf(%s, %s, ...) failed: %s\n",
 		       method, fmt0, acpi_format_exception(status));
 
 	return success;
@@ -758,8 +751,7 @@ static int __init setup_acpi_notify(struct ibm_struct *ibm)
 
 	rc = acpi_bus_get_device(*ibm->acpi->handle, &ibm->acpi->device);
 	if (rc < 0) {
-		printk(TPACPI_ERR "acpi_bus_get_device(%s) failed: %d\n",
-			ibm->name, rc);
+		pr_err("acpi_bus_get_device(%s) failed: %d\n", ibm->name, rc);
 		return -ENODEV;
 	}
 
@@ -772,12 +764,10 @@ static int __init setup_acpi_notify(struct ibm_struct *ibm)
 			ibm->acpi->type, dispatch_acpi_notify, ibm);
 	if (ACPI_FAILURE(status)) {
 		if (status == AE_ALREADY_EXISTS) {
-			printk(TPACPI_NOTICE
-			       "another device driver is already "
-			       "handling %s events\n", ibm->name);
+			pr_notice("another device driver is already "
+				  "handling %s events\n", ibm->name);
 		} else {
-			printk(TPACPI_ERR
-			       "acpi_install_notify_handler(%s) failed: %s\n",
+			pr_err("acpi_install_notify_handler(%s) failed: %s\n",
 			       ibm->name, acpi_format_exception(status));
 		}
 		return -ENODEV;
@@ -802,8 +792,7 @@ static int __init register_tpacpi_subdriver(struct ibm_struct *ibm)
 
 	ibm->acpi->driver = kzalloc(sizeof(struct acpi_driver), GFP_KERNEL);
 	if (!ibm->acpi->driver) {
-		printk(TPACPI_ERR
-		       "failed to allocate memory for ibm->acpi->driver\n");
+		pr_err("failed to allocate memory for ibm->acpi->driver\n");
 		return -ENOMEM;
 	}
 
@@ -814,7 +803,7 @@ static int __init register_tpacpi_subdriver(struct ibm_struct *ibm)
 
 	rc = acpi_bus_register_driver(ibm->acpi->driver);
 	if (rc < 0) {
-		printk(TPACPI_ERR "acpi_bus_register_driver(%s) failed: %d\n",
+		pr_err("acpi_bus_register_driver(%s) failed: %d\n",
 		       ibm->name, rc);
 		kfree(ibm->acpi->driver);
 		ibm->acpi->driver = NULL;
@@ -1072,15 +1061,14 @@ static int parse_strtoul(const char *buf,
 static void tpacpi_disable_brightness_delay(void)
 {
 	if (acpi_evalf(hkey_handle, NULL, "PWMS", "qvd", 0))
-		printk(TPACPI_NOTICE
-			"ACPI backlight control delay disabled\n");
+		pr_notice("ACPI backlight control delay disabled\n");
 }
 
 static void printk_deprecated_attribute(const char * const what,
 					const char * const details)
 {
 	tpacpi_log_usertask("deprecated sysfs attribute");
-	printk(TPACPI_WARN "WARNING: sysfs attribute %s is deprecated and "
+	pr_warn("WARNING: sysfs attribute %s is deprecated and "
 		"will be removed. %s\n",
 		what, details);
 }
@@ -1255,8 +1243,7 @@ static int __init tpacpi_new_rfkill(const enum tpacpi_rfk_id id,
 						&tpacpi_rfk_rfkill_ops,
 						atp_rfk);
 	if (!atp_rfk || !atp_rfk->rfkill) {
-		printk(TPACPI_ERR
-			"failed to allocate memory for rfkill class\n");
+		pr_err("failed to allocate memory for rfkill class\n");
 		kfree(atp_rfk);
 		return -ENOMEM;
 	}
@@ -1266,9 +1253,8 @@ static int __init tpacpi_new_rfkill(const enum tpacpi_rfk_id id,
 
 	sw_status = (tp_rfkops->get_status)();
 	if (sw_status < 0) {
-		printk(TPACPI_ERR
-			"failed to read initial state for %s, error %d\n",
-			name, sw_status);
+		pr_err("failed to read initial state for %s, error %d\n",
+		       name, sw_status);
 	} else {
 		sw_state = (sw_status == TPACPI_RFK_RADIO_OFF);
 		if (set_default) {
@@ -1282,9 +1268,7 @@ static int __init tpacpi_new_rfkill(const enum tpacpi_rfk_id id,
 
 	res = rfkill_register(atp_rfk->rfkill);
 	if (res < 0) {
-		printk(TPACPI_ERR
-			"failed to register %s rfkill switch: %d\n",
-			name, res);
+		pr_err("failed to register %s rfkill switch: %d\n", name, res);
 		rfkill_destroy(atp_rfk->rfkill);
 		kfree(atp_rfk);
 		return res;
@@ -1292,7 +1276,7 @@ static int __init tpacpi_new_rfkill(const enum tpacpi_rfk_id id,
 
 	tpacpi_rfkill_switches[id] = atp_rfk;
 
-	printk(TPACPI_INFO "rfkill switch %s: radio is %sblocked\n",
+	pr_info("rfkill switch %s: radio is %sblocked\n",
 		name, (sw_state || hw_state) ? "" : "un");
 	return 0;
 }
@@ -1816,10 +1800,8 @@ static void __init tpacpi_check_outdated_fw(void)
 		 * broken, or really stable to begin with, so it is
 		 * best if the user upgrades the firmware anyway.
 		 */
-		printk(TPACPI_WARN
-			"WARNING: Outdated ThinkPad BIOS/EC firmware\n");
-		printk(TPACPI_WARN
-			"WARNING: This firmware may be missing critical bug "
+		pr_warn("WARNING: Outdated ThinkPad BIOS/EC firmware\n");
+		pr_warn("WARNING: This firmware may be missing critical bug "
 			"fixes and/or important features\n");
 	}
 }
@@ -2108,9 +2090,7 @@ void static hotkey_mask_warn_incomplete_mask(void)
 		(hotkey_all_mask | TPACPI_HKEY_NVRAM_KNOWN_MASK);
 
 	if (wantedmask)
-		printk(TPACPI_NOTICE
-			"required events 0x%08x not enabled!\n",
-			wantedmask);
+		pr_notice("required events 0x%08x not enabled!\n", wantedmask);
 }
 
 /*
@@ -2148,10 +2128,9 @@ static int hotkey_mask_set(u32 mask)
 	 * a given event.
 	 */
 	if (!hotkey_mask_get() && !rc && (fwmask & ~hotkey_acpi_mask)) {
-		printk(TPACPI_NOTICE
-		       "asked for hotkey mask 0x%08x, but "
-		       "firmware forced it to 0x%08x\n",
-		       fwmask, hotkey_acpi_mask);
+		pr_notice("asked for hotkey mask 0x%08x, but "
+			  "firmware forced it to 0x%08x\n",
+			  fwmask, hotkey_acpi_mask);
 	}
 
 	if (tpacpi_lifecycle != TPACPI_LIFE_EXITING)
@@ -2175,13 +2154,11 @@ static int hotkey_user_mask_set(const u32 mask)
 	    (mask == 0xffff || mask == 0xffffff ||
 	     mask == 0xffffffff)) {
 		tp_warned.hotkey_mask_ff = 1;
-		printk(TPACPI_NOTICE
-		       "setting the hotkey mask to 0x%08x is likely "
-		       "not the best way to go about it\n", mask);
-		printk(TPACPI_NOTICE
-		       "please consider using the driver defaults, "
-		       "and refer to up-to-date thinkpad-acpi "
-		       "documentation\n");
+		pr_notice("setting the hotkey mask to 0x%08x is likely "
+			  "not the best way to go about it\n", mask);
+		pr_notice("please consider using the driver defaults, "
+			  "and refer to up-to-date thinkpad-acpi "
+			  "documentation\n");
 	}
 
 	/* Try to enable what the user asked for, plus whatever we need.
@@ -2565,8 +2542,7 @@ static void hotkey_poll_setup(const bool may_warn)
 					NULL, TPACPI_NVRAM_KTHREAD_NAME);
 			if (IS_ERR(tpacpi_hotkey_task)) {
 				tpacpi_hotkey_task = NULL;
-				printk(TPACPI_ERR
-				       "could not create kernel thread "
+				pr_err("could not create kernel thread "
 				       "for hotkey polling\n");
 			}
 		}
@@ -2574,11 +2550,10 @@ static void hotkey_poll_setup(const bool may_warn)
 		hotkey_poll_stop_sync();
 		if (may_warn && (poll_driver_mask || poll_user_mask) &&
 		    hotkey_poll_freq == 0) {
-			printk(TPACPI_NOTICE
-				"hot keys 0x%08x and/or events 0x%08x "
-				"require polling, which is currently "
-				"disabled\n",
-				poll_user_mask, poll_driver_mask);
+			pr_notice("hot keys 0x%08x and/or events 0x%08x "
+				  "require polling, which is currently "
+				  "disabled\n",
+				  poll_user_mask, poll_driver_mask);
 		}
 	}
 }
@@ -2802,13 +2777,13 @@ static ssize_t hotkey_source_mask_store(struct device *dev,
 	mutex_unlock(&hotkey_mutex);
 
 	if (rc < 0)
-		printk(TPACPI_ERR "hotkey_source_mask: failed to update the"
-			"firmware event mask!\n");
+		pr_err("hotkey_source_mask: "
+		       "failed to update the firmware event mask!\n");
 
 	if (r_ev)
-		printk(TPACPI_NOTICE "hotkey_source_mask: "
-			"some important events were disabled: "
-			"0x%04x\n", r_ev);
+		pr_notice("hotkey_source_mask: "
+			  "some important events were disabled: 0x%04x\n",
+			  r_ev);
 
 	tpacpi_disclose_usertask("hotkey_source_mask", "set to 0x%08lx\n", t);
 
@@ -3039,8 +3014,7 @@ static void hotkey_exit(void)
 	if (((tp_features.hotkey_mask &&
 	      hotkey_mask_set(hotkey_orig_mask)) |
 	     hotkey_status_set(false)) != 0)
-		printk(TPACPI_ERR
-		       "failed to restore hot key mask "
+		pr_err("failed to restore hot key mask "
 		       "to BIOS defaults\n");
 }
 
@@ -3279,10 +3253,9 @@ static int __init hotkey_init(struct ibm_init_struct *iibm)
 	   for HKEY interface version 0x100 */
 	if (acpi_evalf(hkey_handle, &hkeyv, "MHKV", "qd")) {
 		if ((hkeyv >> 8) != 1) {
-			printk(TPACPI_ERR "unknown version of the "
-			       "HKEY interface: 0x%x\n", hkeyv);
-			printk(TPACPI_ERR "please report this to %s\n",
-			       TPACPI_MAIL);
+			pr_err("unknown version of the HKEY interface: 0x%x\n",
+			       hkeyv);
+			pr_err("please report this to %s\n", TPACPI_MAIL);
 		} else {
 			/*
 			 * MHKV 0x100 in A31, R40, R40e,
@@ -3295,8 +3268,7 @@ static int __init hotkey_init(struct ibm_init_struct *iibm)
 			/* Paranoia check AND init hotkey_all_mask */
 			if (!acpi_evalf(hkey_handle, &hotkey_all_mask,
 					"MHKA", "qd")) {
-				printk(TPACPI_ERR
-				       "missing MHKA handler, "
+				pr_err("missing MHKA handler, "
 				       "please report this to %s\n",
 				       TPACPI_MAIL);
 				/* Fallback: pre-init for FN+F3,F4,F12 */
@@ -3334,16 +3306,14 @@ static int __init hotkey_init(struct ibm_init_struct *iibm)
 	if (dbg_wlswemul) {
 		tp_features.hotkey_wlsw = 1;
 		radiosw_state = !!tpacpi_wlsw_emulstate;
-		printk(TPACPI_INFO
-			"radio switch emulation enabled\n");
+		pr_info("radio switch emulation enabled\n");
 	} else
 #endif
 	/* Not all thinkpads have a hardware radio switch */
 	if (acpi_evalf(hkey_handle, &status, "WLSW", "qd")) {
 		tp_features.hotkey_wlsw = 1;
 		radiosw_state = !!status;
-		printk(TPACPI_INFO
-			"radio switch found; radios are %s\n",
+		pr_info("radio switch found; radios are %s\n",
 			enabled(status, 0));
 	}
 	if (tp_features.hotkey_wlsw)
@@ -3354,8 +3324,7 @@ static int __init hotkey_init(struct ibm_init_struct *iibm)
 	if (!res && acpi_evalf(hkey_handle, &status, "MHKG", "qd")) {
 		tp_features.hotkey_tablet = 1;
 		tabletsw_state = !!(status & TP_HOTKEY_TABLET_MASK);
-		printk(TPACPI_INFO
-			"possible tablet mode switch found; "
+		pr_info("possible tablet mode switch found; "
 			"ThinkPad in %s mode\n",
 			(tabletsw_state) ? "tablet" : "laptop");
 		res = add_to_attr_set(hotkey_dev_attributes,
@@ -3373,8 +3342,7 @@ static int __init hotkey_init(struct ibm_init_struct *iibm)
 	hotkey_keycode_map = kmalloc(TPACPI_HOTKEY_MAP_SIZE,
 					GFP_KERNEL);
 	if (!hotkey_keycode_map) {
-		printk(TPACPI_ERR
-			"failed to allocate memory for key map\n");
+		pr_err("failed to allocate memory for key map\n");
 		res = -ENOMEM;
 		goto err_exit;
 	}
@@ -3417,13 +3385,11 @@ static int __init hotkey_init(struct ibm_init_struct *iibm)
 	 * userspace. tpacpi_detect_brightness_capabilities() must have
 	 * been called before this point  */
 	if (tp_features.bright_acpimode && acpi_video_backlight_support()) {
-		printk(TPACPI_INFO
-		       "This ThinkPad has standard ACPI backlight "
-		       "brightness control, supported by the ACPI "
-		       "video driver\n");
-		printk(TPACPI_NOTICE
-		       "Disabling thinkpad-acpi brightness events "
-		       "by default...\n");
+		pr_info("This ThinkPad has standard ACPI backlight "
+			"brightness control, supported by the ACPI "
+			"video driver\n");
+		pr_notice("Disabling thinkpad-acpi brightness events "
+			  "by default...\n");
 
 		/* Disable brightness up/down on Lenovo thinkpads when
 		 * ACPI is handling them, otherwise it is plain impossible
@@ -3530,8 +3496,7 @@ static bool hotkey_notify_wakeup(const u32 hkey,
 
 	case TP_HKEY_EV_WKUP_S3_BATLOW: /* Battery on critical low level/S3 */
 	case TP_HKEY_EV_WKUP_S4_BATLOW: /* Battery on critical low level/S4 */
-		printk(TPACPI_ALERT
-			"EMERGENCY WAKEUP: battery almost empty\n");
+		pr_alert("EMERGENCY WAKEUP: battery almost empty\n");
 		/* how to auto-heal: */
 		/* 2313: woke up from S3, go to S4/S5 */
 		/* 2413: woke up from S4, go to S5 */
@@ -3542,9 +3507,7 @@ static bool hotkey_notify_wakeup(const u32 hkey,
 	}
 
 	if (hotkey_wakeup_reason != TP_ACPI_WAKEUP_NONE) {
-		printk(TPACPI_INFO
-		       "woke up due to a hot-unplug "
-		       "request...\n");
+		pr_info("woke up due to a hot-unplug request...\n");
 		hotkey_wakeup_reason_notify_change();
 	}
 	return true;
@@ -3596,37 +3559,31 @@ static bool hotkey_notify_thermal(const u32 hkey,
 
 	switch (hkey) {
 	case TP_HKEY_EV_THM_TABLE_CHANGED:
-		printk(TPACPI_INFO
-			"EC reports that Thermal Table has changed\n");
+		pr_info("EC reports that Thermal Table has changed\n");
 		/* recommended action: do nothing, we don't have
 		 * Lenovo ATM information */
 		return true;
 	case TP_HKEY_EV_ALARM_BAT_HOT:
-		printk(TPACPI_CRIT
-			"THERMAL ALARM: battery is too hot!\n");
+		pr_crit("THERMAL ALARM: battery is too hot!\n");
 		/* recommended action: warn user through gui */
 		break;
 	case TP_HKEY_EV_ALARM_BAT_XHOT:
-		printk(TPACPI_ALERT
-			"THERMAL EMERGENCY: battery is extremely hot!\n");
+		pr_alert("THERMAL EMERGENCY: battery is extremely hot!\n");
 		/* recommended action: immediate sleep/hibernate */
 		break;
 	case TP_HKEY_EV_ALARM_SENSOR_HOT:
-		printk(TPACPI_CRIT
-			"THERMAL ALARM: "
+		pr_crit("THERMAL ALARM: "
 			"a sensor reports something is too hot!\n");
 		/* recommended action: warn user through gui, that */
 		/* some internal component is too hot */
 		break;
 	case TP_HKEY_EV_ALARM_SENSOR_XHOT:
-		printk(TPACPI_ALERT
-			"THERMAL EMERGENCY: "
-			"a sensor reports something is extremely hot!\n");
+		pr_alert("THERMAL EMERGENCY: "
+			 "a sensor reports something is extremely hot!\n");
 		/* recommended action: immediate sleep/hibernate */
 		break;
 	default:
-		printk(TPACPI_ALERT
-			 "THERMAL ALERT: unknown thermal alarm received\n");
+		pr_alert("THERMAL ALERT: unknown thermal alarm received\n");
 		known = false;
 	}
 
@@ -3643,8 +3600,7 @@ static void hotkey_notify(struct ibm_struct *ibm, u32 event)
 	bool known_ev;
 
 	if (event != 0x80) {
-		printk(TPACPI_ERR
-		       "unknown HKEY notification event %d\n", event);
+		pr_err("unknown HKEY notification event %d\n", event);
 		/* forward it to userspace, maybe it knows how to handle it */
 		acpi_bus_generate_netlink_event(
 					ibm->acpi->device->pnp.device_class,
@@ -3655,7 +3611,7 @@ static void hotkey_notify(struct ibm_struct *ibm, u32 event)
 
 	while (1) {
 		if (!acpi_evalf(hkey_handle, &hkey, "MHKP", "d")) {
-			printk(TPACPI_ERR "failed to retrieve HKEY event\n");
+			pr_err("failed to retrieve HKEY event\n");
 			return;
 		}
 
@@ -3683,8 +3639,7 @@ static void hotkey_notify(struct ibm_struct *ibm, u32 event)
 			switch (hkey) {
 			case TP_HKEY_EV_BAYEJ_ACK:
 				hotkey_autosleep_ack = 1;
-				printk(TPACPI_INFO
-				       "bay ejected\n");
+				pr_info("bay ejected\n");
 				hotkey_wakeup_hotunplug_complete_notify_change();
 				known_ev = true;
 				break;
@@ -3700,8 +3655,7 @@ static void hotkey_notify(struct ibm_struct *ibm, u32 event)
 			/* 0x4000-0x4FFF: dock-related wakeups */
 			if (hkey == TP_HKEY_EV_UNDOCK_ACK) {
 				hotkey_autosleep_ack = 1;
-				printk(TPACPI_INFO
-				       "undocked\n");
+				pr_info("undocked\n");
 				hotkey_wakeup_hotunplug_complete_notify_change();
 				known_ev = true;
 			} else {
@@ -3732,11 +3686,9 @@ static void hotkey_notify(struct ibm_struct *ibm, u32 event)
 			known_ev = false;
 		}
 		if (!known_ev) {
-			printk(TPACPI_NOTICE
-			       "unhandled HKEY event 0x%04x\n", hkey);
-			printk(TPACPI_NOTICE
-			       "please report the conditions when this "
-			       "event happened to %s\n", TPACPI_MAIL);
+			pr_notice("unhandled HKEY event 0x%04x\n", hkey);
+			pr_notice("please report the conditions when this "
+				  "event happened to %s\n", TPACPI_MAIL);
 		}
 
 		/* Legacy events */
@@ -3769,8 +3721,7 @@ static void hotkey_resume(void)
 
 	if (hotkey_status_set(true) < 0 ||
 	    hotkey_mask_set(hotkey_acpi_mask) < 0)
-		printk(TPACPI_ERR
-		       "error while attempting to reset the event "
+		pr_err("error while attempting to reset the event "
 		       "firmware interface\n");
 
 	tpacpi_send_radiosw_update();
@@ -3815,14 +3766,12 @@ static void hotkey_enabledisable_warn(bool enable)
 {
 	tpacpi_log_usertask("procfs hotkey enable/disable");
 	if (!WARN((tpacpi_lifecycle == TPACPI_LIFE_RUNNING || !enable),
-			TPACPI_WARN
-			"hotkey enable/disable functionality has been "
-			"removed from the driver.  Hotkeys are always "
-			"enabled\n"))
-		printk(TPACPI_ERR
-			"Please remove the hotkey=enable module "
-			"parameter, it is deprecated.  Hotkeys are always "
-			"enabled\n");
+		  pr_fmt("hotkey enable/disable functionality has been "
+			 "removed from the driver.  "
+			 "Hotkeys are always enabled.\n")))
+		pr_err("Please remove the hotkey=enable module "
+		       "parameter, it is deprecated.  "
+		       "Hotkeys are always enabled.\n");
 }
 
 static int hotkey_write(char *buf)
@@ -4002,8 +3951,7 @@ static void bluetooth_shutdown(void)
 	/* Order firmware to save current state to NVRAM */
 	if (!acpi_evalf(NULL, NULL, "\\BLTH", "vd",
 			TP_ACPI_BLTH_SAVE_STATE))
-		printk(TPACPI_NOTICE
-			"failed to save bluetooth state to NVRAM\n");
+		pr_notice("failed to save bluetooth state to NVRAM\n");
 	else
 		vdbg_printk(TPACPI_DBG_RFKILL,
 			"bluestooth state saved to NVRAM\n");
@@ -4042,8 +3990,7 @@ static int __init bluetooth_init(struct ibm_init_struct *iibm)
 #ifdef CONFIG_THINKPAD_ACPI_DEBUGFACILITIES
 	if (dbg_bluetoothemul) {
 		tp_features.bluetooth = 1;
-		printk(TPACPI_INFO
-			"bluetooth switch emulation enabled\n");
+		pr_info("bluetooth switch emulation enabled\n");
 	} else
 #endif
 	if (tp_features.bluetooth &&
@@ -4194,8 +4141,7 @@ static void wan_shutdown(void)
 	/* Order firmware to save current state to NVRAM */
 	if (!acpi_evalf(NULL, NULL, "\\WGSV", "vd",
 			TP_ACPI_WGSV_SAVE_STATE))
-		printk(TPACPI_NOTICE
-			"failed to save WWAN state to NVRAM\n");
+		pr_notice("failed to save WWAN state to NVRAM\n");
 	else
 		vdbg_printk(TPACPI_DBG_RFKILL,
 			"WWAN state saved to NVRAM\n");
@@ -4232,8 +4178,7 @@ static int __init wan_init(struct ibm_init_struct *iibm)
 #ifdef CONFIG_THINKPAD_ACPI_DEBUGFACILITIES
 	if (dbg_wwanemul) {
 		tp_features.wan = 1;
-		printk(TPACPI_INFO
-			"wwan switch emulation enabled\n");
+		pr_info("wwan switch emulation enabled\n");
 	} else
 #endif
 	if (tp_features.wan &&
@@ -4373,8 +4318,7 @@ static int __init uwb_init(struct ibm_init_struct *iibm)
 #ifdef CONFIG_THINKPAD_ACPI_DEBUGFACILITIES
 	if (dbg_uwbemul) {
 		tp_features.uwb = 1;
-		printk(TPACPI_INFO
-			"uwb switch emulation enabled\n");
+		pr_info("uwb switch emulation enabled\n");
 	} else
 #endif
 	if (tp_features.uwb &&
@@ -4487,7 +4431,7 @@ static void video_exit(void)
 	dbg_printk(TPACPI_DBG_EXIT,
 		   "restoring original video autoswitch mode\n");
 	if (video_autosw_set(video_orig_autosw))
-		printk(TPACPI_ERR "error while trying to restore original "
+		pr_err("error while trying to restore original "
 			"video autoswitch mode\n");
 }
 
@@ -4560,8 +4504,7 @@ static int video_outputsw_set(int status)
 		res = acpi_evalf(vid_handle, NULL,
 				 "ASWT", "vdd", status * 0x100, 0);
 		if (!autosw && video_autosw_set(autosw)) {
-			printk(TPACPI_ERR
-			       "video auto-switch left enabled due to error\n");
+			pr_err("video auto-switch left enabled due to error\n");
 			return -EIO;
 		}
 		break;
@@ -4630,8 +4573,7 @@ static int video_outputsw_cycle(void)
 		return -ENOSYS;
 	}
 	if (!autosw && video_autosw_set(autosw)) {
-		printk(TPACPI_ERR
-		       "video auto-switch left enabled due to error\n");
+		pr_err("video auto-switch left enabled due to error\n");
 		return -EIO;
 	}
 
@@ -5348,7 +5290,7 @@ static int __init led_init(struct ibm_init_struct *iibm)
 	tpacpi_leds = kzalloc(sizeof(*tpacpi_leds) * TPACPI_LED_NUMLEDS,
 			      GFP_KERNEL);
 	if (!tpacpi_leds) {
-		printk(TPACPI_ERR "Out of memory for LED data\n");
+		pr_err("Out of memory for LED data\n");
 		return -ENOMEM;
 	}
 
@@ -5367,9 +5309,8 @@ static int __init led_init(struct ibm_init_struct *iibm)
 	}
 
 #ifdef CONFIG_THINKPAD_ACPI_UNSAFE_LEDS
-	printk(TPACPI_NOTICE
-		"warning: userspace override of important "
-		"firmware LEDs is enabled\n");
+	pr_notice("warning: userspace override of important "
+		  "firmware LEDs is enabled\n");
 #endif
 	return 0;
 }
@@ -5639,17 +5580,16 @@ static void thermal_dump_all_sensors(void)
 	if (n <= 0)
 		return;
 
-	printk(TPACPI_NOTICE
-		"temperatures (Celsius):");
+	pr_notice("temperatures (Celsius):");
 
 	for (i = 0; i < n; i++) {
 		if (t.temp[i] != TPACPI_THERMAL_SENSOR_NA)
-			printk(KERN_CONT " %d", (int)(t.temp[i] / 1000));
+			pr_cont(" %d", (int)(t.temp[i] / 1000));
 		else
-			printk(KERN_CONT " N/A");
+			pr_cont(" N/A");
 	}
 
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 }
 
 /* sysfs temp##_input -------------------------------------------------- */
@@ -5769,14 +5709,12 @@ static int __init thermal_init(struct ibm_init_struct *iibm)
 		if (ta1 == 0) {
 			/* This is sheer paranoia, but we handle it anyway */
 			if (acpi_tmp7) {
-				printk(TPACPI_ERR
-				       "ThinkPad ACPI EC access misbehaving, "
+				pr_err("ThinkPad ACPI EC access misbehaving, "
 				       "falling back to ACPI TMPx access "
 				       "mode\n");
 				thermal_read_mode = TPACPI_THERMAL_ACPI_TMP07;
 			} else {
-				printk(TPACPI_ERR
-				       "ThinkPad ACPI EC access misbehaving, "
+				pr_err("ThinkPad ACPI EC access misbehaving, "
 				       "disabling thermal sensors access\n");
 				thermal_read_mode = TPACPI_THERMAL_NONE;
 			}
@@ -6129,8 +6067,8 @@ static int __init tpacpi_query_bcl_levels(acpi_handle handle)
 	if (ACPI_SUCCESS(acpi_evaluate_object(handle, "_BCL", NULL, &buffer))) {
 		obj = (union acpi_object *)buffer.pointer;
 		if (!obj || (obj->type != ACPI_TYPE_PACKAGE)) {
-			printk(TPACPI_ERR "Unknown _BCL data, "
-			       "please report this to %s\n", TPACPI_MAIL);
+			pr_err("Unknown _BCL data, please report this to %s\n",
+			       TPACPI_MAIL);
 			rc = 0;
 		} else {
 			rc = obj->package.count;
@@ -6214,18 +6152,15 @@ static void __init tpacpi_detect_brightness_capabilities(void)
 	switch (b) {
 	case 16:
 		bright_maxlvl = 15;
-		printk(TPACPI_INFO
-		       "detected a 16-level brightness capable ThinkPad\n");
+		pr_info("detected a 16-level brightness capable ThinkPad\n");
 		break;
 	case 8:
 	case 0:
 		bright_maxlvl = 7;
-		printk(TPACPI_INFO
-		       "detected a 8-level brightness capable ThinkPad\n");
+		pr_info("detected a 8-level brightness capable ThinkPad\n");
 		break;
 	default:
-		printk(TPACPI_ERR
-		       "Unsupported brightness interface, "
+		pr_err("Unsupported brightness interface, "
 		       "please contact %s\n", TPACPI_MAIL);
 		tp_features.bright_unkfw = 1;
 		bright_maxlvl = b - 1;
@@ -6260,22 +6195,19 @@ static int __init brightness_init(struct ibm_init_struct *iibm)
 
 	if (acpi_video_backlight_support()) {
 		if (brightness_enable > 1) {
-			printk(TPACPI_INFO
-			       "Standard ACPI backlight interface "
-			       "available, not loading native one.\n");
+			pr_info("Standard ACPI backlight interface "
+				"available, not loading native one\n");
 			return 1;
 		} else if (brightness_enable == 1) {
-			printk(TPACPI_WARN
-				"Cannot enable backlight brightness support, "
+			pr_warn("Cannot enable backlight brightness support, "
 				"ACPI is already handling it.  Refer to the "
-				"acpi_backlight kernel parameter\n");
+				"acpi_backlight kernel parameter.\n");
 			return 1;
 		}
 	} else if (tp_features.bright_acpimode && brightness_enable > 1) {
-		printk(TPACPI_NOTICE
-			"Standard ACPI backlight interface not "
-			"available, thinkpad_acpi native "
-			"brightness control enabled\n");
+		pr_notice("Standard ACPI backlight interface not "
+			  "available, thinkpad_acpi native "
+			  "brightness control enabled\n");
 	}
 
 	/*
@@ -6319,19 +6251,17 @@ static int __init brightness_init(struct ibm_init_struct *iibm)
 	if (IS_ERR(ibm_backlight_device)) {
 		int rc = PTR_ERR(ibm_backlight_device);
 		ibm_backlight_device = NULL;
-		printk(TPACPI_ERR "Could not register backlight device\n");
+		pr_err("Could not register backlight device\n");
 		return rc;
 	}
 	vdbg_printk(TPACPI_DBG_INIT | TPACPI_DBG_BRGHT,
 			"brightness is supported\n");
 
 	if (quirks & TPACPI_BRGHT_Q_ASK) {
-		printk(TPACPI_NOTICE
-			"brightness: will use unverified default: "
-			"brightness_mode=%d\n", brightness_mode);
-		printk(TPACPI_NOTICE
-			"brightness: please report to %s whether it works well "
-			"or not on your ThinkPad\n", TPACPI_MAIL);
+		pr_notice("brightness: will use unverified default: "
+			  "brightness_mode=%d\n", brightness_mode);
+		pr_notice("brightness: please report to %s whether it works well "
+			  "or not on your ThinkPad\n", TPACPI_MAIL);
 	}
 
 	/* Added by mistake in early 2007.  Probably useless, but it could
@@ -6804,8 +6734,7 @@ static int __init volume_create_alsa_mixer(void)
 	rc = snd_card_create(alsa_index, alsa_id, THIS_MODULE,
 			    sizeof(struct tpacpi_alsa_data), &card);
 	if (rc < 0 || !card) {
-		printk(TPACPI_ERR
-			"Failed to create ALSA card structures: %d\n", rc);
+		pr_err("Failed to create ALSA card structures: %d\n", rc);
 		return 1;
 	}
 
@@ -6839,9 +6768,8 @@ static int __init volume_create_alsa_mixer(void)
 		ctl_vol = snd_ctl_new1(&volume_alsa_control_vol, NULL);
 		rc = snd_ctl_add(card, ctl_vol);
 		if (rc < 0) {
-			printk(TPACPI_ERR
-				"Failed to create ALSA volume control: %d\n",
-				rc);
+			pr_err("Failed to create ALSA volume control: %d\n",
+			       rc);
 			goto err_exit;
 		}
 		data->ctl_vol_id = &ctl_vol->id;
@@ -6850,8 +6778,7 @@ static int __init volume_create_alsa_mixer(void)
 	ctl_mute = snd_ctl_new1(&volume_alsa_control_mute, NULL);
 	rc = snd_ctl_add(card, ctl_mute);
 	if (rc < 0) {
-		printk(TPACPI_ERR "Failed to create ALSA mute control: %d\n",
-			rc);
+		pr_err("Failed to create ALSA mute control: %d\n", rc);
 		goto err_exit;
 	}
 	data->ctl_mute_id = &ctl_mute->id;
@@ -6859,7 +6786,7 @@ static int __init volume_create_alsa_mixer(void)
 	snd_card_set_dev(card, &tpacpi_pdev->dev);
 	rc = snd_card_register(card);
 	if (rc < 0) {
-		printk(TPACPI_ERR "Failed to register ALSA card: %d\n", rc);
+		pr_err("Failed to register ALSA card: %d\n", rc);
 		goto err_exit;
 	}
 
@@ -6915,9 +6842,8 @@ static int __init volume_init(struct ibm_init_struct *iibm)
 		return -EINVAL;
 
 	if (volume_mode == TPACPI_VOL_MODE_UCMS_STEP) {
-		printk(TPACPI_ERR
-			"UCMS step volume mode not implemented, "
-			"please contact %s\n", TPACPI_MAIL);
+		pr_err("UCMS step volume mode not implemented, "
+		       "please contact %s\n", TPACPI_MAIL);
 		return 1;
 	}
 
@@ -6981,13 +6907,11 @@ static int __init volume_init(struct ibm_init_struct *iibm)
 
 	rc = volume_create_alsa_mixer();
 	if (rc) {
-		printk(TPACPI_ERR
-			"Could not create the ALSA mixer interface\n");
+		pr_err("Could not create the ALSA mixer interface\n");
 		return rc;
 	}
 
-	printk(TPACPI_INFO
-		"Console audio control enabled, mode: %s\n",
+	pr_info("Console audio control enabled, mode: %s\n",
 		(volume_control_allowed) ?
 			"override (read/write)" :
 			"monitor (read only)");
@@ -7049,12 +6973,10 @@ static int volume_write(char *buf)
 	if (!volume_control_allowed && tpacpi_lifecycle != TPACPI_LIFE_INIT) {
 		if (unlikely(!tp_warned.volume_ctrl_forbidden)) {
 			tp_warned.volume_ctrl_forbidden = 1;
-			printk(TPACPI_NOTICE
-				"Console audio control in monitor mode, "
-				"changes are not allowed.\n");
-			printk(TPACPI_NOTICE
-				"Use the volume_control=1 module parameter "
-				"to enable volume control\n");
+			pr_notice("Console audio control in monitor mode, "
+				  "changes are not allowed\n");
+			pr_notice("Use the volume_control=1 module parameter "
+				  "to enable volume control\n");
 		}
 		return -EPERM;
 	}
@@ -7129,8 +7051,7 @@ static void inline volume_alsa_notify_change(void)
 
 static int __init volume_init(struct ibm_init_struct *iibm)
 {
-	printk(TPACPI_INFO
-		"volume: disabled as there is no ALSA support in this kernel\n");
+	pr_info("volume: disabled as there is no ALSA support in this kernel\n");
 
 	return 1;
 }
@@ -7337,9 +7258,8 @@ TPACPI_HANDLE(sfan, ec, "SFAN",	/* 570 */
 static void fan_quirk1_setup(void)
 {
 	if (fan_control_initial_status == 0x07) {
-		printk(TPACPI_NOTICE
-		       "fan_init: initial fan status is unknown, "
-		       "assuming it is in auto mode\n");
+		pr_notice("fan_init: initial fan status is unknown, "
+			  "assuming it is in auto mode\n");
 		tp_features.fan_ctrl_status_undef = 1;
 	}
 }
@@ -7726,8 +7646,7 @@ static void fan_watchdog_reset(void)
 		if (!queue_delayed_work(tpacpi_wq, &fan_watchdog_task,
 				msecs_to_jiffies(fan_watchdog_maxinterval
 						 * 1000))) {
-			printk(TPACPI_ERR
-			       "failed to queue the fan watchdog, "
+			pr_err("failed to queue the fan watchdog, "
 			       "watchdog will not trigger\n");
 		}
 	} else
@@ -7741,11 +7660,11 @@ static void fan_watchdog_fire(struct work_struct *ignored)
 	if (tpacpi_lifecycle != TPACPI_LIFE_RUNNING)
 		return;
 
-	printk(TPACPI_NOTICE "fan watchdog: enabling fan\n");
+	pr_notice("fan watchdog: enabling fan\n");
 	rc = fan_set_enable();
 	if (rc < 0) {
-		printk(TPACPI_ERR "fan watchdog: error %d while enabling fan, "
-			"will try again later...\n", -rc);
+		pr_err("fan watchdog: error %d while enabling fan, "
+		       "will try again later...\n", -rc);
 		/* reschedule for later */
 		fan_watchdog_reset();
 	}
@@ -8049,8 +7968,7 @@ static int __init fan_init(struct ibm_init_struct *iibm)
 					"secondary fan support enabled\n");
 			}
 		} else {
-			printk(TPACPI_ERR
-			       "ThinkPad ACPI EC access misbehaving, "
+			pr_err("ThinkPad ACPI EC access misbehaving, "
 			       "fan status and control unavailable\n");
 			return 1;
 		}
@@ -8150,9 +8068,8 @@ static void fan_suspend(pm_message_t state)
 	fan_control_resume_level = 0;
 	rc = fan_get_status_safe(&fan_control_resume_level);
 	if (rc < 0)
-		printk(TPACPI_NOTICE
-			"failed to read fan level for later "
-			"restore during resume: %d\n", rc);
+		pr_notice("failed to read fan level for later "
+			  "restore during resume: %d\n", rc);
 
 	/* if it is undefined, don't attempt to restore it.
 	 * KEEP THIS LAST */
@@ -8207,13 +8124,11 @@ static void fan_resume(void)
 		return;
 	}
 	if (do_set) {
-		printk(TPACPI_NOTICE
-			"restoring fan level to 0x%02x\n",
-			fan_control_resume_level);
+		pr_notice("restoring fan level to 0x%02x\n",
+			  fan_control_resume_level);
 		rc = fan_set_level_safe(fan_control_resume_level);
 		if (rc < 0)
-			printk(TPACPI_NOTICE
-				"failed to restore fan level: %d\n", rc);
+			pr_notice("failed to restore fan level: %d\n", rc);
 	}
 }
 
@@ -8305,8 +8220,8 @@ static int fan_write_cmd_level(const char *cmd, int *rc)
 
 	*rc = fan_set_level_safe(level);
 	if (*rc == -ENXIO)
-		printk(TPACPI_ERR "level command accepted for unsupported "
-		       "access mode %d", fan_control_access_mode);
+		pr_err("level command accepted for unsupported access mode %d\n",
+		       fan_control_access_mode);
 	else if (!*rc)
 		tpacpi_disclose_usertask("procfs fan",
 			"set level to %d\n", level);
@@ -8321,8 +8236,8 @@ static int fan_write_cmd_enable(const char *cmd, int *rc)
 
 	*rc = fan_set_enable();
 	if (*rc == -ENXIO)
-		printk(TPACPI_ERR "enable command accepted for unsupported "
-		       "access mode %d", fan_control_access_mode);
+		pr_err("enable command accepted for unsupported access mode %d\n",
+		       fan_control_access_mode);
 	else if (!*rc)
 		tpacpi_disclose_usertask("procfs fan", "enable\n");
 
@@ -8336,8 +8251,8 @@ static int fan_write_cmd_disable(const char *cmd, int *rc)
 
 	*rc = fan_set_disable();
 	if (*rc == -ENXIO)
-		printk(TPACPI_ERR "disable command accepted for unsupported "
-		       "access mode %d", fan_control_access_mode);
+		pr_err("disable command accepted for unsupported access mode %d\n",
+		       fan_control_access_mode);
 	else if (!*rc)
 		tpacpi_disclose_usertask("procfs fan", "disable\n");
 
@@ -8356,8 +8271,8 @@ static int fan_write_cmd_speed(const char *cmd, int *rc)
 
 	*rc = fan_set_speed(speed);
 	if (*rc == -ENXIO)
-		printk(TPACPI_ERR "speed command accepted for unsupported "
-		       "access mode %d", fan_control_access_mode);
+		pr_err("speed command accepted for unsupported access mode %d\n",
+		       fan_control_access_mode);
 	else if (!*rc)
 		tpacpi_disclose_usertask("procfs fan",
 			"set speed to %d\n", speed);
@@ -8560,8 +8475,8 @@ static int __init ibm_init(struct ibm_init_struct *iibm)
 		if (ibm->acpi->notify) {
 			ret = setup_acpi_notify(ibm);
 			if (ret == -ENODEV) {
-				printk(TPACPI_NOTICE "disabling subdriver %s\n",
-					ibm->name);
+				pr_notice("disabling subdriver %s\n",
+					  ibm->name);
 				ret = 0;
 				goto err_out;
 			}
@@ -8583,8 +8498,7 @@ static int __init ibm_init(struct ibm_init_struct *iibm)
 		entry = proc_create_data(ibm->name, mode, proc_dir,
 					 &dispatch_proc_fops, ibm);
 		if (!entry) {
-			printk(TPACPI_ERR "unable to create proc entry %s\n",
-			       ibm->name);
+			pr_err("unable to create proc entry %s\n", ibm->name);
 			ret = -ENODEV;
 			goto err_out;
 		}
@@ -8683,13 +8597,11 @@ static int __must_check __init get_thinkpad_model_data(
 				tp->ec_release = (ec_fw_string[4] << 8)
 						| ec_fw_string[5];
 			} else {
-				printk(TPACPI_NOTICE
-					"ThinkPad firmware release %s "
-					"doesn't match the known patterns\n",
-					ec_fw_string);
-				printk(TPACPI_NOTICE
-					"please report this to %s\n",
-					TPACPI_MAIL);
+				pr_notice("ThinkPad firmware release %s "
+					  "doesn't match the known patterns\n",
+					  ec_fw_string);
+				pr_notice("please report this to %s\n",
+					  TPACPI_MAIL);
 			}
 			break;
 		}
@@ -8733,8 +8645,7 @@ static int __init probe_for_thinkpad(void)
 	tpacpi_acpi_handle_locate("ec", TPACPI_ACPI_EC_HID, &ec_handle);
 	if (!ec_handle) {
 		if (is_thinkpad)
-			printk(TPACPI_ERR
-				"Not yet supported ThinkPad detected!\n");
+			pr_err("Not yet supported ThinkPad detected!\n");
 		return -ENODEV;
 	}
 
@@ -8746,10 +8657,10 @@ static int __init probe_for_thinkpad(void)
 
 static void __init thinkpad_acpi_init_banner(void)
 {
-	printk(TPACPI_INFO "%s v%s\n", TPACPI_DESC, TPACPI_VERSION);
-	printk(TPACPI_INFO "%s\n", TPACPI_URL);
+	pr_info("%s v%s\n", TPACPI_DESC, TPACPI_VERSION);
+	pr_info("%s\n", TPACPI_URL);
 
-	printk(TPACPI_INFO "ThinkPad BIOS %s, EC %s\n",
+	pr_info("ThinkPad BIOS %s, EC %s\n",
 		(thinkpad_id.bios_version_str) ?
 			thinkpad_id.bios_version_str : "unknown",
 		(thinkpad_id.ec_version_str) ?
@@ -8758,7 +8669,7 @@ static void __init thinkpad_acpi_init_banner(void)
 	BUG_ON(!thinkpad_id.vendor);
 
 	if (thinkpad_id.model_str)
-		printk(TPACPI_INFO "%s %s, model %s\n",
+		pr_info("%s %s, model %s\n",
 			(thinkpad_id.vendor == PCI_VENDOR_ID_IBM) ?
 				"IBM" : ((thinkpad_id.vendor ==
 						PCI_VENDOR_ID_LENOVO) ?
@@ -9024,8 +8935,7 @@ static int __init thinkpad_acpi_module_init(void)
 
 	ret = get_thinkpad_model_data(&thinkpad_id);
 	if (ret) {
-		printk(TPACPI_ERR
-			"unable to get DMI data: %d\n", ret);
+		pr_err("unable to get DMI data: %d\n", ret);
 		thinkpad_acpi_module_exit();
 		return ret;
 	}
@@ -9051,16 +8961,14 @@ static int __init thinkpad_acpi_module_init(void)
 
 	proc_dir = proc_mkdir(TPACPI_PROC_DIR, acpi_root_dir);
 	if (!proc_dir) {
-		printk(TPACPI_ERR
-		       "unable to create proc dir " TPACPI_PROC_DIR);
+		pr_err("unable to create proc dir " TPACPI_PROC_DIR "\n");
 		thinkpad_acpi_module_exit();
 		return -ENODEV;
 	}
 
 	ret = platform_driver_register(&tpacpi_pdriver);
 	if (ret) {
-		printk(TPACPI_ERR
-		       "unable to register main platform driver\n");
+		pr_err("unable to register main platform driver\n");
 		thinkpad_acpi_module_exit();
 		return ret;
 	}
@@ -9068,8 +8976,7 @@ static int __init thinkpad_acpi_module_init(void)
 
 	ret = platform_driver_register(&tpacpi_hwmon_pdriver);
 	if (ret) {
-		printk(TPACPI_ERR
-		       "unable to register hwmon platform driver\n");
+		pr_err("unable to register hwmon platform driver\n");
 		thinkpad_acpi_module_exit();
 		return ret;
 	}
@@ -9082,8 +8989,7 @@ static int __init thinkpad_acpi_module_init(void)
 					&tpacpi_hwmon_pdriver.driver);
 	}
 	if (ret) {
-		printk(TPACPI_ERR
-		       "unable to create sysfs driver attributes\n");
+		pr_err("unable to create sysfs driver attributes\n");
 		thinkpad_acpi_module_exit();
 		return ret;
 	}
@@ -9096,7 +9002,7 @@ static int __init thinkpad_acpi_module_init(void)
 	if (IS_ERR(tpacpi_pdev)) {
 		ret = PTR_ERR(tpacpi_pdev);
 		tpacpi_pdev = NULL;
-		printk(TPACPI_ERR "unable to register platform device\n");
+		pr_err("unable to register platform device\n");
 		thinkpad_acpi_module_exit();
 		return ret;
 	}
@@ -9106,16 +9012,14 @@ static int __init thinkpad_acpi_module_init(void)
 	if (IS_ERR(tpacpi_sensors_pdev)) {
 		ret = PTR_ERR(tpacpi_sensors_pdev);
 		tpacpi_sensors_pdev = NULL;
-		printk(TPACPI_ERR
-		       "unable to register hwmon platform device\n");
+		pr_err("unable to register hwmon platform device\n");
 		thinkpad_acpi_module_exit();
 		return ret;
 	}
 	ret = device_create_file(&tpacpi_sensors_pdev->dev,
 				 &dev_attr_thinkpad_acpi_pdev_name);
 	if (ret) {
-		printk(TPACPI_ERR
-		       "unable to create sysfs hwmon device attributes\n");
+		pr_err("unable to create sysfs hwmon device attributes\n");
 		thinkpad_acpi_module_exit();
 		return ret;
 	}
@@ -9124,14 +9028,14 @@ static int __init thinkpad_acpi_module_init(void)
 	if (IS_ERR(tpacpi_hwmon)) {
 		ret = PTR_ERR(tpacpi_hwmon);
 		tpacpi_hwmon = NULL;
-		printk(TPACPI_ERR "unable to register hwmon device\n");
+		pr_err("unable to register hwmon device\n");
 		thinkpad_acpi_module_exit();
 		return ret;
 	}
 	mutex_init(&tpacpi_inputdev_send_mutex);
 	tpacpi_inputdev = input_allocate_device();
 	if (!tpacpi_inputdev) {
-		printk(TPACPI_ERR "unable to allocate input device\n");
+		pr_err("unable to allocate input device\n");
 		thinkpad_acpi_module_exit();
 		return -ENOMEM;
 	} else {
@@ -9163,7 +9067,7 @@ static int __init thinkpad_acpi_module_init(void)
 
 	ret = input_register_device(tpacpi_inputdev);
 	if (ret < 0) {
-		printk(TPACPI_ERR "unable to register input device\n");
+		pr_err("unable to register input device\n");
 		thinkpad_acpi_module_exit();
 		return ret;
 	} else {
-- 
cgit v1.2.3-70-g09d2


From bb3ce2020451bdebe5ceac770504f427724008a9 Mon Sep 17 00:00:00 2001
From: Yin Kangkai <kangkai.yin@linux.intel.com>
Date: Wed, 6 Apr 2011 15:05:24 +0100
Subject: platform/oaktrail: ACPI EC Extra driver for Oaktrail

This driver implements an Extra ACPI EC driver for products based on Intel
Oaktrail platform.

This driver does below things:
1. registers itself in the Linux backlight control in
   /sys/class/backlight/intel_oaktrail/

2. registers in the rfkill subsystem here: /sys/class/rfkill/rfkillX/
   for these components: wifi, bluetooth, wwan (3g), gps

Signed-off-by: Yin Kangkai <kangkai.yin@linux.intel.com>

[Extracted from a bigger patch by Yin Kangkai, this version leaves out some
 sysfs bits that probably want to be driver managed, and ACPI i2c enumeration]

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/Kconfig          |   9 +
 drivers/platform/x86/Makefile         |   1 +
 drivers/platform/x86/intel_oaktrail.c | 396 ++++++++++++++++++++++++++++++++++
 3 files changed, 406 insertions(+)
 create mode 100644 drivers/platform/x86/intel_oaktrail.c

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 5cb999b50f9..83c2411acb5 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -760,4 +760,13 @@ config MXM_WMI
           MXM is a standard for laptop graphics cards, the WMI interface
 	  is required for switchable nvidia graphics machines
 
+config INTEL_OAKTRAIL
+	tristate "Intel Oaktrail Platform Extras"
+	depends on ACPI
+	depends on RFKILL && BACKLIGHT_CLASS_DEVICE && ACPI
+	---help---
+	  Intel Oaktrail platform need this driver to provide interfaces to
+	  enable/disable the Camera, WiFi, BT etc. devices. If in doubt, say Y
+	  here; it will only load on supported platforms.
+
 endif # X86_PLATFORM_DEVICES
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index a7ab3bc7b3a..af683fb5a80 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -43,3 +43,4 @@ obj-$(CONFIG_IBM_RTL)		+= ibm_rtl.o
 obj-$(CONFIG_SAMSUNG_LAPTOP)	+= samsung-laptop.o
 obj-$(CONFIG_INTEL_MFLD_THERMAL)	+= intel_mid_thermal.o
 obj-$(CONFIG_MXM_WMI)		+= mxm-wmi.o
+obj-$(CONFIG_INTEL_OAKTRAIL)	+= intel_oaktrail.o
diff --git a/drivers/platform/x86/intel_oaktrail.c b/drivers/platform/x86/intel_oaktrail.c
new file mode 100644
index 00000000000..e936364a609
--- /dev/null
+++ b/drivers/platform/x86/intel_oaktrail.c
@@ -0,0 +1,396 @@
+/*
+ * intel_oaktrail.c - Intel OakTrail Platform support.
+ *
+ * Copyright (C) 2010-2011 Intel Corporation
+ * Author: Yin Kangkai (kangkai.yin@intel.com)
+ *
+ * based on Compal driver, Copyright (C) 2008 Cezary Jackiewicz
+ * <cezary.jackiewicz (at) gmail.com>, based on MSI driver
+ * Copyright (C) 2006 Lennart Poettering <mzxreary (at) 0pointer (dot) de>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ *  02110-1301, USA.
+ *
+ * This driver does below things:
+ * 1. registers itself in the Linux backlight control in
+ *    /sys/class/backlight/intel_oaktrail/
+ *
+ * 2. registers in the rfkill subsystem here: /sys/class/rfkill/rfkillX/
+ *    for these components: wifi, bluetooth, wwan (3g), gps
+ *
+ * This driver might work on other products based on Oaktrail. If you
+ * want to try it you can pass force=1 as argument to the module which
+ * will force it to load even when the DMI data doesn't identify the
+ * product as compatible.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/fb.h>
+#include <linux/mutex.h>
+#include <linux/err.h>
+#include <linux/i2c.h>
+#include <linux/backlight.h>
+#include <linux/platform_device.h>
+#include <linux/dmi.h>
+#include <linux/rfkill.h>
+#include <acpi/acpi_bus.h>
+#include <acpi/acpi_drivers.h>
+
+
+#define DRIVER_NAME	"intel_oaktrail"
+#define DRIVER_VERSION	"0.4ac1"
+
+/*
+ * This is the devices status address in EC space, and the control bits
+ * definition:
+ *
+ * (1 << 0):	Camera enable/disable, RW.
+ * (1 << 1):	Bluetooth enable/disable, RW.
+ * (1 << 2):	GPS enable/disable, RW.
+ * (1 << 3):	WiFi enable/disable, RW.
+ * (1 << 4):	WWAN (3G) enable/disalbe, RW.
+ * (1 << 5):	Touchscreen enable/disable, Read Only.
+ */
+#define OT_EC_DEVICE_STATE_ADDRESS	0xD6
+
+#define OT_EC_CAMERA_MASK	(1 << 0)
+#define OT_EC_BT_MASK		(1 << 1)
+#define OT_EC_GPS_MASK		(1 << 2)
+#define OT_EC_WIFI_MASK		(1 << 3)
+#define OT_EC_WWAN_MASK		(1 << 4)
+#define OT_EC_TS_MASK		(1 << 5)
+
+/*
+ * This is the address in EC space and commands used to control LCD backlight:
+ *
+ * Two steps needed to change the LCD backlight:
+ *   1. write the backlight percentage into OT_EC_BL_BRIGHTNESS_ADDRESS;
+ *   2. write OT_EC_BL_CONTROL_ON_DATA into OT_EC_BL_CONTROL_ADDRESS.
+ *
+ * To read the LCD back light, just read out the value from
+ * OT_EC_BL_BRIGHTNESS_ADDRESS.
+ *
+ * LCD backlight brightness range: 0 - 100 (OT_EC_BL_BRIGHTNESS_MAX)
+ */
+#define OT_EC_BL_BRIGHTNESS_ADDRESS	0x44
+#define OT_EC_BL_BRIGHTNESS_MAX		100
+#define OT_EC_BL_CONTROL_ADDRESS	0x3A
+#define OT_EC_BL_CONTROL_ON_DATA	0x1A
+
+
+static int force;
+module_param(force, bool, 0);
+MODULE_PARM_DESC(force, "Force driver load, ignore DMI data");
+
+static struct platform_device *oaktrail_device;
+static struct backlight_device *oaktrail_bl_device;
+static struct rfkill *bt_rfkill;
+static struct rfkill *gps_rfkill;
+static struct rfkill *wifi_rfkill;
+static struct rfkill *wwan_rfkill;
+
+
+/* rfkill */
+static int oaktrail_rfkill_set(void *data, bool blocked)
+{
+	u8 value;
+	u8 result;
+	unsigned long radio = (unsigned long) data;
+
+	ec_read(OT_EC_DEVICE_STATE_ADDRESS, &result);
+
+	if (!blocked)
+		value = (u8) (result | radio);
+	else
+		value = (u8) (result & ~radio);
+
+	ec_write(OT_EC_DEVICE_STATE_ADDRESS, value);
+
+	return 0;
+}
+
+static const struct rfkill_ops oaktrail_rfkill_ops = {
+	.set_block = oaktrail_rfkill_set,
+};
+
+static struct rfkill *oaktrail_rfkill_new(char *name, enum rfkill_type type,
+					  unsigned long mask)
+{
+	struct rfkill *rfkill_dev;
+	u8 value;
+	int err;
+
+	rfkill_dev = rfkill_alloc(name, &oaktrail_device->dev, type,
+				  &oaktrail_rfkill_ops, (void *)mask);
+	if (!rfkill_dev)
+		return ERR_PTR(-ENOMEM);
+
+	ec_read(OT_EC_DEVICE_STATE_ADDRESS, &value);
+	rfkill_init_sw_state(rfkill_dev, (value & mask) != 1);
+
+	err = rfkill_register(rfkill_dev);
+	if (err) {
+		rfkill_destroy(rfkill_dev);
+		return ERR_PTR(err);
+	}
+
+	return rfkill_dev;
+}
+
+static inline void __oaktrail_rfkill_cleanup(struct rfkill *rf)
+{
+	if (rf) {
+		rfkill_unregister(rf);
+		rfkill_destroy(rf);
+	}
+}
+
+static void oaktrail_rfkill_cleanup(void)
+{
+	__oaktrail_rfkill_cleanup(wifi_rfkill);
+	__oaktrail_rfkill_cleanup(bt_rfkill);
+	__oaktrail_rfkill_cleanup(gps_rfkill);
+	__oaktrail_rfkill_cleanup(wwan_rfkill);
+}
+
+static int oaktrail_rfkill_init(void)
+{
+	int ret;
+
+	wifi_rfkill = oaktrail_rfkill_new("oaktrail-wifi",
+					  RFKILL_TYPE_WLAN,
+					  OT_EC_WIFI_MASK);
+	if (IS_ERR(wifi_rfkill)) {
+		ret = PTR_ERR(wifi_rfkill);
+		wifi_rfkill = NULL;
+		goto cleanup;
+	}
+
+	bt_rfkill = oaktrail_rfkill_new("oaktrail-bluetooth",
+					RFKILL_TYPE_BLUETOOTH,
+					OT_EC_BT_MASK);
+	if (IS_ERR(bt_rfkill)) {
+		ret = PTR_ERR(bt_rfkill);
+		bt_rfkill = NULL;
+		goto cleanup;
+	}
+
+	gps_rfkill = oaktrail_rfkill_new("oaktrail-gps",
+					 RFKILL_TYPE_GPS,
+					 OT_EC_GPS_MASK);
+	if (IS_ERR(gps_rfkill)) {
+		ret = PTR_ERR(gps_rfkill);
+		gps_rfkill = NULL;
+		goto cleanup;
+	}
+
+	wwan_rfkill = oaktrail_rfkill_new("oaktrail-wwan",
+					  RFKILL_TYPE_WWAN,
+					  OT_EC_WWAN_MASK);
+	if (IS_ERR(wwan_rfkill)) {
+		ret = PTR_ERR(wwan_rfkill);
+		wwan_rfkill = NULL;
+		goto cleanup;
+	}
+
+	return 0;
+
+cleanup:
+	oaktrail_rfkill_cleanup();
+	return ret;
+}
+
+
+/* backlight */
+static int get_backlight_brightness(struct backlight_device *b)
+{
+	u8 value;
+	ec_read(OT_EC_BL_BRIGHTNESS_ADDRESS, &value);
+
+	return value;
+}
+
+static int set_backlight_brightness(struct backlight_device *b)
+{
+	u8 percent = (u8) b->props.brightness;
+	if (percent < 0 || percent > OT_EC_BL_BRIGHTNESS_MAX)
+		return -EINVAL;
+
+	ec_write(OT_EC_BL_BRIGHTNESS_ADDRESS, percent);
+	ec_write(OT_EC_BL_CONTROL_ADDRESS, OT_EC_BL_CONTROL_ON_DATA);
+
+	return 0;
+}
+
+static const struct backlight_ops oaktrail_bl_ops = {
+	.get_brightness = get_backlight_brightness,
+	.update_status	= set_backlight_brightness,
+};
+
+static int oaktrail_backlight_init(void)
+{
+	struct backlight_device *bd;
+	struct backlight_properties props;
+
+	memset(&props, 0, sizeof(struct backlight_properties));
+	props.max_brightness = OT_EC_BL_BRIGHTNESS_MAX;
+	bd = backlight_device_register(DRIVER_NAME,
+				       &oaktrail_device->dev, NULL,
+				       &oaktrail_bl_ops,
+				       &props);
+
+	if (IS_ERR(bd)) {
+		oaktrail_bl_device = NULL;
+		pr_warning("Unable to register backlight device\n");
+		return PTR_ERR(bd);
+	}
+
+	oaktrail_bl_device = bd;
+
+	bd->props.brightness = get_backlight_brightness(bd);
+	bd->props.power = FB_BLANK_UNBLANK;
+	backlight_update_status(bd);
+
+	return 0;
+}
+
+static void oaktrail_backlight_exit(void)
+{
+	if (oaktrail_bl_device)
+		backlight_device_unregister(oaktrail_bl_device);
+}
+
+static int __devinit oaktrail_probe(struct platform_device *pdev)
+{
+	return 0;
+}
+
+static int __devexit oaktrail_remove(struct platform_device *pdev)
+{
+	return 0;
+}
+
+static struct platform_driver oaktrail_driver = {
+	.driver = {
+		.name = DRIVER_NAME,
+		.owner = THIS_MODULE,
+	},
+	.probe	= oaktrail_probe,
+	.remove	= __devexit_p(oaktrail_remove)
+};
+
+static int dmi_check_cb(const struct dmi_system_id *id)
+{
+	pr_info("Identified model '%s'\n", id->ident);
+	return 0;
+}
+
+static struct dmi_system_id __initdata oaktrail_dmi_table[] = {
+	{
+		.ident = "OakTrail platform",
+		.matches = {
+			DMI_MATCH(DMI_PRODUCT_NAME, "OakTrail platform"),
+		},
+		.callback = dmi_check_cb
+	},
+	{ }
+};
+
+static int __init oaktrail_init(void)
+{
+	int ret;
+
+	if (acpi_disabled) {
+		pr_err("ACPI needs to be enabled for this driver to work!\n");
+		return -ENODEV;
+	}
+
+	if (!force && !dmi_check_system(oaktrail_dmi_table)) {
+		pr_err("Platform not recognized (You could try the module's force-parameter)");
+		return -ENODEV;
+	}
+
+	ret = platform_driver_register(&oaktrail_driver);
+	if (ret) {
+		pr_warning("Unable to register platform driver\n");
+		goto err_driver_reg;
+	}
+
+	oaktrail_device = platform_device_alloc(DRIVER_NAME, -1);
+	if (!oaktrail_device) {
+		pr_warning("Unable to allocate platform device\n");
+		ret = -ENOMEM;
+		goto err_device_alloc;
+	}
+
+	ret = platform_device_add(oaktrail_device);
+	if (ret) {
+		pr_warning("Unable to add platform device\n");
+		goto err_device_add;
+	}
+
+	if (!acpi_video_backlight_support()) {
+		ret = oaktrail_backlight_init();
+		if (ret)
+			goto err_backlight;
+
+	} else
+		pr_info("Backlight controlled by ACPI video driver\n");
+
+	ret = oaktrail_rfkill_init();
+	if (ret) {
+		pr_warning("Setup rfkill failed\n");
+		goto err_rfkill;
+	}
+
+	pr_info("Driver "DRIVER_VERSION" successfully loaded\n");
+	return 0;
+
+err_rfkill:
+	oaktrail_backlight_exit();
+err_backlight:
+	platform_device_del(oaktrail_device);
+err_device_add:
+	platform_device_put(oaktrail_device);
+err_device_alloc:
+	platform_driver_unregister(&oaktrail_driver);
+err_driver_reg:
+
+	return ret;
+}
+
+static void __exit oaktrail_cleanup(void)
+{
+	oaktrail_backlight_exit();
+	oaktrail_rfkill_cleanup();
+	platform_device_unregister(oaktrail_device);
+	platform_driver_unregister(&oaktrail_driver);
+
+	pr_info("Driver unloaded\n");
+}
+
+module_init(oaktrail_init);
+module_exit(oaktrail_cleanup);
+
+MODULE_AUTHOR("Yin Kangkai (kangkai.yin@intel.com)");
+MODULE_DESCRIPTION("Intel Oaktrail Platform ACPI Extras");
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("dmi:*:svnIntelCorporation:pnOakTrailplatform:*");
-- 
cgit v1.2.3-70-g09d2


From c2647b5e99c8ff1b3f535c7c84564cdc53214edf Mon Sep 17 00:00:00 2001
From: "Lee, Chun-Yi" <joeyli.kernel@gmail.com>
Date: Fri, 15 Apr 2011 18:42:47 +0800
Subject: acer-wmi: does not allow negative number set to initial device state

The driver set module parameter value: mailled, threeg and brightness
to BIOS by evaluate wmi method when driver was initialed. The default
values for those parameters are -1, so, that will be better don't set
negative value to BIOS.

Cc: Carlos Corbacho <carlos@strangeworlds.co.uk>
Cc: Matthew Garrett <mjg@redhat.com>
Cc: Dmitry Torokhov <dtor@mail.ru>
Cc: Corentin Chary <corentincj@iksaif.net>
Signed-off-by: Lee, Chun-Yi <jlee@novell.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/acer-wmi.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c
index fa8fa73f543..0682ecb3074 100644
--- a/drivers/platform/x86/acer-wmi.c
+++ b/drivers/platform/x86/acer-wmi.c
@@ -961,10 +961,12 @@ static void __init acer_commandline_init(void)
 	 * These will all fail silently if the value given is invalid, or the
 	 * capability isn't available on the given interface
 	 */
-	set_u32(mailled, ACER_CAP_MAILLED);
-	if (!has_type_aa)
+	if (mailled >= 0)
+		set_u32(mailled, ACER_CAP_MAILLED);
+	if (!has_type_aa && threeg >= 0)
 		set_u32(threeg, ACER_CAP_THREEG);
-	set_u32(brightness, ACER_CAP_BRIGHTNESS);
+	if (brightness >= 0)
+		set_u32(brightness, ACER_CAP_BRIGHTNESS);
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 95c7021540661a1d19130296be3e799d5a9135f8 Mon Sep 17 00:00:00 2001
From: Carlos Corbacho <carlos@strangeworlds.co.uk>
Date: Mon, 2 May 2011 09:57:08 +0100
Subject: tc1100-wmi: Orphan driver

I've never owned the hardware, this was a port of an existing driver to prove
that the ACPI-WMI code was useful to more than just acer-wmi.

Signed-off-by: Carlos Corbacho <carlos@strangeworlds.co.uk>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 MAINTAINERS | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index c4fc1daddcb..bcb07cb927b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3024,9 +3024,8 @@ S:	Maintained
 F:	drivers/net/wireless/hostap/
 
 HP COMPAQ TC1100 TABLET WMI EXTRAS DRIVER
-M:	Carlos Corbacho <carlos@strangeworlds.co.uk>
 L:	platform-driver-x86@vger.kernel.org
-S:	Odd Fixes
+S:	Orphan
 F:	drivers/platform/x86/tc1100-wmi.c
 
 HP100:	Driver for HP 10/100 Mbit/s Voice Grade Network Adapter Series
-- 
cgit v1.2.3-70-g09d2


From 5b9272594b5144538b96ca7eb82898c49519ec16 Mon Sep 17 00:00:00 2001
From: Carlos Corbacho <carlos@strangeworlds.co.uk>
Date: Mon, 2 May 2011 09:57:13 +0100
Subject: wmi: Orphan ACPI-WMI driver

I no longer have the time to work on this, and haven't really been doing any
work to this either. Time to let someone else take the reins.

Signed-off-by: Carlos Corbacho <carlos@strangeworlds.co.uk>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 MAINTAINERS | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index bcb07cb927b..390a4130a61 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -271,10 +271,8 @@ S:	Supported
 F:	drivers/acpi/video.c
 
 ACPI WMI DRIVER
-M:	Carlos Corbacho <carlos@strangeworlds.co.uk>
 L:	platform-driver-x86@vger.kernel.org
-W:	http://www.lesswatts.org/projects/acpi/
-S:	Maintained
+S:	Orphan
 F:	drivers/platform/x86/wmi.c
 
 AD1889 ALSA SOUND DRIVER
-- 
cgit v1.2.3-70-g09d2


From d9269a7146e33147866ecbf1ae485f6d1a1379f1 Mon Sep 17 00:00:00 2001
From: Carlos Corbacho <carlos@strangeworlds.co.uk>
Date: Mon, 2 May 2011 09:57:18 +0100
Subject: acer-wmi: Update MAINTAINERS

I don't have the time or much interest these days in maintaining acer-wmi, as I
don't have access to newer Acer hardware. As he's been doing most of the work
these days anyway, Joey Lee has kindly agreed to take over.

Signed-off-by: Carlos Corbacho <carlos@strangeworlds.co.uk>
Cc: Joey Lee <jlee@novell.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 MAINTAINERS | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 390a4130a61..bad19e9d9dc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -223,10 +223,8 @@ S:	Maintained
 F:	drivers/platform/x86/acerhdf.c
 
 ACER WMI LAPTOP EXTRAS
-M:	Carlos Corbacho <carlos@strangeworlds.co.uk>
-L:	aceracpi@googlegroups.com (subscribers-only)
+M:	Joey Lee <jlee@novell.com>
 L:	platform-driver-x86@vger.kernel.org
-W:	http://code.google.com/p/aceracpi
 S:	Maintained
 F:	drivers/platform/x86/acer-wmi.c
 
-- 
cgit v1.2.3-70-g09d2


From e569b223d53cb2720f55be72932f5758a056ff6d Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sun, 24 Apr 2011 11:00:08 +0200
Subject: acerhdf: Drop pointless dependency on THERMAL_HWMON

The THERMAL_HWMON config option simply exposes the thermal zone
temperature values and limits to user-space. It makes no sense for a
kernel driver to depend on this.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Peter Feuerer <peter@piie.net>
Cc: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 83c2411acb5..45e0191c35d 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -39,7 +39,7 @@ config ACER_WMI
 
 config ACERHDF
 	tristate "Acer Aspire One temperature and fan driver"
-	depends on THERMAL && THERMAL_HWMON && ACPI
+	depends on THERMAL && ACPI
 	---help---
 	  This is a driver for Acer Aspire One netbooks. It allows to access
 	  the temperature sensor and to control the fan.
-- 
cgit v1.2.3-70-g09d2


From c4bae98c4f913d3220d185c47d8817b5e2bba007 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sun, 24 Apr 2011 18:07:55 +0200
Subject: acerhdf: Clean up includes

* The acerhdf driver isn't an ACPI driver, so it needs not include
  <acpi/acpi_drivers.h>. All it uses is ec_read() and ec_write(), for
  which <linux/acpi.h> is sufficient.
* I couldn't find any reason why <linux/fs.h> and <linux/sched.h> were
  included.

This should avoid unneeded rebuilds of the acerhdf driver.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Peter Feuerer <peter@piie.net>
Cc: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/acerhdf.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/platform/x86/acerhdf.c b/drivers/platform/x86/acerhdf.c
index 60f9cfcac93..fca3489218b 100644
--- a/drivers/platform/x86/acerhdf.c
+++ b/drivers/platform/x86/acerhdf.c
@@ -35,10 +35,8 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/fs.h>
 #include <linux/dmi.h>
-#include <acpi/acpi_drivers.h>
-#include <linux/sched.h>
+#include <linux/acpi.h>
 #include <linux/thermal.h>
 #include <linux/platform_device.h>
 
-- 
cgit v1.2.3-70-g09d2


From 020036678e810be10a352339faa9a1df2bd8f5a3 Mon Sep 17 00:00:00 2001
From: Carlos Corbacho <carlos@strangeworlds.co.uk>
Date: Mon, 2 May 2011 09:57:23 +0100
Subject: acer-wmi: Delete out-of-date documentation

The documentation file for acer-wmi is long out of date, and there's not
much point in keeping it around either.

Signed-off-by: Carlos Corbacho <carlos@strangeworlds.co.uk>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 Documentation/laptops/acer-wmi.txt | 184 -------------------------------------
 1 file changed, 184 deletions(-)
 delete mode 100644 Documentation/laptops/acer-wmi.txt

diff --git a/Documentation/laptops/acer-wmi.txt b/Documentation/laptops/acer-wmi.txt
deleted file mode 100644
index 4beafa663dd..00000000000
--- a/Documentation/laptops/acer-wmi.txt
+++ /dev/null
@@ -1,184 +0,0 @@
-Acer Laptop WMI Extras Driver
-http://code.google.com/p/aceracpi
-Version 0.3
-4th April 2009
-
-Copyright 2007-2009 Carlos Corbacho <carlos@strangeworlds.co.uk>
-
-acer-wmi is a driver to allow you to control various parts of your Acer laptop
-hardware under Linux which are exposed via ACPI-WMI.
-
-This driver completely replaces the old out-of-tree acer_acpi, which I am
-currently maintaining for bug fixes only on pre-2.6.25 kernels. All development
-work is now focused solely on acer-wmi.
-
-Disclaimer
-**********
-
-Acer and Wistron have provided nothing towards the development acer_acpi or
-acer-wmi. All information we have has been through the efforts of the developers
-and the users to discover as much as possible about the hardware.
-
-As such, I do warn that this could break your hardware - this is extremely
-unlikely of course, but please bear this in mind.
-
-Background
-**********
-
-acer-wmi is derived from acer_acpi, originally developed by Mark
-Smith in 2005, then taken over by Carlos Corbacho in 2007, in order to activate
-the wireless LAN card under a 64-bit version of Linux, as acerhk[1] (the
-previous solution to the problem) relied on making 32 bit BIOS calls which are
-not possible in kernel space from a 64 bit OS.
-
-[1] acerhk: http://www.cakey.de/acerhk/
-
-Supported Hardware
-******************
-
-NOTE: The Acer Aspire One is not supported hardware. It cannot work with
-acer-wmi until Acer fix their ACPI-WMI implementation on them, so has been
-blacklisted until that happens.
-
-Please see the website for the current list of known working hardware:
-
-http://code.google.com/p/aceracpi/wiki/SupportedHardware
-
-If your laptop is not listed, or listed as unknown, and works with acer-wmi,
-please contact me with a copy of the DSDT.
-
-If your Acer laptop doesn't work with acer-wmi, I would also like to see the
-DSDT.
-
-To send me the DSDT, as root/sudo:
-
-cat /sys/firmware/acpi/tables/DSDT > dsdt
-
-And send me the resulting 'dsdt' file.
-
-Usage
-*****
-
-On Acer laptops, acer-wmi should already be autoloaded based on DMI matching.
-For non-Acer laptops, until WMI based autoloading support is added, you will
-need to manually load acer-wmi.
-
-acer-wmi creates /sys/devices/platform/acer-wmi, and fills it with various
-files whose usage is detailed below, which enables you to control some of the
-following (varies between models):
-
-* the wireless LAN card radio
-* inbuilt Bluetooth adapter
-* inbuilt 3G card
-* mail LED of your laptop
-* brightness of the LCD panel
-
-Wireless
-********
-
-With regards to wireless, all acer-wmi does is enable the radio on the card. It
-is not responsible for the wireless LED - once the radio is enabled, this is
-down to the wireless driver for your card. So the behaviour of the wireless LED,
-once you enable the radio, will depend on your hardware and driver combination.
-
-e.g. With the BCM4318 on the Acer Aspire 5020 series:
-
-ndiswrapper: Light blinks on when transmitting
-b43: Solid light, blinks off when transmitting
-
-Wireless radio control is unconditionally enabled - all Acer laptops that support
-acer-wmi come with built-in wireless. However, should you feel so inclined to
-ever wish to remove the card, or swap it out at some point, please get in touch
-with me, as we may well be able to gain some data on wireless card detection.
-
-The wireless radio is exposed through rfkill.
-
-Bluetooth
-*********
-
-For bluetooth, this is an internal USB dongle, so once enabled, you will get
-a USB device connection event, and a new USB device appears. When you disable
-bluetooth, you get the reverse - a USB device disconnect event, followed by the
-device disappearing again.
-
-Bluetooth is autodetected by acer-wmi, so if you do not have a bluetooth module
-installed in your laptop, this file won't exist (please be aware that it is
-quite common for Acer not to fit bluetooth to their laptops - so just because
-you have a bluetooth button on the laptop, doesn't mean that bluetooth is
-installed).
-
-For the adventurously minded - if you want to buy an internal bluetooth
-module off the internet that is compatible with your laptop and fit it, then
-it will work just fine with acer-wmi.
-
-Bluetooth is exposed through rfkill.
-
-3G
-**
-
-3G is currently not autodetected, so the 'threeg' file is always created under
-sysfs. So far, no-one in possession of an Acer laptop with 3G built-in appears to
-have tried Linux, or reported back, so we don't have any information on this.
-
-If you have an Acer laptop that does have a 3G card in, please contact me so we
-can properly detect these, and find out a bit more about them.
-
-To read the status of the 3G card (0=off, 1=on):
-cat /sys/devices/platform/acer-wmi/threeg
-
-To enable the 3G card:
-echo 1 > /sys/devices/platform/acer-wmi/threeg
-
-To disable the 3G card:
-echo 0 > /sys/devices/platform/acer-wmi/threeg
-
-To set the state of the 3G card when loading acer-wmi, pass:
-threeg=X (where X is 0 or 1)
-
-Mail LED
-********
-
-This can be found in most older Acer laptops supported by acer-wmi, and many
-newer ones - it is built into the 'mail' button, and blinks when active.
-
-On newer (WMID) laptops though, we have no way of detecting the mail LED. If
-your laptop identifies itself in dmesg as a WMID model, then please try loading
-acer_acpi with:
-
-force_series=2490
-
-This will use a known alternative method of reading/ writing the mail LED. If
-it works, please report back to me with the DMI data from your laptop so this
-can be added to acer-wmi.
-
-The LED is exposed through the LED subsystem, and can be found in:
-
-/sys/devices/platform/acer-wmi/leds/acer-wmi::mail/
-
-The mail LED is autodetected, so if you don't have one, the LED device won't
-be registered.
-
-Backlight
-*********
-
-The backlight brightness control is available on all acer-wmi supported
-hardware. The maximum brightness level is usually 15, but on some newer laptops
-it's 10 (this is again autodetected).
-
-The backlight is exposed through the backlight subsystem, and can be found in:
-
-/sys/devices/platform/acer-wmi/backlight/acer-wmi/
-
-Credits
-*******
-
-Olaf Tauber, who did the real hard work when he developed acerhk
-http://www.cakey.de/acerhk/
-All the authors of laptop ACPI modules in the kernel, whose work
-was an inspiration in the early days of acer_acpi
-Mathieu Segaud, who solved the problem with having to modprobe the driver
-twice in acer_acpi 0.2.
-Jim Ramsay, who added support for the WMID interface
-Mark Smith, who started the original acer_acpi
-
-And the many people who have used both acer_acpi and acer-wmi.
-- 
cgit v1.2.3-70-g09d2


From b9e066942981c6de3897de94953bea295ae18ec8 Mon Sep 17 00:00:00 2001
From: Ameya Palande <2ameya@gmail.com>
Date: Wed, 6 Apr 2011 17:44:37 +0300
Subject: platform/x86: Simplify intel_mid_powerbtn

This patch:
1. Removes unnecessay #defines
2. Removes 'mfld_pb_priv' data structure which results in simpler error
   handling and less memory allocations.

Signed-off-by: Ameya Palande <2ameya@gmail.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/intel_mid_powerbtn.c | 72 +++++++++++++------------------
 1 file changed, 30 insertions(+), 42 deletions(-)

diff --git a/drivers/platform/x86/intel_mid_powerbtn.c b/drivers/platform/x86/intel_mid_powerbtn.c
index 213e79ba68d..f1ae5078b7e 100644
--- a/drivers/platform/x86/intel_mid_powerbtn.c
+++ b/drivers/platform/x86/intel_mid_powerbtn.c
@@ -23,58 +23,48 @@
 #include <linux/slab.h>
 #include <linux/platform_device.h>
 #include <linux/input.h>
+
 #include <asm/intel_scu_ipc.h>
 
 #define DRIVER_NAME "msic_power_btn"
 
-#define MSIC_IRQ_STAT	0x02
-  #define MSIC_IRQ_PB	(1 << 0)
-#define MSIC_PB_CONFIG	0x3e
 #define MSIC_PB_STATUS	0x3f
-  #define MSIC_PB_LEVEL (1 << 3) /* 1 - release, 0 - press */
-
-struct mfld_pb_priv {
-	struct input_dev *input;
-	unsigned int irq;
-};
+#define MSIC_PB_LEVEL	(1 << 3) /* 1 - release, 0 - press */
 
 static irqreturn_t mfld_pb_isr(int irq, void *dev_id)
 {
-	struct mfld_pb_priv *priv = dev_id;
+	struct input_dev *input = dev_id;
 	int ret;
 	u8 pbstat;
 
 	ret = intel_scu_ipc_ioread8(MSIC_PB_STATUS, &pbstat);
-	if (ret < 0)
-		return IRQ_HANDLED;
-
-	input_event(priv->input, EV_KEY, KEY_POWER, !(pbstat & MSIC_PB_LEVEL));
-	input_sync(priv->input);
+	if (ret < 0) {
+		dev_err(input->dev.parent, "Read error %d while reading"
+			       " MSIC_PB_STATUS\n", ret);
+	} else {
+		input_event(input, EV_KEY, KEY_POWER,
+			       !(pbstat & MSIC_PB_LEVEL));
+		input_sync(input);
+	}
 
 	return IRQ_HANDLED;
 }
 
 static int __devinit mfld_pb_probe(struct platform_device *pdev)
 {
-	struct mfld_pb_priv *priv;
 	struct input_dev *input;
-	int irq;
+	int irq = platform_get_irq(pdev, 0);
 	int error;
 
-	irq = platform_get_irq(pdev, 0);
 	if (irq < 0)
 		return -EINVAL;
 
-	priv = kzalloc(sizeof(struct mfld_pb_priv), GFP_KERNEL);
 	input = input_allocate_device();
-	if (!priv || !input) {
-		error = -ENOMEM;
-		goto err_free_mem;
+	if (!input) {
+		dev_err(&pdev->dev, "Input device allocation error\n");
+		return -ENOMEM;
 	}
 
-	priv->input = input;
-	priv->irq = irq;
-
 	input->name = pdev->name;
 	input->phys = "power-button/input0";
 	input->id.bustype = BUS_HOST;
@@ -82,42 +72,40 @@ static int __devinit mfld_pb_probe(struct platform_device *pdev)
 
 	input_set_capability(input, EV_KEY, KEY_POWER);
 
-	error = request_threaded_irq(priv->irq, NULL, mfld_pb_isr,
-				     0, DRIVER_NAME, priv);
+	error = request_threaded_irq(irq, NULL, mfld_pb_isr, 0,
+			DRIVER_NAME, input);
 	if (error) {
-		dev_err(&pdev->dev,
-			"unable to request irq %d for mfld power button\n",
-			irq);
-		goto err_free_mem;
+		dev_err(&pdev->dev, "Unable to request irq %d for mfld power"
+				"button\n", irq);
+		goto err_free_input;
 	}
 
 	error = input_register_device(input);
 	if (error) {
-		dev_err(&pdev->dev,
-			"unable to register input dev, error %d\n", error);
+		dev_err(&pdev->dev, "Unable to register input dev, error "
+				"%d\n", error);
 		goto err_free_irq;
 	}
 
-	platform_set_drvdata(pdev, priv);
+	platform_set_drvdata(pdev, input);
 	return 0;
 
 err_free_irq:
-	free_irq(priv->irq, priv);
-err_free_mem:
+	free_irq(irq, input);
+err_free_input:
 	input_free_device(input);
-	kfree(priv);
 	return error;
 }
 
 static int __devexit mfld_pb_remove(struct platform_device *pdev)
 {
-	struct mfld_pb_priv *priv = platform_get_drvdata(pdev);
-
-	free_irq(priv->irq, priv);
-	input_unregister_device(priv->input);
-	kfree(priv);
+	struct input_dev *input = platform_get_drvdata(pdev);
+	int irq = platform_get_irq(pdev, 0);
 
+	free_irq(irq, input);
+	input_unregister_device(input);
 	platform_set_drvdata(pdev, NULL);
+
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From cb8b646d8b27053d311595b4fc7dbe161d92683e Mon Sep 17 00:00:00 2001
From: Ameya Palande <2ameya@gmail.com>
Date: Wed, 6 Apr 2011 17:45:03 +0300
Subject: platform/x86: Fix Makefile for intel_mid_powerbtn

Signed-off-by: Ameya Palande <2ameya@gmail.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index af683fb5a80..afc1f832aa6 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -41,6 +41,6 @@ obj-$(CONFIG_XO1_RFKILL)	+= xo1-rfkill.o
 obj-$(CONFIG_XO15_EBOOK)	+= xo15-ebook.o
 obj-$(CONFIG_IBM_RTL)		+= ibm_rtl.o
 obj-$(CONFIG_SAMSUNG_LAPTOP)	+= samsung-laptop.o
-obj-$(CONFIG_INTEL_MFLD_THERMAL)	+= intel_mid_thermal.o
 obj-$(CONFIG_MXM_WMI)		+= mxm-wmi.o
+obj-$(CONFIG_INTEL_MID_POWER_BUTTON)	+= intel_mid_powerbtn.o
 obj-$(CONFIG_INTEL_OAKTRAIL)	+= intel_oaktrail.o
-- 
cgit v1.2.3-70-g09d2


From 239dca9e9aff6630c22fef696978ba3a878f7d96 Mon Sep 17 00:00:00 2001
From: Ameya Palande <2ameya@gmail.com>
Date: Thu, 7 Apr 2011 16:30:02 +0300
Subject: platform-x86: intel_mid_thermal: Fix memory leak

Signed-off-by: Ameya Palande <2ameya@gmail.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/intel_mid_thermal.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/platform/x86/intel_mid_thermal.c b/drivers/platform/x86/intel_mid_thermal.c
index f0776504521..3a578323122 100644
--- a/drivers/platform/x86/intel_mid_thermal.c
+++ b/drivers/platform/x86/intel_mid_thermal.c
@@ -527,6 +527,7 @@ static int mid_thermal_remove(struct platform_device *pdev)
 	for (i = 0; i < MSIC_THERMAL_SENSORS; i++)
 		thermal_zone_device_unregister(pinfo->tzd[i]);
 
+	kfree(pinfo);
 	platform_set_drvdata(pdev, NULL);
 
 	/* Stop the ADC */
-- 
cgit v1.2.3-70-g09d2


From 8ae68de15d5ddaa8d41e197a730e19223adab2e1 Mon Sep 17 00:00:00 2001
From: Melchior FRANZ <mfranz@aon.at>
Date: Tue, 24 May 2011 10:35:55 +0200
Subject: support wlan hotkey on Acer Travelmate 5735Z

On an Acer Travelmate 5735Z-452G32Mnss the WLAN-enable/disable key
doesn't send 0x1 as acpi event key code, but 0x3. This patch also
makes the module ignore hotkey acpi events for functions that are
already handled without. This avoids warning message "keyboard:
can't emulate rawmode for keycode 240".

Signed-off-by: Melchior FRANZ <mfranz@aon.at>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/acer-wmi.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c
index 0682ecb3074..57c6c8a4db2 100644
--- a/drivers/platform/x86/acer-wmi.c
+++ b/drivers/platform/x86/acer-wmi.c
@@ -98,13 +98,26 @@ enum acer_wmi_event_ids {
 
 static const struct key_entry acer_wmi_keymap[] = {
 	{KE_KEY, 0x01, {KEY_WLAN} },     /* WiFi */
+	{KE_KEY, 0x03, {KEY_WLAN} },     /* WiFi */
 	{KE_KEY, 0x12, {KEY_BLUETOOTH} },	/* BT */
 	{KE_KEY, 0x21, {KEY_PROG1} },    /* Backup */
 	{KE_KEY, 0x22, {KEY_PROG2} },    /* Arcade */
 	{KE_KEY, 0x23, {KEY_PROG3} },    /* P_Key */
 	{KE_KEY, 0x24, {KEY_PROG4} },    /* Social networking_Key */
+	{KE_IGNORE, 0x41, {KEY_MUTE} },
+	{KE_IGNORE, 0x42, {KEY_PREVIOUSSONG} },
+	{KE_IGNORE, 0x43, {KEY_NEXTSONG} },
+	{KE_IGNORE, 0x44, {KEY_PLAYPAUSE} },
+	{KE_IGNORE, 0x45, {KEY_STOP} },
+	{KE_IGNORE, 0x48, {KEY_VOLUMEUP} },
+	{KE_IGNORE, 0x49, {KEY_VOLUMEDOWN} },
+	{KE_IGNORE, 0x61, {KEY_SWITCHVIDEOMODE} },
+	{KE_IGNORE, 0x62, {KEY_BRIGHTNESSUP} },
+	{KE_IGNORE, 0x63, {KEY_BRIGHTNESSDOWN} },
 	{KE_KEY, 0x64, {KEY_SWITCHVIDEOMODE} },	/* Display Switch */
+	{KE_IGNORE, 0x81, {KEY_SLEEP} },
 	{KE_KEY, 0x82, {KEY_TOUCHPAD_TOGGLE} },	/* Touch Pad On/Off */
+	{KE_IGNORE, 0x83, {KEY_TOUCHPAD_TOGGLE} },
 	{KE_END, 0}
 };
 
@@ -1345,7 +1358,7 @@ static void acer_wmi_notify(u32 value, void *context)
 	case WMID_HOTKEY_EVENT:
 		if (return_value.device_state) {
 			u16 device_state = return_value.device_state;
-			pr_debug("deivces states: 0x%x\n", device_state);
+			pr_debug("device state: 0x%x\n", device_state);
 			if (has_cap(ACER_CAP_WIRELESS))
 				rfkill_set_sw_state(wireless_rfkill,
 				!(device_state & ACER_WMID3_GDS_WIRELESS));
-- 
cgit v1.2.3-70-g09d2


From 5ddf9c5fa5a78036790ed72aaed3b84ee7dd4532 Mon Sep 17 00:00:00 2001
From: Weiping Pan <panweiping3@gmail.com>
Date: Mon, 16 May 2011 14:49:47 +0800
Subject: platform/x86:delete two unused variables

variable handle is not used in these two functions,
just delete them.

Signed-off-by: Weiping Pan <panweiping3@gmail.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/sony-laptop.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c
index 12dd0a62f30..bbd182e178c 100644
--- a/drivers/platform/x86/sony-laptop.c
+++ b/drivers/platform/x86/sony-laptop.c
@@ -962,7 +962,6 @@ static int sony_backlight_get_brightness(struct backlight_device *bd)
 static int sony_nc_get_brightness_ng(struct backlight_device *bd)
 {
 	int result;
-	int *handle = (int *)bl_get_data(bd);
 	struct sony_backlight_props *sdev =
 		(struct sony_backlight_props *)bl_get_data(bd);
 
@@ -974,7 +973,6 @@ static int sony_nc_get_brightness_ng(struct backlight_device *bd)
 static int sony_nc_update_status_ng(struct backlight_device *bd)
 {
 	int value, result;
-	int *handle = (int *)bl_get_data(bd);
 	struct sony_backlight_props *sdev =
 		(struct sony_backlight_props *)bl_get_data(bd);
 
-- 
cgit v1.2.3-70-g09d2


From a8d1a266eee5f8b822449fe19d1735189377ef47 Mon Sep 17 00:00:00 2001
From: "Lee, Chun-Yi" <joeyli.kernel@gmail.com>
Date: Sun, 22 May 2011 07:33:52 +0800
Subject: acer-wmi: check the existence of internal 3G device when set
 capability

That will be better to check the existence of internal 3G device when
we set threeg capability and generate killswitch for threeg. It can
avoid userland access 3G rfkill but the machine doesn't have internal
3G device.

Reference: bko#32862
	https://bugzilla.kernel.org/show_bug.cgi?id=32862

Tested on Acer Aspire 8930G, Acer Travelmate 8572

Tested-by: Hector Martin <hector@marcansoft.com>
Cc: Carlos Corbacho <carlos@strangeworlds.co.uk>
Cc: Matthew Garrett <mjg@redhat.com>
Cc: Dmitry Torokhov <dtor@mail.ru>
Cc: Corentin Chary <corentincj@iksaif.net>
Cc: Thomas Renninger <trenn@suse.de>
Signed-off-by: Lee, Chun-Yi <jlee@novell.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/acer-wmi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c
index 57c6c8a4db2..92ee4d63727 100644
--- a/drivers/platform/x86/acer-wmi.c
+++ b/drivers/platform/x86/acer-wmi.c
@@ -889,7 +889,8 @@ static acpi_status WMID_set_capabilities(void)
 	dmi_walk(type_aa_dmi_decode, NULL);
 	if (!has_type_aa) {
 		interface->capability |= ACER_CAP_WIRELESS;
-		interface->capability |= ACER_CAP_THREEG;
+		if (devices & 0x40)
+			interface->capability |= ACER_CAP_THREEG;
 		if (devices & 0x10)
 			interface->capability |= ACER_CAP_BLUETOOTH;
 	}
-- 
cgit v1.2.3-70-g09d2


From ab6a931620cfa5c565b351d1982306c3c8b97f96 Mon Sep 17 00:00:00 2001
From: "Lee, Chun-Yi" <joeyli.kernel@gmail.com>
Date: Sun, 22 May 2011 07:33:53 +0800
Subject: acer-wmi: allow 64-bits return buffer from WMI methods

Acer WMID_GUID1/2 method's return buffer was declared to 64-bits
on some Acer notebook, but WMI method only use 32-bits in return
buffer.
So, add this patch for allow 64-bits return buffer.

Reference: bko#34142
	https://bugzilla.kernel.org/show_bug.cgi?id=34142

Tested on Acer Travelmate 5735Z-452G32Mnss

Tested-by: Melchior FRANZ <melchior.franz@gmail.com>
Cc: Carlos Corbacho <carlos@strangeworlds.co.uk>
Cc: Matthew Garrett <mjg@redhat.com>
Cc: Dmitry Torokhov <dtor@mail.ru>
Cc: Corentin Chary <corentincj@iksaif.net>
Cc: Thomas Renninger <trenn@suse.de>
Signed-off-by: Lee, Chun-Yi <jlee@novell.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/acer-wmi.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c
index 92ee4d63727..48219117823 100644
--- a/drivers/platform/x86/acer-wmi.c
+++ b/drivers/platform/x86/acer-wmi.c
@@ -750,7 +750,8 @@ WMI_execute_u32(u32 method_id, u32 in, u32 *out)
 
 	obj = (union acpi_object *) result.pointer;
 	if (obj && obj->type == ACPI_TYPE_BUFFER &&
-		obj->buffer.length == sizeof(u32)) {
+		(obj->buffer.length == sizeof(u32) ||
+		obj->buffer.length == sizeof(u64))) {
 		tmp = *((u32 *) obj->buffer.pointer);
 	} else {
 		tmp = 0;
@@ -879,7 +880,8 @@ static acpi_status WMID_set_capabilities(void)
 
 	obj = (union acpi_object *) out.pointer;
 	if (obj && obj->type == ACPI_TYPE_BUFFER &&
-		obj->buffer.length == sizeof(u32)) {
+		(obj->buffer.length == sizeof(u32) ||
+		obj->buffer.length == sizeof(u64))) {
 		devices = *((u32 *) obj->buffer.pointer);
 	} else {
 		kfree(out.pointer);
@@ -1522,7 +1524,8 @@ static u32 get_wmid_devices(void)
 
 	obj = (union acpi_object *) out.pointer;
 	if (obj && obj->type == ACPI_TYPE_BUFFER &&
-		obj->buffer.length == sizeof(u32)) {
+		(obj->buffer.length == sizeof(u32) ||
+		obj->buffer.length == sizeof(u64))) {
 		devices = *((u32 *) obj->buffer.pointer);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 6d88ff0f8ef3529f68233d3963b2a7e07f8e4f69 Mon Sep 17 00:00:00 2001
From: "Lee, Chun-Yi" <joeyli.kernel@gmail.com>
Date: Sun, 22 May 2011 07:33:54 +0800
Subject: acer-wmi: support to set communication device state by new wmid
 method

Have many Acer notebooks' BIOS already support new WMID_GUID3 method.
On those machines, that will be better set communication device by
evaluate WMID_GUID3 method.

Tested on Acer Travelmate 8572

Cc: Carlos Corbacho <carlos@strangeworlds.co.uk>
Cc: Matthew Garrett <mjg@redhat.com>
Cc: Dmitry Torokhov <dtor@mail.ru>
Cc: Corentin Chary <corentincj@iksaif.net>
Cc: Thomas Renninger <trenn@suse.de>
Signed-off-by: Lee, Chun-Yi <jlee@novell.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/acer-wmi.c | 111 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 110 insertions(+), 1 deletion(-)

diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c
index 48219117823..5dc843bfc6c 100644
--- a/drivers/platform/x86/acer-wmi.c
+++ b/drivers/platform/x86/acer-wmi.c
@@ -135,6 +135,7 @@ struct event_return_value {
  */
 #define ACER_WMID3_GDS_WIRELESS		(1<<0)	/* WiFi */
 #define ACER_WMID3_GDS_THREEG		(1<<6)	/* 3G */
+#define ACER_WMID3_GDS_WIMAX		(1<<7)	/* WiMAX */
 #define ACER_WMID3_GDS_BLUETOOTH	(1<<11)	/* BT */
 
 struct lm_input_params {
@@ -1142,6 +1143,114 @@ static acpi_status get_device_status(u32 *value, u32 cap)
 	}
 }
 
+static acpi_status wmid3_set_device_status(u32 value, u16 device)
+{
+	struct wmid3_gds_return_value return_value;
+	acpi_status status;
+	union acpi_object *obj;
+	u16 devices;
+	struct wmid3_gds_input_param params = {
+		.function_num = 0x1,
+		.hotkey_number = 0x01,
+		.devices = ACER_WMID3_GDS_WIRELESS &
+				ACER_WMID3_GDS_THREEG &
+				ACER_WMID3_GDS_WIMAX &
+				ACER_WMID3_GDS_BLUETOOTH,
+	};
+	struct acpi_buffer input = {
+		sizeof(struct wmid3_gds_input_param),
+		&params
+	};
+	struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
+	struct acpi_buffer output2 = { ACPI_ALLOCATE_BUFFER, NULL };
+
+	status = wmi_evaluate_method(WMID_GUID3, 0, 0x2, &input, &output);
+	if (ACPI_FAILURE(status))
+		return status;
+
+	obj = output.pointer;
+
+	if (!obj)
+		return AE_ERROR;
+	else if (obj->type != ACPI_TYPE_BUFFER) {
+		kfree(obj);
+		return AE_ERROR;
+	}
+	if (obj->buffer.length != 8) {
+		pr_warning("Unknown buffer length %d\n", obj->buffer.length);
+		kfree(obj);
+		return AE_ERROR;
+	}
+
+	return_value = *((struct wmid3_gds_return_value *)obj->buffer.pointer);
+	kfree(obj);
+
+	if (return_value.error_code || return_value.ec_return_value) {
+		pr_warning("Get Current Device Status failed: "
+			"0x%x - 0x%x\n", return_value.error_code,
+			return_value.ec_return_value);
+		return status;
+	}
+
+	devices = return_value.devices;
+	params.function_num = 0x2;
+	params.hotkey_number = 0x01;
+	params.devices = (value) ? (devices | device) : (devices & ~device);
+
+	status = wmi_evaluate_method(WMID_GUID3, 0, 0x1, &input, &output2);
+	if (ACPI_FAILURE(status))
+		return status;
+
+	obj = output2.pointer;
+
+	if (!obj)
+		return AE_ERROR;
+	else if (obj->type != ACPI_TYPE_BUFFER) {
+		kfree(obj);
+		return AE_ERROR;
+	}
+	if (obj->buffer.length != 4) {
+		pr_warning("Unknown buffer length %d\n", obj->buffer.length);
+		kfree(obj);
+		return AE_ERROR;
+	}
+
+	return_value = *((struct wmid3_gds_return_value *)obj->buffer.pointer);
+	kfree(obj);
+
+	if (return_value.error_code || return_value.ec_return_value)
+		pr_warning("Set Device Status failed: "
+			"0x%x - 0x%x\n", return_value.error_code,
+			return_value.ec_return_value);
+
+	return status;
+}
+
+static acpi_status set_device_status(u32 value, u32 cap)
+{
+	if (wmi_has_guid(WMID_GUID3)) {
+		u16 device;
+
+		switch (cap) {
+		case ACER_CAP_WIRELESS:
+			device = ACER_WMID3_GDS_WIRELESS;
+			break;
+		case ACER_CAP_BLUETOOTH:
+			device = ACER_WMID3_GDS_BLUETOOTH;
+			break;
+		case ACER_CAP_THREEG:
+			device = ACER_WMID3_GDS_THREEG;
+			break;
+		default:
+			return AE_ERROR;
+		}
+		return wmid3_set_device_status(value, device);
+
+	} else {
+		return set_u32(value, cap);
+	}
+}
+
 /*
  * Rfkill devices
  */
@@ -1178,7 +1287,7 @@ static int acer_rfkill_set(void *data, bool blocked)
 	u32 cap = (unsigned long)data;
 
 	if (rfkill_inited) {
-		status = set_u32(!blocked, cap);
+		status = set_device_status(!blocked, cap);
 		if (ACPI_FAILURE(status))
 			return -ENODEV;
 	}
-- 
cgit v1.2.3-70-g09d2


From d436514e21b827ab602d1714028c34179c42d09f Mon Sep 17 00:00:00 2001
From: "Lee, Chun-Yi" <joeyli.kernel@gmail.com>
Date: Thu, 26 May 2011 11:09:19 +0800
Subject: msi-laptop: fix section mismatch in reference from the function
 load_scm_model_init

There have section mismatch warning message shows up when building
the kernel with make CONFIG_DEBUG_SECTION_MISMATCH=y.

The problem is the load_scm_model_init() calls msi_laptop_input_setup()
which is an __init function, but load_scm_model_init() lacks a __init
annotation.

This patch add __init on load_scm_model_init() to avoid warning message.

Cc: Matthew Garrett <mjg@redhat.com>
Cc: Dmitry Torokhov <dtor@mail.ru>
Cc: Corentin Chary <corentincj@iksaif.net>
Cc: Thomas Renninger <trenn@suse.de>
Signed-off-by: Lee, Chun-Yi <jlee@novell.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/msi-laptop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/msi-laptop.c b/drivers/platform/x86/msi-laptop.c
index f89c0b6a9ae..d7213e40686 100644
--- a/drivers/platform/x86/msi-laptop.c
+++ b/drivers/platform/x86/msi-laptop.c
@@ -800,7 +800,7 @@ static void msi_laptop_input_destroy(void)
 	input_unregister_device(msi_laptop_input_dev);
 }
 
-static int load_scm_model_init(struct platform_device *sdev)
+static int __init load_scm_model_init(struct platform_device *sdev)
 {
 	u8 data;
 	int result;
-- 
cgit v1.2.3-70-g09d2


From 987dfbaa65b2c3568b85e29d2598da08a011ee09 Mon Sep 17 00:00:00 2001
From: "Lee, Chun-Yi" <joeyli.kernel@gmail.com>
Date: Fri, 27 May 2011 14:52:14 +0800
Subject: acer-wmi: support integer return type from WMI methods

Acer WMID_GUID1/2 method's return value was declared to integer
type on Gateway notebook.
So, add this patch for support integer return type.

Reference: bko#33032
	https://bugzilla.kernel.org/show_bug.cgi?id=33032

Tested on Gateway NV5909H laptop

Tested-by: Filipus Klutiero <chealer@gmail.com>
Cc: Carlos Corbacho <carlos@strangeworlds.co.uk>
Cc: Matthew Garrett <mjg@redhat.com>
Cc: Dmitry Torokhov <dtor@mail.ru>
Cc: Corentin Chary <corentincj@iksaif.net>
Cc: Thomas Renninger <trenn@suse.de>
Signed-off-by: Lee, Chun-Yi <jlee@novell.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/acer-wmi.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c
index 5dc843bfc6c..005417bd429 100644
--- a/drivers/platform/x86/acer-wmi.c
+++ b/drivers/platform/x86/acer-wmi.c
@@ -754,6 +754,8 @@ WMI_execute_u32(u32 method_id, u32 in, u32 *out)
 		(obj->buffer.length == sizeof(u32) ||
 		obj->buffer.length == sizeof(u64))) {
 		tmp = *((u32 *) obj->buffer.pointer);
+	} else if (obj->type == ACPI_TYPE_INTEGER) {
+		tmp = (u32) obj->integer.value;
 	} else {
 		tmp = 0;
 	}
@@ -884,6 +886,8 @@ static acpi_status WMID_set_capabilities(void)
 		(obj->buffer.length == sizeof(u32) ||
 		obj->buffer.length == sizeof(u64))) {
 		devices = *((u32 *) obj->buffer.pointer);
+	} else if (obj->type == ACPI_TYPE_INTEGER) {
+		devices = (u32) obj->integer.value;
 	} else {
 		kfree(out.pointer);
 		return AE_ERROR;
@@ -1636,6 +1640,8 @@ static u32 get_wmid_devices(void)
 		(obj->buffer.length == sizeof(u32) ||
 		obj->buffer.length == sizeof(u64))) {
 		devices = *((u32 *) obj->buffer.pointer);
+	} else if (obj->type == ACPI_TYPE_INTEGER) {
+		devices = (u32) obj->integer.value;
 	}
 
 	kfree(out.pointer);
-- 
cgit v1.2.3-70-g09d2


From aac11c1b351413aa3412e258e2b2dcba31777209 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Tue, 24 May 2011 16:28:55 +0200
Subject: iwl4965: fix 5GHz operation

rx_status.band is used uninitialized, what disallow to work on 5GHz .

Cc: stable@kernel.org # 2.6.39+
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/iwlegacy/iwl-4965-lib.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/iwlegacy/iwl-4965-lib.c b/drivers/net/wireless/iwlegacy/iwl-4965-lib.c
index 7e5e85a017b..a7a4739880d 100644
--- a/drivers/net/wireless/iwlegacy/iwl-4965-lib.c
+++ b/drivers/net/wireless/iwlegacy/iwl-4965-lib.c
@@ -628,11 +628,11 @@ void iwl4965_rx_reply_rx(struct iwl_priv *priv,
 
 	/* rx_status carries information about the packet to mac80211 */
 	rx_status.mactime = le64_to_cpu(phy_res->timestamp);
+	rx_status.band = (phy_res->phy_flags & RX_RES_PHY_FLAGS_BAND_24_MSK) ?
+				IEEE80211_BAND_2GHZ : IEEE80211_BAND_5GHZ;
 	rx_status.freq =
 		ieee80211_channel_to_frequency(le16_to_cpu(phy_res->channel),
 							rx_status.band);
-	rx_status.band = (phy_res->phy_flags & RX_RES_PHY_FLAGS_BAND_24_MSK) ?
-				IEEE80211_BAND_2GHZ : IEEE80211_BAND_5GHZ;
 	rx_status.rate_idx =
 		iwl4965_hwrate_to_mac80211_idx(rate_n_flags, rx_status.band);
 	rx_status.flag = 0;
-- 
cgit v1.2.3-70-g09d2


From 64bd0821a3b66c3307d7a4ee5523e3e35ec2df0e Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 25 May 2011 09:44:05 +0800
Subject: wireless: Default to 'n' for 2 new added devices in Kconfig.

We make oldconfig every time when a new kernel arrives, but
if we don't have such a device(I guess this is the most common
case for a new device), the default value should be 'n' so
that the kernel size we build doesn't grow up too much quickly.
For anyone who has the device, it is OK for them to turn it on
by themselves.

Cc: "John W. Linville" <linville@tuxdriver.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath/ath9k/Kconfig | 1 -
 drivers/net/wireless/rt2x00/Kconfig    | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath9k/Kconfig b/drivers/net/wireless/ath/ath9k/Kconfig
index d9ff8413ab9..d9c08c619a3 100644
--- a/drivers/net/wireless/ath/ath9k/Kconfig
+++ b/drivers/net/wireless/ath/ath9k/Kconfig
@@ -26,7 +26,6 @@ config ATH9K
 config ATH9K_PCI
 	bool "Atheros ath9k PCI/PCIe bus support"
 	depends on ATH9K && PCI
-	default PCI
 	---help---
 	  This option enables the PCI bus support in ath9k.
 
diff --git a/drivers/net/wireless/rt2x00/Kconfig b/drivers/net/wireless/rt2x00/Kconfig
index 9def1e5369a..b2f8b8fd4d2 100644
--- a/drivers/net/wireless/rt2x00/Kconfig
+++ b/drivers/net/wireless/rt2x00/Kconfig
@@ -166,7 +166,6 @@ config RT2800USB_RT35XX
 config RT2800USB_RT53XX
        bool "rt2800usb - Include support for rt53xx devices (EXPERIMENTAL)"
        depends on EXPERIMENTAL
-       default y
        ---help---
          This adds support for rt53xx wireless chipset family to the
          rt2800pci driver.
-- 
cgit v1.2.3-70-g09d2


From a331400bf01231253a0d9ab211c83212d2ac4edb Mon Sep 17 00:00:00 2001
From: Eliad Peller <eliad@wizery.com>
Date: Thu, 26 May 2011 11:46:37 +0300
Subject: mac80211: clear local->ps_data on disassoc

local->ps_data wasn't cleared on disassociation, which
(in some corner cases) caused reconnections to enter
psm before association completed.

Signed-off-by: Eliad Peller <eliad@wizery.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 746595597b6..456cccf26b5 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1095,6 +1095,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 		local->hw.conf.flags &= ~IEEE80211_CONF_PS;
 		config_changed |= IEEE80211_CONF_CHANGE_PS;
 	}
+	local->ps_sdata = NULL;
 
 	ieee80211_hw_config(local, config_changed);
 
-- 
cgit v1.2.3-70-g09d2


From 64c754ed3b0009e4fa248f739000dc234eb0d2c9 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Thu, 26 May 2011 10:53:17 +0200
Subject: mac80211: Remove duplicate linux/slab.h include from
 net/mac80211/scan.c

Commit 79f460ca49d8d5700756ab7071c951311c7f29cc add a duplicate
linux/slab.h include to net/mac80211/scan.c - remove it.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Acked-by: Luciano Coelho <coelho@ti.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/scan.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 27af6723cb5..58ffa7d069c 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -15,7 +15,6 @@
 #include <linux/if_arp.h>
 #include <linux/rtnetlink.h>
 #include <linux/pm_qos_params.h>
-#include <linux/slab.h>
 #include <net/sch_generic.h>
 #include <linux/slab.h>
 #include <net/mac80211.h>
-- 
cgit v1.2.3-70-g09d2


From 1df85ecec36ad5da3f0165760704310d6c03f65f Mon Sep 17 00:00:00 2001
From: Adrian Chadd <adrian.chadd@gmail.com>
Date: Fri, 27 May 2011 01:08:04 +0800
Subject: ath9k: Fix AR9287 calibration

The AR9287 calibration code was not being called because of an
incorrect MAC revision check.
This forced the AR9287 to use the AR9285 initial calibration code and
bypass the AR9287 code entirely.

Signed-off-by: Adrian Chadd <adrian@freebsd.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath/ath9k/ar9002_calib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath9k/ar9002_calib.c b/drivers/net/wireless/ath/ath9k/ar9002_calib.c
index 015d9743993..2d4c0910295 100644
--- a/drivers/net/wireless/ath/ath9k/ar9002_calib.c
+++ b/drivers/net/wireless/ath/ath9k/ar9002_calib.c
@@ -829,7 +829,7 @@ static bool ar9002_hw_init_cal(struct ath_hw *ah, struct ath9k_channel *chan)
 	if (AR_SREV_9271(ah)) {
 		if (!ar9285_hw_cl_cal(ah, chan))
 			return false;
-	} else if (AR_SREV_9285_12_OR_LATER(ah)) {
+	} else if (AR_SREV_9285(ah) && AR_SREV_9285_12_OR_LATER(ah)) {
 		if (!ar9285_hw_clc(ah, chan))
 			return false;
 	} else {
-- 
cgit v1.2.3-70-g09d2


From 646aaea615704010b5fd2c8c8891ff1a3a4b4f1a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 27 May 2011 11:00:41 -0300
Subject: perf tools: Make sure kptr_restrict warnings fit 80 col terms

Suggested-by: Ingo Molnar <mingo@elte.hu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Link: http://lkml.kernel.org/n/tip-i1p8vrhq7xveyui6t1sc914e@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c | 19 ++++++++-----------
 tools/perf/builtin-report.c | 17 +++++++----------
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 2ca107f3efd..8e2c8579818 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -824,17 +824,14 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	symbol__init();
 
 	if (symbol_conf.kptr_restrict)
-		pr_warning("WARNING: Kernel address maps "
-			   "(/proc/{kallsyms,modules}) are restricted, "
-			   "check /proc/sys/kernel/kptr_restrict.\n\n"
-			   "Samples in kernel functions may not be resolved "
-			   "if a suitable vmlinux file is not found in the "
-			   "buildid cache or in the vmlinux path.\n\n"
-			   "Samples in kernel modules won't be resolved "
-			   "at all.\n\n"
-			   "If some relocation was applied (e.g. kexec) "
-			   "symbols may be misresolved even with a suitable "
-			   "vmlinux or kallsyms file.\n\n");
+		pr_warning(
+"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
+"check /proc/sys/kernel/kptr_restrict.\n\n"
+"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
+"file is not found in the buildid cache or in the vmlinux path.\n\n"
+"Samples in kernel modules won't be resolved at all.\n\n"
+"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
+"even with a suitable vmlinux or kallsyms file.\n\n");
 
 	if (no_buildid_cache || no_buildid)
 		disable_buildid_cache();
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 99156c35bc6..287a173523a 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -281,17 +281,14 @@ static int __cmd_report(void)
 	      kernel_kmap->ref_reloc_sym->addr == 0))) {
 		const struct dso *kdso = kernel_map->dso;
 
-		ui__warning("Kernel address maps "
-			    "(/proc/{kallsyms,modules}) were restricted, "
-			    "check /proc/sys/kernel/kptr_restrict before "
-			    "running 'perf record'.\n\n%s\n\n"
-			    "Samples in kernel modules can't be resolved "
-			    "as well.\n\n",
+		ui__warning(
+"Kernel address maps (/proc/{kallsyms,modules}) were restricted.\n\n"
+"Check /proc/sys/kernel/kptr_restrict before running 'perf record'.\n\n%s\n\n"
+"Samples in kernel modules can't be resolved as well.\n\n",
 			    RB_EMPTY_ROOT(&kdso->symbols[MAP__FUNCTION]) ?
-			    "As no suitable kallsyms nor vmlinux was found, "
-			    "kernel samples can't be resolved." :
-			    "If some relocation was applied (e.g. kexec) "
-			    "symbols may be misresolved.");
+"As no suitable kallsyms nor vmlinux was found, kernel samples\n"
+"can't be resolved." :
+"If some relocation was applied (e.g. kexec) symbols may be misresolved.");
 	}
 
 	if (dump_trace) {
-- 
cgit v1.2.3-70-g09d2


From 4af4c9550ccaaf0b53013ff730bc15068ffe6abc Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 27 May 2011 09:58:34 -0600
Subject: perf events: initialize fd array to -1 instead of 0

perf_evsel__alloc_fd allocates an array of file descriptors with the
memory initialized to 0. The array has dimensions for cpus and threads.

Later, __perf_evsel__open calls sys_perf_event_open for each cpu and thread
dimensions. If the open fails for any of the cpus or threads then the fd's
for this event are closed and the fd entry in the array is set to -1. Now,
if the first attempt fails for the event (e.g., the event is not supported)
the remaining dimensions (cpu > 0 and thread > 0) are not touched and left
at the initialized value of 0.

builtin-stat catches ENOENT and ENOSYS failures and allows the command to
continue. The end result is that stat attempts to read from an fd of 0 which
of course is stdin and so the command hangs until you type ctrl-D.

Resolve by initializing the array to -1 since an fd < 0 is already
handled.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1306511914-8016-1-git-send-email-dsahern@gmail.com
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index ee0fe0dffa7..cca29ededb5 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -35,7 +35,17 @@ struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx)
 
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 {
+	int cpu, thread;
 	evsel->fd = xyarray__new(ncpus, nthreads, sizeof(int));
+
+	if (evsel->fd) {
+		for (cpu = 0; cpu < ncpus; cpu++) {
+			for (thread = 0; thread < nthreads; thread++) {
+				FD(evsel, cpu, thread) = -1;
+			}
+		}
+	}
+
 	return evsel->fd != NULL ? 0 : -ENOMEM;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 59fb1ee95e74e8e0777289c44300cbe812aca836 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 27 May 2011 11:14:00 -0300
Subject: perf top: Remove unused macro

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Link: http://lkml.kernel.org/n/tip-weqbs0tkk2u0qp1xxdxxosfg@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-top.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 2d7934e9de3..375ed160d93 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -62,8 +62,6 @@
 #include <linux/unistd.h>
 #include <linux/types.h>
 
-#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
-
 static struct perf_top top = {
 	.count_filter		= 5,
 	.delay_secs		= 2,
-- 
cgit v1.2.3-70-g09d2


From 5f6f55809758e106eca72c6e01402c8080a88ee8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 27 May 2011 11:53:28 -0300
Subject: perf top: Handle kptr_restrict

Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Link: http://lkml.kernel.org/n/tip-cyl5zmi1nu35vyu7l5im2pyv@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-top.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 375ed160d93..472f6279002 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -80,6 +80,7 @@ static bool			use_tui, use_stdio;
 
 static int			default_interval		=      0;
 
+static bool			kptr_restrict_warned;
 static bool			inherit				=  false;
 static int			realtime_prio			=      0;
 static bool			group				=  false;
@@ -738,6 +739,20 @@ static void perf_event__process_sample(const union perf_event *event,
 	    al.filtered)
 		return;
 
+	if (!kptr_restrict_warned &&
+	    symbol_conf.kptr_restrict &&
+	    al.cpumode == PERF_RECORD_MISC_KERNEL) {
+		ui__warning(
+"Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
+"Check /proc/sys/kernel/kptr_restrict.\n\n"
+"Kernel%s samples will not be resolved.\n",
+			  !RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION]) ?
+			  " modules" : "");
+		if (use_browser <= 0)
+			sleep(5);
+		kptr_restrict_warned = true;
+	}
+
 	if (al.sym == NULL) {
 		/*
 		 * As we do lazy loading of symtabs we only will know if the
-- 
cgit v1.2.3-70-g09d2


From e4a338d05df93ab1ebf291aca1e753064319d301 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 27 May 2011 13:42:16 -0300
Subject: perf top: Don't stop if no kernel symtab is found

We now just warn the user about the fact and go on providing just
userspace samples.

This fixes a problem when no vmlinux is explicetely passed by the user,
thus symbol_conf.vmlinux_name is NULL, no suitable vmlinux is found, and
then we get:

 aldebaran:~> perf top -p 7557
 [kernel.kallsyms] with build id 44d9a989eabbd79e486bc079d6b743d397c204e0
 not found, continuing without symbols
 The (null) file can't be used

Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Link: http://lkml.kernel.org/n/tip-cj2g81hn64wv2bipmqk4fy2m@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-top.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 472f6279002..f2f3f4937aa 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -81,6 +81,7 @@ static bool			use_tui, use_stdio;
 static int			default_interval		=      0;
 
 static bool			kptr_restrict_warned;
+static bool			vmlinux_warned;
 static bool			inherit				=  false;
 static int			realtime_prio			=      0;
 static bool			group				=  false;
@@ -754,6 +755,7 @@ static void perf_event__process_sample(const union perf_event *event,
 	}
 
 	if (al.sym == NULL) {
+		const char *msg = "Kernel samples will not be resolved.\n";
 		/*
 		 * As we do lazy loading of symtabs we only will know if the
 		 * specified vmlinux file is invalid when we actually have a
@@ -765,12 +767,20 @@ static void perf_event__process_sample(const union perf_event *event,
 		 * --hide-kernel-symbols, even if the user specifies an
 		 * invalid --vmlinux ;-)
 		 */
-		if (al.map == machine->vmlinux_maps[MAP__FUNCTION] &&
+		if (!kptr_restrict_warned && !vmlinux_warned &&
+		    al.map == machine->vmlinux_maps[MAP__FUNCTION] &&
 		    RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) {
-			ui__warning("The %s file can't be used\n",
-				    symbol_conf.vmlinux_name);
-			exit_browser(0);
-			exit(1);
+			if (symbol_conf.vmlinux_name) {
+				ui__warning("The %s file can't be used.\n%s",
+					    symbol_conf.vmlinux_name, msg);
+			} else {
+				ui__warning("A vmlinux file was not found.\n%s",
+					    msg);
+			}
+
+			if (use_browser <= 0)
+				sleep(5);
+			vmlinux_warned = true;
 		}
 
 		return;
-- 
cgit v1.2.3-70-g09d2


From 69e848c2090aebba5698a1620604c7dccb448684 Mon Sep 17 00:00:00 2001
From: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Date: Tue, 22 Feb 2011 09:57:15 -0800
Subject: Intel xhci: Support EHCI/xHCI port switching.

The Intel Panther Point chipsets contain an EHCI and xHCI host controller
that shares some number of skew-dependent ports.  These ports can be
switched from the EHCI to the xHCI host (and vice versa) by a hardware MUX
that is controlled by registers in the xHCI PCI configuration space.  The
USB 3.0 SuperSpeed terminations on the xHCI ports can be controlled
separately from the USB 2.0 data wires.

This switchover mechanism is there to support users who do a custom
install of certain non-Linux operating systems that don't have official
USB 3.0 support.  By default, the ports are under EHCI, SuperSpeed
terminations are off, and USB 3.0 devices will show up under the EHCI
controller at reduced speeds.  (This was more palatable for the marketing
folks than having completely dead USB 3.0 ports if no xHCI drivers are
available.)  Users should be able to turn on xHCI by default through a
BIOS option, but users are happiest when they don't have to change random
BIOS settings.

This patch introduces a driver method to switchover the ports from EHCI to
xHCI before the EHCI driver finishes PCI enumeration.  We want to switch
the ports over before the USB core has the chance to enumerate devices
under EHCI, or boot from USB mass storage will fail if the boot device
connects under EHCI first, and then gets disconnected when the port
switches over to xHCI.

Add code to the xHCI PCI quirk to switch the ports from EHCI to xHCI.  The
PCI quirks code will run before any other PCI probe function is called, so
this avoids the issue with boot devices.

Another issue is with BIOS behavior during system resume from hibernate.
If the BIOS doesn't support xHCI, it may switch the devices under EHCI to
allow use of the USB keyboard, mice, and mass storage devices.  It's
supposed to remember the value of the port routing registers and switch
them back when the OS attempts to take control of the xHCI host controller,
but we all know not to trust BIOS writers.

Make both the xHCI driver and the EHCI driver attempt to switchover the
ports in their PCI resume functions.  We can't guarantee which PCI device
will be resumed first, so this avoids any race conditions.  Writing a '1'
to an already set port switchover bit or a '0' to a cleared port switchover
bit should have no effect.

The xHCI PCI configuration registers will be documented in the EDS-level
chipset spec, which is not public yet.  I have permission from legal and
the Intel chipset group to release this patch early to allow good Linux
support at product launch.  I've tried to document the registers as much
as possible, so please let me know if anything is unclear.

Signed-off-by: Sarah Sharp <sarah.a.sharp@linux.intel.com>
---
 drivers/usb/host/ehci-pci.c   | 39 +++++++++++++++++++++++++++
 drivers/usb/host/pci-quirks.c | 63 +++++++++++++++++++++++++++++++++++++++++++
 drivers/usb/host/pci-quirks.h |  2 ++
 drivers/usb/host/xhci-pci.c   | 20 ++++++++++++++
 4 files changed, 124 insertions(+)

diff --git a/drivers/usb/host/ehci-pci.c b/drivers/usb/host/ehci-pci.c
index 660b80a75ca..1102ce65a3a 100644
--- a/drivers/usb/host/ehci-pci.c
+++ b/drivers/usb/host/ehci-pci.c
@@ -348,11 +348,50 @@ static int ehci_pci_suspend(struct usb_hcd *hcd, bool do_wakeup)
 	return rc;
 }
 
+static bool usb_is_intel_switchable_ehci(struct pci_dev *pdev)
+{
+	return pdev->class == PCI_CLASS_SERIAL_USB_EHCI &&
+		pdev->vendor == PCI_VENDOR_ID_INTEL &&
+		pdev->device == 0x1E26;
+}
+
+static void ehci_enable_xhci_companion(void)
+{
+	struct pci_dev		*companion = NULL;
+
+	/* The xHCI and EHCI controllers are not on the same PCI slot */
+	for_each_pci_dev(companion) {
+		if (!usb_is_intel_switchable_xhci(companion))
+			continue;
+		usb_enable_xhci_ports(companion);
+		return;
+	}
+}
+
 static int ehci_pci_resume(struct usb_hcd *hcd, bool hibernated)
 {
 	struct ehci_hcd		*ehci = hcd_to_ehci(hcd);
 	struct pci_dev		*pdev = to_pci_dev(hcd->self.controller);
 
+	/* The BIOS on systems with the Intel Panther Point chipset may or may
+	 * not support xHCI natively.  That means that during system resume, it
+	 * may switch the ports back to EHCI so that users can use their
+	 * keyboard to select a kernel from GRUB after resume from hibernate.
+	 *
+	 * The BIOS is supposed to remember whether the OS had xHCI ports
+	 * enabled before resume, and switch the ports back to xHCI when the
+	 * BIOS/OS semaphore is written, but we all know we can't trust BIOS
+	 * writers.
+	 *
+	 * Unconditionally switch the ports back to xHCI after a system resume.
+	 * We can't tell whether the EHCI or xHCI controller will be resumed
+	 * first, so we have to do the port switchover in both drivers.  Writing
+	 * a '1' to the port switchover registers should have no effect if the
+	 * port was already switched over.
+	 */
+	if (usb_is_intel_switchable_ehci(pdev))
+		ehci_enable_xhci_companion();
+
 	// maybe restore FLADJ
 
 	if (time_before(jiffies, ehci->next_statechange))
diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c
index f16c59d5f48..fd930618c28 100644
--- a/drivers/usb/host/pci-quirks.c
+++ b/drivers/usb/host/pci-quirks.c
@@ -69,6 +69,9 @@
 #define	NB_PIF0_PWRDOWN_0	0x01100012
 #define	NB_PIF0_PWRDOWN_1	0x01100013
 
+#define USB_INTEL_XUSB2PR      0xD0
+#define USB_INTEL_USB3_PSSEN   0xD8
+
 static struct amd_chipset_info {
 	struct pci_dev	*nb_dev;
 	struct pci_dev	*smbus_dev;
@@ -673,6 +676,64 @@ static int handshake(void __iomem *ptr, u32 mask, u32 done,
 	return -ETIMEDOUT;
 }
 
+bool usb_is_intel_switchable_xhci(struct pci_dev *pdev)
+{
+	return pdev->class == PCI_CLASS_SERIAL_USB_XHCI &&
+		pdev->vendor == PCI_VENDOR_ID_INTEL &&
+		pdev->device == PCI_DEVICE_ID_INTEL_PANTHERPOINT_XHCI;
+}
+EXPORT_SYMBOL_GPL(usb_is_intel_switchable_xhci);
+
+/*
+ * Intel's Panther Point chipset has two host controllers (EHCI and xHCI) that
+ * share some number of ports.  These ports can be switched between either
+ * controller.  Not all of the ports under the EHCI host controller may be
+ * switchable.
+ *
+ * The ports should be switched over to xHCI before PCI probes for any device
+ * start.  This avoids active devices under EHCI being disconnected during the
+ * port switchover, which could cause loss of data on USB storage devices, or
+ * failed boot when the root file system is on a USB mass storage device and is
+ * enumerated under EHCI first.
+ *
+ * We write into the xHC's PCI configuration space in some Intel-specific
+ * registers to switch the ports over.  The USB 3.0 terminations and the USB
+ * 2.0 data wires are switched separately.  We want to enable the SuperSpeed
+ * terminations before switching the USB 2.0 wires over, so that USB 3.0
+ * devices connect at SuperSpeed, rather than at USB 2.0 speeds.
+ */
+void usb_enable_xhci_ports(struct pci_dev *xhci_pdev)
+{
+	u32		ports_available;
+
+	ports_available = 0xffffffff;
+	/* Write USB3_PSSEN, the USB 3.0 Port SuperSpeed Enable
+	 * Register, to turn on SuperSpeed terminations for all
+	 * available ports.
+	 */
+	pci_write_config_dword(xhci_pdev, USB_INTEL_USB3_PSSEN,
+			cpu_to_le32(ports_available));
+
+	pci_read_config_dword(xhci_pdev, USB_INTEL_USB3_PSSEN,
+			&ports_available);
+	dev_dbg(&xhci_pdev->dev, "USB 3.0 ports that are now enabled "
+			"under xHCI: 0x%x\n", ports_available);
+
+	ports_available = 0xffffffff;
+	/* Write XUSB2PR, the xHC USB 2.0 Port Routing Register, to
+	 * switch the USB 2.0 power and data lines over to the xHCI
+	 * host.
+	 */
+	pci_write_config_dword(xhci_pdev, USB_INTEL_XUSB2PR,
+			cpu_to_le32(ports_available));
+
+	pci_read_config_dword(xhci_pdev, USB_INTEL_XUSB2PR,
+			&ports_available);
+	dev_dbg(&xhci_pdev->dev, "USB 2.0 ports that are now switched over "
+			"to xHCI: 0x%x\n", ports_available);
+}
+EXPORT_SYMBOL_GPL(usb_enable_xhci_ports);
+
 /**
  * PCI Quirks for xHCI.
  *
@@ -732,6 +793,8 @@ static void __devinit quirk_usb_handoff_xhci(struct pci_dev *pdev)
 	writel(XHCI_LEGACY_DISABLE_SMI,
 			base + ext_cap_offset + XHCI_LEGACY_CONTROL_OFFSET);
 
+	if (usb_is_intel_switchable_xhci(pdev))
+		usb_enable_xhci_ports(pdev);
 hc_init:
 	op_reg_base = base + XHCI_HC_LENGTH(readl(base));
 
diff --git a/drivers/usb/host/pci-quirks.h b/drivers/usb/host/pci-quirks.h
index 6ae9f78e993..b1002a8ef96 100644
--- a/drivers/usb/host/pci-quirks.h
+++ b/drivers/usb/host/pci-quirks.h
@@ -8,6 +8,8 @@ int usb_amd_find_chipset_info(void);
 void usb_amd_dev_put(void);
 void usb_amd_quirk_pll_disable(void);
 void usb_amd_quirk_pll_enable(void);
+bool usb_is_intel_switchable_xhci(struct pci_dev *pdev);
+void usb_enable_xhci_ports(struct pci_dev *xhci_pdev);
 #else
 static inline void usb_amd_quirk_pll_disable(void) {}
 static inline void usb_amd_quirk_pll_enable(void) {}
diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index cbc4d491e62..faf039ac657 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -242,8 +242,28 @@ static int xhci_pci_suspend(struct usb_hcd *hcd, bool do_wakeup)
 static int xhci_pci_resume(struct usb_hcd *hcd, bool hibernated)
 {
 	struct xhci_hcd		*xhci = hcd_to_xhci(hcd);
+	struct pci_dev		*pdev = to_pci_dev(hcd->self.controller);
 	int			retval = 0;
 
+	/* The BIOS on systems with the Intel Panther Point chipset may or may
+	 * not support xHCI natively.  That means that during system resume, it
+	 * may switch the ports back to EHCI so that users can use their
+	 * keyboard to select a kernel from GRUB after resume from hibernate.
+	 *
+	 * The BIOS is supposed to remember whether the OS had xHCI ports
+	 * enabled before resume, and switch the ports back to xHCI when the
+	 * BIOS/OS semaphore is written, but we all know we can't trust BIOS
+	 * writers.
+	 *
+	 * Unconditionally switch the ports back to xHCI after a system resume.
+	 * We can't tell whether the EHCI or xHCI controller will be resumed
+	 * first, so we have to do the port switchover in both drivers.  Writing
+	 * a '1' to the port switchover registers should have no effect if the
+	 * port was already switched over.
+	 */
+	if (usb_is_intel_switchable_xhci(pdev))
+		usb_enable_xhci_ports(pdev);
+
 	retval = xhci_resume(xhci, hibernated);
 	return retval;
 }
-- 
cgit v1.2.3-70-g09d2


From ad808333d8201d53075a11bc8dd83b81f3d68f0b Mon Sep 17 00:00:00 2001
From: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Date: Wed, 25 May 2011 10:43:56 -0700
Subject: Intel xhci: Ignore spurious successful event.

The xHCI host controller in the Panther Point chipset sometimes produces
spurious events on the event ring.  If it receives a short packet, it
first puts a Transfer Event with a short transfer completion code on the
event ring.  Then it puts a Transfer Event with a successful completion
code on the ring for the same TD.  The xHCI driver correctly processes the
short transfer completion code, gives the URB back to the driver, and then
prints a warning in dmesg about the spurious event.  These warning
messages really fill up dmesg when an HD webcam is plugged into xHCI.

This spurious successful event behavior isn't technically disallowed by
the xHCI specification, so make the xHCI driver just ignore the spurious
completion event.

Signed-off-by: Sarah Sharp <sarah.a.sharp@linux.intel.com>
---
 drivers/usb/host/xhci-pci.c  |  4 ++++
 drivers/usb/host/xhci-ring.c | 14 ++++++++++++++
 drivers/usb/host/xhci.h      |  2 ++
 3 files changed, 20 insertions(+)

diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index faf039ac657..eafd17fae94 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -118,6 +118,10 @@ static int xhci_pci_setup(struct usb_hcd *hcd)
 	/* AMD PLL quirk */
 	if (pdev->vendor == PCI_VENDOR_ID_AMD && usb_amd_find_chipset_info())
 		xhci->quirks |= XHCI_AMD_PLL_FIX;
+	if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
+			pdev->device == PCI_DEVICE_ID_INTEL_PANTHERPOINT_XHCI) {
+		xhci->quirks |= XHCI_SPURIOUS_SUCCESS;
+	}
 
 	/* Make sure the HC is halted. */
 	retval = xhci_halt(xhci);
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index d1498c03c4c..56f6c584c65 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -2061,6 +2061,16 @@ static int handle_tx_event(struct xhci_hcd *xhci,
 		if (!event_seg) {
 			if (!ep->skip ||
 			    !usb_endpoint_xfer_isoc(&td->urb->ep->desc)) {
+				/* Some host controllers give a spurious
+				 * successful event after a short transfer.
+				 * Ignore it.
+				 */
+				if ((xhci->quirks & XHCI_SPURIOUS_SUCCESS) && 
+						ep_ring->last_td_was_short) {
+					ep_ring->last_td_was_short = false;
+					ret = 0;
+					goto cleanup;
+				}
 				/* HC is busted, give up! */
 				xhci_err(xhci,
 					"ERROR Transfer event TRB DMA ptr not "
@@ -2071,6 +2081,10 @@ static int handle_tx_event(struct xhci_hcd *xhci,
 			ret = skip_isoc_td(xhci, td, event, ep, &status);
 			goto cleanup;
 		}
+		if (trb_comp_code == COMP_SHORT_TX)
+			ep_ring->last_td_was_short = true;
+		else
+			ep_ring->last_td_was_short = false;
 
 		if (ep->skip) {
 			xhci_dbg(xhci, "Found td. Clear skip flag.\n");
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index 0772a8cfea2..5cfeb8614b8 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1123,6 +1123,7 @@ struct xhci_ring {
 	 */
 	u32			cycle_state;
 	unsigned int		stream_id;
+	bool			last_td_was_short;
 };
 
 struct xhci_erst_entry {
@@ -1290,6 +1291,7 @@ struct xhci_hcd {
 #define XHCI_RESET_EP_QUIRK	(1 << 1)
 #define XHCI_NEC_HOST		(1 << 2)
 #define XHCI_AMD_PLL_FIX	(1 << 3)
+#define XHCI_SPURIOUS_SUCCESS	(1 << 4)
 	/* There are two roothubs to keep track of bus suspend info for */
 	struct xhci_bus_state   bus_state[2];
 	/* Is each xHCI roothub port a USB 3.0, USB 2.0, or USB 1.1 port? */
-- 
cgit v1.2.3-70-g09d2


From 2cf95c18d5069e13c02a8667d91e064df8e17e09 Mon Sep 17 00:00:00 2001
From: Sarah Sharp <sarah.a.sharp@linux.intel.com>
Date: Wed, 11 May 2011 16:14:58 -0700
Subject: Intel xhci: Limit number of active endpoints to 64.

The Panther Point chipset has an xHCI host controller that has a limit to
the number of active endpoints it can handle.  Ideally, it would signal
that it can't handle anymore endpoints by returning a Resource Error for
the Configure Endpoint command, but they don't.  Instead it needs software
to keep track of the number of active endpoints, across configure endpoint
commands, reset device commands, disable slot commands, and address device
commands.

Add a new endpoint context counter, xhci_hcd->num_active_eps, and use it
to track the number of endpoints the xHC has active.  This gets a little
tricky, because commands to change the number of active endpoints can
fail.  This patch adds a new xHCI quirk for these Intel hosts, and the new
code should not have any effect on other xHCI host controllers.

Fail a new device allocation if we don't have room for the new default
control endpoint.  Use the endpoint ring pointers to determine what
endpoints were active before a Reset Device command or a Disable Slot
command, and drop those once the command completes.

Fail a configure endpoint command if it would add too many new endpoints.
We have to be a bit over zealous here, and only count the number of new
endpoints to be added, without subtracting the number of dropped
endpoints.  That's because a second configure endpoint command for a
different device could sneak in before we know if the first command is
completed.  If the first command dropped resources, the host controller
fails the command for some reason, and we're nearing the limit of
endpoints, we could end up oversubscribing the host.

To fix this race condition, when evaluating whether a configure endpoint
command will fix in our bandwidth budget, only add the new endpoints to
xhci->num_active_eps, and don't subtract the dropped endpoints.  Ignore
changed endpoints (ones that are dropped and then re-added), as that
shouldn't effect the host's endpoint resources.  When the configure
endpoint command completes, subtract off the dropped endpoints.

This may mean some configuration changes may temporarily fail, but it's
always better to under-subscribe than over-subscribe resources.

(Originally my plan had been to push the resource allocation down into the
ring allocation functions.  However, that would cause us to allocate
unnecessary resources when endpoints were changed, because the xHCI driver
allocates a new ring for the changed endpoint, and only deletes the old
ring once the Configure Endpoint command succeeds.  A further complication
would have been dealing with the per-device endpoint ring cache.)

Signed-off-by: Sarah Sharp <sarah.a.sharp@linux.intel.com>
---
 drivers/usb/host/xhci-pci.c  |   2 +
 drivers/usb/host/xhci-ring.c |   7 +-
 drivers/usb/host/xhci.c      | 232 +++++++++++++++++++++++++++++++++++++++++--
 drivers/usb/host/xhci.h      |  14 +++
 4 files changed, 244 insertions(+), 11 deletions(-)

diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index eafd17fae94..c408e9f6a70 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -121,6 +121,8 @@ static int xhci_pci_setup(struct usb_hcd *hcd)
 	if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
 			pdev->device == PCI_DEVICE_ID_INTEL_PANTHERPOINT_XHCI) {
 		xhci->quirks |= XHCI_SPURIOUS_SUCCESS;
+		xhci->quirks |= XHCI_EP_LIMIT_QUIRK;
+		xhci->limit_active_eps = 64;
 	}
 
 	/* Make sure the HC is halted. */
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 56f6c584c65..cc1485bfed3 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -1081,8 +1081,13 @@ static void handle_cmd_completion(struct xhci_hcd *xhci,
 		complete(&xhci->addr_dev);
 		break;
 	case TRB_TYPE(TRB_DISABLE_SLOT):
-		if (xhci->devs[slot_id])
+		if (xhci->devs[slot_id]) {
+			if (xhci->quirks & XHCI_EP_LIMIT_QUIRK)
+				/* Delete default control endpoint resources */
+				xhci_free_device_endpoint_resources(xhci,
+						xhci->devs[slot_id], true);
 			xhci_free_virt_device(xhci, slot_id);
+		}
 		break;
 	case TRB_TYPE(TRB_CONFIG_EP):
 		virt_dev = xhci->devs[slot_id];
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 58183d2a808..d9660eb97eb 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -1582,6 +1582,113 @@ static int xhci_evaluate_context_result(struct xhci_hcd *xhci,
 	return ret;
 }
 
+static u32 xhci_count_num_new_endpoints(struct xhci_hcd *xhci,
+		struct xhci_container_ctx *in_ctx)
+{
+	struct xhci_input_control_ctx *ctrl_ctx;
+	u32 valid_add_flags;
+	u32 valid_drop_flags;
+
+	ctrl_ctx = xhci_get_input_control_ctx(xhci, in_ctx);
+	/* Ignore the slot flag (bit 0), and the default control endpoint flag
+	 * (bit 1).  The default control endpoint is added during the Address
+	 * Device command and is never removed until the slot is disabled.
+	 */
+	valid_add_flags = ctrl_ctx->add_flags >> 2;
+	valid_drop_flags = ctrl_ctx->drop_flags >> 2;
+
+	/* Use hweight32 to count the number of ones in the add flags, or
+	 * number of endpoints added.  Don't count endpoints that are changed
+	 * (both added and dropped).
+	 */
+	return hweight32(valid_add_flags) -
+		hweight32(valid_add_flags & valid_drop_flags);
+}
+
+static unsigned int xhci_count_num_dropped_endpoints(struct xhci_hcd *xhci,
+		struct xhci_container_ctx *in_ctx)
+{
+	struct xhci_input_control_ctx *ctrl_ctx;
+	u32 valid_add_flags;
+	u32 valid_drop_flags;
+
+	ctrl_ctx = xhci_get_input_control_ctx(xhci, in_ctx);
+	valid_add_flags = ctrl_ctx->add_flags >> 2;
+	valid_drop_flags = ctrl_ctx->drop_flags >> 2;
+
+	return hweight32(valid_drop_flags) -
+		hweight32(valid_add_flags & valid_drop_flags);
+}
+
+/*
+ * We need to reserve the new number of endpoints before the configure endpoint
+ * command completes.  We can't subtract the dropped endpoints from the number
+ * of active endpoints until the command completes because we can oversubscribe
+ * the host in this case:
+ *
+ *  - the first configure endpoint command drops more endpoints than it adds
+ *  - a second configure endpoint command that adds more endpoints is queued
+ *  - the first configure endpoint command fails, so the config is unchanged
+ *  - the second command may succeed, even though there isn't enough resources
+ *
+ * Must be called with xhci->lock held.
+ */
+static int xhci_reserve_host_resources(struct xhci_hcd *xhci,
+		struct xhci_container_ctx *in_ctx)
+{
+	u32 added_eps;
+
+	added_eps = xhci_count_num_new_endpoints(xhci, in_ctx);
+	if (xhci->num_active_eps + added_eps > xhci->limit_active_eps) {
+		xhci_dbg(xhci, "Not enough ep ctxs: "
+				"%u active, need to add %u, limit is %u.\n",
+				xhci->num_active_eps, added_eps,
+				xhci->limit_active_eps);
+		return -ENOMEM;
+	}
+	xhci->num_active_eps += added_eps;
+	xhci_dbg(xhci, "Adding %u ep ctxs, %u now active.\n", added_eps,
+			xhci->num_active_eps);
+	return 0;
+}
+
+/*
+ * The configure endpoint was failed by the xHC for some other reason, so we
+ * need to revert the resources that failed configuration would have used.
+ *
+ * Must be called with xhci->lock held.
+ */
+static void xhci_free_host_resources(struct xhci_hcd *xhci,
+		struct xhci_container_ctx *in_ctx)
+{
+	u32 num_failed_eps;
+
+	num_failed_eps = xhci_count_num_new_endpoints(xhci, in_ctx);
+	xhci->num_active_eps -= num_failed_eps;
+	xhci_dbg(xhci, "Removing %u failed ep ctxs, %u now active.\n",
+			num_failed_eps,
+			xhci->num_active_eps);
+}
+
+/*
+ * Now that the command has completed, clean up the active endpoint count by
+ * subtracting out the endpoints that were dropped (but not changed).
+ *
+ * Must be called with xhci->lock held.
+ */
+static void xhci_finish_resource_reservation(struct xhci_hcd *xhci,
+		struct xhci_container_ctx *in_ctx)
+{
+	u32 num_dropped_eps;
+
+	num_dropped_eps = xhci_count_num_dropped_endpoints(xhci, in_ctx);
+	xhci->num_active_eps -= num_dropped_eps;
+	if (num_dropped_eps)
+		xhci_dbg(xhci, "Removing %u dropped ep ctxs, %u now active.\n",
+				num_dropped_eps,
+				xhci->num_active_eps);
+}
+
 /* Issue a configure endpoint command or evaluate context command
  * and wait for it to finish.
  */
@@ -1602,6 +1709,15 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci,
 	virt_dev = xhci->devs[udev->slot_id];
 	if (command) {
 		in_ctx = command->in_ctx;
+		if ((xhci->quirks & XHCI_EP_LIMIT_QUIRK) &&
+				xhci_reserve_host_resources(xhci, in_ctx)) {
+			spin_unlock_irqrestore(&xhci->lock, flags);
+			xhci_warn(xhci, "Not enough host resources, "
+					"active endpoint contexts = %u\n",
+					xhci->num_active_eps);
+			return -ENOMEM;
+		}
+
 		cmd_completion = command->completion;
 		cmd_status = &command->status;
 		command->command_trb = xhci->cmd_ring->enqueue;
@@ -1617,6 +1733,14 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci,
 		list_add_tail(&command->cmd_list, &virt_dev->cmd_list);
 	} else {
 		in_ctx = virt_dev->in_ctx;
+		if ((xhci->quirks & XHCI_EP_LIMIT_QUIRK) &&
+				xhci_reserve_host_resources(xhci, in_ctx)) {
+			spin_unlock_irqrestore(&xhci->lock, flags);
+			xhci_warn(xhci, "Not enough host resources, "
+					"active endpoint contexts = %u\n",
+					xhci->num_active_eps);
+			return -ENOMEM;
+		}
 		cmd_completion = &virt_dev->cmd_completion;
 		cmd_status = &virt_dev->cmd_status;
 	}
@@ -1631,6 +1755,8 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci,
 	if (ret < 0) {
 		if (command)
 			list_del(&command->cmd_list);
+		if ((xhci->quirks & XHCI_EP_LIMIT_QUIRK))
+			xhci_free_host_resources(xhci, in_ctx);
 		spin_unlock_irqrestore(&xhci->lock, flags);
 		xhci_dbg(xhci, "FIXME allocate a new ring segment\n");
 		return -ENOMEM;
@@ -1653,8 +1779,22 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci,
 	}
 
 	if (!ctx_change)
-		return xhci_configure_endpoint_result(xhci, udev, cmd_status);
-	return xhci_evaluate_context_result(xhci, udev, cmd_status);
+		ret = xhci_configure_endpoint_result(xhci, udev, cmd_status);
+	else
+		ret = xhci_evaluate_context_result(xhci, udev, cmd_status);
+
+	if ((xhci->quirks & XHCI_EP_LIMIT_QUIRK)) {
+		spin_lock_irqsave(&xhci->lock, flags);
+		/* If the command failed, remove the reserved resources.
+		 * Otherwise, clean up the estimate to include dropped eps.
+		 */
+		if (ret)
+			xhci_free_host_resources(xhci, in_ctx);
+		else
+			xhci_finish_resource_reservation(xhci, in_ctx);
+		spin_unlock_irqrestore(&xhci->lock, flags);
+	}
+	return ret;
 }
 
 /* Called after one or more calls to xhci_add_endpoint() or
@@ -2271,6 +2411,34 @@ int xhci_free_streams(struct usb_hcd *hcd, struct usb_device *udev,
 	return 0;
 }
 
+/*
+ * Deletes endpoint resources for endpoints that were active before a Reset
+ * Device command, or a Disable Slot command.  The Reset Device command leaves
+ * the control endpoint intact, whereas the Disable Slot command deletes it.
+ *
+ * Must be called with xhci->lock held.
+ */
+void xhci_free_device_endpoint_resources(struct xhci_hcd *xhci,
+	struct xhci_virt_device *virt_dev, bool drop_control_ep)
+{
+	int i;
+	unsigned int num_dropped_eps = 0;
+	unsigned int drop_flags = 0;
+
+	for (i = (drop_control_ep ? 0 : 1); i < 31; i++) {
+		if (virt_dev->eps[i].ring) {
+			drop_flags |= 1 << i;
+			num_dropped_eps++;
+		}
+	}
+	xhci->num_active_eps -= num_dropped_eps;
+	if (num_dropped_eps)
+		xhci_dbg(xhci, "Dropped %u ep ctxs, flags = 0x%x, "
+				"%u now active.\n",
+				num_dropped_eps, drop_flags,
+				xhci->num_active_eps);
+}
+
 /*
  * This submits a Reset Device Command, which will set the device state to 0,
  * set the device address to 0, and disable all the endpoints except the default
@@ -2412,6 +2580,14 @@ int xhci_discover_or_reset_device(struct usb_hcd *hcd, struct usb_device *udev)
 		goto command_cleanup;
 	}
 
+	/* Free up host controller endpoint resources */
+	if ((xhci->quirks & XHCI_EP_LIMIT_QUIRK)) {
+		spin_lock_irqsave(&xhci->lock, flags);
+		/* Don't delete the default control endpoint resources */
+		xhci_free_device_endpoint_resources(xhci, virt_dev, false);
+		spin_unlock_irqrestore(&xhci->lock, flags);
+	}
+
 	/* Everything but endpoint 0 is disabled, so free or cache the rings. */
 	last_freed_endpoint = 1;
 	for (i = 1; i < 31; ++i) {
@@ -2484,6 +2660,27 @@ void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
 	 */
 }
 
+/*
+ * Checks if we have enough host controller resources for the default control
+ * endpoint.
+ *
+ * Must be called with xhci->lock held.
+ */
+static int xhci_reserve_host_control_ep_resources(struct xhci_hcd *xhci)
+{
+	if (xhci->num_active_eps + 1 > xhci->limit_active_eps) {
+		xhci_dbg(xhci, "Not enough ep ctxs: "
+				"%u active, need to add 1, limit is %u.\n",
+				xhci->num_active_eps, xhci->limit_active_eps);
+		return -ENOMEM;
+	}
+	xhci->num_active_eps += 1;
+	xhci_dbg(xhci, "Adding 1 ep ctx, %u now active.\n",
+			xhci->num_active_eps);
+	return 0;
+}
+
+
 /*
  * Returns 0 if the xHC ran out of device slots, the Enable Slot command
  * timed out, or allocating memory failed.  Returns 1 on success.
@@ -2519,24 +2716,39 @@ int xhci_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev)
 		xhci_err(xhci, "Error while assigning device slot ID\n");
 		return 0;
 	}
-	/* xhci_alloc_virt_device() does not touch rings; no need to lock.
-	 * Use GFP_NOIO, since this function can be called from
+
+	if ((xhci->quirks & XHCI_EP_LIMIT_QUIRK)) {
+		spin_lock_irqsave(&xhci->lock, flags);
+		ret = xhci_reserve_host_control_ep_resources(xhci);
+		if (ret) {
+			spin_unlock_irqrestore(&xhci->lock, flags);
+			xhci_warn(xhci, "Not enough host resources, "
+					"active endpoint contexts = %u\n",
+					xhci->num_active_eps);
+			goto disable_slot;
+		}
+		spin_unlock_irqrestore(&xhci->lock, flags);
+	}
+	/* Use GFP_NOIO, since this function can be called from
 	 * xhci_discover_or_reset_device(), which may be called as part of
 	 * mass storage driver error handling.
 	 */
 	if (!xhci_alloc_virt_device(xhci, xhci->slot_id, udev, GFP_NOIO)) {
-		/* Disable slot, if we can do it without mem alloc */
 		xhci_warn(xhci, "Could not allocate xHCI USB device data structures\n");
-		spin_lock_irqsave(&xhci->lock, flags);
-		if (!xhci_queue_slot_control(xhci, TRB_DISABLE_SLOT, udev->slot_id))
-			xhci_ring_cmd_db(xhci);
-		spin_unlock_irqrestore(&xhci->lock, flags);
-		return 0;
+		goto disable_slot;
 	}
 	udev->slot_id = xhci->slot_id;
 	/* Is this a LS or FS device under a HS hub? */
 	/* Hub or peripherial? */
 	return 1;
+
+disable_slot:
+	/* Disable slot, if we can do it without mem alloc */
+	spin_lock_irqsave(&xhci->lock, flags);
+	if (!xhci_queue_slot_control(xhci, TRB_DISABLE_SLOT, udev->slot_id))
+		xhci_ring_cmd_db(xhci);
+	spin_unlock_irqrestore(&xhci->lock, flags);
+	return 0;
 }
 
 /*
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index 5cfeb8614b8..ac0196e7fcf 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1292,6 +1292,18 @@ struct xhci_hcd {
 #define XHCI_NEC_HOST		(1 << 2)
 #define XHCI_AMD_PLL_FIX	(1 << 3)
 #define XHCI_SPURIOUS_SUCCESS	(1 << 4)
+/*
+ * Certain Intel host controllers have a limit to the number of endpoint
+ * contexts they can handle.  Ideally, they would signal that they can't handle
+ * anymore endpoint contexts by returning a Resource Error for the Configure
+ * Endpoint command, but they don't.  Instead they expect software to keep track
+ * of the number of active endpoints for them, across configure endpoint
+ * commands, reset device commands, disable slot commands, and address device
+ * commands.
+ */
+#define XHCI_EP_LIMIT_QUIRK	(1 << 5)
+	unsigned int		num_active_eps;
+	unsigned int		limit_active_eps;
 	/* There are two roothubs to keep track of bus suspend info for */
 	struct xhci_bus_state   bus_state[2];
 	/* Is each xHCI roothub port a USB 3.0, USB 2.0, or USB 1.1 port? */
@@ -1435,6 +1447,8 @@ void xhci_setup_streams_ep_input_ctx(struct xhci_hcd *xhci,
 void xhci_setup_no_streams_ep_input_ctx(struct xhci_hcd *xhci,
 		struct xhci_ep_ctx *ep_ctx,
 		struct xhci_virt_ep *ep);
+void xhci_free_device_endpoint_resources(struct xhci_hcd *xhci,
+	struct xhci_virt_device *virt_dev, bool drop_control_ep);
 struct xhci_ring *xhci_dma_to_transfer_ring(
 		struct xhci_virt_ep *ep,
 		u64 address);
-- 
cgit v1.2.3-70-g09d2


From fe19a96b10032035a35779f42ad59e35d6dd8ffd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 18 Mar 2011 20:21:23 -0400
Subject: SUNRPC: Deal with the lack of a SYN_SENT sk->sk_state_change
 callback...

The TCP connection state code depends on the state_change() callback
being called when the SYN_SENT state is set. However the networking layer
doesn't actually call us back in that case.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 net/sunrpc/xprtsock.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index bf005d3c65e..f34f5ab341a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1344,7 +1344,6 @@ static void xs_tcp_state_change(struct sock *sk)
 	case TCP_CLOSE_WAIT:
 		/* The server initiated a shutdown of the socket */
 		xprt_force_disconnect(xprt);
-	case TCP_SYN_SENT:
 		xprt->connect_cookie++;
 	case TCP_CLOSING:
 		/*
@@ -1758,6 +1757,7 @@ static void xs_tcp_reuse_connection(struct sock_xprt *transport)
 static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	int ret = -ENOTCONN;
 
 	if (!transport->inet) {
 		struct sock *sk = sock->sk;
@@ -1789,12 +1789,22 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 	}
 
 	if (!xprt_bound(xprt))
-		return -ENOTCONN;
+		goto out;
 
 	/* Tell the socket layer to start connecting... */
 	xprt->stat.connect_count++;
 	xprt->stat.connect_start = jiffies;
-	return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
+	ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
+	switch (ret) {
+	case 0:
+	case -EINPROGRESS:
+		/* SYN_SENT! */
+		xprt->connect_cookie++;
+		if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
+			xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+	}
+out:
+	return ret;
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 0ced63d1a245ac11241a5d37932e6d04d9c8040d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 26 May 2011 14:26:35 -0400
Subject: NFSv4: Handle expired stateids when the lease is still valid

Currently, if the server returns NFS4ERR_EXPIRED in reply to a READ or
WRITE, but the RENEW test determines that the lease is still active, we
fail to recover and end up looping forever in a READ/WRITE + RENEW death
spiral.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/nfs4proc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cf1b339c393..d0e15dba7a5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -267,9 +267,11 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 				break;
 			nfs4_schedule_stateid_recovery(server, state);
 			goto wait_on_recovery;
+		case -NFS4ERR_EXPIRED:
+			if (state != NULL)
+				nfs4_schedule_stateid_recovery(server, state);
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_STALE_CLIENTID:
-		case -NFS4ERR_EXPIRED:
 			nfs4_schedule_lease_recovery(clp);
 			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
@@ -3670,9 +3672,11 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 				break;
 			nfs4_schedule_stateid_recovery(server, state);
 			goto wait_on_recovery;
+		case -NFS4ERR_EXPIRED:
+			if (state != NULL)
+				nfs4_schedule_stateid_recovery(server, state);
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_STALE_CLIENTID:
-		case -NFS4ERR_EXPIRED:
 			nfs4_schedule_lease_recovery(clp);
 			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
@@ -4543,6 +4547,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 			case -ESTALE:
 				goto out;
 			case -NFS4ERR_EXPIRED:
+				nfs4_schedule_stateid_recovery(server, state);
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_STALE_STATEID:
 				nfs4_schedule_lease_recovery(server->nfs_client);
-- 
cgit v1.2.3-70-g09d2


From 444f72fe7e7b5f4db34cee933fa3546ebb8e9122 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 26 May 2011 14:26:35 -0400
Subject: NFSv4.1: Fix the handling of NFS4ERR_SEQ_MISORDERED errors

Currently, the call to nfs4_schedule_session_recovery() will actually just
result in a test of the lease when what we really want is to force a
session reset.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/nfs4state.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 036f5adc9e1..e97dd219f84 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1466,7 +1466,10 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
 #ifdef CONFIG_NFS_V4_1
 void nfs4_schedule_session_recovery(struct nfs4_session *session)
 {
-	nfs4_schedule_lease_recovery(session->clp);
+	struct nfs_client *clp = session->clp;
+
+	set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+	nfs4_schedule_lease_recovery(clp);
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
 
@@ -1549,6 +1552,7 @@ static int nfs4_reset_session(struct nfs_client *clp)
 		status = nfs4_recovery_handle_error(clp, status);
 		goto out;
 	}
+	clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
 	/* create_session negotiated new slot table */
 	clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
 
-- 
cgit v1.2.3-70-g09d2


From 60c16ea877546e559988a8b1e4f4256afbbd83e6 Mon Sep 17 00:00:00 2001
From: Harshula Jayasuriya <harshula@redhat.com>
Date: Mon, 23 May 2011 22:52:11 +1000
Subject: NFS: nfs_update_inode: print current and new inode size in debug
 output

Hi Trond,

In nfs_update_inode debug output, print the current and new inode
size when the file size changes on the NFS server.

Signed-off-by: Harshula Jayasuriya <harshula@redhat.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 57bb31ad7a5..873c6fa8bc3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1298,8 +1298,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				i_size_write(inode, new_isize);
 				invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 			}
-			dprintk("NFS: isize change on server for file %s/%ld\n",
-					inode->i_sb->s_id, inode->i_ino);
+			dprintk("NFS: isize change on server for file %s/%ld "
+					"(%Ld to %Ld)\n",
+					inode->i_sb->s_id,
+					inode->i_ino,
+					(long long)cur_isize,
+					(long long)new_isize);
 		}
 	} else
 		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
-- 
cgit v1.2.3-70-g09d2


From 4b8ee2b82e8b0b6e17ee33feb74fcdb5c6d8dbdd Mon Sep 17 00:00:00 2001
From: Vitaliy Gusev <gusev.vitaliy@gmail.com>
Date: Fri, 20 May 2011 01:34:46 +0400
Subject: nfs41: Correct offset for LAYOUTCOMMIT

A client sends offset to MDS as it was seen by DS. As result,
file size after copy is only half of original file size in case
of 2 DS.

Signed-off-by: Vitaliy Gusev <gusev.vitaliy@nexenta.com>
Cc: stable@kernel.org [2.6.39]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pnfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f57f5281a52..101c85a3644 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1009,7 +1009,7 @@ void
 pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 {
 	struct nfs_inode *nfsi = NFS_I(wdata->inode);
-	loff_t end_pos = wdata->args.offset + wdata->res.count;
+	loff_t end_pos = wdata->mds_offset + wdata->res.count;
 	bool mark_as_dirty = false;
 
 	spin_lock(&nfsi->vfs_inode.i_lock);
-- 
cgit v1.2.3-70-g09d2


From 26f04dde681c6a48b2bacfc5fe01fef204419b0c Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sun, 1 May 2011 06:21:54 -0700
Subject: nfs,rcu: convert call_rcu(nfs_free_delegation_callback) to
 kfree_rcu()

The rcu callback nfs_free_delegation_callback() just calls a kfree(),
so we use kfree_rcu() instead of the call_rcu(nfs_free_delegation_callback).

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/delegation.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index bbbc6bf5cb2..dd25c2aec37 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -21,25 +21,13 @@
 #include "delegation.h"
 #include "internal.h"
 
-static void nfs_do_free_delegation(struct nfs_delegation *delegation)
-{
-	kfree(delegation);
-}
-
-static void nfs_free_delegation_callback(struct rcu_head *head)
-{
-	struct nfs_delegation *delegation = container_of(head, struct nfs_delegation, rcu);
-
-	nfs_do_free_delegation(delegation);
-}
-
 static void nfs_free_delegation(struct nfs_delegation *delegation)
 {
 	if (delegation->cred) {
 		put_rpccred(delegation->cred);
 		delegation->cred = NULL;
 	}
-	call_rcu(&delegation->rcu, nfs_free_delegation_callback);
+	kfree_rcu(delegation, rcu);
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 61677eeec29e87edc03a1061ae0a04b92507450d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 May 2011 15:22:34 -0400
Subject: SUNRPC: Rename xs_encode_tcp_fragment_header()

Clean up: Use a more generic name for xs_encode_tcp_fragment_header();
it's appropriate to use for all stream transport types.  We're about
to add new stream transport.

Also, move it to a place where it is more easily shared amongst the
various send_request methods.  And finally, replace the "htonl" macro
invocation with its modern equivalent.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index f34f5ab341a..079366974e1 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -495,6 +495,16 @@ static int xs_nospace(struct rpc_task *task)
 	return ret;
 }
 
+/*
+ * Construct a stream transport record marker in @buf.
+ */
+static inline void xs_encode_stream_record_marker(struct xdr_buf *buf)
+{
+	u32 reclen = buf->len - sizeof(rpc_fraghdr);
+	rpc_fraghdr *base = buf->head[0].iov_base;
+	*base = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | reclen);
+}
+
 /**
  * xs_udp_send_request - write an RPC request to a UDP socket
  * @task: address of RPC task that manages the state of an RPC request
@@ -574,13 +584,6 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt)
 		kernel_sock_shutdown(sock, SHUT_WR);
 }
 
-static inline void xs_encode_tcp_record_marker(struct xdr_buf *buf)
-{
-	u32 reclen = buf->len - sizeof(rpc_fraghdr);
-	rpc_fraghdr *base = buf->head[0].iov_base;
-	*base = htonl(RPC_LAST_STREAM_FRAGMENT | reclen);
-}
-
 /**
  * xs_tcp_send_request - write an RPC request to a TCP socket
  * @task: address of RPC task that manages the state of an RPC request
@@ -603,7 +606,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	struct xdr_buf *xdr = &req->rq_snd_buf;
 	int status;
 
-	xs_encode_tcp_record_marker(&req->rq_snd_buf);
+	xs_encode_stream_record_marker(&req->rq_snd_buf);
 
 	xs_pktdump("packet data:",
 				req->rq_svec->iov_base,
@@ -2024,10 +2027,7 @@ static int bc_sendto(struct rpc_rqst *req)
 	unsigned long headoff;
 	unsigned long tailoff;
 
-	/*
-	 * Set up the rpc header and record marker stuff
-	 */
-	xs_encode_tcp_record_marker(xbufp);
+	xs_encode_stream_record_marker(xbufp);
 
 	tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK;
 	headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK;
-- 
cgit v1.2.3-70-g09d2


From 4251c94833aa516c1fc7a0f8f504a26eadd4b91e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 May 2011 15:22:15 -0400
Subject: NFS: Revert NFSROOT default mount options

Marek Belisko <marek.belisko@gmail.com> reports that recent attempts
to fix regressions in NFSROOT have broken his configuration:

> After update from 2.6.38-rc8 to 2.6.38 is mounting rootfs over nfs not possible.
> Log:
> VFS: Mounted root (nfs filesystem) on device 0:14.
> Freeing init memory: 132K
> nfs: server 10.146.1.21 not responding, still trying
> nfs: server 10.146.1.21 not responding, still trying
>
> This is never ending. I make short bisect (not too much commits
> between versions)
> and bad commit was reported: 53d4737580535e073963b91ce87d4216e434fab5
>
> NFS: NFSROOT should default to "proto=udp"
>
> I've tested on mini2440 board (DM9000, static IP).
> Is there some missing option or something else to be checked?

An examination of a network trace captured during the failure shows
that the mount is actually succeeding, but that the client is not
seeing READ replies larger than 16KB.  This could be a local packet
filtering issue on the client, but we didn't troubleshoot this
further because of the reported "git bisect" result.

Last fall we removed the ad hoc mount option parser in
fs/nfs/nfsroot.c in favor of using the main parser in fs/nfs/super.c
(see commit 56463e50 "NFS: Use super.c for NFSROOT mount option
parsing").  That commit changed the default NFSROOT mount options to
be the same as those employed by user space mounts.

As it turns out, these new default mount options are not tolerated by
many embedded systems.  So far these problems have been due to
specific behavior of certain embedded NICs.  The NFS community does
not have such hardware on hand for running tests.

Commit 53d47375 recently introduced a clean way to specify default
mount options for NFSROOT, so we can now easily restore the
traditional defaults for NFSROOT:

   vers=2,udp,rsize=4096,wsize=4096

This should revert the new default NFSROOT mount options introduced
with commit 56463e50.

Tested-by: Marek Belisto <marek.belisto@open-nandra.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfsroot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index c541093a5bf..c4744e1d513 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -87,7 +87,7 @@
 #define NFS_ROOT		"/tftpboot/%s"
 
 /* Default NFSROOT mount options. */
-#define NFS_DEF_OPTIONS		"udp"
+#define NFS_DEF_OPTIONS		"vers=2,udp,rsize=4096,wsize=4096"
 
 /* Parameters passed from the kernel command line */
 static char nfs_root_parms[256] __initdata = "";
-- 
cgit v1.2.3-70-g09d2


From da09eb93033e7204cb3e3f3140b46cf108c42c8f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 May 2011 15:22:25 -0400
Subject: SUNRPC: Clean up use of curly braces in switch cases

Clean up.  Preferred style is not to use curly braces around
switch cases.  I'm about to add another case that needs a third
type cast.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 8d83f9d4871..b91f8f943c5 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -298,22 +298,21 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 	 * up a string representation of the passed-in address.
 	 */
 	if (args->servername == NULL) {
+		struct sockaddr_in *sin =
+				(struct sockaddr_in *)args->address;
+		struct sockaddr_in6 *sin6 =
+				(struct sockaddr_in6 *)args->address;
+
 		servername[0] = '\0';
 		switch (args->address->sa_family) {
-		case AF_INET: {
-			struct sockaddr_in *sin =
-					(struct sockaddr_in *)args->address;
+		case AF_INET:
 			snprintf(servername, sizeof(servername), "%pI4",
 				 &sin->sin_addr.s_addr);
 			break;
-		}
-		case AF_INET6: {
-			struct sockaddr_in6 *sin =
-					(struct sockaddr_in6 *)args->address;
+		case AF_INET6:
 			snprintf(servername, sizeof(servername), "%pI6",
-				 &sin->sin6_addr);
+				 &sin6->sin6_addr);
 			break;
-		}
 		default:
 			/* caller wants default server name, but
 			 * address family isn't recognized. */
-- 
cgit v1.2.3-70-g09d2


From 7402ab19cdd5943c7dd4f3399afe3abda8077ef5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 May 2011 15:22:55 -0400
Subject: SUNRPC: Use AF_LOCAL for rpcbind upcalls

As libtirpc does in user space, have our registration API try using an
AF_LOCAL transport first when registering and unregistering.

This means we don't chew up privileged ports, and our registration is
bound to an "owner" (the effective uid of the process on the sending
end of the transport).  Only that "owner" may unregister the service.

The kernel could probe rpcbind via an rpcbind query to determine
whether rpcbind has an AF_LOCAL service. For simplicity, we use the
same technique that libtirpc uses: simply fail over to network
loopback if creating an AF_LOCAL transport to the well-known rpcbind
service socket fails.

This means we open-code the pathname of the rpcbind socket in the
kernel.  For now we have to do that anyway because the kernel's
RPC over AF_LOCAL implementation does not support autobind.  That may
be undesirable in the long term.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpcb_clnt.c | 97 ++++++++++++++++++++++++++++++++++++++++++--------
 net/sunrpc/svc.c       |  2 ++
 2 files changed, 85 insertions(+), 14 deletions(-)

diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index c652e4cc9fe..9a80a922c52 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -16,6 +16,7 @@
 
 #include <linux/types.h>
 #include <linux/socket.h>
+#include <linux/un.h>
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <linux/kernel.h>
@@ -32,6 +33,8 @@
 # define RPCDBG_FACILITY	RPCDBG_BIND
 #endif
 
+#define RPCBIND_SOCK_PATHNAME	"/var/run/rpcbind.sock"
+
 #define RPCBIND_PROGRAM		(100000u)
 #define RPCBIND_PORT		(111u)
 
@@ -158,20 +161,69 @@ static void rpcb_map_release(void *data)
 	kfree(map);
 }
 
-static const struct sockaddr_in rpcb_inaddr_loopback = {
-	.sin_family		= AF_INET,
-	.sin_addr.s_addr	= htonl(INADDR_LOOPBACK),
-	.sin_port		= htons(RPCBIND_PORT),
-};
+/*
+ * Returns zero on success, otherwise a negative errno value
+ * is returned.
+ */
+static int rpcb_create_local_unix(void)
+{
+	static const struct sockaddr_un rpcb_localaddr_rpcbind = {
+		.sun_family		= AF_LOCAL,
+		.sun_path		= RPCBIND_SOCK_PATHNAME,
+	};
+	struct rpc_create_args args = {
+		.net		= &init_net,
+		.protocol	= XPRT_TRANSPORT_LOCAL,
+		.address	= (struct sockaddr *)&rpcb_localaddr_rpcbind,
+		.addrsize	= sizeof(rpcb_localaddr_rpcbind),
+		.servername	= "localhost",
+		.program	= &rpcb_program,
+		.version	= RPCBVERS_2,
+		.authflavor	= RPC_AUTH_NULL,
+	};
+	struct rpc_clnt *clnt, *clnt4;
+	int result = 0;
+
+	/*
+	 * Because we requested an RPC PING at transport creation time,
+	 * this works only if the user space portmapper is rpcbind, and
+	 * it's listening on AF_LOCAL on the named socket.
+	 */
+	clnt = rpc_create(&args);
+	if (IS_ERR(clnt)) {
+		dprintk("RPC:       failed to create AF_LOCAL rpcbind "
+				"client (errno %ld).\n", PTR_ERR(clnt));
+		result = -PTR_ERR(clnt);
+		goto out;
+	}
+
+	clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
+	if (IS_ERR(clnt4)) {
+		dprintk("RPC:       failed to bind second program to "
+				"rpcbind v4 client (errno %ld).\n",
+				PTR_ERR(clnt4));
+		clnt4 = NULL;
+	}
+
+	/* Protected by rpcb_create_local_mutex */
+	rpcb_local_clnt = clnt;
+	rpcb_local_clnt4 = clnt4;
 
-static DEFINE_MUTEX(rpcb_create_local_mutex);
+out:
+	return result;
+}
 
 /*
  * Returns zero on success, otherwise a negative errno value
  * is returned.
  */
-static int rpcb_create_local(void)
+static int rpcb_create_local_net(void)
 {
+	static const struct sockaddr_in rpcb_inaddr_loopback = {
+		.sin_family		= AF_INET,
+		.sin_addr.s_addr	= htonl(INADDR_LOOPBACK),
+		.sin_port		= htons(RPCBIND_PORT),
+	};
 	struct rpc_create_args args = {
 		.net		= &init_net,
 		.protocol	= XPRT_TRANSPORT_TCP,
@@ -186,13 +238,6 @@ static int rpcb_create_local(void)
 	struct rpc_clnt *clnt, *clnt4;
 	int result = 0;
 
-	if (rpcb_local_clnt)
-		return result;
-
-	mutex_lock(&rpcb_create_local_mutex);
-	if (rpcb_local_clnt)
-		goto out;
-
 	clnt = rpc_create(&args);
 	if (IS_ERR(clnt)) {
 		dprintk("RPC:       failed to create local rpcbind "
@@ -214,9 +259,33 @@ static int rpcb_create_local(void)
 		clnt4 = NULL;
 	}
 
+	/* Protected by rpcb_create_local_mutex */
 	rpcb_local_clnt = clnt;
 	rpcb_local_clnt4 = clnt4;
 
+out:
+	return result;
+}
+
+/*
+ * Returns zero on success, otherwise a negative errno value
+ * is returned.
+ */
+static int rpcb_create_local(void)
+{
+	static DEFINE_MUTEX(rpcb_create_local_mutex);
+	int result = 0;
+
+	if (rpcb_local_clnt)
+		return result;
+
+	mutex_lock(&rpcb_create_local_mutex);
+	if (rpcb_local_clnt)
+		goto out;
+
+	if (rpcb_create_local_unix() != 0)
+		result = rpcb_create_local_net();
+
 out:
 	mutex_unlock(&rpcb_create_local_mutex);
 	return result;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 08e05a8ce02..2b90292e950 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -942,6 +942,8 @@ static void svc_unregister(const struct svc_serv *serv)
 			if (progp->pg_vers[i]->vs_hidden)
 				continue;
 
+			dprintk("svc: attempting to unregister %sv%u\n",
+				progp->pg_name, i);
 			__svc_unregister(progp->pg_prog, i, progp->pg_name);
 		}
 	}
-- 
cgit v1.2.3-70-g09d2


From 559649efb9b0d248541933197bdf7b75529da457 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 May 2011 15:23:04 -0400
Subject: SUNRPC: Remove obsolete comment

Clean up.  The documenting comment at the top of net/sunrpc/clnt.c is
out of date.  We adopted BSD's RTO estimation mechanism years ago.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b91f8f943c5..08ed49629b8 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -13,10 +13,6 @@
  *	and need to be refreshed, or when a packet was damaged in transit.
  *	This may be have to be moved to the VFS layer.
  *
- *  NB: BSD uses a more intelligent approach to guessing when a request
- *  or reply has been lost by keeping the RTO estimate for each procedure.
- *  We currently make do with a constant timeout value.
- *
  *  Copyright (C) 1992,1993 Rick Sladkey <jrs@world.std.com>
  *  Copyright (C) 1995,1996 Olaf Kirch <okir@monad.swb.de>
  */
-- 
cgit v1.2.3-70-g09d2


From 176e21ee2ec89cae8d45cf1a850ea45a45428fb8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@ORACLE.COM>
Date: Mon, 9 May 2011 15:22:44 -0400
Subject: SUNRPC: Support for RPC over AF_LOCAL transports

TI-RPC introduces the capability of performing RPC over AF_LOCAL
sockets.  It uses this mainly for registering and unregistering
local RPC services securely with the local rpcbind, but we could
also conceivably use it as a generic upcall mechanism.

This patch provides a client-side only implementation for the moment.
We might also consider a server-side implementation to provide
AF_LOCAL access to NLM (for statd downcalls, and such like).

Autobinding is not supported on kernel AF_LOCAL transports at this
time.  Kernel ULPs must specify the pathname of the remote endpoint
when an AF_LOCAL transport is created.  rpcbind supports registering
services available via AF_LOCAL, so the kernel could handle it with
some adjustment to ->rpcbind and ->set_port.  But we don't need this
feature for doing upcalls via well-known named sockets.

This has not been tested with ULPs that move a substantial amount of
data.  Thus, I can't attest to how robust the write_space and
congestion management logic is.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/msg_prot.h |   1 +
 include/linux/sunrpc/xprt.h     |   3 +-
 net/sunrpc/clnt.c               |   8 +
 net/sunrpc/xprtsock.c           | 395 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 403 insertions(+), 4 deletions(-)

diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h
index 77e62488339..c68a147939a 100644
--- a/include/linux/sunrpc/msg_prot.h
+++ b/include/linux/sunrpc/msg_prot.h
@@ -145,6 +145,7 @@ typedef __be32	rpc_fraghdr;
 #define RPCBIND_NETID_TCP	"tcp"
 #define RPCBIND_NETID_UDP6	"udp6"
 #define RPCBIND_NETID_TCP6	"tcp6"
+#define RPCBIND_NETID_LOCAL	"local"
 
 /*
  * Note that RFC 1833 does not put any size restrictions on the
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index a0f998c07c6..81cce3b3ee6 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -141,7 +141,8 @@ enum xprt_transports {
 	XPRT_TRANSPORT_UDP	= IPPROTO_UDP,
 	XPRT_TRANSPORT_TCP	= IPPROTO_TCP,
 	XPRT_TRANSPORT_BC_TCP	= IPPROTO_TCP | XPRT_TRANSPORT_BC,
-	XPRT_TRANSPORT_RDMA	= 256
+	XPRT_TRANSPORT_RDMA	= 256,
+	XPRT_TRANSPORT_LOCAL	= 257,
 };
 
 struct rpc_xprt {
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 08ed49629b8..b84d7395535 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -28,7 +28,9 @@
 #include <linux/slab.h>
 #include <linux/utsname.h>
 #include <linux/workqueue.h>
+#include <linux/in.h>
 #include <linux/in6.h>
+#include <linux/un.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
@@ -294,6 +296,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 	 * up a string representation of the passed-in address.
 	 */
 	if (args->servername == NULL) {
+		struct sockaddr_un *sun =
+				(struct sockaddr_un *)args->address;
 		struct sockaddr_in *sin =
 				(struct sockaddr_in *)args->address;
 		struct sockaddr_in6 *sin6 =
@@ -301,6 +305,10 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 
 		servername[0] = '\0';
 		switch (args->address->sa_family) {
+		case AF_LOCAL:
+			snprintf(servername, sizeof(servername), "%s",
+				 sun->sun_path);
+			break;
 		case AF_INET:
 			snprintf(servername, sizeof(servername), "%pI4",
 				 &sin->sin_addr.s_addr);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 079366974e1..72abb735893 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/capability.h>
@@ -28,6 +29,7 @@
 #include <linux/in.h>
 #include <linux/net.h>
 #include <linux/mm.h>
+#include <linux/un.h>
 #include <linux/udp.h>
 #include <linux/tcp.h>
 #include <linux/sunrpc/clnt.h>
@@ -45,6 +47,9 @@
 #include <net/tcp.h>
 
 #include "sunrpc.h"
+
+static void xs_close(struct rpc_xprt *xprt);
+
 /*
  * xprtsock tunables
  */
@@ -261,6 +266,11 @@ static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt)
 	return (struct sockaddr *) &xprt->addr;
 }
 
+static inline struct sockaddr_un *xs_addr_un(struct rpc_xprt *xprt)
+{
+	return (struct sockaddr_un *) &xprt->addr;
+}
+
 static inline struct sockaddr_in *xs_addr_in(struct rpc_xprt *xprt)
 {
 	return (struct sockaddr_in *) &xprt->addr;
@@ -276,23 +286,34 @@ static void xs_format_common_peer_addresses(struct rpc_xprt *xprt)
 	struct sockaddr *sap = xs_addr(xprt);
 	struct sockaddr_in6 *sin6;
 	struct sockaddr_in *sin;
+	struct sockaddr_un *sun;
 	char buf[128];
 
-	(void)rpc_ntop(sap, buf, sizeof(buf));
-	xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
-
 	switch (sap->sa_family) {
+	case AF_LOCAL:
+		sun = xs_addr_un(xprt);
+		strlcpy(buf, sun->sun_path, sizeof(buf));
+		xprt->address_strings[RPC_DISPLAY_ADDR] =
+						kstrdup(buf, GFP_KERNEL);
+		break;
 	case AF_INET:
+		(void)rpc_ntop(sap, buf, sizeof(buf));
+		xprt->address_strings[RPC_DISPLAY_ADDR] =
+						kstrdup(buf, GFP_KERNEL);
 		sin = xs_addr_in(xprt);
 		snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
 		break;
 	case AF_INET6:
+		(void)rpc_ntop(sap, buf, sizeof(buf));
+		xprt->address_strings[RPC_DISPLAY_ADDR] =
+						kstrdup(buf, GFP_KERNEL);
 		sin6 = xs_addr_in6(xprt);
 		snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr);
 		break;
 	default:
 		BUG();
 	}
+
 	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
 }
 
@@ -505,6 +526,60 @@ static inline void xs_encode_stream_record_marker(struct xdr_buf *buf)
 	*base = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | reclen);
 }
 
+/**
+ * xs_local_send_request - write an RPC request to an AF_LOCAL socket
+ * @task: RPC task that manages the state of an RPC request
+ *
+ * Return values:
+ *        0:	The request has been sent
+ *   EAGAIN:	The socket was blocked, please call again later to
+ *		complete the request
+ * ENOTCONN:	Caller needs to invoke connect logic then call again
+ *    other:	Some other error occured, the request was not sent
+ */
+static int xs_local_send_request(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_xprt *xprt = req->rq_xprt;
+	struct sock_xprt *transport =
+				container_of(xprt, struct sock_xprt, xprt);
+	struct xdr_buf *xdr = &req->rq_snd_buf;
+	int status;
+
+	xs_encode_stream_record_marker(&req->rq_snd_buf);
+
+	xs_pktdump("packet data:",
+			req->rq_svec->iov_base, req->rq_svec->iov_len);
+
+	status = xs_sendpages(transport->sock, NULL, 0,
+						xdr, req->rq_bytes_sent);
+	dprintk("RPC:       %s(%u) = %d\n",
+			__func__, xdr->len - req->rq_bytes_sent, status);
+	if (likely(status >= 0)) {
+		req->rq_bytes_sent += status;
+		req->rq_xmit_bytes_sent += status;
+		if (likely(req->rq_bytes_sent >= req->rq_slen)) {
+			req->rq_bytes_sent = 0;
+			return 0;
+		}
+		status = -EAGAIN;
+	}
+
+	switch (status) {
+	case -EAGAIN:
+		status = xs_nospace(task);
+		break;
+	default:
+		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
+			-status);
+	case -EPIPE:
+		xs_close(xprt);
+		status = -ENOTCONN;
+	}
+
+	return status;
+}
+
 /**
  * xs_udp_send_request - write an RPC request to a UDP socket
  * @task: address of RPC task that manages the state of an RPC request
@@ -788,6 +863,88 @@ static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
 	return (struct rpc_xprt *) sk->sk_user_data;
 }
 
+static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
+{
+	struct xdr_skb_reader desc = {
+		.skb		= skb,
+		.offset		= sizeof(rpc_fraghdr),
+		.count		= skb->len - sizeof(rpc_fraghdr),
+	};
+
+	if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0)
+		return -1;
+	if (desc.count)
+		return -1;
+	return 0;
+}
+
+/**
+ * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets
+ * @sk: socket with data to read
+ * @len: how much data to read
+ *
+ * Currently this assumes we can read the whole reply in a single gulp.
+ */
+static void xs_local_data_ready(struct sock *sk, int len)
+{
+	struct rpc_task *task;
+	struct rpc_xprt *xprt;
+	struct rpc_rqst *rovr;
+	struct sk_buff *skb;
+	int err, repsize, copied;
+	u32 _xid;
+	__be32 *xp;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	dprintk("RPC:       %s...\n", __func__);
+	xprt = xprt_from_sock(sk);
+	if (xprt == NULL)
+		goto out;
+
+	skb = skb_recv_datagram(sk, 0, 1, &err);
+	if (skb == NULL)
+		goto out;
+
+	if (xprt->shutdown)
+		goto dropit;
+
+	repsize = skb->len - sizeof(rpc_fraghdr);
+	if (repsize < 4) {
+		dprintk("RPC:       impossible RPC reply size %d\n", repsize);
+		goto dropit;
+	}
+
+	/* Copy the XID from the skb... */
+	xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid);
+	if (xp == NULL)
+		goto dropit;
+
+	/* Look up and lock the request corresponding to the given XID */
+	spin_lock(&xprt->transport_lock);
+	rovr = xprt_lookup_rqst(xprt, *xp);
+	if (!rovr)
+		goto out_unlock;
+	task = rovr->rq_task;
+
+	copied = rovr->rq_private_buf.buflen;
+	if (copied > repsize)
+		copied = repsize;
+
+	if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) {
+		dprintk("RPC:       sk_buff copy failed\n");
+		goto out_unlock;
+	}
+
+	xprt_complete_rqst(task, copied);
+
+ out_unlock:
+	spin_unlock(&xprt->transport_lock);
+ dropit:
+	skb_free_datagram(sk, skb);
+ out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
 /**
  * xs_udp_data_ready - "data ready" callback for UDP sockets
  * @sk: socket with data to read
@@ -1573,11 +1730,31 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
 	return err;
 }
 
+/*
+ * We don't support autobind on AF_LOCAL sockets
+ */
+static void xs_local_rpcbind(struct rpc_task *task)
+{
+	xprt_set_bound(task->tk_xprt);
+}
+
+static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port)
+{
+}
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key xs_key[2];
 static struct lock_class_key xs_slock_key[2];
 
+static inline void xs_reclassify_socketu(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	BUG_ON(sock_owned_by_user(sk));
+	sock_lock_init_class_and_name(sk, "slock-AF_LOCAL-RPC",
+		&xs_slock_key[1], "sk_lock-AF_LOCAL-RPC", &xs_key[1]);
+}
+
 static inline void xs_reclassify_socket4(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -1599,6 +1776,9 @@ static inline void xs_reclassify_socket6(struct socket *sock)
 static inline void xs_reclassify_socket(int family, struct socket *sock)
 {
 	switch (family) {
+	case AF_LOCAL:
+		xs_reclassify_socketu(sock);
+		break;
 	case AF_INET:
 		xs_reclassify_socket4(sock);
 		break;
@@ -1608,6 +1788,10 @@ static inline void xs_reclassify_socket(int family, struct socket *sock)
 	}
 }
 #else
+static inline void xs_reclassify_socketu(struct socket *sock)
+{
+}
+
 static inline void xs_reclassify_socket4(struct socket *sock)
 {
 }
@@ -1646,6 +1830,94 @@ out:
 	return ERR_PTR(err);
 }
 
+static int xs_local_finish_connecting(struct rpc_xprt *xprt,
+				      struct socket *sock)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+									xprt);
+
+	if (!transport->inet) {
+		struct sock *sk = sock->sk;
+
+		write_lock_bh(&sk->sk_callback_lock);
+
+		xs_save_old_callbacks(transport, sk);
+
+		sk->sk_user_data = xprt;
+		sk->sk_data_ready = xs_local_data_ready;
+		sk->sk_write_space = xs_udp_write_space;
+		sk->sk_error_report = xs_error_report;
+		sk->sk_allocation = GFP_ATOMIC;
+
+		xprt_clear_connected(xprt);
+
+		/* Reset to new socket */
+		transport->sock = sock;
+		transport->inet = sk;
+
+		write_unlock_bh(&sk->sk_callback_lock);
+	}
+
+	/* Tell the socket layer to start connecting... */
+	xprt->stat.connect_count++;
+	xprt->stat.connect_start = jiffies;
+	return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0);
+}
+
+/**
+ * xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint
+ * @xprt: RPC transport to connect
+ * @transport: socket transport to connect
+ * @create_sock: function to create a socket of the correct type
+ *
+ * Invoked by a work queue tasklet.
+ */
+static void xs_local_setup_socket(struct work_struct *work)
+{
+	struct sock_xprt *transport =
+		container_of(work, struct sock_xprt, connect_worker.work);
+	struct rpc_xprt *xprt = &transport->xprt;
+	struct socket *sock;
+	int status = -EIO;
+
+	if (xprt->shutdown)
+		goto out;
+
+	clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
+	status = __sock_create(xprt->xprt_net, AF_LOCAL,
+					SOCK_STREAM, 0, &sock, 1);
+	if (status < 0) {
+		dprintk("RPC:       can't create AF_LOCAL "
+			"transport socket (%d).\n", -status);
+		goto out;
+	}
+	xs_reclassify_socketu(sock);
+
+	dprintk("RPC:       worker connecting xprt %p via AF_LOCAL to %s\n",
+			xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+	status = xs_local_finish_connecting(xprt, sock);
+	switch (status) {
+	case 0:
+		dprintk("RPC:       xprt %p connected to %s\n",
+				xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
+		xprt_set_connected(xprt);
+		break;
+	case -ENOENT:
+		dprintk("RPC:       xprt %p: socket %s does not exist\n",
+				xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
+		break;
+	default:
+		printk(KERN_ERR "%s: unhandled error (%d) connecting to %s\n",
+				__func__, -status,
+				xprt->address_strings[RPC_DISPLAY_ADDR]);
+	}
+
+out:
+	xprt_clear_connecting(xprt);
+	xprt_wake_pending_tasks(xprt, status);
+}
+
 static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -1929,6 +2201,32 @@ static void xs_connect(struct rpc_task *task)
 	}
 }
 
+/**
+ * xs_local_print_stats - display AF_LOCAL socket-specifc stats
+ * @xprt: rpc_xprt struct containing statistics
+ * @seq: output file
+ *
+ */
+static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+	long idle_time = 0;
+
+	if (xprt_connected(xprt))
+		idle_time = (long)(jiffies - xprt->last_used) / HZ;
+
+	seq_printf(seq, "\txprt:\tlocal %lu %lu %lu %ld %lu %lu %lu "
+			"%llu %llu\n",
+			xprt->stat.bind_count,
+			xprt->stat.connect_count,
+			xprt->stat.connect_time,
+			idle_time,
+			xprt->stat.sends,
+			xprt->stat.recvs,
+			xprt->stat.bad_xids,
+			xprt->stat.req_u,
+			xprt->stat.bklog_u);
+}
+
 /**
  * xs_udp_print_stats - display UDP socket-specifc stats
  * @xprt: rpc_xprt struct containing statistics
@@ -2099,6 +2397,21 @@ static void bc_destroy(struct rpc_xprt *xprt)
 {
 }
 
+static struct rpc_xprt_ops xs_local_ops = {
+	.reserve_xprt		= xprt_reserve_xprt,
+	.release_xprt		= xs_tcp_release_xprt,
+	.rpcbind		= xs_local_rpcbind,
+	.set_port		= xs_local_set_port,
+	.connect		= xs_connect,
+	.buf_alloc		= rpc_malloc,
+	.buf_free		= rpc_free,
+	.send_request		= xs_local_send_request,
+	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
+	.close			= xs_close,
+	.destroy		= xs_destroy,
+	.print_stats		= xs_local_print_stats,
+};
+
 static struct rpc_xprt_ops xs_udp_ops = {
 	.set_buffer_size	= xs_udp_set_buffer_size,
 	.reserve_xprt		= xprt_reserve_xprt_cong,
@@ -2160,6 +2473,8 @@ static int xs_init_anyaddr(const int family, struct sockaddr *sap)
 	};
 
 	switch (family) {
+	case AF_LOCAL:
+		break;
 	case AF_INET:
 		memcpy(sap, &sin, sizeof(sin));
 		break;
@@ -2207,6 +2522,70 @@ static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
 	return xprt;
 }
 
+static const struct rpc_timeout xs_local_default_timeout = {
+	.to_initval = 10 * HZ,
+	.to_maxval = 10 * HZ,
+	.to_retries = 2,
+};
+
+/**
+ * xs_setup_local - Set up transport to use an AF_LOCAL socket
+ * @args: rpc transport creation arguments
+ *
+ * AF_LOCAL is a "tpi_cots_ord" transport, just like TCP
+ */
+static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
+{
+	struct sockaddr_un *sun = (struct sockaddr_un *)args->dstaddr;
+	struct sock_xprt *transport;
+	struct rpc_xprt *xprt;
+	struct rpc_xprt *ret;
+
+	xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries);
+	if (IS_ERR(xprt))
+		return xprt;
+	transport = container_of(xprt, struct sock_xprt, xprt);
+
+	xprt->prot = 0;
+	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
+	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
+
+	xprt->bind_timeout = XS_BIND_TO;
+	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+	xprt->idle_timeout = XS_IDLE_DISC_TO;
+
+	xprt->ops = &xs_local_ops;
+	xprt->timeout = &xs_local_default_timeout;
+
+	switch (sun->sun_family) {
+	case AF_LOCAL:
+		if (sun->sun_path[0] != '/') {
+			dprintk("RPC:       bad AF_LOCAL address: %s\n",
+					sun->sun_path);
+			ret = ERR_PTR(-EINVAL);
+			goto out_err;
+		}
+		xprt_set_bound(xprt);
+		INIT_DELAYED_WORK(&transport->connect_worker,
+					xs_local_setup_socket);
+		xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL);
+		break;
+	default:
+		ret = ERR_PTR(-EAFNOSUPPORT);
+		goto out_err;
+	}
+
+	dprintk("RPC:       set up xprt to %s via AF_LOCAL\n",
+			xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+	if (try_module_get(THIS_MODULE))
+		return xprt;
+	ret = ERR_PTR(-EINVAL);
+out_err:
+	xprt_free(xprt);
+	return ret;
+}
+
 static const struct rpc_timeout xs_udp_default_timeout = {
 	.to_initval = 5 * HZ,
 	.to_maxval = 30 * HZ,
@@ -2448,6 +2827,14 @@ out_err:
 	return ret;
 }
 
+static struct xprt_class	xs_local_transport = {
+	.list		= LIST_HEAD_INIT(xs_local_transport.list),
+	.name		= "named UNIX socket",
+	.owner		= THIS_MODULE,
+	.ident		= XPRT_TRANSPORT_LOCAL,
+	.setup		= xs_setup_local,
+};
+
 static struct xprt_class	xs_udp_transport = {
 	.list		= LIST_HEAD_INIT(xs_udp_transport.list),
 	.name		= "udp",
@@ -2483,6 +2870,7 @@ int init_socket_xprt(void)
 		sunrpc_table_header = register_sysctl_table(sunrpc_table);
 #endif
 
+	xprt_register_transport(&xs_local_transport);
 	xprt_register_transport(&xs_udp_transport);
 	xprt_register_transport(&xs_tcp_transport);
 	xprt_register_transport(&xs_bc_tcp_transport);
@@ -2503,6 +2891,7 @@ void cleanup_socket_xprt(void)
 	}
 #endif
 
+	xprt_unregister_transport(&xs_local_transport);
 	xprt_unregister_transport(&xs_udp_transport);
 	xprt_unregister_transport(&xs_tcp_transport);
 	xprt_unregister_transport(&xs_bc_tcp_transport);
-- 
cgit v1.2.3-70-g09d2


From 89e1be50c68eb5e58b873dce87bbac627ee18d1f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 May 2011 23:11:24 -0400
Subject: x86: Put back -pg to tsc.o and add no GCOV to vread_tsc_64.o

The commit 44259b1abfaa8bb819d25d41d71e8e33e25dd36a
    Author: Andy Lutomirski <luto@MIT.EDU>
    x86-64: Move vread_tsc into a new file with sensible options

Removed the -pg from tsc.o which caused the function graph tracer
to go into an infinite function call recursion as it uses the tsc
internally outside its recursion protection, thus tracing the tsc
breaks the function graph tracer.

This commit also added the file vread_tsc_64.c that gets used
by vdso but failed to prevent GCOV from monkeying with it,
causing userspace to try to access kernel data when GCOV was
enabled.

Thanks to Thomas Gleixner for pointing out GCOV as the likely
culprit that added strange kernel accesses into the vread_tsc()
call.

Cc: Author: Andy Lutomirski <luto@MIT.EDU>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f5abe3a245b..90b06d4daee 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -8,6 +8,7 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not profile debug and lowlevel utilities
+CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
 CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
 CFLAGS_REMOVE_pvclock.o = -pg
@@ -28,6 +29,7 @@ CFLAGS_paravirt.o	:= $(nostackp)
 GCOV_PROFILE_vsyscall_64.o	:= n
 GCOV_PROFILE_hpet.o		:= n
 GCOV_PROFILE_tsc.o		:= n
+GCOV_PROFILE_vread_tsc_64.o	:= n
 GCOV_PROFILE_paravirt.o		:= n
 
 # vread_tsc_64 is hot and should be fully optimized:
-- 
cgit v1.2.3-70-g09d2


From 76f0b8d2d2c8d417623142069cdfde1cf1e108d5 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:41:54 -0700
Subject: bfs: remove unnecessary dentry_unhash on dir rename

Bfs does not have problems with references to unlinked directories.

CC: tigran@aivazian.fsnet.co.uk
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/bfs/dir.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index c7d1d06b048..b14cebfd904 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -224,9 +224,6 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct bfs_sb_info *info;
 	int error = -ENOENT;
 
-	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	old_bh = new_bh = NULL;
 	old_inode = old_dentry->d_inode;
 	if (S_ISDIR(old_inode->i_mode))
-- 
cgit v1.2.3-70-g09d2


From 873ae4d5a8b282c6e2bbc09c6d59eeb1bec97ef7 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:41:55 -0700
Subject: sysv: remove unnecessary dentry_unhash from rmdir, dir rename

sysv does not have problems with references to unlinked directories.

CC: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sysv/namei.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e2cc6756f3b..e474fbcf8bd 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -196,8 +196,6 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
 	struct inode *inode = dentry->d_inode;
 	int err = -ENOTEMPTY;
 
-	dentry_unhash(dentry);
-
 	if (sysv_empty_dir(inode)) {
 		err = sysv_unlink(dir, dentry);
 		if (!err) {
@@ -224,9 +222,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
 	struct sysv_dir_entry * old_de;
 	int err = -ENOENT;
 
-	if (new_inode && S_ISDIR(new_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	old_de = sysv_find_entry(old_dentry, &old_page);
 	if (!old_de)
 		goto out;
-- 
cgit v1.2.3-70-g09d2


From cf0f0536fa65ca1353476c49feed34891f3f7134 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:41:56 -0700
Subject: jffs2: remove unnecessary dentry_unhash from rmdir, dir rename

jffs2 does not have problems with references to unlinked directories.

CC: David Woodhouse <dwmw2@infradead.org>
CC: linux-mtd@lists.infradead.org
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jffs2/dir.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 05f73328b28..82faddd1f32 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -609,8 +609,6 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
 	int ret;
 	uint32_t now = get_seconds();
 
-	dentry_unhash(dentry);
-
 	for (fd = f->dents ; fd; fd = fd->next) {
 		if (fd->ino)
 			return -ENOTEMPTY;
@@ -786,9 +784,6 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 	uint8_t type;
 	uint32_t now;
 
-	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	/* The VFS will check for us and prevent trying to rename a
 	 * file over a directory and vice versa, but if it's a directory,
 	 * the VFS can't check whether the victim is empty. The filesystem
-- 
cgit v1.2.3-70-g09d2


From 44a8e6364e48ab93a1d86385b5fc9efe81395fa9 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:41:57 -0700
Subject: jfs: remove unnecessary dentry_unhash from rmdir, dir rename

jfs does not have problems with references to unlinked directories.

CC: Dave Kleikamp <shaggy@kernel.org>
CC: jfs-discussion@lists.sourceforge.net
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jfs/namei.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 865df16a6cf..eaaf2b511e8 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -360,8 +360,6 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
 
 	jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
 
-	dentry_unhash(dentry);
-
 	/* Init inode for quota operations. */
 	dquot_initialize(dip);
 	dquot_initialize(ip);
@@ -1097,9 +1095,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
 		 new_dentry->d_name.name);
 
-	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	dquot_initialize(old_dir);
 	dquot_initialize(new_dir);
 
-- 
cgit v1.2.3-70-g09d2


From 64370e2f9df572977757d70b1ad293b743dd9e20 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:41:58 -0700
Subject: logfs: remove unnecessary dentry_unhash from rmdir, dir rename

logfs does not have problems with references to unlinked directories.

CC: Joern Engel <joern@logfs.org>
CC: logfs@logfs.org
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/logfs/dir.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index f34c9cde9e9..9ed89d1663f 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -273,8 +273,6 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 
-	dentry_unhash(dentry);
-
 	if (!logfs_empty_dir(inode))
 		return -ENOTEMPTY;
 
@@ -624,9 +622,6 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
 	loff_t pos;
 	int err;
 
-	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	/* 1. locate source dd */
 	err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
 	if (err)
-- 
cgit v1.2.3-70-g09d2


From dfb55de89879a1c32a70d0a510b3701ed9a6b855 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:41:59 -0700
Subject: nilfs2: remove unnecessary dentry_unhash from rmdir, dir rename

nilfs2 does not have problems with references to unlinked directories.

CC: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp>
CC: linux-nilfs@vger.kernel.org
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nilfs2/namei.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1102a5fbb74..546849b3e88 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -334,8 +334,6 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct nilfs_transaction_info ti;
 	int err;
 
-	dentry_unhash(dentry);
-
 	err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
 	if (err)
 		return err;
@@ -371,9 +369,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct nilfs_transaction_info ti;
 	int err;
 
-	if (new_inode && S_ISDIR(new_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
 	if (unlikely(err))
 		return err;
-- 
cgit v1.2.3-70-g09d2


From 0e54ec1c3a002a9d5e57b5ac73a934cc15a0fe06 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:00 -0700
Subject: ubifs: remove unnecessary dentry_unhash from rmdir, dir rename

ubifs does not have problems with references to unlinked directories.

CC: Artem Bityutskiy <dedekind1@gmail.com>
CC: Adrian Hunter <adrian.hunter@nokia.com>
CC: linux-mtd@lists.infradead.org
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ubifs/dir.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index c2b80943560..ef5abd38f0b 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -656,8 +656,6 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct ubifs_inode *dir_ui = ubifs_inode(dir);
 	struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
 
-	dentry_unhash(dentry);
-
 	/*
 	 * Budget request settings: deletion direntry, deletion inode and
 	 * changing the parent inode. If budgeting fails, go ahead anyway
@@ -978,9 +976,6 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
 	struct timespec time;
 
-	if (new_inode && S_ISDIR(new_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	/*
 	 * Budget request settings: deletion direntry, new direntry, removing
 	 * the old inode, and changing old and new parent directory inodes.
-- 
cgit v1.2.3-70-g09d2


From 87161faae26503a8ebe1be5ba72073ae860dbfc7 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:01 -0700
Subject: ufs: remove unnecessary dentry_unhash from rmdir, dir rename

ufs does not have problems with references to unlinked directories.

CC: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ufs/namei.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 953ebdfc5bf..29309e25417 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -258,8 +258,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
 	struct inode * inode = dentry->d_inode;
 	int err= -ENOTEMPTY;
 
-	dentry_unhash(dentry);
-
 	lock_ufs(dir->i_sb);
 	if (ufs_empty_dir (inode)) {
 		err = ufs_unlink(dir, dentry);
@@ -284,9 +282,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct ufs_dir_entry *old_de;
 	int err = -ENOENT;
 
-	if (new_inode && S_ISDIR(new_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
 	if (!old_de)
 		goto out;
-- 
cgit v1.2.3-70-g09d2


From cc350c2764a657ee012efd5bd260a6cd5be2f877 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:02 -0700
Subject: reiserfs: remove unnecessary dentry_unhash from rmdir, dir rename

Reiserfs does not have problems with references to unlinked directories.

CC: reiserfs-devel@vger.kernel.org
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/namei.c | 5 -----
 fs/reiserfs/xattr.c | 1 -
 2 files changed, 6 deletions(-)

diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 76c8164d565..118662690cd 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -831,8 +831,6 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
 	INITIALIZE_PATH(path);
 	struct reiserfs_dir_entry de;
 
-	dentry_unhash(dentry);
-
 	/* we will be doing 2 balancings and update 2 stat data, we change quotas
 	 * of the owner of the directory and of the owner of the parent directory.
 	 * The quota structure is possibly deleted only on last iput => outside
@@ -1227,9 +1225,6 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	unsigned long savelink = 1;
 	struct timespec ctime;
 
-	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	/* three balancings: (1) old name removal, (2) new name insertion
 	   and (3) maybe "save" link insertion
 	   stat data updates: (1) old directory,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 50f1abccd1c..e8a62f41b45 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -98,7 +98,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
 
 	reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
 					I_MUTEX_CHILD, dir->i_sb);
-	dentry_unhash(dentry);
 	error = dir->i_op->rmdir(dir, dentry);
 	if (!error)
 		dentry->d_inode->i_flags |= S_DEAD;
-- 
cgit v1.2.3-70-g09d2


From 7020739df2fa0e2126fc9739987e016860f14323 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:03 -0700
Subject: udf: remove unnecessary dentry_unhash from rmdir, dir rename

udf does not have problems with references to unlinked directories.

CC: Jan Kara <jack@suse.cz>
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/udf/namei.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 4d76594c2a8..f1dce848ef9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -783,8 +783,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 	struct fileIdentDesc *fi, cfi;
 	struct kernel_lb_addr tloc;
 
-	dentry_unhash(dentry);
-
 	retval = -ENOENT;
 	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
 	if (!fi)
@@ -1083,9 +1081,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct kernel_lb_addr tloc;
 	struct udf_inode_info *old_iinfo = UDF_I(old_inode);
 
-	if (new_inode && S_ISDIR(new_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
 	if (ofi) {
 		if (ofibh.sbh != ofibh.ebh)
-- 
cgit v1.2.3-70-g09d2


From 8aaa0f5431d8d1181b3d1a1bcd8f3330c0ce275f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:04 -0700
Subject: omfs: remove unnecessary dentry_unhash on rmdir, dir rneame

omfs does not have problems with references to unlinked directories.

CC: Bob Copeland <me@bobcopeland.com>
CC: linux-karma-devel@lists.sourceforge.net
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/omfs/dir.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c368360c35a..3b8d3979e03 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -241,11 +241,9 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
 	int ret;
 
 
-	if (S_ISDIR(inode->i_mode)) {
-		dentry_unhash(dentry);
-		if (!omfs_dir_is_empty(inode))
-			return -ENOTEMPTY;
-	}
+	if (S_ISDIR(inode->i_mode) &&
+	    !omfs_dir_is_empty(inode))
+		return -ENOTEMPTY;
 
 	ret = omfs_delete_entry(dentry);
 	if (ret)
@@ -382,9 +380,6 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int err;
 
 	if (new_inode) {
-		if (S_ISDIR(new_inode->i_mode))
-			dentry_unhash(new_dentry);
-
 		/* overwriting existing file/dir */
 		err = omfs_remove(new_dir, new_dentry);
 		if (err)
-- 
cgit v1.2.3-70-g09d2


From 4e82d61b6ac4966b3b61c2d97ddf04928f037be1 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:05 -0700
Subject: hfs: remove unnecessary dentry_unhash on rmdir, dir rename

hfs does not have problems with references to unlinked directories.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hfs/dir.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 1cb70cdba2c..b4d70b13be9 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -253,9 +253,6 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	int res;
 
-	if (S_ISDIR(inode->i_mode))
-		dentry_unhash(dentry);
-
 	if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
 		return -ENOTEMPTY;
 	res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
@@ -286,9 +283,6 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/* Unlink destination if it already exists */
 	if (new_dentry->d_inode) {
-		if (S_ISDIR(new_dentry->d_inode->i_mode))
-			dentry_unhash(new_dentry);
-
 		res = hfs_remove(new_dir, new_dentry);
 		if (res)
 			return res;
-- 
cgit v1.2.3-70-g09d2


From e3911785b8ae6897b3dae2af4fa296aa5a0f2c56 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:06 -0700
Subject: hfsplus: remove unnecessary dentry_unhash on rmdir, dir rename

hfsplus does not have problems with references to unlinked directories.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hfsplus/dir.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index b28835091dd..4df5059c25d 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -370,8 +370,6 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	int res;
 
-	dentry_unhash(dentry);
-
 	if (inode->i_size != 2)
 		return -ENOTEMPTY;
 
@@ -469,12 +467,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/* Unlink destination if it already exists */
 	if (new_dentry->d_inode) {
-		if (S_ISDIR(new_dentry->d_inode->i_mode)) {
-			dentry_unhash(new_dentry);
+		if (S_ISDIR(new_dentry->d_inode->i_mode))
 			res = hfsplus_rmdir(new_dir, new_dentry);
-		} else {
+		else
 			res = hfsplus_unlink(new_dir, new_dentry);
-		}
 		if (res)
 			return res;
 	}
-- 
cgit v1.2.3-70-g09d2


From e41a59e0550b7bb40fe8c3438d690712e9fd511c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:07 -0700
Subject: hostfs: remove unnecessary dentry_unhash on rmdir, dir rename

hostfs does not have problems with references to unlinked directories.

CC: Jeff Dike <jdike@addtoit.com>
CC: Richard Weinberger <richard@nod.at>
CC: user-mode-linux-devel@lists.sourceforge.net
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hostfs/hostfs_kern.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e6816b9e690..2638c834ed2 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -683,8 +683,6 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
 	char *file;
 	int err;
 
-	dentry_unhash(dentry);
-
 	if ((file = dentry_name(dentry)) == NULL)
 		return -ENOMEM;
 	err = do_rmdir(file);
@@ -738,9 +736,6 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
 	char *from_name, *to_name;
 	int err;
 
-	if (to->d_inode && S_ISDIR(to->d_inode->i_mode))
-		dentry_unhash(to);
-
 	if ((from_name = dentry_name(from)) == NULL)
 		return -ENOMEM;
 	if ((to_name = dentry_name(to)) == NULL) {
-- 
cgit v1.2.3-70-g09d2


From 55e5b7e022eaaa805a44e3b6ecd5c8638d862050 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:08 -0700
Subject: ecryptfs: remove unnecessary dentry_unhash on rmdir, dir rename

ecryptfs does not have problems with references to unlinked directories.

CC: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
CC: Dustin Kirkland <kirkland@canonical.com>
CC: ecryptfs-devel@lists.launchpad.net
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ecryptfs/inode.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 227b409b840..4d4cc6a90cd 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -521,8 +521,6 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct dentry *lower_dir_dentry;
 	int rc;
 
-	dentry_unhash(dentry);
-
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	dget(dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
@@ -573,9 +571,6 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *lower_new_dir_dentry;
 	struct dentry *trap = NULL;
 
-	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
 	lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
 	dget(lower_old_dentry);
-- 
cgit v1.2.3-70-g09d2


From 7ce605d93b775e8a960b0be244f7be565e73b3c1 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:09 -0700
Subject: ncpfs: document dentry_unhash usage

ncpfs returns EBUSY if there are any references to the directory.  The
dentry_unhash call only unhashes the dentry if there are no references.

CC: Petr Vandrovec <petr@vandrovec.name>
CC: linux-kernel@vger.kernel.org
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ncpfs/dir.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index e3e646b0640..81c287d105d 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1033,8 +1033,11 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 	DPRINTK("ncp_rmdir: removing %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
+	/*
+	 * fail with EBUSY if there are still references to this
+	 * directory.
+	 */
 	dentry_unhash(dentry);
-
 	error = -EBUSY;
 	if (!d_unhashed(dentry))
 		goto out;
-- 
cgit v1.2.3-70-g09d2


From 76cc071a06afb4d2dd17bb1b855a233a419e7e02 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:10 -0700
Subject: ncpfs: fix rename over directory with dangling references

ncpfs does not handle references to unlinked directories (or so it would
seem given the ncp_rmdir check).  Since it is also possible to rename over
an empty directory, perform the same check here.

CC: Petr Vandrovec <petr@vandrovec.name>
CC: linux-kernel@vger.kernel.org
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ncpfs/dir.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 81c287d105d..9c51f621e90 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1144,8 +1144,16 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 		old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
 		new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
 
-	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) {
+		/*
+		 * fail with EBUSY if there are still references to this
+		 * directory.
+		 */
 		dentry_unhash(new_dentry);
+		error = -EBUSY;
+		if (!d_unhashed(new_dentry))
+			goto out;
+	}
 
 	ncp_age_dentry(server, old_dentry);
 	ncp_age_dentry(server, new_dentry);
-- 
cgit v1.2.3-70-g09d2


From 86905d6d96f138a99326016e4f0ca933200e0729 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:11 -0700
Subject: 9p: remove unnecessary dentry_unhash on rmdir, dir rename

9p has no problems with references to unlinked directories.

CC: Eric Van Hensbergen <ericvh@gmail.com>
CC: Ron Minnich <rminnich@sandia.gov>
CC: Latchesar Ionkov <lucho@ionkov.net>
CC: v9fs-developer@lists.sourceforge.net
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/vfs_inode.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8d7f3e69ae2..7f6c6770319 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -814,7 +814,6 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 
 int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 {
-	dentry_unhash(d);
 	return v9fs_remove(i, d, 1);
 }
 
@@ -840,9 +839,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct p9_fid *newdirfid;
 	struct p9_wstat wstat;
 
-	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
 	retval = 0;
 	old_inode = old_dentry->d_inode;
-- 
cgit v1.2.3-70-g09d2


From 94f9901b6c05d5e1e38d887179b8307efbc8e10c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:12 -0700
Subject: affs: remove unnecessary dentry_unhash on rmdir, dir rename

affs has no problems with references to unlinked directories.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/affs/namei.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 03330e2e390..e3e9efc1fdd 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -320,8 +320,6 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
 		 dentry->d_inode->i_ino,
 		 (int)dentry->d_name.len, dentry->d_name.name);
 
-	dentry_unhash(dentry);
-
 	return affs_remove_header(dentry);
 }
 
@@ -419,9 +417,6 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh = NULL;
 	int retval;
 
-	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
 		 (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
 		 (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
-- 
cgit v1.2.3-70-g09d2


From 705da2de95ce79199a52f21d82dd9f36b89afd12 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:13 -0700
Subject: afs: remove unnecessary dentry_unhash on rmdir, dir rename

afs has no problems with references to unlinked directories.

CC: David Howells <dhowells@redhat.com>
CC: linux-afs@lists.infradead.org
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/afs/dir.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 2c4e0516004..20c106f2492 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,8 +845,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 	_enter("{%x:%u},{%s}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
 
-	dentry_unhash(dentry);
-
 	ret = -ENAMETOOLONG;
 	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
@@ -1148,9 +1146,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct key *key;
 	int ret;
 
-	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	vnode = AFS_FS_I(old_dentry->d_inode);
 	orig_dvnode = AFS_FS_I(old_dir);
 	new_dvnode = AFS_FS_I(new_dir);
-- 
cgit v1.2.3-70-g09d2


From 42b850b2806f8f55f5c3a0e3c78af738d5054fdc Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:14 -0700
Subject: coda: remove unnecessary dentry_unhash on rmdir, dir rename

Coda has no problems with references to unlinked directories.

CC: Jan Harkes <jaharkes@cs.cmu.edu>
CC: coda@cs.cmu.edu
CC: codalist@coda.cs.cmu.edu
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coda/dir.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index a46126fd573..2b8dae4d121 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -336,8 +336,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
 	int len = de->d_name.len;
 	int error;
 
-	dentry_unhash(de);
-
 	error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
 	if (!error) {
 		/* VFS may delete the child */
@@ -361,9 +359,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int new_length = new_dentry->d_name.len;
 	int error;
 
-	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
 			     coda_i2f(new_dir), old_length, new_length,
 			     (const char *) old_name, (const char *)new_name);
-- 
cgit v1.2.3-70-g09d2


From 526e7ce5528c8addcf154acec41b615f71dce7e1 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:15 -0700
Subject: fuse: remove unnecessary dentry_unhash on rmdir, dir rename

Fuse has no problems with references to unlinked directories.

CC: Miklos Szeredi <miklos@szeredi.hu>
CC: fuse-devel@lists.sourceforge.net
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fuse/dir.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0d0e3faddcf..d5016071459 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -667,8 +667,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	dentry_unhash(entry);
-
 	req->in.h.opcode = FUSE_RMDIR;
 	req->in.h.nodeid = get_node_id(dir);
 	req->in.numargs = 1;
@@ -694,9 +692,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 	struct fuse_conn *fc = get_fuse_conn(olddir);
 	struct fuse_req *req = fuse_get_req(fc);
 
-	if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode))
-		dentry_unhash(newent);
-
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-- 
cgit v1.2.3-70-g09d2


From b80d2c228d8206cadc32226750dbbf3b707bdd19 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:16 -0700
Subject: minix: remove unnecessary dentry_unhash on rmdir, dir rename

Minix has no issues with references to unlinked directories.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/minix/namei.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f60aed8db9c..6e6777f1b4b 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -168,8 +168,6 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
 	struct inode * inode = dentry->d_inode;
 	int err = -ENOTEMPTY;
 
-	dentry_unhash(dentry);
-
 	if (minix_empty_dir(inode)) {
 		err = minix_unlink(dir, dentry);
 		if (!err) {
@@ -192,9 +190,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
 	struct minix_dir_entry * old_de;
 	int err = -ENOENT;
 
-	if (new_inode && S_ISDIR(new_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	old_de = minix_find_entry(old_dentry, &old_page);
 	if (!old_de)
 		goto out;
-- 
cgit v1.2.3-70-g09d2


From 45adfef7d023004ff95bf63b5f2f0e2d88afac3f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:17 -0700
Subject: hpfs: remove unnecessary dentry_unhash on rmdir, dir rename

Hpfs has no problems with references to unlinked directories.

We leave one dentry_unhash call in place, in hpfs_unlink's strange path
where it tries to truncate a file because the disk is full.  I'm not sure
what the full story is there.

CC: Mikulas Patocka <mikulas@artax.karlin.mff.cuni.cz>
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hpfs/namei.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index ff0ce21c086..acf95dab2aa 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -439,8 +439,6 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err;
 	int r;
 
-	dentry_unhash(dentry);
-
 	hpfs_adjust_length(name, &len);
 	hpfs_lock(dir->i_sb);
 	err = -ENOENT;
@@ -535,9 +533,6 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct fnode *fnode;
 	int err;
 
-	if (new_inode && S_ISDIR(new_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	if ((err = hpfs_chk_name(new_name, &new_len))) return err;
 	err = 0;
 	hpfs_adjust_length(old_name, &old_len);
-- 
cgit v1.2.3-70-g09d2


From f4ff0e25c5093dd89e9cac4a8f71a57587ada787 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:18 -0700
Subject: fat: remove unnecessary dentry_unhash on rmdir, dir rename

fat does not have problems with references to unlinked directories.

CC: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fat/namei_msdos.c | 5 -----
 fs/fat/namei_vfat.c  | 5 -----
 2 files changed, 10 deletions(-)

diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index be15437c272..3b222dafd15 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -326,8 +326,6 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
 	struct fat_slot_info sinfo;
 	int err;
 
-	dentry_unhash(dentry);
-
 	lock_super(sb);
 	/*
 	 * Check whether the directory is not in use, then check
@@ -459,9 +457,6 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 	old_inode = old_dentry->d_inode;
 	new_inode = new_dentry->d_inode;
 
-	if (new_inode && S_ISDIR(new_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	err = fat_scan(old_dir, old_name, &old_sinfo);
 	if (err) {
 		err = -EIO;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c61a6789f36..20b4ea53fdc 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -824,8 +824,6 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
 	struct fat_slot_info sinfo;
 	int err;
 
-	dentry_unhash(dentry);
-
 	lock_super(sb);
 
 	err = fat_dir_empty(inode);
@@ -933,9 +931,6 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int err, is_dir, update_dotdot, corrupt = 0;
 	struct super_block *sb = old_dir->i_sb;
 
-	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-		dentry_unhash(new_dentry);
-
 	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
 	old_inode = old_dentry->d_inode;
 	new_inode = new_dentry->d_inode;
-- 
cgit v1.2.3-70-g09d2


From 98702467f829177b3993f17da9fe5c202d160e5e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 27 May 2011 13:42:19 -0700
Subject: configfs: remove unnecessary dentry_unhash on rmdir, dir rename

configfs does not have problems with references to unlinked directories.

CC: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/configfs/dir.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9d17d350abc..9a37a9b6de3 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1359,8 +1359,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct module *subsys_owner = NULL, *dead_item_owner = NULL;
 	int ret;
 
-	dentry_unhash(dentry);
-
 	if (dentry->d_parent == configfs_sb->s_root)
 		return -EPERM;
 
-- 
cgit v1.2.3-70-g09d2


From 3d08bcc887a1c8d12be8d81f747ffa2e8a44b67b Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@us.ibm.com>
Date: Fri, 27 May 2011 12:23:34 -0700
Subject: mm: Wait for writeback when grabbing pages to begin a write

When grabbing a page for a buffered IO write, the mm should wait for writeback
on the page to complete so that the page does not become writable during the IO
operation.  This change is needed to provide page stability during writes for
all filesystems.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/filemap.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index bcdc393b658..dac95a24dea 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2327,7 +2327,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
 repeat:
 	page = find_lock_page(mapping, index);
 	if (page)
-		return page;
+		goto found;
 
 	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
 	if (!page)
@@ -2340,6 +2340,8 @@ repeat:
 			goto repeat;
 		return NULL;
 	}
+found:
+	wait_on_page_writeback(page);
 	return page;
 }
 EXPORT_SYMBOL(grab_cache_page_write_begin);
-- 
cgit v1.2.3-70-g09d2


From d76ee18a8551e33ad7dbd55cac38bc7b094f3abb Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@us.ibm.com>
Date: Fri, 27 May 2011 12:23:41 -0700
Subject: fs: block_page_mkwrite should wait for writeback to finish

For filesystems such as nilfs2 and xfs that use block_page_mkwrite, modify that
function to wait for pending writeback before allowing the page to become
writable.  This is needed to stabilize pages during writeback for those two
filesystems.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/buffer.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/buffer.c b/fs/buffer.c
index 698c6b2cc46..49c9aada037 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2382,6 +2382,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 		ret = -EAGAIN;
 		goto out_unlock;
 	}
+	wait_on_page_writeback(page);
 	return 0;
 out_unlock:
 	unlock_page(page);
-- 
cgit v1.2.3-70-g09d2


From 1ed7891f8dcdba62041dfa265aea038afb49635c Mon Sep 17 00:00:00 2001
From: Mattias Wallin <mattias.wallin@stericsson.com>
Date: Fri, 27 May 2011 11:49:43 +0200
Subject: mfd: Use mfd cell platform_data for db8500-prcmu cells platform bits

With the addition of a device platform mfd_cell pointer, MFD drivers
can go back to passing platform data back to their sub drivers.
This allows for an mfd_cell->mfd_data removal and thus keep the
sub drivers MFD agnostic. This is mostly needed for non MFD aware
sub drivers.

Signed-off-by: Mattias Wallin <mattias.wallin@stericsson.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/db8500-prcmu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c
index e63782107e2..02a15d7cb3b 100644
--- a/drivers/mfd/db8500-prcmu.c
+++ b/drivers/mfd/db8500-prcmu.c
@@ -2005,7 +2005,8 @@ static struct regulator_init_data db8500_regulators[DB8500_NUM_REGULATORS] = {
 static struct mfd_cell db8500_prcmu_devs[] = {
 	{
 		.name = "db8500-prcmu-regulators",
-		.mfd_data = &db8500_regulators,
+		.platform_data = &db8500_regulators,
+		.pdata_size = sizeof(db8500_regulators),
 	},
 	{
 		.name = "cpufreq-u8500",
-- 
cgit v1.2.3-70-g09d2


From 66c500ff36d74c6669ad260deda4ce8125b55304 Mon Sep 17 00:00:00 2001
From: Liam Girdwood <lrg@kernel.org>
Date: Fri, 27 May 2011 22:13:08 +0100
Subject: mfd: Fix build breakage caused by tps65910 gpio directory move

Signed-off-by: Liam Girdwood <lrg@ti.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index b6c267724e1..0f09c057e79 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -721,7 +721,7 @@ config MFD_PM8XXX_IRQ
 
 config MFD_TPS65910
 	bool "TPS65910 Power Management chip"
-	depends on I2C=y
+	depends on I2C=y && GPIOLIB
 	select MFD_CORE
 	select GPIO_TPS65910
 	help
-- 
cgit v1.2.3-70-g09d2


From 6df87e65dbe4528ef07b917af89913abb8caaaba Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@kernel.org>
Date: Sat, 28 May 2011 14:44:46 +0200
Subject: block: improve the bio_add_page() and bio_add_pc_page() descriptions

The descriptions of bio_add_page() and bio_add_pc_page() are slightly
inconsistent; improve them.

Signed-off-by: Andreas Gruenbacher <agruen@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/bio.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/bio.c b/fs/bio.c
index 840a0d75524..9bfade8a609 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -638,10 +638,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
  *	@offset: vec entry offset
  *
  *	Attempt to add a page to the bio_vec maplist. This can fail for a
- *	number of reasons, such as the bio being full or target block
- *	device limitations. The target block device must allow bio's
- *      smaller than PAGE_SIZE, so it is always possible to add a single
- *      page to an empty bio. This should only be used by REQ_PC bios.
+ *	number of reasons, such as the bio being full or target block device
+ *	limitations. The target block device must allow bio's up to PAGE_SIZE,
+ *	so it is always possible to add a single page to an empty bio.
+ *
+ *	This should only be used by REQ_PC bios.
  */
 int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
 		    unsigned int len, unsigned int offset)
@@ -659,10 +660,9 @@ EXPORT_SYMBOL(bio_add_pc_page);
  *	@offset: vec entry offset
  *
  *	Attempt to add a page to the bio_vec maplist. This can fail for a
- *	number of reasons, such as the bio being full or target block
- *	device limitations. The target block device must allow bio's
- *      smaller than PAGE_SIZE, so it is always possible to add a single
- *      page to an empty bio.
+ *	number of reasons, such as the bio being full or target block device
+ *	limitations. The target block device must allow bio's up to PAGE_SIZE,
+ *	so it is always possible to add a single page to an empty bio.
  */
 int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 		 unsigned int offset)
-- 
cgit v1.2.3-70-g09d2


From 35fbf5bcf497d6ddbe7b6478141e7526d1474ff5 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Sat, 28 May 2011 14:44:46 +0200
Subject: nbd: pass MSG_* flags to kernel_recvmsg()

Unlike kernel_sendmsg(), kernel_recvmsg() requires passing flags explicitly
via last parameter instead of struct msghdr.msg_flags. Therefore calls to
sock_xmit(lo, 0, ..., MSG_WAITALL) have not been processed properly by tcp
layer wrt. the flag. Fix it.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Cc: Paul Clements <Paul.Clements@steeleye.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/nbd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index e6fc716aca4..1df3bfe5225 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -192,7 +192,8 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 			if (lo->xmit_timeout)
 				del_timer_sync(&ti);
 		} else
-			result = kernel_recvmsg(sock, &msg, &iov, 1, size, 0);
+			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
+						msg.msg_flags);
 
 		if (signal_pending(current)) {
 			siginfo_t info;
-- 
cgit v1.2.3-70-g09d2


From 3b2710824e00d238554c13b5add347e6c701ab1a Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Sat, 28 May 2011 14:44:46 +0200
Subject: nbd: limit module parameters to a sane value

The 'max_part' parameter controls the number of maximum partition
a nbd device can have. However if a user specifies very large
value it would exceed the limitation of device minor number and
can cause a kernel oops (or, at least, produce invalid device
nodes in some cases).

In addition, specifying large 'nbds_max' value causes same
problem for the same reason.

On my desktop, following command results to the kernel bug:

$ sudo modprobe nbd max_part=100000
 kernel BUG at /media/Linux_Data/project/linux/fs/sysfs/group.c:65!
 invalid opcode: 0000 [#1] SMP
 last sysfs file: /sys/devices/virtual/block/nbd4/range
 CPU 1
 Modules linked in: nbd(+) bridge stp llc kvm_intel kvm asus_atk0110 sg sr_mod cdrom

 Pid: 2522, comm: modprobe Tainted: G        W   2.6.39-leonard+ #159 System manufacturer System Product Name/P5G41TD-M PRO
 RIP: 0010:[<ffffffff8115aa08>]  [<ffffffff8115aa08>] internal_create_group+0x2f/0x166
 RSP: 0018:ffff8801009f1de8  EFLAGS: 00010246
 RAX: 00000000ffffffef RBX: ffff880103920478 RCX: 00000000000a7bd3
 RDX: ffffffff81a2dbe0 RSI: 0000000000000000 RDI: ffff880103920478
 RBP: ffff8801009f1e38 R08: ffff880103920468 R09: ffff880103920478
 R10: ffff8801009f1de8 R11: ffff88011eccbb68 R12: ffffffff81a2dbe0
 R13: ffff880103920468 R14: 0000000000000000 R15: ffff880103920400
 FS:  00007f3c49de9700(0000) GS:ffff88011f800000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
 CR2: 00007f3b7fe7c000 CR3: 00000000cd58d000 CR4: 00000000000406e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
 Process modprobe (pid: 2522, threadinfo ffff8801009f0000, task ffff8801009a93a0)
 Stack:
  ffff8801009f1e58 ffffffff812e8f6e ffff8801009f1e58 ffffffff812e7a80
  ffff880000000010 ffff880103920400 ffff8801002fd0c0 ffff880103920468
  0000000000000011 ffff880103920400 ffff8801009f1e48 ffffffff8115ab6a
 Call Trace:
  [<ffffffff812e8f6e>] ? device_add+0x4f1/0x5e4
  [<ffffffff812e7a80>] ? dev_set_name+0x41/0x43
  [<ffffffff8115ab6a>] sysfs_create_group+0x13/0x15
  [<ffffffff810b857e>] blk_trace_init_sysfs+0x14/0x16
  [<ffffffff811ee58b>] blk_register_queue+0x4c/0xfd
  [<ffffffff811f3bdf>] add_disk+0xe4/0x29c
  [<ffffffffa007e2ab>] nbd_init+0x2ab/0x30d [nbd]
  [<ffffffffa007e000>] ? 0xffffffffa007dfff
  [<ffffffff8100020f>] do_one_initcall+0x7f/0x13e
  [<ffffffff8107ab0a>] sys_init_module+0xa1/0x1e3
  [<ffffffff814f3542>] system_call_fastpath+0x16/0x1b
 Code: 41 57 41 56 41 55 41 54 53 48 83 ec 28 0f 1f 44 00 00 48 89 fb 41 89 f6 49 89 d4 48 85 ff 74 0b 85 f6 75 0b 48 83
  7f 30 00 75 14 <0f> 0b eb fe b9 ea ff ff ff 48 83 7f 30 00 0f 84 09 01 00 00 49
 RIP  [<ffffffff8115aa08>] internal_create_group+0x2f/0x166
  RSP <ffff8801009f1de8>
 ---[ end trace 753285ffbf72c57c ]---

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Cc: Laurent Vivier <Laurent.Vivier@bull.net>
Cc: Paul Clements <Paul.Clements@steeleye.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/nbd.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 1df3bfe5225..fdee7567fd1 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -757,6 +757,12 @@ static int __init nbd_init(void)
 	if (max_part > 0)
 		part_shift = fls(max_part);
 
+	if ((1UL << part_shift) > DISK_MAX_PARTS)
+		return -EINVAL;
+
+	if (nbds_max > 1UL << (MINORBITS - part_shift))
+		return -EINVAL;
+
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = alloc_disk(1 << part_shift);
 		if (!disk)
-- 
cgit v1.2.3-70-g09d2


From 5988ce239682854d4e632fb58bff000700830394 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Sat, 28 May 2011 14:44:46 +0200
Subject: nbd: adjust 'max_part' according to part_shift

The 'max_part' parameter determines how many partitions are supported
on each nbd device. However the actual number can be changed to the
power of 2 minus 1 form during the module initialization as
alloc_disk() is called with (1 << part_shift) for some reason.

So adjust 'max_part' also at least for consistency with loop and brd.
It is exported via sysfs already, and a user should check this value
after module loading if [s]he wants to use that number correctly
(i.e. fdisk or something).

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Cc: Laurent Vivier <Laurent.Vivier@bull.net>
Cc: Paul Clements <Paul.Clements@steeleye.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/nbd.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index fdee7567fd1..f533f3375e2 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -754,9 +754,20 @@ static int __init nbd_init(void)
 		return -ENOMEM;
 
 	part_shift = 0;
-	if (max_part > 0)
+	if (max_part > 0) {
 		part_shift = fls(max_part);
 
+		/*
+		 * Adjust max_part according to part_shift as it is exported
+		 * to user space so that user can know the max number of
+		 * partition kernel should be able to manage.
+		 *
+		 * Note that -1 is required because partition 0 is reserved
+		 * for the whole disk.
+		 */
+		max_part = (1UL << part_shift) - 1;
+	}
+
 	if ((1UL << part_shift) > DISK_MAX_PARTS)
 		return -EINVAL;
 
-- 
cgit v1.2.3-70-g09d2


From cd4ae6adf8b1c21d88e83ed56afeeef97b28f356 Mon Sep 17 00:00:00 2001
From: Xiaotian Feng <dfeng@redhat.com>
Date: Fri, 22 Apr 2011 18:53:54 +0800
Subject: sched: More sched_domain iterations fixes

sched_domain iterations needs to be protected by rcu_read_lock() now,
this patch adds another two places which needs the rcu lock, which is
spotted by following suspicious rcu_dereference_check() usage warnings.

kernel/sched_rt.c:1244 invoked rcu_dereference_check() without protection!
kernel/sched_stats.h:41 invoked rcu_dereference_check() without protection!

Signed-off-by: Xiaotian Feng <dfeng@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1303469634-11678-1-git-send-email-dfeng@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c    | 10 ++++++++--
 kernel/sched_stats.h |  4 ++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 64b2a37c07d..88725c939e0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1263,6 +1263,7 @@ static int find_lowest_rq(struct task_struct *task)
 	if (!cpumask_test_cpu(this_cpu, lowest_mask))
 		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
 
+	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		if (sd->flags & SD_WAKE_AFFINE) {
 			int best_cpu;
@@ -1272,15 +1273,20 @@ static int find_lowest_rq(struct task_struct *task)
 			 * remote processor.
 			 */
 			if (this_cpu != -1 &&
-			    cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
+			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+				rcu_read_unlock();
 				return this_cpu;
+			}
 
 			best_cpu = cpumask_first_and(lowest_mask,
 						     sched_domain_span(sd));
-			if (best_cpu < nr_cpu_ids)
+			if (best_cpu < nr_cpu_ids) {
+				rcu_read_unlock();
 				return best_cpu;
+			}
 		}
 	}
+	rcu_read_unlock();
 
 	/*
 	 * And finally, if there were no matches within the domains
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 48ddf431db0..331e01bcd02 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 
 #ifdef CONFIG_SMP
 		/* domain-specific stats */
-		preempt_disable();
+		rcu_read_lock();
 		for_each_domain(cpu, sd) {
 			enum cpu_idle_type itype;
 
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
 			    sd->ttwu_move_balance);
 		}
-		preempt_enable();
+		rcu_read_unlock();
 #endif
 	}
 	kfree(mask_str);
-- 
cgit v1.2.3-70-g09d2


From d6aa8f85f16379d42c147b22b59e33b67f9ff466 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 26 May 2011 14:21:33 +0200
Subject: sched: Fix ttwu() for __ARCH_WANT_INTERRUPTS_ON_CTXSW

Marc reported that e4a52bcb9 (sched: Remove rq->lock from the first
half of ttwu()) broke his ARM-SMP machine. Now ARM is one of the few
__ARCH_WANT_INTERRUPTS_ON_CTXSW users, so that exception in the ttwu()
code was suspect.

Yong found that the interrupt could hit after context_switch() changes
current but before it clears p->on_cpu, if that interrupt were to
attempt a wake-up of p we would indeed find ourselves spinning in IRQ
context.

Fix this by reverting to the old behaviour for this situation and
perform a full remote wake-up.

Cc: Frank Rowand <frank.rowand@am.sony.com>
Cc: Yong Zhang <yong.zhang0@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Reported-by: Marc Zyngier <Marc.Zyngier@arm.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 5e43e9dc65d..a80ee911900 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2573,7 +2573,26 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
 	if (!next)
 		smp_send_reschedule(cpu);
 }
-#endif
+
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
+{
+	struct rq *rq;
+	int ret = 0;
+
+	rq = __task_rq_lock(p);
+	if (p->on_cpu) {
+		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+		ttwu_do_wakeup(rq, p, wake_flags);
+		ret = 1;
+	}
+	__task_rq_unlock(rq);
+
+	return ret;
+
+}
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+#endif /* CONFIG_SMP */
 
 static void ttwu_queue(struct task_struct *p, int cpu)
 {
@@ -2631,17 +2650,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	while (p->on_cpu) {
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 		/*
-		 * If called from interrupt context we could have landed in the
-		 * middle of schedule(), in this case we should take care not
-		 * to spin on ->on_cpu if p is current, since that would
-		 * deadlock.
+		 * In case the architecture enables interrupts in
+		 * context_switch(), we cannot busy wait, since that
+		 * would lead to deadlocks when an interrupt hits and
+		 * tries to wake up @prev. So bail and do a complete
+		 * remote wakeup.
 		 */
-		if (p == current) {
-			ttwu_queue(p, cpu);
+		if (ttwu_activate_remote(p, wake_flags))
 			goto stat;
-		}
-#endif
+#else
 		cpu_relax();
+#endif
 	}
 	/*
 	 * Pairs with the smp_wmb() in finish_lock_switch().
-- 
cgit v1.2.3-70-g09d2


From 1e876231785d82443a5ac8b6c660e9f51bc5dede Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 17 May 2011 16:21:10 -0700
Subject: sched: Fix ->min_vruntime calculation in dequeue_entity()

Dima Zavin <dima@android.com> reported:

"After pulling the thread off the run-queue during a cgroup change,
the cfs_rq.min_vruntime gets recalculated. The dequeued thread's vruntime
then gets normalized to this new value. This can then lead to the thread
getting an unfair boost in the new group if the vruntime of the next
task in the old run-queue was way further ahead."

Reported-by: Dima Zavin <dima@android.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Recalls-having-tested-once-upon-a-time-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1305674470-23727-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e32a9b70ee9..433491c2dc8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1076,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	se->on_rq = 0;
 	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
-	update_min_vruntime(cfs_rq);
-	update_cfs_shares(cfs_rq);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1086,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	if (!(flags & DEQUEUE_SLEEP))
 		se->vruntime -= cfs_rq->min_vruntime;
+
+	update_min_vruntime(cfs_rq);
+	update_cfs_shares(cfs_rq);
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 1e1b6c511d1b23cb7c3b619d82fc7bd9f620565d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Thu, 19 May 2011 15:08:58 +0900
Subject: cpuset: Fix cpuset_cpus_allowed_fallback(), don't update
 tsk->rt.nr_cpus_allowed

The rule is, we have to update tsk->rt.nr_cpus_allowed if we change
tsk->cpus_allowed. Otherwise RT scheduler may confuse.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/4DD4B3FA.5060901@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/cpuset.h |  2 +-
 include/linux/sched.h  |  7 +++++++
 kernel/cpuset.c        |  4 ++--
 kernel/kthread.c       |  4 ++--
 kernel/sched.c         | 19 ++++++++++++-------
 5 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index f20eb8f1602..e9eaec52265 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -146,7 +146,7 @@ static inline void cpuset_cpus_allowed(struct task_struct *p,
 
 static inline int cpuset_cpus_allowed_fallback(struct task_struct *p)
 {
-	cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
+	do_set_cpus_allowed(p, cpu_possible_mask);
 	return cpumask_any(cpu_active_mask);
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dc8871295a5..8da84b7bc1b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1841,9 +1841,16 @@ static inline void rcu_copy_process(struct task_struct *p)
 #endif
 
 #ifdef CONFIG_SMP
+extern void do_set_cpus_allowed(struct task_struct *p,
+			       const struct cpumask *new_mask);
+
 extern int set_cpus_allowed_ptr(struct task_struct *p,
 				const struct cpumask *new_mask);
 #else
+static inline void do_set_cpus_allowed(struct task_struct *p,
+				      const struct cpumask *new_mask)
+{
+}
 static inline int set_cpus_allowed_ptr(struct task_struct *p,
 				       const struct cpumask *new_mask)
 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1ceeb049c82..9c9b7545c81 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2190,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	rcu_read_lock();
 	cs = task_cs(tsk);
 	if (cs)
-		cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+		do_set_cpus_allowed(tsk, cs->cpus_allowed);
 	rcu_read_unlock();
 
 	/*
@@ -2217,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 		 * Like above we can temporary set any mask and rely on
 		 * set_cpus_allowed_ptr() as synchronization point.
 		 */
-		cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+		do_set_cpus_allowed(tsk, cpu_possible_mask);
 		cpu = cpumask_any(cpu_active_mask);
 	}
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3b34d2732bc..4ba7cccb499 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -202,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
 		return;
 	}
 
-	p->cpus_allowed = cpumask_of_cpu(cpu);
-	p->rt.nr_cpus_allowed = 1;
+	/* It's safe because the task is inactive. */
+	do_set_cpus_allowed(p, cpumask_of(cpu));
 	p->flags |= PF_THREAD_BOUND;
 }
 EXPORT_SYMBOL(kthread_bind);
diff --git a/kernel/sched.c b/kernel/sched.c
index a80ee911900..cbb3a0eee58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5860,7 +5860,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
 
-	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+	do_set_cpus_allowed(idle, cpumask_of(cpu));
 	/*
 	 * We're having a chicken and egg problem, even though we are
 	 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5948,6 +5948,16 @@ static inline void sched_init_granularity(void)
 }
 
 #ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	if (p->sched_class && p->sched_class->set_cpus_allowed)
+		p->sched_class->set_cpus_allowed(p, new_mask);
+	else {
+		cpumask_copy(&p->cpus_allowed, new_mask);
+		p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+	}
+}
+
 /*
  * This is how migration works:
  *
@@ -5993,12 +6003,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 		goto out;
 	}
 
-	if (p->sched_class->set_cpus_allowed)
-		p->sched_class->set_cpus_allowed(p, new_mask);
-	else {
-		cpumask_copy(&p->cpus_allowed, new_mask);
-		p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
-	}
+	do_set_cpus_allowed(p, new_mask);
 
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
-- 
cgit v1.2.3-70-g09d2


From f62508f68d04adefc4cf9b0177ba02c8818b3eec Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@gmail.com>
Date: Thu, 19 May 2011 12:53:53 +0200
Subject: Documentation: Add statistics about nested locks

Explain what the trailing "/1" on some lock class names of
lock_stat output means.

Reviewed-by: Yong Zhang <yong.zhang0@gmail.com>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/4DD4F6C1.5090701@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/lockstat.txt | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt
index 9c0a80d17a2..cef00d42ed5 100644
--- a/Documentation/lockstat.txt
+++ b/Documentation/lockstat.txt
@@ -12,8 +12,9 @@ Because things like lock contention can severely impact performance.
 - HOW
 
 Lockdep already has hooks in the lock functions and maps lock instances to
-lock classes. We build on that. The graph below shows the relation between
-the lock functions and the various hooks therein.
+lock classes. We build on that (see Documentation/lockdep-design.txt).
+The graph below shows the relation between the lock functions and the various
+hooks therein.
 
         __acquire
             |
@@ -128,6 +129,37 @@ points are the points we're contending with.
 
 The integer part of the time values is in us.
 
+Dealing with nested locks, subclasses may appear:
+
+32...............................................................................................................................................................................................
+33
+34                               &rq->lock:         13128          13128           0.43         190.53      103881.26          97454        3453404           0.00         401.11    13224683.11
+35                               ---------
+36                               &rq->lock            645          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
+37                               &rq->lock            297          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+38                               &rq->lock            360          [<ffffffff8103c4c5>] select_task_rq_fair+0x1f0/0x74a
+39                               &rq->lock            428          [<ffffffff81045f98>] scheduler_tick+0x46/0x1fb
+40                               ---------
+41                               &rq->lock             77          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
+42                               &rq->lock            174          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+43                               &rq->lock           4715          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
+44                               &rq->lock            893          [<ffffffff81340524>] schedule+0x157/0x7b8
+45
+46...............................................................................................................................................................................................
+47
+48                             &rq->lock/1:         11526          11488           0.33         388.73      136294.31          21461          38404           0.00          37.93      109388.53
+49                             -----------
+50                             &rq->lock/1          11526          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
+51                             -----------
+52                             &rq->lock/1           5645          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
+53                             &rq->lock/1           1224          [<ffffffff81340524>] schedule+0x157/0x7b8
+54                             &rq->lock/1           4336          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
+55                             &rq->lock/1            181          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+
+Line 48 shows statistics for the second subclass (/1) of &rq->lock class
+(subclass starts from 0), since in this case, as line 50 suggests,
+double_rq_lock actually acquires a nested lock of two spinlocks.
+
 View the top contending locks:
 
 # grep : /proc/lock_stat | head
-- 
cgit v1.2.3-70-g09d2


From f506b3dc0ec454a16d40cab9ee5d75435b39dc50 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 26 May 2011 17:02:53 +0200
Subject: perf: Fix SIGIO handling

Vince noticed that unless we mmap() a buffer, SIGIO gets lost. So
explicitly push the wakeup (including signals) when requested.

Reported-by: Vince Weaver <vweaver1@eecs.utk.edu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org>
Link: http://lkml.kernel.org/n/tip-2euus3f3x3dyvdk52cjxw8zu@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/events/core.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index c09767f7db3..d863b3c057b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5028,6 +5028,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 	else
 		perf_event_output(event, nmi, data, regs);
 
+	if (event->fasync && event->pending_kill) {
+		if (nmi) {
+			event->pending_wakeup = 1;
+			irq_work_queue(&event->pending);
+		} else
+			perf_event_wakeup(event);
+	}
+
 	return ret;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 55c2945aa9d4d907ec5ca4f6a4e30ae908d8d30d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Wed, 11 May 2011 05:33:33 -0700
Subject: atomic: Add atomic_or()

An atomic_or() function is needed by TREE_RCU to avoid deadlock, so
add a generic version.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/atomic.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 96c038e43d6..ee456c79b0e 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -34,4 +34,17 @@ static inline int atomic_inc_not_zero_hint(atomic_t *v, int hint)
 }
 #endif
 
+#ifndef CONFIG_ARCH_HAS_ATOMIC_OR
+static inline void atomic_or(int i, atomic_t *v)
+{
+	int old;
+	int new;
+
+	do {
+		old = atomic_read(v);
+		new = old | i;
+	} while (atomic_cmpxchg(v, old, new) != old);
+}
+#endif /* #ifndef CONFIG_ARCH_HAS_ATOMIC_OR */
+
 #endif /* _LINUX_ATOMIC_H */
-- 
cgit v1.2.3-70-g09d2


From 8826f3b0397562eee6f8785d548be9dfdb169100 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Wed, 11 May 2011 05:41:41 -0700
Subject: rcu: Avoid acquiring rcu_node locks in timer functions

This commit switches manipulations of the rcu_node ->wakemask field
to atomic operations, which allows rcu_cpu_kthread_timer() to avoid
acquiring the rcu_node lock.  This should avoid the following lockdep
splat reported by Valdis Kletnieks:

[   12.872150] usb 1-4: new high speed USB device number 3 using ehci_hcd
[   12.986667] usb 1-4: New USB device found, idVendor=413c, idProduct=2513
[   12.986679] usb 1-4: New USB device strings: Mfr=0, Product=0, SerialNumber=0
[   12.987691] hub 1-4:1.0: USB hub found
[   12.987877] hub 1-4:1.0: 3 ports detected
[   12.996372] input: PS/2 Generic Mouse as /devices/platform/i8042/serio1/input/input10
[   13.071471] udevadm used greatest stack depth: 3984 bytes left
[   13.172129]
[   13.172130] =======================================================
[   13.172425] [ INFO: possible circular locking dependency detected ]
[   13.172650] 2.6.39-rc6-mmotm0506 #1
[   13.172773] -------------------------------------------------------
[   13.172997] blkid/267 is trying to acquire lock:
[   13.173009]  (&p->pi_lock){-.-.-.}, at: [<ffffffff81032d8f>] try_to_wake_up+0x29/0x1aa
[   13.173009]
[   13.173009] but task is already holding lock:
[   13.173009]  (rcu_node_level_0){..-...}, at: [<ffffffff810901cc>] rcu_cpu_kthread_timer+0x27/0x58
[   13.173009]
[   13.173009] which lock already depends on the new lock.
[   13.173009]
[   13.173009]
[   13.173009] the existing dependency chain (in reverse order) is:
[   13.173009]
[   13.173009] -> #2 (rcu_node_level_0){..-...}:
[   13.173009]        [<ffffffff810679b9>] check_prevs_add+0x8b/0x104
[   13.173009]        [<ffffffff81067da1>] validate_chain+0x36f/0x3ab
[   13.173009]        [<ffffffff8106846b>] __lock_acquire+0x369/0x3e2
[   13.173009]        [<ffffffff81068a0f>] lock_acquire+0xfc/0x14c
[   13.173009]        [<ffffffff815697f1>] _raw_spin_lock+0x36/0x45
[   13.173009]        [<ffffffff81090794>] rcu_read_unlock_special+0x8c/0x1d5
[   13.173009]        [<ffffffff8109092c>] __rcu_read_unlock+0x4f/0xd7
[   13.173009]        [<ffffffff81027bd3>] rcu_read_unlock+0x21/0x23
[   13.173009]        [<ffffffff8102cc34>] cpuacct_charge+0x6c/0x75
[   13.173009]        [<ffffffff81030cc6>] update_curr+0x101/0x12e
[   13.173009]        [<ffffffff810311d0>] check_preempt_wakeup+0xf7/0x23b
[   13.173009]        [<ffffffff8102acb3>] check_preempt_curr+0x2b/0x68
[   13.173009]        [<ffffffff81031d40>] ttwu_do_wakeup+0x76/0x128
[   13.173009]        [<ffffffff81031e49>] ttwu_do_activate.constprop.63+0x57/0x5c
[   13.173009]        [<ffffffff81031e96>] scheduler_ipi+0x48/0x5d
[   13.173009]        [<ffffffff810177d5>] smp_reschedule_interrupt+0x16/0x18
[   13.173009]        [<ffffffff815710f3>] reschedule_interrupt+0x13/0x20
[   13.173009]        [<ffffffff810b66d1>] rcu_read_unlock+0x21/0x23
[   13.173009]        [<ffffffff810b739c>] find_get_page+0xa9/0xb9
[   13.173009]        [<ffffffff810b8b48>] filemap_fault+0x6a/0x34d
[   13.173009]        [<ffffffff810d1a25>] __do_fault+0x54/0x3e6
[   13.173009]        [<ffffffff810d447a>] handle_pte_fault+0x12c/0x1ed
[   13.173009]        [<ffffffff810d48f7>] handle_mm_fault+0x1cd/0x1e0
[   13.173009]        [<ffffffff8156cfee>] do_page_fault+0x42d/0x5de
[   13.173009]        [<ffffffff8156a75f>] page_fault+0x1f/0x30
[   13.173009]
[   13.173009] -> #1 (&rq->lock){-.-.-.}:
[   13.173009]        [<ffffffff810679b9>] check_prevs_add+0x8b/0x104
[   13.173009]        [<ffffffff81067da1>] validate_chain+0x36f/0x3ab
[   13.173009]        [<ffffffff8106846b>] __lock_acquire+0x369/0x3e2
[   13.173009]        [<ffffffff81068a0f>] lock_acquire+0xfc/0x14c
[   13.173009]        [<ffffffff815697f1>] _raw_spin_lock+0x36/0x45
[   13.173009]        [<ffffffff81027e19>] __task_rq_lock+0x8b/0xd3
[   13.173009]        [<ffffffff81032f7f>] wake_up_new_task+0x41/0x108
[   13.173009]        [<ffffffff810376c3>] do_fork+0x265/0x33f
[   13.173009]        [<ffffffff81007d02>] kernel_thread+0x6b/0x6d
[   13.173009]        [<ffffffff8153a9dd>] rest_init+0x21/0xd2
[   13.173009]        [<ffffffff81b1db4f>] start_kernel+0x3bb/0x3c6
[   13.173009]        [<ffffffff81b1d29f>] x86_64_start_reservations+0xaf/0xb3
[   13.173009]        [<ffffffff81b1d393>] x86_64_start_kernel+0xf0/0xf7
[   13.173009]
[   13.173009] -> #0 (&p->pi_lock){-.-.-.}:
[   13.173009]        [<ffffffff81067788>] check_prev_add+0x68/0x20e
[   13.173009]        [<ffffffff810679b9>] check_prevs_add+0x8b/0x104
[   13.173009]        [<ffffffff81067da1>] validate_chain+0x36f/0x3ab
[   13.173009]        [<ffffffff8106846b>] __lock_acquire+0x369/0x3e2
[   13.173009]        [<ffffffff81068a0f>] lock_acquire+0xfc/0x14c
[   13.173009]        [<ffffffff815698ea>] _raw_spin_lock_irqsave+0x44/0x57
[   13.173009]        [<ffffffff81032d8f>] try_to_wake_up+0x29/0x1aa
[   13.173009]        [<ffffffff81032f3c>] wake_up_process+0x10/0x12
[   13.173009]        [<ffffffff810901e9>] rcu_cpu_kthread_timer+0x44/0x58
[   13.173009]        [<ffffffff81045286>] call_timer_fn+0xac/0x1e9
[   13.173009]        [<ffffffff8104556d>] run_timer_softirq+0x1aa/0x1f2
[   13.173009]        [<ffffffff8103e487>] __do_softirq+0x109/0x26a
[   13.173009]        [<ffffffff8157144c>] call_softirq+0x1c/0x30
[   13.173009]        [<ffffffff81003207>] do_softirq+0x44/0xf1
[   13.173009]        [<ffffffff8103e8b9>] irq_exit+0x58/0xc8
[   13.173009]        [<ffffffff81017f5a>] smp_apic_timer_interrupt+0x79/0x87
[   13.173009]        [<ffffffff81570fd3>] apic_timer_interrupt+0x13/0x20
[   13.173009]        [<ffffffff810bd51a>] get_page_from_freelist+0x2aa/0x310
[   13.173009]        [<ffffffff810bdf03>] __alloc_pages_nodemask+0x178/0x243
[   13.173009]        [<ffffffff8101fe2f>] pte_alloc_one+0x1e/0x3a
[   13.173009]        [<ffffffff810d27fe>] __pte_alloc+0x22/0x14b
[   13.173009]        [<ffffffff810d48a8>] handle_mm_fault+0x17e/0x1e0
[   13.173009]        [<ffffffff8156cfee>] do_page_fault+0x42d/0x5de
[   13.173009]        [<ffffffff8156a75f>] page_fault+0x1f/0x30
[   13.173009]
[   13.173009] other info that might help us debug this:
[   13.173009]
[   13.173009] Chain exists of:
[   13.173009]   &p->pi_lock --> &rq->lock --> rcu_node_level_0
[   13.173009]
[   13.173009]  Possible unsafe locking scenario:
[   13.173009]
[   13.173009]        CPU0                    CPU1
[   13.173009]        ----                    ----
[   13.173009]   lock(rcu_node_level_0);
[   13.173009]                                lock(&rq->lock);
[   13.173009]                                lock(rcu_node_level_0);
[   13.173009]   lock(&p->pi_lock);
[   13.173009]
[   13.173009]  *** DEADLOCK ***
[   13.173009]
[   13.173009] 3 locks held by blkid/267:
[   13.173009]  #0:  (&mm->mmap_sem){++++++}, at: [<ffffffff8156cdb4>] do_page_fault+0x1f3/0x5de
[   13.173009]  #1:  (&yield_timer){+.-...}, at: [<ffffffff810451da>] call_timer_fn+0x0/0x1e9
[   13.173009]  #2:  (rcu_node_level_0){..-...}, at: [<ffffffff810901cc>] rcu_cpu_kthread_timer+0x27/0x58
[   13.173009]
[   13.173009] stack backtrace:
[   13.173009] Pid: 267, comm: blkid Not tainted 2.6.39-rc6-mmotm0506 #1
[   13.173009] Call Trace:
[   13.173009]  <IRQ>  [<ffffffff8154a529>] print_circular_bug+0xc8/0xd9
[   13.173009]  [<ffffffff81067788>] check_prev_add+0x68/0x20e
[   13.173009]  [<ffffffff8100c861>] ? save_stack_trace+0x28/0x46
[   13.173009]  [<ffffffff810679b9>] check_prevs_add+0x8b/0x104
[   13.173009]  [<ffffffff81067da1>] validate_chain+0x36f/0x3ab
[   13.173009]  [<ffffffff8106846b>] __lock_acquire+0x369/0x3e2
[   13.173009]  [<ffffffff81032d8f>] ? try_to_wake_up+0x29/0x1aa
[   13.173009]  [<ffffffff81068a0f>] lock_acquire+0xfc/0x14c
[   13.173009]  [<ffffffff81032d8f>] ? try_to_wake_up+0x29/0x1aa
[   13.173009]  [<ffffffff810901a5>] ? rcu_check_quiescent_state+0x82/0x82
[   13.173009]  [<ffffffff815698ea>] _raw_spin_lock_irqsave+0x44/0x57
[   13.173009]  [<ffffffff81032d8f>] ? try_to_wake_up+0x29/0x1aa
[   13.173009]  [<ffffffff81032d8f>] try_to_wake_up+0x29/0x1aa
[   13.173009]  [<ffffffff810901a5>] ? rcu_check_quiescent_state+0x82/0x82
[   13.173009]  [<ffffffff81032f3c>] wake_up_process+0x10/0x12
[   13.173009]  [<ffffffff810901e9>] rcu_cpu_kthread_timer+0x44/0x58
[   13.173009]  [<ffffffff810901a5>] ? rcu_check_quiescent_state+0x82/0x82
[   13.173009]  [<ffffffff81045286>] call_timer_fn+0xac/0x1e9
[   13.173009]  [<ffffffff810451da>] ? del_timer+0x75/0x75
[   13.173009]  [<ffffffff810901a5>] ? rcu_check_quiescent_state+0x82/0x82
[   13.173009]  [<ffffffff8104556d>] run_timer_softirq+0x1aa/0x1f2
[   13.173009]  [<ffffffff8103e487>] __do_softirq+0x109/0x26a
[   13.173009]  [<ffffffff8106365f>] ? tick_dev_program_event+0x37/0xf6
[   13.173009]  [<ffffffff810a0e4a>] ? time_hardirqs_off+0x1b/0x2f
[   13.173009]  [<ffffffff8157144c>] call_softirq+0x1c/0x30
[   13.173009]  [<ffffffff81003207>] do_softirq+0x44/0xf1
[   13.173009]  [<ffffffff8103e8b9>] irq_exit+0x58/0xc8
[   13.173009]  [<ffffffff81017f5a>] smp_apic_timer_interrupt+0x79/0x87
[   13.173009]  [<ffffffff81570fd3>] apic_timer_interrupt+0x13/0x20
[   13.173009]  <EOI>  [<ffffffff810bd384>] ? get_page_from_freelist+0x114/0x310
[   13.173009]  [<ffffffff810bd51a>] ? get_page_from_freelist+0x2aa/0x310
[   13.173009]  [<ffffffff812220e7>] ? clear_page_c+0x7/0x10
[   13.173009]  [<ffffffff810bd1ef>] ? prep_new_page+0x14c/0x1cd
[   13.173009]  [<ffffffff810bd51a>] get_page_from_freelist+0x2aa/0x310
[   13.173009]  [<ffffffff810bdf03>] __alloc_pages_nodemask+0x178/0x243
[   13.173009]  [<ffffffff810d46b9>] ? __pmd_alloc+0x87/0x99
[   13.173009]  [<ffffffff8101fe2f>] pte_alloc_one+0x1e/0x3a
[   13.173009]  [<ffffffff810d46b9>] ? __pmd_alloc+0x87/0x99
[   13.173009]  [<ffffffff810d27fe>] __pte_alloc+0x22/0x14b
[   13.173009]  [<ffffffff810d48a8>] handle_mm_fault+0x17e/0x1e0
[   13.173009]  [<ffffffff8156cfee>] do_page_fault+0x42d/0x5de
[   13.173009]  [<ffffffff810d915f>] ? sys_brk+0x32/0x10c
[   13.173009]  [<ffffffff810a0e4a>] ? time_hardirqs_off+0x1b/0x2f
[   13.173009]  [<ffffffff81065c4f>] ? trace_hardirqs_off_caller+0x3f/0x9c
[   13.173009]  [<ffffffff812235dd>] ? trace_hardirqs_off_thunk+0x3a/0x3c
[   13.173009]  [<ffffffff8156a75f>] page_fault+0x1f/0x30
[   14.010075] usb 5-1: new full speed USB device number 2 using uhci_hcd

Reported-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 13 +++++--------
 kernel/rcutree.h |  4 +++-
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 8154a4a3491..5d96d68d20f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -36,7 +36,7 @@
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <linux/nmi.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <linux/bitops.h>
 #include <linux/module.h>
 #include <linux/completion.h>
@@ -1526,13 +1526,10 @@ static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
  */
 static void rcu_cpu_kthread_timer(unsigned long arg)
 {
-	unsigned long flags;
 	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
 	struct rcu_node *rnp = rdp->mynode;
 
-	raw_spin_lock_irqsave(&rnp->lock, flags);
-	rnp->wakemask |= rdp->grpmask;
-	raw_spin_unlock_irqrestore(&rnp->lock, flags);
+	atomic_or(rdp->grpmask, &rnp->wakemask);
 	invoke_rcu_node_kthread(rnp);
 }
 
@@ -1680,11 +1677,11 @@ static int rcu_node_kthread(void *arg)
 
 	for (;;) {
 		rnp->node_kthread_status = RCU_KTHREAD_WAITING;
-		wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0);
+		wait_event_interruptible(rnp->node_wq,
+					 atomic_read(&rnp->wakemask) != 0);
 		rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
 		raw_spin_lock_irqsave(&rnp->lock, flags);
-		mask = rnp->wakemask;
-		rnp->wakemask = 0;
+		mask = atomic_xchg(&rnp->wakemask, 0);
 		rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
 		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
 			if ((mask & 0x1) == 0)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 93d4a1c2e88..561dcb9a8d2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -119,7 +119,9 @@ struct rcu_node {
 				/*  elements that need to drain to allow the */
 				/*  current expedited grace period to */
 				/*  complete (only for TREE_PREEMPT_RCU). */
-	unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */
+	atomic_t wakemask;	/* CPUs whose kthread needs to be awakened. */
+				/*  Since this has meaning only for leaf */
+				/*  rcu_node structures, 32 bits suffices. */
 	unsigned long qsmaskinit;
 				/* Per-GP initial value for qsmask & expmask. */
 	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
-- 
cgit v1.2.3-70-g09d2


From 08bca60a6912ad225254250c0a9c3a05b4152cfa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 May 2011 16:06:29 -0700
Subject: rcu: Remove waitqueue usage for cpu, node, and boost kthreads

It is not necessary to use waitqueues for the RCU kthreads because
we always know exactly which thread is to be awakened.  In addition,
wake_up() only issues an actual wakeup when there is a thread waiting on
the queue, which was why there was an extra explicit wake_up_process()
to get the RCU kthreads started.

Eliminating the waitqueues (and wake_up()) in favor of wake_up_process()
eliminates the need for the initial wake_up_process() and also shrinks
the data structure size a bit.  The wakeup logic is placed in a new
rcu_wait() macro.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c        | 23 ++++++-----------------
 kernel/rcutree.h        | 17 ++++++++++-------
 kernel/rcutree_plugin.h | 16 +---------------
 3 files changed, 17 insertions(+), 39 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 5d96d68d20f..05e254e930e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -95,7 +95,6 @@ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
 DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq);
 DEFINE_PER_CPU(char, rcu_cpu_has_work);
 static char rcu_kthreads_spawnable;
 
@@ -1476,7 +1475,7 @@ static void invoke_rcu_cpu_kthread(void)
 		local_irq_restore(flags);
 		return;
 	}
-	wake_up(&__get_cpu_var(rcu_cpu_wq));
+	wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
 	local_irq_restore(flags);
 }
 
@@ -1596,14 +1595,12 @@ static int rcu_cpu_kthread(void *arg)
 	unsigned long flags;
 	int spincnt = 0;
 	unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
-	wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu);
 	char work;
 	char *workp = &per_cpu(rcu_cpu_has_work, cpu);
 
 	for (;;) {
 		*statusp = RCU_KTHREAD_WAITING;
-		wait_event_interruptible(*wqp,
-					 *workp != 0 || kthread_should_stop());
+		rcu_wait(*workp != 0 || kthread_should_stop());
 		local_bh_disable();
 		if (rcu_cpu_kthread_should_stop(cpu)) {
 			local_bh_enable();
@@ -1654,7 +1651,6 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
 	per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
 	WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
 	per_cpu(rcu_cpu_kthread_task, cpu) = t;
-	wake_up_process(t);
 	sp.sched_priority = RCU_KTHREAD_PRIO;
 	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
 	return 0;
@@ -1677,8 +1673,7 @@ static int rcu_node_kthread(void *arg)
 
 	for (;;) {
 		rnp->node_kthread_status = RCU_KTHREAD_WAITING;
-		wait_event_interruptible(rnp->node_wq,
-					 atomic_read(&rnp->wakemask) != 0);
+		rcu_wait(atomic_read(&rnp->wakemask) != 0);
 		rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
 		raw_spin_lock_irqsave(&rnp->lock, flags);
 		mask = atomic_xchg(&rnp->wakemask, 0);
@@ -1762,7 +1757,6 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
 		raw_spin_lock_irqsave(&rnp->lock, flags);
 		rnp->node_kthread_task = t;
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
-		wake_up_process(t);
 		sp.sched_priority = 99;
 		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
 	}
@@ -1779,21 +1773,16 @@ static int __init rcu_spawn_kthreads(void)
 
 	rcu_kthreads_spawnable = 1;
 	for_each_possible_cpu(cpu) {
-		init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu));
 		per_cpu(rcu_cpu_has_work, cpu) = 0;
 		if (cpu_online(cpu))
 			(void)rcu_spawn_one_cpu_kthread(cpu);
 	}
 	rnp = rcu_get_root(rcu_state);
-	init_waitqueue_head(&rnp->node_wq);
-	rcu_init_boost_waitqueue(rnp);
 	(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
-	if (NUM_RCU_NODES > 1)
-		rcu_for_each_leaf_node(rcu_state, rnp) {
-			init_waitqueue_head(&rnp->node_wq);
-			rcu_init_boost_waitqueue(rnp);
+	if (NUM_RCU_NODES > 1) {
+		rcu_for_each_leaf_node(rcu_state, rnp)
 			(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
-		}
+	}
 	return 0;
 }
 early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 561dcb9a8d2..7b9a08b4aae 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -159,9 +159,6 @@ struct rcu_node {
 	struct task_struct *boost_kthread_task;
 				/* kthread that takes care of priority */
 				/*  boosting for this rcu_node structure. */
-	wait_queue_head_t boost_wq;
-				/* Wait queue on which to park the boost */
-				/*  kthread. */
 	unsigned int boost_kthread_status;
 				/* State of boost_kthread_task for tracing. */
 	unsigned long n_tasks_boosted;
@@ -188,9 +185,6 @@ struct rcu_node {
 				/* kthread that takes care of this rcu_node */
 				/*  structure, for example, awakening the */
 				/*  per-CPU kthreads as needed. */
-	wait_queue_head_t node_wq;
-				/* Wait queue on which to park the per-node */
-				/*  kthread. */
 	unsigned int node_kthread_status;
 				/* State of node_kthread_task for tracing. */
 } ____cacheline_internodealigned_in_smp;
@@ -336,6 +330,16 @@ struct rcu_data {
 						/*  scheduling clock irq */
 						/*  before ratting on them. */
 
+#define rcu_wait(cond)							\
+do {									\
+	for (;;) {							\
+		set_current_state(TASK_INTERRUPTIBLE);			\
+		if (cond)						\
+			break;						\
+		schedule();						\
+	}								\
+	__set_current_state(TASK_RUNNING);				\
+} while (0)
 
 /*
  * RCU global state, including node hierarchy.  This hierarchy is
@@ -445,7 +449,6 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
 static void rcu_preempt_send_cbs_to_online(void);
 static void __init __rcu_init_preempt(void);
 static void rcu_needs_cpu_flush(void);
-static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp);
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
 					  cpumask_var_t cm);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index ed339702481..049f2787a98 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1196,8 +1196,7 @@ static int rcu_boost_kthread(void *arg)
 
 	for (;;) {
 		rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
-		wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks ||
-							rnp->exp_tasks);
+		rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
 		rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
 		more2boost = rcu_boost(rnp);
 		if (more2boost)
@@ -1274,14 +1273,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
 	rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
 }
 
-/*
- * Initialize the RCU-boost waitqueue.
- */
-static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
-{
-	init_waitqueue_head(&rnp->boost_wq);
-}
-
 /*
  * Create an RCU-boost kthread for the specified node if one does not
  * already exist.  We only create this kthread for preemptible RCU.
@@ -1306,7 +1297,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	rnp->boost_kthread_task = t;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
-	wake_up_process(t);
 	sp.sched_priority = RCU_KTHREAD_PRIO;
 	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
 	return 0;
@@ -1328,10 +1318,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
 {
 }
 
-static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
-{
-}
-
 static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 						 struct rcu_node *rnp,
 						 int rnp_index)
-- 
cgit v1.2.3-70-g09d2


From cc3ce5176d83cd8ae1134f86e208ea758d6cb78e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Wed, 25 May 2011 13:42:06 -0700
Subject: rcu: Start RCU kthreads in TASK_INTERRUPTIBLE state

Upon creation, kthreads are in TASK_UNINTERRUPTIBLE state, which can
result in softlockup warnings.  Because some of RCU's kthreads can
legitimately be idle indefinitely, start them in TASK_INTERRUPTIBLE
state in order to avoid those warnings.

Suggested-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c        | 2 ++
 kernel/rcutree_plugin.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 05e254e930e..77a7671dd14 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1648,6 +1648,7 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 	kthread_bind(t, cpu);
+	set_task_state(t, TASK_INTERRUPTIBLE);
 	per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
 	WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
 	per_cpu(rcu_cpu_kthread_task, cpu) = t;
@@ -1755,6 +1756,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
 		if (IS_ERR(t))
 			return PTR_ERR(t);
 		raw_spin_lock_irqsave(&rnp->lock, flags);
+		set_task_state(t, TASK_INTERRUPTIBLE);
 		rnp->node_kthread_task = t;
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		sp.sched_priority = 99;
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 049f2787a98..a767b7dac36 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1295,6 +1295,7 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 	raw_spin_lock_irqsave(&rnp->lock, flags);
+	set_task_state(t, TASK_INTERRUPTIBLE);
 	rnp->boost_kthread_task = t;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	sp.sched_priority = RCU_KTHREAD_PRIO;
-- 
cgit v1.2.3-70-g09d2


From 69b4573296469fd3f70cf7044693074980517067 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Sat, 28 May 2011 08:25:51 -0700
Subject: Cache xattr security drop check for write v2

Some recent benchmarking on btrfs showed that a major scaling bottleneck
on large systems on btrfs is currently the xattr lookup on every write.

Why xattr lookup on every write I hear you ask?

write wants to drop suid and security related xattrs that could set o
capabilities for executables.  To do that it currently looks up
security.capability on EVERY write (even for non executables) to decide
whether to drop it or not.

In btrfs this causes an additional tree walk, hitting some per file system
locks and quite bad scalability. In a simple read workload on a 8S
system I saw over 90% CPU time in spinlocks related to that.

Chris Mason tells me this is also a problem in ext4, where it hits
the global mbcache lock.

This patch adds a simple per inode to avoid this problem.  We only
do the lookup once per file and then if there is no xattr cache
the decision. All xattr changes clear the flag.

I also used the same flag to avoid the suid check, although
that one is pretty cheap.

A file system can also set this flag when it creates the inode,
if it has a cheap way to do so.  This is done for some common file systems
in followon patches.

With this patch a major part of the lock contention disappears
for btrfs. Some testing on smaller systems didn't show significant
performance changes, but at least it helps the larger systems
and is generally more efficient.

v2: Rename is_sgid. add file system helper.
Cc: chris.mason@oracle.com
Cc: josef@redhat.com
Cc: viro@zeniv.linux.org.uk
Cc: agruen@linbit.com
Cc: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/attr.c          |  7 +++++++
 fs/xattr.c         |  7 +++++--
 include/linux/fs.h | 13 +++++++++++++
 mm/filemap.c       | 14 ++++++++++++--
 4 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/fs/attr.c b/fs/attr.c
index 91dbe2a107f..caf2aa521e2 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -175,6 +175,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 			return -EPERM;
 	}
 
+	if ((ia_valid & ATTR_MODE)) {
+		mode_t amode = attr->ia_mode;
+		/* Flag setting protected by i_mutex */
+		if (is_sxid(amode))
+			inode->i_flags &= ~S_NOSEC;
+	}
+
 	now = current_fs_time(inode->i_sb);
 
 	attr->ia_ctime = now;
diff --git a/fs/xattr.c b/fs/xattr.c
index 4be2e7666d0..f060663ab70 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -91,7 +91,11 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
 {
 	struct inode *inode = dentry->d_inode;
 	int error = -EOPNOTSUPP;
+	int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
+				   XATTR_SECURITY_PREFIX_LEN);
 
+	if (issec)
+		inode->i_flags &= ~S_NOSEC;
 	if (inode->i_op->setxattr) {
 		error = inode->i_op->setxattr(dentry, name, value, size, flags);
 		if (!error) {
@@ -99,8 +103,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
 			security_inode_post_setxattr(dentry, name, value,
 						     size, flags);
 		}
-	} else if (!strncmp(name, XATTR_SECURITY_PREFIX,
-				XATTR_SECURITY_PREFIX_LEN)) {
+	} else if (issec) {
 		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
 		error = security_inode_setsecurity(inode, suffix, value,
 						   size, flags);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 573028df050..c55d6b7cd5d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -237,6 +237,7 @@ struct inodes_stat_t {
 #define S_PRIVATE	512	/* Inode is fs-internal */
 #define S_IMA		1024	/* Inode has an associated IMA struct */
 #define S_AUTOMOUNT	2048	/* Automount/referral quasi-directory */
+#define S_NOSEC		4096	/* no suid or xattr security attributes */
 
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
@@ -273,6 +274,7 @@ struct inodes_stat_t {
 #define IS_PRIVATE(inode)	((inode)->i_flags & S_PRIVATE)
 #define IS_IMA(inode)		((inode)->i_flags & S_IMA)
 #define IS_AUTOMOUNT(inode)	((inode)->i_flags & S_AUTOMOUNT)
+#define IS_NOSEC(inode)		((inode)->i_flags & S_NOSEC)
 
 /* the read-only stuff doesn't really belong here, but any other place is
    probably as bad and I don't want to create yet another include file. */
@@ -2582,5 +2584,16 @@ int __init get_filesystem_list(char *buf);
 #define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \
 					    (flag & __FMODE_NONOTIFY)))
 
+static inline int is_sxid(mode_t mode)
+{
+	return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP));
+}
+
+static inline void inode_has_no_xattr(struct inode *inode)
+{
+	if (!is_sxid(inode->i_mode))
+		inode->i_flags |= S_NOSEC;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_FS_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index dac95a24dea..d7b10578a64 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1982,16 +1982,26 @@ static int __remove_suid(struct dentry *dentry, int kill)
 int file_remove_suid(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
-	int killsuid = should_remove_suid(dentry);
-	int killpriv = security_inode_need_killpriv(dentry);
+	struct inode *inode = dentry->d_inode;
+	int killsuid;
+	int killpriv;
 	int error = 0;
 
+	/* Fast path for nothing security related */
+	if (IS_NOSEC(inode))
+		return 0;
+
+	killsuid = should_remove_suid(dentry);
+	killpriv = security_inode_need_killpriv(dentry);
+
 	if (killpriv < 0)
 		return killpriv;
 	if (killpriv)
 		error = security_inode_killpriv(dentry);
 	if (!error && killsuid)
 		error = __remove_suid(dentry, killsuid);
+	if (!error)
+		inode->i_flags |= S_NOSEC;
 
 	return error;
 }
-- 
cgit v1.2.3-70-g09d2


From 091c75985e2f3d1b60eb25d577f04923c1b8e022 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Wed, 27 Oct 2010 03:41:03 -0400
Subject: Blackfin: bfin_serial.h: turn default port wrappers into stubs

Any consumer that needs to access the MMRs has to provide these helpers,
so make the default into useless stubs.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 arch/blackfin/include/asm/bfin_serial.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/blackfin/include/asm/bfin_serial.h b/arch/blackfin/include/asm/bfin_serial.h
index 7dbc664eab1..7fd0ec7b5b0 100644
--- a/arch/blackfin/include/asm/bfin_serial.h
+++ b/arch/blackfin/include/asm/bfin_serial.h
@@ -184,7 +184,7 @@ struct bfin_uart_regs {
 #undef __BFP
 
 #ifndef port_membase
-# define port_membase(p) (((struct bfin_serial_port *)(p))->port.membase)
+# define port_membase(p) 0
 #endif
 
 #define UART_GET_CHAR(p)      bfin_read16(port_membase(p) + OFFSET_RBR)
@@ -235,10 +235,10 @@ struct bfin_uart_regs {
 #define UART_SET_DLAB(p)      do { UART_PUT_LCR(p, UART_GET_LCR(p) | DLAB); SSYNC(); } while (0)
 
 #ifndef put_lsr_cache
-# define put_lsr_cache(p, v) (((struct bfin_serial_port *)(p))->lsr = (v))
+# define put_lsr_cache(p, v)
 #endif
 #ifndef get_lsr_cache
-# define get_lsr_cache(p)    (((struct bfin_serial_port *)(p))->lsr)
+# define get_lsr_cache(p) 0
 #endif
 
 /* The hardware clears the LSR bits upon read, so we need to cache
-- 
cgit v1.2.3-70-g09d2


From 63917efc4feb856e134eedc9ad7a9810fec12968 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Mon, 28 Sep 2009 03:16:01 +0000
Subject: Blackfin: mach/bfin_serial_5xx.h: punt now-unused header

Now that the serial code has been unified in bfin_serial.h, and the
Blackfin UART driver pushed its resources to the boards files, we
don't need these headers anymore.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 .../mach-bf518/include/mach/bfin_serial_5xx.h      | 79 ------------------
 .../mach-bf527/include/mach/bfin_serial_5xx.h      | 79 ------------------
 .../mach-bf533/include/mach/bfin_serial_5xx.h      | 52 ------------
 .../mach-bf537/include/mach/bfin_serial_5xx.h      | 79 ------------------
 .../mach-bf538/include/mach/bfin_serial_5xx.h      | 93 ---------------------
 .../mach-bf548/include/mach/bfin_serial_5xx.h      | 94 ----------------------
 .../mach-bf561/include/mach/bfin_serial_5xx.h      | 52 ------------
 7 files changed, 528 deletions(-)
 delete mode 100644 arch/blackfin/mach-bf518/include/mach/bfin_serial_5xx.h
 delete mode 100644 arch/blackfin/mach-bf527/include/mach/bfin_serial_5xx.h
 delete mode 100644 arch/blackfin/mach-bf533/include/mach/bfin_serial_5xx.h
 delete mode 100644 arch/blackfin/mach-bf537/include/mach/bfin_serial_5xx.h
 delete mode 100644 arch/blackfin/mach-bf538/include/mach/bfin_serial_5xx.h
 delete mode 100644 arch/blackfin/mach-bf548/include/mach/bfin_serial_5xx.h
 delete mode 100644 arch/blackfin/mach-bf561/include/mach/bfin_serial_5xx.h

diff --git a/arch/blackfin/mach-bf518/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf518/include/mach/bfin_serial_5xx.h
deleted file mode 100644
index f6d924ac0c4..00000000000
--- a/arch/blackfin/mach-bf518/include/mach/bfin_serial_5xx.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright 2008-2009 Analog Devices Inc.
- *
- * Licensed under the GPL-2 or later
- */
-
-#include <asm/dma.h>
-#include <asm/portmux.h>
-
-#if defined(CONFIG_BFIN_UART0_CTSRTS) || defined(CONFIG_BFIN_UART1_CTSRTS)
-# define CONFIG_SERIAL_BFIN_CTSRTS
-
-# ifndef CONFIG_UART0_CTS_PIN
-#  define CONFIG_UART0_CTS_PIN -1
-# endif
-
-# ifndef CONFIG_UART0_RTS_PIN
-#  define CONFIG_UART0_RTS_PIN -1
-# endif
-
-# ifndef CONFIG_UART1_CTS_PIN
-#  define CONFIG_UART1_CTS_PIN -1
-# endif
-
-# ifndef CONFIG_UART1_RTS_PIN
-#  define CONFIG_UART1_RTS_PIN -1
-# endif
-#endif
-
-struct bfin_serial_res {
-	unsigned long uart_base_addr;
-	int uart_irq;
-	int uart_status_irq;
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	unsigned int uart_tx_dma_channel;
-	unsigned int uart_rx_dma_channel;
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	int uart_cts_pin;
-	int uart_rts_pin;
-#endif
-};
-
-struct bfin_serial_res bfin_serial_resource[] = {
-#ifdef CONFIG_SERIAL_BFIN_UART0
-	{
-	 0xFFC00400,
-	 IRQ_UART0_RX,
-	 IRQ_UART0_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	 CH_UART0_TX,
-	 CH_UART0_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	 CONFIG_UART0_CTS_PIN,
-	 CONFIG_UART0_RTS_PIN,
-#endif
-	 },
-#endif
-#ifdef CONFIG_SERIAL_BFIN_UART1
-	{
-	 0xFFC02000,
-	 IRQ_UART1_RX,
-	 IRQ_UART1_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	 CH_UART1_TX,
-	 CH_UART1_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	 CONFIG_UART1_CTS_PIN,
-	 CONFIG_UART1_RTS_PIN,
-#endif
-	 },
-#endif
-};
-
-#define DRIVER_NAME "bfin-uart"
-
-#include <asm/bfin_serial.h>
diff --git a/arch/blackfin/mach-bf527/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf527/include/mach/bfin_serial_5xx.h
deleted file mode 100644
index 960e08919de..00000000000
--- a/arch/blackfin/mach-bf527/include/mach/bfin_serial_5xx.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright 2007-2009 Analog Devices Inc.
- *
- * Licensed under the GPL-2 or later
- */
-
-#include <asm/dma.h>
-#include <asm/portmux.h>
-
-#if defined(CONFIG_BFIN_UART0_CTSRTS) || defined(CONFIG_BFIN_UART1_CTSRTS)
-# define CONFIG_SERIAL_BFIN_CTSRTS
-
-# ifndef CONFIG_UART0_CTS_PIN
-#  define CONFIG_UART0_CTS_PIN -1
-# endif
-
-# ifndef CONFIG_UART0_RTS_PIN
-#  define CONFIG_UART0_RTS_PIN -1
-# endif
-
-# ifndef CONFIG_UART1_CTS_PIN
-#  define CONFIG_UART1_CTS_PIN -1
-# endif
-
-# ifndef CONFIG_UART1_RTS_PIN
-#  define CONFIG_UART1_RTS_PIN -1
-# endif
-#endif
-
-struct bfin_serial_res {
-	unsigned long uart_base_addr;
-	int uart_irq;
-	int uart_status_irq;
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	unsigned int uart_tx_dma_channel;
-	unsigned int uart_rx_dma_channel;
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	int uart_cts_pin;
-	int uart_rts_pin;
-#endif
-};
-
-struct bfin_serial_res bfin_serial_resource[] = {
-#ifdef CONFIG_SERIAL_BFIN_UART0
-	{
-	 0xFFC00400,
-	 IRQ_UART0_RX,
-	 IRQ_UART0_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	 CH_UART0_TX,
-	 CH_UART0_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	 CONFIG_UART0_CTS_PIN,
-	 CONFIG_UART0_RTS_PIN,
-#endif
-	 },
-#endif
-#ifdef CONFIG_SERIAL_BFIN_UART1
-	{
-	 0xFFC02000,
-	 IRQ_UART1_RX,
-	 IRQ_UART1_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	 CH_UART1_TX,
-	 CH_UART1_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	 CONFIG_UART1_CTS_PIN,
-	 CONFIG_UART1_RTS_PIN,
-#endif
-	 },
-#endif
-};
-
-#define DRIVER_NAME "bfin-uart"
-
-#include <asm/bfin_serial.h>
diff --git a/arch/blackfin/mach-bf533/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf533/include/mach/bfin_serial_5xx.h
deleted file mode 100644
index 45dcaa4f3e4..00000000000
--- a/arch/blackfin/mach-bf533/include/mach/bfin_serial_5xx.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright 2006-2009 Analog Devices Inc.
- *
- * Licensed under the GPL-2 or later
- */
-
-#include <asm/dma.h>
-#include <asm/portmux.h>
-
-#ifdef CONFIG_BFIN_UART0_CTSRTS
-# define CONFIG_SERIAL_BFIN_CTSRTS
-# ifndef CONFIG_UART0_CTS_PIN
-#  define CONFIG_UART0_CTS_PIN -1
-# endif
-# ifndef CONFIG_UART0_RTS_PIN
-#  define CONFIG_UART0_RTS_PIN -1
-# endif
-#endif
-
-struct bfin_serial_res {
-	unsigned long	uart_base_addr;
-	int		uart_irq;
-	int		uart_status_irq;
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	unsigned int	uart_tx_dma_channel;
-	unsigned int	uart_rx_dma_channel;
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	int		uart_cts_pin;
-	int		uart_rts_pin;
-#endif
-};
-
-struct bfin_serial_res bfin_serial_resource[] = {
-	{
-	0xFFC00400,
-	IRQ_UART0_RX,
-	IRQ_UART0_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	CH_UART0_TX,
-	CH_UART0_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	CONFIG_UART0_CTS_PIN,
-	CONFIG_UART0_RTS_PIN,
-#endif
-	}
-};
-
-#define DRIVER_NAME "bfin-uart"
-
-#include <asm/bfin_serial.h>
diff --git a/arch/blackfin/mach-bf537/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf537/include/mach/bfin_serial_5xx.h
deleted file mode 100644
index 3e955dba895..00000000000
--- a/arch/blackfin/mach-bf537/include/mach/bfin_serial_5xx.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright 2006-2009 Analog Devices Inc.
- *
- * Licensed under the GPL-2 or later
- */
-
-#include <asm/dma.h>
-#include <asm/portmux.h>
-
-#if defined(CONFIG_BFIN_UART0_CTSRTS) || defined(CONFIG_BFIN_UART1_CTSRTS)
-# define CONFIG_SERIAL_BFIN_CTSRTS
-
-# ifndef CONFIG_UART0_CTS_PIN
-#  define CONFIG_UART0_CTS_PIN -1
-# endif
-
-# ifndef CONFIG_UART0_RTS_PIN
-#  define CONFIG_UART0_RTS_PIN -1
-# endif
-
-# ifndef CONFIG_UART1_CTS_PIN
-#  define CONFIG_UART1_CTS_PIN -1
-# endif
-
-# ifndef CONFIG_UART1_RTS_PIN
-#  define CONFIG_UART1_RTS_PIN -1
-# endif
-#endif
-
-struct bfin_serial_res {
-	unsigned long	uart_base_addr;
-	int		uart_irq;
-	int		uart_status_irq;
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	unsigned int	uart_tx_dma_channel;
-	unsigned int	uart_rx_dma_channel;
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	int	uart_cts_pin;
-	int	uart_rts_pin;
-#endif
-};
-
-struct bfin_serial_res bfin_serial_resource[] = {
-#ifdef CONFIG_SERIAL_BFIN_UART0
-	{
-	0xFFC00400,
-	IRQ_UART0_RX,
-	IRQ_UART0_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	CH_UART0_TX,
-	CH_UART0_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	CONFIG_UART0_CTS_PIN,
-	CONFIG_UART0_RTS_PIN,
-#endif
-	},
-#endif
-#ifdef CONFIG_SERIAL_BFIN_UART1
-	{
-	0xFFC02000,
-	IRQ_UART1_RX,
-	IRQ_UART1_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	CH_UART1_TX,
-	CH_UART1_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	CONFIG_UART1_CTS_PIN,
-	CONFIG_UART1_RTS_PIN,
-#endif
-	},
-#endif
-};
-
-#define DRIVER_NAME "bfin-uart"
-
-#include <asm/bfin_serial.h>
diff --git a/arch/blackfin/mach-bf538/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf538/include/mach/bfin_serial_5xx.h
deleted file mode 100644
index beb502e9cb3..00000000000
--- a/arch/blackfin/mach-bf538/include/mach/bfin_serial_5xx.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright 2008-2009 Analog Devices Inc.
- *
- * Licensed under the GPL-2 or later.
- */
-
-#include <asm/dma.h>
-#include <asm/portmux.h>
-
-#if defined(CONFIG_BFIN_UART0_CTSRTS) || defined(CONFIG_BFIN_UART1_CTSRTS)
-# define CONFIG_SERIAL_BFIN_CTSRTS
-
-# ifndef CONFIG_UART0_CTS_PIN
-#  define CONFIG_UART0_CTS_PIN -1
-# endif
-
-# ifndef CONFIG_UART0_RTS_PIN
-#  define CONFIG_UART0_RTS_PIN -1
-# endif
-
-# ifndef CONFIG_UART1_CTS_PIN
-#  define CONFIG_UART1_CTS_PIN -1
-# endif
-
-# ifndef CONFIG_UART1_RTS_PIN
-#  define CONFIG_UART1_RTS_PIN -1
-# endif
-#endif
-
-struct bfin_serial_res {
-	unsigned long	uart_base_addr;
-	int		uart_irq;
-	int		uart_status_irq;
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	unsigned int	uart_tx_dma_channel;
-	unsigned int	uart_rx_dma_channel;
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	int	uart_cts_pin;
-	int	uart_rts_pin;
-#endif
-};
-
-struct bfin_serial_res bfin_serial_resource[] = {
-#ifdef CONFIG_SERIAL_BFIN_UART0
-	{
-	0xFFC00400,
-	IRQ_UART0_RX,
-	IRQ_UART0_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	CH_UART0_TX,
-	CH_UART0_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	CONFIG_UART0_CTS_PIN,
-	CONFIG_UART0_RTS_PIN,
-#endif
-	},
-#endif
-#ifdef CONFIG_SERIAL_BFIN_UART1
-	{
-	0xFFC02000,
-	IRQ_UART1_RX,
-	IRQ_UART1_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	CH_UART1_TX,
-	CH_UART1_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	CONFIG_UART1_CTS_PIN,
-	CONFIG_UART1_RTS_PIN,
-#endif
-	},
-#endif
-#ifdef CONFIG_SERIAL_BFIN_UART2
-	{
-	0xFFC02100,
-	IRQ_UART2_RX,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	CH_UART2_TX,
-	CH_UART2_RX,
-#endif
-#ifdef CONFIG_BFIN_UART2_CTSRTS
-	CONFIG_UART2_CTS_PIN,
-	CONFIG_UART2_RTS_PIN,
-#endif
-	},
-#endif
-};
-
-#define DRIVER_NAME "bfin-uart"
-
-#include <asm/bfin_serial.h>
diff --git a/arch/blackfin/mach-bf548/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf548/include/mach/bfin_serial_5xx.h
deleted file mode 100644
index 0d94edaaaa2..00000000000
--- a/arch/blackfin/mach-bf548/include/mach/bfin_serial_5xx.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright 2007-2009 Analog Devices Inc.
- *
- * Licensed under the GPL-2 or later.
- */
-
-#include <asm/dma.h>
-#include <asm/portmux.h>
-
-#if defined(CONFIG_BFIN_UART0_CTSRTS) || defined(CONFIG_BFIN_UART1_CTSRTS) || \
-	defined(CONFIG_BFIN_UART2_CTSRTS) || defined(CONFIG_BFIN_UART3_CTSRTS)
-# define CONFIG_SERIAL_BFIN_HARD_CTSRTS
-#endif
-
-struct bfin_serial_res {
-	unsigned long	uart_base_addr;
-	int		uart_irq;
-	int		uart_status_irq;
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	unsigned int	uart_tx_dma_channel;
-	unsigned int	uart_rx_dma_channel;
-#endif
-#ifdef CONFIG_SERIAL_BFIN_HARD_CTSRTS
-	int		uart_cts_pin;
-	int		uart_rts_pin;
-#endif
-};
-
-struct bfin_serial_res bfin_serial_resource[] = {
-#ifdef CONFIG_SERIAL_BFIN_UART0
-	{
-	0xFFC00400,
-	IRQ_UART0_RX,
-	IRQ_UART0_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	CH_UART0_TX,
-	CH_UART0_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_HARD_CTSRTS
-	0,
-	0,
-#endif
-	},
-#endif
-#ifdef CONFIG_SERIAL_BFIN_UART1
-	{
-	0xFFC02000,
-	IRQ_UART1_RX,
-	IRQ_UART1_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	CH_UART1_TX,
-	CH_UART1_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_HARD_CTSRTS
-	GPIO_PE10,
-	GPIO_PE9,
-#endif
-	},
-#endif
-#ifdef CONFIG_SERIAL_BFIN_UART2
-	{
-	0xFFC02100,
-	IRQ_UART2_RX,
-	IRQ_UART2_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	CH_UART2_TX,
-	CH_UART2_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_HARD_CTSRTS
-	0,
-	0,
-#endif
-	},
-#endif
-#ifdef CONFIG_SERIAL_BFIN_UART3
-	{
-	0xFFC03100,
-	IRQ_UART3_RX,
-	IRQ_UART3_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	CH_UART3_TX,
-	CH_UART3_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_HARD_CTSRTS
-	GPIO_PB3,
-	GPIO_PB2,
-#endif
-	},
-#endif
-};
-
-#define DRIVER_NAME "bfin-uart"
-
-#include <asm/bfin_serial.h>
diff --git a/arch/blackfin/mach-bf561/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf561/include/mach/bfin_serial_5xx.h
deleted file mode 100644
index 3a6947456cf..00000000000
--- a/arch/blackfin/mach-bf561/include/mach/bfin_serial_5xx.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright 2006-2009 Analog Devices Inc.
- *
- * Licensed under the GPL-2 or later.
- */
-
-#include <asm/dma.h>
-#include <asm/portmux.h>
-
-#ifdef CONFIG_BFIN_UART0_CTSRTS
-# define CONFIG_SERIAL_BFIN_CTSRTS
-# ifndef CONFIG_UART0_CTS_PIN
-#  define CONFIG_UART0_CTS_PIN -1
-# endif
-# ifndef CONFIG_UART0_RTS_PIN
-#  define CONFIG_UART0_RTS_PIN -1
-# endif
-#endif
-
-struct bfin_serial_res {
-	unsigned long	uart_base_addr;
-	int		uart_irq;
-	int		uart_status_irq;
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	unsigned int	uart_tx_dma_channel;
-	unsigned int	uart_rx_dma_channel;
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	int		uart_cts_pin;
-	int		uart_rts_pin;
-#endif
-};
-
-struct bfin_serial_res bfin_serial_resource[] = {
-	{
-	0xFFC00400,
-	IRQ_UART_RX,
-	IRQ_UART_ERROR,
-#ifdef CONFIG_SERIAL_BFIN_DMA
-	CH_UART_TX,
-	CH_UART_RX,
-#endif
-#ifdef CONFIG_SERIAL_BFIN_CTSRTS
-	CONFIG_UART0_CTS_PIN,
-	CONFIG_UART0_RTS_PIN,
-#endif
-	}
-};
-
-#define DRIVER_NAME "bfin-uart"
-
-#include <asm/bfin_serial.h>
-- 
cgit v1.2.3-70-g09d2


From 427472c967977256db42ffbf16f65f2770278b7f Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Mon, 23 May 2011 21:45:42 -0400
Subject: Blackfin: wire up new sendmmsg syscall

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 arch/blackfin/include/asm/unistd.h | 3 ++-
 arch/blackfin/mach-common/entry.S  | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h
index 6ff9c411b14..0ccba60b9cc 100644
--- a/arch/blackfin/include/asm/unistd.h
+++ b/arch/blackfin/include/asm/unistd.h
@@ -398,8 +398,9 @@
 #define __NR_clock_adjtime	377
 #define __NR_syncfs		378
 #define __NR_setns		379
+#define __NR_sendmmsg		380
 
-#define __NR_syscall		380
+#define __NR_syscall		381
 #define NR_syscalls		__NR_syscall
 
 /* Old optional stuff no one actually uses */
diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S
index dda11ef06be..225d311c970 100644
--- a/arch/blackfin/mach-common/entry.S
+++ b/arch/blackfin/mach-common/entry.S
@@ -1754,6 +1754,7 @@ ENTRY(_sys_call_table)
 	.long _sys_clock_adjtime
 	.long _sys_syncfs
 	.long _sys_setns
+	.long _sys_sendmmsg		/* 380 */
 
 	.rept NR_syscalls-(.-_sys_call_table)/4
 	.long _sys_ni_syscall
-- 
cgit v1.2.3-70-g09d2


From a4ffd956924e265865a4425bd888927059fd46a9 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Thu, 26 May 2011 17:26:58 -0400
Subject: Blackfin: gptimers: add structure for hardware register layout

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 arch/blackfin/include/asm/gptimers.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/arch/blackfin/include/asm/gptimers.h b/arch/blackfin/include/asm/gptimers.h
index c722acdda0d..38657dac123 100644
--- a/arch/blackfin/include/asm/gptimers.h
+++ b/arch/blackfin/include/asm/gptimers.h
@@ -193,4 +193,22 @@ uint16_t get_enabled_gptimers(void);
 uint32_t get_gptimer_status(unsigned int group);
 void     set_gptimer_status(unsigned int group, uint32_t value);
 
+/*
+ * All Blackfin system MMRs are padded to 32bits even if the register
+ * itself is only 16bits.  So use a helper macro to streamline this.
+ */
+#define __BFP(m) u16 m; u16 __pad_##m
+
+/*
+ * bfin timer registers layout
+ */
+struct bfin_gptimer_regs {
+	__BFP(config);
+	u32 counter;
+	u32 period;
+	u32 width;
+};
+
+#undef __BFP
+
 #endif
-- 
cgit v1.2.3-70-g09d2


From d09fb602030e3247c21452ee2b3229baf115f71f Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Thu, 26 May 2011 17:27:36 -0400
Subject: Blackfin: debug-mmrs: fix typos with gptimers/mdma/ppi

This code was mostly developed against a BF54x, so some BF537-specific
issues were missed.

The PPI block starts at PPI_CONTROL, not PPI_STATUS (which is the reverse
of the EPPI block).

The MDMA block starts at MDMA_NEXT_DESC_PTR, not MDMA_CONFIG.  Seems the
sim does not catch misreads here so that'll need to get fixed.

The gptimer block is mostly 32bit regs, not 16bit.  Use the gptimer struct
to figure that out rather than hardcoding it locally.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 arch/blackfin/kernel/debug-mmrs.c | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/arch/blackfin/kernel/debug-mmrs.c b/arch/blackfin/kernel/debug-mmrs.c
index 94b1d8a0256..aa201be49a1 100644
--- a/arch/blackfin/kernel/debug-mmrs.c
+++ b/arch/blackfin/kernel/debug-mmrs.c
@@ -13,6 +13,7 @@
 
 #include <asm/blackfin.h>
 #include <asm/gpio.h>
+#include <asm/gptimers.h>
 #include <asm/bfin_can.h>
 #include <asm/bfin_dma.h>
 #include <asm/bfin_ppi.h>
@@ -230,8 +231,8 @@ bfin_debug_mmrs_dma(struct dentry *parent, unsigned long base, int num, char mdm
 #define DMA(num)  _DMA(num, DMA##num##_NEXT_DESC_PTR, 0, "")
 #define _MDMA(num, x) \
 	do { \
-		_DMA(num, x##DMA_D##num##_CONFIG, 'D', #x); \
-		_DMA(num, x##DMA_S##num##_CONFIG, 'S', #x); \
+		_DMA(num, x##DMA_D##num##_NEXT_DESC_PTR, 'D', #x); \
+		_DMA(num, x##DMA_S##num##_NEXT_DESC_PTR, 'S', #x); \
 	} while (0)
 #define MDMA(num) _MDMA(num, M)
 #define IMDMA(num) _MDMA(num, IM)
@@ -264,20 +265,15 @@ bfin_debug_mmrs_eppi(struct dentry *parent, unsigned long base, int num)
 /*
  * General Purpose Timers
  */
-#define GPTIMER_OFF(mmr) (TIMER0_##mmr - TIMER0_CONFIG)
-#define __GPTIMER(name) \
-	do { \
-		strcpy(_buf, #name); \
-		debugfs_create_x16(buf, S_IRUSR|S_IWUSR, parent, (u16 *)(base + GPTIMER_OFF(name))); \
-	} while (0)
+#define __GPTIMER(uname, lname) __REGS(gptimer, #uname, lname)
 static void __init __maybe_unused
 bfin_debug_mmrs_gptimer(struct dentry *parent, unsigned long base, int num)
 {
 	char buf[32], *_buf = REGS_STR_PFX(buf, TIMER, num);
-	__GPTIMER(CONFIG);
-	__GPTIMER(COUNTER);
-	__GPTIMER(PERIOD);
-	__GPTIMER(WIDTH);
+	__GPTIMER(CONFIG, config);
+	__GPTIMER(COUNTER, counter);
+	__GPTIMER(PERIOD, period);
+	__GPTIMER(WIDTH, width);
 }
 #define GPTIMER(num) bfin_debug_mmrs_gptimer(parent, TIMER##num##_CONFIG, num)
 
@@ -355,7 +351,7 @@ bfin_debug_mmrs_ppi(struct dentry *parent, unsigned long base, int num)
 	__PPI(DELAY, delay);
 	__PPI(FRAME, frame);
 }
-#define PPI(num) bfin_debug_mmrs_ppi(parent, PPI##num##_STATUS, num)
+#define PPI(num) bfin_debug_mmrs_ppi(parent, PPI##num##_CONTROL, num)
 
 /*
  * SPI
@@ -1288,15 +1284,15 @@ static int __init bfin_debug_mmrs_init(void)
 	D16(VR_CTL);
 	D32(CHIPID);	/* it's part of this hardware block */
 
-#if defined(PPI_STATUS) || defined(PPI0_STATUS) || defined(PPI1_STATUS)
+#if defined(PPI_CONTROL) || defined(PPI0_CONTROL) || defined(PPI1_CONTROL)
 	parent = debugfs_create_dir("ppi", top);
-# ifdef PPI_STATUS
-	bfin_debug_mmrs_ppi(parent, PPI_STATUS, -1);
+# ifdef PPI_CONTROL
+	bfin_debug_mmrs_ppi(parent, PPI_CONTROL, -1);
 # endif
-# ifdef PPI0_STATUS
+# ifdef PPI0_CONTROL
 	PPI(0);
 # endif
-# ifdef PPI1_STATUS
+# ifdef PPI1_CONTROL
 	PPI(1);
 # endif
 #endif
-- 
cgit v1.2.3-70-g09d2


From 61aa818f7bfdb5dd0aef4687513566c75c0e4c21 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Thu, 26 May 2011 17:39:17 -0400
Subject: Blackfin: bf52x/bf54x: fix up usb MMR defines

The bf52x/bf54x have the incorrect addresses for USB_EP_NI7_RXINTERVAL
and USB_EP_NI7_TXCOUNT, so adjust those.

Further, the bf54x header puts the USB defines in the wrong place, so
shuffle them back to the right grouping.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 arch/blackfin/mach-bf527/include/mach/defBF525.h |  4 ++--
 arch/blackfin/mach-bf548/include/mach/defBF547.h | 19 ++++++++++---------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/arch/blackfin/mach-bf527/include/mach/defBF525.h b/arch/blackfin/mach-bf527/include/mach/defBF525.h
index cc383adfdff..aab80bb1a68 100644
--- a/arch/blackfin/mach-bf527/include/mach/defBF525.h
+++ b/arch/blackfin/mach-bf527/include/mach/defBF525.h
@@ -185,8 +185,8 @@
 #define                USB_EP_NI7_TXTYPE  0xffc03bd4   /* Sets the transaction protocol and peripheral endpoint number for the Host Tx endpoint7 */
 #define            USB_EP_NI7_TXINTERVAL  0xffc03bd8   /* Sets the NAK response timeout on Endpoint7 */
 #define                USB_EP_NI7_RXTYPE  0xffc03bdc   /* Sets the transaction protocol and peripheral endpoint number for the Host Rx endpoint7 */
-#define            USB_EP_NI7_RXINTERVAL  0xffc03bf0   /* Sets the polling interval for Interrupt/Isochronous transfers or the NAK response timeout on Bulk transfers for Host Rx endpoint7 */
-#define               USB_EP_NI7_TXCOUNT  0xffc03bf8   /* Number of bytes to be written to the endpoint7 Tx FIFO */
+#define            USB_EP_NI7_RXINTERVAL  0xffc03be0   /* Sets the polling interval for Interrupt/Isochronous transfers or the NAK response timeout on Bulk transfers for Host Rx endpoint7 */
+#define               USB_EP_NI7_TXCOUNT  0xffc03be8   /* Number of bytes to be written to the endpoint7 Tx FIFO */
 
 #define                USB_DMA_INTERRUPT  0xffc03c00   /* Indicates pending interrupts for the DMA channels */
 
diff --git a/arch/blackfin/mach-bf548/include/mach/defBF547.h b/arch/blackfin/mach-bf548/include/mach/defBF547.h
index 1cbba115f96..1fa41ec03f3 100644
--- a/arch/blackfin/mach-bf548/include/mach/defBF547.h
+++ b/arch/blackfin/mach-bf548/include/mach/defBF547.h
@@ -271,10 +271,10 @@
 #define            USB_EP_NI0_TXINTERVAL  0xffc03e18   /* Sets the NAK response timeout on Endpoint 0 */
 #define                USB_EP_NI0_RXTYPE  0xffc03e1c   /* Sets the transaction protocol and peripheral endpoint number for the Host Rx endpoint0 */
 #define            USB_EP_NI0_RXINTERVAL  0xffc03e20   /* Sets the polling interval for Interrupt/Isochronous transfers or the NAK response timeout on Bulk transfers for Host Rx endpoint0 */
+#define               USB_EP_NI0_TXCOUNT  0xffc03e28   /* Number of bytes to be written to the endpoint0 Tx FIFO */
 
 /* USB Endpoint 1 Control Registers */
 
-#define               USB_EP_NI0_TXCOUNT  0xffc03e28   /* Number of bytes to be written to the endpoint0 Tx FIFO */
 #define                USB_EP_NI1_TXMAXP  0xffc03e40   /* Maximum packet size for Host Tx endpoint1 */
 #define                 USB_EP_NI1_TXCSR  0xffc03e44   /* Control Status register for endpoint1 */
 #define                USB_EP_NI1_RXMAXP  0xffc03e48   /* Maximum packet size for Host Rx endpoint1 */
@@ -284,10 +284,10 @@
 #define            USB_EP_NI1_TXINTERVAL  0xffc03e58   /* Sets the NAK response timeout on Endpoint1 */
 #define                USB_EP_NI1_RXTYPE  0xffc03e5c   /* Sets the transaction protocol and peripheral endpoint number for the Host Rx endpoint1 */
 #define            USB_EP_NI1_RXINTERVAL  0xffc03e60   /* Sets the polling interval for Interrupt/Isochronous transfers or the NAK response timeout on Bulk transfers for Host Rx endpoint1 */
+#define               USB_EP_NI1_TXCOUNT  0xffc03e68   /* Number of bytes to be written to the+H102 endpoint1 Tx FIFO */
 
 /* USB Endpoint 2 Control Registers */
 
-#define               USB_EP_NI1_TXCOUNT  0xffc03e68   /* Number of bytes to be written to the+H102 endpoint1 Tx FIFO */
 #define                USB_EP_NI2_TXMAXP  0xffc03e80   /* Maximum packet size for Host Tx endpoint2 */
 #define                 USB_EP_NI2_TXCSR  0xffc03e84   /* Control Status register for endpoint2 */
 #define                USB_EP_NI2_RXMAXP  0xffc03e88   /* Maximum packet size for Host Rx endpoint2 */
@@ -297,10 +297,10 @@
 #define            USB_EP_NI2_TXINTERVAL  0xffc03e98   /* Sets the NAK response timeout on Endpoint2 */
 #define                USB_EP_NI2_RXTYPE  0xffc03e9c   /* Sets the transaction protocol and peripheral endpoint number for the Host Rx endpoint2 */
 #define            USB_EP_NI2_RXINTERVAL  0xffc03ea0   /* Sets the polling interval for Interrupt/Isochronous transfers or the NAK response timeout on Bulk transfers for Host Rx endpoint2 */
+#define               USB_EP_NI2_TXCOUNT  0xffc03ea8   /* Number of bytes to be written to the endpoint2 Tx FIFO */
 
 /* USB Endpoint 3 Control Registers */
 
-#define               USB_EP_NI2_TXCOUNT  0xffc03ea8   /* Number of bytes to be written to the endpoint2 Tx FIFO */
 #define                USB_EP_NI3_TXMAXP  0xffc03ec0   /* Maximum packet size for Host Tx endpoint3 */
 #define                 USB_EP_NI3_TXCSR  0xffc03ec4   /* Control Status register for endpoint3 */
 #define                USB_EP_NI3_RXMAXP  0xffc03ec8   /* Maximum packet size for Host Rx endpoint3 */
@@ -310,10 +310,10 @@
 #define            USB_EP_NI3_TXINTERVAL  0xffc03ed8   /* Sets the NAK response timeout on Endpoint3 */
 #define                USB_EP_NI3_RXTYPE  0xffc03edc   /* Sets the transaction protocol and peripheral endpoint number for the Host Rx endpoint3 */
 #define            USB_EP_NI3_RXINTERVAL  0xffc03ee0   /* Sets the polling interval for Interrupt/Isochronous transfers or the NAK response timeout on Bulk transfers for Host Rx endpoint3 */
+#define               USB_EP_NI3_TXCOUNT  0xffc03ee8   /* Number of bytes to be written to the H124endpoint3 Tx FIFO */
 
 /* USB Endpoint 4 Control Registers */
 
-#define               USB_EP_NI3_TXCOUNT  0xffc03ee8   /* Number of bytes to be written to the H124endpoint3 Tx FIFO */
 #define                USB_EP_NI4_TXMAXP  0xffc03f00   /* Maximum packet size for Host Tx endpoint4 */
 #define                 USB_EP_NI4_TXCSR  0xffc03f04   /* Control Status register for endpoint4 */
 #define                USB_EP_NI4_RXMAXP  0xffc03f08   /* Maximum packet size for Host Rx endpoint4 */
@@ -323,10 +323,10 @@
 #define            USB_EP_NI4_TXINTERVAL  0xffc03f18   /* Sets the NAK response timeout on Endpoint4 */
 #define                USB_EP_NI4_RXTYPE  0xffc03f1c   /* Sets the transaction protocol and peripheral endpoint number for the Host Rx endpoint4 */
 #define            USB_EP_NI4_RXINTERVAL  0xffc03f20   /* Sets the polling interval for Interrupt/Isochronous transfers or the NAK response timeout on Bulk transfers for Host Rx endpoint4 */
+#define               USB_EP_NI4_TXCOUNT  0xffc03f28   /* Number of bytes to be written to the endpoint4 Tx FIFO */
 
 /* USB Endpoint 5 Control Registers */
 
-#define               USB_EP_NI4_TXCOUNT  0xffc03f28   /* Number of bytes to be written to the endpoint4 Tx FIFO */
 #define                USB_EP_NI5_TXMAXP  0xffc03f40   /* Maximum packet size for Host Tx endpoint5 */
 #define                 USB_EP_NI5_TXCSR  0xffc03f44   /* Control Status register for endpoint5 */
 #define                USB_EP_NI5_RXMAXP  0xffc03f48   /* Maximum packet size for Host Rx endpoint5 */
@@ -336,10 +336,10 @@
 #define            USB_EP_NI5_TXINTERVAL  0xffc03f58   /* Sets the NAK response timeout on Endpoint5 */
 #define                USB_EP_NI5_RXTYPE  0xffc03f5c   /* Sets the transaction protocol and peripheral endpoint number for the Host Rx endpoint5 */
 #define            USB_EP_NI5_RXINTERVAL  0xffc03f60   /* Sets the polling interval for Interrupt/Isochronous transfers or the NAK response timeout on Bulk transfers for Host Rx endpoint5 */
+#define               USB_EP_NI5_TXCOUNT  0xffc03f68   /* Number of bytes to be written to the H145endpoint5 Tx FIFO */
 
 /* USB Endpoint 6 Control Registers */
 
-#define               USB_EP_NI5_TXCOUNT  0xffc03f68   /* Number of bytes to be written to the H145endpoint5 Tx FIFO */
 #define                USB_EP_NI6_TXMAXP  0xffc03f80   /* Maximum packet size for Host Tx endpoint6 */
 #define                 USB_EP_NI6_TXCSR  0xffc03f84   /* Control Status register for endpoint6 */
 #define                USB_EP_NI6_RXMAXP  0xffc03f88   /* Maximum packet size for Host Rx endpoint6 */
@@ -349,10 +349,10 @@
 #define            USB_EP_NI6_TXINTERVAL  0xffc03f98   /* Sets the NAK response timeout on Endpoint6 */
 #define                USB_EP_NI6_RXTYPE  0xffc03f9c   /* Sets the transaction protocol and peripheral endpoint number for the Host Rx endpoint6 */
 #define            USB_EP_NI6_RXINTERVAL  0xffc03fa0   /* Sets the polling interval for Interrupt/Isochronous transfers or the NAK response timeout on Bulk transfers for Host Rx endpoint6 */
+#define               USB_EP_NI6_TXCOUNT  0xffc03fa8   /* Number of bytes to be written to the endpoint6 Tx FIFO */
 
 /* USB Endpoint 7 Control Registers */
 
-#define               USB_EP_NI6_TXCOUNT  0xffc03fa8   /* Number of bytes to be written to the endpoint6 Tx FIFO */
 #define                USB_EP_NI7_TXMAXP  0xffc03fc0   /* Maximum packet size for Host Tx endpoint7 */
 #define                 USB_EP_NI7_TXCSR  0xffc03fc4   /* Control Status register for endpoint7 */
 #define                USB_EP_NI7_RXMAXP  0xffc03fc8   /* Maximum packet size for Host Rx endpoint7 */
@@ -361,8 +361,9 @@
 #define                USB_EP_NI7_TXTYPE  0xffc03fd4   /* Sets the transaction protocol and peripheral endpoint number for the Host Tx endpoint7 */
 #define            USB_EP_NI7_TXINTERVAL  0xffc03fd8   /* Sets the NAK response timeout on Endpoint7 */
 #define                USB_EP_NI7_RXTYPE  0xffc03fdc   /* Sets the transaction protocol and peripheral endpoint number for the Host Rx endpoint7 */
-#define            USB_EP_NI7_RXINTERVAL  0xffc03ff0   /* Sets the polling interval for Interrupt/Isochronous transfers or the NAK response timeout on Bulk transfers for Host Rx endpoint7 */
-#define               USB_EP_NI7_TXCOUNT  0xffc03ff8   /* Number of bytes to be written to the endpoint7 Tx FIFO */
+#define            USB_EP_NI7_RXINTERVAL  0xffc03fe0   /* Sets the polling interval for Interrupt/Isochronous transfers or the NAK response timeout on Bulk transfers for Host Rx endpoint7 */
+#define               USB_EP_NI7_TXCOUNT  0xffc03fe8   /* Number of bytes to be written to the endpoint7 Tx FIFO */
+
 #define                USB_DMA_INTERRUPT  0xffc04000   /* Indicates pending interrupts for the DMA channels */
 
 /* USB Channel 0 Config Registers */
-- 
cgit v1.2.3-70-g09d2


From fcb243918f9e8414bf5ad6fb0361447ac3d3fddb Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Thu, 26 May 2011 18:05:15 -0400
Subject: Blackfin: bf51x: fix up RSI_PID# MMR defines

Looks like the copying of MMR defines from the SDH block missed updating
the addresses of the RSI_PID# registers.  So tweak them to reflect the
actual hardware.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 arch/blackfin/mach-bf518/include/mach/defBF514.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/blackfin/mach-bf518/include/mach/defBF514.h b/arch/blackfin/mach-bf518/include/mach/defBF514.h
index 98a51c47929..cfab428e577 100644
--- a/arch/blackfin/mach-bf518/include/mach/defBF514.h
+++ b/arch/blackfin/mach-bf518/include/mach/defBF514.h
@@ -36,13 +36,13 @@
 #define RSI_EMASK                      0xFFC038C4 /* RSI Exception Mask Register */
 #define RSI_CONFIG                     0xFFC038C8 /* RSI Configuration Register */
 #define RSI_RD_WAIT_EN                 0xFFC038CC /* RSI Read Wait Enable Register */
-#define RSI_PID0                       0xFFC03FE0 /* RSI Peripheral ID Register 0 */
-#define RSI_PID1                       0xFFC03FE4 /* RSI Peripheral ID Register 1 */
-#define RSI_PID2                       0xFFC03FE8 /* RSI Peripheral ID Register 2 */
-#define RSI_PID3                       0xFFC03FEC /* RSI Peripheral ID Register 3 */
-#define RSI_PID4                       0xFFC03FF0 /* RSI Peripheral ID Register 4 */
-#define RSI_PID5                       0xFFC03FF4 /* RSI Peripheral ID Register 5 */
-#define RSI_PID6                       0xFFC03FF8 /* RSI Peripheral ID Register 6 */
-#define RSI_PID7                       0xFFC03FFC /* RSI Peripheral ID Register 7 */
+#define RSI_PID0                       0xFFC038D0 /* RSI Peripheral ID Register 0 */
+#define RSI_PID1                       0xFFC038D4 /* RSI Peripheral ID Register 1 */
+#define RSI_PID2                       0xFFC038D8 /* RSI Peripheral ID Register 2 */
+#define RSI_PID3                       0xFFC038DC /* RSI Peripheral ID Register 3 */
+#define RSI_PID4                       0xFFC038E0 /* RSI Peripheral ID Register 0 */
+#define RSI_PID5                       0xFFC038E4 /* RSI Peripheral ID Register 1 */
+#define RSI_PID6                       0xFFC038E8 /* RSI Peripheral ID Register 2 */
+#define RSI_PID7                       0xFFC038EC /* RSI Peripheral ID Register 3 */
 
 #endif /* _DEF_BF514_H */
-- 
cgit v1.2.3-70-g09d2


From c320afe965bf3f857249d223801d8f2fc95615c2 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Thu, 26 May 2011 18:07:11 -0400
Subject: Blackfin: debug-mmrs: include RSI_PID[4567] MMRs

The documentation is a little iffy as to whether these are actual MMRs,
but reading them on the hardware works, and the previous version of this
logic (the SDH) had PID[4567].  So add it for RSI too.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 arch/blackfin/kernel/debug-mmrs.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/blackfin/kernel/debug-mmrs.c b/arch/blackfin/kernel/debug-mmrs.c
index aa201be49a1..fce4807ceef 100644
--- a/arch/blackfin/kernel/debug-mmrs.c
+++ b/arch/blackfin/kernel/debug-mmrs.c
@@ -1337,6 +1337,10 @@ static int __init bfin_debug_mmrs_init(void)
 	D16(RSI_PID1);
 	D16(RSI_PID2);
 	D16(RSI_PID3);
+	D16(RSI_PID4);
+	D16(RSI_PID5);
+	D16(RSI_PID6);
+	D16(RSI_PID7);
 	D16(RSI_PWR_CONTROL);
 	D16(RSI_RD_WAIT_EN);
 	D32(RSI_RESPONSE0);
-- 
cgit v1.2.3-70-g09d2


From 826267cf1e6c6899eda1325a19f1b1d15c558b20 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sat, 28 May 2011 13:14:09 -0700
Subject: tmpfs: fix race between truncate and writepage

While running fsx on tmpfs with a memhog then swapoff, swapoff was hanging
(interruptibly), repeatedly failing to locate the owner of a 0xff entry in
the swap_map.

Although shmem_writepage() does abandon when it sees incoming page index
is beyond eof, there was still a window in which shmem_truncate_range()
could come in between writepage's dropping lock and updating swap_map,
find the half-completed swap_map entry, and in trying to free it,
leave it in a state that swap_shmem_alloc() could not correct.

Arguably a bug in __swap_duplicate()'s and swap_entry_free()'s handling
of the different cases, but easiest to fix by moving swap_shmem_alloc()
under cover of the lock.

More interesting than the bug: it's been there since 2.6.33, why could
I not see it with earlier kernels?  The mmotm of two weeks ago seems to
have some magic for generating races, this is just one of three I found.

With yesterday's git I first saw this in mainline, bisected in search of
that magic, but the easy reproducibility evaporated.  Oh well, fix the bug.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 1acfb2687bf..d221a1cfd7b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1114,8 +1114,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		delete_from_page_cache(page);
 		shmem_swp_set(info, entry, swap.val);
 		shmem_swp_unmap(entry);
-		spin_unlock(&info->lock);
 		swap_shmem_alloc(swap);
+		spin_unlock(&info->lock);
 		BUG_ON(page_mapped(page));
 		swap_writepage(page, wbc);
 		return 0;
-- 
cgit v1.2.3-70-g09d2


From 5dbe0af47f8a8f968bac2991c3ec974c6e3eaabc Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sat, 28 May 2011 13:17:04 -0700
Subject: mm: fix kernel BUG at mm/rmap.c:1017!

I've hit the "address >= vma->vm_end" check in do_page_add_anon_rmap()
just once.  The stack showed khugepaged allocation trying to compact
pages: the call to page_add_anon_rmap() coming from remove_migration_pte().

That path holds anon_vma lock, but does not hold mmap_sem: it can
therefore race with a split_vma(), and in commit 5f70b962ccc2 "mmap:
avoid unnecessary anon_vma lock" we just took away the anon_vma lock
protection when adjusting vma->vm_end.

I don't think that particular BUG_ON ever caught anything interesting,
so better replace it by a comment, than reinstate the anon_vma locking.

Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 3a39b518a65..ba58ca36fc9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1014,7 +1014,7 @@ void do_page_add_anon_rmap(struct page *page,
 		return;
 
 	VM_BUG_ON(!PageLocked(page));
-	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	/* address might be in next vma when migration races vma_adjust */
 	if (first)
 		__page_set_anon_rmap(page, vma, address, exclusive);
 	else
@@ -1709,7 +1709,7 @@ void hugepage_add_anon_rmap(struct page *page,
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!anon_vma);
-	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	/* address might be in next vma when migration races vma_adjust */
 	first = atomic_inc_and_test(&page->_mapcount);
 	if (first)
 		__hugepage_set_anon_rmap(page, vma, address, 0);
-- 
cgit v1.2.3-70-g09d2


From eee0f252c6537da2e883f75d22cff1427515eaf3 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sat, 28 May 2011 13:20:21 -0700
Subject: mm: fix page_lock_anon_vma leaving mutex locked

On one machine I've been getting hangs, a page fault's anon_vma_prepare()
waiting in anon_vma_lock(), other processes waiting for that page's lock.

This is a replay of last year's f18194275c39 "mm: fix hang on
anon_vma->root->lock".

The new page_lock_anon_vma() places too much faith in its refcount: when
it has acquired the mutex_trylock(), it's possible that a racing task in
anon_vma_alloc() has just reallocated the struct anon_vma, set refcount
to 1, and is about to reset its anon_vma->root.

Fix this by saving anon_vma->root, and relying on the usual page_mapped()
check instead of a refcount check: if page is still mapped, the anon_vma
is still ours; if page is not still mapped, we're no longer interested.

Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index ba58ca36fc9..6bada99cd61 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -405,6 +405,7 @@ out:
 struct anon_vma *page_lock_anon_vma(struct page *page)
 {
 	struct anon_vma *anon_vma = NULL;
+	struct anon_vma *root_anon_vma;
 	unsigned long anon_mapping;
 
 	rcu_read_lock();
@@ -415,13 +416,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
 		goto out;
 
 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
-	if (mutex_trylock(&anon_vma->root->mutex)) {
+	root_anon_vma = ACCESS_ONCE(anon_vma->root);
+	if (mutex_trylock(&root_anon_vma->mutex)) {
 		/*
-		 * If we observe a !0 refcount, then holding the lock ensures
-		 * the anon_vma will not go away, see __put_anon_vma().
+		 * If the page is still mapped, then this anon_vma is still
+		 * its anon_vma, and holding the mutex ensures that it will
+		 * not go away, see __put_anon_vma().
 		 */
-		if (!atomic_read(&anon_vma->refcount)) {
-			anon_vma_unlock(anon_vma);
+		if (!page_mapped(page)) {
+			mutex_unlock(&root_anon_vma->mutex);
 			anon_vma = NULL;
 		}
 		goto out;
-- 
cgit v1.2.3-70-g09d2


From 7467571f4480b273007517b26297c07154c73924 Mon Sep 17 00:00:00 2001
From: Tero Kristo <tero.kristo@nokia.com>
Date: Thu, 24 Feb 2011 17:19:23 +0200
Subject: cpuidle: menu: fixed wrapping timers at 4.294 seconds

Cpuidle menu governor is using u32 as a temporary datatype for storing
nanosecond values which wrap around at 4.294 seconds. This causes errors
in predicted sleep times resulting in higher than should be C state
selection and increased power consumption. This also breaks cpuidle
state residency statistics.

cc: stable@kernel.org # .32.x through .39.x
Signed-off-by: Tero Kristo <tero.kristo@nokia.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/cpuidle/governors/menu.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index f508690eb95..c47f3d09c1e 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -237,6 +237,7 @@ static int menu_select(struct cpuidle_device *dev)
 	unsigned int power_usage = -1;
 	int i;
 	int multiplier;
+	struct timespec t;
 
 	if (data->needs_update) {
 		menu_update(dev);
@@ -251,8 +252,9 @@ static int menu_select(struct cpuidle_device *dev)
 		return 0;
 
 	/* determine the expected residency time, round up */
+	t = ktime_to_timespec(tick_nohz_get_sleep_length());
 	data->expected_us =
-	    DIV_ROUND_UP((u32)ktime_to_ns(tick_nohz_get_sleep_length()), 1000);
+		t.tv_sec * USEC_PER_SEC + t.tv_nsec / NSEC_PER_USEC;
 
 
 	data->bucket = which_bucket(data->expected_us);
-- 
cgit v1.2.3-70-g09d2


From 333c5ae9948194428fe6c5ef5c088304fc98263b Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Fri, 11 Feb 2011 12:49:04 -0800
Subject: idle governor: Avoid lock acquisition to read pm_qos before entering
 idle

Thanks to the reviews and comments by Rafael, James, Mark and Andi.
Here's version 2 of the patch incorporating your comments and also some
update to my previous patch comments.

I noticed that before entering idle state, the menu idle governor will
look up the current pm_qos target value according to the list of qos
requests received.  This look up currently needs the acquisition of a
lock to access the list of qos requests to find the qos target value,
slowing down the entrance into idle state due to contention by multiple
cpus to access this list.  The contention is severe when there are a lot
of cpus waking and going into idle.  For example, for a simple workload
that has 32 pair of processes ping ponging messages to each other, where
64 cpu cores are active in test system, I see the following profile with
37.82% of cpu cycles spent in contention of pm_qos_lock:

-     37.82%          swapper  [kernel.kallsyms]          [k]
_raw_spin_lock_irqsave
   - _raw_spin_lock_irqsave
      - 95.65% pm_qos_request
           menu_select
           cpuidle_idle_call
         - cpu_idle
              99.98% start_secondary

A better approach will be to cache the updated pm_qos target value so
reading it does not require lock acquisition as in the patch below.
With this patch the contention for pm_qos_lock is removed and I saw a
2.2X increase in throughput for my message passing workload.

cc: stable@kernel.org
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Acked-by: Andi Kleen <ak@linux.intel.com>
Acked-by: James Bottomley <James.Bottomley@suse.de>
Acked-by: mark gross <markgross@thegnar.org>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 include/linux/pm_qos_params.h |  4 ++++
 kernel/pm_qos_params.c        | 37 +++++++++++++++++++++++++------------
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/include/linux/pm_qos_params.h b/include/linux/pm_qos_params.h
index 77cbddb3784..a7d87f911ca 100644
--- a/include/linux/pm_qos_params.h
+++ b/include/linux/pm_qos_params.h
@@ -16,6 +16,10 @@
 #define PM_QOS_NUM_CLASSES 4
 #define PM_QOS_DEFAULT_VALUE -1
 
+#define PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE	(2000 * USEC_PER_SEC)
+#define PM_QOS_NETWORK_LAT_DEFAULT_VALUE	(2000 * USEC_PER_SEC)
+#define PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE	0
+
 struct pm_qos_request_list {
 	struct plist_node list;
 	int pm_qos_class;
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index aeaa7f84682..6a8fad82a3a 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -53,11 +53,17 @@ enum pm_qos_type {
 	PM_QOS_MIN		/* return the smallest value */
 };
 
+/*
+ * Note: The lockless read path depends on the CPU accessing
+ * target_value atomically.  Atomic access is only guaranteed on all CPU
+ * types linux supports for 32 bit quantites
+ */
 struct pm_qos_object {
 	struct plist_head requests;
 	struct blocking_notifier_head *notifiers;
 	struct miscdevice pm_qos_power_miscdev;
 	char *name;
+	s32 target_value;	/* Do not change to 64 bit */
 	s32 default_value;
 	enum pm_qos_type type;
 };
@@ -70,7 +76,8 @@ static struct pm_qos_object cpu_dma_pm_qos = {
 	.requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
 	.notifiers = &cpu_dma_lat_notifier,
 	.name = "cpu_dma_latency",
-	.default_value = 2000 * USEC_PER_SEC,
+	.target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
+	.default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
 	.type = PM_QOS_MIN,
 };
 
@@ -79,7 +86,8 @@ static struct pm_qos_object network_lat_pm_qos = {
 	.requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
 	.notifiers = &network_lat_notifier,
 	.name = "network_latency",
-	.default_value = 2000 * USEC_PER_SEC,
+	.target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
+	.default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
 	.type = PM_QOS_MIN
 };
 
@@ -89,7 +97,8 @@ static struct pm_qos_object network_throughput_pm_qos = {
 	.requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
 	.notifiers = &network_throughput_notifier,
 	.name = "network_throughput",
-	.default_value = 0,
+	.target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
+	.default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
 	.type = PM_QOS_MAX,
 };
 
@@ -132,6 +141,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
 	}
 }
 
+static inline s32 pm_qos_read_value(struct pm_qos_object *o)
+{
+	return o->target_value;
+}
+
+static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
+{
+	o->target_value = value;
+}
+
 static void update_target(struct pm_qos_object *o, struct plist_node *node,
 			  int del, int value)
 {
@@ -156,6 +175,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
 		plist_add(node, &o->requests);
 	}
 	curr_value = pm_qos_get_value(o);
+	pm_qos_set_value(o, curr_value);
 	spin_unlock_irqrestore(&pm_qos_lock, flags);
 
 	if (prev_value != curr_value)
@@ -190,18 +210,11 @@ static int find_pm_qos_object_by_minor(int minor)
  * pm_qos_request - returns current system wide qos expectation
  * @pm_qos_class: identification of which qos value is requested
  *
- * This function returns the current target value in an atomic manner.
+ * This function returns the current target value.
  */
 int pm_qos_request(int pm_qos_class)
 {
-	unsigned long flags;
-	int value;
-
-	spin_lock_irqsave(&pm_qos_lock, flags);
-	value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
-	spin_unlock_irqrestore(&pm_qos_lock, flags);
-
-	return value;
+	return pm_qos_read_value(pm_qos_array[pm_qos_class]);
 }
 EXPORT_SYMBOL_GPL(pm_qos_request);
 
-- 
cgit v1.2.3-70-g09d2


From 534bc4e3d27096e2f3fc00c14a20efd597837a4f Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Tue, 26 Apr 2011 16:30:02 +0800
Subject: ACPI EC: enable MSI workaround for Quanta laptops

Enable MSI workaround for Quanta laptops.
https://bugzilla.kernel.org/show_bug.cgi?id=20242

Tested-by: Jan-Matthias Braun <jan_braun@gmx.net>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/ec.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index fa848c4116a..62628e5eba3 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -938,6 +938,14 @@ static struct dmi_system_id __initdata ec_dmi_table[] = {
 	ec_flag_msi, "MSI hardware", {
 	DMI_MATCH(DMI_CHASSIS_VENDOR, "MICRO-STAR")}, NULL},
 	{
+	ec_flag_msi, "Quanta hardware", {
+	DMI_MATCH(DMI_SYS_VENDOR, "Quanta"),
+	DMI_MATCH(DMI_PRODUCT_NAME, "TW8/SW8/DW8"),}, NULL},
+	{
+	ec_flag_msi, "Quanta hardware", {
+	DMI_MATCH(DMI_SYS_VENDOR, "Quanta"),
+	DMI_MATCH(DMI_PRODUCT_NAME, "TW9/SW9"),}, NULL},
+	{
 	ec_validate_ecdt, "ASUS hardware", {
 	DMI_MATCH(DMI_BIOS_VENDOR, "ASUS") }, NULL},
 	{},
-- 
cgit v1.2.3-70-g09d2


From aecad432fd68dafa5b3b497c4816fbfce6fd4066 Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Thu, 26 May 2011 12:26:23 +0200
Subject: ACPI: Cleanup custom_method debug stuff

- Move param aml_debug_output to other params into sysfs.c
- Split acpi_debugfs_init to prepare custom_method to be
  an own .config option and driver.

Signed-off-by: Thomas Renninger <trenn@suse.de>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: rui.zhang@intel.com
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/debugfs.c  | 32 +++++++++++++-------------------
 drivers/acpi/internal.h |  3 ++-
 drivers/acpi/sysfs.c    |  8 ++++++++
 3 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/drivers/acpi/debugfs.c b/drivers/acpi/debugfs.c
index 384f7abcff7..e7abc6e3bba 100644
--- a/drivers/acpi/debugfs.c
+++ b/drivers/acpi/debugfs.c
@@ -12,13 +12,8 @@
 #define _COMPONENT		ACPI_SYSTEM_COMPONENT
 ACPI_MODULE_NAME("debugfs");
 
-
-/* /sys/modules/acpi/parameters/aml_debug_output */
-
-module_param_named(aml_debug_output, acpi_gbl_enable_aml_debug_object,
-		   bool, 0644);
-MODULE_PARM_DESC(aml_debug_output,
-		 "To enable/disable the ACPI Debug Object output.");
+struct dentry *acpi_debugfs_dir;
+static struct dentry *cm_dentry;
 
 /* /sys/kernel/debug/acpi/custom_method */
 
@@ -80,23 +75,22 @@ static const struct file_operations cm_fops = {
 	.llseek = default_llseek,
 };
 
-int __init acpi_debugfs_init(void)
+static int __init acpi_custom_method_init(void)
 {
-	struct dentry *acpi_dir, *cm_dentry;
-
-	acpi_dir = debugfs_create_dir("acpi", NULL);
-	if (!acpi_dir)
-		goto err;
+	if (!acpi_debugfs_dir)
+		return -ENOENT;
 
 	cm_dentry = debugfs_create_file("custom_method", S_IWUSR,
-					acpi_dir, NULL, &cm_fops);
+					acpi_debugfs_dir, NULL, &cm_fops);
 	if (!cm_dentry)
-		goto err;
+		return -ENODEV;
 
 	return 0;
+}
+
+void __init acpi_debugfs_init(void)
+{
+	acpi_debugfs_dir = debugfs_create_dir("acpi", NULL);
 
-err:
-	if (acpi_dir)
-		debugfs_remove(acpi_dir);
-	return -EINVAL;
+	acpi_custom_method_init();
 }
diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index 4bfb759deb1..ca75b9ce048 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -28,9 +28,10 @@ int acpi_scan_init(void);
 int acpi_sysfs_init(void);
 
 #ifdef CONFIG_DEBUG_FS
+extern struct dentry *acpi_debugfs_dir;
 int acpi_debugfs_init(void);
 #else
-static inline int acpi_debugfs_init(void) { return 0; }
+static inline void acpi_debugfs_init(void) { return; }
 #endif
 
 /* --------------------------------------------------------------------------
diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c
index 61891e75583..77255f250db 100644
--- a/drivers/acpi/sysfs.c
+++ b/drivers/acpi/sysfs.c
@@ -220,6 +220,14 @@ module_param_call(trace_state, param_set_trace_state, param_get_trace_state,
 		  NULL, 0644);
 #endif /* CONFIG_ACPI_DEBUG */
 
+
+/* /sys/modules/acpi/parameters/aml_debug_output */
+
+module_param_named(aml_debug_output, acpi_gbl_enable_aml_debug_object,
+		   bool, 0644);
+MODULE_PARM_DESC(aml_debug_output,
+		 "To enable/disable the ACPI Debug Object output.");
+
 /* /sys/module/acpi/parameters/acpica_version */
 static int param_get_acpica_version(char *buffer, struct kernel_param *kp)
 {
-- 
cgit v1.2.3-70-g09d2


From 526b4af47f44148c9d665e57723ed9f86634c6e3 Mon Sep 17 00:00:00 2001
From: Thomas Renninger <trenn@suse.de>
Date: Thu, 26 May 2011 12:26:24 +0200
Subject: ACPI: Split out custom_method functionality into an own driver

With /sys/kernel/debug/acpi/custom_method root can write
to arbitrary memory and increase his priveleges, even if
these are restricted.

-> Make this an own debug .config option and warn about the
security issue in the config description.

-> Still keep acpi/debugfs.c which now only creates an empty
   /sys/kernel/debug/acpi directory. There might be other
   users of it later.

Signed-off-by: Thomas Renninger <trenn@suse.de>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: rui.zhang@intel.com
Signed-off-by: Len Brown <len.brown@intel.com>
---
 Documentation/acpi/method-customizing.txt |   5 ++
 drivers/acpi/Kconfig                      |  15 +++++
 drivers/acpi/Makefile                     |   1 +
 drivers/acpi/custom_method.c              | 100 ++++++++++++++++++++++++++++++
 drivers/acpi/debugfs.c                    |  80 +-----------------------
 5 files changed, 122 insertions(+), 79 deletions(-)
 create mode 100644 drivers/acpi/custom_method.c

diff --git a/Documentation/acpi/method-customizing.txt b/Documentation/acpi/method-customizing.txt
index 3e1d25aee3f..5f55373dd53 100644
--- a/Documentation/acpi/method-customizing.txt
+++ b/Documentation/acpi/method-customizing.txt
@@ -66,3 +66,8 @@ Note: We can use a kernel with multiple custom ACPI method running,
       But each individual write to debugfs can implement a SINGLE
       method override. i.e. if we want to insert/override multiple
       ACPI methods, we need to redo step c) ~ g) for multiple times.
+
+Note: Be aware that root can mis-use this driver to modify arbitrary
+      memory and gain additional rights, if root's privileges got
+      restricted (for example if root is not allowed to load additional
+      modules after boot).
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 3a17ca5fff6..d918e130bef 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -380,6 +380,21 @@ config ACPI_HED
 	  which is used to report some hardware errors notified via
 	  SCI, mainly the corrected errors.
 
+config ACPI_CUSTOM_METHOD
+	tristate "Allow ACPI methods to be inserted/replaced at run time"
+	depends on DEBUG_FS
+	default n
+	help
+	  This debug facility allows ACPI AML methods to me inserted and/or
+	  replaced without rebooting the system. For details refer to:
+	  Documentation/acpi/method-customizing.txt.
+
+	  NOTE: This option is security sensitive, because it allows arbitrary
+	  kernel memory to be written to by root (uid=0) users, allowing them
+	  to bypass certain security measures (e.g. if root is not allowed to
+	  load additional kernel modules after boot, this feature may be used
+	  to override that restriction).
+
 source "drivers/acpi/apei/Kconfig"
 
 endif	# ACPI
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index d113fa5100b..cba0b2334b9 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_ACPI_SBS)		+= sbs.o
 obj-$(CONFIG_ACPI_POWER_METER)	+= power_meter.o
 obj-$(CONFIG_ACPI_HED)		+= hed.o
 obj-$(CONFIG_ACPI_EC_DEBUGFS)	+= ec_sys.o
+obj-$(CONFIG_ACPI_CUSTOM_METHOD)+= custom_method.o
 
 # processor has its own "processor." module_param namespace
 processor-y			:= processor_driver.o processor_throttling.o
diff --git a/drivers/acpi/custom_method.c b/drivers/acpi/custom_method.c
new file mode 100644
index 00000000000..5d42c2414ae
--- /dev/null
+++ b/drivers/acpi/custom_method.c
@@ -0,0 +1,100 @@
+/*
+ * debugfs.c - ACPI debugfs interface to userspace.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <acpi/acpi_drivers.h>
+
+#include "internal.h"
+
+#define _COMPONENT		ACPI_SYSTEM_COMPONENT
+ACPI_MODULE_NAME("custom_method");
+MODULE_LICENSE("GPL");
+
+static struct dentry *cm_dentry;
+
+/* /sys/kernel/debug/acpi/custom_method */
+
+static ssize_t cm_write(struct file *file, const char __user * user_buf,
+			size_t count, loff_t *ppos)
+{
+	static char *buf;
+	static u32 max_size;
+	static u32 uncopied_bytes;
+
+	struct acpi_table_header table;
+	acpi_status status;
+
+	if (!(*ppos)) {
+		/* parse the table header to get the table length */
+		if (count <= sizeof(struct acpi_table_header))
+			return -EINVAL;
+		if (copy_from_user(&table, user_buf,
+				   sizeof(struct acpi_table_header)))
+			return -EFAULT;
+		uncopied_bytes = max_size = table.length;
+		buf = kzalloc(max_size, GFP_KERNEL);
+		if (!buf)
+			return -ENOMEM;
+	}
+
+	if (buf == NULL)
+		return -EINVAL;
+
+	if ((*ppos > max_size) ||
+	    (*ppos + count > max_size) ||
+	    (*ppos + count < count) ||
+	    (count > uncopied_bytes))
+		return -EINVAL;
+
+	if (copy_from_user(buf + (*ppos), user_buf, count)) {
+		kfree(buf);
+		buf = NULL;
+		return -EFAULT;
+	}
+
+	uncopied_bytes -= count;
+	*ppos += count;
+
+	if (!uncopied_bytes) {
+		status = acpi_install_method(buf);
+		kfree(buf);
+		buf = NULL;
+		if (ACPI_FAILURE(status))
+			return -EINVAL;
+		add_taint(TAINT_OVERRIDDEN_ACPI_TABLE);
+	}
+
+	return count;
+}
+
+static const struct file_operations cm_fops = {
+	.write = cm_write,
+	.llseek = default_llseek,
+};
+
+static int __init acpi_custom_method_init(void)
+{
+	if (acpi_debugfs_dir == NULL)
+		return -ENOENT;
+
+	cm_dentry = debugfs_create_file("custom_method", S_IWUSR,
+					acpi_debugfs_dir, NULL, &cm_fops);
+	if (cm_dentry == NULL)
+		return -ENODEV;
+
+	return 0;
+}
+
+static void __exit acpi_custom_method_exit(void)
+{
+	if (cm_dentry)
+		debugfs_remove(cm_dentry);
+ }
+
+module_init(acpi_custom_method_init);
+module_exit(acpi_custom_method_exit);
diff --git a/drivers/acpi/debugfs.c b/drivers/acpi/debugfs.c
index e7abc6e3bba..182a9fc3635 100644
--- a/drivers/acpi/debugfs.c
+++ b/drivers/acpi/debugfs.c
@@ -3,9 +3,6 @@
  */
 
 #include <linux/init.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/uaccess.h>
 #include <linux/debugfs.h>
 #include <acpi/acpi_drivers.h>
 
@@ -13,84 +10,9 @@
 ACPI_MODULE_NAME("debugfs");
 
 struct dentry *acpi_debugfs_dir;
-static struct dentry *cm_dentry;
-
-/* /sys/kernel/debug/acpi/custom_method */
-
-static ssize_t cm_write(struct file *file, const char __user * user_buf,
-			size_t count, loff_t *ppos)
-{
-	static char *buf;
-	static u32 max_size;
-	static u32 uncopied_bytes;
-
-	struct acpi_table_header table;
-	acpi_status status;
-
-	if (!(*ppos)) {
-		/* parse the table header to get the table length */
-		if (count <= sizeof(struct acpi_table_header))
-			return -EINVAL;
-		if (copy_from_user(&table, user_buf,
-				   sizeof(struct acpi_table_header)))
-			return -EFAULT;
-		uncopied_bytes = max_size = table.length;
-		buf = kzalloc(max_size, GFP_KERNEL);
-		if (!buf)
-			return -ENOMEM;
-	}
-
-	if (buf == NULL)
-		return -EINVAL;
-
-	if ((*ppos > max_size) ||
-	    (*ppos + count > max_size) ||
-	    (*ppos + count < count) ||
-	    (count > uncopied_bytes))
-		return -EINVAL;
-
-	if (copy_from_user(buf + (*ppos), user_buf, count)) {
-		kfree(buf);
-		buf = NULL;
-		return -EFAULT;
-	}
-
-	uncopied_bytes -= count;
-	*ppos += count;
-
-	if (!uncopied_bytes) {
-		status = acpi_install_method(buf);
-		kfree(buf);
-		buf = NULL;
-		if (ACPI_FAILURE(status))
-			return -EINVAL;
-		add_taint(TAINT_OVERRIDDEN_ACPI_TABLE);
-	}
-
-	return count;
-}
-
-static const struct file_operations cm_fops = {
-	.write = cm_write,
-	.llseek = default_llseek,
-};
-
-static int __init acpi_custom_method_init(void)
-{
-	if (!acpi_debugfs_dir)
-		return -ENOENT;
-
-	cm_dentry = debugfs_create_file("custom_method", S_IWUSR,
-					acpi_debugfs_dir, NULL, &cm_fops);
-	if (!cm_dentry)
-		return -ENODEV;
-
-	return 0;
-}
+EXPORT_SYMBOL_GPL(acpi_debugfs_dir);
 
 void __init acpi_debugfs_init(void)
 {
 	acpi_debugfs_dir = debugfs_create_dir("acpi", NULL);
-
-	acpi_custom_method_init();
 }
-- 
cgit v1.2.3-70-g09d2


From 5be7ef002469eaf757065ee16b4100c6d3ead1cc Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sat, 28 May 2011 17:26:40 -0700
Subject: scsi: fix scsi_proc new kernel-doc warning

Fix kernel-doc warnings in scsi_proc.c:

  Warning(drivers/scsi/scsi_proc.c:390): No description found for parameter 'dev'
  Warning(drivers/scsi/scsi_proc.c:390): No description found for parameter 'data'
  Warning(drivers/scsi/scsi_proc.c:390): Excess function parameter 's' description in 'always_match'
  Warning(drivers/scsi/scsi_proc.c:390): Excess function parameter 'p' description in 'always_match'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/scsi/scsi_proc.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/scsi/scsi_proc.c b/drivers/scsi/scsi_proc.c
index f46855cd853..ad747dc337d 100644
--- a/drivers/scsi/scsi_proc.c
+++ b/drivers/scsi/scsi_proc.c
@@ -381,11 +381,6 @@ static ssize_t proc_scsi_write(struct file *file, const char __user *buf,
 	return err;
 }
 
-/**
- * proc_scsi_show - show contents of /proc/scsi/scsi (attached devices)
- * @s: output goes here
- * @p: not used
- */
 static int always_match(struct device *dev, void *data)
 {
 	return 1;
-- 
cgit v1.2.3-70-g09d2


From 932df7414336a00f45e5aec62724cf736b0bcfd4 Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Mon, 16 May 2011 09:11:00 +0800
Subject: ACPI: processor: fix processor_physically_present in UP kernel

Usually, there are multiple processors defined in ACPI table, for
example

    Scope (_PR)
    {
        Processor (CPU0, 0x00, 0x00000410, 0x06) {}
        Processor (CPU1, 0x01, 0x00000410, 0x06) {}
        Processor (CPU2, 0x02, 0x00000410, 0x06) {}
        Processor (CPU3, 0x03, 0x00000410, 0x06) {}
    }

processor_physically_present(...) will be called to check whether those
processors are physically present.

Currently we have below codes in processor_physically_present,

cpuid = acpi_get_cpuid(...);
if ((cpuid == -1) && (num_possible_cpus() > 1))
        return false;
return true;

In UP kernel, acpi_get_cpuid(...) always return -1 and
num_possible_cpus() always return 1, so
processor_physically_present(...) always returns true for all passed in
processor handles.

This is wrong for UP processor or SMP processor running UP kernel.

This patch removes the !SMP version of acpi_get_cpuid(), so both UP and
SMP kernel use the same acpi_get_cpuid function.

And for UP kernel, only processor 0 is valid.

https://bugzilla.kernel.org/show_bug.cgi?id=16548
https://bugzilla.kernel.org/show_bug.cgi?id=16357

Tested-by: Anton Kochkov <anton.kochkov@gmail.com>
Tested-by: Ambroz Bizjak <ambrop7@gmail.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/processor_core.c | 12 +++++++++---
 include/acpi/processor.h      |  7 -------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 25bf17da69f..02d2a4c9084 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -37,7 +37,6 @@ static struct dmi_system_id __initdata processor_idle_dmi_table[] = {
 	{},
 };
 
-#ifdef CONFIG_SMP
 static int map_lapic_id(struct acpi_subtable_header *entry,
 		 u32 acpi_id, int *apic_id)
 {
@@ -165,7 +164,9 @@ exit:
 
 int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id)
 {
+#ifdef CONFIG_SMP
 	int i;
+#endif
 	int apic_id = -1;
 
 	apic_id = map_mat_entry(handle, type, acpi_id);
@@ -174,14 +175,19 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id)
 	if (apic_id == -1)
 		return apic_id;
 
+#ifdef CONFIG_SMP
 	for_each_possible_cpu(i) {
 		if (cpu_physical_id(i) == apic_id)
 			return i;
 	}
+#else
+	/* In UP kernel, only processor 0 is valid */
+	if (apic_id == 0)
+		return apic_id;
+#endif
 	return -1;
 }
 EXPORT_SYMBOL_GPL(acpi_get_cpuid);
-#endif
 
 static bool __init processor_physically_present(acpi_handle handle)
 {
@@ -217,7 +223,7 @@ static bool __init processor_physically_present(acpi_handle handle)
 	type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0;
 	cpuid = acpi_get_cpuid(handle, type, acpi_id);
 
-	if ((cpuid == -1) && (num_possible_cpus() > 1))
+	if (cpuid == -1)
 		return false;
 
 	return true;
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index 55192ac0ced..ba4928cae47 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -310,14 +310,7 @@ static inline int acpi_processor_get_bios_limit(int cpu, unsigned int *limit)
 
 /* in processor_core.c */
 void acpi_processor_set_pdc(acpi_handle handle);
-#ifdef CONFIG_SMP
 int acpi_get_cpuid(acpi_handle, int type, u32 acpi_id);
-#else
-static inline int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id)
-{
-	return -1;
-}
-#endif
 
 /* in processor_throttling.c */
 int acpi_processor_tstate_has_changed(struct acpi_processor *pr);
-- 
cgit v1.2.3-70-g09d2


From 28c2103dad04dba29ba86e22dad5735db8f0e13c Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Wed, 4 May 2011 22:56:43 +0800
Subject: ACPI: Add D3 cold state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_SxW returns an Integer containing the lowest D-state supported in state
Sx. If OSPM has not indicated that it supports _PR3, then the value “3”
corresponds to D3.  If it has indicated _PR3 support, the value “3”
represents D3hot and the value “4” represents D3cold.

Linux does set _OSC._PR3, so we should fix it to expect that _SxW can
return 4.

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/bus.c     | 2 +-
 drivers/pci/pci-acpi.c | 2 ++
 include/acpi/actypes.h | 5 +++--
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 9749980ca6c..d1e06c182cd 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -227,7 +227,7 @@ static int __acpi_bus_set_power(struct acpi_device *device, int state)
 	acpi_status status = AE_OK;
 	char object_name[5] = { '_', 'P', 'S', '0' + state, '\0' };
 
-	if (!device || (state < ACPI_STATE_D0) || (state > ACPI_STATE_D3))
+	if (!device || (state < ACPI_STATE_D0) || (state > ACPI_STATE_D3_COLD))
 		return -EINVAL;
 
 	/* Make sure this is a valid target state */
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index 7c3b18e78ce..d36f41ea8cb 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -195,6 +195,8 @@ static pci_power_t acpi_pci_choose_state(struct pci_dev *pdev)
 		return PCI_D2;
 	case ACPI_STATE_D3:
 		return PCI_D3hot;
+	case ACPI_STATE_D3_COLD:
+		return PCI_D3cold;
 	}
 	return PCI_POWER_ERROR;
 }
diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h
index 64f838beaab..f72cbe57429 100644
--- a/include/acpi/actypes.h
+++ b/include/acpi/actypes.h
@@ -501,8 +501,9 @@ typedef u64 acpi_integer;
 #define ACPI_STATE_D1                   (u8) 1
 #define ACPI_STATE_D2                   (u8) 2
 #define ACPI_STATE_D3                   (u8) 3
-#define ACPI_D_STATES_MAX               ACPI_STATE_D3
-#define ACPI_D_STATE_COUNT              4
+#define ACPI_STATE_D3_COLD              (u8) 4
+#define ACPI_D_STATES_MAX               ACPI_STATE_D3_COLD
+#define ACPI_D_STATE_COUNT              5
 
 #define ACPI_STATE_C0                   (u8) 0
 #define ACPI_STATE_C1                   (u8) 1
-- 
cgit v1.2.3-70-g09d2


From 08b53f0e6b565fe8dc0b8f929960ed16d76291bd Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Tue, 26 Apr 2011 16:29:55 +0800
Subject: ACPI EC: remove redundant code

ec->handle is set in ec_parse_device(), so don't bother to set it again.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/ec.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index b3f1d6f52a8..ed8cf17cf03 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -804,8 +804,6 @@ static int acpi_ec_add(struct acpi_device *device)
 			return -EINVAL;
 	}
 
-	ec->handle = device->handle;
-
 	/* Find and register all query methods */
 	acpi_walk_namespace(ACPI_TYPE_METHOD, ec->handle, 1,
 			    acpi_ec_register_query_methods, NULL, ec, NULL);
-- 
cgit v1.2.3-70-g09d2


From 02c68a02018669d1817c43c42de800975cbec467 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 1 Apr 2011 16:59:53 -0400
Subject: x86 idle: clarify AMD erratum 400 workaround

The workaround for AMD erratum 400 uses the term "c1e" falsely suggesting:
1. Intel C1E is somehow involved
2. All AMD processors with C1E are involved

Use the string "amd_c1e" instead of simply "c1e" to clarify that
this workaround is specific to AMD's version of C1E.
Use the string "e400" to clarify that the workaround is specific
to AMD processors with Erratum 400.

This patch is text-substitution only, with no functional change.

cc: x86@kernel.org
Acked-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/include/asm/acpi.h      |  2 +-
 arch/x86/include/asm/idle.h      |  2 +-
 arch/x86/include/asm/processor.h |  4 ++--
 arch/x86/kernel/cpu/common.c     |  2 +-
 arch/x86/kernel/process.c        | 38 +++++++++++++++++++-------------------
 arch/x86/kernel/smpboot.c        |  2 +-
 drivers/acpi/processor_idle.c    |  2 +-
 7 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4ea15ca89b2..52fd57f95c5 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -138,7 +138,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
 	    boot_cpu_data.x86_model <= 0x05 &&
 	    boot_cpu_data.x86_mask < 0x0A)
 		return 1;
-	else if (c1e_detected)
+	else if (amd_e400_c1e_detected)
 		return 1;
 	else
 		return max_cstate;
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index 38d87379e27..f49253d7571 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -16,6 +16,6 @@ static inline void enter_idle(void) { }
 static inline void exit_idle(void) { }
 #endif /* CONFIG_X86_64 */
 
-void c1e_remove_cpu(int cpu);
+void amd_e400_remove_cpu(int cpu);
 
 #endif /* _ASM_X86_IDLE_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 45636cefa18..b9c03fb3369 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -758,10 +758,10 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
 
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
-extern void init_c1e_mask(void);
+extern void init_amd_e400_c1e_mask(void);
 
 extern unsigned long		boot_option_idle_override;
-extern bool			c1e_detected;
+extern bool			amd_e400_c1e_detected;
 
 enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
 			 IDLE_POLL, IDLE_FORCE_MWAIT};
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1d59834396b..745a602f204 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -887,7 +887,7 @@ static void vgetcpu_set_mode(void)
 void __init identify_boot_cpu(void)
 {
 	identify_cpu(&boot_cpu_data);
-	init_c1e_mask();
+	init_amd_e400_c1e_mask();
 #ifdef CONFIG_X86_32
 	sysenter_setup();
 	enable_sep_cpu();
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff455419898..2efbfb712fb 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -538,45 +538,45 @@ int mwait_usable(const struct cpuinfo_x86 *c)
 	return (edx & MWAIT_EDX_C1);
 }
 
-bool c1e_detected;
-EXPORT_SYMBOL(c1e_detected);
+bool amd_e400_c1e_detected;
+EXPORT_SYMBOL(amd_e400_c1e_detected);
 
-static cpumask_var_t c1e_mask;
+static cpumask_var_t amd_e400_c1e_mask;
 
-void c1e_remove_cpu(int cpu)
+void amd_e400_remove_cpu(int cpu)
 {
-	if (c1e_mask != NULL)
-		cpumask_clear_cpu(cpu, c1e_mask);
+	if (amd_e400_c1e_mask != NULL)
+		cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
 }
 
 /*
- * C1E aware idle routine. We check for C1E active in the interrupt
+ * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
  * pending message MSR. If we detect C1E, then we handle it the same
  * way as C3 power states (local apic timer and TSC stop)
  */
-static void c1e_idle(void)
+static void amd_e400_idle(void)
 {
 	if (need_resched())
 		return;
 
-	if (!c1e_detected) {
+	if (!amd_e400_c1e_detected) {
 		u32 lo, hi;
 
 		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
 
 		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
-			c1e_detected = true;
+			amd_e400_c1e_detected = true;
 			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
 				mark_tsc_unstable("TSC halt in AMD C1E");
 			printk(KERN_INFO "System has AMD C1E enabled\n");
 		}
 	}
 
-	if (c1e_detected) {
+	if (amd_e400_c1e_detected) {
 		int cpu = smp_processor_id();
 
-		if (!cpumask_test_cpu(cpu, c1e_mask)) {
-			cpumask_set_cpu(cpu, c1e_mask);
+		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
+			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
 			/*
 			 * Force broadcast so ACPI can not interfere.
 			 */
@@ -619,17 +619,17 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 		pm_idle = mwait_idle;
 	} else if (cpu_has_amd_erratum(amd_erratum_400)) {
 		/* E400: APIC timer interrupt does not wake up CPU from C1e */
-		printk(KERN_INFO "using C1E aware idle routine\n");
-		pm_idle = c1e_idle;
+		printk(KERN_INFO "using AMD E400 aware idle routine\n");
+		pm_idle = amd_e400_idle;
 	} else
 		pm_idle = default_idle;
 }
 
-void __init init_c1e_mask(void)
+void __init init_amd_e400_c1e_mask(void)
 {
-	/* If we're using c1e_idle, we need to allocate c1e_mask. */
-	if (pm_idle == c1e_idle)
-		zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
+	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
+	if (pm_idle == amd_e400_idle)
+		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
 }
 
 static int __init idle_setup(char *str)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 08776a95348..2c33633595c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1379,7 +1379,7 @@ void play_dead_common(void)
 {
 	idle_task_exit();
 	reset_lazy_tlbstate();
-	c1e_remove_cpu(raw_smp_processor_id());
+	amd_e400_remove_cpu(raw_smp_processor_id());
 
 	mb();
 	/* Ack it */
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index d615b7d69bc..431ab11c8c1 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -161,7 +161,7 @@ static void lapic_timer_check_state(int state, struct acpi_processor *pr,
 	if (cpu_has(&cpu_data(pr->id), X86_FEATURE_ARAT))
 		return;
 
-	if (c1e_detected)
+	if (amd_e400_c1e_detected)
 		type = ACPI_STATE_C1;
 
 	/*
-- 
cgit v1.2.3-70-g09d2


From 06ae40ce073daf233607a3c54a489f2c1e44683e Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 1 Apr 2011 15:28:09 -0400
Subject: x86 idle: EXPORT_SYMBOL(default_idle, pm_idle) only when APM demands
 it

In the long run, we don't want default_idle() or (pm_idle)() to
be exported outside of process.c.  Start by not exporting them
to modules, unless the APM build demands it.

cc: x86@kernel.org
cc: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/process.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 2efbfb712fb..84f3cdae440 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -340,7 +340,9 @@ EXPORT_SYMBOL(boot_option_idle_override);
  * Powermanagement idle function, if any..
  */
 void (*pm_idle)(void);
+#if defined(CONFIG_APM_MODULE) && defined(CONFIG_APM_CPU_IDLE)
 EXPORT_SYMBOL(pm_idle);
+#endif
 
 #ifdef CONFIG_X86_32
 /*
@@ -400,7 +402,7 @@ void default_idle(void)
 		cpu_relax();
 	}
 }
-#ifdef CONFIG_APM_MODULE
+#if defined(CONFIG_APM_MODULE) && defined(CONFIG_APM_CPU_IDLE)
 EXPORT_SYMBOL(default_idle);
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 3b70b2e5fcf6315eb833a1bcc2b810bdc75484ff Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 1 Apr 2011 15:08:48 -0400
Subject: x86 idle floppy: deprecate disable_hlt()

Plan to remove floppy_disable_hlt in 2012, an ancient
workaround with comments that it should be removed.

This allows us to remove clutter and a run-time branch
from the idle code.

WARN_ONCE() on invocation until it is removed.

cc: x86@kernel.org
cc: stable@kernel.org # .39.x
Signed-off-by: Len Brown <len.brown@intel.com>
---
 Documentation/feature-removal-schedule.txt | 8 ++++++++
 drivers/block/floppy.c                     | 1 +
 2 files changed, 9 insertions(+)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index b3f35e5f9c9..5540615ac26 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,6 +6,14 @@ be removed from this file.
 
 ---------------------------
 
+What:	x86 floppy disable_hlt
+When:	2012
+Why:	ancient workaround of dubious utility clutters the
+	code used by everybody else.
+Who:	Len Brown <len.brown@intel.com>
+
+---------------------------
+
 What:	PRISM54
 When:	2.6.34
 
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 77fc76f8aea..20aea9b511b 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -1038,6 +1038,7 @@ static void floppy_disable_hlt(void)
 {
 	unsigned long flags;
 
+	WARN_ONCE(1, "floppy_disable_hlt() scheduled for removal in 2012");
 	spin_lock_irqsave(&floppy_hlt_lock, flags);
 	if (!hlt_disabled) {
 		hlt_disabled = 1;
-- 
cgit v1.2.3-70-g09d2


From 99c63221435963e0cee2402686ba99293c2ffa9e Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 1 Apr 2011 15:19:23 -0400
Subject: x86 idle APM: deprecate CONFIG_APM_CPU_IDLE

We don't want to export the pm_idle function pointer to modules.
Currently CONFIG_APM_CPU_IDLE w/ CONFIG_APM_MODULE forces us to.

CONFIG_APM_CPU_IDLE is of dubious value, it runs only on 32-bit
uniprocessor laptops that are over 10 years old.  It calls into
the BIOS during idle, and is known to cause a number of machines
to fail.

Removing CONFIG_APM_CPU_IDLE and will allow us to stop exporting
pm_idle.  Any systems that were calling into the APM BIOS
at run-time will simply use HLT instead.

cc: x86@kernel.org
cc: Jiri Kosina <jkosina@suse.cz>
cc: stable@kernel.org # .39.x
Signed-off-by: Len Brown <len.brown@intel.com>
---
 Documentation/feature-removal-schedule.txt | 10 ++++++++++
 arch/x86/kernel/apm_32.c                   |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 5540615ac26..fc505c1b476 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -14,6 +14,16 @@ Who:	Len Brown <len.brown@intel.com>
 
 ---------------------------
 
+What:	CONFIG_APM_CPU_IDLE, and its ability to call APM BIOS in idle
+When:	2012
+Why:	This optional sub-feature of APM is of dubious reliability,
+	and ancient APM laptops are likely better served by calling HLT.
+	Deleting CONFIG_APM_CPU_IDLE allows x86 to stop exporting
+	the pm_idle function pointer to modules.
+Who:	Len Brown <len.brown@intel.com>
+
+----------------------------
+
 What:	PRISM54
 When:	2.6.34
 
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 0e4f24c2a74..4c4ac32cd12 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -359,6 +359,7 @@ struct apm_user {
  * idle percentage above which bios idle calls are done
  */
 #ifdef CONFIG_APM_CPU_IDLE
+#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012
 #define DEFAULT_IDLE_THRESHOLD	95
 #else
 #define DEFAULT_IDLE_THRESHOLD	100
@@ -902,6 +903,7 @@ static void apm_cpu_idle(void)
 	unsigned int jiffies_since_last_check = jiffies - last_jiffies;
 	unsigned int bucket;
 
+	WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012");
 recalc:
 	if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
 		use_apm_idle = 0;
-- 
cgit v1.2.3-70-g09d2


From cdaab4a0d330f70c0e5ad8c3f7c65c2e375ea180 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 1 Apr 2011 15:41:17 -0400
Subject: x86 idle: deprecate "no-hlt" cmdline param

We'd rather that modern machines not check if HLT works on
every entry into idle, for the benefit of machines that had
marginal electricals 15-years ago.  If those machines are still running
the upstream kernel, they can use "idle=poll".  The only difference
will be that they'll now invoke HLT in machine_hlt().

cc: x86@kernel.org # .39.x
Signed-off-by: Len Brown <len.brown@intel.com>
---
 Documentation/feature-removal-schedule.txt | 11 +++++++++++
 arch/x86/kernel/cpu/bugs.c                 |  1 +
 2 files changed, 12 insertions(+)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index fc505c1b476..f1b0eb02aa1 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -24,6 +24,17 @@ Who:	Len Brown <len.brown@intel.com>
 
 ----------------------------
 
+What:	x86_32 "no-hlt" cmdline param
+When:	2012
+Why:	remove a branch from idle path, simplify code used by everybody.
+	This option disabled the use of HLT in idle and machine_halt()
+	for hardware that was flakey 15-years ago.  Today we have
+	"idle=poll" that removed HLT from idle, and so if such a machine
+	is still running the upstream kernel, "idle=poll" is likely sufficient.
+Who:	Len Brown <len.brown@intel.com>
+
+----------------------------
+
 What:	PRISM54
 When:	2.6.34
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c39576cb301..525514cf33c 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -19,6 +19,7 @@
 
 static int __init no_halt(char *s)
 {
+	WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n");
 	boot_cpu_data.hlt_works_ok = 0;
 	return 1;
 }
-- 
cgit v1.2.3-70-g09d2


From 5d4c47e0195b989f284907358bd5c268a44b91c7 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 1 Apr 2011 15:46:09 -0400
Subject: x86 idle: deprecate mwait_idle() and "idle=mwait" cmdline param

mwait_idle() is a C1-only idle loop intended to be more efficient
than HLT on SMP hardware that supports it.

But mwait_idle() has been replaced by the more general
mwait_idle_with_hints(), which handles both C1 and deeper C-states.
ACPI uses only mwait_idle_with_hints(), and never uses mwait_idle().

Deprecate mwait_idle() and the "idle=mwait" cmdline param
to simplify the x86 idle code.

After this change, kernels configured with
(!CONFIG_ACPI=n && !CONFIG_INTEL_IDLE=n) when run on hardware
that support MWAIT will simply use HLT.  If MWAIT is desired
on those systems, cpuidle and the cpuidle drivers above
can be used.

cc: x86@kernel.org
cc: stable@kernel.org # .39.x
Signed-off-by: Len Brown <len.brown@intel.com>
---
 Documentation/feature-removal-schedule.txt | 7 +++++++
 arch/x86/kernel/process.c                  | 1 +
 2 files changed, 8 insertions(+)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index f1b0eb02aa1..13f0bb3187b 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -35,6 +35,13 @@ Who:	Len Brown <len.brown@intel.com>
 
 ----------------------------
 
+What:	x86 "idle=mwait" cmdline param
+When:	2012
+Why:	simplify x86 idle code
+Who:	Len Brown <len.brown@intel.com>
+
+----------------------------
+
 What:	PRISM54
 When:	2.6.34
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 84f3cdae440..8fb182956cb 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -645,6 +645,7 @@ static int __init idle_setup(char *str)
 		boot_option_idle_override = IDLE_POLL;
 	} else if (!strcmp(str, "mwait")) {
 		boot_option_idle_override = IDLE_FORCE_MWAIT;
+		WARN_ONCE(1, "\idle=mwait\" will be removed in 2012\"\n");
 	} else if (!strcmp(str, "halt")) {
 		/*
 		 * When the boot option of idle=halt is added, halt is
-- 
cgit v1.2.3-70-g09d2


From d5b72ce15ea99a0b8f0d1973074c584daf92d70e Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Sun, 29 May 2011 00:38:46 +0100
Subject: Squashfs: Fix sanity check patches on big-endian systems

le64 values should be swapped when accessing on
big-endian systems.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/export.c   | 2 +-
 fs/squashfs/fragment.c | 2 +-
 fs/squashfs/id.c       | 2 +-
 fs/squashfs/super.c    | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 730c56248c9..5e1101ff276 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -147,7 +147,7 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
 	 * table[0] points to the first inode lookup table metadata block,
 	 * this should be less than lookup_table_start
 	 */
-	if (!IS_ERR(table) && table[0] >= lookup_table_start) {
+	if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) {
 		kfree(table);
 		return ERR_PTR(-EINVAL);
 	}
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 1516a6490bf..0ed6edbc5c7 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -90,7 +90,7 @@ __le64 *squashfs_read_fragment_index_table(struct super_block *sb,
 	 * table[0] points to the first fragment table metadata block, this
 	 * should be less than fragment_table_start
 	 */
-	if (!IS_ERR(table) && table[0] >= fragment_table_start) {
+	if (!IS_ERR(table) && le64_to_cpu(table[0]) >= fragment_table_start) {
 		kfree(table);
 		return ERR_PTR(-EINVAL);
 	}
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index a70858e0fb4..d38ea3dab95 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -93,7 +93,7 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
 	 * table[0] points to the first id lookup table metadata block, this
 	 * should be less than id_table_start
 	 */
-	if (!IS_ERR(table) && table[0] >= id_table_start) {
+	if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) {
 		kfree(table);
 		return ERR_PTR(-EINVAL);
 	}
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6f26abee359..7438850c62d 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -245,7 +245,7 @@ allocate_id_index_table:
 		msblk->id_table = NULL;
 		goto failed_mount;
 	}
-	next_table = msblk->id_table[0];
+	next_table = le64_to_cpu(msblk->id_table[0]);
 
 	/* Handle inode lookup table */
 	lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
@@ -261,7 +261,7 @@ allocate_id_index_table:
 		msblk->inode_lookup_table = NULL;
 		goto failed_mount;
 	}
-	next_table = msblk->inode_lookup_table[0];
+	next_table = le64_to_cpu(msblk->inode_lookup_table[0]);
 
 	sb->s_export_op = &squashfs_export_ops;
 
@@ -286,7 +286,7 @@ handle_fragments:
 		msblk->fragment_index = NULL;
 		goto failed_mount;
 	}
-	next_table = msblk->fragment_index[0];
+	next_table = le64_to_cpu(msblk->fragment_index[0]);
 
 check_directory_table:
 	/* Sanity check directory_table */
-- 
cgit v1.2.3-70-g09d2


From 3b6445a6f68b839d1b437756b9c72312e33339b2 Mon Sep 17 00:00:00 2001
From: Jim Rees <rees@umich.edu>
Date: Tue, 22 Feb 2011 19:31:57 -0500
Subject: NFSv4.1: fix typo in filelayout_check_layout

Signed-off-by: Jim Rees <rees@umich.edu>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/nfs4filelayout.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index be79dc9f386..dd6ccf00767 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -428,7 +428,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 	dprintk("--> %s\n", __func__);
 
 	if (fl->pattern_offset > lgr->range.offset) {
-		dprintk("%s pattern_offset %lld to large\n",
+		dprintk("%s pattern_offset %lld too large\n",
 				__func__, fl->pattern_offset);
 		goto out;
 	}
-- 
cgit v1.2.3-70-g09d2


From 67d51f65bde233b17de304baec4f7c4d086471fe Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 20 May 2011 10:45:05 +0200
Subject: NFSv4.1: use struct nfs_client to qualify deviceid

deviceids are unique per server, per layout type.
Therefore, in the global cache in the files layout driver
deviceids from different servers may clash so we need
to qualify them with a struct nfs_client that represents
the nfs server that returned the deviceid.

Introduced in 2.6.39 commit ea8eecdd
"NFSv4.1 move deviceid cache to filelayout driver"

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/nfs4filelayout.c    | 2 +-
 fs/nfs/nfs4filelayout.h    | 3 ++-
 fs/nfs/nfs4filelayoutdev.c | 9 ++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index dd6ccf00767..571c1b032e9 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -440,7 +440,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 	}
 
 	/* find and reference the deviceid */
-	dsaddr = nfs4_fl_find_get_deviceid(id);
+	dsaddr = nfs4_fl_find_get_deviceid(NFS_SERVER(lo->plh_inode)->nfs_client, id);
 	if (dsaddr == NULL) {
 		dsaddr = get_device_info(lo->plh_inode, id, gfp_flags);
 		if (dsaddr == NULL)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 2b461d77b43..301b95568bd 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -60,6 +60,7 @@ struct nfs4_pnfs_ds {
 
 struct nfs4_file_layout_dsaddr {
 	struct hlist_node		node;
+	struct nfs_client		*nfs_client;
 	struct nfs4_deviceid		deviceid;
 	atomic_t			ref;
 	unsigned long			flags;
@@ -101,7 +102,7 @@ u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
 struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
 					u32 ds_idx);
 extern struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
+nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
 extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 struct nfs4_file_layout_dsaddr *
 get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index db07c7af139..42e32663727 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -431,7 +431,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 	dsaddr->stripe_indices = stripe_indices;
 	stripe_indices = NULL;
 	dsaddr->ds_num = num;
-
+	dsaddr->nfs_client = NFS_SERVER(ino)->nfs_client;
 	memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
 
 	for (i = 0; i < dsaddr->ds_num; i++) {
@@ -516,7 +516,7 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
 	}
 
 	spin_lock(&filelayout_deviceid_lock);
-	d = nfs4_fl_find_get_deviceid(&new->deviceid);
+	d = nfs4_fl_find_get_deviceid(new->nfs_client, &new->deviceid);
 	if (d) {
 		spin_unlock(&filelayout_deviceid_lock);
 		nfs4_fl_free_deviceid(new);
@@ -610,16 +610,15 @@ nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 }
 
 struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
+nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
 {
 	struct nfs4_file_layout_dsaddr *d;
 	struct hlist_node *n;
 	long hash = nfs4_fl_deviceid_hash(id);
 
-
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
-		if (!memcmp(&d->deviceid, id, sizeof(*id))) {
+		if (d->nfs_client == clp && !memcmp(&d->deviceid, id, sizeof(*id))) {
 			if (!atomic_inc_not_zero(&d->ref))
 				goto fail;
 			rcu_read_unlock();
-- 
cgit v1.2.3-70-g09d2


From 45df3c8b0f3a58facb125d7631890426706c0bfa Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Thu, 5 May 2011 08:28:46 +0300
Subject: pnfs: resolve header dependency in pnfs.h

Some definitions in the header file depend on nfs_fs.h so pnfs.h can't
be included independently.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/pnfs.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 0c015bad9e7..720bb9da3f9 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,7 @@
 #ifndef FS_NFS_PNFS_H
 #define FS_NFS_PNFS_H
 
+#include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 
 enum {
-- 
cgit v1.2.3-70-g09d2


From a1eaecbc4c8307e27772d6584ef85a2e93250661 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Thu, 19 May 2011 22:14:47 -0400
Subject: NFSv4.1: make deviceid cache global

Move deviceid cache from the pnfs files layout driver to the
generic layer in preparation for the objects layout driver.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/Makefile            |   2 +-
 fs/nfs/nfs4filelayout.c    |  10 +--
 fs/nfs/nfs4filelayout.h    |   8 +--
 fs/nfs/nfs4filelayoutdev.c | 104 ++++--------------------------
 fs/nfs/pnfs.h              |  17 +++++
 fs/nfs/pnfs_dev.c          | 156 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 194 insertions(+), 103 deletions(-)
 create mode 100644 fs/nfs/pnfs_dev.c

diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 4776ff9e381..7516a8aaa42 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,7 +15,7 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
 			   delegation.o idmap.o \
 			   callback.o callback_xdr.o callback_proc.o \
 			   nfs4namespace.o
-nfs-$(CONFIG_NFS_V4_1)	+= pnfs.o
+nfs-$(CONFIG_NFS_V4_1)	+= pnfs.o pnfs_dev.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
 
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 571c1b032e9..4c67a6f6519 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -421,6 +421,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 			struct nfs4_deviceid *id,
 			gfp_t gfp_flags)
 {
+	struct nfs4_deviceid_node *d;
 	struct nfs4_file_layout_dsaddr *dsaddr;
 	int status = -EINVAL;
 	struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
@@ -440,12 +441,13 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 	}
 
 	/* find and reference the deviceid */
-	dsaddr = nfs4_fl_find_get_deviceid(NFS_SERVER(lo->plh_inode)->nfs_client, id);
-	if (dsaddr == NULL) {
+	d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->nfs_client, id);
+	if (d == NULL) {
 		dsaddr = get_device_info(lo->plh_inode, id, gfp_flags);
 		if (dsaddr == NULL)
 			goto out;
-	}
+	} else
+		dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
 	fl->dsaddr = dsaddr;
 
 	if (fl->first_stripe_index < 0 ||
@@ -535,7 +537,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
 
 	memcpy(id, p, sizeof(*id));
 	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
-	print_deviceid(id);
+	nfs4_print_deviceid(id);
 
 	nfl_util = be32_to_cpup(p++);
 	if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 301b95568bd..0ace0a2d6d7 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -59,10 +59,7 @@ struct nfs4_pnfs_ds {
 #define NFS4_DEVICE_ID_NEG_ENTRY	0x00000001
 
 struct nfs4_file_layout_dsaddr {
-	struct hlist_node		node;
-	struct nfs_client		*nfs_client;
-	struct nfs4_deviceid		deviceid;
-	atomic_t			ref;
+	struct nfs4_deviceid_node	id_node;
 	unsigned long			flags;
 	u32				stripe_count;
 	u8				*stripe_indices;
@@ -96,13 +93,10 @@ extern struct nfs_fh *
 nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
 
 extern void print_ds(struct nfs4_pnfs_ds *ds);
-extern void print_deviceid(struct nfs4_deviceid *dev_id);
 u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
 u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
 struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
 					u32 ds_idx);
-extern struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
 extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 struct nfs4_file_layout_dsaddr *
 get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 42e32663727..eda4527a57e 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -36,30 +36,6 @@
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
 
-/*
- * Device ID RCU cache. A device ID is unique per client ID and layout type.
- */
-#define NFS4_FL_DEVICE_ID_HASH_BITS	5
-#define NFS4_FL_DEVICE_ID_HASH_SIZE	(1 << NFS4_FL_DEVICE_ID_HASH_BITS)
-#define NFS4_FL_DEVICE_ID_HASH_MASK	(NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
-
-static inline u32
-nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
-{
-	unsigned char *cptr = (unsigned char *)id->data;
-	unsigned int nbytes = NFS4_DEVICEID4_SIZE;
-	u32 x = 0;
-
-	while (nbytes--) {
-		x *= 37;
-		x += *cptr++;
-	}
-	return x & NFS4_FL_DEVICE_ID_HASH_MASK;
-}
-
-static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
-static DEFINE_SPINLOCK(filelayout_deviceid_lock);
-
 /*
  * Data server cache
  *
@@ -89,27 +65,6 @@ print_ds(struct nfs4_pnfs_ds *ds)
 		ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
 }
 
-void
-print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
-{
-	int i;
-
-	ifdebug(FACILITY) {
-		printk("%s dsaddr->ds_num %d\n", __func__,
-		       dsaddr->ds_num);
-		for (i = 0; i < dsaddr->ds_num; i++)
-			print_ds(dsaddr->ds_list[i]);
-	}
-}
-
-void print_deviceid(struct nfs4_deviceid *id)
-{
-	u32 *p = (u32 *)id;
-
-	dprintk("%s: device id= [%x%x%x%x]\n", __func__,
-		p[0], p[1], p[2], p[3]);
-}
-
 /* nfs4_ds_cache_lock is held */
 static struct nfs4_pnfs_ds *
 _data_server_lookup_locked(u32 ip_addr, u32 port)
@@ -207,7 +162,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 	struct nfs4_pnfs_ds *ds;
 	int i;
 
-	print_deviceid(&dsaddr->deviceid);
+	nfs4_print_deviceid(&dsaddr->id_node.deviceid);
 
 	for (i = 0; i < dsaddr->ds_num; i++) {
 		ds = dsaddr->ds_list[i];
@@ -431,8 +386,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 	dsaddr->stripe_indices = stripe_indices;
 	stripe_indices = NULL;
 	dsaddr->ds_num = num;
-	dsaddr->nfs_client = NFS_SERVER(ino)->nfs_client;
-	memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
+	nfs4_init_deviceid_node(&dsaddr->id_node, NFS_SERVER(ino)->nfs_client,
+				&pdev->dev_id);
 
 	for (i = 0; i < dsaddr->ds_num; i++) {
 		int j;
@@ -505,8 +460,8 @@ out_err:
 static struct nfs4_file_layout_dsaddr *
 decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
 {
-	struct nfs4_file_layout_dsaddr *d, *new;
-	long hash;
+	struct nfs4_deviceid_node *d;
+	struct nfs4_file_layout_dsaddr *n, *new;
 
 	new = decode_device(inode, dev, gfp_flags);
 	if (!new) {
@@ -515,20 +470,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
 		return NULL;
 	}
 
-	spin_lock(&filelayout_deviceid_lock);
-	d = nfs4_fl_find_get_deviceid(new->nfs_client, &new->deviceid);
-	if (d) {
-		spin_unlock(&filelayout_deviceid_lock);
+	d = nfs4_insert_deviceid_node(&new->id_node);
+	n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+	if (n != new) {
 		nfs4_fl_free_deviceid(new);
-		return d;
+		return n;
 	}
 
-	INIT_HLIST_NODE(&new->node);
-	atomic_set(&new->ref, 1);
-	hash = nfs4_fl_deviceid_hash(&new->deviceid);
-	hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
-	spin_unlock(&filelayout_deviceid_lock);
-
 	return new;
 }
 
@@ -600,34 +548,8 @@ out_free:
 void
 nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 {
-	if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
-		hlist_del_rcu(&dsaddr->node);
-		spin_unlock(&filelayout_deviceid_lock);
-
-		synchronize_rcu();
+	if (nfs4_put_deviceid_node(&dsaddr->id_node))
 		nfs4_fl_free_deviceid(dsaddr);
-	}
-}
-
-struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
-{
-	struct nfs4_file_layout_dsaddr *d;
-	struct hlist_node *n;
-	long hash = nfs4_fl_deviceid_hash(id);
-
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
-		if (d->nfs_client == clp && !memcmp(&d->deviceid, id, sizeof(*id))) {
-			if (!atomic_inc_not_zero(&d->ref))
-				goto fail;
-			rcu_read_unlock();
-			return d;
-		}
-	}
-fail:
-	rcu_read_unlock();
-	return NULL;
 }
 
 /*
@@ -675,15 +597,15 @@ static void
 filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
 			       int err, u32 ds_addr)
 {
-	u32 *p = (u32 *)&dsaddr->deviceid;
+	u32 *p = (u32 *)&dsaddr->id_node.deviceid;
 
 	printk(KERN_ERR "NFS: data server %x connection error %d."
 		" Deviceid [%x%x%x%x] marked out of use.\n",
 		ds_addr, err, p[0], p[1], p[2], p[3]);
 
-	spin_lock(&filelayout_deviceid_lock);
+	spin_lock(&nfs4_ds_cache_lock);
 	dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
-	spin_unlock(&filelayout_deviceid_lock);
+	spin_unlock(&nfs4_ds_cache_lock);
 }
 
 struct nfs4_pnfs_ds *
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 720bb9da3f9..3831ad04a23 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -157,6 +157,23 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
 void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 
+/* pnfs_dev.c */
+struct nfs4_deviceid_node {
+	struct hlist_node		node;
+	const struct nfs_client		*nfs_client;
+	struct nfs4_deviceid		deviceid;
+	atomic_t			ref;
+};
+
+void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
+struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
+			     const struct nfs_client *,
+			     const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
+bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+
 static inline int lo_fail_bit(u32 iomode)
 {
 	return iomode == IOMODE_RW ?
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
new file mode 100644
index 00000000000..bf05189a7cb
--- /dev/null
+++ b/fs/nfs/pnfs_dev.c
@@ -0,0 +1,156 @@
+/*
+ *  Device operations for the pnfs client.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *  Garth Goodson   <Garth.Goodson@netapp.com>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY		NFSDBG_PNFS
+
+/*
+ * Device ID RCU cache. A device ID is unique per server and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS	5
+#define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1)
+
+static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfs4_deviceid_lock);
+
+void
+nfs4_print_deviceid(const struct nfs4_deviceid *id)
+{
+	u32 *p = (u32 *)id;
+
+	dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+		p[0], p[1], p[2], p[3]);
+}
+EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
+
+static inline u32
+nfs4_deviceid_hash(const struct nfs4_deviceid *id)
+{
+	unsigned char *cptr = (unsigned char *)id->data;
+	unsigned int nbytes = NFS4_DEVICEID4_SIZE;
+	u32 x = 0;
+
+	while (nbytes--) {
+		x *= 37;
+		x += *cptr++;
+	}
+	return x & NFS4_DEVICE_ID_HASH_MASK;
+}
+
+/*
+ * Lookup a deviceid in cache and get a reference count on it if found
+ *
+ * @clp nfs_client associated with deviceid
+ * @id deviceid to look up
+ */
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+	struct nfs4_deviceid_node *d;
+	struct hlist_node *n;
+	long hash = nfs4_deviceid_hash(id);
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) {
+		if (d->nfs_client == clp && !memcmp(&d->deviceid, id, sizeof(*id))) {
+			if (!atomic_inc_not_zero(&d->ref))
+				goto fail;
+			rcu_read_unlock();
+			return d;
+		}
+	}
+fail:
+	rcu_read_unlock();
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
+
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
+			const struct nfs_client *nfs_client,
+			const struct nfs4_deviceid *id)
+{
+	d->nfs_client = nfs_client;
+	d->deviceid = *id;
+}
+EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
+
+/*
+ * Uniquely initialize and insert a deviceid node into cache
+ *
+ * @new new deviceid node
+ *      Note that the caller must set up new->nfs_client and new->deviceid
+ *
+ * @ret the inserted node, if none found, otherwise, the found entry.
+ */
+struct nfs4_deviceid_node *
+nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
+{
+	struct nfs4_deviceid_node *d;
+	long hash;
+
+	spin_lock(&nfs4_deviceid_lock);
+	d = nfs4_find_get_deviceid(new->nfs_client, &new->deviceid);
+	if (d) {
+		spin_unlock(&nfs4_deviceid_lock);
+		return d;
+	}
+
+	INIT_HLIST_NODE(&new->node);
+	atomic_set(&new->ref, 1);
+	hash = nfs4_deviceid_hash(&new->deviceid);
+	hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+	spin_unlock(&nfs4_deviceid_lock);
+
+	return new;
+}
+EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
+
+/*
+ * Dereference a deviceid node and delete it when its reference count drops
+ * to zero.
+ *
+ * @d deviceid node to put
+ *
+ * @ret true iff the node was deleted
+ */
+bool
+nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock))
+		return false;
+	hlist_del_init_rcu(&d->node);
+	spin_unlock(&nfs4_deviceid_lock);
+	synchronize_rcu();
+	return true;
+}
+EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
-- 
cgit v1.2.3-70-g09d2


From a43a9d93d40a69eceeb4e4a4c860cc20186d475c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sun, 29 May 2011 12:40:50 +0200
Subject: [S390] mm: fix storage key handling

page_get_storage_key() and page_set_storage_key() expect a page address
and not its page frame number. This got inconsistent with 2d42552d
"[S390] merge page_test_dirty and page_clear_dirty".

Result is that we read/write storage keys from random pages and do not
have a working dirty bit tracking at all.
E.g. SetPageUpdate() doesn't clear the dirty bit of requested pages, which
for example ext4 doesn't like very much and panics after a while.

Unable to handle kernel paging request at virtual user address (null)
Oops: 0004 [#1] PREEMPT SMP DEBUG_PAGEALLOC
Modules linked in:
CPU: 1 Not tainted 2.6.39-07551-g139f37f-dirty #152
Process flush-94:0 (pid: 1576, task: 000000003eb34538, ksp: 000000003c287b70)
Krnl PSW : 0704c00180000000 0000000000316b12 (jbd2_journal_file_inode+0x10e/0x138)
           R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 EA:3
Krnl GPRS: 0000000000000000 0000000000000000 0000000000000000 0700000000000000
           0000000000316a62 000000003eb34cd0 0000000000000025 000000003c287b88
           0000000000000001 000000003c287a70 000000003f1ec678 000000003f1ec000
           0000000000000000 000000003e66ec00 0000000000316a62 000000003c287988
Krnl Code: 0000000000316b04: f0a0000407f4       srp     4(11,%r0),2036,0
           0000000000316b0a: b9020022           ltgr    %r2,%r2
           0000000000316b0e: a7740015           brc     7,316b38
          >0000000000316b12: e3d0c0000024       stg     %r13,0(%r12)
           0000000000316b18: 4120c010           la      %r2,16(%r12)
           0000000000316b1c: 4130d060           la      %r3,96(%r13)
           0000000000316b20: e340d0600004       lg      %r4,96(%r13)
           0000000000316b26: c0e50002b567       brasl   %r14,36d5f4
Call Trace:
([<0000000000316a62>] jbd2_journal_file_inode+0x5e/0x138)
 [<00000000002da13c>] mpage_da_map_and_submit+0x2e8/0x42c
 [<00000000002daac2>] ext4_da_writepages+0x2da/0x504
 [<00000000002597e8>] writeback_single_inode+0xf8/0x268
 [<0000000000259f06>] writeback_sb_inodes+0xd2/0x18c
 [<000000000025a700>] writeback_inodes_wb+0x80/0x168
 [<000000000025aa92>] wb_writeback+0x2aa/0x324
 [<000000000025abde>] wb_do_writeback+0xd2/0x274
 [<000000000025ae3a>] bdi_writeback_thread+0xba/0x1c4
 [<00000000001737be>] kthread+0xa6/0xb0
 [<000000000056c1da>] kernel_thread_starter+0x6/0xc
 [<000000000056c1d4>] kernel_thread_starter+0x0/0xc
INFO: lockdep is turned off.
Last Breaking-Event-Address:
 [<0000000000316a8a>] jbd2_journal_file_inode+0x86/0x138

Reported-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/asm/pgtable.h | 16 ++++++++--------
 include/linux/page-flags.h      |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index c4773a2ef3d..e4efacfe1b6 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -577,16 +577,16 @@ static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
 static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste)
 {
 #ifdef CONFIG_PGSTE
-	unsigned long pfn, bits;
+	unsigned long address, bits;
 	unsigned char skey;
 
-	pfn = pte_val(*ptep) >> PAGE_SHIFT;
-	skey = page_get_storage_key(pfn);
+	address = pte_val(*ptep) & PAGE_MASK;
+	skey = page_get_storage_key(address);
 	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
 	/* Clear page changed & referenced bit in the storage key */
 	if (bits) {
 		skey ^= bits;
-		page_set_storage_key(pfn, skey, 1);
+		page_set_storage_key(address, skey, 1);
 	}
 	/* Transfer page changed & referenced bit to guest bits in pgste */
 	pgste_val(pgste) |= bits << 48;		/* RCP_GR_BIT & RCP_GC_BIT */
@@ -628,16 +628,16 @@ static inline pgste_t pgste_update_young(pte_t *ptep, pgste_t pgste)
 static inline void pgste_set_pte(pte_t *ptep, pgste_t pgste)
 {
 #ifdef CONFIG_PGSTE
-	unsigned long pfn;
+	unsigned long address;
 	unsigned long okey, nkey;
 
-	pfn = pte_val(*ptep) >> PAGE_SHIFT;
-	okey = nkey = page_get_storage_key(pfn);
+	address = pte_val(*ptep) & PAGE_MASK;
+	okey = nkey = page_get_storage_key(address);
 	nkey &= ~(_PAGE_ACC_BITS | _PAGE_FP_BIT);
 	/* Set page access key and fetch protection bit from pgste */
 	nkey |= (pgste_val(pgste) & (RCP_ACC_BITS | RCP_FP_BIT)) >> 56;
 	if (okey != nkey)
-		page_set_storage_key(pfn, nkey, 1);
+		page_set_storage_key(address, nkey, 1);
 #endif
 }
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 79a6700b716..6081493db68 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -308,7 +308,7 @@ static inline void SetPageUptodate(struct page *page)
 {
 #ifdef CONFIG_S390
 	if (!test_and_set_bit(PG_uptodate, &page->flags))
-		page_set_storage_key(page_to_pfn(page), PAGE_DEFAULT_KEY, 0);
+		page_set_storage_key(page_to_phys(page), PAGE_DEFAULT_KEY, 0);
 #else
 	/*
 	 * Memory barrier must be issued before setting the PG_uptodate bit,
-- 
cgit v1.2.3-70-g09d2


From 3c5cffb66d8ea94832650fcb55194715b0229088 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sun, 29 May 2011 12:40:51 +0200
Subject: [S390] mm: fix mmu_gather rework

Quite a few functions that get called from the tlb gather code require that
preemption must be disabled. So disable preemption inside of the called
functions instead.
The only drawback is that rcu_table_freelist_finish() doesn't get necessarily
called on the cpu(s) that filled the free lists. So we may see a delay, until
we finally see an rcu callback. However over time this shouldn't matter.

So we get rid of lots of "BUG: using smp_processor_id() in preemptible"
messages.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/mm/pgtable.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 14c6fae6fe6..b09763fe5da 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -71,12 +71,15 @@ static void rcu_table_freelist_callback(struct rcu_head *head)
 
 void rcu_table_freelist_finish(void)
 {
-	struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist);
+	struct rcu_table_freelist **batchp = &get_cpu_var(rcu_table_freelist);
+	struct rcu_table_freelist *batch = *batchp;
 
 	if (!batch)
-		return;
+		goto out;
 	call_rcu(&batch->rcu, rcu_table_freelist_callback);
-	__get_cpu_var(rcu_table_freelist) = NULL;
+	*batchp = NULL;
+out:
+	put_cpu_var(rcu_table_freelist);
 }
 
 static void smp_sync(void *arg)
@@ -141,20 +144,23 @@ void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
 {
 	struct rcu_table_freelist *batch;
 
+	preempt_disable();
 	if (atomic_read(&mm->mm_users) < 2 &&
 	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
 		crst_table_free(mm, table);
-		return;
+		goto out;
 	}
 	batch = rcu_table_freelist_get(mm);
 	if (!batch) {
 		smp_call_function(smp_sync, NULL, 1);
 		crst_table_free(mm, table);
-		return;
+		goto out;
 	}
 	batch->table[--batch->crst_index] = table;
 	if (batch->pgt_index >= batch->crst_index)
 		rcu_table_freelist_finish();
+out:
+	preempt_enable();
 }
 
 #ifdef CONFIG_64BIT
@@ -323,16 +329,17 @@ void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
 	struct page *page;
 	unsigned long bits;
 
+	preempt_disable();
 	if (atomic_read(&mm->mm_users) < 2 &&
 	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
 		page_table_free(mm, table);
-		return;
+		goto out;
 	}
 	batch = rcu_table_freelist_get(mm);
 	if (!batch) {
 		smp_call_function(smp_sync, NULL, 1);
 		page_table_free(mm, table);
-		return;
+		goto out;
 	}
 	bits = (mm->context.has_pgste) ? 3UL : 1UL;
 	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
@@ -345,6 +352,8 @@ void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
 	batch->table[batch->pgt_index++] = table;
 	if (batch->pgt_index >= batch->crst_index)
 		rcu_table_freelist_finish();
+out:
+	preempt_enable();
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 4c2593270133708698d4b8cea2dab469479ad13b Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Sun, 29 May 2011 12:52:55 +0100
Subject: dm table: allow targets to support discards internally

Permit a target to support discards regardless of whether or not all its
underlying devices do.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c         | 6 +++++-
 include/linux/device-mapper.h | 6 ++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index cb8380c9767..215e112d153 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1346,7 +1346,8 @@ bool dm_table_supports_discards(struct dm_table *t)
 		return 0;
 
 	/*
-	 * Ensure that at least one underlying device supports discards.
+	 * Unless any target used by the table set discards_supported,
+	 * require at least one underlying device to support discards.
 	 * t->devices includes internal dm devices such as mirror logs
 	 * so we need to use iterate_devices here, which targets
 	 * supporting discard must provide.
@@ -1354,6 +1355,9 @@ bool dm_table_supports_discards(struct dm_table *t)
 	while (i < dm_table_get_num_targets(t)) {
 		ti = dm_table_get_target(t, i++);
 
+		if (ti->discards_supported)
+			return 1;
+
 		if (ti->type->iterate_devices &&
 		    ti->type->iterate_devices(ti, device_discard_capable, NULL))
 			return 1;
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 32a4423710f..4427e045405 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -191,6 +191,12 @@ struct dm_target {
 
 	/* Used to provide an error string from the ctr */
 	char *error;
+
+	/*
+	 * Set if this target needs to receive discards regardless of
+	 * whether or not its underlying devices have support.
+	 */
+	unsigned discards_supported:1;
 };
 
 /* Each target can link one of these into the table */
-- 
cgit v1.2.3-70-g09d2


From f4808ca99a203f20b4475601748e44b25a65bdec Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Sun, 29 May 2011 13:02:52 +0100
Subject: dm table: reject devices without request fns

This patch adds a check that a block device has a request function
defined before it is used.  Otherwise, misconfiguration can cause an oops.

Because we are allowing devices with zero size e.g. an offline multipath
device as in commit 2cd54d9bedb79a97f014e86c0da393416b264eb3
("dm: allow offline devices") there needs to be an additional check
to ensure devices are initialised.  Some block devices, like a loop
device without a backing file, exist but have no request function.

Reproducer is trivial: dm-mirror on unbound loop device
(no backing file on loop devices)

dmsetup create x --table "0 8 mirror core 2 8 sync 2 /dev/loop0 0 /dev/loop1 0"

and mirror resync will immediatelly cause OOps.

BUG: unable to handle kernel NULL pointer dereference at   (null)
 ? generic_make_request+0x2bd/0x590
 ? kmem_cache_alloc+0xad/0x190
 submit_bio+0x53/0xe0
 ? bio_add_page+0x3b/0x50
 dispatch_io+0x1ca/0x210 [dm_mod]
 ? read_callback+0x0/0xd0 [dm_mirror]
 dm_io+0xbb/0x290 [dm_mod]
 do_mirror+0x1e0/0x748 [dm_mirror]

Signed-off-by: Milan Broz <mbroz@redhat.com>
Reported-by: Zdenek Kabelac <zkabelac@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 215e112d153..451c3bb176d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -362,6 +362,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
 static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
 				  sector_t start, sector_t len, void *data)
 {
+	struct request_queue *q;
 	struct queue_limits *limits = data;
 	struct block_device *bdev = dev->bdev;
 	sector_t dev_size =
@@ -370,6 +371,22 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
 		limits->logical_block_size >> SECTOR_SHIFT;
 	char b[BDEVNAME_SIZE];
 
+	/*
+	 * Some devices exist without request functions,
+	 * such as loop devices not yet bound to backing files.
+	 * Forbid the use of such devices.
+	 */
+	q = bdev_get_queue(bdev);
+	if (!q || !q->make_request_fn) {
+		DMWARN("%s: %s is not yet initialised: "
+		       "start=%llu, len=%llu, dev_size=%llu",
+		       dm_device_name(ti->table->md), bdevname(bdev, b),
+		       (unsigned long long)start,
+		       (unsigned long long)len,
+		       (unsigned long long)dev_size);
+		return 1;
+	}
+
 	if (!dev_size)
 		return 0;
 
-- 
cgit v1.2.3-70-g09d2


From 6f13f6fba76edc7d0e7580c5deee829d59a41b2f Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Sun, 29 May 2011 13:02:55 +0100
Subject: dm mpath: do not fail paths after integrity errors

Integrity errors need to be passed to the owner of the integrity
metadata for processing. Consequently EILSEQ should be passed up the
stack.

Cc: stable@kernel.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index a550a057d99..aa4e570c2cb 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1290,7 +1290,7 @@ static int do_end_io(struct multipath *m, struct request *clone,
 	if (!error && !clone->errors)
 		return 0;	/* I/O complete */
 
-	if (error == -EOPNOTSUPP || error == -EREMOTEIO)
+	if (error == -EOPNOTSUPP || error == -EREMOTEIO || error == -EILSEQ)
 		return error;
 
 	if (mpio->pgpath)
-- 
cgit v1.2.3-70-g09d2


From a705a34a565a5445bf731bd8006d51ea4d2b4236 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 29 May 2011 13:02:58 +0100
Subject: dm kcopyd: avoid pointless job splitting

Don't split SUB_JOB_SIZE jobs

If the job size equals SUB_JOB_SIZE, there is no point in splitting it.
Splitting it just unnecessarily wastes time, because the split job size
is SUB_JOB_SIZE too.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-kcopyd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 1bb73a13ca4..505b6f5cd38 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -578,7 +578,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 	job->fn = fn;
 	job->context = context;
 
-	if (job->source.count < SUB_JOB_SIZE)
+	if (job->source.count <= SUB_JOB_SIZE)
 		dispatch_job(job);
 
 	else {
-- 
cgit v1.2.3-70-g09d2


From c6ea41fbbe08f270a8edef99dc369faf809d1bd6 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 29 May 2011 13:03:00 +0100
Subject: dm kcopyd: preallocate sub jobs to avoid deadlock

There's a possible theoretical deadlock in dm-kcopyd because multiple
allocations from the same mempool are required to finish a request.
Avoid this by preallocating sub jobs.

There is a mempool of 512 entries. Each request requires up to 9
entries from the mempool. If we have at least 57 concurrent requests
running, the mempool may overflow and mempool allocations may start
blocking until another entry is freed to the mempool. Because the same
thread is used to free entries to the mempool and allocate entries from
the mempool, this may result in a deadlock.

This patch changes it so that one mempool entry contains all 9 "struct
kcopyd_job" required to fulfill the whole request. The allocation is
done only once in dm_kcopyd_copy and no further mempool allocations are
done during request processing.

If dm_kcopyd_copy is not run in the completion thread, this
implementation is deadlock-free.

MIN_JOBS needs reducing accordingly and we've chosen to reduce it
further to 8.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-kcopyd.c | 49 +++++++++++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 505b6f5cd38..24fb42ed7b8 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -27,6 +27,10 @@
 
 #include "dm.h"
 
+#define SUB_JOB_SIZE	128
+#define SPLIT_COUNT	8
+#define MIN_JOBS	8
+
 /*-----------------------------------------------------------------
  * Each kcopyd client has its own little pool of preallocated
  * pages for kcopyd io.
@@ -216,16 +220,17 @@ struct kcopyd_job {
 	struct mutex lock;
 	atomic_t sub_jobs;
 	sector_t progress;
-};
 
-/* FIXME: this should scale with the number of pages */
-#define MIN_JOBS 512
+	struct kcopyd_job *master_job;
+};
 
 static struct kmem_cache *_job_cache;
 
 int __init dm_kcopyd_init(void)
 {
-	_job_cache = KMEM_CACHE(kcopyd_job, 0);
+	_job_cache = kmem_cache_create("kcopyd_job",
+				sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1),
+				__alignof__(struct kcopyd_job), 0, NULL);
 	if (!_job_cache)
 		return -ENOMEM;
 
@@ -299,7 +304,12 @@ static int run_complete_job(struct kcopyd_job *job)
 
 	if (job->pages)
 		kcopyd_put_pages(kc, job->pages);
-	mempool_free(job, kc->job_pool);
+	/*
+	 * If this is the master job, the sub jobs have already
+	 * completed so we can free everything.
+	 */
+	if (job->master_job == job)
+		mempool_free(job, kc->job_pool);
 	fn(read_err, write_err, context);
 
 	if (atomic_dec_and_test(&kc->nr_jobs))
@@ -460,14 +470,14 @@ static void dispatch_job(struct kcopyd_job *job)
 	wake(kc);
 }
 
-#define SUB_JOB_SIZE 128
 static void segment_complete(int read_err, unsigned long write_err,
 			     void *context)
 {
 	/* FIXME: tidy this function */
 	sector_t progress = 0;
 	sector_t count = 0;
-	struct kcopyd_job *job = (struct kcopyd_job *) context;
+	struct kcopyd_job *sub_job = (struct kcopyd_job *) context;
+	struct kcopyd_job *job = sub_job->master_job;
 	struct dm_kcopyd_client *kc = job->kc;
 
 	mutex_lock(&job->lock);
@@ -498,8 +508,6 @@ static void segment_complete(int read_err, unsigned long write_err,
 
 	if (count) {
 		int i;
-		struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool,
-							   GFP_NOIO);
 
 		*sub_job = *job;
 		sub_job->source.sector += progress;
@@ -511,7 +519,7 @@ static void segment_complete(int read_err, unsigned long write_err,
 		}
 
 		sub_job->fn = segment_complete;
-		sub_job->context = job;
+		sub_job->context = sub_job;
 		dispatch_job(sub_job);
 
 	} else if (atomic_dec_and_test(&job->sub_jobs)) {
@@ -531,19 +539,19 @@ static void segment_complete(int read_err, unsigned long write_err,
 }
 
 /*
- * Create some little jobs that will do the move between
- * them.
+ * Create some sub jobs to share the work between them.
  */
-#define SPLIT_COUNT 8
-static void split_job(struct kcopyd_job *job)
+static void split_job(struct kcopyd_job *master_job)
 {
 	int i;
 
-	atomic_inc(&job->kc->nr_jobs);
+	atomic_inc(&master_job->kc->nr_jobs);
 
-	atomic_set(&job->sub_jobs, SPLIT_COUNT);
-	for (i = 0; i < SPLIT_COUNT; i++)
-		segment_complete(0, 0u, job);
+	atomic_set(&master_job->sub_jobs, SPLIT_COUNT);
+	for (i = 0; i < SPLIT_COUNT; i++) {
+		master_job[i + 1].master_job = master_job;
+		segment_complete(0, 0u, &master_job[i + 1]);
+	}
 }
 
 int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
@@ -553,7 +561,8 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 	struct kcopyd_job *job;
 
 	/*
-	 * Allocate a new job.
+	 * Allocate an array of jobs consisting of one master job
+	 * followed by SPLIT_COUNT sub jobs.
 	 */
 	job = mempool_alloc(kc->job_pool, GFP_NOIO);
 
@@ -577,10 +586,10 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 
 	job->fn = fn;
 	job->context = context;
+	job->master_job = job;
 
 	if (job->source.count <= SUB_JOB_SIZE)
 		dispatch_job(job);
-
 	else {
 		mutex_init(&job->lock);
 		job->progress = 0;
-- 
cgit v1.2.3-70-g09d2


From 4cc1b4cffd187a5c5d6264c8d766c49b3c57fb05 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 29 May 2011 13:03:02 +0100
Subject: dm kcopyd: remove superfluous page allocation spinlock

Remove the spinlock protecting the pages allocation.  The spinlock is only
taken on initialization or from single-threaded workqueue.  Therefore, the
spinlock is useless.

The spinlock is taken in kcopyd_get_pages and kcopyd_put_pages.

kcopyd_get_pages is only called from run_pages_job, which is only
called from process_jobs called from do_work.

kcopyd_put_pages is called from client_alloc_pages (which is initialization
function) or from run_complete_job. run_complete_job is only called from
process_jobs called from do_work.

Another spinlock, kc->job_lock is taken each time someone pushes or pops
some work for the worker thread.  Once we take kc->job_lock, we
guarantee that any written memory is visible to the other CPUs.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-kcopyd.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 24fb42ed7b8..ed957791639 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -36,7 +36,6 @@
  * pages for kcopyd io.
  *---------------------------------------------------------------*/
 struct dm_kcopyd_client {
-	spinlock_t lock;
 	struct page_list *pages;
 	unsigned int nr_pages;
 	unsigned int nr_free_pages;
@@ -99,11 +98,8 @@ static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
 {
 	struct page_list *pl;
 
-	spin_lock(&kc->lock);
-	if (kc->nr_free_pages < nr) {
-		spin_unlock(&kc->lock);
+	if (kc->nr_free_pages < nr)
 		return -ENOMEM;
-	}
 
 	kc->nr_free_pages -= nr;
 	for (*pages = pl = kc->pages; --nr; pl = pl->next)
@@ -112,8 +108,6 @@ static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
 	kc->pages = pl->next;
 	pl->next = NULL;
 
-	spin_unlock(&kc->lock);
-
 	return 0;
 }
 
@@ -121,14 +115,12 @@ static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl)
 {
 	struct page_list *cursor;
 
-	spin_lock(&kc->lock);
 	for (cursor = pl; cursor->next; cursor = cursor->next)
 		kc->nr_free_pages++;
 
 	kc->nr_free_pages++;
 	cursor->next = kc->pages;
 	kc->pages = pl;
-	spin_unlock(&kc->lock);
 }
 
 /*
@@ -625,7 +617,6 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
 	if (!kc)
 		return -ENOMEM;
 
-	spin_lock_init(&kc->lock);
 	spin_lock_init(&kc->job_lock);
 	INIT_LIST_HEAD(&kc->complete_jobs);
 	INIT_LIST_HEAD(&kc->io_jobs);
-- 
cgit v1.2.3-70-g09d2


From f99b55eec795bd0fd577ab3ca06f3acfbe3b1ab1 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 29 May 2011 13:03:04 +0100
Subject: dm kcopyd: add gfp parm to alloc_pl

Introduce a parameter for gfp flags to alloc_pl() for use in following
patches.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-kcopyd.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index ed957791639..0270844c2a3 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -70,15 +70,15 @@ static void wake(struct dm_kcopyd_client *kc)
 	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
 }
 
-static struct page_list *alloc_pl(void)
+static struct page_list *alloc_pl(gfp_t gfp)
 {
 	struct page_list *pl;
 
-	pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+	pl = kmalloc(sizeof(*pl), gfp);
 	if (!pl)
 		return NULL;
 
-	pl->page = alloc_page(GFP_KERNEL);
+	pl->page = alloc_page(gfp);
 	if (!pl->page) {
 		kfree(pl);
 		return NULL;
@@ -143,7 +143,7 @@ static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr)
 	struct page_list *pl = NULL, *next;
 
 	for (i = 0; i < nr; i++) {
-		next = alloc_pl();
+		next = alloc_pl(GFP_KERNEL);
 		if (!next) {
 			if (pl)
 				drop_pages(pl);
-- 
cgit v1.2.3-70-g09d2


From d04714580f12379fcf7a0f799e86c92b96dd4e1f Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 29 May 2011 13:03:07 +0100
Subject: dm kcopyd: alloc pages from the main page allocator

This patch changes dm-kcopyd so that it allocates pages from the main
page allocator with __GFP_NOWARN | __GFP_NORETRY flags (so that it can
fail in case of memory pressure). If the allocation fails, dm-kcopyd
allocates pages from its own reserve.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-kcopyd.c | 91 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 60 insertions(+), 31 deletions(-)

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 0270844c2a3..5dfbdcb40a4 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -37,8 +37,8 @@
  *---------------------------------------------------------------*/
 struct dm_kcopyd_client {
 	struct page_list *pages;
-	unsigned int nr_pages;
-	unsigned int nr_free_pages;
+	unsigned nr_reserved_pages;
+	unsigned nr_free_pages;
 
 	struct dm_io_client *io_client;
 
@@ -70,6 +70,9 @@ static void wake(struct dm_kcopyd_client *kc)
 	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
 }
 
+/*
+ * Obtain one page for the use of kcopyd.
+ */
 static struct page_list *alloc_pl(gfp_t gfp)
 {
 	struct page_list *pl;
@@ -93,34 +96,56 @@ static void free_pl(struct page_list *pl)
 	kfree(pl);
 }
 
-static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
-			    unsigned int nr, struct page_list **pages)
+/*
+ * Add the provided pages to a client's free page list, releasing
+ * back to the system any beyond the reserved_pages limit.
+ */
+static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl)
 {
-	struct page_list *pl;
-
-	if (kc->nr_free_pages < nr)
-		return -ENOMEM;
+	struct page_list *next;
 
-	kc->nr_free_pages -= nr;
-	for (*pages = pl = kc->pages; --nr; pl = pl->next)
-		;
+	do {
+		next = pl->next;
 
-	kc->pages = pl->next;
-	pl->next = NULL;
+		if (kc->nr_free_pages >= kc->nr_reserved_pages)
+			free_pl(pl);
+		else {
+			pl->next = kc->pages;
+			kc->pages = pl;
+			kc->nr_free_pages++;
+		}
 
-	return 0;
+		pl = next;
+	} while (pl);
 }
 
-static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl)
+static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
+			    unsigned int nr, struct page_list **pages)
 {
-	struct page_list *cursor;
+	struct page_list *pl;
 
-	for (cursor = pl; cursor->next; cursor = cursor->next)
-		kc->nr_free_pages++;
+	*pages = NULL;
+
+	do {
+		pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY);
+		if (unlikely(!pl)) {
+			/* Use reserved pages */
+			pl = kc->pages;
+			if (unlikely(!pl))
+				goto out_of_memory;
+			kc->pages = pl->next;
+			kc->nr_free_pages--;
+		}
+		pl->next = *pages;
+		*pages = pl;
+	} while (--nr);
+
+	return 0;
 
-	kc->nr_free_pages++;
-	cursor->next = kc->pages;
-	kc->pages = pl;
+out_of_memory:
+	if (*pages)
+		kcopyd_put_pages(kc, *pages);
+	return -ENOMEM;
 }
 
 /*
@@ -137,12 +162,15 @@ static void drop_pages(struct page_list *pl)
 	}
 }
 
-static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr)
+/*
+ * Allocate and reserve nr_pages for the use of a specific client.
+ */
+static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned nr_pages)
 {
-	unsigned int i;
+	unsigned i;
 	struct page_list *pl = NULL, *next;
 
-	for (i = 0; i < nr; i++) {
+	for (i = 0; i < nr_pages; i++) {
 		next = alloc_pl(GFP_KERNEL);
 		if (!next) {
 			if (pl)
@@ -153,17 +181,18 @@ static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr)
 		pl = next;
 	}
 
+	kc->nr_reserved_pages += nr_pages;
 	kcopyd_put_pages(kc, pl);
-	kc->nr_pages += nr;
+
 	return 0;
 }
 
 static void client_free_pages(struct dm_kcopyd_client *kc)
 {
-	BUG_ON(kc->nr_free_pages != kc->nr_pages);
+	BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages);
 	drop_pages(kc->pages);
 	kc->pages = NULL;
-	kc->nr_free_pages = kc->nr_pages = 0;
+	kc->nr_free_pages = kc->nr_reserved_pages = 0;
 }
 
 /*-----------------------------------------------------------------
@@ -607,7 +636,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
 /*-----------------------------------------------------------------
  * Client setup
  *---------------------------------------------------------------*/
-int dm_kcopyd_client_create(unsigned int nr_pages,
+int dm_kcopyd_client_create(unsigned min_pages,
 			    struct dm_kcopyd_client **result)
 {
 	int r = -ENOMEM;
@@ -633,12 +662,12 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
 		goto bad_workqueue;
 
 	kc->pages = NULL;
-	kc->nr_pages = kc->nr_free_pages = 0;
-	r = client_alloc_pages(kc, nr_pages);
+	kc->nr_reserved_pages = kc->nr_free_pages = 0;
+	r = client_reserve_pages(kc, min_pages);
 	if (r)
 		goto bad_client_pages;
 
-	kc->io_client = dm_io_client_create(nr_pages);
+	kc->io_client = dm_io_client_create(min_pages);
 	if (IS_ERR(kc->io_client)) {
 		r = PTR_ERR(kc->io_client);
 		goto bad_io_client;
-- 
cgit v1.2.3-70-g09d2


From bda8efec5c706a672e0714d341a342e811f0262a Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 29 May 2011 13:03:09 +0100
Subject: dm io: use fixed initial mempool size

Replace the arbitrary calculation of an initial io struct mempool size
with a constant.

The code calculated the number of reserved structures based on the request
size and used a "magic" multiplication constant of 4.  This patch changes
it to reserve a fixed number - itself still chosen quite arbitrarily.
Further testing might show if there is a better number to choose.

Note that if there is no memory pressure, we can still allocate an
arbitrary number of "struct io" structures.  One structure is enough to
process the whole request.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-io.c              | 27 +++++----------------------
 drivers/md/dm-kcopyd.c          |  2 +-
 drivers/md/dm-log.c             |  3 +--
 drivers/md/dm-raid1.c           |  3 +--
 drivers/md/dm-snap-persistent.c | 13 +------------
 include/linux/dm-io.h           |  3 +--
 6 files changed, 10 insertions(+), 41 deletions(-)

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 76a5af00a26..2067288f61f 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -19,6 +19,8 @@
 #define DM_MSG_PREFIX "io"
 
 #define DM_IO_MAX_REGIONS	BITS_PER_LONG
+#define MIN_IOS		16
+#define MIN_BIOS	16
 
 struct dm_io_client {
 	mempool_t *pool;
@@ -40,34 +42,22 @@ struct io {
 
 static struct kmem_cache *_dm_io_cache;
 
-/*
- * io contexts are only dynamically allocated for asynchronous
- * io.  Since async io is likely to be the majority of io we'll
- * have the same number of io contexts as bios! (FIXME: must reduce this).
- */
-
-static unsigned int pages_to_ios(unsigned int pages)
-{
-	return 4 * pages;	/* too many ? */
-}
-
 /*
  * Create a client with mempool and bioset.
  */
-struct dm_io_client *dm_io_client_create(unsigned num_pages)
+struct dm_io_client *dm_io_client_create(void)
 {
-	unsigned ios = pages_to_ios(num_pages);
 	struct dm_io_client *client;
 
 	client = kmalloc(sizeof(*client), GFP_KERNEL);
 	if (!client)
 		return ERR_PTR(-ENOMEM);
 
-	client->pool = mempool_create_slab_pool(ios, _dm_io_cache);
+	client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(16, 0);
+	client->bios = bioset_create(MIN_BIOS, 0);
 	if (!client->bios)
 		goto bad;
 
@@ -81,13 +71,6 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
 }
 EXPORT_SYMBOL(dm_io_client_create);
 
-int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client)
-{
-	return mempool_resize(client->pool, pages_to_ios(num_pages),
-			      GFP_KERNEL);
-}
-EXPORT_SYMBOL(dm_io_client_resize);
-
 void dm_io_client_destroy(struct dm_io_client *client)
 {
 	mempool_destroy(client->pool);
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 5dfbdcb40a4..719693340d1 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -667,7 +667,7 @@ int dm_kcopyd_client_create(unsigned min_pages,
 	if (r)
 		goto bad_client_pages;
 
-	kc->io_client = dm_io_client_create(min_pages);
+	kc->io_client = dm_io_client_create();
 	if (IS_ERR(kc->io_client)) {
 		r = PTR_ERR(kc->io_client);
 		goto bad_io_client;
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index a1f32188967..948e3f4925b 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -449,8 +449,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 
 		lc->io_req.mem.type = DM_IO_VMA;
 		lc->io_req.notify.fn = NULL;
-		lc->io_req.client = dm_io_client_create(dm_div_up(buf_size,
-								   PAGE_SIZE));
+		lc->io_req.client = dm_io_client_create();
 		if (IS_ERR(lc->io_req.client)) {
 			r = PTR_ERR(lc->io_req.client);
 			DMWARN("couldn't allocate disk io client");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 976ad4688af..53089aa1138 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -22,7 +22,6 @@
 #define DM_MSG_PREFIX "raid1"
 
 #define MAX_RECOVERY 1	/* Maximum number of regions recovered in parallel. */
-#define DM_IO_PAGES 64
 #define DM_KCOPYD_PAGES 64
 
 #define DM_RAID1_HANDLE_ERRORS 0x01
@@ -887,7 +886,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 		return NULL;
 	}
 
-	ms->io_client = dm_io_client_create(DM_IO_PAGES);
+	ms->io_client = dm_io_client_create();
 	if (IS_ERR(ms->io_client)) {
 		ti->error = "Error creating dm_io client";
 		mempool_destroy(ms->read_record_pool);
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 95891dfcbca..135c2f1fdbf 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -154,11 +154,6 @@ struct pstore {
 	struct workqueue_struct *metadata_wq;
 };
 
-static unsigned sectors_to_pages(unsigned sectors)
-{
-	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
-}
-
 static int alloc_area(struct pstore *ps)
 {
 	int r = -ENOMEM;
@@ -318,8 +313,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 		chunk_size_supplied = 0;
 	}
 
-	ps->io_client = dm_io_client_create(sectors_to_pages(ps->store->
-							     chunk_size));
+	ps->io_client = dm_io_client_create();
 	if (IS_ERR(ps->io_client))
 		return PTR_ERR(ps->io_client);
 
@@ -368,11 +362,6 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 		return r;
 	}
 
-	r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size),
-				ps->io_client);
-	if (r)
-		return r;
-
 	r = alloc_area(ps);
 	return r;
 
diff --git a/include/linux/dm-io.h b/include/linux/dm-io.h
index 5c9186b93ff..f4b0aa3126f 100644
--- a/include/linux/dm-io.h
+++ b/include/linux/dm-io.h
@@ -69,8 +69,7 @@ struct dm_io_request {
  *
  * Create/destroy may block.
  */
-struct dm_io_client *dm_io_client_create(unsigned num_pages);
-int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client);
+struct dm_io_client *dm_io_client_create(void);
 void dm_io_client_destroy(struct dm_io_client *client);
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 5f43ba2950414dc0abf4ac44c397d88069056746 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 29 May 2011 13:03:11 +0100
Subject: dm kcopyd: reserve fewer pages

Reserve just the minimum of pages needed to process one job.

Because we allocate pages from page allocator, we don't need to reserve
a large number of pages.  The maximum job size is SUB_JOB_SIZE and we
calculate the number of reserved pages based on this.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-kcopyd.c    | 6 +++---
 drivers/md/dm-raid1.c     | 3 +--
 drivers/md/dm-snap.c      | 7 +------
 include/linux/dm-kcopyd.h | 3 +--
 4 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 719693340d1..579647f8b4d 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -30,6 +30,7 @@
 #define SUB_JOB_SIZE	128
 #define SPLIT_COUNT	8
 #define MIN_JOBS	8
+#define RESERVE_PAGES	(DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE))
 
 /*-----------------------------------------------------------------
  * Each kcopyd client has its own little pool of preallocated
@@ -636,8 +637,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
 /*-----------------------------------------------------------------
  * Client setup
  *---------------------------------------------------------------*/
-int dm_kcopyd_client_create(unsigned min_pages,
-			    struct dm_kcopyd_client **result)
+int dm_kcopyd_client_create(struct dm_kcopyd_client **result)
 {
 	int r = -ENOMEM;
 	struct dm_kcopyd_client *kc;
@@ -663,7 +663,7 @@ int dm_kcopyd_client_create(unsigned min_pages,
 
 	kc->pages = NULL;
 	kc->nr_reserved_pages = kc->nr_free_pages = 0;
-	r = client_reserve_pages(kc, min_pages);
+	r = client_reserve_pages(kc, RESERVE_PAGES);
 	if (r)
 		goto bad_client_pages;
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 53089aa1138..9defad04541 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -22,7 +22,6 @@
 #define DM_MSG_PREFIX "raid1"
 
 #define MAX_RECOVERY 1	/* Maximum number of regions recovered in parallel. */
-#define DM_KCOPYD_PAGES 64
 
 #define DM_RAID1_HANDLE_ERRORS 0x01
 #define errors_handled(p)	((p)->features & DM_RAID1_HANDLE_ERRORS)
@@ -1116,7 +1115,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto err_destroy_wq;
 	}
 
-	r = dm_kcopyd_client_create(DM_KCOPYD_PAGES, &ms->kcopyd_client);
+	r = dm_kcopyd_client_create(&ms->kcopyd_client);
 	if (r)
 		goto err_destroy_wq;
 
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index a2d330942cb..5a2296de84a 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -39,11 +39,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
  */
 #define SNAPSHOT_COPY_PRIORITY 2
 
-/*
- * Reserve 1MB for each snapshot initially (with minimum of 1 page).
- */
-#define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1)
-
 /*
  * The size of the mempool used to track chunks in use.
  */
@@ -1116,7 +1111,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_hash_tables;
 	}
 
-	r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
+	r = dm_kcopyd_client_create(&s->kcopyd_client);
 	if (r) {
 		ti->error = "Could not create kcopyd client";
 		goto bad_kcopyd;
diff --git a/include/linux/dm-kcopyd.h b/include/linux/dm-kcopyd.h
index 5db21631169..312513d4741 100644
--- a/include/linux/dm-kcopyd.h
+++ b/include/linux/dm-kcopyd.h
@@ -25,8 +25,7 @@
  * To use kcopyd you must first create a dm_kcopyd_client object.
  */
 struct dm_kcopyd_client;
-int dm_kcopyd_client_create(unsigned num_pages,
-			    struct dm_kcopyd_client **result);
+int dm_kcopyd_client_create(struct dm_kcopyd_client **result);
 void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc);
 
 /*
-- 
cgit v1.2.3-70-g09d2


From fa34ce73072f90ecd90dcc43f29d82e70e5f8676 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 29 May 2011 13:03:13 +0100
Subject: dm kcopyd: return client directly and not through a pointer

Return client directly from dm_kcopyd_client_create, not through a
parameter, making it consistent with dm_io_client_create.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-kcopyd.c    | 9 ++++-----
 drivers/md/dm-raid1.c     | 6 ++++--
 drivers/md/dm-snap.c      | 5 +++--
 include/linux/dm-kcopyd.h | 2 +-
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 579647f8b4d..819e37eaaeb 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -637,14 +637,14 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
 /*-----------------------------------------------------------------
  * Client setup
  *---------------------------------------------------------------*/
-int dm_kcopyd_client_create(struct dm_kcopyd_client **result)
+struct dm_kcopyd_client *dm_kcopyd_client_create(void)
 {
 	int r = -ENOMEM;
 	struct dm_kcopyd_client *kc;
 
 	kc = kmalloc(sizeof(*kc), GFP_KERNEL);
 	if (!kc)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	spin_lock_init(&kc->job_lock);
 	INIT_LIST_HEAD(&kc->complete_jobs);
@@ -676,8 +676,7 @@ int dm_kcopyd_client_create(struct dm_kcopyd_client **result)
 	init_waitqueue_head(&kc->destroyq);
 	atomic_set(&kc->nr_jobs, 0);
 
-	*result = kc;
-	return 0;
+	return kc;
 
 bad_io_client:
 	client_free_pages(kc);
@@ -688,7 +687,7 @@ bad_workqueue:
 bad_slab:
 	kfree(kc);
 
-	return r;
+	return ERR_PTR(r);
 }
 EXPORT_SYMBOL(dm_kcopyd_client_create);
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9defad04541..9bfd057be68 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1115,9 +1115,11 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto err_destroy_wq;
 	}
 
-	r = dm_kcopyd_client_create(&ms->kcopyd_client);
-	if (r)
+	ms->kcopyd_client = dm_kcopyd_client_create();
+	if (IS_ERR(ms->kcopyd_client)) {
+		r = PTR_ERR(ms->kcopyd_client);
 		goto err_destroy_wq;
+	}
 
 	wakeup_mirrord(ms);
 	return 0;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 5a2296de84a..9ecff5f3023 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1111,8 +1111,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_hash_tables;
 	}
 
-	r = dm_kcopyd_client_create(&s->kcopyd_client);
-	if (r) {
+	s->kcopyd_client = dm_kcopyd_client_create();
+	if (IS_ERR(s->kcopyd_client)) {
+		r = PTR_ERR(s->kcopyd_client);
 		ti->error = "Could not create kcopyd client";
 		goto bad_kcopyd;
 	}
diff --git a/include/linux/dm-kcopyd.h b/include/linux/dm-kcopyd.h
index 312513d4741..298d587e349 100644
--- a/include/linux/dm-kcopyd.h
+++ b/include/linux/dm-kcopyd.h
@@ -25,7 +25,7 @@
  * To use kcopyd you must first create a dm_kcopyd_client object.
  */
 struct dm_kcopyd_client;
-int dm_kcopyd_client_create(struct dm_kcopyd_client **result);
+struct dm_kcopyd_client *dm_kcopyd_client_create(void);
 void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc);
 
 /*
-- 
cgit v1.2.3-70-g09d2


From bc658c96037fc87463f0703ad2ea7c895344cb7e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sun, 29 May 2011 10:33:44 +0200
Subject: mm, rmap: Add yet more comments to
 page_get_anon_vma/page_lock_anon_vma

Inspired by an analysis from Hugh on why again all this doesn't explode
in our face.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 6bada99cd61..0eb463ea88d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -352,6 +352,11 @@ void __init anon_vma_init(void)
  * The page might have been remapped to a different anon_vma or the anon_vma
  * returned may already be freed (and even reused).
  *
+ * In case it was remapped to a different anon_vma, the new anon_vma will be a
+ * child of the old anon_vma, and the anon_vma lifetime rules will therefore
+ * ensure that any anon_vma obtained from the page will still be valid for as
+ * long as we observe page_mapped() [ hence all those page_mapped() tests ].
+ *
  * All users of this function must be very careful when walking the anon_vma
  * chain and verify that the page in question is indeed mapped in it
  * [ something equivalent to page_mapped_in_vma() ].
@@ -421,7 +426,7 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
 		/*
 		 * If the page is still mapped, then this anon_vma is still
 		 * its anon_vma, and holding the mutex ensures that it will
-		 * not go away, see __put_anon_vma().
+		 * not go away, see anon_vma_free().
 		 */
 		if (!page_mapped(page)) {
 			mutex_unlock(&root_anon_vma->mutex);
-- 
cgit v1.2.3-70-g09d2


From c4f790736ca8d7d86883c5aee2ba1caa15cd8da3 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Mon, 23 May 2011 21:18:20 -0500
Subject: eCryptfs: Consolidate inode functions into inode.c

These functions should live in inode.c since their focus is on inodes
and they're primarily used by functions in inode.c.

Also does a simple cleanup of ecryptfs_inode_test() and rolls
ecryptfs_init_inode() into ecryptfs_inode_set().

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Tested-by: David <david@unsolicited.net>
---
 fs/ecryptfs/ecryptfs_kernel.h |   9 +---
 fs/ecryptfs/inode.c           | 104 ++++++++++++++++++++++++++++++++++++------
 fs/ecryptfs/main.c            |  69 ----------------------------
 fs/ecryptfs/super.c           |  16 -------
 4 files changed, 91 insertions(+), 107 deletions(-)

diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index e70282775e2..37224b5fb12 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -625,10 +625,8 @@ struct ecryptfs_open_req {
 	struct list_head kthread_ctl_list;
 };
 
-#define ECRYPTFS_INTERPOSE_FLAG_D_ADD                 0x00000001
-int ecryptfs_interpose(struct dentry *hidden_dentry,
-		       struct dentry *this_dentry, struct super_block *sb,
-		       u32 flags);
+struct inode *ecryptfs_get_inode(struct inode *lower_inode,
+				 struct super_block *sb);
 void ecryptfs_i_size_init(const char *page_virt, struct inode *inode);
 int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 					struct dentry *lower_dentry,
@@ -679,9 +677,6 @@ int
 ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 			  unsigned char *src, struct dentry *ecryptfs_dentry);
 int ecryptfs_truncate(struct dentry *dentry, loff_t new_length);
-int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode);
-int ecryptfs_inode_set(struct inode *inode, void *lower_inode);
-void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode);
 ssize_t
 ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
 			void *value, size_t size);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 94ab3c06317..704a8c8fe19 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -51,6 +51,95 @@ static void unlock_dir(struct dentry *dir)
 	dput(dir);
 }
 
+static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
+{
+	if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode)
+		return 1;
+	return 0;
+}
+
+static int ecryptfs_inode_set(struct inode *inode, void *lower_inode)
+{
+	ecryptfs_set_inode_lower(inode, (struct inode *)lower_inode);
+	inode->i_ino = ((struct inode *)lower_inode)->i_ino;
+	inode->i_version++;
+	inode->i_op = &ecryptfs_main_iops;
+	inode->i_fop = &ecryptfs_main_fops;
+	inode->i_mapping->a_ops = &ecryptfs_aops;
+	return 0;
+}
+
+struct inode *ecryptfs_get_inode(struct inode *lower_inode,
+				 struct super_block *sb)
+{
+	struct inode *inode;
+	int rc = 0;
+
+	if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
+		rc = -EXDEV;
+		goto out;
+	}
+	if (!igrab(lower_inode)) {
+		rc = -ESTALE;
+		goto out;
+	}
+	inode = iget5_locked(sb, (unsigned long)lower_inode,
+			     ecryptfs_inode_test, ecryptfs_inode_set,
+			     lower_inode);
+	if (!inode) {
+		rc = -EACCES;
+		iput(lower_inode);
+		goto out;
+	}
+	if (inode->i_state & I_NEW)
+		unlock_new_inode(inode);
+	else
+		iput(lower_inode);
+	if (S_ISLNK(lower_inode->i_mode))
+		inode->i_op = &ecryptfs_symlink_iops;
+	else if (S_ISDIR(lower_inode->i_mode))
+		inode->i_op = &ecryptfs_dir_iops;
+	if (S_ISDIR(lower_inode->i_mode))
+		inode->i_fop = &ecryptfs_dir_fops;
+	if (special_file(lower_inode->i_mode))
+		init_special_inode(inode, lower_inode->i_mode,
+				   lower_inode->i_rdev);
+	fsstack_copy_attr_all(inode, lower_inode);
+	/* This size will be overwritten for real files w/ headers and
+	 * other metadata */
+	fsstack_copy_inode_size(inode, lower_inode);
+	return inode;
+out:
+	return ERR_PTR(rc);
+}
+
+#define ECRYPTFS_INTERPOSE_FLAG_D_ADD                 0x00000001
+/**
+ * ecryptfs_interpose
+ * @lower_dentry: Existing dentry in the lower filesystem
+ * @dentry: ecryptfs' dentry
+ * @sb: ecryptfs's super_block
+ * @flags: flags to govern behavior of interpose procedure
+ *
+ * Interposes upper and lower dentries.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_interpose(struct dentry *lower_dentry,
+			      struct dentry *dentry, struct super_block *sb,
+			      u32 flags)
+{
+	struct inode *lower_inode = lower_dentry->d_inode;
+	struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+	if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
+		d_add(dentry, inode);
+	else
+		d_instantiate(dentry, inode);
+	return 0;
+}
+
 /**
  * ecryptfs_create_underlying_file
  * @lower_dir_inode: inode of the parent in the lower fs of the new file
@@ -1079,21 +1168,6 @@ out:
 	return rc;
 }
 
-int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode)
-{
-	if ((ecryptfs_inode_to_lower(inode)
-	     == (struct inode *)candidate_lower_inode))
-		return 1;
-	else
-		return 0;
-}
-
-int ecryptfs_inode_set(struct inode *inode, void *lower_inode)
-{
-	ecryptfs_init_inode(inode, (struct inode *)lower_inode);
-	return 0;
-}
-
 const struct inode_operations ecryptfs_symlink_iops = {
 	.readlink = ecryptfs_readlink,
 	.follow_link = ecryptfs_follow_link,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 89b93389af8..7c697abab39 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -168,75 +168,6 @@ void ecryptfs_put_lower_file(struct inode *inode)
 	}
 }
 
-static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
-		       struct super_block *sb)
-{
-	struct inode *inode;
-	int rc = 0;
-
-	if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
-		rc = -EXDEV;
-		goto out;
-	}
-	if (!igrab(lower_inode)) {
-		rc = -ESTALE;
-		goto out;
-	}
-	inode = iget5_locked(sb, (unsigned long)lower_inode,
-			     ecryptfs_inode_test, ecryptfs_inode_set,
-			     lower_inode);
-	if (!inode) {
-		rc = -EACCES;
-		iput(lower_inode);
-		goto out;
-	}
-	if (inode->i_state & I_NEW)
-		unlock_new_inode(inode);
-	else
-		iput(lower_inode);
-	if (S_ISLNK(lower_inode->i_mode))
-		inode->i_op = &ecryptfs_symlink_iops;
-	else if (S_ISDIR(lower_inode->i_mode))
-		inode->i_op = &ecryptfs_dir_iops;
-	if (S_ISDIR(lower_inode->i_mode))
-		inode->i_fop = &ecryptfs_dir_fops;
-	if (special_file(lower_inode->i_mode))
-		init_special_inode(inode, lower_inode->i_mode,
-				   lower_inode->i_rdev);
-	fsstack_copy_attr_all(inode, lower_inode);
-	/* This size will be overwritten for real files w/ headers and
-	 * other metadata */
-	fsstack_copy_inode_size(inode, lower_inode);
-	return inode;
-out:
-	return ERR_PTR(rc);
-}
-
-/**
- * ecryptfs_interpose
- * @lower_dentry: Existing dentry in the lower filesystem
- * @dentry: ecryptfs' dentry
- * @sb: ecryptfs's super_block
- * @flags: flags to govern behavior of interpose procedure
- *
- * Interposes upper and lower dentries.
- *
- * Returns zero on success; non-zero otherwise
- */
-int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
-		       struct super_block *sb, u32 flags)
-{
-	struct inode *lower_inode = lower_dentry->d_inode;
-	struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
-	if (IS_ERR(inode))
-		return PTR_ERR(inode);
-	if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
-		d_add(dentry, inode);
-	else
-		d_instantiate(dentry, inode);
-	return 0;
-}
-
 enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
        ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
        ecryptfs_opt_ecryptfs_key_bytes,
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 245b517bf1b..dbd52d40df4 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -92,22 +92,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
 	call_rcu(&inode->i_rcu, ecryptfs_i_callback);
 }
 
-/**
- * ecryptfs_init_inode
- * @inode: The ecryptfs inode
- *
- * Set up the ecryptfs inode.
- */
-void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
-{
-	ecryptfs_set_inode_lower(inode, lower_inode);
-	inode->i_ino = lower_inode->i_ino;
-	inode->i_version++;
-	inode->i_op = &ecryptfs_main_iops;
-	inode->i_fop = &ecryptfs_main_fops;
-	inode->i_mapping->a_ops = &ecryptfs_aops;
-}
-
 /**
  * ecryptfs_statfs
  * @sb: The ecryptfs super block
-- 
cgit v1.2.3-70-g09d2


From 1775bc342c6eacd6304493cbb2e0cda1a0182246 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 20 May 2011 13:47:33 +0200
Subject: NFSv4.1: purge deviceid cache on nfs_free_client

Use the pnfs_layoutdriver_type both as a qualifier for the deviceid,
distinguishing deviceid from different layout types on the server,
and for freeing the layout-driver allocated structure containing the
nfs4_deviceid_node.

[BUG in _deviceid_purge_client]
[layout_driver MUST set free_deviceid_node if using dev-cache]
[let ver < 4.1 compile]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
[removed EXPORT_SYMBOL_GPL(nfs4_deviceid_purge_client)]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/client.c            |  2 ++
 fs/nfs/nfs4filelayout.c    |  7 +++++++
 fs/nfs/nfs4filelayout.h    |  1 +
 fs/nfs/nfs4filelayoutdev.c |  9 +++++----
 fs/nfs/pnfs.h              | 11 +++++++++++
 fs/nfs/pnfs_dev.c          | 47 +++++++++++++++++++++++++++++++++++++++++++---
 6 files changed, 70 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 139be9647d8..b3dc2b88b65 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -290,6 +290,8 @@ static void nfs_free_client(struct nfs_client *clp)
 	if (clp->cl_machine_cred != NULL)
 		put_rpccred(clp->cl_machine_cred);
 
+	nfs4_deviceid_purge_client(clp);
+
 	kfree(clp->cl_hostname);
 	kfree(clp);
 
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 4c67a6f6519..cd289d9b7de 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -862,6 +862,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 	return -ENOMEM;
 }
 
+static void
+filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
+{
+	nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
+}
+
 static struct pnfs_layoutdriver_type filelayout_type = {
 	.id			= LAYOUT_NFSV4_1_FILES,
 	.name			= "LAYOUT_NFSV4_1_FILES",
@@ -874,6 +880,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.commit_pagelist	= filelayout_commit_pagelist,
 	.read_pagelist		= filelayout_read_pagelist,
 	.write_pagelist		= filelayout_write_pagelist,
+	.free_deviceid_node	= filelayout_free_deveiceid_node,
 };
 
 static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 0ace0a2d6d7..cebe01e3795 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -98,6 +98,7 @@ u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
 struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
 					u32 ds_idx);
 extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
+extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 struct nfs4_file_layout_dsaddr *
 get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
 
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index eda4527a57e..5914659c8ec 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -156,7 +156,7 @@ destroy_ds(struct nfs4_pnfs_ds *ds)
 	kfree(ds);
 }
 
-static void
+void
 nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 {
 	struct nfs4_pnfs_ds *ds;
@@ -386,7 +386,9 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 	dsaddr->stripe_indices = stripe_indices;
 	stripe_indices = NULL;
 	dsaddr->ds_num = num;
-	nfs4_init_deviceid_node(&dsaddr->id_node, NFS_SERVER(ino)->nfs_client,
+	nfs4_init_deviceid_node(&dsaddr->id_node,
+				NFS_SERVER(ino)->pnfs_curr_ld,
+				NFS_SERVER(ino)->nfs_client,
 				&pdev->dev_id);
 
 	for (i = 0; i < dsaddr->ds_num; i++) {
@@ -548,8 +550,7 @@ out_free:
 void
 nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 {
-	if (nfs4_put_deviceid_node(&dsaddr->id_node))
-		nfs4_fl_free_deviceid(dsaddr);
+	nfs4_put_deviceid_node(&dsaddr->id_node);
 }
 
 /*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 3831ad04a23..80a5d0e2cc4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -65,6 +65,8 @@ enum {
 	NFS_LAYOUT_DESTROYED,		/* no new use of layout allowed */
 };
 
+struct nfs4_deviceid_node;
+
 /* Per-layout driver specific registration structure */
 struct pnfs_layoutdriver_type {
 	struct list_head pnfs_tblid;
@@ -90,6 +92,8 @@ struct pnfs_layoutdriver_type {
 	 */
 	enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
 	enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
+
+	void (*free_deviceid_node) (struct nfs4_deviceid_node *);
 };
 
 struct pnfs_layout_hdr {
@@ -160,6 +164,7 @@ int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 /* pnfs_dev.c */
 struct nfs4_deviceid_node {
 	struct hlist_node		node;
+	const struct pnfs_layoutdriver_type *ld;
 	const struct nfs_client		*nfs_client;
 	struct nfs4_deviceid		deviceid;
 	atomic_t			ref;
@@ -169,10 +174,12 @@ void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
 struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct nfs_client *, const struct nfs4_deviceid *);
 struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct nfs_client *, const struct nfs4_deviceid *);
 void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
+			     const struct pnfs_layoutdriver_type *,
 			     const struct nfs_client *,
 			     const struct nfs4_deviceid *);
 struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
 bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_deviceid_purge_client(const struct nfs_client *);
 
 static inline int lo_fail_bit(u32 iomode)
 {
@@ -349,6 +356,10 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
 	return 0;
 }
+
+static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl)
+{
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index bf05189a7cb..64a4b85c7db 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -96,11 +96,15 @@ EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
 
 void
 nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
+			const struct pnfs_layoutdriver_type *ld,
 			const struct nfs_client *nfs_client,
 			const struct nfs4_deviceid *id)
 {
+	INIT_HLIST_NODE(&d->node);
+	d->ld = ld;
 	d->nfs_client = nfs_client;
 	d->deviceid = *id;
+	atomic_set(&d->ref, 1);
 }
 EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
 
@@ -108,7 +112,10 @@ EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
  * Uniquely initialize and insert a deviceid node into cache
  *
  * @new new deviceid node
- *      Note that the caller must set up new->nfs_client and new->deviceid
+ *      Note that the caller must set up the following members:
+ *        new->ld
+ *        new->nfs_client
+ *        new->deviceid
  *
  * @ret the inserted node, if none found, otherwise, the found entry.
  */
@@ -125,8 +132,6 @@ nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
 		return d;
 	}
 
-	INIT_HLIST_NODE(&new->node);
-	atomic_set(&new->ref, 1);
 	hash = nfs4_deviceid_hash(&new->deviceid);
 	hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
 	spin_unlock(&nfs4_deviceid_lock);
@@ -151,6 +156,42 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
 	hlist_del_init_rcu(&d->node);
 	spin_unlock(&nfs4_deviceid_lock);
 	synchronize_rcu();
+	d->ld->free_deviceid_node(d);
 	return true;
 }
 EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
+
+static void
+_deviceid_purge_client(const struct nfs_client *clp, long hash)
+{
+	struct nfs4_deviceid_node *d;
+	struct hlist_node *n, *next;
+	HLIST_HEAD(tmp);
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
+		if (d->nfs_client == clp && atomic_read(&d->ref)) {
+			hlist_del_init_rcu(&d->node);
+			hlist_add_head(&d->node, &tmp);
+		}
+	rcu_read_unlock();
+
+	if (hlist_empty(&tmp))
+		return;
+
+	synchronize_rcu();
+	hlist_for_each_entry_safe(d, n, next, &tmp, node)
+		if (atomic_dec_and_test(&d->ref))
+			d->ld->free_deviceid_node(d);
+}
+
+void
+nfs4_deviceid_purge_client(const struct nfs_client *clp)
+{
+	long h;
+
+	spin_lock(&nfs4_deviceid_lock);
+	for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
+		_deviceid_purge_client(clp, h);
+	spin_unlock(&nfs4_deviceid_lock);
+}
-- 
cgit v1.2.3-70-g09d2


From 5ccf92037c7c6e6f28175fd245284923f939259f Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Tue, 24 May 2011 02:16:51 -0500
Subject: eCryptfs: Cleanup inode initialization code

The eCryptfs inode get, initialization, and dentry interposition code
has two separate paths. One is for when dentry interposition is needed
after doing things like a mkdir in the lower filesystem and the other
is needed after a lookup. Unlocking new inodes and doing a d_add() needs
to happen at different times, depending on which type of dentry
interposing is being done.

This patch cleans up the inode get and initialization code paths and
splits them up so that the locking and d_add() differences mentioned
above can be handled appropriately in a later patch.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Tested-by: David <david@unsolicited.net>
---
 fs/ecryptfs/ecryptfs_kernel.h |   3 -
 fs/ecryptfs/inode.c           | 134 ++++++++++++++++++++++--------------------
 2 files changed, 69 insertions(+), 68 deletions(-)

diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 37224b5fb12..41a45323637 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -628,9 +628,6 @@ struct ecryptfs_open_req {
 struct inode *ecryptfs_get_inode(struct inode *lower_inode,
 				 struct super_block *sb);
 void ecryptfs_i_size_init(const char *page_virt, struct inode *inode);
-int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
-					struct dentry *lower_dentry,
-					struct inode *ecryptfs_dir_inode);
 int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
 					 size_t *decrypted_name_size,
 					 struct dentry *ecryptfs_dentry,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 704a8c8fe19..fc7d2b74850 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -58,85 +58,87 @@ static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
 	return 0;
 }
 
-static int ecryptfs_inode_set(struct inode *inode, void *lower_inode)
+static int ecryptfs_inode_set(struct inode *inode, void *opaque)
 {
-	ecryptfs_set_inode_lower(inode, (struct inode *)lower_inode);
-	inode->i_ino = ((struct inode *)lower_inode)->i_ino;
+	struct inode *lower_inode = opaque;
+
+	ecryptfs_set_inode_lower(inode, lower_inode);
+	fsstack_copy_attr_all(inode, lower_inode);
+	/* i_size will be overwritten for encrypted regular files */
+	fsstack_copy_inode_size(inode, lower_inode);
+	inode->i_ino = lower_inode->i_ino;
 	inode->i_version++;
-	inode->i_op = &ecryptfs_main_iops;
-	inode->i_fop = &ecryptfs_main_fops;
 	inode->i_mapping->a_ops = &ecryptfs_aops;
+
+	if (S_ISLNK(inode->i_mode))
+		inode->i_op = &ecryptfs_symlink_iops;
+	else if (S_ISDIR(inode->i_mode))
+		inode->i_op = &ecryptfs_dir_iops;
+	else
+		inode->i_op = &ecryptfs_main_iops;
+
+	if (S_ISDIR(inode->i_mode))
+		inode->i_fop = &ecryptfs_dir_fops;
+	else if (special_file(inode->i_mode))
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+	else
+		inode->i_fop = &ecryptfs_main_fops;
+
 	return 0;
 }
 
-struct inode *ecryptfs_get_inode(struct inode *lower_inode,
-				 struct super_block *sb)
+static struct inode *__ecryptfs_get_inode(struct inode *lower_inode,
+					  struct super_block *sb)
 {
 	struct inode *inode;
-	int rc = 0;
 
-	if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
-		rc = -EXDEV;
-		goto out;
-	}
-	if (!igrab(lower_inode)) {
-		rc = -ESTALE;
-		goto out;
-	}
+	if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb))
+		return ERR_PTR(-EXDEV);
+	if (!igrab(lower_inode))
+		return ERR_PTR(-ESTALE);
 	inode = iget5_locked(sb, (unsigned long)lower_inode,
 			     ecryptfs_inode_test, ecryptfs_inode_set,
 			     lower_inode);
 	if (!inode) {
-		rc = -EACCES;
 		iput(lower_inode);
-		goto out;
+		return ERR_PTR(-EACCES);
 	}
-	if (inode->i_state & I_NEW)
-		unlock_new_inode(inode);
-	else
+	if (!(inode->i_state & I_NEW))
 		iput(lower_inode);
-	if (S_ISLNK(lower_inode->i_mode))
-		inode->i_op = &ecryptfs_symlink_iops;
-	else if (S_ISDIR(lower_inode->i_mode))
-		inode->i_op = &ecryptfs_dir_iops;
-	if (S_ISDIR(lower_inode->i_mode))
-		inode->i_fop = &ecryptfs_dir_fops;
-	if (special_file(lower_inode->i_mode))
-		init_special_inode(inode, lower_inode->i_mode,
-				   lower_inode->i_rdev);
-	fsstack_copy_attr_all(inode, lower_inode);
-	/* This size will be overwritten for real files w/ headers and
-	 * other metadata */
-	fsstack_copy_inode_size(inode, lower_inode);
+
+	return inode;
+}
+
+struct inode *ecryptfs_get_inode(struct inode *lower_inode,
+				 struct super_block *sb)
+{
+	struct inode *inode = __ecryptfs_get_inode(lower_inode, sb);
+
+	if (!IS_ERR(inode) && (inode->i_state & I_NEW))
+		unlock_new_inode(inode);
+
 	return inode;
-out:
-	return ERR_PTR(rc);
 }
 
-#define ECRYPTFS_INTERPOSE_FLAG_D_ADD                 0x00000001
 /**
  * ecryptfs_interpose
  * @lower_dentry: Existing dentry in the lower filesystem
  * @dentry: ecryptfs' dentry
  * @sb: ecryptfs's super_block
- * @flags: flags to govern behavior of interpose procedure
  *
  * Interposes upper and lower dentries.
  *
  * Returns zero on success; non-zero otherwise
  */
 static int ecryptfs_interpose(struct dentry *lower_dentry,
-			      struct dentry *dentry, struct super_block *sb,
-			      u32 flags)
+			      struct dentry *dentry, struct super_block *sb)
 {
-	struct inode *lower_inode = lower_dentry->d_inode;
-	struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
+	struct inode *inode = ecryptfs_get_inode(lower_dentry->d_inode, sb);
+
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
-	if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
-		d_add(dentry, inode);
-	else
-		d_instantiate(dentry, inode);
+	d_instantiate(dentry, inode);
+
 	return 0;
 }
 
@@ -218,7 +220,7 @@ ecryptfs_do_create(struct inode *directory_inode,
 		goto out_lock;
 	}
 	rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
-				directory_inode->i_sb, 0);
+				directory_inode->i_sb);
 	if (rc) {
 		ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
 		goto out_lock;
@@ -305,15 +307,15 @@ out:
 }
 
 /**
- * ecryptfs_lookup_and_interpose_lower - Perform a lookup
+ * ecryptfs_lookup_interpose - Dentry interposition for a lookup
  */
-int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
-					struct dentry *lower_dentry,
-					struct inode *ecryptfs_dir_inode)
+static int ecryptfs_lookup_interpose(struct dentry *ecryptfs_dentry,
+				     struct dentry *lower_dentry,
+				     struct inode *ecryptfs_dir_inode)
 {
 	struct dentry *lower_dir_dentry;
 	struct vfsmount *lower_mnt;
-	struct inode *lower_inode;
+	struct inode *inode, *lower_inode;
 	struct ecryptfs_crypt_stat *crypt_stat;
 	char *page_virt = NULL;
 	int put_lower = 0, rc = 0;
@@ -341,14 +343,16 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 		d_add(ecryptfs_dentry, NULL);
 		goto out;
 	}
-	rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
-				ecryptfs_dir_inode->i_sb,
-				ECRYPTFS_INTERPOSE_FLAG_D_ADD);
-	if (rc) {
+	inode = __ecryptfs_get_inode(lower_inode, ecryptfs_dir_inode->i_sb);
+	if (IS_ERR(inode)) {
+		rc = PTR_ERR(inode);
 		printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
 		       __func__, rc);
 		goto out;
 	}
+	if (inode->i_state & I_NEW)
+		unlock_new_inode(inode);
+	d_add(ecryptfs_dentry, inode);
 	if (S_ISDIR(lower_inode->i_mode))
 		goto out;
 	if (S_ISLNK(lower_inode->i_mode))
@@ -442,12 +446,12 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 		goto out_d_drop;
 	}
 	if (lower_dentry->d_inode)
-		goto lookup_and_interpose;
+		goto interpose;
 	mount_crypt_stat = &ecryptfs_superblock_to_private(
 				ecryptfs_dentry->d_sb)->mount_crypt_stat;
 	if (!(mount_crypt_stat
 	    && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
-		goto lookup_and_interpose;
+		goto interpose;
 	dput(lower_dentry);
 	rc = ecryptfs_encrypt_and_encode_filename(
 		&encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
@@ -470,9 +474,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 				encrypted_and_encoded_name);
 		goto out_d_drop;
 	}
-lookup_and_interpose:
-	rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
-						 ecryptfs_dir_inode);
+interpose:
+	rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry,
+				       ecryptfs_dir_inode);
 	goto out;
 out_d_drop:
 	d_drop(ecryptfs_dentry);
@@ -500,7 +504,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 		      lower_new_dentry);
 	if (rc || !lower_new_dentry->d_inode)
 		goto out_lock;
-	rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
+	rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
 	if (rc)
 		goto out_lock;
 	fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -567,7 +571,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
 	kfree(encoded_symname);
 	if (rc || !lower_dentry->d_inode)
 		goto out_lock;
-	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
 	if (rc)
 		goto out_lock;
 	fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -591,7 +595,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode);
 	if (rc || !lower_dentry->d_inode)
 		goto out;
-	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
 	if (rc)
 		goto out;
 	fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -639,7 +643,7 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev);
 	if (rc || !lower_dentry->d_inode)
 		goto out;
-	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
 	if (rc)
 		goto out;
 	fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
-- 
cgit v1.2.3-70-g09d2


From 1be5683b03a766670b3b629bf6bfeab3ca9239d8 Mon Sep 17 00:00:00 2001
From: Marc Eshel <eshel@almaden.ibm.com>
Date: Sun, 22 May 2011 19:47:09 +0300
Subject: pnfs: CB_NOTIFY_DEVICEID

Note: This functionlaity is incomplete as all layout segments referring to
the 'to be removed device id' need to be reaped, and all in flight I/O drained.

[use be32 res in nfs4_callback_devicenotify]
[use nfs_client to qualify deviceid for cb_notify_deviceid]
[use global deviceid cache for CB_NOTIFY_DEVICEID]
[refactor device cache _lookup_deviceid]
[refactor device cache _find_get_deviceid]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Bug in new global-device-cache code]
[layout_driver MUST set free_deviceid_node if using dev-cache]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/callback.h      | 17 +++++++++
 fs/nfs/callback_proc.c | 47 ++++++++++++++++++++++++
 fs/nfs/callback_xdr.c  | 96 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/nfs/pnfs.h          |  1 +
 fs/nfs/pnfs_dev.c      | 95 +++++++++++++++++++++++++++++++++++++++++--------
 5 files changed, 241 insertions(+), 15 deletions(-)

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 46d93ce7311..b257383bb56 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -167,6 +167,23 @@ extern unsigned nfs4_callback_layoutrecall(
 
 extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
 extern void nfs4_cb_take_slot(struct nfs_client *clp);
+
+struct cb_devicenotifyitem {
+	uint32_t		cbd_notify_type;
+	uint32_t		cbd_layout_type;
+	struct nfs4_deviceid	cbd_dev_id;
+	uint32_t		cbd_immediate;
+};
+
+struct cb_devicenotifyargs {
+	int				 ndevs;
+	struct cb_devicenotifyitem	 *devs;
+};
+
+extern __be32 nfs4_callback_devicenotify(
+	struct cb_devicenotifyargs *args,
+	void *dummy, struct cb_process_state *cps);
+
 #endif /* CONFIG_NFS_V4_1 */
 extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
 extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2f41dccea18..fb5e5b9a97a 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -241,6 +241,53 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
 	do_callback_layoutrecall(clp, &args);
 }
 
+__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
+				  void *dummy, struct cb_process_state *cps)
+{
+	int i;
+	__be32 res = 0;
+	struct nfs_client *clp = cps->clp;
+	struct nfs_server *server = NULL;
+
+	dprintk("%s: -->\n", __func__);
+
+	if (!clp) {
+		res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+		goto out;
+	}
+
+	for (i = 0; i < args->ndevs; i++) {
+		struct cb_devicenotifyitem *dev = &args->devs[i];
+
+		if (!server ||
+		    server->pnfs_curr_ld->id != dev->cbd_layout_type) {
+			rcu_read_lock();
+			list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+				if (server->pnfs_curr_ld &&
+				    server->pnfs_curr_ld->id == dev->cbd_layout_type) {
+					rcu_read_unlock();
+					goto found;
+				}
+			rcu_read_unlock();
+			dprintk("%s: layout type %u not found\n",
+				__func__, dev->cbd_layout_type);
+			continue;
+		}
+
+	found:
+		if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
+			dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
+				"deleting instead\n", __func__);
+		nfs4_delete_deviceid(clp, &dev->cbd_dev_id);
+	}
+
+out:
+	kfree(args->devs);
+	dprintk("%s: exit with status = %u\n",
+		__func__, be32_to_cpu(res));
+	return res;
+}
+
 int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
 {
 	if (delegation == NULL)
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 00ecf62ce7c..c6c86a77e04 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -25,6 +25,7 @@
 
 #if defined(CONFIG_NFS_V4_1)
 #define CB_OP_LAYOUTRECALL_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_DEVICENOTIFY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 #define CB_OP_SEQUENCE_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ + \
 					4 + 1 + 3)
 #define CB_OP_RECALLANY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
@@ -284,6 +285,93 @@ out:
 	return status;
 }
 
+static
+__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct cb_devicenotifyargs *args)
+{
+	__be32 *p;
+	__be32 status = 0;
+	u32 tmp;
+	int n, i;
+	args->ndevs = 0;
+
+	/* Num of device notifications */
+	p = read_buf(xdr, sizeof(uint32_t));
+	if (unlikely(p == NULL)) {
+		status = htonl(NFS4ERR_BADXDR);
+		goto out;
+	}
+	n = ntohl(*p++);
+	if (n <= 0)
+		goto out;
+
+	args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
+	if (!args->devs) {
+		status = htonl(NFS4ERR_DELAY);
+		goto out;
+	}
+
+	/* Decode each dev notification */
+	for (i = 0; i < n; i++) {
+		struct cb_devicenotifyitem *dev = &args->devs[i];
+
+		p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
+		if (unlikely(p == NULL)) {
+			status = htonl(NFS4ERR_BADXDR);
+			goto err;
+		}
+
+		tmp = ntohl(*p++);	/* bitmap size */
+		if (tmp != 1) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+		dev->cbd_notify_type = ntohl(*p++);
+		if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
+		    dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+
+		tmp = ntohl(*p++);	/* opaque size */
+		if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
+		     (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
+		    ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
+		     (tmp != NFS4_DEVICEID4_SIZE + 4))) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+		dev->cbd_layout_type = ntohl(*p++);
+		memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
+		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+		if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
+			p = read_buf(xdr, sizeof(uint32_t));
+			if (unlikely(p == NULL)) {
+				status = htonl(NFS4ERR_BADXDR);
+				goto err;
+			}
+			dev->cbd_immediate = ntohl(*p++);
+		} else {
+			dev->cbd_immediate = 0;
+		}
+
+		args->ndevs++;
+
+		dprintk("%s: type %d layout 0x%x immediate %d\n",
+			__func__, dev->cbd_notify_type, dev->cbd_layout_type,
+			dev->cbd_immediate);
+	}
+out:
+	dprintk("%s: status %d ndevs %d\n",
+		__func__, ntohl(status), args->ndevs);
+	return status;
+err:
+	kfree(args->devs);
+	goto out;
+}
+
 static __be32 decode_sessionid(struct xdr_stream *xdr,
 				 struct nfs4_sessionid *sid)
 {
@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
 	case OP_CB_RECALL_ANY:
 	case OP_CB_RECALL_SLOT:
 	case OP_CB_LAYOUTRECALL:
+	case OP_CB_NOTIFY_DEVICEID:
 		*op = &callback_ops[op_nr];
 		break;
 
-	case OP_CB_NOTIFY_DEVICEID:
 	case OP_CB_NOTIFY:
 	case OP_CB_PUSH_DELEG:
 	case OP_CB_RECALLABLE_OBJ_AVAIL:
@@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = {
 			(callback_decode_arg_t)decode_layoutrecall_args,
 		.res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
 	},
+	[OP_CB_NOTIFY_DEVICEID] = {
+		.process_op = (callback_process_op_t)nfs4_callback_devicenotify,
+		.decode_args =
+			(callback_decode_arg_t)decode_devicenotify_args,
+		.res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
+	},
 	[OP_CB_SEQUENCE] = {
 		.process_op = (callback_process_op_t)nfs4_callback_sequence,
 		.decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 80a5d0e2cc4..fbd3f7cd9e7 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -173,6 +173,7 @@ struct nfs4_deviceid_node {
 void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
 struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct nfs_client *, const struct nfs4_deviceid *);
 struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_delete_deviceid(const struct nfs_client *, const struct nfs4_deviceid *);
 void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
 			     const struct pnfs_layoutdriver_type *,
 			     const struct nfs_client *,
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 64a4b85c7db..8fd3839df29 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -66,33 +66,100 @@ nfs4_deviceid_hash(const struct nfs4_deviceid *id)
 	return x & NFS4_DEVICE_ID_HASH_MASK;
 }
 
+static struct nfs4_deviceid_node *
+_lookup_deviceid(const struct nfs_client *clp, const struct nfs4_deviceid *id,
+		 long hash)
+{
+	struct nfs4_deviceid_node *d;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
+		if (d->nfs_client == clp && !memcmp(&d->deviceid, id, sizeof(*id))) {
+			if (atomic_read(&d->ref))
+				return d;
+			else
+				continue;
+		}
+	return NULL;
+}
+
 /*
  * Lookup a deviceid in cache and get a reference count on it if found
  *
  * @clp nfs_client associated with deviceid
  * @id deviceid to look up
  */
+struct nfs4_deviceid_node *
+_find_get_deviceid(const struct nfs_client *clp, const struct nfs4_deviceid *id,
+		   long hash)
+{
+	struct nfs4_deviceid_node *d;
+
+	rcu_read_lock();
+	d = _lookup_deviceid(clp, id, hash);
+	if (d && !atomic_inc_not_zero(&d->ref))
+		d = NULL;
+	rcu_read_unlock();
+	return d;
+}
+
 struct nfs4_deviceid_node *
 nfs4_find_get_deviceid(const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+	return _find_get_deviceid(clp, id, nfs4_deviceid_hash(id));
+}
+EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
+
+/*
+ * Unhash and put deviceid
+ *
+ * @clp nfs_client associated with deviceid
+ * @id the deviceid to unhash
+ *
+ * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.
+ */
+struct nfs4_deviceid_node *
+nfs4_unhash_put_deviceid(const struct nfs_client *clp, const struct nfs4_deviceid *id)
 {
 	struct nfs4_deviceid_node *d;
-	struct hlist_node *n;
-	long hash = nfs4_deviceid_hash(id);
 
+	spin_lock(&nfs4_deviceid_lock);
 	rcu_read_lock();
-	hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) {
-		if (d->nfs_client == clp && !memcmp(&d->deviceid, id, sizeof(*id))) {
-			if (!atomic_inc_not_zero(&d->ref))
-				goto fail;
-			rcu_read_unlock();
-			return d;
-		}
-	}
-fail:
+	d = _lookup_deviceid(clp, id, nfs4_deviceid_hash(id));
 	rcu_read_unlock();
+	if (!d) {
+		spin_unlock(&nfs4_deviceid_lock);
+		return NULL;
+	}
+	hlist_del_init_rcu(&d->node);
+	spin_unlock(&nfs4_deviceid_lock);
+	synchronize_rcu();
+
+	/* balance the initial ref set in pnfs_insert_deviceid */
+	if (atomic_dec_and_test(&d->ref))
+		return d;
+
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
+EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid);
+
+/*
+ * Delete a deviceid from cache
+ *
+ * @clp struct nfs_client qualifying the deviceid
+ * @id deviceid to delete
+ */
+void
+nfs4_delete_deviceid(const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+	struct nfs4_deviceid_node *d;
+
+	d = nfs4_unhash_put_deviceid(clp, id);
+	if (!d)
+		return;
+	d->ld->free_deviceid_node(d);
+}
+EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
 
 void
 nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
@@ -126,13 +193,13 @@ nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
 	long hash;
 
 	spin_lock(&nfs4_deviceid_lock);
-	d = nfs4_find_get_deviceid(new->nfs_client, &new->deviceid);
+	hash = nfs4_deviceid_hash(&new->deviceid);
+	d = _find_get_deviceid(new->nfs_client, &new->deviceid, hash);
 	if (d) {
 		spin_unlock(&nfs4_deviceid_lock);
 		return d;
 	}
 
-	hash = nfs4_deviceid_hash(&new->deviceid);
 	hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
 	spin_unlock(&nfs4_deviceid_lock);
 
-- 
cgit v1.2.3-70-g09d2


From 35c8bb543c9e83197e6375142d1d1c2ee3cf017d Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Tue, 24 May 2011 18:04:02 +0300
Subject: NFSv4.1: use layout driver in global device cache

pnfs deviceids are unique per server, per layout type.
struct nfs_client is currently used to distinguish deviceids from
different nfs servers, yet these may clash between different layout
types on the same server.  Therefore, use the layout driver associated
with each deviceid at insertion time to look it up, unhash, or
delete it.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/callback_proc.c  |  2 +-
 fs/nfs/nfs4filelayout.c |  3 ++-
 fs/nfs/pnfs.h           |  6 +++---
 fs/nfs/pnfs_dev.c       | 28 +++++++++++++++++-----------
 4 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index fb5e5b9a97a..c73e7b2fb8e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -278,7 +278,7 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
 		if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
 			dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
 				"deleting instead\n", __func__);
-		nfs4_delete_deviceid(clp, &dev->cbd_dev_id);
+		nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
 	}
 
 out:
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index cd289d9b7de..501a9b86b31 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -441,7 +441,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 	}
 
 	/* find and reference the deviceid */
-	d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->nfs_client, id);
+	d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
+				   NFS_SERVER(lo->plh_inode)->nfs_client, id);
 	if (d == NULL) {
 		dsaddr = get_device_info(lo->plh_inode, id, gfp_flags);
 		if (dsaddr == NULL)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index fbd3f7cd9e7..5b083d29533 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -171,9 +171,9 @@ struct nfs4_deviceid_node {
 };
 
 void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
-struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct nfs_client *, const struct nfs4_deviceid *);
-struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct nfs_client *, const struct nfs4_deviceid *);
-void nfs4_delete_deviceid(const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
 void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
 			     const struct pnfs_layoutdriver_type *,
 			     const struct nfs_client *,
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 8fd3839df29..c65e133ce9c 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -67,14 +67,16 @@ nfs4_deviceid_hash(const struct nfs4_deviceid *id)
 }
 
 static struct nfs4_deviceid_node *
-_lookup_deviceid(const struct nfs_client *clp, const struct nfs4_deviceid *id,
+_lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
+		 const struct nfs_client *clp, const struct nfs4_deviceid *id,
 		 long hash)
 {
 	struct nfs4_deviceid_node *d;
 	struct hlist_node *n;
 
 	hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
-		if (d->nfs_client == clp && !memcmp(&d->deviceid, id, sizeof(*id))) {
+		if (d->ld == ld && d->nfs_client == clp &&
+		    !memcmp(&d->deviceid, id, sizeof(*id))) {
 			if (atomic_read(&d->ref))
 				return d;
 			else
@@ -90,13 +92,14 @@ _lookup_deviceid(const struct nfs_client *clp, const struct nfs4_deviceid *id,
  * @id deviceid to look up
  */
 struct nfs4_deviceid_node *
-_find_get_deviceid(const struct nfs_client *clp, const struct nfs4_deviceid *id,
+_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
+		   const struct nfs_client *clp, const struct nfs4_deviceid *id,
 		   long hash)
 {
 	struct nfs4_deviceid_node *d;
 
 	rcu_read_lock();
-	d = _lookup_deviceid(clp, id, hash);
+	d = _lookup_deviceid(ld, clp, id, hash);
 	if (d && !atomic_inc_not_zero(&d->ref))
 		d = NULL;
 	rcu_read_unlock();
@@ -104,9 +107,10 @@ _find_get_deviceid(const struct nfs_client *clp, const struct nfs4_deviceid *id,
 }
 
 struct nfs4_deviceid_node *
-nfs4_find_get_deviceid(const struct nfs_client *clp, const struct nfs4_deviceid *id)
+nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
+		       const struct nfs_client *clp, const struct nfs4_deviceid *id)
 {
-	return _find_get_deviceid(clp, id, nfs4_deviceid_hash(id));
+	return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
 }
 EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
 
@@ -119,13 +123,14 @@ EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
  * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.
  */
 struct nfs4_deviceid_node *
-nfs4_unhash_put_deviceid(const struct nfs_client *clp, const struct nfs4_deviceid *id)
+nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
+			 const struct nfs_client *clp, const struct nfs4_deviceid *id)
 {
 	struct nfs4_deviceid_node *d;
 
 	spin_lock(&nfs4_deviceid_lock);
 	rcu_read_lock();
-	d = _lookup_deviceid(clp, id, nfs4_deviceid_hash(id));
+	d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
 	rcu_read_unlock();
 	if (!d) {
 		spin_unlock(&nfs4_deviceid_lock);
@@ -150,11 +155,12 @@ EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid);
  * @id deviceid to delete
  */
 void
-nfs4_delete_deviceid(const struct nfs_client *clp, const struct nfs4_deviceid *id)
+nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
+		     const struct nfs_client *clp, const struct nfs4_deviceid *id)
 {
 	struct nfs4_deviceid_node *d;
 
-	d = nfs4_unhash_put_deviceid(clp, id);
+	d = nfs4_unhash_put_deviceid(ld, clp, id);
 	if (!d)
 		return;
 	d->ld->free_deviceid_node(d);
@@ -194,7 +200,7 @@ nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
 
 	spin_lock(&nfs4_deviceid_lock);
 	hash = nfs4_deviceid_hash(&new->deviceid);
-	d = _find_get_deviceid(new->nfs_client, &new->deviceid, hash);
+	d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
 	if (d) {
 		spin_unlock(&nfs4_deviceid_lock);
 		return d;
-- 
cgit v1.2.3-70-g09d2


From f7da7a129d57bfe0f74573dc03531c63e1360fae Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Thu, 19 May 2011 14:16:47 -0400
Subject: SUNRPC: introduce xdr_init_decode_pages

Initialize xdr_stream and xdr_buf using an array of page pointers
and length of buffer.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/dir.c               |  9 ++-------
 fs/nfs/nfs4filelayout.c    |  9 ++-------
 fs/nfs/nfs4filelayoutdev.c |  9 ++-------
 include/linux/sunrpc/xdr.h |  2 ++
 net/sunrpc/xdr.c           | 19 +++++++++++++++++++
 5 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7237672216c..f673a9e1d95 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -512,12 +512,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 				struct page **xdr_pages, struct page *page, unsigned int buflen)
 {
 	struct xdr_stream stream;
-	struct xdr_buf buf = {
-		.pages = xdr_pages,
-		.page_len = buflen,
-		.buflen = buflen,
-		.len = buflen,
-	};
+	struct xdr_buf buf;
 	struct page *scratch;
 	struct nfs_cache_array *array;
 	unsigned int count = 0;
@@ -527,7 +522,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 	if (scratch == NULL)
 		return -ENOMEM;
 
-	xdr_init_decode(&stream, &buf, NULL);
+	xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
 	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
 
 	do {
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 501a9b86b31..33bda24e8cd 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -510,12 +510,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
 			 gfp_t gfp_flags)
 {
 	struct xdr_stream stream;
-	struct xdr_buf buf = {
-		.pages =  lgr->layoutp->pages,
-		.page_len =  lgr->layoutp->len,
-		.buflen =  lgr->layoutp->len,
-		.len = lgr->layoutp->len,
-	};
+	struct xdr_buf buf;
 	struct page *scratch;
 	__be32 *p;
 	uint32_t nfl_util;
@@ -527,7 +522,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
 	if (!scratch)
 		return -ENOMEM;
 
-	xdr_init_decode(&stream, &buf, NULL);
+	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
 	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
 
 	/* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 5914659c8ec..3b7bf137726 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -308,12 +308,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 	u8 max_stripe_index;
 	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
 	struct xdr_stream stream;
-	struct xdr_buf buf = {
-		.pages = pdev->pages,
-		.page_len = pdev->pglen,
-		.buflen = pdev->pglen,
-		.len = pdev->pglen,
-	};
+	struct xdr_buf buf;
 	struct page *scratch;
 
 	/* set up xdr stream */
@@ -321,7 +316,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 	if (!scratch)
 		goto out_err;
 
-	xdr_init_decode(&stream, &buf, NULL);
+	xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
 	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
 
 	/* Get the stripe count (number of stripe index) */
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index fc84b7a19ca..a20970ef9e4 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -216,6 +216,8 @@ extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
 extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
 		unsigned int base, unsigned int len);
 extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
+extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
+		struct page **pages, unsigned int len);
 extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen);
 extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
 extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 679cd674b81..f008c14ad34 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -638,6 +638,25 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
 }
 EXPORT_SYMBOL_GPL(xdr_init_decode);
 
+/**
+ * xdr_init_decode - Initialize an xdr_stream for decoding data.
+ * @xdr: pointer to xdr_stream struct
+ * @buf: pointer to XDR buffer from which to decode data
+ * @pages: list of pages to decode into
+ * @len: length in bytes of buffer in pages
+ */
+void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
+			   struct page **pages, unsigned int len)
+{
+	memset(buf, 0, sizeof(*buf));
+	buf->pages =  pages;
+	buf->page_len =  len;
+	buf->buflen =  len;
+	buf->len = len;
+	xdr_init_decode(xdr, buf, NULL);
+}
+EXPORT_SYMBOL_GPL(xdr_init_decode_pages);
+
 static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
 {
 	__be32 *p = xdr->p;
-- 
cgit v1.2.3-70-g09d2


From fb3296eb4636763918edef2d22e45b85b15d4518 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Sun, 22 May 2011 19:47:26 +0300
Subject: pnfs: Use byte-range for layoutget

Add offset and count parameters to pnfs_update_layout and use them to get
the layout in the pageio path.

Order cache layout segments in the following order:
* offset (ascending)
* length (descending)
* iomode (RW before READ)

Test byte range against the layout segment in use in pnfs_{read,write}_pg_test
so not to coalesce pages not using the same layout segment.

[fix lseg ordering]
[clean up pnfs_find_lseg lseg arg]
[remove unnecessary FIXME]
[fix ordering in pnfs_insert_layout]
[clean up pnfs_insert_layout]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/pnfs.c  | 165 +++++++++++++++++++++++++++++++++++++++++++--------------
 fs/nfs/pnfs.h  |   6 ++-
 fs/nfs/read.c  |   8 ++-
 fs/nfs/write.c |   8 ++-
 4 files changed, 142 insertions(+), 45 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f57f5281a52..c2f09e9b670 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -261,6 +261,65 @@ put_lseg(struct pnfs_layout_segment *lseg)
 }
 EXPORT_SYMBOL_GPL(put_lseg);
 
+static inline u64
+end_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	end = start + len;
+	return end >= start ? end : NFS4_MAX_UINT64;
+}
+
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	BUG_ON(!len);
+	end = start + len;
+	return end > start ? end - 1 : NFS4_MAX_UINT64;
+}
+
+/*
+ * is l2 fully contained in l1?
+ *   start1                             end1
+ *   [----------------------------------)
+ *           start2           end2
+ *           [----------------)
+ */
+static inline int
+lo_seg_contained(struct pnfs_layout_range *l1,
+		 struct pnfs_layout_range *l2)
+{
+	u64 start1 = l1->offset;
+	u64 end1 = end_offset(start1, l1->length);
+	u64 start2 = l2->offset;
+	u64 end2 = end_offset(start2, l2->length);
+
+	return (start1 <= start2) && (end1 >= end2);
+}
+
+/*
+ * is l1 and l2 intersecting?
+ *   start1                             end1
+ *   [----------------------------------)
+ *                              start2           end2
+ *                              [----------------)
+ */
+static inline int
+lo_seg_intersecting(struct pnfs_layout_range *l1,
+		    struct pnfs_layout_range *l2)
+{
+	u64 start1 = l1->offset;
+	u64 end1 = end_offset(start1, l1->length);
+	u64 start2 = l2->offset;
+	u64 end2 = end_offset(start2, l2->length);
+
+	return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
+	       (end2 == NFS4_MAX_UINT64 || end2 > start1);
+}
+
 static bool
 should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
 {
@@ -467,7 +526,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 static struct pnfs_layout_segment *
 send_layoutget(struct pnfs_layout_hdr *lo,
 	   struct nfs_open_context *ctx,
-	   u32 iomode,
+	   struct pnfs_layout_range *range,
 	   gfp_t gfp_flags)
 {
 	struct inode *ino = lo->plh_inode;
@@ -499,11 +558,11 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 			goto out_err_free;
 	}
 
-	lgp->args.minlength = NFS4_MAX_UINT64;
+	lgp->args.minlength = PAGE_CACHE_SIZE;
+	if (lgp->args.minlength > range->length)
+		lgp->args.minlength = range->length;
 	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
-	lgp->args.range.iomode = iomode;
-	lgp->args.range.offset = 0;
-	lgp->args.range.length = NFS4_MAX_UINT64;
+	lgp->args.range = *range;
 	lgp->args.type = server->pnfs_curr_ld->id;
 	lgp->args.inode = ino;
 	lgp->args.ctx = get_nfs_open_context(ctx);
@@ -518,7 +577,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	nfs4_proc_layoutget(lgp);
 	if (!lseg) {
 		/* remember that LAYOUTGET failed and suspend trying */
-		set_bit(lo_fail_bit(iomode), &lo->plh_flags);
+		set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
 	}
 
 	/* free xdr pages */
@@ -625,10 +684,23 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
  * are seen first.
  */
 static s64
-cmp_layout(u32 iomode1, u32 iomode2)
+cmp_layout(struct pnfs_layout_range *l1,
+	   struct pnfs_layout_range *l2)
 {
+	s64 d;
+
+	/* high offset > low offset */
+	d = l1->offset - l2->offset;
+	if (d)
+		return d;
+
+	/* short length > long length */
+	d = l2->length - l1->length;
+	if (d)
+		return d;
+
 	/* read > read/write */
-	return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
+	return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
 }
 
 static void
@@ -636,13 +708,12 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
 		   struct pnfs_layout_segment *lseg)
 {
 	struct pnfs_layout_segment *lp;
-	int found = 0;
 
 	dprintk("%s:Begin\n", __func__);
 
 	assert_spin_locked(&lo->plh_inode->i_lock);
 	list_for_each_entry(lp, &lo->plh_segs, pls_list) {
-		if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
+		if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
 			continue;
 		list_add_tail(&lseg->pls_list, &lp->pls_list);
 		dprintk("%s: inserted lseg %p "
@@ -652,16 +723,14 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
 			lseg->pls_range.offset, lseg->pls_range.length,
 			lp, lp->pls_range.iomode, lp->pls_range.offset,
 			lp->pls_range.length);
-		found = 1;
-		break;
-	}
-	if (!found) {
-		list_add_tail(&lseg->pls_list, &lo->plh_segs);
-		dprintk("%s: inserted lseg %p "
-			"iomode %d offset %llu length %llu at tail\n",
-			__func__, lseg, lseg->pls_range.iomode,
-			lseg->pls_range.offset, lseg->pls_range.length);
+		goto out;
 	}
+	list_add_tail(&lseg->pls_list, &lo->plh_segs);
+	dprintk("%s: inserted lseg %p "
+		"iomode %d offset %llu length %llu at tail\n",
+		__func__, lseg, lseg->pls_range.iomode,
+		lseg->pls_range.offset, lseg->pls_range.length);
+out:
 	get_layout_hdr(lo);
 
 	dprintk("%s:Return\n", __func__);
@@ -721,16 +790,28 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
  * READ		RW	true
  */
 static int
-is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
+is_matching_lseg(struct pnfs_layout_range *ls_range,
+		 struct pnfs_layout_range *range)
 {
-	return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
+	struct pnfs_layout_range range1;
+
+	if ((range->iomode == IOMODE_RW &&
+	     ls_range->iomode != IOMODE_RW) ||
+	    !lo_seg_intersecting(ls_range, range))
+		return 0;
+
+	/* range1 covers only the first byte in the range */
+	range1 = *range;
+	range1.length = 1;
+	return lo_seg_contained(ls_range, &range1);
 }
 
 /*
  * lookup range in layout
  */
 static struct pnfs_layout_segment *
-pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
+pnfs_find_lseg(struct pnfs_layout_hdr *lo,
+		struct pnfs_layout_range *range)
 {
 	struct pnfs_layout_segment *lseg, *ret = NULL;
 
@@ -739,11 +820,11 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
 	assert_spin_locked(&lo->plh_inode->i_lock);
 	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
 		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
-		    is_matching_lseg(lseg, iomode)) {
+		    is_matching_lseg(&lseg->pls_range, range)) {
 			ret = get_lseg(lseg);
 			break;
 		}
-		if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
+		if (cmp_layout(range, &lseg->pls_range) > 0)
 			break;
 	}
 
@@ -759,9 +840,16 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
 struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino,
 		   struct nfs_open_context *ctx,
+		   loff_t pos,
+		   u64 count,
 		   enum pnfs_iomode iomode,
 		   gfp_t gfp_flags)
 {
+	struct pnfs_layout_range arg = {
+		.iomode = iomode,
+		.offset = pos,
+		.length = count,
+	};
 	struct nfs_inode *nfsi = NFS_I(ino);
 	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
 	struct pnfs_layout_hdr *lo;
@@ -789,7 +877,7 @@ pnfs_update_layout(struct inode *ino,
 		goto out_unlock;
 
 	/* Check to see if the layout for the given range already exists */
-	lseg = pnfs_find_lseg(lo, iomode);
+	lseg = pnfs_find_lseg(lo, &arg);
 	if (lseg)
 		goto out_unlock;
 
@@ -811,7 +899,7 @@ pnfs_update_layout(struct inode *ino,
 		spin_unlock(&clp->cl_lock);
 	}
 
-	lseg = send_layoutget(lo, ctx, iomode, gfp_flags);
+	lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
 	if (!lseg && first) {
 		spin_lock(&clp->cl_lock);
 		list_del_init(&lo->plh_layouts);
@@ -838,17 +926,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
 	int status = 0;
 
-	/* Verify we got what we asked for.
-	 * Note that because the xdr parsing only accepts a single
-	 * element array, this can fail even if the server is behaving
-	 * correctly.
-	 */
-	if (lgp->args.range.iomode > res->range.iomode ||
-	    res->range.offset != 0 ||
-	    res->range.length != NFS4_MAX_UINT64) {
-		status = -EINVAL;
-		goto out;
-	}
 	/* Inject layout blob into I/O device driver */
 	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
 	if (!lseg || IS_ERR(lseg)) {
@@ -903,9 +980,14 @@ static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
 		/* This is first coelesce call for a series of nfs_pages */
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 						   prev->wb_context,
+						   req_offset(req),
+						   pgio->pg_count,
 						   IOMODE_READ,
 						   GFP_KERNEL);
-	}
+	} else if (pgio->pg_lseg &&
+		   req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
+						pgio->pg_lseg->pls_range.length))
+		return 0;
 	return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
 }
 
@@ -926,9 +1008,14 @@ static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
 		/* This is first coelesce call for a series of nfs_pages */
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 						   prev->wb_context,
+						   req_offset(req),
+						   pgio->pg_count,
 						   IOMODE_RW,
 						   GFP_NOFS);
-	}
+	} else if (pgio->pg_lseg &&
+		   req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
+						pgio->pg_lseg->pls_range.length))
+		return 0;
 	return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
 }
 
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 5b083d29533..78f8a4a171b 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -131,7 +131,8 @@ void get_layout_hdr(struct pnfs_layout_hdr *lo);
 void put_lseg(struct pnfs_layout_segment *lseg);
 struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
-		   enum pnfs_iomode access_type, gfp_t gfp_flags);
+		   loff_t pos, u64 count, enum pnfs_iomode access_type,
+		   gfp_t gfp_flags);
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
@@ -271,7 +272,8 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)
 
 static inline struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
-		   enum pnfs_iomode access_type, gfp_t gfp_flags)
+		   loff_t pos, u64 count, enum pnfs_iomode access_type,
+		   gfp_t gfp_flags)
 {
 	return NULL;
 }
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2bcf0dc306a..540c8bc93f9 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -288,7 +288,9 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
 	atomic_set(&req->wb_complete, requests);
 
 	BUG_ON(desc->pg_lseg != NULL);
-	lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL);
+	lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+				  req_offset(req), desc->pg_count,
+				  IOMODE_READ, GFP_KERNEL);
 	ClearPageError(page);
 	offset = 0;
 	nbytes = desc->pg_count;
@@ -351,7 +353,9 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
 	}
 	req = nfs_list_entry(data->pages.next);
 	if ((!lseg) && list_is_singular(&data->pages))
-		lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL);
+		lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+					  req_offset(req), desc->pg_count,
+					  IOMODE_READ, GFP_KERNEL);
 
 	ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
 				0, lseg);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 49c715b4ac9..7edb72f27c2 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -939,7 +939,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
 	atomic_set(&req->wb_complete, requests);
 
 	BUG_ON(desc->pg_lseg);
-	lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS);
+	lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+				  req_offset(req), desc->pg_count,
+				  IOMODE_RW, GFP_NOFS);
 	ClearPageError(page);
 	offset = 0;
 	nbytes = desc->pg_count;
@@ -1013,7 +1015,9 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
 	}
 	req = nfs_list_entry(data->pages.next);
 	if ((!lseg) && list_is_singular(&data->pages))
-		lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS);
+		lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+					  req_offset(req), desc->pg_count,
+					  IOMODE_RW, GFP_NOFS);
 
 	if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
 	    (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
-- 
cgit v1.2.3-70-g09d2


From 707ed5fdb587c71fdb7ad224ba1d80231f33c974 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Sun, 22 May 2011 19:47:46 +0300
Subject: pnfs: align layoutget requests on page boundaries

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/pnfs.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c2f09e9b670..2357ee343f4 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -850,6 +850,7 @@ pnfs_update_layout(struct inode *ino,
 		.offset = pos,
 		.length = count,
 	};
+	unsigned pg_offset;
 	struct nfs_inode *nfsi = NFS_I(ino);
 	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
 	struct pnfs_layout_hdr *lo;
@@ -899,6 +900,13 @@ pnfs_update_layout(struct inode *ino,
 		spin_unlock(&clp->cl_lock);
 	}
 
+	pg_offset = arg.offset & ~PAGE_CACHE_MASK;
+	if (pg_offset) {
+		arg.offset -= pg_offset;
+		arg.length += pg_offset;
+	}
+	arg.length = PAGE_CACHE_ALIGN(arg.length);
+
 	lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
 	if (!lseg && first) {
 		spin_lock(&clp->cl_lock);
-- 
cgit v1.2.3-70-g09d2


From 778b5502fdba5b183553f3f2ef1672ba78ac58b6 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Sun, 22 May 2011 19:48:02 +0300
Subject: pnfs: Use byte-range for cb_layoutrecall

Use recalled range to invalidate particular layout segments in the layout cache.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/callback_proc.c |  4 ++--
 fs/nfs/pnfs.c          | 15 +++++++++------
 fs/nfs/pnfs.h          |  2 +-
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c73e7b2fb8e..d4d1954e9bb 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	spin_lock(&ino->i_lock);
 	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
 	    mark_matching_lsegs_invalid(lo, &free_me_list,
-					args->cbl_range.iomode))
+					&args->cbl_range))
 		rv = NFS4ERR_DELAY;
 	else
 		rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -184,7 +184,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
 		ino = lo->plh_inode;
 		spin_lock(&ino->i_lock);
 		set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
-		if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
+		if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
 			rv = NFS4ERR_DELAY;
 		list_del_init(&lo->plh_bulk_recall);
 		spin_unlock(&ino->i_lock);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2357ee343f4..20436a5e76c 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -321,10 +321,12 @@ lo_seg_intersecting(struct pnfs_layout_range *l1,
 }
 
 static bool
-should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
+should_free_lseg(struct pnfs_layout_range *lseg_range,
+		 struct pnfs_layout_range *recall_range)
 {
-	return (recall_iomode == IOMODE_ANY ||
-		lseg_iomode == recall_iomode);
+	return (recall_range->iomode == IOMODE_ANY ||
+		lseg_range->iomode == recall_range->iomode) &&
+	       lo_seg_intersecting(lseg_range, recall_range);
 }
 
 /* Returns 1 if lseg is removed from list, 0 otherwise */
@@ -355,7 +357,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
 int
 mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 			    struct list_head *tmp_list,
-			    u32 iomode)
+			    struct pnfs_layout_range *recall_range)
 {
 	struct pnfs_layout_segment *lseg, *next;
 	int invalid = 0, removed = 0;
@@ -368,7 +370,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 		return 0;
 	}
 	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
-		if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
+		if (!recall_range ||
+		    should_free_lseg(&lseg->pls_range, recall_range)) {
 			dprintk("%s: freeing lseg %p iomode %d "
 				"offset %llu length %llu\n", __func__,
 				lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
@@ -417,7 +420,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
 	lo = nfsi->layout;
 	if (lo) {
 		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
-		mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
+		mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
 	}
 	spin_unlock(&nfsi->vfs_inode.i_lock);
 	pnfs_free_lseg_list(&tmp_list);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 78f8a4a171b..f37ab3539cb 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -154,7 +154,7 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
 				  struct nfs4_state *open_state);
 int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 				struct list_head *tmp_list,
-				u32 iomode);
+				struct pnfs_layout_range *recall_range);
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
-- 
cgit v1.2.3-70-g09d2


From ae50c0b5c6f6fa340f1fe2d244b358145f7e5a15 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Sun, 22 May 2011 19:48:21 +0300
Subject: pnfs: client stats

A pNFS client auto-negotiates a lot of features (minorversion level,
pNFS layout type, etc.).  This is convenient, but makes certain kinds of
failures hard for a user to detect.

For example, if the client falls back on 4.0, or falls back to MDS IO
because the user didn't connect to the right iscsi disks before
mounting, the only symptoms may be reduced performance, which may not be
noticed till long after the actual failure, and may be difficult for a
user to diagnose.

However, such "failures" may also be perfectly normal in some cases, so
we don't want to spam the system logs with them.

One approach would be to put some more information into
/proc/self/mountstats.

Signed-off-by: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[pnfs: add commit client stats]
[fixup data types for "ret" variables in pnfs_try_to* inline funcs.]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[fix definition of show_pnfs for !CONFIG_PNFS]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfs41: Fix show_sessions in the not CONFIG_NFS_V4_1 case]
    There is a build error when CONFIG_NFS_V4 is set but
    CONFIG_NFS_V4_1 is *not* set. show_sessions() prototype
    was unbalanced between the two cases.
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
[pnfs: super.c remove CONFIG_PNFS]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/super.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e288f06d3fa..ce40e5c568b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -63,6 +63,7 @@
 #include "iostat.h"
 #include "internal.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
@@ -732,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 
 	return 0;
 }
+#ifdef CONFIG_NFS_V4_1
+void show_sessions(struct seq_file *m, struct nfs_server *server)
+{
+	if (nfs4_has_session(server->nfs_client))
+		seq_printf(m, ",sessions");
+}
+#else
+void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+#endif
+
+#ifdef CONFIG_NFS_V4_1
+void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+	seq_printf(m, ",pnfs=");
+	if (server->pnfs_curr_ld)
+		seq_printf(m, "%s", server->pnfs_curr_ld->name);
+	else
+		seq_printf(m, "not configured");
+}
+#else  /* CONFIG_NFS_V4_1 */
+void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
+#endif /* CONFIG_NFS_V4_1 */
 
 static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
 {
@@ -792,6 +815,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
 		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
 		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
 		seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+		show_sessions(m, nfss);
+		show_pnfs(m, nfss);
 	}
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From c93407d03c3ccf60b33a309e5fcd41cd98969901 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Sun, 22 May 2011 19:49:06 +0300
Subject: pnfs-obj: objlayoutdriver module skeleton

* Define the PNFS_OBJLAYOUT Kconfig option in the nfs
  master Kconfig file.
* Add the objlayout driver to the Kernel's Kbuild system.
* Add the fs/nfs/objlayout/Kbuild file for building the
  objlayoutdriver.ko driver
* Define fs/nfs/objlayout/objio_osd.c, register the driver on module
  initialization and unregister on exit.

[pnfs-obj: remove of CONFIG_PNFS fallout]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
[added "unsure" clause]
[depend on NFS_V4_1]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/Kconfig               | 10 ++++++
 fs/nfs/Makefile              |  2 ++
 fs/nfs/objlayout/Kbuild      |  5 +++
 fs/nfs/objlayout/objio_osd.c | 76 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 93 insertions(+)
 create mode 100644 fs/nfs/objlayout/Kbuild
 create mode 100644 fs/nfs/objlayout/objio_osd.c

diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index ba306658a6d..81515545ba7 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -87,6 +87,16 @@ config NFS_V4_1
 config PNFS_FILE_LAYOUT
 	tristate
 
+config PNFS_OBJLAYOUT
+	tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
+	depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
+	help
+	  Say M here if you want your pNFS client to support the Objects Layout Driver.
+	  Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
+	  upper level driver (SCSI_OSD_ULD).
+
+	  If unsure, say N.
+
 config ROOT_NFS
 	bool "Root file system on NFS"
 	depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 7516a8aaa42..6a34f7dd0e6 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -21,3 +21,5 @@ nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
 
 obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
 nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
+
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
new file mode 100644
index 00000000000..2e5b9a44773
--- /dev/null
+++ b/fs/nfs/objlayout/Kbuild
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS Objects Layout Driver kernel module
+#
+objlayoutdriver-y := objio_osd.o
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
new file mode 100644
index 00000000000..379595f2d1c
--- /dev/null
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -0,0 +1,76 @@
+/*
+ *  pNFS Objects layout implementation over open-osd initiator library
+ *
+ *  Copyright (C) 2009 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include "../pnfs.h"
+
+static struct pnfs_layoutdriver_type objlayout_type = {
+	.id = LAYOUT_OSD2_OBJECTS,
+	.name = "LAYOUT_OSD2_OBJECTS",
+};
+
+MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
+MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
+MODULE_LICENSE("GPL");
+
+static int __init
+objlayout_init(void)
+{
+	int ret = pnfs_register_layoutdriver(&objlayout_type);
+
+	if (ret)
+		printk(KERN_INFO
+			"%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
+			__func__, ret);
+	else
+		printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
+			__func__);
+	return ret;
+}
+
+static void __exit
+objlayout_exit(void)
+{
+	pnfs_unregister_layoutdriver(&objlayout_type);
+	printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
+	       __func__);
+}
+
+module_init(objlayout_init);
+module_exit(objlayout_exit);
-- 
cgit v1.2.3-70-g09d2


From 38b7c401f6ade50543f246c4bc2c971edf2b19dd Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Sun, 22 May 2011 19:49:32 +0300
Subject: pnfs-obj: pnfs_osd XDR definitions

* Add the pnfs_osd_xdr.h header

* defintions the pnfs_osd_layout structure including all it's
  sub-types and constants.
* Declare the pnfs_osd_xdr_decode_layout API + all needed
  inline helpers.

* Define the pnfs_osd_deviceaddr structure and all its subtypes and
  constants.
* Declare API for decoding of a pnfs_osd_deviceaddr from XDR stream.

* Define the pnfs_osd_ioerr structure, its substructures and constants.
* Declare API for encoding of a pnfs_osd_ioerr into XDR stream.

* Define the pnfs_osd_layoutupdate structure and its substructures.
* Declare API for encoding of a pnfs_osd_layoutupdate into XDR stream.

[Remove server definitions]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 include/linux/pnfs_osd_xdr.h | 345 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 345 insertions(+)
 create mode 100644 include/linux/pnfs_osd_xdr.h

diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h
new file mode 100644
index 00000000000..76efbdd0162
--- /dev/null
+++ b/include/linux/pnfs_osd_xdr.h
@@ -0,0 +1,345 @@
+/*
+ *  pNFS-osd on-the-wire data structures
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __PNFS_OSD_XDR_H__
+#define __PNFS_OSD_XDR_H__
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <scsi/osd_protocol.h>
+
+#define PNFS_OSD_OSDNAME_MAXSIZE 256
+
+/*
+ * draft-ietf-nfsv4-minorversion-22
+ * draft-ietf-nfsv4-pnfs-obj-12
+ */
+
+/* Layout Structure */
+
+enum pnfs_osd_raid_algorithm4 {
+	PNFS_OSD_RAID_0		= 1,
+	PNFS_OSD_RAID_4		= 2,
+	PNFS_OSD_RAID_5		= 3,
+	PNFS_OSD_RAID_PQ	= 4     /* Reed-Solomon P+Q */
+};
+
+/*   struct pnfs_osd_data_map4 {
+ *       uint32_t                    odm_num_comps;
+ *       length4                     odm_stripe_unit;
+ *       uint32_t                    odm_group_width;
+ *       uint32_t                    odm_group_depth;
+ *       uint32_t                    odm_mirror_cnt;
+ *       pnfs_osd_raid_algorithm4    odm_raid_algorithm;
+ *   };
+ */
+struct pnfs_osd_data_map {
+	u32	odm_num_comps;
+	u64	odm_stripe_unit;
+	u32	odm_group_width;
+	u32	odm_group_depth;
+	u32	odm_mirror_cnt;
+	u32	odm_raid_algorithm;
+};
+
+/*   struct pnfs_osd_objid4 {
+ *       deviceid4       oid_device_id;
+ *       uint64_t        oid_partition_id;
+ *       uint64_t        oid_object_id;
+ *   };
+ */
+struct pnfs_osd_objid {
+	struct nfs4_deviceid	oid_device_id;
+	u64			oid_partition_id;
+	u64			oid_object_id;
+};
+
+/* For printout. I use:
+ * kprint("dev(%llx:%llx)", _DEVID_LO(pointer), _DEVID_HI(pointer));
+ * BE style
+ */
+#define _DEVID_LO(oid_device_id) \
+	(unsigned long long)be64_to_cpup((__be64 *)(oid_device_id)->data)
+
+#define _DEVID_HI(oid_device_id) \
+	(unsigned long long)be64_to_cpup(((__be64 *)(oid_device_id)->data) + 1)
+
+static inline int
+pnfs_osd_objid_xdr_sz(void)
+{
+	return (NFS4_DEVICEID4_SIZE / 4) + 2 + 2;
+}
+
+enum pnfs_osd_version {
+	PNFS_OSD_MISSING              = 0,
+	PNFS_OSD_VERSION_1            = 1,
+	PNFS_OSD_VERSION_2            = 2
+};
+
+struct pnfs_osd_opaque_cred {
+	u32 cred_len;
+	void *cred;
+};
+
+enum pnfs_osd_cap_key_sec {
+	PNFS_OSD_CAP_KEY_SEC_NONE     = 0,
+	PNFS_OSD_CAP_KEY_SEC_SSV      = 1,
+};
+
+/*   struct pnfs_osd_object_cred4 {
+ *       pnfs_osd_objid4         oc_object_id;
+ *       pnfs_osd_version4       oc_osd_version;
+ *       pnfs_osd_cap_key_sec4   oc_cap_key_sec;
+ *       opaque                  oc_capability_key<>;
+ *       opaque                  oc_capability<>;
+ *   };
+ */
+struct pnfs_osd_object_cred {
+	struct pnfs_osd_objid		oc_object_id;
+	u32				oc_osd_version;
+	u32				oc_cap_key_sec;
+	struct pnfs_osd_opaque_cred	oc_cap_key;
+	struct pnfs_osd_opaque_cred	oc_cap;
+};
+
+/*   struct pnfs_osd_layout4 {
+ *       pnfs_osd_data_map4      olo_map;
+ *       uint32_t                olo_comps_index;
+ *       pnfs_osd_object_cred4   olo_components<>;
+ *   };
+ */
+struct pnfs_osd_layout {
+	struct pnfs_osd_data_map	olo_map;
+	u32				olo_comps_index;
+	u32				olo_num_comps;
+	struct pnfs_osd_object_cred	*olo_comps;
+};
+
+/* Device Address */
+enum pnfs_osd_targetid_type {
+	OBJ_TARGET_ANON = 1,
+	OBJ_TARGET_SCSI_NAME = 2,
+	OBJ_TARGET_SCSI_DEVICE_ID = 3,
+};
+
+/*   union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) {
+ *       case OBJ_TARGET_SCSI_NAME:
+ *           string              oti_scsi_name<>;
+ *
+ *       case OBJ_TARGET_SCSI_DEVICE_ID:
+ *           opaque              oti_scsi_device_id<>;
+ *
+ *       default:
+ *           void;
+ *   };
+ *
+ *   union pnfs_osd_targetaddr4 switch (bool ota_available) {
+ *       case TRUE:
+ *           netaddr4            ota_netaddr;
+ *       case FALSE:
+ *           void;
+ *   };
+ *
+ *   struct pnfs_osd_deviceaddr4 {
+ *       pnfs_osd_targetid4      oda_targetid;
+ *       pnfs_osd_targetaddr4    oda_targetaddr;
+ *       uint64_t                oda_lun;
+ *       opaque                  oda_systemid<>;
+ *       pnfs_osd_object_cred4   oda_root_obj_cred;
+ *       opaque                  oda_osdname<>;
+ *   };
+ */
+struct pnfs_osd_targetid {
+	u32				oti_type;
+	struct nfs4_string		oti_scsi_device_id;
+};
+
+enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 };
+
+/*   struct netaddr4 {
+ *       // see struct rpcb in RFC1833
+ *       string r_netid<>;    // network id
+ *       string r_addr<>;     // universal address
+ *   };
+ */
+struct pnfs_osd_net_addr {
+	struct nfs4_string	r_netid;
+	struct nfs4_string	r_addr;
+};
+
+struct pnfs_osd_targetaddr {
+	u32				ota_available;
+	struct pnfs_osd_net_addr	ota_netaddr;
+};
+
+enum {
+	NETWORK_ID_MAX = 16 / 4,
+	UNIVERSAL_ADDRESS_MAX = 64 / 4,
+	PNFS_OSD_TARGETADDR_MAX = 3 +  NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX,
+};
+
+struct pnfs_osd_deviceaddr {
+	struct pnfs_osd_targetid	oda_targetid;
+	struct pnfs_osd_targetaddr	oda_targetaddr;
+	u8				oda_lun[8];
+	struct nfs4_string		oda_systemid;
+	struct pnfs_osd_object_cred	oda_root_obj_cred;
+	struct nfs4_string		oda_osdname;
+};
+
+enum {
+	ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4,
+	PNFS_OSD_DEVICEADDR_MAX =
+		PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX +
+		2 /*oda_lun*/ +
+		1 + OSD_SYSTEMID_LEN +
+		1 + ODA_OSDNAME_MAX,
+};
+
+/* LAYOUTCOMMIT: layoutupdate */
+
+/*   union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) {
+ *       case TRUE:
+ *           int64_t     dsu_delta;
+ *       case FALSE:
+ *           void;
+ *   };
+ *
+ *   struct pnfs_osd_layoutupdate4 {
+ *       pnfs_osd_deltaspaceused4    olu_delta_space_used;
+ *       bool                        olu_ioerr_flag;
+ *   };
+ */
+struct pnfs_osd_layoutupdate {
+	u32	dsu_valid;
+	s64	dsu_delta;
+	u32	olu_ioerr_flag;
+};
+
+/* LAYOUTRETURN: I/O Rrror Report */
+
+enum pnfs_osd_errno {
+	PNFS_OSD_ERR_EIO		= 1,
+	PNFS_OSD_ERR_NOT_FOUND		= 2,
+	PNFS_OSD_ERR_NO_SPACE		= 3,
+	PNFS_OSD_ERR_BAD_CRED		= 4,
+	PNFS_OSD_ERR_NO_ACCESS		= 5,
+	PNFS_OSD_ERR_UNREACHABLE	= 6,
+	PNFS_OSD_ERR_RESOURCE		= 7
+};
+
+/*   struct pnfs_osd_ioerr4 {
+ *       pnfs_osd_objid4     oer_component;
+ *       length4             oer_comp_offset;
+ *       length4             oer_comp_length;
+ *       bool                oer_iswrite;
+ *       pnfs_osd_errno4     oer_errno;
+ *   };
+ */
+struct pnfs_osd_ioerr {
+	struct pnfs_osd_objid	oer_component;
+	u64			oer_comp_offset;
+	u64			oer_comp_length;
+	u32			oer_iswrite;
+	u32			oer_errno;
+};
+
+/* OSD XDR API */
+/* Layout helpers */
+/* Layout decoding is done in two parts:
+ * 1. First Call pnfs_osd_xdr_decode_layout_map to read in only the header part
+ *    of the layout. @iter members need not be initialized.
+ *    Returned:
+ *             @layout members are set. (@layout->olo_comps set to NULL).
+ *
+ *             Zero on success, or negative error if passed xdr is broken.
+ *
+ * 2. 2nd Call pnfs_osd_xdr_decode_layout_comp() in a loop until it returns
+ *    false, to decode the next component.
+ *    Returned:
+ *       true if there is more to decode or false if we are done or error.
+ *
+ * Example:
+ *	struct pnfs_osd_xdr_decode_layout_iter iter;
+ *	struct pnfs_osd_layout layout;
+ *	struct pnfs_osd_object_cred comp;
+ *	int status;
+ *
+ *	status = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
+ *	if (unlikely(status))
+ *		goto err;
+ *	while(pnfs_osd_xdr_decode_layout_comp(&comp, &iter, xdr, &status)) {
+ *		// All of @comp strings point to inside the xdr_buffer
+ *		// or scrach buffer. Copy them out to user memory eg.
+ *		copy_single_comp(dest_comp++, &comp);
+ *	}
+ *	if (unlikely(status))
+ *		goto err;
+ */
+
+struct pnfs_osd_xdr_decode_layout_iter {
+	unsigned total_comps;
+	unsigned decoded_comps;
+};
+
+extern int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
+	struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr);
+
+extern bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
+	struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
+	int *err);
+
+/* Device Info helpers */
+
+/* Note: All strings inside @deviceaddr point to space inside @p.
+ * @p should stay valid while @deviceaddr is in use.
+ */
+extern void pnfs_osd_xdr_decode_deviceaddr(
+	struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p);
+
+/* layoutupdate (layout_commit) xdr helpers */
+extern int
+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
+				 struct pnfs_osd_layoutupdate *lou);
+
+/* osd_ioerror encoding/decoding (layout_return) */
+/* Client */
+extern __be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr);
+extern void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr);
+
+#endif /* __PNFS_OSD_XDR_H__ */
-- 
cgit v1.2.3-70-g09d2


From f1bc893a89d012649e4e7f43575b2c290e08ee42 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 22 May 2011 19:49:57 +0300
Subject: pnfs-obj: pnfs_osd XDR client implementation

* Add the fs/nfs/objlayout/pnfs_osd_xdr_cli.c file, which will
  include the XDR encode/decode implementations for the pNFS
  client objlayout driver.

[Wrong type in comments]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/objlayout/Kbuild             |   2 +-
 fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 412 ++++++++++++++++++++++++++++++++++++
 2 files changed, 413 insertions(+), 1 deletion(-)
 create mode 100644 fs/nfs/objlayout/pnfs_osd_xdr_cli.c

diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
index 2e5b9a44773..7b2a5a24065 100644
--- a/fs/nfs/objlayout/Kbuild
+++ b/fs/nfs/objlayout/Kbuild
@@ -1,5 +1,5 @@
 #
 # Makefile for the pNFS Objects Layout Driver kernel module
 #
-objlayoutdriver-y := objio_osd.o
+objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o
 obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
new file mode 100644
index 00000000000..16fc758e912
--- /dev/null
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -0,0 +1,412 @@
+/*
+ *  Object-Based pNFS Layout XDR layer
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/pnfs_osd_xdr.h>
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+/*
+ * The following implementation is based on RFC5664
+ */
+
+/*
+ * struct pnfs_osd_objid {
+ *	struct nfs4_deviceid	oid_device_id;
+ *	u64			oid_partition_id;
+ *	u64			oid_object_id;
+ * }; // xdr size 32 bytes
+ */
+static __be32 *
+_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
+{
+	p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data,
+				    sizeof(objid->oid_device_id.data));
+
+	p = xdr_decode_hyper(p, &objid->oid_partition_id);
+	p = xdr_decode_hyper(p, &objid->oid_object_id);
+	return p;
+}
+/*
+ * struct pnfs_osd_opaque_cred {
+ *	u32 cred_len;
+ *	void *cred;
+ * }; // xdr size [variable]
+ * The return pointers are from the xdr buffer
+ */
+static int
+_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred,
+			    struct xdr_stream *xdr)
+{
+	__be32 *p = xdr_inline_decode(xdr, 1);
+
+	if (!p)
+		return -EINVAL;
+
+	opaque_cred->cred_len = be32_to_cpu(*p++);
+
+	p = xdr_inline_decode(xdr, opaque_cred->cred_len);
+	if (!p)
+		return -EINVAL;
+
+	opaque_cred->cred = p;
+	return 0;
+}
+
+/*
+ * struct pnfs_osd_object_cred {
+ *	struct pnfs_osd_objid		oc_object_id;
+ *	u32				oc_osd_version;
+ *	u32				oc_cap_key_sec;
+ *	struct pnfs_osd_opaque_cred	oc_cap_key
+ *	struct pnfs_osd_opaque_cred	oc_cap;
+ * }; // xdr size 32 + 4 + 4 + [variable] + [variable]
+ */
+static int
+_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp,
+			    struct xdr_stream *xdr)
+{
+	__be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4);
+	int ret;
+
+	if (!p)
+		return -EIO;
+
+	p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
+	comp->oc_osd_version = be32_to_cpup(p++);
+	comp->oc_cap_key_sec = be32_to_cpup(p);
+
+	ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr);
+	if (unlikely(ret))
+		return ret;
+
+	ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr);
+	return ret;
+}
+
+/*
+ * struct pnfs_osd_data_map {
+ *	u32	odm_num_comps;
+ *	u64	odm_stripe_unit;
+ *	u32	odm_group_width;
+ *	u32	odm_group_depth;
+ *	u32	odm_mirror_cnt;
+ *	u32	odm_raid_algorithm;
+ * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4
+ */
+static inline int
+_osd_data_map_xdr_sz(void)
+{
+	return 4 + 8 + 4 + 4 + 4 + 4;
+}
+
+static __be32 *
+_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
+{
+	data_map->odm_num_comps = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &data_map->odm_stripe_unit);
+	data_map->odm_group_width = be32_to_cpup(p++);
+	data_map->odm_group_depth = be32_to_cpup(p++);
+	data_map->odm_mirror_cnt = be32_to_cpup(p++);
+	data_map->odm_raid_algorithm = be32_to_cpup(p++);
+	dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
+		"odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
+		__func__,
+		data_map->odm_num_comps,
+		(unsigned long long)data_map->odm_stripe_unit,
+		data_map->odm_group_width,
+		data_map->odm_group_depth,
+		data_map->odm_mirror_cnt,
+		data_map->odm_raid_algorithm);
+	return p;
+}
+
+int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
+	struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr)
+{
+	__be32 *p;
+
+	memset(iter, 0, sizeof(*iter));
+
+	p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4);
+	if (unlikely(!p))
+		return -EINVAL;
+
+	p = _osd_xdr_decode_data_map(p, &layout->olo_map);
+	layout->olo_comps_index = be32_to_cpup(p++);
+	layout->olo_num_comps = be32_to_cpup(p++);
+	iter->total_comps = layout->olo_num_comps;
+	return 0;
+}
+
+bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
+	struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
+	int *err)
+{
+	BUG_ON(iter->decoded_comps > iter->total_comps);
+	if (iter->decoded_comps == iter->total_comps)
+		return false;
+
+	*err = _osd_xdr_decode_object_cred(comp, xdr);
+	if (unlikely(*err)) {
+		dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d "
+			"total_comps=%d\n", __func__, *err,
+			iter->decoded_comps, iter->total_comps);
+		return false; /* stop the loop */
+	}
+	dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx "
+		"key_len=%u cap_len=%u\n",
+		__func__,
+		_DEVID_LO(&comp->oc_object_id.oid_device_id),
+		_DEVID_HI(&comp->oc_object_id.oid_device_id),
+		comp->oc_object_id.oid_partition_id,
+		comp->oc_object_id.oid_object_id,
+		comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
+
+	iter->decoded_comps++;
+	return true;
+}
+
+/*
+ * Get Device Information Decoding
+ *
+ * Note: since Device Information is currently done synchronously, all
+ *       variable strings fields are left inside the rpc buffer and are only
+ *       pointed to by the pnfs_osd_deviceaddr members. So the read buffer
+ *       should not be freed while the returned information is in use.
+ */
+/*
+ *struct nfs4_string {
+ *	unsigned int len;
+ *	char *data;
+ *}; // size [variable]
+ * NOTE: Returned string points to inside the XDR buffer
+ */
+static __be32 *
+__read_u8_opaque(__be32 *p, struct nfs4_string *str)
+{
+	str->len = be32_to_cpup(p++);
+	str->data = (char *)p;
+
+	p += XDR_QUADLEN(str->len);
+	return p;
+}
+
+/*
+ * struct pnfs_osd_targetid {
+ *	u32			oti_type;
+ *	struct nfs4_string	oti_scsi_device_id;
+ * };// size 4 + [variable]
+ */
+static __be32 *
+__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid)
+{
+	u32 oti_type;
+
+	oti_type = be32_to_cpup(p++);
+	targetid->oti_type = oti_type;
+
+	switch (oti_type) {
+	case OBJ_TARGET_SCSI_NAME:
+	case OBJ_TARGET_SCSI_DEVICE_ID:
+		p = __read_u8_opaque(p, &targetid->oti_scsi_device_id);
+	}
+
+	return p;
+}
+
+/*
+ * struct pnfs_osd_net_addr {
+ *	struct nfs4_string	r_netid;
+ *	struct nfs4_string	r_addr;
+ * };
+ */
+static __be32 *
+__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr)
+{
+	p = __read_u8_opaque(p, &netaddr->r_netid);
+	p = __read_u8_opaque(p, &netaddr->r_addr);
+
+	return p;
+}
+
+/*
+ * struct pnfs_osd_targetaddr {
+ *	u32				ota_available;
+ *	struct pnfs_osd_net_addr	ota_netaddr;
+ * };
+ */
+static __be32 *
+__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr)
+{
+	u32 ota_available;
+
+	ota_available = be32_to_cpup(p++);
+	targetaddr->ota_available = ota_available;
+
+	if (ota_available)
+		p = __read_net_addr(p, &targetaddr->ota_netaddr);
+
+
+	return p;
+}
+
+/*
+ * struct pnfs_osd_deviceaddr {
+ *	struct pnfs_osd_targetid	oda_targetid;
+ *	struct pnfs_osd_targetaddr	oda_targetaddr;
+ *	u8				oda_lun[8];
+ *	struct nfs4_string		oda_systemid;
+ *	struct pnfs_osd_object_cred	oda_root_obj_cred;
+ *	struct nfs4_string		oda_osdname;
+ * };
+ */
+
+/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does
+ * not have an xdr_stream
+ */
+static __be32 *
+__read_opaque_cred(__be32 *p,
+			      struct pnfs_osd_opaque_cred *opaque_cred)
+{
+	opaque_cred->cred_len = be32_to_cpu(*p++);
+	opaque_cred->cred = p;
+	return p + XDR_QUADLEN(opaque_cred->cred_len);
+}
+
+static __be32 *
+__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp)
+{
+	p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
+	comp->oc_osd_version = be32_to_cpup(p++);
+	comp->oc_cap_key_sec = be32_to_cpup(p++);
+
+	p = __read_opaque_cred(p, &comp->oc_cap_key);
+	p = __read_opaque_cred(p, &comp->oc_cap);
+	return p;
+}
+
+void pnfs_osd_xdr_decode_deviceaddr(
+	struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
+{
+	p = __read_targetid(p, &deviceaddr->oda_targetid);
+
+	p = __read_targetaddr(p, &deviceaddr->oda_targetaddr);
+
+	p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun,
+				    sizeof(deviceaddr->oda_lun));
+
+	p = __read_u8_opaque(p, &deviceaddr->oda_systemid);
+
+	p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred);
+
+	p = __read_u8_opaque(p, &deviceaddr->oda_osdname);
+
+	/* libosd likes this terminated in dbg. It's last, so no problems */
+	deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0;
+}
+
+/*
+ * struct pnfs_osd_layoutupdate {
+ *	u32	dsu_valid;
+ *	s64	dsu_delta;
+ *	u32	olu_ioerr_flag;
+ * }; xdr size 4 + 8 + 4
+ */
+int
+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
+				 struct pnfs_osd_layoutupdate *lou)
+{
+	__be32 *p = xdr_reserve_space(xdr,  4 + 8 + 4);
+
+	if (!p)
+		return -E2BIG;
+
+	*p++ = cpu_to_be32(lou->dsu_valid);
+	if (lou->dsu_valid)
+		p = xdr_encode_hyper(p, lou->dsu_delta);
+	*p++ = cpu_to_be32(lou->olu_ioerr_flag);
+	return 0;
+}
+
+/*
+ * struct pnfs_osd_objid {
+ *	struct nfs4_deviceid	oid_device_id;
+ *	u64			oid_partition_id;
+ *	u64			oid_object_id;
+ * }; // xdr size 32 bytes
+ */
+static inline __be32 *
+pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id)
+{
+	p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
+				    sizeof(object_id->oid_device_id.data));
+	p = xdr_encode_hyper(p, object_id->oid_partition_id);
+	p = xdr_encode_hyper(p, object_id->oid_object_id);
+
+	return p;
+}
+
+/*
+ * struct pnfs_osd_ioerr {
+ *	struct pnfs_osd_objid	oer_component;
+ *	u64			oer_comp_offset;
+ *	u64			oer_comp_length;
+ *	u32			oer_iswrite;
+ *	u32			oer_errno;
+ * }; // xdr size 32 + 24 bytes
+ */
+void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr)
+{
+	p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component);
+	p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
+	p = xdr_encode_hyper(p, ioerr->oer_comp_length);
+	*p++ = cpu_to_be32(ioerr->oer_iswrite);
+	*p   = cpu_to_be32(ioerr->oer_errno);
+}
+
+__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 32 + 24);
+	if (unlikely(!p))
+		dprintk("%s: out of xdr space\n", __func__);
+
+	return p;
+}
-- 
cgit v1.2.3-70-g09d2


From 09f5bf4e6d0607399c16ec7a2d8d166f31086686 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 22 May 2011 19:50:20 +0300
Subject: pnfs-obj: decode layout, alloc/free lseg

objlayout_alloc_lseg prepares an xdr_stream and calls the
raid engins objio_alloc_lseg() to allocate a private
pnfs_layout_segment.

objio_osd.c::objio_alloc_lseg() uses passed xdr_stream to
decode and store the layout_segment information in an
objio_segment struct, using the pnfs_osd_xdr.h API for
the actual parsing the layout xdr.

objlayout_free_lseg calls objio_free_lseg() to free the
allocated space.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
[gfp_flags]
[removed "extern" from function definitions]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/objlayout/Kbuild      |   2 +-
 fs/nfs/objlayout/objio_osd.c | 170 ++++++++++++++++++++++++++++++++++++++++++-
 fs/nfs/objlayout/objlayout.c | 104 ++++++++++++++++++++++++++
 fs/nfs/objlayout/objlayout.h |  67 +++++++++++++++++
 4 files changed, 341 insertions(+), 2 deletions(-)
 create mode 100644 fs/nfs/objlayout/objlayout.c
 create mode 100644 fs/nfs/objlayout/objlayout.h

diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
index 7b2a5a24065..ed30ea072bb 100644
--- a/fs/nfs/objlayout/Kbuild
+++ b/fs/nfs/objlayout/Kbuild
@@ -1,5 +1,5 @@
 #
 # Makefile for the pNFS Objects Layout Driver kernel module
 #
-objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o
+objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o
 obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 379595f2d1c..08f1d90b4ce 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -38,11 +38,179 @@
  */
 
 #include <linux/module.h>
-#include "../pnfs.h"
+#include <scsi/osd_initiator.h>
+
+#include "objlayout.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+#define _LLU(x) ((unsigned long long)x)
+
+struct caps_buffers {
+	u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
+	u8 creds[OSD_CAP_LEN];
+};
+
+struct objio_segment {
+	struct pnfs_layout_segment lseg;
+
+	struct pnfs_osd_object_cred *comps;
+
+	unsigned mirrors_p1;
+	unsigned stripe_unit;
+	unsigned group_width;	/* Data stripe_units without integrity comps */
+	u64 group_depth;
+	unsigned group_count;
+
+	unsigned comps_index;
+	unsigned num_comps;
+	/* variable length */
+	struct objio_dev_ent *ods[];
+};
+
+static inline struct objio_segment *
+OBJIO_LSEG(struct pnfs_layout_segment *lseg)
+{
+	return container_of(lseg, struct objio_segment, lseg);
+}
+
+static int _verify_data_map(struct pnfs_osd_layout *layout)
+{
+	struct pnfs_osd_data_map *data_map = &layout->olo_map;
+	u64 stripe_length;
+	u32 group_width;
+
+/* FIXME: Only raid0 for now. if not go through MDS */
+	if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
+		printk(KERN_ERR "Only RAID_0 for now\n");
+		return -ENOTSUPP;
+	}
+	if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
+		printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
+			  data_map->odm_num_comps, data_map->odm_mirror_cnt);
+		return -EINVAL;
+	}
+
+	if (data_map->odm_group_width)
+		group_width = data_map->odm_group_width;
+	else
+		group_width = data_map->odm_num_comps /
+						(data_map->odm_mirror_cnt + 1);
+
+	stripe_length = (u64)data_map->odm_stripe_unit * group_width;
+	if (stripe_length >= (1ULL << 32)) {
+		printk(KERN_ERR "Total Stripe length(0x%llx)"
+			  " >= 32bit is not supported\n", _LLU(stripe_length));
+		return -ENOTSUPP;
+	}
+
+	if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
+		printk(KERN_ERR "Stripe Unit(0x%llx)"
+			  " must be Multples of PAGE_SIZE(0x%lx)\n",
+			  _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
+		return -ENOTSUPP;
+	}
+
+	return 0;
+}
+
+static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
+			     struct pnfs_osd_object_cred *src_comp,
+			     struct caps_buffers *caps_p)
+{
+	WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
+	WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
+
+	*cur_comp = *src_comp;
+
+	memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
+	       sizeof(caps_p->caps_key));
+	cur_comp->oc_cap_key.cred = caps_p->caps_key;
+
+	memcpy(caps_p->creds, src_comp->oc_cap.cred,
+	       sizeof(caps_p->creds));
+	cur_comp->oc_cap.cred = caps_p->creds;
+}
+
+int objio_alloc_lseg(struct pnfs_layout_segment **outp,
+	struct pnfs_layout_hdr *pnfslay,
+	struct pnfs_layout_range *range,
+	struct xdr_stream *xdr,
+	gfp_t gfp_flags)
+{
+	struct objio_segment *objio_seg;
+	struct pnfs_osd_xdr_decode_layout_iter iter;
+	struct pnfs_osd_layout layout;
+	struct pnfs_osd_object_cred *cur_comp, src_comp;
+	struct caps_buffers *caps_p;
+	int err;
+
+	err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
+	if (unlikely(err))
+		return err;
+
+	err = _verify_data_map(&layout);
+	if (unlikely(err))
+		return err;
+
+	objio_seg = kzalloc(sizeof(*objio_seg) +
+			    sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
+			    sizeof(*objio_seg->comps) * layout.olo_num_comps +
+			    sizeof(struct caps_buffers) * layout.olo_num_comps,
+			    gfp_flags);
+	if (!objio_seg)
+		return -ENOMEM;
+
+	objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
+	cur_comp = objio_seg->comps;
+	caps_p = (void *)(cur_comp + layout.olo_num_comps);
+	while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
+		copy_single_comp(cur_comp++, &src_comp, caps_p++);
+	if (unlikely(err))
+		goto err;
+
+	objio_seg->num_comps = layout.olo_num_comps;
+	objio_seg->comps_index = layout.olo_comps_index;
+
+	objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
+	objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
+	if (layout.olo_map.odm_group_width) {
+		objio_seg->group_width = layout.olo_map.odm_group_width;
+		objio_seg->group_depth = layout.olo_map.odm_group_depth;
+		objio_seg->group_count = layout.olo_map.odm_num_comps /
+						objio_seg->mirrors_p1 /
+						objio_seg->group_width;
+	} else {
+		objio_seg->group_width = layout.olo_map.odm_num_comps /
+						objio_seg->mirrors_p1;
+		objio_seg->group_depth = -1;
+		objio_seg->group_count = 1;
+	}
+
+	*outp = &objio_seg->lseg;
+	return 0;
+
+err:
+	kfree(objio_seg);
+	dprintk("%s: Error: return %d\n", __func__, err);
+	*outp = NULL;
+	return err;
+}
+
+void objio_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
+
+	kfree(objio_seg);
+}
+
 
 static struct pnfs_layoutdriver_type objlayout_type = {
 	.id = LAYOUT_OSD2_OBJECTS,
 	.name = "LAYOUT_OSD2_OBJECTS",
+
+	.alloc_lseg              = objlayout_alloc_lseg,
+	.free_lseg               = objlayout_free_lseg,
 };
 
 MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
new file mode 100644
index 00000000000..946526763d3
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.c
@@ -0,0 +1,104 @@
+/*
+ *  pNFS Objects layout driver high level definitions
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <scsi/osd_initiator.h>
+#include "objlayout.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+/*
+ * Unmarshall layout and store it in pnfslay.
+ */
+struct pnfs_layout_segment *
+objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
+		     struct nfs4_layoutget_res *lgr,
+		     gfp_t gfp_flags)
+{
+	int status = -ENOMEM;
+	struct xdr_stream stream;
+	struct xdr_buf buf = {
+		.pages =  lgr->layoutp->pages,
+		.page_len =  lgr->layoutp->len,
+		.buflen =  lgr->layoutp->len,
+		.len = lgr->layoutp->len,
+	};
+	struct page *scratch;
+	struct pnfs_layout_segment *lseg;
+
+	dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
+
+	scratch = alloc_page(gfp_flags);
+	if (!scratch)
+		goto err_nofree;
+
+	xdr_init_decode(&stream, &buf, NULL);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+	status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
+	if (unlikely(status)) {
+		dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
+			status);
+		goto err;
+	}
+
+	__free_page(scratch);
+
+	dprintk("%s: Return %p\n", __func__, lseg);
+	return lseg;
+
+err:
+	__free_page(scratch);
+err_nofree:
+	dprintk("%s: Err Return=>%d\n", __func__, status);
+	return ERR_PTR(status);
+}
+
+/*
+ * Free a layout segement
+ */
+void
+objlayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	dprintk("%s: freeing layout segment %p\n", __func__, lseg);
+
+	if (unlikely(!lseg))
+		return;
+
+	objio_free_lseg(lseg);
+}
+
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
new file mode 100644
index 00000000000..066280a07fa
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.h
@@ -0,0 +1,67 @@
+/*
+ *  Data types and function declerations for interfacing with the
+ *  pNFS standard object layout driver.
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _OBJLAYOUT_H
+#define _OBJLAYOUT_H
+
+#include <linux/nfs_fs.h>
+#include <linux/pnfs_osd_xdr.h>
+#include "../pnfs.h"
+
+/*
+ * Raid engine I/O API
+ */
+extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
+	struct pnfs_layout_hdr *pnfslay,
+	struct pnfs_layout_range *range,
+	struct xdr_stream *xdr,
+	gfp_t gfp_flags);
+extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
+
+/*
+ * exported generic objects function vectors
+ */
+extern struct pnfs_layout_segment *objlayout_alloc_lseg(
+	struct pnfs_layout_hdr *,
+	struct nfs4_layoutget_res *,
+	gfp_t gfp_flags);
+extern void objlayout_free_lseg(struct pnfs_layout_segment *);
+
+#endif /* _OBJLAYOUT_H */
-- 
cgit v1.2.3-70-g09d2


From b6c05f1693115164c7b797152ac7ea3ef8e5d296 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Thu, 26 May 2011 21:45:34 +0300
Subject: pnfs-obj: objio_osd device information retrieval and caching

When a new layout is received in objio_alloc_lseg all device_ids
referenced are retrieved. The device information is queried for from MDS
and then the osd_device is looked-up from the osd-initiator library. The
devices are cached in a per-mount-point list, for later use. At unmount
all devices are "put" back to the library.

objlayout_get_deviceinfo(), objlayout_put_deviceinfo() middleware
API for retrieving device information given a device_id.

TODO: The device cache can get big. Cap its size. Keep an LRU and start
      to return devices which were not used, when list gets to big, or
      when new entries allocation fail.

[pnfs-obj: Bugs in new global-device-cache code]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
[gfp_flags]
[use global device cache]
[use layout driver in global device cache]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/objlayout/objio_osd.c | 157 +++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/objlayout/objlayout.c |  68 +++++++++++++++++++
 fs/nfs/objlayout/objlayout.h |   8 +++
 3 files changed, 233 insertions(+)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 08f1d90b4ce..2255e2d22d0 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -46,6 +46,68 @@
 
 #define _LLU(x) ((unsigned long long)x)
 
+struct objio_dev_ent {
+	struct nfs4_deviceid_node id_node;
+	struct osd_dev *od;
+};
+
+static void
+objio_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
+
+	dprintk("%s: free od=%p\n", __func__, de->od);
+	osduld_put_device(de->od);
+	kfree(de);
+}
+
+static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
+	const struct nfs4_deviceid *d_id)
+{
+	struct nfs4_deviceid_node *d;
+	struct objio_dev_ent *de;
+
+	d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
+	if (!d)
+		return NULL;
+
+	de = container_of(d, struct objio_dev_ent, id_node);
+	return de;
+}
+
+static struct objio_dev_ent *
+_dev_list_add(const struct nfs_server *nfss,
+	const struct nfs4_deviceid *d_id, struct osd_dev *od,
+	gfp_t gfp_flags)
+{
+	struct nfs4_deviceid_node *d;
+	struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
+	struct objio_dev_ent *n;
+
+	if (!de) {
+		dprintk("%s: -ENOMEM od=%p\n", __func__, od);
+		return NULL;
+	}
+
+	dprintk("%s: Adding od=%p\n", __func__, od);
+	nfs4_init_deviceid_node(&de->id_node,
+				nfss->pnfs_curr_ld,
+				nfss->nfs_client,
+				d_id);
+	de->od = od;
+
+	d = nfs4_insert_deviceid_node(&de->id_node);
+	n = container_of(d, struct objio_dev_ent, id_node);
+	if (n != de) {
+		dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
+		objio_free_deviceid_node(&de->id_node);
+		de = n;
+	}
+
+	atomic_inc(&de->id_node.ref);
+	return de;
+}
+
 struct caps_buffers {
 	u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
 	u8 creds[OSD_CAP_LEN];
@@ -74,6 +136,90 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
 	return container_of(lseg, struct objio_segment, lseg);
 }
 
+/* Send and wait for a get_device_info of devices in the layout,
+   then look them up with the osd_initiator library */
+static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
+				struct objio_segment *objio_seg, unsigned comp,
+				gfp_t gfp_flags)
+{
+	struct pnfs_osd_deviceaddr *deviceaddr;
+	struct nfs4_deviceid *d_id;
+	struct objio_dev_ent *ode;
+	struct osd_dev *od;
+	struct osd_dev_info odi;
+	int err;
+
+	d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
+
+	ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
+	if (ode)
+		return ode;
+
+	err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
+	if (unlikely(err)) {
+		dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
+			__func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
+		return ERR_PTR(err);
+	}
+
+	odi.systemid_len = deviceaddr->oda_systemid.len;
+	if (odi.systemid_len > sizeof(odi.systemid)) {
+		err = -EINVAL;
+		goto out;
+	} else if (odi.systemid_len)
+		memcpy(odi.systemid, deviceaddr->oda_systemid.data,
+		       odi.systemid_len);
+	odi.osdname_len	 = deviceaddr->oda_osdname.len;
+	odi.osdname	 = (u8 *)deviceaddr->oda_osdname.data;
+
+	if (!odi.osdname_len && !odi.systemid_len) {
+		dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
+			__func__);
+		err = -ENODEV;
+		goto out;
+	}
+
+	od = osduld_info_lookup(&odi);
+	if (unlikely(IS_ERR(od))) {
+		err = PTR_ERR(od);
+		dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
+		goto out;
+	}
+
+	ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
+			    gfp_flags);
+
+out:
+	dprintk("%s: return=%d\n", __func__, err);
+	objlayout_put_deviceinfo(deviceaddr);
+	return err ? ERR_PTR(err) : ode;
+}
+
+static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
+	struct objio_segment *objio_seg,
+	gfp_t gfp_flags)
+{
+	unsigned i;
+	int err;
+
+	/* lookup all devices */
+	for (i = 0; i < objio_seg->num_comps; i++) {
+		struct objio_dev_ent *ode;
+
+		ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
+		if (unlikely(IS_ERR(ode))) {
+			err = PTR_ERR(ode);
+			goto out;
+		}
+		objio_seg->ods[i] = ode;
+	}
+	err = 0;
+
+out:
+	dprintk("%s: return=%d\n", __func__, err);
+	return err;
+}
+
 static int _verify_data_map(struct pnfs_osd_layout *layout)
 {
 	struct pnfs_osd_data_map *data_map = &layout->olo_map;
@@ -171,6 +317,9 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
 
 	objio_seg->num_comps = layout.olo_num_comps;
 	objio_seg->comps_index = layout.olo_comps_index;
+	err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
+	if (err)
+		goto err;
 
 	objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
 	objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
@@ -199,8 +348,14 @@ err:
 
 void objio_free_lseg(struct pnfs_layout_segment *lseg)
 {
+	int i;
 	struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
 
+	for (i = 0; i < objio_seg->num_comps; i++) {
+		if (!objio_seg->ods[i])
+			break;
+		nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
+	}
 	kfree(objio_seg);
 }
 
@@ -211,6 +366,8 @@ static struct pnfs_layoutdriver_type objlayout_type = {
 
 	.alloc_lseg              = objlayout_alloc_lseg,
 	.free_lseg               = objlayout_free_lseg,
+
+	.free_deviceid_node	 = objio_free_deviceid_node,
 };
 
 MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 946526763d3..10e5fca3b7f 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -102,3 +102,71 @@ objlayout_free_lseg(struct pnfs_layout_segment *lseg)
 	objio_free_lseg(lseg);
 }
 
+/*
+ * Get Device Info API for io engines
+ */
+struct objlayout_deviceinfo {
+	struct page *page;
+	struct pnfs_osd_deviceaddr da; /* This must be last */
+};
+
+/* Initialize and call nfs_getdeviceinfo, then decode and return a
+ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
+ * should be called.
+ */
+int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
+	gfp_t gfp_flags)
+{
+	struct objlayout_deviceinfo *odi;
+	struct pnfs_device pd;
+	struct super_block *sb;
+	struct page *page, **pages;
+	u32 *p;
+	int err;
+
+	page = alloc_page(gfp_flags);
+	if (!page)
+		return -ENOMEM;
+
+	pages = &page;
+	pd.pages = pages;
+
+	memcpy(&pd.dev_id, d_id, sizeof(*d_id));
+	pd.layout_type = LAYOUT_OSD2_OBJECTS;
+	pd.pages = &page;
+	pd.pgbase = 0;
+	pd.pglen = PAGE_SIZE;
+	pd.mincount = 0;
+
+	sb = pnfslay->plh_inode->i_sb;
+	err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+	dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
+	if (err)
+		goto err_out;
+
+	p = page_address(page);
+	odi = kzalloc(sizeof(*odi), gfp_flags);
+	if (!odi) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
+	odi->page = page;
+	*deviceaddr = &odi->da;
+	return 0;
+
+err_out:
+	__free_page(page);
+	return err;
+}
+
+void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
+{
+	struct objlayout_deviceinfo *odi = container_of(deviceaddr,
+						struct objlayout_deviceinfo,
+						da);
+
+	__free_page(odi->page);
+	kfree(odi);
+}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 066280a07fa..0814271bb9b 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -55,6 +55,14 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
 	gfp_t gfp_flags);
 extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
 
+/*
+ * callback API
+ */
+extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
+	gfp_t gfp_flags);
+extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
+
 /*
  * exported generic objects function vectors
  */
-- 
cgit v1.2.3-70-g09d2


From 636fb9c89d7e216aac3d406e458864420057e981 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Sun, 22 May 2011 19:51:33 +0300
Subject: pnfs: alloc and free layout_hdr layoutdriver methods

[gfp_flags]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/pnfs.c | 21 ++++++++++++++++++---
 fs/nfs/pnfs.h |  4 ++++
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 20436a5e76c..ef535f2a2c7 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,13 +177,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo)
 	atomic_inc(&lo->plh_refcount);
 }
 
+static struct pnfs_layout_hdr *
+pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+	return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) :
+		kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
+}
+
+static void
+pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
+	return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
+}
+
 static void
 destroy_layout_hdr(struct pnfs_layout_hdr *lo)
 {
 	dprintk("%s: freeing layout cache %p\n", __func__, lo);
 	BUG_ON(!list_empty(&lo->plh_layouts));
 	NFS_I(lo->plh_inode)->layout = NULL;
-	kfree(lo);
+	pnfs_free_layout_hdr(lo);
 }
 
 static void
@@ -744,7 +759,7 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
 {
 	struct pnfs_layout_hdr *lo;
 
-	lo = kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
+	lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
 	if (!lo)
 		return NULL;
 	atomic_set(&lo->plh_refcount, 1);
@@ -777,7 +792,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
 	if (likely(nfsi->layout == NULL))	/* Won the race? */
 		nfsi->layout = new;
 	else
-		kfree(new);
+		pnfs_free_layout_hdr(new);
 	return nfsi->layout;
 }
 
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f37ab3539cb..925d6ef8ba7 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -73,6 +73,10 @@ struct pnfs_layoutdriver_type {
 	const u32 id;
 	const char *name;
 	struct module *owner;
+
+	struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
+	void (*free_layout_hdr) (struct pnfs_layout_hdr *);
+
 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
 
-- 
cgit v1.2.3-70-g09d2


From e51b841dd0be9ff53f740c44c32c32679edcb7c8 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Sun, 22 May 2011 19:51:48 +0300
Subject: pnfs-obj: define per-inode private structure

allocate and deallocate per-inode private pnfs_layout_hdr
in preparation for I/O implementation.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/objlayout/objio_osd.c |  3 +++
 fs/nfs/objlayout/objlayout.c | 26 ++++++++++++++++++++++++++
 fs/nfs/objlayout/objlayout.h | 17 +++++++++++++++++
 3 files changed, 46 insertions(+)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 2255e2d22d0..353821f7937 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -364,6 +364,9 @@ static struct pnfs_layoutdriver_type objlayout_type = {
 	.id = LAYOUT_OSD2_OBJECTS,
 	.name = "LAYOUT_OSD2_OBJECTS",
 
+	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
+	.free_layout_hdr         = objlayout_free_layout_hdr,
+
 	.alloc_lseg              = objlayout_alloc_lseg,
 	.free_lseg               = objlayout_free_lseg,
 
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 10e5fca3b7f..f14b4da3405 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -41,6 +41,32 @@
 #include "objlayout.h"
 
 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+/*
+ * Create a objlayout layout structure for the given inode and return it.
+ */
+struct pnfs_layout_hdr *
+objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+	struct objlayout *objlay;
+
+	objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
+	dprintk("%s: Return %p\n", __func__, objlay);
+	return &objlay->pnfs_layout;
+}
+
+/*
+ * Free an objlayout layout structure
+ */
+void
+objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct objlayout *objlay = OBJLAYOUT(lo);
+
+	dprintk("%s: objlay %p\n", __func__, objlay);
+
+	kfree(objlay);
+}
+
 /*
  * Unmarshall layout and store it in pnfslay.
  */
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 0814271bb9b..fa0262149f5 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -45,6 +45,19 @@
 #include <linux/pnfs_osd_xdr.h>
 #include "../pnfs.h"
 
+/*
+ * per-inode layout
+ */
+struct objlayout {
+	struct pnfs_layout_hdr pnfs_layout;
+};
+
+static inline struct objlayout *
+OBJLAYOUT(struct pnfs_layout_hdr *lo)
+{
+	return container_of(lo, struct objlayout, pnfs_layout);
+}
+
 /*
  * Raid engine I/O API
  */
@@ -66,6 +79,10 @@ extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
 /*
  * exported generic objects function vectors
  */
+
+extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags);
+extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
+
 extern struct pnfs_layout_segment *objlayout_alloc_lseg(
 	struct pnfs_layout_hdr *,
 	struct nfs4_layoutget_res *,
-- 
cgit v1.2.3-70-g09d2


From d20581aa4be11407c9eeeb75992df5ef176bba0f Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Sun, 22 May 2011 19:52:03 +0300
Subject: pnfs: support for non-rpc layout drivers

Non-rpc layout driver such as for objects and blocks
implement their own I/O path and error handling logic.
Therefore bypass NFS-based error handling for these layout drivers.

[fix lseg ref-count bugs, and null de-refs]
[Fall out from: non-rpc layout drivers]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
[get rid of PNFS_USE_RPC_CODE]
[get rid of __nfs4_write_done_cb]
[revert useless change in nfs4_write_done_cb]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/internal.h       |  1 +
 fs/nfs/nfs4proc.c       | 13 ++++++++++---
 fs/nfs/pnfs.c           | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 fs/nfs/pnfs.h           |  2 ++
 include/linux/nfs_xdr.h |  2 ++
 5 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ce118ce885d..bcf0f0ff5ee 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -310,6 +310,7 @@ extern int nfs_migrate_page(struct address_space *,
 #endif
 
 /* nfs4proc.c */
+extern void __nfs4_read_done_cb(struct nfs_read_data *);
 extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
 extern int nfs4_init_client(struct nfs_client *clp,
 			    const struct rpc_timeout *timeparms,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cf1b339c393..92c8bc4b5f9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3175,6 +3175,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 	return err;
 }
 
+void __nfs4_read_done_cb(struct nfs_read_data *data)
+{
+	nfs_invalidate_atime(data->inode);
+}
+
 static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
@@ -3184,7 +3189,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
 		return -EAGAIN;
 	}
 
-	nfs_invalidate_atime(data->inode);
+	__nfs4_read_done_cb(data);
 	if (task->tk_status > 0)
 		renew_lease(server, data->timestamp);
 	return 0;
@@ -3198,7 +3203,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 	if (!nfs4_sequence_done(task, &data->res.seq_res))
 		return -EAGAIN;
 
-	return data->read_done_cb(task, data);
+	return data->read_done_cb ? data->read_done_cb(task, data) :
+				    nfs4_read_done_cb(task, data);
 }
 
 static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
@@ -3243,7 +3249,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	if (!nfs4_sequence_done(task, &data->res.seq_res))
 		return -EAGAIN;
-	return data->write_done_cb(task, data);
+	return data->write_done_cb ? data->write_done_cb(task, data) :
+		nfs4_write_done_cb(task, data);
 }
 
 /* Reset the the nfs_write_data to send the write to the MDS. */
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ef535f2a2c7..171662114fd 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -243,7 +243,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg)
 {
 	struct inode *inode = lseg->pls_layout->plh_inode;
 
-	BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+	WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
 	list_del_init(&lseg->pls_list);
 	if (list_empty(&lseg->pls_layout->plh_segs)) {
 		set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
@@ -1054,6 +1054,29 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
 	pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
 }
 
+/*
+ * Called by non rpc-based layout drivers
+ */
+int
+pnfs_ld_write_done(struct nfs_write_data *data)
+{
+	int status;
+
+	if (!data->pnfs_error) {
+		pnfs_set_layoutcommit(data);
+		data->mds_ops->rpc_call_done(&data->task, data);
+		data->mds_ops->rpc_release(data);
+		return 0;
+	}
+
+	dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
+		data->pnfs_error);
+	status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
+				    data->mds_ops, NFS_FILE_SYNC);
+	return status ? : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
+
 enum pnfs_try_status
 pnfs_try_to_write_data(struct nfs_write_data *wdata,
 			const struct rpc_call_ops *call_ops, int how)
@@ -1078,6 +1101,29 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
 	return trypnfs;
 }
 
+/*
+ * Called by non rpc-based layout drivers
+ */
+int
+pnfs_ld_read_done(struct nfs_read_data *data)
+{
+	int status;
+
+	if (!data->pnfs_error) {
+		__nfs4_read_done_cb(data);
+		data->mds_ops->rpc_call_done(&data->task, data);
+		data->mds_ops->rpc_release(data);
+		return 0;
+	}
+
+	dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
+		data->pnfs_error);
+	status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
+				   data->mds_ops);
+	return status ? : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
+
 /*
  * Call the appropriate parallel I/O subsystem read function.
  */
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 925d6ef8ba7..0383e66e71f 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -165,6 +165,8 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
 void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
+int pnfs_ld_write_done(struct nfs_write_data *);
+int pnfs_ld_read_done(struct nfs_read_data *);
 
 /* pnfs_dev.c */
 struct nfs4_deviceid_node {
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7e371f7df9c..7c8ff0984a8 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1087,6 +1087,7 @@ struct nfs_read_data {
 	const struct rpc_call_ops *mds_ops;
 	int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
 	__u64			mds_offset;
+	int			pnfs_error;
 	struct page		*page_array[NFS_PAGEVEC_SIZE];
 };
 
@@ -1112,6 +1113,7 @@ struct nfs_write_data {
 	unsigned long		timestamp;	/* For lease renewal */
 #endif
 	__u64			mds_offset;	/* Filelayout dense stripe */
+	int			pnfs_error;
 	struct page		*page_array[NFS_PAGEVEC_SIZE];
 };
 
-- 
cgit v1.2.3-70-g09d2


From 04f83450388e87d86b387cf4a27b81eb7e69de7d Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 22 May 2011 19:52:19 +0300
Subject: pnfs-obj: osd raid engine read/write implementation

With the use of the in-kernel osd library. Implement read/write
of data from/to osd-objects according to information specified
in the objects-layout.

Support for stripping over mirrors with a received stripe_unit.
There are however a few constrains which are not supported:
 1. Stripe Unit must be a multiple of PAGE_SIZE
 2. stripe length (stripe_unit * number_of_stripes) can not be
    bigger then 32bit.

Also support raid-groups and partial-layout. Partial-layout is
when not all the groups are received on the line, addressing
only a partial range of the file.

TODO:
  Only raid0! raid 4/5/6 support will come at later stage

A none supported layout will send IO through the MDS

[Important fallout from the last rebase]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
[gfp_flags]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/objlayout/objio_osd.c | 605 +++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/objlayout/objlayout.c | 254 ++++++++++++++++++
 fs/nfs/objlayout/objlayout.h |  42 +++
 3 files changed, 901 insertions(+)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 353821f7937..cc92d3b3dc3 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -46,6 +46,10 @@
 
 #define _LLU(x) ((unsigned long long)x)
 
+enum { BIO_MAX_PAGES_KMALLOC =
+		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+};
+
 struct objio_dev_ent {
 	struct nfs4_deviceid_node id_node;
 	struct osd_dev *od;
@@ -136,6 +140,31 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
 	return container_of(lseg, struct objio_segment, lseg);
 }
 
+struct objio_state;
+typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
+
+struct objio_state {
+	/* Generic layer */
+	struct objlayout_io_state ol_state;
+
+	struct objio_segment *layout;
+
+	struct kref kref;
+	objio_done_fn done;
+	void *private;
+
+	unsigned long length;
+	unsigned numdevs; /* Actually used devs in this IO */
+	/* A per-device variable array of size numdevs */
+	struct _objio_per_comp {
+		struct bio *bio;
+		struct osd_request *or;
+		unsigned long length;
+		u64 offset;
+		unsigned dev;
+	} per_dev[];
+};
+
 /* Send and wait for a get_device_info of devices in the layout,
    then look them up with the osd_initiator library */
 static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
@@ -359,6 +388,578 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
 	kfree(objio_seg);
 }
 
+int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
+			 struct objlayout_io_state **outp,
+			 gfp_t gfp_flags)
+{
+	struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
+	struct objio_state *ios;
+	const unsigned first_size = sizeof(*ios) +
+				objio_seg->num_comps * sizeof(ios->per_dev[0]);
+
+	ios = kzalloc(first_size, gfp_flags);
+	if (unlikely(!ios))
+		return -ENOMEM;
+
+	ios->layout = objio_seg;
+
+	*outp = &ios->ol_state;
+	return 0;
+}
+
+void objio_free_io_state(struct objlayout_io_state *ol_state)
+{
+	struct objio_state *ios = container_of(ol_state, struct objio_state,
+					       ol_state);
+
+	kfree(ios);
+}
+
+static void _clear_bio(struct bio *bio)
+{
+	struct bio_vec *bv;
+	unsigned i;
+
+	__bio_for_each_segment(bv, bio, i, 0) {
+		unsigned this_count = bv->bv_len;
+
+		if (likely(PAGE_SIZE == this_count))
+			clear_highpage(bv->bv_page);
+		else
+			zero_user(bv->bv_page, bv->bv_offset, this_count);
+	}
+}
+
+static int _io_check(struct objio_state *ios, bool is_write)
+{
+	enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
+	int lin_ret = 0;
+	int i;
+
+	for (i = 0; i <  ios->numdevs; i++) {
+		struct osd_sense_info osi;
+		struct osd_request *or = ios->per_dev[i].or;
+		unsigned dev;
+		int ret;
+
+		if (!or)
+			continue;
+
+		ret = osd_req_decode_sense(or, &osi);
+		if (likely(!ret))
+			continue;
+
+		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
+			/* start read offset passed endof file */
+			BUG_ON(is_write);
+			_clear_bio(ios->per_dev[i].bio);
+			dprintk("%s: start read offset passed end of file "
+				"offset=0x%llx, length=0x%lx\n", __func__,
+				_LLU(ios->per_dev[i].offset),
+				ios->per_dev[i].length);
+
+			continue; /* we recovered */
+		}
+		dev = ios->per_dev[i].dev;
+
+		if (osi.osd_err_pri >= oep) {
+			oep = osi.osd_err_pri;
+			lin_ret = ret;
+		}
+	}
+
+	return lin_ret;
+}
+
+/*
+ * Common IO state helpers.
+ */
+static void _io_free(struct objio_state *ios)
+{
+	unsigned i;
+
+	for (i = 0; i < ios->numdevs; i++) {
+		struct _objio_per_comp *per_dev = &ios->per_dev[i];
+
+		if (per_dev->or) {
+			osd_end_request(per_dev->or);
+			per_dev->or = NULL;
+		}
+
+		if (per_dev->bio) {
+			bio_put(per_dev->bio);
+			per_dev->bio = NULL;
+		}
+	}
+}
+
+struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
+{
+	unsigned min_dev = ios->layout->comps_index;
+	unsigned max_dev = min_dev + ios->layout->num_comps;
+
+	BUG_ON(dev < min_dev || max_dev <= dev);
+	return ios->layout->ods[dev - min_dev]->od;
+}
+
+struct _striping_info {
+	u64 obj_offset;
+	u64 group_length;
+	unsigned dev;
+	unsigned unit_off;
+};
+
+static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
+			      struct _striping_info *si)
+{
+	u32	stripe_unit = ios->layout->stripe_unit;
+	u32	group_width = ios->layout->group_width;
+	u64	group_depth = ios->layout->group_depth;
+	u32	U = stripe_unit * group_width;
+
+	u64	T = U * group_depth;
+	u64	S = T * ios->layout->group_count;
+	u64	M = div64_u64(file_offset, S);
+
+	/*
+	G = (L - (M * S)) / T
+	H = (L - (M * S)) % T
+	*/
+	u64	LmodU = file_offset - M * S;
+	u32	G = div64_u64(LmodU, T);
+	u64	H = LmodU - G * T;
+
+	u32	N = div_u64(H, U);
+
+	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
+	si->obj_offset = si->unit_off + (N * stripe_unit) +
+				  (M * group_depth * stripe_unit);
+
+	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
+	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+	si->dev *= ios->layout->mirrors_p1;
+
+	si->group_length = T - H;
+}
+
+static int _add_stripe_unit(struct objio_state *ios,  unsigned *cur_pg,
+		unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
+		gfp_t gfp_flags)
+{
+	unsigned pg = *cur_pg;
+	struct request_queue *q =
+			osd_request_queue(_io_od(ios, per_dev->dev));
+
+	per_dev->length += cur_len;
+
+	if (per_dev->bio == NULL) {
+		unsigned stripes = ios->layout->num_comps /
+						     ios->layout->mirrors_p1;
+		unsigned pages_in_stripe = stripes *
+				      (ios->layout->stripe_unit / PAGE_SIZE);
+		unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
+				    stripes;
+
+		if (BIO_MAX_PAGES_KMALLOC < bio_size)
+			bio_size = BIO_MAX_PAGES_KMALLOC;
+
+		per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
+		if (unlikely(!per_dev->bio)) {
+			dprintk("Faild to allocate BIO size=%u\n", bio_size);
+			return -ENOMEM;
+		}
+	}
+
+	while (cur_len > 0) {
+		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+		unsigned added_len;
+
+		BUG_ON(ios->ol_state.nr_pages <= pg);
+		cur_len -= pglen;
+
+		added_len = bio_add_pc_page(q, per_dev->bio,
+					ios->ol_state.pages[pg], pglen, pgbase);
+		if (unlikely(pglen != added_len))
+			return -ENOMEM;
+		pgbase = 0;
+		++pg;
+	}
+	BUG_ON(cur_len);
+
+	*cur_pg = pg;
+	return 0;
+}
+
+static int _prepare_one_group(struct objio_state *ios, u64 length,
+			      struct _striping_info *si, unsigned *last_pg,
+			      gfp_t gfp_flags)
+{
+	unsigned stripe_unit = ios->layout->stripe_unit;
+	unsigned mirrors_p1 = ios->layout->mirrors_p1;
+	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
+	unsigned dev = si->dev;
+	unsigned first_dev = dev - (dev % devs_in_group);
+	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+	unsigned cur_pg = *last_pg;
+	int ret = 0;
+
+	while (length) {
+		struct _objio_per_comp *per_dev = &ios->per_dev[dev];
+		unsigned cur_len, page_off = 0;
+
+		if (!per_dev->length) {
+			per_dev->dev = dev;
+			if (dev < si->dev) {
+				per_dev->offset = si->obj_offset + stripe_unit -
+								   si->unit_off;
+				cur_len = stripe_unit;
+			} else if (dev == si->dev) {
+				per_dev->offset = si->obj_offset;
+				cur_len = stripe_unit - si->unit_off;
+				page_off = si->unit_off & ~PAGE_MASK;
+				BUG_ON(page_off &&
+				      (page_off != ios->ol_state.pgbase));
+			} else { /* dev > si->dev */
+				per_dev->offset = si->obj_offset - si->unit_off;
+				cur_len = stripe_unit;
+			}
+
+			if (max_comp < dev)
+				max_comp = dev;
+		} else {
+			cur_len = stripe_unit;
+		}
+		if (cur_len >= length)
+			cur_len = length;
+
+		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
+				       cur_len, gfp_flags);
+		if (unlikely(ret))
+			goto out;
+
+		dev += mirrors_p1;
+		dev = (dev % devs_in_group) + first_dev;
+
+		length -= cur_len;
+		ios->length += cur_len;
+	}
+out:
+	ios->numdevs = max_comp + mirrors_p1;
+	*last_pg = cur_pg;
+	return ret;
+}
+
+static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
+{
+	u64 length = ios->ol_state.count;
+	u64 offset = ios->ol_state.offset;
+	struct _striping_info si;
+	unsigned last_pg = 0;
+	int ret = 0;
+
+	while (length) {
+		_calc_stripe_info(ios, offset, &si);
+
+		if (length < si.group_length)
+			si.group_length = length;
+
+		ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
+		if (unlikely(ret))
+			goto out;
+
+		offset += si.group_length;
+		length -= si.group_length;
+	}
+
+out:
+	if (!ios->length)
+		return ret;
+
+	return 0;
+}
+
+static ssize_t _sync_done(struct objio_state *ios)
+{
+	struct completion *waiting = ios->private;
+
+	complete(waiting);
+	return 0;
+}
+
+static void _last_io(struct kref *kref)
+{
+	struct objio_state *ios = container_of(kref, struct objio_state, kref);
+
+	ios->done(ios);
+}
+
+static void _done_io(struct osd_request *or, void *p)
+{
+	struct objio_state *ios = p;
+
+	kref_put(&ios->kref, _last_io);
+}
+
+static ssize_t _io_exec(struct objio_state *ios)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	ssize_t status = 0; /* sync status */
+	unsigned i;
+	objio_done_fn saved_done_fn = ios->done;
+	bool sync = ios->ol_state.sync;
+
+	if (sync) {
+		ios->done = _sync_done;
+		ios->private = &wait;
+	}
+
+	kref_init(&ios->kref);
+
+	for (i = 0; i < ios->numdevs; i++) {
+		struct osd_request *or = ios->per_dev[i].or;
+
+		if (!or)
+			continue;
+
+		kref_get(&ios->kref);
+		osd_execute_request_async(or, _done_io, ios);
+	}
+
+	kref_put(&ios->kref, _last_io);
+
+	if (sync) {
+		wait_for_completion(&wait);
+		status = saved_done_fn(ios);
+	}
+
+	return status;
+}
+
+/*
+ * read
+ */
+static ssize_t _read_done(struct objio_state *ios)
+{
+	ssize_t status;
+	int ret = _io_check(ios, false);
+
+	_io_free(ios);
+
+	if (likely(!ret))
+		status = ios->length;
+	else
+		status = ret;
+
+	objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
+	return status;
+}
+
+static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+	struct osd_request *or = NULL;
+	struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+	unsigned dev = per_dev->dev;
+	struct pnfs_osd_object_cred *cred =
+			&ios->layout->comps[dev];
+	struct osd_obj_id obj = {
+		.partition = cred->oc_object_id.oid_partition_id,
+		.id = cred->oc_object_id.oid_object_id,
+	};
+	int ret;
+
+	or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
+	if (unlikely(!or)) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	per_dev->or = or;
+
+	osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
+
+	ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+	if (ret) {
+		dprintk("%s: Faild to osd_finalize_request() => %d\n",
+			__func__, ret);
+		goto err;
+	}
+
+	dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+		__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+		per_dev->length);
+
+err:
+	return ret;
+}
+
+static ssize_t _read_exec(struct objio_state *ios)
+{
+	unsigned i;
+	int ret;
+
+	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+		if (!ios->per_dev[i].length)
+			continue;
+		ret = _read_mirrors(ios, i);
+		if (unlikely(ret))
+			goto err;
+	}
+
+	ios->done = _read_done;
+	return _io_exec(ios); /* In sync mode exec returns the io status */
+
+err:
+	_io_free(ios);
+	return ret;
+}
+
+ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
+{
+	struct objio_state *ios = container_of(ol_state, struct objio_state,
+					       ol_state);
+	int ret;
+
+	ret = _io_rw_pagelist(ios, GFP_KERNEL);
+	if (unlikely(ret))
+		return ret;
+
+	return _read_exec(ios);
+}
+
+/*
+ * write
+ */
+static ssize_t _write_done(struct objio_state *ios)
+{
+	ssize_t status;
+	int ret = _io_check(ios, true);
+
+	_io_free(ios);
+
+	if (likely(!ret)) {
+		/* FIXME: should be based on the OSD's persistence model
+		 * See OSD2r05 Section 4.13 Data persistence model */
+		ios->ol_state.committed = NFS_FILE_SYNC;
+		status = ios->length;
+	} else {
+		status = ret;
+	}
+
+	objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
+	return status;
+}
+
+static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+	struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
+	unsigned dev = ios->per_dev[cur_comp].dev;
+	unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
+	int ret;
+
+	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
+		struct osd_request *or = NULL;
+		struct pnfs_osd_object_cred *cred =
+					&ios->layout->comps[dev];
+		struct osd_obj_id obj = {
+			.partition = cred->oc_object_id.oid_partition_id,
+			.id = cred->oc_object_id.oid_object_id,
+		};
+		struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+		struct bio *bio;
+
+		or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
+		if (unlikely(!or)) {
+			ret = -ENOMEM;
+			goto err;
+		}
+		per_dev->or = or;
+
+		if (per_dev != master_dev) {
+			bio = bio_kmalloc(GFP_NOFS,
+					  master_dev->bio->bi_max_vecs);
+			if (unlikely(!bio)) {
+				dprintk("Faild to allocate BIO size=%u\n",
+					master_dev->bio->bi_max_vecs);
+				ret = -ENOMEM;
+				goto err;
+			}
+
+			__bio_clone(bio, master_dev->bio);
+			bio->bi_bdev = NULL;
+			bio->bi_next = NULL;
+			per_dev->bio = bio;
+			per_dev->dev = dev;
+			per_dev->length = master_dev->length;
+			per_dev->offset =  master_dev->offset;
+		} else {
+			bio = master_dev->bio;
+			bio->bi_rw |= REQ_WRITE;
+		}
+
+		osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
+
+		ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+		if (ret) {
+			dprintk("%s: Faild to osd_finalize_request() => %d\n",
+				__func__, ret);
+			goto err;
+		}
+
+		dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+			__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+			per_dev->length);
+	}
+
+err:
+	return ret;
+}
+
+static ssize_t _write_exec(struct objio_state *ios)
+{
+	unsigned i;
+	int ret;
+
+	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+		if (!ios->per_dev[i].length)
+			continue;
+		ret = _write_mirrors(ios, i);
+		if (unlikely(ret))
+			goto err;
+	}
+
+	ios->done = _write_done;
+	return _io_exec(ios); /* In sync mode exec returns the io->status */
+
+err:
+	_io_free(ios);
+	return ret;
+}
+
+ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
+{
+	struct objio_state *ios = container_of(ol_state, struct objio_state,
+					       ol_state);
+	int ret;
+
+	/* TODO: ios->stable = stable; */
+	ret = _io_rw_pagelist(ios, GFP_NOFS);
+	if (unlikely(ret))
+		return ret;
+
+	return _write_exec(ios);
+}
+
+/*
+ * objlayout_pg_test(). Called by nfs_can_coalesce_requests()
+ *
+ * return 1 :  coalesce page
+ * return 0 :  don't coalesce page
+ */
+int
+objlayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		   struct nfs_page *req)
+{
+	return 1;
+}
 
 static struct pnfs_layoutdriver_type objlayout_type = {
 	.id = LAYOUT_OSD2_OBJECTS,
@@ -370,6 +971,10 @@ static struct pnfs_layoutdriver_type objlayout_type = {
 	.alloc_lseg              = objlayout_alloc_lseg,
 	.free_lseg               = objlayout_free_lseg,
 
+	.read_pagelist           = objlayout_read_pagelist,
+	.write_pagelist          = objlayout_write_pagelist,
+	.pg_test                 = objlayout_pg_test,
+
 	.free_deviceid_node	 = objio_free_deviceid_node,
 };
 
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index f14b4da3405..5157ef6d004 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -128,6 +128,260 @@ objlayout_free_lseg(struct pnfs_layout_segment *lseg)
 	objio_free_lseg(lseg);
 }
 
+/*
+ * I/O Operations
+ */
+static inline u64
+end_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	end = start + len;
+	return end >= start ? end : NFS4_MAX_UINT64;
+}
+
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	BUG_ON(!len);
+	end = start + len;
+	return end > start ? end - 1 : NFS4_MAX_UINT64;
+}
+
+static struct objlayout_io_state *
+objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
+			struct page **pages,
+			unsigned pgbase,
+			loff_t offset,
+			size_t count,
+			struct pnfs_layout_segment *lseg,
+			void *rpcdata,
+			gfp_t gfp_flags)
+{
+	struct objlayout_io_state *state;
+	u64 lseg_end_offset;
+
+	dprintk("%s: allocating io_state\n", __func__);
+	if (objio_alloc_io_state(lseg, &state, gfp_flags))
+		return NULL;
+
+	BUG_ON(offset < lseg->pls_range.offset);
+	lseg_end_offset = end_offset(lseg->pls_range.offset,
+				     lseg->pls_range.length);
+	BUG_ON(offset >= lseg_end_offset);
+	if (offset + count > lseg_end_offset) {
+		count = lseg->pls_range.length -
+				(offset - lseg->pls_range.offset);
+		dprintk("%s: truncated count %Zd\n", __func__, count);
+	}
+
+	if (pgbase > PAGE_SIZE) {
+		pages += pgbase >> PAGE_SHIFT;
+		pgbase &= ~PAGE_MASK;
+	}
+
+	state->lseg = lseg;
+	state->rpcdata = rpcdata;
+	state->pages = pages;
+	state->pgbase = pgbase;
+	state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	state->offset = offset;
+	state->count = count;
+	state->sync = 0;
+
+	return state;
+}
+
+static void
+objlayout_free_io_state(struct objlayout_io_state *state)
+{
+	dprintk("%s: freeing io_state\n", __func__);
+	if (unlikely(!state))
+		return;
+
+	objio_free_io_state(state);
+}
+
+/*
+ * I/O done common code
+ */
+static void
+objlayout_iodone(struct objlayout_io_state *state)
+{
+	dprintk("%s: state %p status\n", __func__, state);
+
+	objlayout_free_io_state(state);
+}
+
+/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
+ * This is because the osd completion is called with ints-off from
+ * the block layer
+ */
+static void _rpc_read_complete(struct work_struct *work)
+{
+	struct rpc_task *task;
+	struct nfs_read_data *rdata;
+
+	dprintk("%s enter\n", __func__);
+	task = container_of(work, struct rpc_task, u.tk_work);
+	rdata = container_of(task, struct nfs_read_data, task);
+
+	pnfs_ld_read_done(rdata);
+}
+
+void
+objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
+{
+	int eof = state->eof;
+	struct nfs_read_data *rdata;
+
+	state->status = status;
+	dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
+	rdata = state->rpcdata;
+	rdata->task.tk_status = status;
+	if (status >= 0) {
+		rdata->res.count = status;
+		rdata->res.eof = eof;
+	}
+	objlayout_iodone(state);
+	/* must not use state after this point */
+
+	if (sync)
+		pnfs_ld_read_done(rdata);
+	else {
+		INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
+		schedule_work(&rdata->task.u.tk_work);
+	}
+}
+
+/*
+ * Perform sync or async reads.
+ */
+enum pnfs_try_status
+objlayout_read_pagelist(struct nfs_read_data *rdata)
+{
+	loff_t offset = rdata->args.offset;
+	size_t count = rdata->args.count;
+	struct objlayout_io_state *state;
+	ssize_t status = 0;
+	loff_t eof;
+
+	dprintk("%s: Begin inode %p offset %llu count %d\n",
+		__func__, rdata->inode, offset, (int)count);
+
+	eof = i_size_read(rdata->inode);
+	if (unlikely(offset + count > eof)) {
+		if (offset >= eof) {
+			status = 0;
+			rdata->res.count = 0;
+			rdata->res.eof = 1;
+			goto out;
+		}
+		count = eof - offset;
+	}
+
+	state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
+					 rdata->args.pages, rdata->args.pgbase,
+					 offset, count,
+					 rdata->lseg, rdata,
+					 GFP_KERNEL);
+	if (unlikely(!state)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	state->eof = state->offset + state->count >= eof;
+
+	status = objio_read_pagelist(state);
+ out:
+	dprintk("%s: Return status %Zd\n", __func__, status);
+	rdata->pnfs_error = status;
+	return PNFS_ATTEMPTED;
+}
+
+/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
+ * This is because the osd completion is called with ints-off from
+ * the block layer
+ */
+static void _rpc_write_complete(struct work_struct *work)
+{
+	struct rpc_task *task;
+	struct nfs_write_data *wdata;
+
+	dprintk("%s enter\n", __func__);
+	task = container_of(work, struct rpc_task, u.tk_work);
+	wdata = container_of(task, struct nfs_write_data, task);
+
+	pnfs_ld_write_done(wdata);
+}
+
+void
+objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
+		     bool sync)
+{
+	struct nfs_write_data *wdata;
+
+	dprintk("%s: Begin\n", __func__);
+	wdata = state->rpcdata;
+	state->status = status;
+	wdata->task.tk_status = status;
+	if (status >= 0) {
+		wdata->res.count = status;
+		wdata->verf.committed = state->committed;
+		dprintk("%s: Return status %d committed %d\n",
+			__func__, wdata->task.tk_status,
+			wdata->verf.committed);
+	} else
+		dprintk("%s: Return status %d\n",
+			__func__, wdata->task.tk_status);
+	objlayout_iodone(state);
+	/* must not use state after this point */
+
+	if (sync)
+		pnfs_ld_write_done(wdata);
+	else {
+		INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
+		schedule_work(&wdata->task.u.tk_work);
+	}
+}
+
+/*
+ * Perform sync or async writes.
+ */
+enum pnfs_try_status
+objlayout_write_pagelist(struct nfs_write_data *wdata,
+			 int how)
+{
+	struct objlayout_io_state *state;
+	ssize_t status;
+
+	dprintk("%s: Begin inode %p offset %llu count %u\n",
+		__func__, wdata->inode, wdata->args.offset, wdata->args.count);
+
+	state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
+					 wdata->args.pages,
+					 wdata->args.pgbase,
+					 wdata->args.offset,
+					 wdata->args.count,
+					 wdata->lseg, wdata,
+					 GFP_NOFS);
+	if (unlikely(!state)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	state->sync = how & FLUSH_SYNC;
+
+	status = objio_write_pagelist(state, how & FLUSH_STABLE);
+ out:
+	dprintk("%s: Return status %Zd\n", __func__, status);
+	wdata->pnfs_error = status;
+	return PNFS_ATTEMPTED;
+}
+
 /*
  * Get Device Info API for io engines
  */
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index fa0262149f5..9a405e8069f 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -58,6 +58,26 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo)
 	return container_of(lo, struct objlayout, pnfs_layout);
 }
 
+/*
+ * per-I/O operation state
+ * embedded in objects provider io_state data structure
+ */
+struct objlayout_io_state {
+	struct pnfs_layout_segment *lseg;
+
+	struct page **pages;
+	unsigned pgbase;
+	unsigned nr_pages;
+	unsigned long count;
+	loff_t offset;
+	bool sync;
+
+	void *rpcdata;
+	int status;             /* res */
+	int eof;                /* res */
+	int committed;          /* res */
+};
+
 /*
  * Raid engine I/O API
  */
@@ -68,9 +88,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
 	gfp_t gfp_flags);
 extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
 
+extern int objio_alloc_io_state(
+	struct pnfs_layout_segment *lseg,
+	struct objlayout_io_state **outp,
+	gfp_t gfp_flags);
+extern void objio_free_io_state(struct objlayout_io_state *state);
+
+extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
+extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
+				    bool stable);
+
 /*
  * callback API
  */
+extern void objlayout_read_done(struct objlayout_io_state *state,
+				ssize_t status, bool sync);
+extern void objlayout_write_done(struct objlayout_io_state *state,
+				 ssize_t status, bool sync);
+
 extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
 	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
 	gfp_t gfp_flags);
@@ -89,4 +124,11 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg(
 	gfp_t gfp_flags);
 extern void objlayout_free_lseg(struct pnfs_layout_segment *);
 
+extern enum pnfs_try_status objlayout_read_pagelist(
+	struct nfs_read_data *);
+
+extern enum pnfs_try_status objlayout_write_pagelist(
+	struct nfs_write_data *,
+	int how);
+
 #endif /* _OBJLAYOUT_H */
-- 
cgit v1.2.3-70-g09d2


From cbe8260369c9f88eafa035cd327dc3e02fad528c Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Sun, 22 May 2011 19:52:37 +0300
Subject: pnfs: layoutreturn

NFSv4.1 LAYOUTRETURN implementation

Currently, does not support layout-type payload encoding.

Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Dean Hildebrand <dhildeb@us.ibm.com>
Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Zhang Jingwang <zhangjingwang@nrchpc.ac.cn>
[call pnfs_return_layout right before pnfs_destroy_layout]
[remove assert_spin_locked from pnfs_clear_lseg_list]
[remove wait parameter from the layoutreturn path.]
[remove return_type field from nfs4_layoutreturn_args]
[remove range from nfs4_layoutreturn_args]
[no need to send layoutcommit from _pnfs_return_layout]
[don't wait on sync layoutreturn]
[fix layout stateid in layoutreturn args]
[fixed NULL deref in _pnfs_return_layout]
[removed recaim member of nfs4_layoutreturn_args]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/inode.c          |   3 +-
 fs/nfs/nfs4proc.c       |  82 +++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4xdr.c        | 111 +++++++++++++++++++++++++++++++++++++++++++++---
 fs/nfs/pnfs.c           |  45 ++++++++++++++++++++
 fs/nfs/pnfs.h           |  18 ++++++++
 include/linux/nfs4.h    |   1 +
 include/linux/nfs_xdr.h |  21 +++++++++
 7 files changed, 274 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 57bb31ad7a5..e9c6d9f8f7e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1424,9 +1424,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  */
 void nfs4_evict_inode(struct inode *inode)
 {
-	pnfs_destroy_layout(NFS_I(inode));
 	truncate_inode_pages(&inode->i_data, 0);
 	end_writeback(inode);
+	pnfs_return_layout(inode);
+	pnfs_destroy_layout(NFS_I(inode));
 	/* If we are holding a delegation, return it! */
 	nfs_inode_return_delegation_noreclaim(inode);
 	/* First call standard NFS clear_inode() code */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 92c8bc4b5f9..5b4124e4c22 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5673,6 +5673,88 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
 	return status;
 }
 
+static void
+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutreturn *lrp = calldata;
+
+	dprintk("--> %s\n", __func__);
+	if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
+				&lrp->res.seq_res, 0, task))
+		return;
+	rpc_call_start(task);
+}
+
+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutreturn *lrp = calldata;
+	struct nfs_server *server;
+
+	dprintk("--> %s\n", __func__);
+
+	if (!nfs4_sequence_done(task, &lrp->res.seq_res))
+		return;
+
+	server = NFS_SERVER(lrp->args.inode);
+	if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+		nfs_restart_rpc(task, lrp->clp);
+		return;
+	}
+	if (task->tk_status == 0) {
+		struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
+
+		if (lrp->res.lrs_present) {
+			spin_lock(&lo->plh_inode->i_lock);
+			pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+			spin_unlock(&lo->plh_inode->i_lock);
+		} else
+			BUG_ON(!list_empty(&lo->plh_segs));
+	}
+	dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_layoutreturn_release(void *calldata)
+{
+	struct nfs4_layoutreturn *lrp = calldata;
+
+	dprintk("--> %s\n", __func__);
+	put_layout_hdr(NFS_I(lrp->args.inode)->layout);
+	kfree(calldata);
+	dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
+	.rpc_call_prepare = nfs4_layoutreturn_prepare,
+	.rpc_call_done = nfs4_layoutreturn_done,
+	.rpc_release = nfs4_layoutreturn_release,
+};
+
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
+{
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
+		.rpc_argp = &lrp->args,
+		.rpc_resp = &lrp->res,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = lrp->clp->cl_rpcclient,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_layoutreturn_call_ops,
+		.callback_data = lrp,
+	};
+	int status;
+
+	dprintk("--> %s\n", __func__);
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	status = task->tk_status;
+	dprintk("<-- %s status=%d\n", __func__, status);
+	rpc_put_task(task);
+	return status;
+}
+
 static int
 _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
 {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c3ccd2c4683..f2421206435 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -338,7 +338,11 @@ static int nfs4_stat_to_errno(int);
 				1 /* layoutupdate4 layout type */ + \
 				1 /* NULL filelayout layoutupdate4 payload */)
 #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
-
+#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
+				encode_stateid_maxsz + \
+				1 /* FIXME: opaque lrf_body always empty at the moment */)
+#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
+				1 + decode_stateid_maxsz)
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz	0
 #define decode_sequence_maxsz	0
@@ -760,7 +764,14 @@ static int nfs4_stat_to_errno(int);
 				decode_putfh_maxsz + \
 				decode_layoutcommit_maxsz + \
 				decode_getattr_maxsz)
-
+#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
+				encode_sequence_maxsz + \
+				encode_putfh_maxsz + \
+				encode_layoutreturn_maxsz)
+#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
+				decode_sequence_maxsz + \
+				decode_putfh_maxsz + \
+				decode_layoutreturn_maxsz)
 
 const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
 				      compound_encode_hdr_maxsz +
@@ -1889,6 +1900,31 @@ encode_layoutcommit(struct xdr_stream *xdr,
 	hdr->replen += decode_layoutcommit_maxsz;
 	return 0;
 }
+
+static void
+encode_layoutreturn(struct xdr_stream *xdr,
+		    const struct nfs4_layoutreturn_args *args,
+		    struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 20);
+	*p++ = cpu_to_be32(OP_LAYOUTRETURN);
+	*p++ = cpu_to_be32(0);		/* reclaim. always 0 for now */
+	*p++ = cpu_to_be32(args->layout_type);
+	*p++ = cpu_to_be32(IOMODE_ANY);
+	*p = cpu_to_be32(RETURN_FILE);
+	p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
+	p = xdr_encode_hyper(p, 0);
+	p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
+	spin_lock(&args->inode->i_lock);
+	xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
+	spin_unlock(&args->inode->i_lock);
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(0);
+	hdr->nops++;
+	hdr->replen += decode_layoutreturn_maxsz;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 /*
@@ -2706,9 +2742,9 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
 /*
  *  Encode LAYOUTCOMMIT request
  */
-static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
-				     struct xdr_stream *xdr,
-				     struct nfs4_layoutcommit_args *args)
+static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs4_layoutcommit_args *args)
 {
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
@@ -2720,7 +2756,24 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
 	encode_layoutcommit(xdr, args, &hdr);
 	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
-	return 0;
+}
+
+/*
+ * Encode LAYOUTRETURN request
+ */
+static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      struct nfs4_layoutreturn_args *args)
+{
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+	encode_layoutreturn(xdr, args, &hdr);
+	encode_nops(&hdr);
 }
 #endif /* CONFIG_NFS_V4_1 */
 
@@ -5203,6 +5256,27 @@ out_overflow:
 	return -EIO;
 }
 
+static int decode_layoutreturn(struct xdr_stream *xdr,
+			       struct nfs4_layoutreturn_res *res)
+{
+	__be32 *p;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
+	if (status)
+		return status;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->lrs_present = be32_to_cpup(p);
+	if (res->lrs_present)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
 static int decode_layoutcommit(struct xdr_stream *xdr,
 			       struct rpc_rqst *req,
 			       struct nfs4_layoutcommit_res *res)
@@ -6319,6 +6393,30 @@ out:
 	return status;
 }
 
+/*
+ * Decode LAYOUTRETURN response
+ */
+static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     struct nfs4_layoutreturn_res *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_layoutreturn(xdr, res);
+out:
+	return status;
+}
+
 /*
  * Decode LAYOUTCOMMIT response
  */
@@ -6547,6 +6645,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(GETDEVICEINFO,	enc_getdeviceinfo,	dec_getdeviceinfo),
 	PROC(LAYOUTGET,		enc_layoutget,		dec_layoutget),
 	PROC(LAYOUTCOMMIT,	enc_layoutcommit,	dec_layoutcommit),
+	PROC(LAYOUTRETURN,	enc_layoutreturn,	dec_layoutreturn),
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 171662114fd..00b12824174 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -619,6 +619,51 @@ out_err_free:
 	return NULL;
 }
 
+/* Initiates a LAYOUTRETURN(FILE) */
+int
+_pnfs_return_layout(struct inode *ino)
+{
+	struct pnfs_layout_hdr *lo = NULL;
+	struct nfs_inode *nfsi = NFS_I(ino);
+	LIST_HEAD(tmp_list);
+	struct nfs4_layoutreturn *lrp;
+	nfs4_stateid stateid;
+	int status = 0;
+
+	dprintk("--> %s\n", __func__);
+
+	spin_lock(&ino->i_lock);
+	lo = nfsi->layout;
+	if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) {
+		spin_unlock(&ino->i_lock);
+		dprintk("%s: no layout segments to return\n", __func__);
+		goto out;
+	}
+	stateid = nfsi->layout->plh_stateid;
+	/* Reference matched in nfs4_layoutreturn_release */
+	get_layout_hdr(lo);
+	spin_unlock(&ino->i_lock);
+	pnfs_free_lseg_list(&tmp_list);
+
+	WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));
+
+	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
+	if (unlikely(lrp == NULL)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	lrp->args.stateid = stateid;
+	lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
+	lrp->args.inode = ino;
+	lrp->clp = NFS_SERVER(ino)->nfs_client;
+
+	status = nfs4_proc_layoutreturn(lrp);
+out:
+	dprintk("<-- %s status: %d\n", __func__, status);
+	return status;
+}
+
 bool pnfs_roc(struct inode *ino)
 {
 	struct pnfs_layout_hdr *lo;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 0383e66e71f..c34f7a0e3bc 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -129,6 +129,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
 				   struct pnfs_device *dev);
 extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 
 /* pnfs.c */
 void get_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -165,6 +166,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
 void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
+int _pnfs_return_layout(struct inode *);
 int pnfs_ld_write_done(struct nfs_write_data *);
 int pnfs_ld_read_done(struct nfs_read_data *);
 
@@ -256,6 +258,17 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
 		put_lseg(req->wb_commit_lseg);
 }
 
+static inline int pnfs_return_layout(struct inode *ino)
+{
+	struct nfs_inode *nfsi = NFS_I(ino);
+	struct nfs_server *nfss = NFS_SERVER(ino);
+
+	if (pnfs_enabled_sb(nfss) && nfsi->layout)
+		return _pnfs_return_layout(ino);
+
+	return 0;
+}
+
 #else  /* CONFIG_NFS_V4_1 */
 
 static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -298,6 +311,11 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
 	return PNFS_NOT_ATTEMPTED;
 }
 
+static inline int pnfs_return_layout(struct inode *ino)
+{
+	return 0;
+}
+
 static inline bool
 pnfs_roc(struct inode *ino)
 {
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 178fafe0ff9..9376eaf26e1 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -562,6 +562,7 @@ enum {
 	NFSPROC4_CLNT_LAYOUTGET,
 	NFSPROC4_CLNT_GETDEVICEINFO,
 	NFSPROC4_CLNT_LAYOUTCOMMIT,
+	NFSPROC4_CLNT_LAYOUTRETURN,
 };
 
 /* nfs41 types */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7c8ff0984a8..5e8444a11ad 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -269,6 +269,27 @@ struct nfs4_layoutcommit_data {
 	struct nfs4_layoutcommit_res res;
 };
 
+struct nfs4_layoutreturn_args {
+	__u32   layout_type;
+	struct inode *inode;
+	nfs4_stateid stateid;
+	struct nfs4_sequence_args seq_args;
+};
+
+struct nfs4_layoutreturn_res {
+	struct nfs4_sequence_res seq_res;
+	u32 lrs_present;
+	nfs4_stateid stateid;
+};
+
+struct nfs4_layoutreturn {
+	struct nfs4_layoutreturn_args args;
+	struct nfs4_layoutreturn_res res;
+	struct rpc_cred *cred;
+	struct nfs_client *clp;
+	int rpc_status;
+};
+
 /*
  * Arguments to the open call.
  */
-- 
cgit v1.2.3-70-g09d2


From 8a1636c459cb7a4b32ba4024cd1b2ba21fba6aed Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 14 Jul 2010 15:43:57 -0400
Subject: pnfs: layoutret_on_setattr

With the objects layout security model, we have object capabilities
that are associated with the layout and we anticipate that the server
will issue a cb_layoutrecall for any setattr that changes security
related attributes (user/group/mode/acl) or truncates the file.

Therefore, the layout is returned before issuing the setattr to avoid
the anticipated cb_layoutrecall.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/nfs4proc.c            |  3 +++
 fs/nfs/objlayout/objio_osd.c |  1 +
 fs/nfs/pnfs.h                | 22 ++++++++++++++++++++++
 3 files changed, 26 insertions(+)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5b4124e4c22..57340096c73 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2361,6 +2361,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	struct nfs4_state *state = NULL;
 	int status;
 
+	if (pnfs_ld_layoutret_on_setattr(inode))
+		pnfs_return_layout(inode);
+
 	nfs_fattr_init(fattr);
 	
 	/* Search for an existing open(O_WRITE) file */
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index cc92d3b3dc3..4e8de3ec9a6 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -964,6 +964,7 @@ objlayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 static struct pnfs_layoutdriver_type objlayout_type = {
 	.id = LAYOUT_OSD2_OBJECTS,
 	.name = "LAYOUT_OSD2_OBJECTS",
+	.flags                   = PNFS_LAYOUTRET_ON_SETATTR,
 
 	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
 	.free_layout_hdr         = objlayout_free_layout_hdr,
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index c34f7a0e3bc..af3967a893a 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -65,6 +65,11 @@ enum {
 	NFS_LAYOUT_DESTROYED,		/* no new use of layout allowed */
 };
 
+enum layoutdriver_policy_flags {
+	/* Should the pNFS client commit and return the layout upon a setattr */
+	PNFS_LAYOUTRET_ON_SETATTR	= 1 << 0,
+};
+
 struct nfs4_deviceid_node;
 
 /* Per-layout driver specific registration structure */
@@ -73,6 +78,7 @@ struct pnfs_layoutdriver_type {
 	const u32 id;
 	const char *name;
 	struct module *owner;
+	unsigned flags;
 
 	struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
 	void (*free_layout_hdr) (struct pnfs_layout_hdr *);
@@ -258,6 +264,16 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
 		put_lseg(req->wb_commit_lseg);
 }
 
+/* Should the pNFS client commit and return the layout upon a setattr */
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+	if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+		return false;
+	return NFS_SERVER(inode)->pnfs_curr_ld->flags &
+		PNFS_LAYOUTRET_ON_SETATTR;
+}
+
 static inline int pnfs_return_layout(struct inode *ino)
 {
 	struct nfs_inode *nfsi = NFS_I(ino);
@@ -316,6 +332,12 @@ static inline int pnfs_return_layout(struct inode *ino)
 	return 0;
 }
 
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+	return false;
+}
+
 static inline bool
 pnfs_roc(struct inode *ino)
 {
-- 
cgit v1.2.3-70-g09d2


From 04a555498e03b3804e2dec916a4669f5f560e503 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Sun, 22 May 2011 19:53:10 +0300
Subject: pnfs: encode_layoutreturn

Add a layout driver method to encode the layout type specific
opaque part of layout return in-line in the xdr stream.

Currently the pnfs-objects layout driver uses it to encode i/o error
information on LAYOUTRETURN.

Signed-off-by: Andy Adamson <andros@netapp.com>
[fixup layout header pointer for encode_layoutreturn]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/nfs4xdr.c | 9 +++++++--
 fs/nfs/pnfs.h    | 4 ++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f2421206435..d464badc006 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1920,8 +1920,13 @@ encode_layoutreturn(struct xdr_stream *xdr,
 	spin_lock(&args->inode->i_lock);
 	xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
 	spin_unlock(&args->inode->i_lock);
-	p = reserve_space(xdr, 4);
-	*p = cpu_to_be32(0);
+	if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
+		NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
+			NFS_I(args->inode)->layout, xdr, args);
+	} else {
+		p = reserve_space(xdr, 4);
+		*p = cpu_to_be32(0);
+	}
 	hdr->nops++;
 	hdr->replen += decode_layoutreturn_maxsz;
 }
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index af3967a893a..1b6b207a880 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -104,6 +104,10 @@ struct pnfs_layoutdriver_type {
 	enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
 
 	void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+
+	void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_layoutreturn_args *args);
 };
 
 struct pnfs_layout_hdr {
-- 
cgit v1.2.3-70-g09d2


From adb58535e604a564495a7d50dfb0afa0ddc21bcb Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Thu, 26 May 2011 21:49:46 +0300
Subject: pnfs-obj: report errors and .encode_layoutreturn Implementation.

An io_state pre-allocates an error information structure for each
possible osd-device that might error during IO. When IO is done if all
was well the io_state is freed. (as today). If the I/O has ended with an
error, the io_state is queued on a per-layout err_list. When eventually
encode_layoutreturn() is called, each error is properly encoded on the
XDR buffer and only then the io_state is removed from err_list and
de-allocated.

It is up to the io_engine to fill in the segment that fault and the type
of osd_error that occurred. By calling objlayout_io_set_result() for
each failing device.

In objio_osd:
* Allocate io-error descriptors space as part of io_state
* Use generic objlayout error reporting at end of io.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/objlayout/objio_osd.c |  44 +++++++-
 fs/nfs/objlayout/objlayout.c | 232 ++++++++++++++++++++++++++++++++++++++++++-
 fs/nfs/objlayout/objlayout.h |  23 +++++
 3 files changed, 297 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 4e8de3ec9a6..8bca5e13f3e 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -396,12 +396,16 @@ int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
 	struct objio_state *ios;
 	const unsigned first_size = sizeof(*ios) +
 				objio_seg->num_comps * sizeof(ios->per_dev[0]);
+	const unsigned sec_size = objio_seg->num_comps *
+						sizeof(ios->ol_state.ioerrs[0]);
 
-	ios = kzalloc(first_size, gfp_flags);
+	ios = kzalloc(first_size + sec_size, gfp_flags);
 	if (unlikely(!ios))
 		return -ENOMEM;
 
 	ios->layout = objio_seg;
+	ios->ol_state.ioerrs = ((void *)ios) + first_size;
+	ios->ol_state.num_comps = objio_seg->num_comps;
 
 	*outp = &ios->ol_state;
 	return 0;
@@ -415,6 +419,36 @@ void objio_free_io_state(struct objlayout_io_state *ol_state)
 	kfree(ios);
 }
 
+enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
+{
+	switch (oep) {
+	case OSD_ERR_PRI_NO_ERROR:
+		return (enum pnfs_osd_errno)0;
+
+	case OSD_ERR_PRI_CLEAR_PAGES:
+		BUG_ON(1);
+		return 0;
+
+	case OSD_ERR_PRI_RESOURCE:
+		return PNFS_OSD_ERR_RESOURCE;
+	case OSD_ERR_PRI_BAD_CRED:
+		return PNFS_OSD_ERR_BAD_CRED;
+	case OSD_ERR_PRI_NO_ACCESS:
+		return PNFS_OSD_ERR_NO_ACCESS;
+	case OSD_ERR_PRI_UNREACHABLE:
+		return PNFS_OSD_ERR_UNREACHABLE;
+	case OSD_ERR_PRI_NOT_FOUND:
+		return PNFS_OSD_ERR_NOT_FOUND;
+	case OSD_ERR_PRI_NO_SPACE:
+		return PNFS_OSD_ERR_NO_SPACE;
+	default:
+		WARN_ON(1);
+		/* fallthrough */
+	case OSD_ERR_PRI_EIO:
+		return PNFS_OSD_ERR_EIO;
+	}
+}
+
 static void _clear_bio(struct bio *bio)
 {
 	struct bio_vec *bv;
@@ -461,6 +495,12 @@ static int _io_check(struct objio_state *ios, bool is_write)
 			continue; /* we recovered */
 		}
 		dev = ios->per_dev[i].dev;
+		objlayout_io_set_result(&ios->ol_state, dev,
+					&ios->layout->comps[dev].oc_object_id,
+					osd_pri_2_pnfs_err(osi.osd_err_pri),
+					ios->per_dev[i].offset,
+					ios->per_dev[i].length,
+					is_write);
 
 		if (osi.osd_err_pri >= oep) {
 			oep = osi.osd_err_pri;
@@ -977,6 +1017,8 @@ static struct pnfs_layoutdriver_type objlayout_type = {
 	.pg_test                 = objlayout_pg_test,
 
 	.free_deviceid_node	 = objio_free_deviceid_node,
+
+	.encode_layoutreturn     = objlayout_encode_layoutreturn,
 };
 
 MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 5157ef6d004..f7caecff6b4 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -50,6 +50,10 @@ objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
 	struct objlayout *objlay;
 
 	objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
+	if (objlay) {
+		spin_lock_init(&objlay->lock);
+		INIT_LIST_HEAD(&objlay->err_list);
+	}
 	dprintk("%s: Return %p\n", __func__, objlay);
 	return &objlay->pnfs_layout;
 }
@@ -64,6 +68,7 @@ objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
 
 	dprintk("%s: objlay %p\n", __func__, objlay);
 
+	WARN_ON(!list_empty(&objlay->err_list));
 	kfree(objlay);
 }
 
@@ -183,6 +188,7 @@ objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
 		pgbase &= ~PAGE_MASK;
 	}
 
+	INIT_LIST_HEAD(&state->err_list);
 	state->lseg = lseg;
 	state->rpcdata = rpcdata;
 	state->pages = pages;
@@ -213,7 +219,52 @@ objlayout_iodone(struct objlayout_io_state *state)
 {
 	dprintk("%s: state %p status\n", __func__, state);
 
-	objlayout_free_io_state(state);
+	if (likely(state->status >= 0)) {
+		objlayout_free_io_state(state);
+	} else {
+		struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
+
+		spin_lock(&objlay->lock);
+		list_add(&objlay->err_list, &state->err_list);
+		spin_unlock(&objlay->lock);
+	}
+}
+
+/*
+ * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
+ *
+ * The @index component IO failed (error returned from target). Register
+ * the error for later reporting at layout-return.
+ */
+void
+objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
+			struct pnfs_osd_objid *pooid, int osd_error,
+			u64 offset, u64 length, bool is_write)
+{
+	struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
+
+	BUG_ON(index >= state->num_comps);
+	if (osd_error) {
+		ioerr->oer_component = *pooid;
+		ioerr->oer_comp_offset = offset;
+		ioerr->oer_comp_length = length;
+		ioerr->oer_iswrite = is_write;
+		ioerr->oer_errno = osd_error;
+
+		dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
+			"par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
+			__func__, index, ioerr->oer_errno,
+			ioerr->oer_iswrite,
+			_DEVID_LO(&ioerr->oer_component.oid_device_id),
+			_DEVID_HI(&ioerr->oer_component.oid_device_id),
+			ioerr->oer_component.oid_partition_id,
+			ioerr->oer_component.oid_object_id,
+			ioerr->oer_comp_offset,
+			ioerr->oer_comp_length);
+	} else {
+		/* User need not call if no error is reported */
+		ioerr->oer_errno = 0;
+	}
 }
 
 /* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
@@ -382,6 +433,185 @@ objlayout_write_pagelist(struct nfs_write_data *wdata,
 	return PNFS_ATTEMPTED;
 }
 
+static int
+err_prio(u32 oer_errno)
+{
+	switch (oer_errno) {
+	case 0:
+		return 0;
+
+	case PNFS_OSD_ERR_RESOURCE:
+		return OSD_ERR_PRI_RESOURCE;
+	case PNFS_OSD_ERR_BAD_CRED:
+		return OSD_ERR_PRI_BAD_CRED;
+	case PNFS_OSD_ERR_NO_ACCESS:
+		return OSD_ERR_PRI_NO_ACCESS;
+	case PNFS_OSD_ERR_UNREACHABLE:
+		return OSD_ERR_PRI_UNREACHABLE;
+	case PNFS_OSD_ERR_NOT_FOUND:
+		return OSD_ERR_PRI_NOT_FOUND;
+	case PNFS_OSD_ERR_NO_SPACE:
+		return OSD_ERR_PRI_NO_SPACE;
+	default:
+		WARN_ON(1);
+		/* fallthrough */
+	case PNFS_OSD_ERR_EIO:
+		return OSD_ERR_PRI_EIO;
+	}
+}
+
+static void
+merge_ioerr(struct pnfs_osd_ioerr *dest_err,
+	    const struct pnfs_osd_ioerr *src_err)
+{
+	u64 dest_end, src_end;
+
+	if (!dest_err->oer_errno) {
+		*dest_err = *src_err;
+		/* accumulated device must be blank */
+		memset(&dest_err->oer_component.oid_device_id, 0,
+			sizeof(dest_err->oer_component.oid_device_id));
+
+		return;
+	}
+
+	if (dest_err->oer_component.oid_partition_id !=
+				src_err->oer_component.oid_partition_id)
+		dest_err->oer_component.oid_partition_id = 0;
+
+	if (dest_err->oer_component.oid_object_id !=
+				src_err->oer_component.oid_object_id)
+		dest_err->oer_component.oid_object_id = 0;
+
+	if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
+		dest_err->oer_comp_offset = src_err->oer_comp_offset;
+
+	dest_end = end_offset(dest_err->oer_comp_offset,
+			      dest_err->oer_comp_length);
+	src_end =  end_offset(src_err->oer_comp_offset,
+			      src_err->oer_comp_length);
+	if (dest_end < src_end)
+		dest_end = src_end;
+
+	dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
+
+	if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
+	    (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
+			dest_err->oer_errno = src_err->oer_errno;
+	} else if (src_err->oer_iswrite) {
+		dest_err->oer_iswrite = true;
+		dest_err->oer_errno = src_err->oer_errno;
+	}
+}
+
+static void
+encode_accumulated_error(struct objlayout *objlay, __be32 *p)
+{
+	struct objlayout_io_state *state, *tmp;
+	struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
+
+	list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+		unsigned i;
+
+		for (i = 0; i < state->num_comps; i++) {
+			struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+
+			if (!ioerr->oer_errno)
+				continue;
+
+			printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
+				"dev(%llx:%llx) par=0x%llx obj=0x%llx "
+				"offset=0x%llx length=0x%llx\n",
+				__func__, i, ioerr->oer_errno,
+				ioerr->oer_iswrite,
+				_DEVID_LO(&ioerr->oer_component.oid_device_id),
+				_DEVID_HI(&ioerr->oer_component.oid_device_id),
+				ioerr->oer_component.oid_partition_id,
+				ioerr->oer_component.oid_object_id,
+				ioerr->oer_comp_offset,
+				ioerr->oer_comp_length);
+
+			merge_ioerr(&accumulated_err, ioerr);
+		}
+		list_del(&state->err_list);
+		objlayout_free_io_state(state);
+	}
+
+	pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
+}
+
+void
+objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
+			      struct xdr_stream *xdr,
+			      const struct nfs4_layoutreturn_args *args)
+{
+	struct objlayout *objlay = OBJLAYOUT(pnfslay);
+	struct objlayout_io_state *state, *tmp;
+	__be32 *start;
+
+	dprintk("%s: Begin\n", __func__);
+	start = xdr_reserve_space(xdr, 4);
+	BUG_ON(!start);
+
+	spin_lock(&objlay->lock);
+
+	list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+		__be32 *last_xdr = NULL, *p;
+		unsigned i;
+		int res = 0;
+
+		for (i = 0; i < state->num_comps; i++) {
+			struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+
+			if (!ioerr->oer_errno)
+				continue;
+
+			dprintk("%s: err[%d]: errno=%d is_write=%d "
+				"dev(%llx:%llx) par=0x%llx obj=0x%llx "
+				"offset=0x%llx length=0x%llx\n",
+				__func__, i, ioerr->oer_errno,
+				ioerr->oer_iswrite,
+				_DEVID_LO(&ioerr->oer_component.oid_device_id),
+				_DEVID_HI(&ioerr->oer_component.oid_device_id),
+				ioerr->oer_component.oid_partition_id,
+				ioerr->oer_component.oid_object_id,
+				ioerr->oer_comp_offset,
+				ioerr->oer_comp_length);
+
+			p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
+			if (unlikely(!p)) {
+				res = -E2BIG;
+				break; /* accumulated_error */
+			}
+
+			last_xdr = p;
+			pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]);
+		}
+
+		/* TODO: use xdr_write_pages */
+		if (unlikely(res)) {
+			/* no space for even one error descriptor */
+			BUG_ON(!last_xdr);
+
+			/* we've encountered a situation with lots and lots of
+			 * errors and no space to encode them all. Use the last
+			 * available slot to report the union of all the
+			 * remaining errors.
+			 */
+			encode_accumulated_error(objlay, last_xdr);
+			goto loop_done;
+		}
+		list_del(&state->err_list);
+		objlayout_free_io_state(state);
+	}
+loop_done:
+	spin_unlock(&objlay->lock);
+
+	*start = cpu_to_be32((xdr->p - start - 1) * 4);
+	dprintk("%s: Return\n", __func__);
+}
+
+
 /*
  * Get Device Info API for io engines
  */
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 9a405e8069f..b0bb975058e 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -50,6 +50,10 @@
  */
 struct objlayout {
 	struct pnfs_layout_hdr pnfs_layout;
+
+	 /* for layout_return */
+	spinlock_t lock;
+	struct list_head err_list;
 };
 
 static inline struct objlayout *
@@ -76,6 +80,16 @@ struct objlayout_io_state {
 	int status;             /* res */
 	int eof;                /* res */
 	int committed;          /* res */
+
+	/* Error reporting (layout_return) */
+	struct list_head err_list;
+	unsigned num_comps;
+	/* Pointer to array of error descriptors of size num_comps.
+	 * It should contain as many entries as devices in the osd_layout
+	 * that participate in the I/O. It is up to the io_engine to allocate
+	 * needed space and set num_comps.
+	 */
+	struct pnfs_osd_ioerr *ioerrs;
 };
 
 /*
@@ -101,6 +115,10 @@ extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
 /*
  * callback API
  */
+extern void objlayout_io_set_result(struct objlayout_io_state *state,
+			unsigned index, struct pnfs_osd_objid *pooid,
+			int osd_error, u64 offset, u64 length, bool is_write);
+
 extern void objlayout_read_done(struct objlayout_io_state *state,
 				ssize_t status, bool sync);
 extern void objlayout_write_done(struct objlayout_io_state *state,
@@ -131,4 +149,9 @@ extern enum pnfs_try_status objlayout_write_pagelist(
 	struct nfs_write_data *,
 	int how);
 
+extern void objlayout_encode_layoutreturn(
+	struct pnfs_layout_hdr *,
+	struct xdr_stream *,
+	const struct nfs4_layoutreturn_args *);
+
 #endif /* _OBJLAYOUT_H */
-- 
cgit v1.2.3-70-g09d2


From ac7db7264ac3314cae09893bc838fcb7e83267a4 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Sun, 22 May 2011 19:53:48 +0300
Subject: pnfs: encode_layoutcommit

Add a layout driver method to encode the layout type specific
opaque part of layout commit in-line in the xdr stream.

Currently, the pnfs-objects layout driver uses it to encode metadata hints
to the MDS and the blocks layout driver to commit provisionally allocated
extents to the file.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/nfs4xdr.c | 16 +++++++++++++---
 fs/nfs/pnfs.h    |  4 ++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d464badc006..d869a5e5464 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1875,6 +1875,7 @@ encode_layoutget(struct xdr_stream *xdr,
 
 static int
 encode_layoutcommit(struct xdr_stream *xdr,
+		    struct inode *inode,
 		    const struct nfs4_layoutcommit_args *args,
 		    struct compound_hdr *hdr)
 {
@@ -1883,7 +1884,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
 	dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
 		NFS_SERVER(args->inode)->pnfs_curr_ld->id);
 
-	p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
 	/* Only whole file layouts */
 	p = xdr_encode_hyper(p, 0); /* offset */
@@ -1894,7 +1895,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
 	p = xdr_encode_hyper(p, args->lastbytewritten);
 	*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
 	*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
-	*p++ = cpu_to_be32(0); /* no file layout payload */
+
+	if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
+		NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
+			NFS_I(inode)->layout, xdr, args);
+	else {
+		p = reserve_space(xdr, 4);
+		*p = cpu_to_be32(0); /* no layout-type payload */
+	}
 
 	hdr->nops++;
 	hdr->replen += decode_layoutcommit_maxsz;
@@ -2751,6 +2759,8 @@ static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
 				      struct xdr_stream *xdr,
 				      struct nfs4_layoutcommit_args *args)
 {
+	struct nfs4_layoutcommit_data *data =
+		container_of(args, struct nfs4_layoutcommit_data, args);
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
@@ -2758,7 +2768,7 @@ static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
-	encode_layoutcommit(xdr, args, &hdr);
+	encode_layoutcommit(xdr, data->args.inode, args, &hdr);
 	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 }
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1b6b207a880..3fda9714677 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -108,6 +108,10 @@ struct pnfs_layoutdriver_type {
 	void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
 				     struct xdr_stream *xdr,
 				     const struct nfs4_layoutreturn_args *args);
+
+	void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_layoutcommit_args *args);
 };
 
 struct pnfs_layout_hdr {
-- 
cgit v1.2.3-70-g09d2


From a0fe8bf427f4987d7b82678292ca03cfd7331467 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 22 May 2011 19:54:13 +0300
Subject: pnfs-obj: objlayout_encode_layoutcommit implementation

* Define API for io-engines to report delta_space_used in IOs
* Encode the osd-layout specific information of the layoutcommit
  XDR buffer.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/objlayout/objio_osd.c |  1 +
 fs/nfs/objlayout/objlayout.c | 30 ++++++++++++++++++++++++++++++
 fs/nfs/objlayout/objlayout.h | 30 ++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 8bca5e13f3e..3be124160e9 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -1018,6 +1018,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
 
 	.free_deviceid_node	 = objio_free_deviceid_node,
 
+	.encode_layoutcommit	 = objlayout_encode_layoutcommit,
 	.encode_layoutreturn     = objlayout_encode_layoutreturn,
 };
 
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index f7caecff6b4..dc3956c0de8 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -225,6 +225,7 @@ objlayout_iodone(struct objlayout_io_state *state)
 		struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
 
 		spin_lock(&objlay->lock);
+		objlay->delta_space_valid = OBJ_DSU_INVALID;
 		list_add(&objlay->err_list, &state->err_list);
 		spin_unlock(&objlay->lock);
 	}
@@ -433,6 +434,35 @@ objlayout_write_pagelist(struct nfs_write_data *wdata,
 	return PNFS_ATTEMPTED;
 }
 
+void
+objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
+			      struct xdr_stream *xdr,
+			      const struct nfs4_layoutcommit_args *args)
+{
+	struct objlayout *objlay = OBJLAYOUT(pnfslay);
+	struct pnfs_osd_layoutupdate lou;
+	__be32 *start;
+
+	dprintk("%s: Begin\n", __func__);
+
+	spin_lock(&objlay->lock);
+	lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
+	lou.dsu_delta = objlay->delta_space_used;
+	objlay->delta_space_used = 0;
+	objlay->delta_space_valid = OBJ_DSU_INIT;
+	lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
+	spin_unlock(&objlay->lock);
+
+	start = xdr_reserve_space(xdr, 4);
+
+	BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
+
+	*start = cpu_to_be32((xdr->p - start - 1) * 4);
+
+	dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
+		lou.dsu_delta, lou.olu_ioerr_flag);
+}
+
 static int
 err_prio(u32 oer_errno)
 {
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index b0bb975058e..a8244c8e042 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -51,6 +51,14 @@
 struct objlayout {
 	struct pnfs_layout_hdr pnfs_layout;
 
+	 /* for layout_commit */
+	enum osd_delta_space_valid_enum {
+		OBJ_DSU_INIT = 0,
+		OBJ_DSU_VALID,
+		OBJ_DSU_INVALID,
+	} delta_space_valid;
+	s64 delta_space_used;  /* consumed by write ops */
+
 	 /* for layout_return */
 	spinlock_t lock;
 	struct list_head err_list;
@@ -119,6 +127,23 @@ extern void objlayout_io_set_result(struct objlayout_io_state *state,
 			unsigned index, struct pnfs_osd_objid *pooid,
 			int osd_error, u64 offset, u64 length, bool is_write);
 
+static inline void
+objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
+{
+	struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
+
+	/* If one of the I/Os errored out and the delta_space_used was
+	 * invalid we render the complete report as invalid. Protocol mandate
+	 * the DSU be accurate or not reported.
+	 */
+	spin_lock(&objlay->lock);
+	if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
+		objlay->delta_space_valid = OBJ_DSU_VALID;
+		objlay->delta_space_used += space_used;
+	}
+	spin_unlock(&objlay->lock);
+}
+
 extern void objlayout_read_done(struct objlayout_io_state *state,
 				ssize_t status, bool sync);
 extern void objlayout_write_done(struct objlayout_io_state *state,
@@ -149,6 +174,11 @@ extern enum pnfs_try_status objlayout_write_pagelist(
 	struct nfs_write_data *,
 	int how);
 
+extern void objlayout_encode_layoutcommit(
+	struct pnfs_layout_hdr *,
+	struct xdr_stream *,
+	const struct nfs4_layoutcommit_args *);
+
 extern void objlayout_encode_layoutreturn(
 	struct pnfs_layout_hdr *,
 	struct xdr_stream *,
-- 
cgit v1.2.3-70-g09d2


From dfed206b8857d41a91ebba030f99e30017a44dda Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 25 May 2011 20:25:22 +0300
Subject: NFSv4.1: unify pnfs_pageio_init functions

Use common code for pnfs_pageio_init_{read,write} and use
a common generic pg_test function.

Note that this function always assumes the the layout driver's
pg_test method is implemented.

[Fix BUG]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/pagelist.c |  2 ++
 fs/nfs/pnfs.c     | 55 +++++++++++++++----------------------------------------
 fs/nfs/pnfs.h     | 21 ++++++++++-----------
 fs/nfs/read.c     |  1 -
 fs/nfs/write.c    |  2 --
 5 files changed, 27 insertions(+), 54 deletions(-)

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index c80add6e221..b8704fedcd1 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -229,6 +229,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 	desc->pg_ioflags = io_flags;
 	desc->pg_error = 0;
 	desc->pg_lseg = NULL;
+	desc->pg_test = NULL;
+	pnfs_pageio_init(desc, inode);
 }
 
 /**
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 00b12824174..568ab0eef67 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1043,46 +1043,30 @@ out_forget_reply:
 	goto out;
 }
 
-static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
-			     struct nfs_page *prev,
-			     struct nfs_page *req)
-{
-	if (pgio->pg_count == prev->wb_bytes) {
-		/* This is first coelesce call for a series of nfs_pages */
-		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-						   prev->wb_context,
-						   req_offset(req),
-						   pgio->pg_count,
-						   IOMODE_READ,
-						   GFP_KERNEL);
-	} else if (pgio->pg_lseg &&
-		   req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
-						pgio->pg_lseg->pls_range.length))
-		return 0;
-	return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
-}
-
-void
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
+int
+pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		     struct nfs_page *req)
 {
-	struct pnfs_layoutdriver_type *ld;
+	enum pnfs_iomode access_type;
+	gfp_t gfp_flags;
 
-	ld = NFS_SERVER(inode)->pnfs_curr_ld;
-	pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
-}
+	/* We assume that pg_ioflags == 0 iff we're reading a page */
+	if (pgio->pg_ioflags == 0) {
+		access_type = IOMODE_READ;
+		gfp_flags = GFP_KERNEL;
+	} else {
+		access_type = IOMODE_RW;
+		gfp_flags = GFP_NOFS;
+	}
 
-static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
-			      struct nfs_page *prev,
-			      struct nfs_page *req)
-{
 	if (pgio->pg_count == prev->wb_bytes) {
 		/* This is first coelesce call for a series of nfs_pages */
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 						   prev->wb_context,
 						   req_offset(req),
 						   pgio->pg_count,
-						   IOMODE_RW,
-						   GFP_NOFS);
+						   access_type,
+						   gfp_flags);
 	} else if (pgio->pg_lseg &&
 		   req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
 						pgio->pg_lseg->pls_range.length))
@@ -1090,15 +1074,6 @@ static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
 	return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
 }
 
-void
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
-{
-	struct pnfs_layoutdriver_type *ld;
-
-	ld = NFS_SERVER(inode)->pnfs_curr_ld;
-	pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
-}
-
 /*
  * Called by non rpc-based layout drivers
  */
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 3fda9714677..c056688ee92 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -158,8 +158,7 @@ enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
 					     const struct rpc_call_ops *, int);
 enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
 					    const struct rpc_call_ops *);
-void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
-void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
+int pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
@@ -293,6 +292,13 @@ static inline int pnfs_return_layout(struct inode *ino)
 	return 0;
 }
 
+static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
+				    struct inode *inode)
+{
+	if (NFS_SERVER(inode)->pnfs_curr_ld)
+		pgio->pg_test = pnfs_generic_pg_test;
+}
+
 #else  /* CONFIG_NFS_V4_1 */
 
 static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -376,16 +382,9 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
 {
 }
 
-static inline void
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
-{
-	pgio->pg_test = NULL;
-}
-
-static inline void
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
+static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
+				    struct inode *inode)
 {
-	pgio->pg_test = NULL;
 }
 
 static inline void
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 540c8bc93f9..20a7f952e24 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -664,7 +664,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	if (ret == 0)
 		goto read_complete; /* all pages were read */
 
-	pnfs_pageio_init_read(&pgio, inode);
 	if (rsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
 	else
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7edb72f27c2..e268e3b2349 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1036,8 +1036,6 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
 {
 	size_t wsize = NFS_SERVER(inode)->wsize;
 
-	pnfs_pageio_init_write(pgio, inode);
-
 	if (wsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
 	else
-- 
cgit v1.2.3-70-g09d2


From 18ad0a9f2ccd260d37dd6bc5fa04c7819def4c84 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 25 May 2011 21:03:56 +0300
Subject: NFSv4.1: change pg_test return type to bool

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/nfs4filelayout.c      |  6 +++---
 fs/nfs/objlayout/objio_osd.c |  7 +++----
 fs/nfs/pagelist.c            | 22 +++++++++++-----------
 fs/nfs/pnfs.c                |  4 ++--
 fs/nfs/pnfs.h                |  4 ++--
 include/linux/nfs_page.h     |  2 +-
 6 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 33bda24e8cd..24f05720daf 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -651,10 +651,10 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 /*
  * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
  *
- * return 1 :  coalesce page
- * return 0 :  don't coalesce page
+ * return true  : coalesce page
+ * return false : don't coalesce page
  */
-int
+bool
 filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 		   struct nfs_page *req)
 {
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 3be124160e9..8c2bd3eb8e8 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -991,14 +991,13 @@ ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
 /*
  * objlayout_pg_test(). Called by nfs_can_coalesce_requests()
  *
- * return 1 :  coalesce page
- * return 0 :  don't coalesce page
+ * return true iff coalesce page
  */
-int
+bool
 objlayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 		   struct nfs_page *req)
 {
-	return 1;
+	return true;
 }
 
 static struct pnfs_layoutdriver_type objlayout_type = {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index b8704fedcd1..5344371a257 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -244,29 +244,29 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
  *
  * Return 'true' if this is the case, else return 'false'.
  */
-static int nfs_can_coalesce_requests(struct nfs_page *prev,
-				     struct nfs_page *req,
-				     struct nfs_pageio_descriptor *pgio)
+static bool nfs_can_coalesce_requests(struct nfs_page *prev,
+				      struct nfs_page *req,
+				      struct nfs_pageio_descriptor *pgio)
 {
 	if (req->wb_context->cred != prev->wb_context->cred)
-		return 0;
+		return false;
 	if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
-		return 0;
+		return false;
 	if (req->wb_context->state != prev->wb_context->state)
-		return 0;
+		return false;
 	if (req->wb_index != (prev->wb_index + 1))
-		return 0;
+		return false;
 	if (req->wb_pgbase != 0)
-		return 0;
+		return false;
 	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
-		return 0;
+		return false;
 	/*
 	 * Non-whole file layouts need to check that req is inside of
 	 * pgio->pg_lseg.
 	 */
 	if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
-		return 0;
-	return 1;
+		return false;
+	return true;
 }
 
 /**
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 568ab0eef67..212fc292761 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1043,7 +1043,7 @@ out_forget_reply:
 	goto out;
 }
 
-int
+bool
 pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 		     struct nfs_page *req)
 {
@@ -1070,7 +1070,7 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 	} else if (pgio->pg_lseg &&
 		   req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
 						pgio->pg_lseg->pls_range.length))
-		return 0;
+		return false;
 	return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
 }
 
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index c056688ee92..65daae59c8a 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -87,7 +87,7 @@ struct pnfs_layoutdriver_type {
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
 
 	/* test for nfs page cache coalescing */
-	int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+	bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
 
 	/* Returns true if layoutdriver wants to divert this request to
 	 * driver's commit routine.
@@ -158,7 +158,7 @@ enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
 					     const struct rpc_call_ops *, int);
 enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
 					    const struct rpc_call_ops *);
-int pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
+bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 91af2e49fa3..3a34e80ae92 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -68,7 +68,7 @@ struct nfs_pageio_descriptor {
 	int 			pg_ioflags;
 	int			pg_error;
 	struct pnfs_layout_segment *pg_lseg;
-	int			(*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+	bool			(*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
 };
 
 #define NFS_WBACK_BUSY(req)	(test_bit(PG_BUSY,&(req)->wb_flags))
-- 
cgit v1.2.3-70-g09d2


From 89a58e32d9105c01022a757fb32ddc3b51bf0025 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 25 May 2011 20:54:40 +0300
Subject: NFSv4.1: use pnfs_generic_pg_test directly by layout driver

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/nfs4filelayout.c      |  3 +++
 fs/nfs/objlayout/objio_osd.c | 14 +-------------
 fs/nfs/pnfs.c                | 13 +++++++++----
 fs/nfs/pnfs.h                |  6 ++++--
 4 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 24f05720daf..426908809c9 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -661,6 +661,9 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 	u64 p_stripe, r_stripe;
 	u32 stripe_unit;
 
+	if (!pnfs_generic_pg_test(pgio, prev, req))
+		return 0;
+
 	if (!pgio->pg_lseg)
 		return 1;
 	p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 8c2bd3eb8e8..3b10d09e521 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -988,18 +988,6 @@ ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
 	return _write_exec(ios);
 }
 
-/*
- * objlayout_pg_test(). Called by nfs_can_coalesce_requests()
- *
- * return true iff coalesce page
- */
-bool
-objlayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
-		   struct nfs_page *req)
-{
-	return true;
-}
-
 static struct pnfs_layoutdriver_type objlayout_type = {
 	.id = LAYOUT_OSD2_OBJECTS,
 	.name = "LAYOUT_OSD2_OBJECTS",
@@ -1013,7 +1001,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
 
 	.read_pagelist           = objlayout_read_pagelist,
 	.write_pagelist          = objlayout_write_pagelist,
-	.pg_test                 = objlayout_pg_test,
+	.pg_test                 = pnfs_generic_pg_test,
 
 	.free_deviceid_node	 = objio_free_deviceid_node,
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 212fc292761..d79f2df33a4 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1067,12 +1067,17 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 						   pgio->pg_count,
 						   access_type,
 						   gfp_flags);
-	} else if (pgio->pg_lseg &&
-		   req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
-						pgio->pg_lseg->pls_range.length))
+		return true;
+	}
+
+	if (pgio->pg_lseg &&
+	    req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
+					 pgio->pg_lseg->pls_range.length))
 		return false;
-	return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
+
+	return true;
 }
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
 
 /*
  * Called by non rpc-based layout drivers
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 65daae59c8a..48d0a8e4d06 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -295,8 +295,10 @@ static inline int pnfs_return_layout(struct inode *ino)
 static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
 				    struct inode *inode)
 {
-	if (NFS_SERVER(inode)->pnfs_curr_ld)
-		pgio->pg_test = pnfs_generic_pg_test;
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (ld)
+		pgio->pg_test = ld->pg_test;
 }
 
 #else  /* CONFIG_NFS_V4_1 */
-- 
cgit v1.2.3-70-g09d2


From 5b36c7dc41d87d39e779a84fdc2b44b39bba32ca Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 29 May 2011 11:45:39 +0300
Subject: NFSv4.1: define nfs_generic_pg_test

By default, unless pnfs is used coalesce pages until pg_bsize
(rsize or wsize) is reached.

pnfs layout drivers define their own pg_test methods that use
pnfs_generic_pg_test and need to define their own I/O size
limits (e.g. based on the file stripe size).

[Move a check from nfs_pageio_do_add_request to nfs_generic_pg_test]
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/pagelist.c | 44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 5344371a257..7913961aff2 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -204,6 +204,21 @@ nfs_wait_on_request(struct nfs_page *req)
 			TASK_UNINTERRUPTIBLE);
 }
 
+static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
+{
+	/*
+	 * FIXME: ideally we should be able to coalesce all requests
+	 * that are not block boundary aligned, but currently this
+	 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
+	 * since nfs_flush_multi and nfs_pagein_multi assume you
+	 * can have only one struct nfs_page.
+	 */
+	if (desc->pg_bsize < PAGE_SIZE)
+		return 0;
+
+	return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
+}
+
 /**
  * nfs_pageio_init - initialise a page io descriptor
  * @desc: pointer to descriptor
@@ -229,7 +244,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 	desc->pg_ioflags = io_flags;
 	desc->pg_error = 0;
 	desc->pg_lseg = NULL;
-	desc->pg_test = NULL;
+	desc->pg_test = nfs_generic_pg_test;
 	pnfs_pageio_init(desc, inode);
 }
 
@@ -260,13 +275,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
 		return false;
 	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
 		return false;
-	/*
-	 * Non-whole file layouts need to check that req is inside of
-	 * pgio->pg_lseg.
-	 */
-	if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
-		return false;
-	return true;
+	return pgio->pg_test(pgio, prev, req);
 }
 
 /**
@@ -280,31 +289,18 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
 static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
 				     struct nfs_page *req)
 {
-	size_t newlen = req->wb_bytes;
-
 	if (desc->pg_count != 0) {
 		struct nfs_page *prev;
 
-		/*
-		 * FIXME: ideally we should be able to coalesce all requests
-		 * that are not block boundary aligned, but currently this
-		 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
-		 * since nfs_flush_multi and nfs_pagein_multi assume you
-		 * can have only one struct nfs_page.
-		 */
-		if (desc->pg_bsize < PAGE_SIZE)
-			return 0;
-		newlen += desc->pg_count;
-		if (newlen > desc->pg_bsize)
-			return 0;
 		prev = nfs_list_entry(desc->pg_list.prev);
 		if (!nfs_can_coalesce_requests(prev, req, desc))
 			return 0;
-	} else
+	} else {
 		desc->pg_base = req->wb_pgbase;
+	}
 	nfs_list_remove_request(req);
 	nfs_list_add_request(req, &desc->pg_list);
-	desc->pg_count = newlen;
+	desc->pg_count += req->wb_bytes;
 	return 1;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 9342077011d54f42fa1b88b7bc1f7008dcf5fff9 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Wed, 25 May 2011 21:25:29 +0300
Subject: pnfs-obj: pg_test check for max_io_size

Implement pg_test vector to test for max IO sizes. We calculate
a max_io_size member only once, and cache it in lseg so to not
do so on every page insert.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
[simplify logic]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfs/objlayout/objio_osd.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 3b10d09e521..9cf208df1f2 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -128,6 +128,8 @@ struct objio_segment {
 	u64 group_depth;
 	unsigned group_count;
 
+	unsigned max_io_size;
+
 	unsigned comps_index;
 	unsigned num_comps;
 	/* variable length */
@@ -365,6 +367,11 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
 		objio_seg->group_count = 1;
 	}
 
+	/* Cache this calculation it will hit for every page */
+	objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
+				  objio_seg->stripe_unit) *
+				 objio_seg->group_width;
+
 	*outp = &objio_seg->lseg;
 	return 0;
 
@@ -988,6 +995,16 @@ ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
 	return _write_exec(ios);
 }
 
+static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
+			  struct nfs_page *prev, struct nfs_page *req)
+{
+	if (!pnfs_generic_pg_test(pgio, prev, req))
+		return false;
+
+	return pgio->pg_count + req->wb_bytes <=
+			OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
+}
+
 static struct pnfs_layoutdriver_type objlayout_type = {
 	.id = LAYOUT_OSD2_OBJECTS,
 	.name = "LAYOUT_OSD2_OBJECTS",
@@ -1001,7 +1018,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
 
 	.read_pagelist           = objlayout_read_pagelist,
 	.write_pagelist          = objlayout_write_pagelist,
-	.pg_test                 = pnfs_generic_pg_test,
+	.pg_test                 = objio_pg_test,
 
 	.free_deviceid_node	 = objio_free_deviceid_node,
 
-- 
cgit v1.2.3-70-g09d2


From ef1d57599dc904fdb31b8e9b5336350d21a1fde1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sun, 29 May 2011 13:46:08 +0100
Subject: cifs/ubifs: Fix shrinker API change fallout

Commit 1495f230fa77 ("vmscan: change shrinker API by passing
shrink_control struct") changed the API of ->shrink(), but missed ubifs
and cifs instances.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/cifs/cifsacl.c   | 3 ++-
 fs/ubifs/shrinker.c | 3 ++-
 fs/ubifs/ubifs.h    | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 8f1700623b4..21de1d6d584 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -74,8 +74,9 @@ shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
  * Run idmap cache shrinker.
  */
 static int
-cifs_idmap_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
 {
+	int nr_to_scan = sc->nr_to_scan;
 	int nr_del = 0;
 	int nr_rem = 0;
 	struct rb_root *root;
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 46961c00323..ca953a94502 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,8 +277,9 @@ static int kick_a_thread(void)
 	return 0;
 }
 
-int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
 {
+	int nr = sc->nr_to_scan;
 	int freed, contention = 0;
 	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 93d1412a06f..a70d7b4ffb2 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1614,7 +1614,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int ubifs_tnc_end_commit(struct ubifs_info *c);
 
 /* shrinker.c */
-int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
+int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc);
 
 /* commit.c */
 int ubifs_bg_thread(void *info);
-- 
cgit v1.2.3-70-g09d2


From 6345d24daf0c1fffe6642081d783cdf653ebaa5c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 29 May 2011 11:32:28 -0700
Subject: mm: Fix boot crash in mm_alloc()

Thomas Gleixner reports that we now have a boot crash triggered by
CONFIG_CPUMASK_OFFSTACK=y:

    BUG: unable to handle kernel NULL pointer dereference at   (null)
    IP: [<c11ae035>] find_next_bit+0x55/0xb0
    Call Trace:
     [<c11addda>] cpumask_any_but+0x2a/0x70
     [<c102396b>] flush_tlb_mm+0x2b/0x80
     [<c1022705>] pud_populate+0x35/0x50
     [<c10227ba>] pgd_alloc+0x9a/0xf0
     [<c103a3fc>] mm_init+0xec/0x120
     [<c103a7a3>] mm_alloc+0x53/0xd0

which was introduced by commit de03c72cfce5 ("mm: convert
mm->cpu_vm_cpumask into cpumask_var_t"), and is due to wrong ordering of
mm_init() vs mm_init_cpumask

Thomas wrote a patch to just fix the ordering of initialization, but I
hate the new double allocation in the fork path, so I ended up instead
doing some more radical surgery to clean it all up.

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 14 ++++++++++++--
 include/linux/sched.h    |  1 -
 init/main.c              |  2 +-
 kernel/fork.c            | 42 ++++++++++--------------------------------
 4 files changed, 23 insertions(+), 36 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 2a78aae78c6..027935c86c6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -264,6 +264,8 @@ struct mm_struct {
 
 	struct linux_binfmt *binfmt;
 
+	cpumask_var_t cpu_vm_mask_var;
+
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
@@ -311,10 +313,18 @@ struct mm_struct {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	pgtable_t pmd_huge_pte; /* protected by page_table_lock */
 #endif
-
-	cpumask_var_t cpu_vm_mask_var;
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	struct cpumask cpumask_allocation;
+#endif
 };
 
+static inline void mm_init_cpumask(struct mm_struct *mm)
+{
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	mm->cpu_vm_mask_var = &mm->cpumask_allocation;
+#endif
+}
+
 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
 static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bcddd013810..2a8621c4be1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2194,7 +2194,6 @@ static inline void mmdrop(struct mm_struct * mm)
 	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
 		__mmdrop(mm);
 }
-extern int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm);
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
diff --git a/init/main.c b/init/main.c
index d2f1e086bf3..cafba67c13b 100644
--- a/init/main.c
+++ b/init/main.c
@@ -487,6 +487,7 @@ asmlinkage void __init start_kernel(void)
 	printk(KERN_NOTICE "%s", linux_banner);
 	setup_arch(&command_line);
 	mm_init_owner(&init_mm, &init_task);
+	mm_init_cpumask(&init_mm);
 	setup_command_line(command_line);
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
@@ -510,7 +511,6 @@ asmlinkage void __init start_kernel(void)
 	sort_main_extable();
 	trap_init();
 	mm_init();
-	BUG_ON(mm_init_cpumask(&init_mm, 0));
 
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
diff --git a/kernel/fork.c b/kernel/fork.c
index ca406d91671..0276c30401a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -484,20 +484,6 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
-int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL))
-		return -ENOMEM;
-
-	if (oldmm)
-		cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm));
-	else
-		memset(mm_cpumask(mm), 0, cpumask_size());
-#endif
-	return 0;
-}
-
 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
 	atomic_set(&mm->mm_users, 1);
@@ -538,17 +524,8 @@ struct mm_struct * mm_alloc(void)
 		return NULL;
 
 	memset(mm, 0, sizeof(*mm));
-	mm = mm_init(mm, current);
-	if (!mm)
-		return NULL;
-
-	if (mm_init_cpumask(mm, NULL)) {
-		mm_free_pgd(mm);
-		free_mm(mm);
-		return NULL;
-	}
-
-	return mm;
+	mm_init_cpumask(mm);
+	return mm_init(mm, current);
 }
 
 /*
@@ -559,7 +536,6 @@ struct mm_struct * mm_alloc(void)
 void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
-	free_cpumask_var(mm->cpu_vm_mask_var);
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
@@ -753,6 +729,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 		goto fail_nomem;
 
 	memcpy(mm, oldmm, sizeof(*mm));
+	mm_init_cpumask(mm);
 
 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
@@ -765,9 +742,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
-	if (mm_init_cpumask(mm, oldmm))
-		goto fail_nocpumask;
-
 	if (init_new_context(tsk, mm))
 		goto fail_nocontext;
 
@@ -794,9 +768,6 @@ fail_nomem:
 	return NULL;
 
 fail_nocontext:
-	free_cpumask_var(mm->cpu_vm_mask_var);
-
-fail_nocpumask:
 	/*
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
@@ -1591,6 +1562,13 @@ void __init proc_caches_init(void)
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+	/*
+	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
+	 * whole struct cpumask for the OFFSTACK case. We could change
+	 * this to *only* allocate as much of it as required by the
+	 * maximum number of CPU's we can ever have.  The cpumask_allocation
+	 * is at the end of the structure, exactly for that reason.
+	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
-- 
cgit v1.2.3-70-g09d2


From 3b06b3ebf44170c90c893c6c80916db6e922b9f2 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Tue, 24 May 2011 03:49:02 -0500
Subject: eCryptfs: Fix new inode race condition

Only unlock and d_add() new inodes after the plaintext inode size has
been read from the lower filesystem. This fixes a race condition that
was sometimes seen during a multi-job kernel build in an eCryptfs mount.

https://bugzilla.kernel.org/show_bug.cgi?id=36002

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Reported-by: David <david@unsolicited.net>
Tested-by: David <david@unsolicited.net>
---
 fs/ecryptfs/crypto.c          |  4 ++--
 fs/ecryptfs/ecryptfs_kernel.h |  4 ++--
 fs/ecryptfs/file.c            |  2 +-
 fs/ecryptfs/inode.c           | 42 ++++++++++++++++++++++--------------------
 fs/ecryptfs/main.c            |  6 +++---
 5 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index b8d5c809102..f48c4987a15 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1568,11 +1568,11 @@ out:
 }
 
 int ecryptfs_read_and_validate_xattr_region(char *page_virt,
-					    struct dentry *ecryptfs_dentry)
+					    struct inode *inode)
 {
 	int rc;
 
-	rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_dentry->d_inode);
+	rc = ecryptfs_read_xattr_region(page_virt, inode);
 	if (rc)
 		goto out;
 	if (!contains_ecryptfs_marker(page_virt	+ ECRYPTFS_FILE_SIZE_BYTES)) {
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 41a45323637..72aa24a4c71 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -662,7 +662,7 @@ void ecryptfs_write_crypt_stat_flags(char *page_virt,
 int ecryptfs_read_and_validate_header_region(char *data,
 					     struct inode *ecryptfs_inode);
 int ecryptfs_read_and_validate_xattr_region(char *page_virt,
-					    struct dentry *ecryptfs_dentry);
+					    struct inode *inode);
 u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
 int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
 void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
@@ -753,7 +753,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
 			     struct dentry *lower_dentry,
 			     struct vfsmount *lower_mnt,
 			     const struct cred *cred);
-int ecryptfs_get_lower_file(struct dentry *ecryptfs_dentry);
+int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode);
 void ecryptfs_put_lower_file(struct inode *inode);
 int
 ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 566e5472f78..4ec9eb00a24 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -191,7 +191,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 				      | ECRYPTFS_ENCRYPTED);
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
-	rc = ecryptfs_get_lower_file(ecryptfs_dentry);
+	rc = ecryptfs_get_lower_file(ecryptfs_dentry, inode);
 	if (rc) {
 		printk(KERN_ERR "%s: Error attempting to initialize "
 			"the lower file for the dentry with name "
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index fc7d2b74850..f0ad965d7d5 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -259,7 +259,8 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
 				"context; rc = [%d]\n", rc);
 		goto out;
 	}
-	rc = ecryptfs_get_lower_file(ecryptfs_dentry);
+	rc = ecryptfs_get_lower_file(ecryptfs_dentry,
+				     ecryptfs_dentry->d_inode);
 	if (rc) {
 		printk(KERN_ERR "%s: Error attempting to initialize "
 			"the lower file for the dentry with name "
@@ -350,50 +351,51 @@ static int ecryptfs_lookup_interpose(struct dentry *ecryptfs_dentry,
 		       __func__, rc);
 		goto out;
 	}
-	if (inode->i_state & I_NEW)
-		unlock_new_inode(inode);
-	d_add(ecryptfs_dentry, inode);
-	if (S_ISDIR(lower_inode->i_mode))
-		goto out;
-	if (S_ISLNK(lower_inode->i_mode))
-		goto out;
-	if (special_file(lower_inode->i_mode))
+	if (!S_ISREG(inode->i_mode)) {
+		if (inode->i_state & I_NEW)
+			unlock_new_inode(inode);
+		d_add(ecryptfs_dentry, inode);
 		goto out;
+	}
 	/* Released in this function */
 	page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
 	if (!page_virt) {
 		printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
 		       __func__);
 		rc = -ENOMEM;
+		make_bad_inode(inode);
 		goto out;
 	}
-	rc = ecryptfs_get_lower_file(ecryptfs_dentry);
+	rc = ecryptfs_get_lower_file(ecryptfs_dentry, inode);
 	if (rc) {
 		printk(KERN_ERR "%s: Error attempting to initialize "
 			"the lower file for the dentry with name "
 			"[%s]; rc = [%d]\n", __func__,
 			ecryptfs_dentry->d_name.name, rc);
+		make_bad_inode(inode);
 		goto out_free_kmem;
 	}
 	put_lower = 1;
-	crypt_stat = &ecryptfs_inode_to_private(
-					ecryptfs_dentry->d_inode)->crypt_stat;
+	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
 	/* TODO: lock for crypt_stat comparison */
 	if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
 			ecryptfs_set_default_sizes(crypt_stat);
-	rc = ecryptfs_read_and_validate_header_region(page_virt,
-						      ecryptfs_dentry->d_inode);
+	rc = ecryptfs_read_and_validate_header_region(page_virt, inode);
 	if (rc) {
 		memset(page_virt, 0, PAGE_CACHE_SIZE);
 		rc = ecryptfs_read_and_validate_xattr_region(page_virt,
-							     ecryptfs_dentry);
+							     inode);
 		if (rc) {
 			rc = 0;
-			goto out_free_kmem;
+			goto unlock_inode;
 		}
 		crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
 	}
-	ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode);
+	ecryptfs_i_size_init(page_virt, inode);
+unlock_inode:
+	if (inode->i_state & I_NEW)
+		unlock_new_inode(inode);
+	d_add(ecryptfs_dentry, inode);
 out_free_kmem:
 	kmem_cache_free(ecryptfs_header_cache_2, page_virt);
 	goto out;
@@ -403,7 +405,7 @@ out_put:
 	d_drop(ecryptfs_dentry);
 out:
 	if (put_lower)
-		ecryptfs_put_lower_file(ecryptfs_dentry->d_inode);
+		ecryptfs_put_lower_file(inode);
 	return rc;
 }
 
@@ -843,7 +845,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		lower_ia->ia_valid &= ~ATTR_SIZE;
 		return 0;
 	}
-	rc = ecryptfs_get_lower_file(dentry);
+	rc = ecryptfs_get_lower_file(dentry, inode);
 	if (rc)
 		return rc;
 	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
@@ -999,7 +1001,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
 
 		mount_crypt_stat = &ecryptfs_superblock_to_private(
 			dentry->d_sb)->mount_crypt_stat;
-		rc = ecryptfs_get_lower_file(dentry);
+		rc = ecryptfs_get_lower_file(dentry, inode);
 		if (rc) {
 			mutex_unlock(&crypt_stat->cs_mutex);
 			goto out;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 7c697abab39..943a4f55ed6 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -135,12 +135,12 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
 	return rc;
 }
 
-int ecryptfs_get_lower_file(struct dentry *dentry)
+int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode)
 {
-	struct ecryptfs_inode_info *inode_info =
-		ecryptfs_inode_to_private(dentry->d_inode);
+	struct ecryptfs_inode_info *inode_info;
 	int count, rc = 0;
 
+	inode_info = ecryptfs_inode_to_private(inode);
 	mutex_lock(&inode_info->lower_file_mutex);
 	count = atomic_inc_return(&inode_info->lower_file_count);
 	if (WARN_ON_ONCE(count < 1))
-- 
cgit v1.2.3-70-g09d2


From 7a86617e553f47761b10f57de472d7262562b7de Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Mon, 2 May 2011 00:39:54 -0500
Subject: eCryptfs: Return useful code from contains_ecryptfs_marker

Instead of having the calling functions translate the true/false return
code to either 0 or -EINVAL, have contains_ecryptfs_marker() return 0 or
-EINVAL so that the calling functions can just reuse the return code.

Also, rename the function to ecryptfs_validate_marker() to avoid callers
mistakenly thinking that it returns true/false codes.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/crypto.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index f48c4987a15..162f9baf9eb 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1024,25 +1024,25 @@ out:
 }
 
 /**
- * contains_ecryptfs_marker - check for the ecryptfs marker
+ * ecryptfs_validate_marker - check for the ecryptfs marker
  * @data: The data block in which to check
  *
- * Returns one if marker found; zero if not found
+ * Returns zero if marker found; -EINVAL if not found
  */
-static int contains_ecryptfs_marker(char *data)
+static int ecryptfs_validate_marker(char *data)
 {
 	u32 m_1, m_2;
 
 	m_1 = get_unaligned_be32(data);
 	m_2 = get_unaligned_be32(data + 4);
 	if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
-		return 1;
+		return 0;
 	ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
 			"MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2,
 			MAGIC_ECRYPTFS_MARKER);
 	ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = "
 			"[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER));
-	return 0;
+	return -EINVAL;
 }
 
 struct ecryptfs_flag_map_elem {
@@ -1217,10 +1217,7 @@ int ecryptfs_read_and_validate_header_region(char *data,
 		       __func__, rc);
 		goto out;
 	}
-	if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
-		rc = -EINVAL;
-	} else
-		rc = 0;
+	rc = ecryptfs_validate_marker(data + ECRYPTFS_FILE_SIZE_BYTES);
 out:
 	return rc;
 }
@@ -1496,11 +1493,9 @@ static int ecryptfs_read_headers_virt(char *page_virt,
 	crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private(
 		ecryptfs_dentry->d_sb)->mount_crypt_stat;
 	offset = ECRYPTFS_FILE_SIZE_BYTES;
-	rc = contains_ecryptfs_marker(page_virt + offset);
-	if (rc == 0) {
-		rc = -EINVAL;
+	rc = ecryptfs_validate_marker(page_virt + offset);
+	if (rc)
 		goto out;
-	}
 	if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED))
 		ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode);
 	offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
@@ -1575,11 +1570,10 @@ int ecryptfs_read_and_validate_xattr_region(char *page_virt,
 	rc = ecryptfs_read_xattr_region(page_virt, inode);
 	if (rc)
 		goto out;
-	if (!contains_ecryptfs_marker(page_virt	+ ECRYPTFS_FILE_SIZE_BYTES)) {
+	rc = ecryptfs_validate_marker(page_virt + ECRYPTFS_FILE_SIZE_BYTES);
+	if (rc)
 		printk(KERN_WARNING "Valid data found in [%s] xattr, but "
 			"the marker is invalid\n", ECRYPTFS_XATTR_NAME);
-		rc = -EINVAL;
-	}
 out:
 	return rc;
 }
-- 
cgit v1.2.3-70-g09d2


From 778aeb42a708d2a57e491d2cbb5a1e74f61270b9 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Tue, 24 May 2011 04:56:23 -0500
Subject: eCryptfs: Cleanup and optimize ecryptfs_lookup_interpose()

ecryptfs_lookup_interpose() has turned into spaghetti code over the
years. This is an effort to clean it up.

 - Shorten overly descriptive variable names such as ecryptfs_dentry
 - Simplify gotos and error paths
 - Create helper function for reading plaintext i_size from metadata

It also includes an optimization when reading i_size from the metadata.
A complete page-sized kmem_cache_alloc() was being done to read in 16
bytes of metadata. The buffer for that is now statically declared.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/crypto.c          |  45 +++++++-------
 fs/ecryptfs/ecryptfs_kernel.h |   7 ++-
 fs/ecryptfs/inode.c           | 141 +++++++++++++++++++-----------------------
 3 files changed, 88 insertions(+), 105 deletions(-)

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 162f9baf9eb..66d8e6748a4 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1201,24 +1201,19 @@ int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code)
 	return rc;
 }
 
-int ecryptfs_read_and_validate_header_region(char *data,
-					     struct inode *ecryptfs_inode)
+int ecryptfs_read_and_validate_header_region(struct inode *inode)
 {
-	struct ecryptfs_crypt_stat *crypt_stat =
-		&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
+	u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
+	u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
 	int rc;
 
-	if (crypt_stat->extent_size == 0)
-		crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
-	rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
-				 ecryptfs_inode);
-	if (rc < 0) {
-		printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n",
-		       __func__, rc);
-		goto out;
-	}
-	rc = ecryptfs_validate_marker(data + ECRYPTFS_FILE_SIZE_BYTES);
-out:
+	rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES,
+				 inode);
+	if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
+		return rc >= 0 ? -EINVAL : rc;
+	rc = ecryptfs_validate_marker(marker);
+	if (!rc)
+		ecryptfs_i_size_init(file_size, inode);
 	return rc;
 }
 
@@ -1562,19 +1557,21 @@ out:
 	return rc;
 }
 
-int ecryptfs_read_and_validate_xattr_region(char *page_virt,
+int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
 					    struct inode *inode)
 {
+	u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
+	u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
 	int rc;
 
-	rc = ecryptfs_read_xattr_region(page_virt, inode);
-	if (rc)
-		goto out;
-	rc = ecryptfs_validate_marker(page_virt + ECRYPTFS_FILE_SIZE_BYTES);
-	if (rc)
-		printk(KERN_WARNING "Valid data found in [%s] xattr, but "
-			"the marker is invalid\n", ECRYPTFS_XATTR_NAME);
-out:
+	rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry),
+				     ECRYPTFS_XATTR_NAME, file_size,
+				     ECRYPTFS_SIZE_AND_MARKER_BYTES);
+	if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
+		return rc >= 0 ? -EINVAL : rc;
+	rc = ecryptfs_validate_marker(marker);
+	if (!rc)
+		ecryptfs_i_size_init(file_size, inode);
 	return rc;
 }
 
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 72aa24a4c71..8297ddaca7c 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -200,6 +200,8 @@ ecryptfs_get_key_payload_data(struct key *key)
 #define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5
 #define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8	/* 4*2 */
 #define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64))
+#define ECRYPTFS_SIZE_AND_MARKER_BYTES (ECRYPTFS_FILE_SIZE_BYTES \
+					+ MAGIC_ECRYPTFS_MARKER_SIZE_BYTES)
 #define ECRYPTFS_DEFAULT_CIPHER "aes"
 #define ECRYPTFS_DEFAULT_KEY_BYTES 16
 #define ECRYPTFS_DEFAULT_HASH "md5"
@@ -659,9 +661,8 @@ int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
 void ecryptfs_write_crypt_stat_flags(char *page_virt,
 				     struct ecryptfs_crypt_stat *crypt_stat,
 				     size_t *written);
-int ecryptfs_read_and_validate_header_region(char *data,
-					     struct inode *ecryptfs_inode);
-int ecryptfs_read_and_validate_xattr_region(char *page_virt,
+int ecryptfs_read_and_validate_header_region(struct inode *inode);
+int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
 					    struct inode *inode);
 u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
 int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index f0ad965d7d5..7349ade17de 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -307,105 +307,90 @@ out:
 	return rc;
 }
 
+static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
+{
+	struct ecryptfs_crypt_stat *crypt_stat;
+	int rc;
+
+	rc = ecryptfs_get_lower_file(dentry, inode);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to initialize "
+			"the lower file for the dentry with name "
+			"[%s]; rc = [%d]\n", __func__,
+			dentry->d_name.name, rc);
+		return rc;
+	}
+
+	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+	/* TODO: lock for crypt_stat comparison */
+	if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
+		ecryptfs_set_default_sizes(crypt_stat);
+
+	rc = ecryptfs_read_and_validate_header_region(inode);
+	ecryptfs_put_lower_file(inode);
+	if (rc) {
+		rc = ecryptfs_read_and_validate_xattr_region(dentry, inode);
+		if (!rc)
+			crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
+	}
+
+	/* Must return 0 to allow non-eCryptfs files to be looked up, too */
+	return 0;
+}
+
 /**
  * ecryptfs_lookup_interpose - Dentry interposition for a lookup
  */
-static int ecryptfs_lookup_interpose(struct dentry *ecryptfs_dentry,
+static int ecryptfs_lookup_interpose(struct dentry *dentry,
 				     struct dentry *lower_dentry,
-				     struct inode *ecryptfs_dir_inode)
+				     struct inode *dir_inode)
 {
-	struct dentry *lower_dir_dentry;
+	struct inode *inode, *lower_inode = lower_dentry->d_inode;
+	struct ecryptfs_dentry_info *dentry_info;
 	struct vfsmount *lower_mnt;
-	struct inode *inode, *lower_inode;
-	struct ecryptfs_crypt_stat *crypt_stat;
-	char *page_virt = NULL;
-	int put_lower = 0, rc = 0;
-
-	lower_dir_dentry = lower_dentry->d_parent;
-	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
-				   ecryptfs_dentry->d_parent));
-	lower_inode = lower_dentry->d_inode;
-	fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
+	int rc = 0;
+
+	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
+	fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
 	BUG_ON(!lower_dentry->d_count);
-	ecryptfs_set_dentry_private(ecryptfs_dentry,
-				    kmem_cache_alloc(ecryptfs_dentry_info_cache,
-						     GFP_KERNEL));
-	if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) {
-		rc = -ENOMEM;
+
+	dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
+	ecryptfs_set_dentry_private(dentry, dentry_info);
+	if (!dentry_info) {
 		printk(KERN_ERR "%s: Out of memory whilst attempting "
 		       "to allocate ecryptfs_dentry_info struct\n",
 			__func__);
-		goto out_put;
+		dput(lower_dentry);
+		mntput(lower_mnt);
+		d_drop(dentry);
+		return -ENOMEM;
 	}
-	ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
-	ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
+	ecryptfs_set_dentry_lower(dentry, lower_dentry);
+	ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
+
 	if (!lower_dentry->d_inode) {
 		/* We want to add because we couldn't find in lower */
-		d_add(ecryptfs_dentry, NULL);
-		goto out;
+		d_add(dentry, NULL);
+		return 0;
 	}
-	inode = __ecryptfs_get_inode(lower_inode, ecryptfs_dir_inode->i_sb);
+	inode = __ecryptfs_get_inode(lower_inode, dir_inode->i_sb);
 	if (IS_ERR(inode)) {
-		rc = PTR_ERR(inode);
-		printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
-		       __func__, rc);
-		goto out;
-	}
-	if (!S_ISREG(inode->i_mode)) {
-		if (inode->i_state & I_NEW)
-			unlock_new_inode(inode);
-		d_add(ecryptfs_dentry, inode);
-		goto out;
-	}
-	/* Released in this function */
-	page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
-	if (!page_virt) {
-		printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
-		       __func__);
-		rc = -ENOMEM;
-		make_bad_inode(inode);
-		goto out;
-	}
-	rc = ecryptfs_get_lower_file(ecryptfs_dentry, inode);
-	if (rc) {
-		printk(KERN_ERR "%s: Error attempting to initialize "
-			"the lower file for the dentry with name "
-			"[%s]; rc = [%d]\n", __func__,
-			ecryptfs_dentry->d_name.name, rc);
-		make_bad_inode(inode);
-		goto out_free_kmem;
+		printk(KERN_ERR "%s: Error interposing; rc = [%ld]\n",
+		       __func__, PTR_ERR(inode));
+		return PTR_ERR(inode);
 	}
-	put_lower = 1;
-	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
-	/* TODO: lock for crypt_stat comparison */
-	if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
-			ecryptfs_set_default_sizes(crypt_stat);
-	rc = ecryptfs_read_and_validate_header_region(page_virt, inode);
-	if (rc) {
-		memset(page_virt, 0, PAGE_CACHE_SIZE);
-		rc = ecryptfs_read_and_validate_xattr_region(page_virt,
-							     inode);
+	if (S_ISREG(inode->i_mode)) {
+		rc = ecryptfs_i_size_read(dentry, inode);
 		if (rc) {
-			rc = 0;
-			goto unlock_inode;
+			make_bad_inode(inode);
+			return rc;
 		}
-		crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
 	}
-	ecryptfs_i_size_init(page_virt, inode);
-unlock_inode:
+
 	if (inode->i_state & I_NEW)
 		unlock_new_inode(inode);
-	d_add(ecryptfs_dentry, inode);
-out_free_kmem:
-	kmem_cache_free(ecryptfs_header_cache_2, page_virt);
-	goto out;
-out_put:
-	dput(lower_dentry);
-	mntput(lower_mnt);
-	d_drop(ecryptfs_dentry);
-out:
-	if (put_lower)
-		ecryptfs_put_lower_file(inode);
+	d_add(dentry, inode);
+
 	return rc;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 3063287053bca5207e121c567b95b2b6f0bdc2c8 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Tue, 24 May 2011 05:11:12 -0500
Subject: eCryptfs: Remove ecryptfs_header_cache_2

Now that ecryptfs_lookup_interpose() is no longer using
ecryptfs_header_cache_2 to read in metadata, the kmem_cache can be
removed and the ecryptfs_header_cache_1 kmem_cache can be renamed to
ecryptfs_header_cache.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/crypto.c          | 7 +++----
 fs/ecryptfs/ecryptfs_kernel.h | 3 +--
 fs/ecryptfs/main.c            | 9 ++-------
 3 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 66d8e6748a4..58609bde3b9 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1234,8 +1234,7 @@ ecryptfs_write_header_metadata(char *virt,
 	(*written) = 6;
 }
 
-struct kmem_cache *ecryptfs_header_cache_1;
-struct kmem_cache *ecryptfs_header_cache_2;
+struct kmem_cache *ecryptfs_header_cache;
 
 /**
  * ecryptfs_write_headers_virt
@@ -1601,7 +1600,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 	ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat,
 						      mount_crypt_stat);
 	/* Read the first page from the underlying file */
-	page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, GFP_USER);
+	page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER);
 	if (!page_virt) {
 		rc = -ENOMEM;
 		printk(KERN_ERR "%s: Unable to allocate page_virt\n",
@@ -1646,7 +1645,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 out:
 	if (page_virt) {
 		memset(page_virt, 0, PAGE_CACHE_SIZE);
-		kmem_cache_free(ecryptfs_header_cache_1, page_virt);
+		kmem_cache_free(ecryptfs_header_cache, page_virt);
 	}
 	return rc;
 }
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 8297ddaca7c..43c7c43b06f 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -605,8 +605,7 @@ extern struct kmem_cache *ecryptfs_file_info_cache;
 extern struct kmem_cache *ecryptfs_dentry_info_cache;
 extern struct kmem_cache *ecryptfs_inode_info_cache;
 extern struct kmem_cache *ecryptfs_sb_info_cache;
-extern struct kmem_cache *ecryptfs_header_cache_1;
-extern struct kmem_cache *ecryptfs_header_cache_2;
+extern struct kmem_cache *ecryptfs_header_cache;
 extern struct kmem_cache *ecryptfs_xattr_cache;
 extern struct kmem_cache *ecryptfs_key_record_cache;
 extern struct kmem_cache *ecryptfs_key_sig_cache;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 943a4f55ed6..9f1bb747d77 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -635,13 +635,8 @@ static struct ecryptfs_cache_info {
 		.size = sizeof(struct ecryptfs_sb_info),
 	},
 	{
-		.cache = &ecryptfs_header_cache_1,
-		.name = "ecryptfs_headers_1",
-		.size = PAGE_CACHE_SIZE,
-	},
-	{
-		.cache = &ecryptfs_header_cache_2,
-		.name = "ecryptfs_headers_2",
+		.cache = &ecryptfs_header_cache,
+		.name = "ecryptfs_headers",
 		.size = PAGE_CACHE_SIZE,
 	},
 	{
-- 
cgit v1.2.3-70-g09d2


From fac04863cef53a69830590b2e1c54345068a9747 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 29 May 2011 14:06:42 -0700
Subject: arm gpio drivers: make them 'depends on ARM'

We had a few drivers move from arch/arm into drivers/gpio, but they
don't actually compile without the ARM platform headers etc.  As a
result they were messing up allyesconfig on x86.

Make them depend on ARM.

Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpio/Kconfig | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 592397629dd..4a7f6314345 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -89,24 +89,28 @@ config GPIO_IT8761E
 config GPIO_EXYNOS4
 	bool "Samsung Exynos4 GPIO library support"
 	default y if CPU_EXYNOS4210
+	depends on ARM
 	help
 	  Say yes here to support Samsung Exynos4 series SoCs GPIO library
 
 config GPIO_PLAT_SAMSUNG
 	bool "Samsung SoCs GPIO library support"
 	default y if SAMSUNG_GPIOLIB_4BIT
+	depends on ARM
 	help
 	  Say yes here to support Samsung SoCs GPIO library
 
 config GPIO_S5PC100
 	bool "Samsung S5PC100 GPIO library support"
 	default y if CPU_S5PC100
+	depends on ARM
 	help
 	  Say yes here to support Samsung S5PC100 SoCs GPIO library
 
 config GPIO_S5PV210
 	bool "Samsung S5PV210/S5PC110 GPIO library support"
 	default y if CPU_S5PV210
+	depends on ARM
 	help
 	  Say yes here to support Samsung S5PV210/S5PC110 SoCs GPIO library
 
-- 
cgit v1.2.3-70-g09d2


From 55922c9d1b84b89cb946c777fddccb3247e7df2c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 29 May 2011 17:43:36 -0700
Subject: Linux 3.0-rc1

.. except there are various scripts that really know that there are
three numbers, so it calls itself "3.0.0-rc1".

Hopefully by the time the final 3.0 is out, we'll have that extra zero
all figured out.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Makefile | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 529d93fa243..afb8e0d26f2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
-VERSION = 2
-PATCHLEVEL = 6
-SUBLEVEL = 39
-EXTRAVERSION =
-NAME = Flesh-Eating Bats with Fangs
+VERSION = 3
+PATCHLEVEL = 0
+SUBLEVEL = 0
+EXTRAVERSION = -rc1
+NAME = Sneaky Weasel
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
-- 
cgit v1.2.3-70-g09d2


From 15517f7c213442e4d8a098cf0732b237f764c576 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 May 2011 11:14:08 -0600
Subject: lguest: fix timer interrupt setup

Without an IRQ chip set, we now get a WARN_ON and no timer interrupt.  This
prevents booting.

Fortunately, the fix is a one-liner: set up the timer IRQ like everything
else.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: stable@kernel.org # .39.x
---
 arch/x86/lguest/boot.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index e191c096ab9..db832fd65ec 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -993,6 +993,7 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
 static void lguest_time_init(void)
 {
 	/* Set up the timer interrupt (0) to go to our simple timer routine */
+	lguest_setup_irq(0);
 	irq_set_handler(0, lguest_time_irq);
 
 	clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
-- 
cgit v1.2.3-70-g09d2


From bc805a03c26e1e25171bc627c6264553d27f746c Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 May 2011 11:14:11 -0600
Subject: lguest: fix up compilation after move

ed16648eb5b86917f0b90bdcdbc857202da72f90 "Move kvm, uml, and lguest
subdirectories" broke the lguest example launcher.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/virtual/lguest/Makefile | 2 +-
 Documentation/virtual/lguest/lguest.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/lguest/Makefile b/Documentation/virtual/lguest/Makefile
index bebac6b4f33..0ac34206f7a 100644
--- a/Documentation/virtual/lguest/Makefile
+++ b/Documentation/virtual/lguest/Makefile
@@ -1,5 +1,5 @@
 # This creates the demonstration utility "lguest" which runs a Linux guest.
-# Missing headers?  Add "-I../../include -I../../arch/x86/include"
+# Missing headers?  Add "-I../../../include -I../../../arch/x86/include"
 CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE
 
 all: lguest
diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c
index d9da7e14853..711ba9e9a00 100644
--- a/Documentation/virtual/lguest/lguest.c
+++ b/Documentation/virtual/lguest/lguest.c
@@ -49,7 +49,7 @@
 #include <linux/virtio_rng.h>
 #include <linux/virtio_ring.h>
 #include <asm/bootparam.h>
-#include "../../include/linux/lguest_launcher.h"
+#include "../../../include/linux/lguest_launcher.h"
 /*L:110
  * We can ignore the 42 include files we need for this program, but I do want
  * to draw attention to the use of kernel-style types.
-- 
cgit v1.2.3-70-g09d2


From 990c91f0af46c57f0291060d928c7ab82f9d5667 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 May 2011 11:14:12 -0600
Subject: lguest: remove support for VIRTIO_F_NOTIFY_ON_EMPTY.

No virtio device does this any more, so no need to clutter lguest with it.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/virtual/lguest/lguest.c | 20 +-------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c
index 711ba9e9a00..cd9d6af61d0 100644
--- a/Documentation/virtual/lguest/lguest.c
+++ b/Documentation/virtual/lguest/lguest.c
@@ -135,9 +135,6 @@ struct device {
 	/* Is it operational */
 	bool running;
 
-	/* Does Guest want an intrrupt on empty? */
-	bool irq_on_empty;
-
 	/* Device-specific data. */
 	void *priv;
 };
@@ -637,10 +634,7 @@ static void trigger_irq(struct virtqueue *vq)
 
 	/* If they don't want an interrupt, don't send one... */
 	if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
-		/* ... unless they've asked us to force one on empty. */
-		if (!vq->dev->irq_on_empty
-		    || lg_last_avail(vq) != vq->vring.avail->idx)
-			return;
+		return;
 	}
 
 	/* Send the Guest an interrupt tell them we used something up. */
@@ -1057,15 +1051,6 @@ static void create_thread(struct virtqueue *vq)
 	close(vq->eventfd);
 }
 
-static bool accepted_feature(struct device *dev, unsigned int bit)
-{
-	const u8 *features = get_feature_bits(dev) + dev->feature_len;
-
-	if (dev->feature_len < bit / CHAR_BIT)
-		return false;
-	return features[bit / CHAR_BIT] & (1 << (bit % CHAR_BIT));
-}
-
 static void start_device(struct device *dev)
 {
 	unsigned int i;
@@ -1079,8 +1064,6 @@ static void start_device(struct device *dev)
 		verbose(" %02x", get_feature_bits(dev)
 			[dev->feature_len+i]);
 
-	dev->irq_on_empty = accepted_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY);
-
 	for (vq = dev->vq; vq; vq = vq->next) {
 		if (vq->service)
 			create_thread(vq);
@@ -1564,7 +1547,6 @@ static void setup_tun_net(char *arg)
 	/* Set up the tun device. */
 	configure_device(ipfd, tapif, ip);
 
-	add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY);
 	/* Expect Guest to handle everything except UFO */
 	add_feature(dev, VIRTIO_NET_F_CSUM);
 	add_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
-- 
cgit v1.2.3-70-g09d2


From 7a7c924cf03da2a76ea4dc0aac1a788cf95a9c29 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Feb 2011 21:43:48 +0100
Subject: virtio_blk: allow re-reading config space at runtime

Wire up the virtio_driver config_changed method to get notified about
config changes raised by the host.  For now we just re-read the device
size to support online resizing of devices, but once we add more
attributes that might be changeable they could be added as well.

Note that the config_changed method is called from irq context, so
we'll have to use the workqueue infrastructure to provide us a proper
user context for our changes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/block/virtio_blk.c | 88 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 78 insertions(+), 10 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 6ecf89cdf00..33a48a80c7e 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -6,10 +6,12 @@
 #include <linux/virtio.h>
 #include <linux/virtio_blk.h>
 #include <linux/scatterlist.h>
+#include <linux/string_helpers.h>
 
 #define PART_BITS 4
 
 static int major, index;
+struct workqueue_struct *virtblk_wq;
 
 struct virtio_blk
 {
@@ -26,6 +28,9 @@ struct virtio_blk
 
 	mempool_t *pool;
 
+	/* Process context for config space updates */
+	struct work_struct config_work;
+
 	/* What host tells us, plus 2 for header & tailer. */
 	unsigned int sg_elems;
 
@@ -291,6 +296,46 @@ static ssize_t virtblk_serial_show(struct device *dev,
 }
 DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL);
 
+static void virtblk_config_changed_work(struct work_struct *work)
+{
+	struct virtio_blk *vblk =
+		container_of(work, struct virtio_blk, config_work);
+	struct virtio_device *vdev = vblk->vdev;
+	struct request_queue *q = vblk->disk->queue;
+	char cap_str_2[10], cap_str_10[10];
+	u64 capacity, size;
+
+	/* Host must always specify the capacity. */
+	vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity),
+			  &capacity, sizeof(capacity));
+
+	/* If capacity is too big, truncate with warning. */
+	if ((sector_t)capacity != capacity) {
+		dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
+			 (unsigned long long)capacity);
+		capacity = (sector_t)-1;
+	}
+
+	size = capacity * queue_logical_block_size(q);
+	string_get_size(size, STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
+	string_get_size(size, STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));
+
+	dev_notice(&vdev->dev,
+		  "new size: %llu %d-byte logical blocks (%s/%s)\n",
+		  (unsigned long long)capacity,
+		  queue_logical_block_size(q),
+		  cap_str_10, cap_str_2);
+
+	set_capacity(vblk->disk, capacity);
+}
+
+static void virtblk_config_changed(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk = vdev->priv;
+
+	queue_work(virtblk_wq, &vblk->config_work);
+}
+
 static int __devinit virtblk_probe(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk;
@@ -327,6 +372,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	vblk->vdev = vdev;
 	vblk->sg_elems = sg_elems;
 	sg_init_table(vblk->sg, vblk->sg_elems);
+	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
 
 	/* We expect one virtqueue, for output. */
 	vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests");
@@ -477,6 +523,8 @@ static void __devexit virtblk_remove(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk = vdev->priv;
 
+	flush_work(&vblk->config_work);
+
 	/* Nothing should be pending. */
 	BUG_ON(!list_empty(&vblk->reqs));
 
@@ -508,27 +556,47 @@ static unsigned int features[] = {
  * Use __refdata to avoid this warning.
  */
 static struct virtio_driver __refdata virtio_blk = {
-	.feature_table = features,
-	.feature_table_size = ARRAY_SIZE(features),
-	.driver.name =	KBUILD_MODNAME,
-	.driver.owner =	THIS_MODULE,
-	.id_table =	id_table,
-	.probe =	virtblk_probe,
-	.remove =	__devexit_p(virtblk_remove),
+	.feature_table		= features,
+	.feature_table_size	= ARRAY_SIZE(features),
+	.driver.name		= KBUILD_MODNAME,
+	.driver.owner		= THIS_MODULE,
+	.id_table		= id_table,
+	.probe			= virtblk_probe,
+	.remove			= __devexit_p(virtblk_remove),
+	.config_changed		= virtblk_config_changed,
 };
 
 static int __init init(void)
 {
+	int error;
+
+	virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
+	if (!virtblk_wq)
+		return -ENOMEM;
+
 	major = register_blkdev(0, "virtblk");
-	if (major < 0)
-		return major;
-	return register_virtio_driver(&virtio_blk);
+	if (major < 0) {
+		error = major;
+		goto out_destroy_workqueue;
+	}
+
+	error = register_virtio_driver(&virtio_blk);
+	if (error)
+		goto out_unregister_blkdev;
+	return 0;
+
+out_unregister_blkdev:
+	unregister_blkdev(major, "virtblk");
+out_destroy_workqueue:
+	destroy_workqueue(virtblk_wq);
+	return error;
 }
 
 static void __exit fini(void)
 {
 	unregister_blkdev(major, "virtblk");
 	unregister_virtio_driver(&virtio_blk);
+	destroy_workqueue(virtblk_wq);
 }
 module_init(init);
 module_exit(fini);
-- 
cgit v1.2.3-70-g09d2


From 6917f83ffe5e6b6414ccc845263b792ed201c0f1 Mon Sep 17 00:00:00 2001
From: Liu Yuan <tailai.ly@taobao.com>
Date: Sun, 24 Apr 2011 02:49:26 +0800
Subject: drivers, block: virtio_blk: Replace cryptic number with the macro

It is easier to figure out the context by reading SCSI_SENSE_BUFFERSIZE
instead of plain '96'.

Signed-off-by: Liu Yuan <tailai.ly@taobao.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/block/virtio_blk.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 33a48a80c7e..079c08808d8 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -7,6 +7,7 @@
 #include <linux/virtio_blk.h>
 #include <linux/scatterlist.h>
 #include <linux/string_helpers.h>
+#include <scsi/scsi_cmnd.h>
 
 #define PART_BITS 4
 
@@ -146,7 +147,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 	num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
 
 	if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
-		sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, 96);
+		sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
 		sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
 			   sizeof(vbr->in_hdr));
 	}
-- 
cgit v1.2.3-70-g09d2


From 177dbd95637a52b9118aca757d5856ec3995d3e7 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 May 2011 11:14:13 -0600
Subject: virtio console: don't manually set or finalize
 VIRTIO_CONSOLE_F_MULTIPORT.

That's already been done by the virtio infrastructure before the probe
function is called.

Reported-by: alexey.kardashevskiy@au1.ibm.com
Acked-by: Amit Shah <amit.shah@redhat.com>
Tested-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/char/virtio_console.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 838568a7dbf..fb68b129537 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -1677,17 +1677,12 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
 	portdev->config.max_nr_ports = 1;
 	if (virtio_has_feature(vdev, VIRTIO_CONSOLE_F_MULTIPORT)) {
 		multiport = true;
-		vdev->features[0] |= 1 << VIRTIO_CONSOLE_F_MULTIPORT;
-
 		vdev->config->get(vdev, offsetof(struct virtio_console_config,
 						 max_nr_ports),
 				  &portdev->config.max_nr_ports,
 				  sizeof(portdev->config.max_nr_ports));
 	}
 
-	/* Let the Host know we support multiple ports.*/
-	vdev->config->finalize_features(vdev);
-
 	err = init_vqs(portdev);
 	if (err < 0) {
 		dev_err(&vdev->dev, "Error %d initializing vqs\n", err);
-- 
cgit v1.2.3-70-g09d2


From bf50e69f63d21091e525185c3ae761412be0ba72 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Thu, 7 Apr 2011 10:43:25 -0700
Subject: virtio balloon: kill tell-host-first logic

The virtio balloon driver has a VIRTIO_BALLOON_F_MUST_TELL_HOST
feature bit.  Whenever the bit is set, the guest kernel must
always tell the host before we free pages back to the allocator.
Without this feature, we might free a page (and have another
user touch it) while the hypervisor is unprepared for it.

But, if the bit is _not_ set, we are under no obligation to
reverse the order; we're under no obligation to do _anything_.
As of now, qemu-kvm defines the bit, but doesn't set it.

This patch makes the "tell host first" logic the only case.  This
should make everybody happy, and reduce the amount of untested or
untestable code in the kernel.

This _also_ means that we don't have to preserve a pfn list
after the pages are freed, which should let us get rid of some
temporary storage (vb->pfns) eventually.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/virtio_balloon.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 0f1da45ba47..e058ace2a4a 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -40,9 +40,6 @@ struct virtio_balloon
 	/* Waiting for host to ack the pages we released. */
 	struct completion acked;
 
-	/* Do we have to tell Host *before* we reuse pages? */
-	bool tell_host_first;
-
 	/* The pages we've told the Host we're not using. */
 	unsigned int num_pages;
 	struct list_head pages;
@@ -151,13 +148,14 @@ static void leak_balloon(struct virtio_balloon *vb, size_t num)
 		vb->num_pages--;
 	}
 
-	if (vb->tell_host_first) {
-		tell_host(vb, vb->deflate_vq);
-		release_pages_by_pfn(vb->pfns, vb->num_pfns);
-	} else {
-		release_pages_by_pfn(vb->pfns, vb->num_pfns);
-		tell_host(vb, vb->deflate_vq);
-	}
+
+	/*
+	 * Note that if
+	 * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST);
+	 * is true, we *have* to do it in this order
+	 */
+	tell_host(vb, vb->deflate_vq);
+	release_pages_by_pfn(vb->pfns, vb->num_pfns);
 }
 
 static inline void update_stat(struct virtio_balloon *vb, int idx,
@@ -325,9 +323,6 @@ static int virtballoon_probe(struct virtio_device *vdev)
 		goto out_del_vqs;
 	}
 
-	vb->tell_host_first
-		= virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST);
-
 	return 0;
 
 out_del_vqs:
-- 
cgit v1.2.3-70-g09d2


From a1b383870a28cfbd1657d4922c0fafc634a62ebd Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 May 2011 11:14:13 -0600
Subject: virtio: add full three-clause BSD text to headers.

It's unclear to me if it's important, but it's obviously causing my
technical colleages some headaches and I'd hate such imprecision to
slow virtio adoption.

I've emailed this to all non-trivial contributors for approval, too.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Ryan Harper <ryanh@us.ibm.com>
Acked-by: Anthony Liguori <aliguori@us.ibm.com>
Acked-by: Eric Van Hensbergen <ericvh@gmail.com>
Acked-by: john cooper <john.cooper@redhat.com>
Acked-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
---
 include/linux/virtio_9p.h      | 25 ++++++++++++++++++++++++-
 include/linux/virtio_balloon.h | 25 ++++++++++++++++++++++++-
 include/linux/virtio_blk.h     | 25 ++++++++++++++++++++++++-
 include/linux/virtio_config.h  | 25 ++++++++++++++++++++++++-
 include/linux/virtio_console.h | 26 +++++++++++++++++++++++++-
 include/linux/virtio_ids.h     | 24 +++++++++++++++++++++++-
 include/linux/virtio_net.h     | 25 ++++++++++++++++++++++++-
 include/linux/virtio_pci.h     | 23 +++++++++++++++++++++++
 include/linux/virtio_ring.h    | 23 +++++++++++++++++++++++
 9 files changed, 214 insertions(+), 7 deletions(-)

diff --git a/include/linux/virtio_9p.h b/include/linux/virtio_9p.h
index e68b439b286..277c4ad44e8 100644
--- a/include/linux/virtio_9p.h
+++ b/include/linux/virtio_9p.h
@@ -1,7 +1,30 @@
 #ifndef _LINUX_VIRTIO_9P_H
 #define _LINUX_VIRTIO_9P_H
 /* This header is BSD licensed so anyone can use the definitions to implement
- * compatible drivers/servers. */
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE. */
 #include <linux/types.h>
 #include <linux/virtio_ids.h>
 #include <linux/virtio_config.h>
diff --git a/include/linux/virtio_balloon.h b/include/linux/virtio_balloon.h
index a50ecd1b81a..652dc8bea92 100644
--- a/include/linux/virtio_balloon.h
+++ b/include/linux/virtio_balloon.h
@@ -1,7 +1,30 @@
 #ifndef _LINUX_VIRTIO_BALLOON_H
 #define _LINUX_VIRTIO_BALLOON_H
 /* This header is BSD licensed so anyone can use the definitions to implement
- * compatible drivers/servers. */
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE. */
 #include <linux/virtio_ids.h>
 #include <linux/virtio_config.h>
 
diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h
index 167720d695e..e0edb40ca7a 100644
--- a/include/linux/virtio_blk.h
+++ b/include/linux/virtio_blk.h
@@ -1,7 +1,30 @@
 #ifndef _LINUX_VIRTIO_BLK_H
 #define _LINUX_VIRTIO_BLK_H
 /* This header is BSD licensed so anyone can use the definitions to implement
- * compatible drivers/servers. */
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE. */
 #include <linux/types.h>
 #include <linux/virtio_ids.h>
 #include <linux/virtio_config.h>
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 800617b4ddd..39c88c5ad19 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -1,7 +1,30 @@
 #ifndef _LINUX_VIRTIO_CONFIG_H
 #define _LINUX_VIRTIO_CONFIG_H
 /* This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so
- * anyone can use the definitions to implement compatible drivers/servers. */
+ * anyone can use the definitions to implement compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE. */
 
 /* Virtio devices use a standardized configuration space to define their
  * features and pass configuration information, but each implementation can
diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h
index e4d333543a3..bdf4b003473 100644
--- a/include/linux/virtio_console.h
+++ b/include/linux/virtio_console.h
@@ -5,7 +5,31 @@
 #include <linux/virtio_config.h>
 /*
  * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so
- * anyone can use the definitions to implement compatible drivers/servers.
+ * anyone can use the definitions to implement compatible drivers/servers:
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
  *
  * Copyright (C) Red Hat, Inc., 2009, 2010, 2011
  * Copyright (C) Amit Shah <amit.shah@redhat.com>, 2009, 2010, 2011
diff --git a/include/linux/virtio_ids.h b/include/linux/virtio_ids.h
index 06660c0a78d..85bb0bb66ff 100644
--- a/include/linux/virtio_ids.h
+++ b/include/linux/virtio_ids.h
@@ -5,7 +5,29 @@
  *
  * This header is BSD licensed so anyone can use the definitions to implement
  * compatible drivers/servers.
- */
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE. */
 
 #define VIRTIO_ID_NET		1 /* virtio net */
 #define VIRTIO_ID_BLOCK		2 /* virtio block */
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 085e42298ce..136040bba3e 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -1,7 +1,30 @@
 #ifndef _LINUX_VIRTIO_NET_H
 #define _LINUX_VIRTIO_NET_H
 /* This header is BSD licensed so anyone can use the definitions to implement
- * compatible drivers/servers. */
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE. */
 #include <linux/types.h>
 #include <linux/virtio_ids.h>
 #include <linux/virtio_config.h>
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
index 9a3d7c48c62..ea66f3f60d6 100644
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -11,6 +11,29 @@
  *
  * This header is BSD licensed so anyone can use the definitions to implement
  * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
  */
 
 #ifndef _LINUX_VIRTIO_PCI_H
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index e4d144b132b..a813e5d460e 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -7,6 +7,29 @@
  * This header is BSD licensed so anyone can use the definitions to implement
  * compatible drivers/servers.
  *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
  * Copyright Rusty Russell IBM Corporation 2007. */
 #include <linux/types.h>
 
-- 
cgit v1.2.3-70-g09d2


From 770b31a85e000b0194974922f238a30ade4246b6 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 20 May 2011 02:10:17 +0300
Subject: virtio: event index interface

Define a new feature bit for the guest and host to utilize
an event index (like Xen) instead if a flag bit to enable/disable
interrupts and kicks.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/virtio_ring.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index a813e5d460e..c4eef73deb3 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -52,6 +52,12 @@
 /* We support indirect buffer descriptors */
 #define VIRTIO_RING_F_INDIRECT_DESC	28
 
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field. */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field. */
+#define VIRTIO_RING_F_EVENT_IDX		29
+
 /* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
 struct vring_desc {
 	/* Address (guest-physical). */
@@ -106,6 +112,7 @@ struct vring {
  *	__u16 avail_flags;
  *	__u16 avail_idx;
  *	__u16 available[num];
+ *	__u16 used_event_idx;
  *
  *	// Padding to the next align boundary.
  *	char pad[];
@@ -114,8 +121,14 @@ struct vring {
  *	__u16 used_flags;
  *	__u16 used_idx;
  *	struct vring_used_elem used[num];
+ *	__u16 avail_event_idx;
  * };
  */
+/* We publish the used event index at the end of the available ring, and vice
+ * versa. They are at the end for backwards compatibility. */
+#define vring_used_event(vr) ((vr)->avail->ring[(vr)->num])
+#define vring_avail_event(vr) (*(__u16 *)&(vr)->used->ring[(vr)->num])
+
 static inline void vring_init(struct vring *vr, unsigned int num, void *p,
 			      unsigned long align)
 {
@@ -130,7 +143,7 @@ static inline unsigned vring_size(unsigned int num, unsigned long align)
 {
 	return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num)
 		 + align - 1) & ~(align - 1))
-		+ sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num;
+		+ sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num;
 }
 
 #ifdef __KERNEL__
-- 
cgit v1.2.3-70-g09d2


From bf7035bf20563a6cadcb9e870406e7b21daf5e30 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 20 May 2011 02:10:27 +0300
Subject: virtio ring: inline function to check for events

With the new used_event and avail_event and features, both
host and guest need similar logic to check whether events are
enabled, so it helps to put the common code in the header.

Note that Xen has similar logic for notification hold-off
in include/xen/interface/io/ring.h with req_event and req_prod
corresponding to event_idx + 1 and new_idx respectively.
+1 comes from the fact that req_event and req_prod in Xen start at 1,
while event index in virtio starts at 0.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/virtio_ring.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index c4eef73deb3..4a32cb6da42 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -146,6 +146,20 @@ static inline unsigned vring_size(unsigned int num, unsigned long align)
 		+ sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num;
 }
 
+/* The following is used with USED_EVENT_IDX and AVAIL_EVENT_IDX */
+/* Assuming a given event_idx value from the other size, if
+ * we have just incremented index from old to new_idx,
+ * should we trigger an event? */
+static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
+{
+	/* Note: Xen has similar logic for notification hold-off
+	 * in include/xen/interface/io/ring.h with req_event and req_prod
+	 * corresponding to event_idx + 1 and new_idx respectively.
+	 * Note also that req_event and req_prod in Xen start at 1,
+	 * event indexes in virtio start at 0. */
+	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
+}
+
 #ifdef __KERNEL__
 #include <linux/irqreturn.h>
 struct virtio_device;
-- 
cgit v1.2.3-70-g09d2


From a5c262c5fd83ece01bd649fb08416c501d4c59d7 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 20 May 2011 02:10:44 +0300
Subject: virtio_ring: support event idx feature

Support for the new event idx feature:
1. When enabling interrupts, publish the current avail index
   value to the host to get interrupts on the next update.
2. Use the new avail_event feature to reduce the number
   of exits from the guest.

Simple test with the simulator:

[virtio]# time ./virtio_test
spurious wakeus: 0x7

real    0m0.169s
user    0m0.140s
sys     0m0.019s
[virtio]# time ./virtio_test --no-event-idx
spurious wakeus: 0x11

real    0m0.649s
user    0m0.295s
sys     0m0.335s

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/virtio_ring.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index b0043fb26a4..a5fadc40c44 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -82,6 +82,9 @@ struct vring_virtqueue
 	/* Host supports indirect buffers */
 	bool indirect;
 
+	/* Host publishes avail event idx */
+	bool event;
+
 	/* Number of free buffers */
 	unsigned int num_free;
 	/* Head of free buffer list. */
@@ -237,18 +240,22 @@ EXPORT_SYMBOL_GPL(virtqueue_add_buf_gfp);
 void virtqueue_kick(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 new, old;
 	START_USE(vq);
 	/* Descriptors and available array need to be set before we expose the
 	 * new available array entries. */
 	virtio_wmb();
 
-	vq->vring.avail->idx += vq->num_added;
+	old = vq->vring.avail->idx;
+	new = vq->vring.avail->idx = old + vq->num_added;
 	vq->num_added = 0;
 
 	/* Need to update avail index before checking if we should notify */
 	virtio_mb();
 
-	if (!(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY))
+	if (vq->event ?
+	    vring_need_event(vring_avail_event(&vq->vring), new, old) :
+	    !(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY))
 		/* Prod other side to tell it about changes. */
 		vq->notify(&vq->vq);
 
@@ -324,6 +331,14 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
 	ret = vq->data[i];
 	detach_buf(vq, i);
 	vq->last_used_idx++;
+	/* If we expect an interrupt for the next entry, tell host
+	 * by writing event index and flush out the write before
+	 * the read in the next get_buf call. */
+	if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
+		vring_used_event(&vq->vring) = vq->last_used_idx;
+		virtio_mb();
+	}
+
 	END_USE(vq);
 	return ret;
 }
@@ -345,7 +360,11 @@ bool virtqueue_enable_cb(struct virtqueue *_vq)
 
 	/* We optimistically turn back on interrupts, then check if there was
 	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always do both to keep code simple. */
 	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
+	vring_used_event(&vq->vring) = vq->last_used_idx;
 	virtio_mb();
 	if (unlikely(more_used(vq))) {
 		END_USE(vq);
@@ -438,6 +457,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
 #endif
 
 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
+	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
 
 	/* No callback?  Tell other side not to bother us. */
 	if (!callback)
@@ -472,6 +492,8 @@ void vring_transport_features(struct virtio_device *vdev)
 		switch (i) {
 		case VIRTIO_RING_F_INDIRECT_DESC:
 			break;
+		case VIRTIO_RING_F_EVENT_IDX:
+			break;
 		default:
 			/* We don't understand this bit. */
 			clear_bit(i, vdev->features);
-- 
cgit v1.2.3-70-g09d2


From 8ea8cf89e19aeb596b818ee5f2bec8a8b0586b60 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 20 May 2011 02:10:54 +0300
Subject: vhost: support event index

Support the new event index feature. When acked,
utilize it to reduce the # of interrupts sent to the guest.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/vhost/net.c   |  12 ++---
 drivers/vhost/test.c  |   6 +--
 drivers/vhost/vhost.c | 138 +++++++++++++++++++++++++++++++++++++-------------
 drivers/vhost/vhost.h |  21 +++++---
 4 files changed, 127 insertions(+), 50 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2f7c76a85e5..e224a92baa1 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -144,7 +144,7 @@ static void handle_tx(struct vhost_net *net)
 	}
 
 	mutex_lock(&vq->mutex);
-	vhost_disable_notify(vq);
+	vhost_disable_notify(&net->dev, vq);
 
 	if (wmem < sock->sk->sk_sndbuf / 2)
 		tx_poll_stop(net);
@@ -166,8 +166,8 @@ static void handle_tx(struct vhost_net *net)
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
 			}
-			if (unlikely(vhost_enable_notify(vq))) {
-				vhost_disable_notify(vq);
+			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+				vhost_disable_notify(&net->dev, vq);
 				continue;
 			}
 			break;
@@ -315,7 +315,7 @@ static void handle_rx(struct vhost_net *net)
 		return;
 
 	mutex_lock(&vq->mutex);
-	vhost_disable_notify(vq);
+	vhost_disable_notify(&net->dev, vq);
 	vhost_hlen = vq->vhost_hlen;
 	sock_hlen = vq->sock_hlen;
 
@@ -334,10 +334,10 @@ static void handle_rx(struct vhost_net *net)
 			break;
 		/* OK, now we need to know about added descriptors. */
 		if (!headcount) {
-			if (unlikely(vhost_enable_notify(vq))) {
+			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 				/* They have slipped one in as we were
 				 * doing that: check again. */
-				vhost_disable_notify(vq);
+				vhost_disable_notify(&net->dev, vq);
 				continue;
 			}
 			/* Nothing new?  Wait for eventfd to tell us
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index 099f30230d0..734e1d74ad8 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -49,7 +49,7 @@ static void handle_vq(struct vhost_test *n)
 		return;
 
 	mutex_lock(&vq->mutex);
-	vhost_disable_notify(vq);
+	vhost_disable_notify(&n->dev, vq);
 
 	for (;;) {
 		head = vhost_get_vq_desc(&n->dev, vq, vq->iov,
@@ -61,8 +61,8 @@ static void handle_vq(struct vhost_test *n)
 			break;
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
 		if (head == vq->num) {
-			if (unlikely(vhost_enable_notify(vq))) {
-				vhost_disable_notify(vq);
+			if (unlikely(vhost_enable_notify(&n->dev, vq))) {
+				vhost_disable_notify(&n->dev, vq);
 				continue;
 			}
 			break;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 7aa4eea930f..ea966b35635 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -37,6 +37,9 @@ enum {
 	VHOST_MEMORY_F_LOG = 0x1,
 };
 
+#define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
+#define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
+
 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 			    poll_table *pt)
 {
@@ -161,6 +164,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->last_avail_idx = 0;
 	vq->avail_idx = 0;
 	vq->last_used_idx = 0;
+	vq->signalled_used = 0;
+	vq->signalled_used_valid = false;
 	vq->used_flags = 0;
 	vq->log_used = false;
 	vq->log_addr = -1ull;
@@ -489,16 +494,17 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
 	return 1;
 }
 
-static int vq_access_ok(unsigned int num,
+static int vq_access_ok(struct vhost_dev *d, unsigned int num,
 			struct vring_desc __user *desc,
 			struct vring_avail __user *avail,
 			struct vring_used __user *used)
 {
+	size_t s = vhost_has_feature(d, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 	return access_ok(VERIFY_READ, desc, num * sizeof *desc) &&
 	       access_ok(VERIFY_READ, avail,
-			 sizeof *avail + num * sizeof *avail->ring) &&
+			 sizeof *avail + num * sizeof *avail->ring + s) &&
 	       access_ok(VERIFY_WRITE, used,
-			sizeof *used + num * sizeof *used->ring);
+			sizeof *used + num * sizeof *used->ring + s);
 }
 
 /* Can we log writes? */
@@ -514,9 +520,11 @@ int vhost_log_access_ok(struct vhost_dev *dev)
 
 /* Verify access for write logging. */
 /* Caller should have vq mutex and device mutex */
-static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base)
+static int vq_log_access_ok(struct vhost_dev *d, struct vhost_virtqueue *vq,
+			    void __user *log_base)
 {
 	struct vhost_memory *mp;
+	size_t s = vhost_has_feature(d, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 
 	mp = rcu_dereference_protected(vq->dev->memory,
 				       lockdep_is_held(&vq->mutex));
@@ -524,15 +532,15 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base)
 			    vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) &&
 		(!vq->log_used || log_access_ok(log_base, vq->log_addr,
 					sizeof *vq->used +
-					vq->num * sizeof *vq->used->ring));
+					vq->num * sizeof *vq->used->ring + s));
 }
 
 /* Can we start vq? */
 /* Caller should have vq mutex and device mutex */
 int vhost_vq_access_ok(struct vhost_virtqueue *vq)
 {
-	return vq_access_ok(vq->num, vq->desc, vq->avail, vq->used) &&
-		vq_log_access_ok(vq, vq->log_base);
+	return vq_access_ok(vq->dev, vq->num, vq->desc, vq->avail, vq->used) &&
+		vq_log_access_ok(vq->dev, vq, vq->log_base);
 }
 
 static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
@@ -577,6 +585,7 @@ static int init_used(struct vhost_virtqueue *vq,
 
 	if (r)
 		return r;
+	vq->signalled_used_valid = false;
 	return get_user(vq->last_used_idx, &used->idx);
 }
 
@@ -674,7 +683,7 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 		 * If it is not, we don't as size might not have been setup.
 		 * We will verify when backend is configured. */
 		if (vq->private_data) {
-			if (!vq_access_ok(vq->num,
+			if (!vq_access_ok(d, vq->num,
 				(void __user *)(unsigned long)a.desc_user_addr,
 				(void __user *)(unsigned long)a.avail_user_addr,
 				(void __user *)(unsigned long)a.used_user_addr)) {
@@ -818,7 +827,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
 			vq = d->vqs + i;
 			mutex_lock(&vq->mutex);
 			/* If ring is inactive, will check when it's enabled. */
-			if (vq->private_data && !vq_log_access_ok(vq, base))
+			if (vq->private_data && !vq_log_access_ok(d, vq, base))
 				r = -EFAULT;
 			else
 				vq->log_base = base;
@@ -1219,6 +1228,10 @@ int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 
 	/* On success, increment avail index. */
 	vq->last_avail_idx++;
+
+	/* Assume notifications from guest are disabled at this point,
+	 * if they aren't we would need to update avail_event index. */
+	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
 	return head;
 }
 
@@ -1267,6 +1280,12 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
 			eventfd_signal(vq->log_ctx, 1);
 	}
 	vq->last_used_idx++;
+	/* If the driver never bothers to signal in a very long while,
+	 * used index might wrap around. If that happens, invalidate
+	 * signalled_used index we stored. TODO: make sure driver
+	 * signals at least once in 2^16 and remove this. */
+	if (unlikely(vq->last_used_idx == vq->signalled_used))
+		vq->signalled_used_valid = false;
 	return 0;
 }
 
@@ -1275,6 +1294,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 			    unsigned count)
 {
 	struct vring_used_elem __user *used;
+	u16 old, new;
 	int start;
 
 	start = vq->last_used_idx % vq->num;
@@ -1292,7 +1312,14 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 			   ((void __user *)used - (void __user *)vq->used),
 			  count * sizeof *used);
 	}
-	vq->last_used_idx += count;
+	old = vq->last_used_idx;
+	new = (vq->last_used_idx += count);
+	/* If the driver never bothers to signal in a very long while,
+	 * used index might wrap around. If that happens, invalidate
+	 * signalled_used index we stored. TODO: make sure driver
+	 * signals at least once in 2^16 and remove this. */
+	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
+		vq->signalled_used_valid = false;
 	return 0;
 }
 
@@ -1331,29 +1358,47 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
 	return r;
 }
 
-/* This actually signals the guest, using eventfd. */
-void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
-	__u16 flags;
-
+	__u16 old, new, event;
+	bool v;
 	/* Flush out used index updates. This is paired
 	 * with the barrier that the Guest executes when enabling
 	 * interrupts. */
 	smp_mb();
 
-	if (__get_user(flags, &vq->avail->flags)) {
-		vq_err(vq, "Failed to get flags");
-		return;
+	if (vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
+	    unlikely(vq->avail_idx == vq->last_avail_idx))
+		return true;
+
+	if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
+		__u16 flags;
+		if (__get_user(flags, &vq->avail->flags)) {
+			vq_err(vq, "Failed to get flags");
+			return true;
+		}
+		return !(flags & VRING_AVAIL_F_NO_INTERRUPT);
 	}
+	old = vq->signalled_used;
+	v = vq->signalled_used_valid;
+	new = vq->signalled_used = vq->last_used_idx;
+	vq->signalled_used_valid = true;
 
-	/* If they don't want an interrupt, don't signal, unless empty. */
-	if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
-	    (vq->avail_idx != vq->last_avail_idx ||
-	     !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY)))
-		return;
+	if (unlikely(!v))
+		return true;
 
+	if (get_user(event, vhost_used_event(vq))) {
+		vq_err(vq, "Failed to get used event idx");
+		return true;
+	}
+	return vring_need_event(event, new, old);
+}
+
+/* This actually signals the guest, using eventfd. */
+void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
 	/* Signal the Guest tell them we used something up. */
-	if (vq->call_ctx)
+	if (vq->call_ctx && vhost_notify(dev, vq))
 		eventfd_signal(vq->call_ctx, 1);
 }
 
@@ -1376,7 +1421,7 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev,
 }
 
 /* OK, now we need to know about added descriptors. */
-bool vhost_enable_notify(struct vhost_virtqueue *vq)
+bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
 	u16 avail_idx;
 	int r;
@@ -1384,11 +1429,34 @@ bool vhost_enable_notify(struct vhost_virtqueue *vq)
 	if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
 		return false;
 	vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
-	r = put_user(vq->used_flags, &vq->used->flags);
-	if (r) {
-		vq_err(vq, "Failed to enable notification at %p: %d\n",
-		       &vq->used->flags, r);
-		return false;
+	if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
+		r = put_user(vq->used_flags, &vq->used->flags);
+		if (r) {
+			vq_err(vq, "Failed to enable notification at %p: %d\n",
+			       &vq->used->flags, r);
+			return false;
+		}
+	} else {
+		r = put_user(vq->avail_idx, vhost_avail_event(vq));
+		if (r) {
+			vq_err(vq, "Failed to update avail event index at %p: %d\n",
+			       vhost_avail_event(vq), r);
+			return false;
+		}
+	}
+	if (unlikely(vq->log_used)) {
+		void __user *used;
+		/* Make sure data is seen before log. */
+		smp_wmb();
+		used = vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX) ?
+			&vq->used->flags : vhost_avail_event(vq);
+		/* Log used flags or event index entry write. Both are 16 bit
+		 * fields. */
+		log_write(vq->log_base, vq->log_addr +
+			   (used - (void __user *)vq->used),
+			  sizeof(u16));
+		if (vq->log_ctx)
+			eventfd_signal(vq->log_ctx, 1);
 	}
 	/* They could have slipped one in as we were doing that: make
 	 * sure it's written, then check again. */
@@ -1404,15 +1472,17 @@ bool vhost_enable_notify(struct vhost_virtqueue *vq)
 }
 
 /* We don't need to be notified again. */
-void vhost_disable_notify(struct vhost_virtqueue *vq)
+void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
 	int r;
 
 	if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
 		return;
 	vq->used_flags |= VRING_USED_F_NO_NOTIFY;
-	r = put_user(vq->used_flags, &vq->used->flags);
-	if (r)
-		vq_err(vq, "Failed to enable notification at %p: %d\n",
-		       &vq->used->flags, r);
+	if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
+		r = put_user(vq->used_flags, &vq->used->flags);
+		if (r)
+			vq_err(vq, "Failed to enable notification at %p: %d\n",
+			       &vq->used->flags, r);
+	}
 }
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b3363ae3851..8e03379dd30 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -84,6 +84,12 @@ struct vhost_virtqueue {
 	/* Used flags */
 	u16 used_flags;
 
+	/* Last used index value we have signalled on */
+	u16 signalled_used;
+
+	/* Last used index value we have signalled on */
+	bool signalled_used_valid;
+
 	/* Log writes to used structure. */
 	bool log_used;
 	u64 log_addr;
@@ -149,8 +155,8 @@ void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
 void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
 			       struct vring_used_elem *heads, unsigned count);
 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
-void vhost_disable_notify(struct vhost_virtqueue *);
-bool vhost_enable_notify(struct vhost_virtqueue *);
+void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
+bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
@@ -162,11 +168,12 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 	} while (0)
 
 enum {
-	VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
-			 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
-			 (1 << VHOST_F_LOG_ALL) |
-			 (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
-			 (1 << VIRTIO_NET_F_MRG_RXBUF),
+	VHOST_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
+			 (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+			 (1ULL << VIRTIO_RING_F_EVENT_IDX) |
+			 (1ULL << VHOST_F_LOG_ALL) |
+			 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
+			 (1ULL << VIRTIO_NET_F_MRG_RXBUF),
 };
 
 static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
-- 
cgit v1.2.3-70-g09d2


From 4423fe40b03f32b11e72ecfa03077e702e55d5a9 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 20 May 2011 02:11:05 +0300
Subject: virtio_test: support event index

Add ability to test the new event idx feature,
enable by default.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 tools/virtio/virtio_test.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c
index df0c6d2c386..74d3331bdaf 100644
--- a/tools/virtio/virtio_test.c
+++ b/tools/virtio/virtio_test.c
@@ -197,6 +197,14 @@ const struct option longopts[] = {
 		.name = "help",
 		.val = 'h',
 	},
+	{
+		.name = "event-idx",
+		.val = 'E',
+	},
+	{
+		.name = "no-event-idx",
+		.val = 'e',
+	},
 	{
 		.name = "indirect",
 		.val = 'I',
@@ -211,13 +219,17 @@ const struct option longopts[] = {
 
 static void help()
 {
-	fprintf(stderr, "Usage: virtio_test [--help] [--no-indirect]\n");
+	fprintf(stderr, "Usage: virtio_test [--help]"
+		" [--no-indirect]"
+		" [--no-event-idx]"
+		"\n");
 }
 
 int main(int argc, char **argv)
 {
 	struct vdev_info dev;
-	unsigned long long features = 1ULL << VIRTIO_RING_F_INDIRECT_DESC;
+	unsigned long long features = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+		(1ULL << VIRTIO_RING_F_EVENT_IDX);
 	int o;
 
 	for (;;) {
@@ -228,6 +240,9 @@ int main(int argc, char **argv)
 		case '?':
 			help();
 			exit(2);
+		case 'e':
+			features &= ~(1ULL << VIRTIO_RING_F_EVENT_IDX);
+			break;
 		case 'h':
 			help();
 			goto done;
-- 
cgit v1.2.3-70-g09d2


From 7ab358c23cbf15cea08129cd722d1ce77433a94d Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 20 May 2011 02:11:14 +0300
Subject: virtio: add api for delayed callbacks

Add an API that tells the other side that callbacks
should be delayed until a lot of work has been done.
Implement using the new event_idx feature.

Note: it might seem advantageous to let the drivers
ask for a callback after a specific capacity has
been reached. However, as a single head can
free many entries in the descriptor table,
we don't really have a clue about capacity
until get_buf is called. The API is the simplest
to implement at the moment, we'll see what kind of
hints drivers can pass when there's more than one
user of the feature.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/virtio_ring.c | 27 +++++++++++++++++++++++++++
 include/linux/virtio.h       |  9 +++++++++
 2 files changed, 36 insertions(+)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index a5fadc40c44..68b9136847a 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -376,6 +376,33 @@ bool virtqueue_enable_cb(struct virtqueue *_vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
 
+bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	u16 bufs;
+
+	START_USE(vq);
+
+	/* We optimistically turn back on interrupts, then check if there was
+	 * more to do. */
+	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
+	 * either clear the flags bit or point the event index at the next
+	 * entry. Always do both to keep code simple. */
+	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
+	/* TODO: tune this threshold */
+	bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4;
+	vring_used_event(&vq->vring) = vq->last_used_idx + bufs;
+	virtio_mb();
+	if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) {
+		END_USE(vq);
+		return false;
+	}
+
+	END_USE(vq);
+	return true;
+}
+EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
+
 void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index aff5b4f7404..71088574960 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -51,6 +51,13 @@ struct virtqueue {
  *	This re-enables callbacks; it returns "false" if there are pending
  *	buffers in the queue, to detect a possible race between the driver
  *	checking for more work, and enabling callbacks.
+ * virtqueue_enable_cb_delayed: restart callbacks after disable_cb.
+ *	vq: the struct virtqueue we're talking about.
+ *	This re-enables callbacks but hints to the other side to delay
+ *	interrupts until most of the available buffers have been processed;
+ *	it returns "false" if there are many pending buffers in the queue,
+ *	to detect a possible race between the driver checking for more work,
+ *	and enabling callbacks.
  * virtqueue_detach_unused_buf: detach first unused buffer
  * 	vq: the struct virtqueue we're talking about.
  * 	Returns NULL or the "data" token handed to add_buf
@@ -86,6 +93,8 @@ void virtqueue_disable_cb(struct virtqueue *vq);
 
 bool virtqueue_enable_cb(struct virtqueue *vq);
 
+bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
+
 void *virtqueue_detach_unused_buf(struct virtqueue *vq);
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 7a66f784375c5922315bbe879b789ee50b924d26 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 20 May 2011 02:11:23 +0300
Subject: virtio_net: delay TX callbacks

Ask for delayed callbacks on TX ring full, to give the
other side more of a chance to make progress.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 0cb0b063267..f6853247a62 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -609,7 +609,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * before it gets out of hand.  Naturally, this wastes entries. */
 	if (capacity < 2+MAX_SKB_FRAGS) {
 		netif_stop_queue(dev);
-		if (unlikely(!virtqueue_enable_cb(vi->svq))) {
+		if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
 			/* More just got used, free them then recheck. */
 			capacity += free_old_xmit_skbs(vi);
 			if (capacity >= 2+MAX_SKB_FRAGS) {
-- 
cgit v1.2.3-70-g09d2


From a1706ac4c0201ea0143dc0db0659001b26ceeabb Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Mon, 30 May 2011 07:42:51 +0200
Subject: Revert "block: Remove extra discard_alignment from hd_struct."

It was not a good idea to start dereferencing disk->queue from
the fs sysfs strategy for displaying discard alignment. We ran
into first a NULL pointer deref, and after fixing that we sometimes
see unvalid disk->queue pointer values.

Since discard is the only one of the bunch actually looking into
the queue, just revert the change.

This reverts commit 23ceb5b7719e9276d4fa72a3ecf94dd396755276.

Conflicts:
	fs/partitions/check.c
---
 fs/partitions/check.c | 10 +++-------
 include/linux/genhd.h |  1 +
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index f82e762eeca..d545e97d99c 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -255,13 +255,7 @@ ssize_t part_discard_alignment_show(struct device *dev,
 				   struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
-	struct gendisk *disk = dev_to_disk(dev);
-	unsigned int alignment = 0;
-
-	if (disk->queue)
-		alignment = queue_limit_discard_alignment(&disk->queue->limits,
-								p->start_sect);
-	return sprintf(buf, "%u\n", alignment);
+	return sprintf(buf, "%u\n", p->discard_alignment);
 }
 
 ssize_t part_stat_show(struct device *dev,
@@ -455,6 +449,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	p->start_sect = start;
 	p->alignment_offset =
 		queue_limit_alignment_offset(&disk->queue->limits, start);
+	p->discard_alignment =
+		queue_limit_discard_alignment(&disk->queue->limits, start);
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index b78956b3c2e..300d7582006 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -100,6 +100,7 @@ struct hd_struct {
 	sector_t start_sect;
 	sector_t nr_sects;
 	sector_t alignment_offset;
+	unsigned int discard_alignment;
 	struct device __dev;
 	struct kobject *holder_dir;
 	int policy, partno;
-- 
cgit v1.2.3-70-g09d2


From 3cebde2413ba42504cf2c10ec1d47582912435cd Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sun, 29 May 2011 21:20:59 -0700
Subject: vfs: shrink_dcache_parent before rmdir, dir rename

The dentry_unhash push-down series missed that shink_dcache_parent needs to
be called prior to rmdir or dir rename to clear DCACHE_REFERENCED and
allow efficient dentry reclaim.

Reported-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/namei.c b/fs/namei.c
index 1ab641f2e78..e2e4e8d032e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2579,6 +2579,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (error)
 		goto out;
 
+	shrink_dcache_parent(dentry);
 	error = dir->i_op->rmdir(dir, dentry);
 	if (error)
 		goto out;
@@ -2993,6 +2994,8 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 	if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
 		goto out;
 
+	if (target)
+		shrink_dcache_parent(new_dentry);
 	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
 	if (error)
 		goto out;
-- 
cgit v1.2.3-70-g09d2


From c7427d23f7ed695ac226dbe3a84d7f19091d34ce Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 30 May 2011 01:50:53 -0400
Subject: autofs4: bogus dentry_unhash() added in ->unlink()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/root.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 87d95a8cddb..f55ae23b137 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -583,8 +583,6 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
-	dentry_unhash(dentry);
-
 	if (atomic_dec_and_test(&ino->count)) {
 		p_ino = autofs4_dentry_ino(dentry->d_parent);
 		if (p_ino && dentry->d_parent != dentry)
-- 
cgit v1.2.3-70-g09d2


From 598e887d8b01655780c81cc86a9e7820ed091580 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@alien8.de>
Date: Mon, 30 May 2011 11:38:06 +0200
Subject: x86 idle: Fix mwait deprecation warning message

Fix:

  arch/x86/kernel/process.c:645:1: warning: unknown escape sequence '\i'

due to missing escape backslash, introduced by this commit:

  5d4c47e0195b: x86 idle: deprecate mwait_idle() and "idle=mwait" cmdline param

Signed-off-by: Borislav Petkov <bp@alien8.de>
Cc: Len Brown <len.brown@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1306748286-24701-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 426a5b66f7e..3d83a71af7d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -642,7 +642,7 @@ static int __init idle_setup(char *str)
 		boot_option_idle_override = IDLE_POLL;
 	} else if (!strcmp(str, "mwait")) {
 		boot_option_idle_override = IDLE_FORCE_MWAIT;
-		WARN_ONCE(1, "\idle=mwait\" will be removed in 2012\"\n");
+		WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\"\n");
 	} else if (!strcmp(str, "halt")) {
 		/*
 		 * When the boot option of idle=halt is added, halt is
-- 
cgit v1.2.3-70-g09d2


From 4f3c125c7420c85eaff627145557e392a871922d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 30 May 2011 08:23:57 -0400
Subject: x86: Fix mwait_play_dead() faulting on mwait-incapable cpus
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A logic error in mwait_play_dead() causes the kernel to use
mwait even on cpus which don't support it, such as KVM virtual
cpus.

Introduced by:

  349c004e3d31: x86: A fast way to check capabilities of the current cpu

Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=36222
Reported-by: Török Edwin <edwintorok@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1306758237-9327-1-git-send-email-avi@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index eefd96765e7..33a0c11797d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1332,7 +1332,7 @@ static inline void mwait_play_dead(void)
 	void *mwait_ptr;
 	struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
 
-	if (!this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c))
+	if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
 		return;
 	if (!this_cpu_has(X86_FEATURE_CLFLSH))
 		return;
-- 
cgit v1.2.3-70-g09d2


From 194cd8dfc9e4f29249b3bd45c5cb5c0a33a6438c Mon Sep 17 00:00:00 2001
From: Nobuhiro Iwamatsu <nobuhiro.iwamatsu.yj@renesas.com>
Date: Tue, 31 May 2011 13:27:41 +0900
Subject: sh: asm/tlb.h needs linux/swap.h

Commit 1e56a56410bb64bce62d44563e35a143fc2d515f introduced the mmu_gather
rework for sh, but missed a linux/swap.h include:

	CC      arch/sh/mm/tlb-urb.o
	In file included from arch/sh/mm/tlb-urb.c:14:0:
	arch/sh/include/asm/tlb.h: In function '__tlb_remove_page':
	arch/sh/include/asm/tlb.h:92:2: error: implicit declaration of function 'free_page_and_swap_cache'

Signed-off-by: Nobuhiro Iwamatsu <nobuhiro.iwamatsu.yj@renesas.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/tlb.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h
index 6c308d8b9a5..ec88bfcdf7c 100644
--- a/arch/sh/include/asm/tlb.h
+++ b/arch/sh/include/asm/tlb.h
@@ -9,6 +9,7 @@
 #include <linux/pagemap.h>
 
 #ifdef CONFIG_MMU
+#include <linux/swap.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-- 
cgit v1.2.3-70-g09d2


From 65d517eb7224d24ee4206416161390f30d69e622 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 31 May 2011 14:37:44 +0900
Subject: sh64: asm/pgtable.h needs asm/mmu.h

Needed to satisfy the __in_29bit_mode() check.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/pgtable.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index db85916b9e9..9210e93a92c 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -18,6 +18,7 @@
 #include <asm/pgtable-2level.h>
 #endif
 #include <asm/page.h>
+#include <asm/mmu.h>
 
 #ifndef __ASSEMBLY__
 #include <asm/addrspace.h>
-- 
cgit v1.2.3-70-g09d2


From 3f9b8520b06013939ad247ba08b69529b5f14be1 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 31 May 2011 14:38:29 +0900
Subject: sh64: Move from P1SEG to CAC_ADDR for consistent sync.

sh64 doesn't define a P1SEGADDR, resulting in a build failure. The proper
mapping can be attained for both sh32 and 64 via the CAC_ADDR macro, so
switch to that instead.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/mm/consistent.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/mm/consistent.c b/arch/sh/mm/consistent.c
index 40733a95240..f251b5f2765 100644
--- a/arch/sh/mm/consistent.c
+++ b/arch/sh/mm/consistent.c
@@ -82,7 +82,7 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	void *addr;
 
 	addr = __in_29bit_mode() ?
-	       (void *)P1SEGADDR((unsigned long)vaddr) : vaddr;
+	       (void *)CAC_ADDR((unsigned long)vaddr) : vaddr;
 
 	switch (direction) {
 	case DMA_FROM_DEVICE:		/* invalidate only */
-- 
cgit v1.2.3-70-g09d2


From db7eba292e913390fa881272bfbc3da0a5380513 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 31 May 2011 14:39:49 +0900
Subject: sh: Fix up asm-generic/ptrace.h fallout.

There was an ordering issue with regards to instruction_pointer() being
used in profile_pc() prior to the asm-generic/ptrace.h include, which
subsequently provided the instruction_pointer() definition. In the
interest of simplicity we simply open-code the regs->pc deref for the
profile_pc() definition instead.

The FP functions were also broken due to a lack of a common regs->fp,
so provide a common GET_FP() that is safe for both architectures in order
to fix up the frame pointer helpers too.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/ptrace.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/sh/include/asm/ptrace.h b/arch/sh/include/asm/ptrace.h
index 40725b4a801..88bd6be168a 100644
--- a/arch/sh/include/asm/ptrace.h
+++ b/arch/sh/include/asm/ptrace.h
@@ -41,7 +41,9 @@
 
 #define user_mode(regs)			(((regs)->sr & 0x40000000)==0)
 #define kernel_stack_pointer(_regs)	((unsigned long)(_regs)->regs[15])
-#define GET_USP(regs) ((regs)->regs[15])
+
+#define GET_FP(regs)	((regs)->regs[14])
+#define GET_USP(regs)	((regs)->regs[15])
 
 extern void show_regs(struct pt_regs *);
 
@@ -131,7 +133,7 @@ extern void ptrace_triggered(struct perf_event *bp, int nmi,
 
 static inline unsigned long profile_pc(struct pt_regs *regs)
 {
-	unsigned long pc = instruction_pointer(regs);
+	unsigned long pc = regs->pc;
 
 	if (virt_addr_uncached(pc))
 		return CAC_ADDR(pc);
-- 
cgit v1.2.3-70-g09d2


From d4905ce38c73964b868037e49a5945e1cf47a7f2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 31 May 2011 15:23:20 +0900
Subject: Revert "clocksource: sh_tmu: Runtime PM support"

This reverts commit 1b842e91fea9447eff5eb687e28ad61c02f5033e.

There is a fundamental ordering race between the early and late probe
paths and the runtime PM tie-in that results in __pm_runtime_resume()
attempting to take a lock that hasn't been initialized yet (which by
proxy also suggests that pm_runtime_init() hasn't yet been run on the
device either, making the entire thing unsafe) -- resulting in instant
death on SMP or on UP with spinlock debugging enabled:

	 sh_tmu.0: used for clock events
	 sh_tmu.0: used for periodic clock events
	BUG: spinlock trylock failure on UP on CPU#0, swapper/0
	 lock: 804db198, .magic: 00000000, .owner: <none>/-1, .owner_cpu: 0
	...

Revert it for now until the ordering issues can be resolved, or we can get
some more help from the runtime PM framework to make this possible.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/clocksource/sh_tmu.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c
index 17296288a20..80813576861 100644
--- a/drivers/clocksource/sh_tmu.c
+++ b/drivers/clocksource/sh_tmu.c
@@ -25,7 +25,6 @@
 #include <linux/delay.h>
 #include <linux/io.h>
 #include <linux/clk.h>
-#include <linux/pm_runtime.h>
 #include <linux/irq.h>
 #include <linux/err.h>
 #include <linux/clocksource.h>
@@ -110,12 +109,10 @@ static int sh_tmu_enable(struct sh_tmu_priv *p)
 {
 	int ret;
 
-	/* wake up device and enable clock */
-	pm_runtime_get_sync(&p->pdev->dev);
+	/* enable clock */
 	ret = clk_enable(p->clk);
 	if (ret) {
 		dev_err(&p->pdev->dev, "cannot enable clock\n");
-		pm_runtime_put_sync(&p->pdev->dev);
 		return ret;
 	}
 
@@ -144,9 +141,8 @@ static void sh_tmu_disable(struct sh_tmu_priv *p)
 	/* disable interrupts in TMU block */
 	sh_tmu_write(p, TCR, 0x0000);
 
-	/* stop clock and mark device as idle */
+	/* stop clock */
 	clk_disable(p->clk);
-	pm_runtime_put_sync(&p->pdev->dev);
 }
 
 static void sh_tmu_set_next(struct sh_tmu_priv *p, unsigned long delta,
@@ -415,7 +411,6 @@ static int __devinit sh_tmu_probe(struct platform_device *pdev)
 
 	if (p) {
 		dev_info(&pdev->dev, "kept as earlytimer\n");
-		pm_runtime_enable(&pdev->dev);
 		return 0;
 	}
 
@@ -430,9 +425,6 @@ static int __devinit sh_tmu_probe(struct platform_device *pdev)
 		kfree(p);
 		platform_set_drvdata(pdev, NULL);
 	}
-
-	if (!is_early_platform_device(pdev))
-		pm_runtime_enable(&pdev->dev);
 	return ret;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 9436b4abec28a22edd961ae375535d940625f1f2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 31 May 2011 15:26:42 +0900
Subject: Revert "clocksource: sh_cmt: Runtime PM support"

This reverts commit 01fa68b58492a5d6708a91c1f474b6a099a9509e.

The same note as per the sh_tmu change applies here, too.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/clocksource/sh_cmt.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index 036e5865eb4..dc7c033ef58 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -24,7 +24,6 @@
 #include <linux/ioport.h>
 #include <linux/io.h>
 #include <linux/clk.h>
-#include <linux/pm_runtime.h>
 #include <linux/irq.h>
 #include <linux/err.h>
 #include <linux/clocksource.h>
@@ -153,12 +152,10 @@ static int sh_cmt_enable(struct sh_cmt_priv *p, unsigned long *rate)
 {
 	int ret;
 
-	/* wake up device and enable clock */
-	pm_runtime_get_sync(&p->pdev->dev);
+	/* enable clock */
 	ret = clk_enable(p->clk);
 	if (ret) {
 		dev_err(&p->pdev->dev, "cannot enable clock\n");
-		pm_runtime_put_sync(&p->pdev->dev);
 		return ret;
 	}
 
@@ -190,9 +187,8 @@ static void sh_cmt_disable(struct sh_cmt_priv *p)
 	/* disable interrupts in CMT block */
 	sh_cmt_write(p, CMCSR, 0);
 
-	/* stop clock and mark device as idle */
+	/* stop clock */
 	clk_disable(p->clk);
-	pm_runtime_put_sync(&p->pdev->dev);
 }
 
 /* private flags */
@@ -664,7 +660,6 @@ static int __devinit sh_cmt_probe(struct platform_device *pdev)
 
 	if (p) {
 		dev_info(&pdev->dev, "kept as earlytimer\n");
-		pm_runtime_enable(&pdev->dev);
 		return 0;
 	}
 
@@ -679,9 +674,6 @@ static int __devinit sh_cmt_probe(struct platform_device *pdev)
 		kfree(p);
 		platform_set_drvdata(pdev, NULL);
 	}
-
-	if (!is_early_platform_device(pdev))
-		pm_runtime_enable(&pdev->dev);
 	return ret;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 5c2de44417523385010b529599a2b30f290831a3 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 31 May 2011 15:53:03 +0900
Subject: dmaengine: shdma: Fix up fallout from runtime PM changes.

The runtime PM changes introduce sh_dmae_rst() wrapping via the
runtime_resume helper, depending on dev_get_drvdata() to fetch the
platform data needed for the DMAOR initialization default at a time
where drvdata hasn't yet been established by the probe path, resulting
in general probe misery:

        Unable to handle kernel NULL pointer dereference at virtual address 000000c4
        pc = 8025adee
        *pde = 00000000
        Oops: 0000 [#1]
        Modules linked in:

        Pid : 1, Comm:           swapper
        CPU : 0                  Not tainted  (3.0.0-rc1-00012-g9436b4a-dirty #1456)

        PC is at sh_dmae_rst+0x28/0x86
        PR is at sh_dmae_rst+0x22/0x86
        PC  : 8025adee SP  : 9e803d10 SR  : 400080f1 TEA : 000000c4
        R0  : 000000c4 R1  : 0000fff8 R2  : 00000000 R3  : 00000040
        R4  : 000000f0 R5  : 00000000 R6  : 00000000 R7  : 804f184c
        R8  : 00000000 R9  : 804dd0e8 R10 : 80283204 R11 : ffffffda
        R12 : 000000a0 R13 : 804dd18c R14 : 9e803d10
        MACH: 00000000 MACL: 00008f20 GBR : 00000000 PR  : 8025ade8

        Call trace:
        [<8025ae70>] sh_dmae_runtime_resume+0x24/0x34
        [<80283238>] pm_generic_runtime_resume+0x34/0x3c
        [<80283370>] rpm_callback+0x4a/0x7e
        [<80283efc>] rpm_resume+0x240/0x384
        [<80283f54>] rpm_resume+0x298/0x384
        [<8028428c>] __pm_runtime_resume+0x44/0x7c
        [<8038a358>] __ioremap_caller+0x0/0xec
        [<80284296>] __pm_runtime_resume+0x4e/0x7c
        [<8038a358>] __ioremap_caller+0x0/0xec
        [<80666254>] sh_dmae_probe+0x180/0x6a0
        [<802803ae>] platform_drv_probe+0x26/0x2e

Fix up the ordering accordingly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/dma/shdma.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/dma/shdma.c b/drivers/dma/shdma.c
index 636e40925b1..727e76ff13e 100644
--- a/drivers/dma/shdma.c
+++ b/drivers/dma/shdma.c
@@ -1144,6 +1144,8 @@ static int __init sh_dmae_probe(struct platform_device *pdev)
 	/* platform data */
 	shdev->pdata = pdata;
 
+	platform_set_drvdata(pdev, shdev);
+
 	pm_runtime_enable(&pdev->dev);
 	pm_runtime_get_sync(&pdev->dev);
 
@@ -1256,7 +1258,6 @@ static int __init sh_dmae_probe(struct platform_device *pdev)
 
 	pm_runtime_put(&pdev->dev);
 
-	platform_set_drvdata(pdev, shdev);
 	dma_async_device_register(&shdev->common);
 
 	return err;
@@ -1278,6 +1279,8 @@ rst_err:
 
 	if (dmars)
 		iounmap(shdev->dmars);
+
+	platform_set_drvdata(pdev, NULL);
 emapdmars:
 	iounmap(shdev->chan_reg);
 	synchronize_rcu();
@@ -1316,6 +1319,8 @@ static int __exit sh_dmae_remove(struct platform_device *pdev)
 		iounmap(shdev->dmars);
 	iounmap(shdev->chan_reg);
 
+	platform_set_drvdata(pdev, NULL);
+
 	synchronize_rcu();
 	kfree(shdev);
 
-- 
cgit v1.2.3-70-g09d2


From d72bce0e67e8afc6eb959f656013cbb577426f1e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 30 May 2011 13:34:51 +0200
Subject: rcu: Cure load woes

Commit cc3ce5176d83 (rcu: Start RCU kthreads in TASK_INTERRUPTIBLE
state) fudges a sleeping task' state, resulting in the scheduler seeing
a TASK_UNINTERRUPTIBLE task going to sleep, but a TASK_INTERRUPTIBLE
task waking up. The result is unbalanced load calculation.

The problem that patch tried to address is that the RCU threads could
stay in UNINTERRUPTIBLE state for quite a while and triggering the hung
task detector due to on-demand wake-ups.

Cure the problem differently by always giving the tasks at least one
wake-up once the CPU is fully up and running, this will kick them out of
the initial UNINTERRUPTIBLE state and into the regular INTERRUPTIBLE
wait state.

[ The alternative would be teaching kthread_create() to start threads as
  INTERRUPTIBLE but that needs a tad more thought. ]

Reported-by: Damien Wyart <damien.wyart@free.fr>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul E. McKenney <paul.mckenney@linaro.org>
Link: http://lkml.kernel.org/r/1306755291.1200.2872.camel@twins
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c        | 54 +++++++++++++++++++++++++++++++++++++++++--------
 kernel/rcutree_plugin.h | 11 +++++++++-
 2 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 77a7671dd14..89419ff92e9 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1648,7 +1648,6 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 	kthread_bind(t, cpu);
-	set_task_state(t, TASK_INTERRUPTIBLE);
 	per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
 	WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
 	per_cpu(rcu_cpu_kthread_task, cpu) = t;
@@ -1756,7 +1755,6 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
 		if (IS_ERR(t))
 			return PTR_ERR(t);
 		raw_spin_lock_irqsave(&rnp->lock, flags);
-		set_task_state(t, TASK_INTERRUPTIBLE);
 		rnp->node_kthread_task = t;
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		sp.sched_priority = 99;
@@ -1765,6 +1763,8 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
 	return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
 }
 
+static void rcu_wake_one_boost_kthread(struct rcu_node *rnp);
+
 /*
  * Spawn all kthreads -- called as soon as the scheduler is running.
  */
@@ -1772,18 +1772,30 @@ static int __init rcu_spawn_kthreads(void)
 {
 	int cpu;
 	struct rcu_node *rnp;
+	struct task_struct *t;
 
 	rcu_kthreads_spawnable = 1;
 	for_each_possible_cpu(cpu) {
 		per_cpu(rcu_cpu_has_work, cpu) = 0;
-		if (cpu_online(cpu))
+		if (cpu_online(cpu)) {
 			(void)rcu_spawn_one_cpu_kthread(cpu);
+			t = per_cpu(rcu_cpu_kthread_task, cpu);
+			if (t)
+				wake_up_process(t);
+		}
 	}
 	rnp = rcu_get_root(rcu_state);
 	(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+	if (rnp->node_kthread_task)
+		wake_up_process(rnp->node_kthread_task);
 	if (NUM_RCU_NODES > 1) {
-		rcu_for_each_leaf_node(rcu_state, rnp)
+		rcu_for_each_leaf_node(rcu_state, rnp) {
 			(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+			t = rnp->node_kthread_task;
+			if (t)
+				wake_up_process(t);
+			rcu_wake_one_boost_kthread(rnp);
+		}
 	}
 	return 0;
 }
@@ -2188,14 +2200,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 	raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
-static void __cpuinit rcu_online_cpu(int cpu)
+static void __cpuinit rcu_prepare_cpu(int cpu)
 {
 	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
 	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
 	rcu_preempt_init_percpu_data(cpu);
 }
 
-static void __cpuinit rcu_online_kthreads(int cpu)
+static void __cpuinit rcu_prepare_kthreads(int cpu)
 {
 	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;
@@ -2208,6 +2220,31 @@ static void __cpuinit rcu_online_kthreads(int cpu)
 	}
 }
 
+/*
+ * kthread_create() creates threads in TASK_UNINTERRUPTIBLE state,
+ * but the RCU threads are woken on demand, and if demand is low this
+ * could be a while triggering the hung task watchdog.
+ *
+ * In order to avoid this, poke all tasks once the CPU is fully
+ * up and running.
+ */
+static void __cpuinit rcu_online_kthreads(int cpu)
+{
+	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
+	struct rcu_node *rnp = rdp->mynode;
+	struct task_struct *t;
+
+	t = per_cpu(rcu_cpu_kthread_task, cpu);
+	if (t)
+		wake_up_process(t);
+
+	t = rnp->node_kthread_task;
+	if (t)
+		wake_up_process(t);
+
+	rcu_wake_one_boost_kthread(rnp);
+}
+
 /*
  * Handle CPU online/offline notification events.
  */
@@ -2221,10 +2258,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		rcu_online_cpu(cpu);
-		rcu_online_kthreads(cpu);
+		rcu_prepare_cpu(cpu);
+		rcu_prepare_kthreads(cpu);
 		break;
 	case CPU_ONLINE:
+		rcu_online_kthreads(cpu);
 	case CPU_DOWN_FAILED:
 		rcu_node_kthread_setaffinity(rnp, -1);
 		rcu_cpu_kthread_setrt(cpu, 1);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a767b7dac36..c8bff3099a8 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1295,7 +1295,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	set_task_state(t, TASK_INTERRUPTIBLE);
 	rnp->boost_kthread_task = t;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	sp.sched_priority = RCU_KTHREAD_PRIO;
@@ -1303,6 +1302,12 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 	return 0;
 }
 
+static void __cpuinit rcu_wake_one_boost_kthread(struct rcu_node *rnp)
+{
+	if (rnp->boost_kthread_task)
+		wake_up_process(rnp->boost_kthread_task);
+}
+
 #else /* #ifdef CONFIG_RCU_BOOST */
 
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1326,6 +1331,10 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 	return 0;
 }
 
+static void __cpuinit rcu_wake_one_boost_kthread(struct rcu_node *rnp)
+{
+}
+
 #endif /* #else #ifdef CONFIG_RCU_BOOST */
 
 #ifndef CONFIG_SMP
-- 
cgit v1.2.3-70-g09d2


From 339dedf709e21d5718d6596750166f70e8bed40a Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 31 May 2011 18:01:23 +1000
Subject: powerpc/pmac: Don't register pmac PIC syscore ops when HW not present

The Apple custom PIC only exist in some earlier machine models,
anything with an MPIC will crash on suspend if we register those
syscore ops unconditionally.

This is a regression caused by commit f5a592f7d74e ("PM / PowerPC: Use
struct syscore_ops instead of sysdevs for PM")

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/platforms/powermac/pic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c
index 9089b042119..7667db448aa 100644
--- a/arch/powerpc/platforms/powermac/pic.c
+++ b/arch/powerpc/platforms/powermac/pic.c
@@ -715,7 +715,8 @@ static struct syscore_ops pmacpic_syscore_ops = {
 
 static int __init init_pmacpic_syscore(void)
 {
-	register_syscore_ops(&pmacpic_syscore_ops);
+	if (pmac_irq_hw[0])
+		register_syscore_ops(&pmacpic_syscore_ops);
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From ea9d6553b3b3044e7374774cc33bb1b2eee19dd3 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Tue, 31 May 2011 13:45:53 +0200
Subject: block: remove unwanted semicolons

Since those defined functions require additional semicolon
from the caller, they could cause potential syntax errors
when used in if-else statements.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blkdev.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ae9091a6848..1a23722e887 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1282,8 +1282,8 @@ queue_max_integrity_segments(struct request_queue *q)
 #define blk_get_integrity(a)			(0)
 #define blk_integrity_compare(a, b)		(0)
 #define blk_integrity_register(a, b)		(0)
-#define blk_integrity_unregister(a)		do { } while (0);
-#define blk_queue_max_integrity_segments(a, b)	do { } while (0);
+#define blk_integrity_unregister(a)		do { } while (0)
+#define blk_queue_max_integrity_segments(a, b)	do { } while (0)
 #define queue_max_integrity_segments(a)		(0)
 #define blk_integrity_merge_rq(a, b, c)		(0)
 #define blk_integrity_merge_bio(a, b, c)	(0)
-- 
cgit v1.2.3-70-g09d2


From 83caba8436b28bd9081013fd840424983127d4ca Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 31 May 2011 10:09:24 -0700
Subject: [IA64] wire up sendmmsg() syscall for Itanium

Add entries in unistd.h and entry.S to make this new syscall visible.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/include/asm/unistd.h | 3 ++-
 arch/ia64/kernel/entry.S       | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h
index 1cf0f496f74..7c928da35b1 100644
--- a/arch/ia64/include/asm/unistd.h
+++ b/arch/ia64/include/asm/unistd.h
@@ -320,11 +320,12 @@
 #define __NR_clock_adjtime		1328
 #define __NR_syncfs			1329
 #define __NR_setns			1330
+#define __NR_sendmmsg			1331
 
 #ifdef __KERNEL__
 
 
-#define NR_syscalls			307 /* length of syscall table */
+#define NR_syscalls			308 /* length of syscall table */
 
 /*
  * The following defines stop scripts/checksyscalls.sh from complaining about
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 9ca80193cd4..97dd2abdeb1 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1776,6 +1776,7 @@ sys_call_table:
 	data8 sys_clock_adjtime
 	data8 sys_syncfs
 	data8 sys_setns				// 1330
+	data8 sys_sendmmsg
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
 #endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
-- 
cgit v1.2.3-70-g09d2


From 4495a7d41dbda03841c2a1c2a5ce7135a45131ba Mon Sep 17 00:00:00 2001
From: Kyungmin Park <kyungmin.park@samsung.com>
Date: Tue, 31 May 2011 10:04:09 +0200
Subject: CFQ: Fix typo and remove unnecessary semicolon

Fix comment typo and remove unnecessary semicolon at macro

Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7c52d688892..8a02c955c61 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -185,7 +185,7 @@ struct cfq_group {
 	int nr_cfqq;
 
 	/*
-	 * Per group busy queus average. Useful for workload slice calc. We
+	 * Per group busy queues average. Useful for workload slice calc. We
 	 * create the array for each prio class but at run time it is used
 	 * only for RT and BE class and slot for IDLE class remains unused.
 	 * This is primarily done to avoid confusion and a gcc warning.
@@ -369,16 +369,16 @@ CFQ_CFQQ_FNS(wait_busy);
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
 	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
 			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
-			blkg_path(&(cfqq)->cfqg->blkg), ##args);
+			blkg_path(&(cfqq)->cfqg->blkg), ##args)
 
 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)				\
 	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
-				blkg_path(&(cfqg)->blkg), ##args);      \
+				blkg_path(&(cfqg)->blkg), ##args)       \
 
 #else
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
 	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
-#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0);
+#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
 #endif
 #define cfq_log(cfqd, fmt, args...)	\
 	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
-- 
cgit v1.2.3-70-g09d2


From 71005be40a7fc95edda3cc462361ce0243e4f5fa Mon Sep 17 00:00:00 2001
From: Daniel Drake <dsd@laptop.org>
Date: Thu, 26 May 2011 21:31:08 +0100
Subject: libertas: Set command sequence number later to ensure consistency

Before this patch, the command sequence number is being set before
lbs_queue_cmd() adds the command to the queue. However, lbs_queue_cmd()
sometimes forces commands to queue-jump (e.g. CMD_802_11_WAKEUP_CONFIRM).
It currently does this without considering that sequence numbers might need
adjusting to keep things running in order.

Fix this by setting the sequence number at a later stage, just before
we're actually submitting the command to the hardware. Also fixes a
possible race where seqnum was being modified outside of the driver
lock.

Signed-off-by: Daniel Drake <dsd@laptop.org>
Acked-by: Dan Williams <dcbw@redhat.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/libertas/cmd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c
index 84566db486d..71c8f3fccfa 100644
--- a/drivers/net/wireless/libertas/cmd.c
+++ b/drivers/net/wireless/libertas/cmd.c
@@ -994,6 +994,8 @@ static void lbs_submit_command(struct lbs_private *priv,
 	cmd = cmdnode->cmdbuf;
 
 	spin_lock_irqsave(&priv->driver_lock, flags);
+	priv->seqnum++;
+	cmd->seqnum = cpu_to_le16(priv->seqnum);
 	priv->cur_cmd = cmdnode;
 	spin_unlock_irqrestore(&priv->driver_lock, flags);
 
@@ -1621,11 +1623,9 @@ struct cmd_ctrl_node *__lbs_cmd_async(struct lbs_private *priv,
 	/* Copy the incoming command to the buffer */
 	memcpy(cmdnode->cmdbuf, in_cmd, in_cmd_size);
 
-	/* Set sequence number, clean result, move to buffer */
-	priv->seqnum++;
+	/* Set command, clean result, move to buffer */
 	cmdnode->cmdbuf->command = cpu_to_le16(command);
 	cmdnode->cmdbuf->size    = cpu_to_le16(in_cmd_size);
-	cmdnode->cmdbuf->seqnum  = cpu_to_le16(priv->seqnum);
 	cmdnode->cmdbuf->result  = 0;
 
 	lbs_deb_host("PREP_CMD: command 0x%04x\n", command);
-- 
cgit v1.2.3-70-g09d2


From dd08682150e1815fe5cdd0673a2f2e9cd2d55a7a Mon Sep 17 00:00:00 2001
From: Luciano Coelho <coelho@ti.com>
Date: Fri, 27 May 2011 15:34:45 +0300
Subject: wl12xx: fix passive and radar channel generation for scheduled scan

We were comparing bitwise AND results with a boolean, so when the
boolean was set to true, it was not matching as it should.

Fix this by booleanizing the bitwise AND results with !!.

Signed-off-by: Luciano Coelho <coelho@ti.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/wl12xx/scan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/wl12xx/scan.c b/drivers/net/wireless/wl12xx/scan.c
index f37e5a39197..f223e0ed0c4 100644
--- a/drivers/net/wireless/wl12xx/scan.c
+++ b/drivers/net/wireless/wl12xx/scan.c
@@ -338,8 +338,8 @@ wl1271_scan_get_sched_scan_channels(struct wl1271 *wl,
 		flags = req->channels[i]->flags;
 
 		if (!(flags & IEEE80211_CHAN_DISABLED) &&
-		    ((flags & IEEE80211_CHAN_PASSIVE_SCAN) == passive) &&
-		    ((flags & IEEE80211_CHAN_RADAR) == radar) &&
+		    (!!(flags & IEEE80211_CHAN_PASSIVE_SCAN) == passive) &&
+		    (!!(flags & IEEE80211_CHAN_RADAR) == radar) &&
 		    (req->channels[i]->band == band)) {
 			wl1271_debug(DEBUG_SCAN, "band %d, center_freq %d ",
 				     req->channels[i]->band,
-- 
cgit v1.2.3-70-g09d2


From 2497a246e880d1fb537f754f551177c01fa39242 Mon Sep 17 00:00:00 2001
From: Luciano Coelho <coelho@ti.com>
Date: Fri, 27 May 2011 15:34:46 +0300
Subject: wl12xx: fix DFS channels handling in scheduled scan

DFS channels were never getting included in the scheduled scans,
because they always contain the passive flag as well and the call was
asking for DFS and active channels.

Fix this by ignoring the passive flag when collecting DFS channels.
Also, move the DFS channels in the channel list before the 5GHz active
channels (this was implemented in the FW differently than specified).

Signed-off-by: Luciano Coelho <coelho@ti.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/wl12xx/scan.c | 31 ++++++++++++++++++++-----------
 drivers/net/wireless/wl12xx/scan.h |  3 +++
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/drivers/net/wireless/wl12xx/scan.c b/drivers/net/wireless/wl12xx/scan.c
index f223e0ed0c4..8ccbb911776 100644
--- a/drivers/net/wireless/wl12xx/scan.c
+++ b/drivers/net/wireless/wl12xx/scan.c
@@ -337,10 +337,12 @@ wl1271_scan_get_sched_scan_channels(struct wl1271 *wl,
 	     i++) {
 		flags = req->channels[i]->flags;
 
-		if (!(flags & IEEE80211_CHAN_DISABLED) &&
-		    (!!(flags & IEEE80211_CHAN_PASSIVE_SCAN) == passive) &&
+		if ((req->channels[i]->band == band) &&
+		    !(flags & IEEE80211_CHAN_DISABLED) &&
 		    (!!(flags & IEEE80211_CHAN_RADAR) == radar) &&
-		    (req->channels[i]->band == band)) {
+		    /* if radar is set, we ignore the passive flag */
+		    (radar ||
+		     !!(flags & IEEE80211_CHAN_PASSIVE_SCAN) == passive)) {
 			wl1271_debug(DEBUG_SCAN, "band %d, center_freq %d ",
 				     req->channels[i]->band,
 				     req->channels[i]->center_freq);
@@ -350,6 +352,8 @@ wl1271_scan_get_sched_scan_channels(struct wl1271 *wl,
 			wl1271_debug(DEBUG_SCAN, "max_power %d",
 				     req->channels[i]->max_power);
 
+			if (flags & IEEE80211_CHAN_RADAR)
+				channels[j].flags |= SCAN_CHANNEL_FLAGS_DFS;
 			if (flags & IEEE80211_CHAN_PASSIVE_SCAN) {
 				channels[j].passive_duration =
 					cpu_to_le16(c->dwell_time_passive);
@@ -359,7 +363,7 @@ wl1271_scan_get_sched_scan_channels(struct wl1271 *wl,
 				channels[j].max_duration =
 					cpu_to_le16(c->max_dwell_time_active);
 			}
-			channels[j].tx_power_att = req->channels[j]->max_power;
+			channels[j].tx_power_att = req->channels[i]->max_power;
 			channels[j].channel = req->channels[i]->hw_value;
 
 			j++;
@@ -386,7 +390,11 @@ wl1271_scan_sched_scan_channels(struct wl1271 *wl,
 		wl1271_scan_get_sched_scan_channels(wl, req, cfg->channels,
 						    IEEE80211_BAND_2GHZ,
 						    false, false, idx);
-	idx += cfg->active[0];
+	/*
+	 * 5GHz channels always start at position 14, not immediately
+	 * after the last 2.4GHz channel
+	 */
+	idx = 14;
 
 	cfg->passive[1] =
 		wl1271_scan_get_sched_scan_channels(wl, req, cfg->channels,
@@ -394,22 +402,23 @@ wl1271_scan_sched_scan_channels(struct wl1271 *wl,
 						    false, true, idx);
 	idx += cfg->passive[1];
 
-	cfg->active[1] =
+	cfg->dfs =
 		wl1271_scan_get_sched_scan_channels(wl, req, cfg->channels,
 						    IEEE80211_BAND_5GHZ,
-						    false, false, 14);
-	idx += cfg->active[1];
+						    true, true, idx);
+	idx += cfg->dfs;
 
-	cfg->dfs =
+	cfg->active[1] =
 		wl1271_scan_get_sched_scan_channels(wl, req, cfg->channels,
 						    IEEE80211_BAND_5GHZ,
-						    true, false, idx);
-	idx += cfg->dfs;
+						    false, false, idx);
+	idx += cfg->active[1];
 
 	wl1271_debug(DEBUG_SCAN, "    2.4GHz: active %d passive %d",
 		     cfg->active[0], cfg->passive[0]);
 	wl1271_debug(DEBUG_SCAN, "    5GHz: active %d passive %d",
 		     cfg->active[1], cfg->passive[1]);
+	wl1271_debug(DEBUG_SCAN, "    DFS: %d", cfg->dfs);
 
 	return idx;
 }
diff --git a/drivers/net/wireless/wl12xx/scan.h b/drivers/net/wireless/wl12xx/scan.h
index c83319579ca..a0b6c5d67b0 100644
--- a/drivers/net/wireless/wl12xx/scan.h
+++ b/drivers/net/wireless/wl12xx/scan.h
@@ -137,6 +137,9 @@ enum {
 	SCAN_BSS_TYPE_ANY,
 };
 
+#define SCAN_CHANNEL_FLAGS_DFS		BIT(0)
+#define SCAN_CHANNEL_FLAGS_DFS_ENABLED	BIT(1)
+
 struct conn_scan_ch_params {
 	__le16 min_duration;
 	__le16 max_duration;
-- 
cgit v1.2.3-70-g09d2


From 50a66d7f04adbfab9db55144c58dc693358cb635 Mon Sep 17 00:00:00 2001
From: Luciano Coelho <coelho@ti.com>
Date: Fri, 27 May 2011 15:34:47 +0300
Subject: wl12xx: add separate config value for DFS dwell time on sched scan

Use a different value for DFS dwell time when performing a scheduled
scan.  Previously we were using the same value as for normal passive
scans.  This adds some flexibility between these two different types
of passive scan.

For now we use 150 TUs for DFS channel dwell time.  This may need to
be fine-tuned in the future.

Signed-off-by: Luciano Coelho <coelho@ti.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/wl12xx/conf.h | 3 +++
 drivers/net/wireless/wl12xx/main.c | 1 +
 drivers/net/wireless/wl12xx/scan.c | 7 +++++--
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/wl12xx/conf.h b/drivers/net/wireless/wl12xx/conf.h
index 1ab6c86aac4..c83fefb6662 100644
--- a/drivers/net/wireless/wl12xx/conf.h
+++ b/drivers/net/wireless/wl12xx/conf.h
@@ -1157,6 +1157,9 @@ struct conf_sched_scan_settings {
 	/* time to wait on the channel for passive scans (in TUs) */
 	u32 dwell_time_passive;
 
+	/* time to wait on the channel for DFS scans (in TUs) */
+	u32 dwell_time_dfs;
+
 	/* number of probe requests to send on each channel in active scans */
 	u8 num_probe_reqs;
 
diff --git a/drivers/net/wireless/wl12xx/main.c b/drivers/net/wireless/wl12xx/main.c
index bc00e52f644..e6497dc669d 100644
--- a/drivers/net/wireless/wl12xx/main.c
+++ b/drivers/net/wireless/wl12xx/main.c
@@ -311,6 +311,7 @@ static struct conf_drv_settings default_conf = {
 		.min_dwell_time_active = 8,
 		.max_dwell_time_active = 30,
 		.dwell_time_passive    = 100,
+		.dwell_time_dfs        = 150,
 		.num_probe_reqs        = 2,
 		.rssi_threshold        = -90,
 		.snr_threshold         = 0,
diff --git a/drivers/net/wireless/wl12xx/scan.c b/drivers/net/wireless/wl12xx/scan.c
index 8ccbb911776..28ec0addc65 100644
--- a/drivers/net/wireless/wl12xx/scan.c
+++ b/drivers/net/wireless/wl12xx/scan.c
@@ -352,9 +352,12 @@ wl1271_scan_get_sched_scan_channels(struct wl1271 *wl,
 			wl1271_debug(DEBUG_SCAN, "max_power %d",
 				     req->channels[i]->max_power);
 
-			if (flags & IEEE80211_CHAN_RADAR)
+			if (flags & IEEE80211_CHAN_RADAR) {
 				channels[j].flags |= SCAN_CHANNEL_FLAGS_DFS;
-			if (flags & IEEE80211_CHAN_PASSIVE_SCAN) {
+				channels[j].passive_duration =
+					cpu_to_le16(c->dwell_time_dfs);
+			}
+			else if (flags & IEEE80211_CHAN_PASSIVE_SCAN) {
 				channels[j].passive_duration =
 					cpu_to_le16(c->dwell_time_passive);
 			} else {
-- 
cgit v1.2.3-70-g09d2


From 66870b1ccd5c1460e437c18b0026e2dcaab1ece9 Mon Sep 17 00:00:00 2001
From: Luciano Coelho <coelho@ti.com>
Date: Fri, 27 May 2011 15:34:48 +0300
Subject: wl12xx: fix oops in sched_scan when forcing a passive scan

Fix kernel oops when trying to use passive scheduled scans.  The
reason was that in passive scans there are no SSIDs, so there was a
NULL pointer dereference.

To solve the problem, we now check the number of SSIDs provided in the
sched_scan request and only access the list if there's one or more
(ie. passive scan is not forced).  We also force all the channels to
be passive by adding the IEEE80211_CHAN_PASSIVE_SCAN flag locally
before the checks in the wl1271_scan_get_sched_scan_channels()
function.

Signed-off-by: Luciano Coelho <coelho@ti.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/wl12xx/scan.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/wl12xx/scan.c b/drivers/net/wireless/wl12xx/scan.c
index 28ec0addc65..56f76abc754 100644
--- a/drivers/net/wireless/wl12xx/scan.c
+++ b/drivers/net/wireless/wl12xx/scan.c
@@ -331,12 +331,16 @@ wl1271_scan_get_sched_scan_channels(struct wl1271 *wl,
 	struct conf_sched_scan_settings *c = &wl->conf.sched_scan;
 	int i, j;
 	u32 flags;
+	bool force_passive = !req->n_ssids;
 
 	for (i = 0, j = start;
 	     i < req->n_channels && j < MAX_CHANNELS_ALL_BANDS;
 	     i++) {
 		flags = req->channels[i]->flags;
 
+		if (force_passive)
+			flags |= IEEE80211_CHAN_PASSIVE_SCAN;
+
 		if ((req->channels[i]->band == band) &&
 		    !(flags & IEEE80211_CHAN_DISABLED) &&
 		    (!!(flags & IEEE80211_CHAN_RADAR) == radar) &&
@@ -433,6 +437,7 @@ int wl1271_scan_sched_scan_config(struct wl1271 *wl,
 	struct wl1271_cmd_sched_scan_config *cfg = NULL;
 	struct conf_sched_scan_settings *c = &wl->conf.sched_scan;
 	int i, total_channels, ret;
+	bool force_passive = !req->n_ssids;
 
 	wl1271_debug(DEBUG_CMD, "cmd sched_scan scan config");
 
@@ -456,7 +461,7 @@ int wl1271_scan_sched_scan_config(struct wl1271 *wl,
 	for (i = 0; i < SCAN_MAX_CYCLE_INTERVALS; i++)
 		cfg->intervals[i] = cpu_to_le32(req->interval);
 
-	if (req->ssids[0].ssid_len && req->ssids[0].ssid) {
+	if (!force_passive && req->ssids[0].ssid_len && req->ssids[0].ssid) {
 		cfg->filter_type = SCAN_SSID_FILTER_SPECIFIC;
 		cfg->ssid_len = req->ssids[0].ssid_len;
 		memcpy(cfg->ssid, req->ssids[0].ssid,
@@ -473,7 +478,7 @@ int wl1271_scan_sched_scan_config(struct wl1271 *wl,
 		goto out;
 	}
 
-	if (cfg->active[0]) {
+	if (!force_passive && cfg->active[0]) {
 		ret = wl1271_cmd_build_probe_req(wl, req->ssids[0].ssid,
 						 req->ssids[0].ssid_len,
 						 ies->ie[IEEE80211_BAND_2GHZ],
@@ -485,7 +490,7 @@ int wl1271_scan_sched_scan_config(struct wl1271 *wl,
 		}
 	}
 
-	if (cfg->active[1]) {
+	if (!force_passive && cfg->active[1]) {
 		ret = wl1271_cmd_build_probe_req(wl,  req->ssids[0].ssid,
 						 req->ssids[0].ssid_len,
 						 ies->ie[IEEE80211_BAND_5GHZ],
-- 
cgit v1.2.3-70-g09d2


From 59342f6a6bc35df623fb44784daa5e1077063b8f Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Mon, 30 May 2011 10:15:47 +0300
Subject: zd1211rw: fix to work on OHCI

zd1211 devices register 'EP 4 OUT' endpoint as Interrupt type on USB 2.0:

      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x04  EP 4 OUT
        bmAttributes            3
          Transfer Type            Interrupt
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0040  1x 64 bytes
        bInterval               1

However on USB 1.1 endpoint becomes Bulk:

      Endpoint Descriptor:
        bLength                 7
        bDescriptorType         5
        bEndpointAddress     0x04  EP 4 OUT
        bmAttributes            2
          Transfer Type            Bulk
          Synch Type               None
          Usage Type               Data
        wMaxPacketSize     0x0040  1x 64 bytes
        bInterval               0

Commit 37939810b937aba830dd751291fcdc51cae1a6cb assumed that endpoint is
always interrupt type and changed usb_bulk_msg() calls to usb_interrupt_msg().

Problem here is that usb_bulk_msg() on interrupt endpoint selfcorrects the
call and changes requested pipe to interrupt type (see usb_bulk_msg).
However with usb_interrupt_msg() on bulk endpoint does not correct the
pipe type to bulk, but instead URB is submitted with interrupt type pipe.

So pre-2.6.39 used usb_bulk_msg() and therefore worked with both endpoint
types, however in 2.6.39 usb_interrupt_msg() with bulk endpoint causes
ohci_hcd to fail submitted URB instantly with -ENOSPC and preventing zd1211rw
from working with OHCI.

Fix this by detecting endpoint type and using correct endpoint/pipe types
for URB. Also fix asynchronous zd_usb_iowrite16v_async() to use right
URB type on 'EP 4 OUT'.

Cc: stable@kernel.org
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/zd1211rw/zd_usb.c | 53 +++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/drivers/net/wireless/zd1211rw/zd_usb.c b/drivers/net/wireless/zd1211rw/zd_usb.c
index 0e819943b9e..631194d4982 100644
--- a/drivers/net/wireless/zd1211rw/zd_usb.c
+++ b/drivers/net/wireless/zd1211rw/zd_usb.c
@@ -1533,6 +1533,31 @@ static void __exit usb_exit(void)
 module_init(usb_init);
 module_exit(usb_exit);
 
+static int zd_ep_regs_out_msg(struct usb_device *udev, void *data, int len,
+			      int *actual_length, int timeout)
+{
+	/* In USB 2.0 mode EP_REGS_OUT endpoint is interrupt type. However in
+	 * USB 1.1 mode endpoint is bulk. Select correct type URB by endpoint
+	 * descriptor.
+	 */
+	struct usb_host_endpoint *ep;
+	unsigned int pipe;
+
+	pipe = usb_sndintpipe(udev, EP_REGS_OUT);
+	ep = usb_pipe_endpoint(udev, pipe);
+	if (!ep)
+		return -EINVAL;
+
+	if (usb_endpoint_xfer_int(&ep->desc)) {
+		return usb_interrupt_msg(udev, pipe, data, len,
+					 actual_length, timeout);
+	} else {
+		pipe = usb_sndbulkpipe(udev, EP_REGS_OUT);
+		return usb_bulk_msg(udev, pipe, data, len, actual_length,
+				    timeout);
+	}
+}
+
 static int usb_int_regs_length(unsigned int count)
 {
 	return sizeof(struct usb_int_regs) + count * sizeof(struct reg_data);
@@ -1648,15 +1673,14 @@ int zd_usb_ioread16v(struct zd_usb *usb, u16 *values,
 
 	udev = zd_usb_to_usbdev(usb);
 	prepare_read_regs_int(usb);
-	r = usb_interrupt_msg(udev, usb_sndintpipe(udev, EP_REGS_OUT),
-			      req, req_len, &actual_req_len, 50 /* ms */);
+	r = zd_ep_regs_out_msg(udev, req, req_len, &actual_req_len, 50 /*ms*/);
 	if (r) {
 		dev_dbg_f(zd_usb_dev(usb),
-			"error in usb_interrupt_msg(). Error number %d\n", r);
+			"error in zd_ep_regs_out_msg(). Error number %d\n", r);
 		goto error;
 	}
 	if (req_len != actual_req_len) {
-		dev_dbg_f(zd_usb_dev(usb), "error in usb_interrupt_msg()\n"
+		dev_dbg_f(zd_usb_dev(usb), "error in zd_ep_regs_out_msg()\n"
 			" req_len %d != actual_req_len %d\n",
 			req_len, actual_req_len);
 		r = -EIO;
@@ -1818,9 +1842,17 @@ int zd_usb_iowrite16v_async(struct zd_usb *usb, const struct zd_ioreq16 *ioreqs,
 		rw->value = cpu_to_le16(ioreqs[i].value);
 	}
 
-	usb_fill_int_urb(urb, udev, usb_sndintpipe(udev, EP_REGS_OUT),
-			 req, req_len, iowrite16v_urb_complete, usb,
-			 ep->desc.bInterval);
+	/* In USB 2.0 mode endpoint is interrupt type. However in USB 1.1 mode
+	 * endpoint is bulk. Select correct type URB by endpoint descriptor.
+	 */
+	if (usb_endpoint_xfer_int(&ep->desc))
+		usb_fill_int_urb(urb, udev, usb_sndintpipe(udev, EP_REGS_OUT),
+				 req, req_len, iowrite16v_urb_complete, usb,
+				 ep->desc.bInterval);
+	else
+		usb_fill_bulk_urb(urb, udev, usb_sndbulkpipe(udev, EP_REGS_OUT),
+				  req, req_len, iowrite16v_urb_complete, usb);
+
 	urb->transfer_flags |= URB_FREE_BUFFER;
 
 	/* Submit previous URB */
@@ -1924,15 +1956,14 @@ int zd_usb_rfwrite(struct zd_usb *usb, u32 value, u8 bits)
 	}
 
 	udev = zd_usb_to_usbdev(usb);
-	r = usb_interrupt_msg(udev, usb_sndintpipe(udev, EP_REGS_OUT),
-			      req, req_len, &actual_req_len, 50 /* ms */);
+	r = zd_ep_regs_out_msg(udev, req, req_len, &actual_req_len, 50 /*ms*/);
 	if (r) {
 		dev_dbg_f(zd_usb_dev(usb),
-			"error in usb_interrupt_msg(). Error number %d\n", r);
+			"error in zd_ep_regs_out_msg(). Error number %d\n", r);
 		goto out;
 	}
 	if (req_len != actual_req_len) {
-		dev_dbg_f(zd_usb_dev(usb), "error in usb_interrupt_msg()"
+		dev_dbg_f(zd_usb_dev(usb), "error in zd_ep_regs_out_msg()"
 			" req_len %d != actual_req_len %d\n",
 			req_len, actual_req_len);
 		r = -EIO;
-- 
cgit v1.2.3-70-g09d2


From 1144181c1bc054dc5e001a6f10b4820167e6c883 Mon Sep 17 00:00:00 2001
From: Wey-Yi Guy <wey-yi.w.guy@intel.com>
Date: Mon, 30 May 2011 09:32:52 -0700
Subject: iwlagn: fix incorrect PCI subsystem id for 6150 devices

For 6150 devices, modify the supported PCI subsystem ID.

Cc: stable@kernel.org
Signed-off-by: Wey-Yi Guy <wey-yi.w.guy@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/iwlwifi/iwl-6000.c | 28 ++++++++++++++++++----------
 drivers/net/wireless/iwlwifi/iwl-agn.c  |  6 +++---
 drivers/net/wireless/iwlwifi/iwl-agn.h  |  1 +
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/drivers/net/wireless/iwlwifi/iwl-6000.c b/drivers/net/wireless/iwlwifi/iwl-6000.c
index f8c710db6e6..fda6fe08cf9 100644
--- a/drivers/net/wireless/iwlwifi/iwl-6000.c
+++ b/drivers/net/wireless/iwlwifi/iwl-6000.c
@@ -603,19 +603,27 @@ struct iwl_cfg iwl6050_2abg_cfg = {
 	IWL_DEVICE_6050,
 };
 
+#define IWL_DEVICE_6150						\
+	.fw_name_pre = IWL6050_FW_PRE,				\
+	.ucode_api_max = IWL6050_UCODE_API_MAX,			\
+	.ucode_api_min = IWL6050_UCODE_API_MIN,			\
+	.ops = &iwl6150_ops,					\
+	.eeprom_ver = EEPROM_6150_EEPROM_VERSION,		\
+	.eeprom_calib_ver = EEPROM_6150_TX_POWER_VERSION,	\
+	.base_params = &iwl6050_base_params,			\
+	.need_dc_calib = true,					\
+	.led_mode = IWL_LED_BLINK,				\
+	.internal_wimax_coex = true
+
 struct iwl_cfg iwl6150_bgn_cfg = {
 	.name = "Intel(R) Centrino(R) Wireless-N + WiMAX 6150 BGN",
-	.fw_name_pre = IWL6050_FW_PRE,
-	.ucode_api_max = IWL6050_UCODE_API_MAX,
-	.ucode_api_min = IWL6050_UCODE_API_MIN,
-	.eeprom_ver = EEPROM_6150_EEPROM_VERSION,
-	.eeprom_calib_ver = EEPROM_6150_TX_POWER_VERSION,
-	.ops = &iwl6150_ops,
-	.base_params = &iwl6050_base_params,
+	IWL_DEVICE_6150,
 	.ht_params = &iwl6000_ht_params,
-	.need_dc_calib = true,
-	.led_mode = IWL_LED_RF_STATE,
-	.internal_wimax_coex = true,
+};
+
+struct iwl_cfg iwl6150_bg_cfg = {
+	.name = "Intel(R) Centrino(R) Wireless-N + WiMAX 6150 BG",
+	IWL_DEVICE_6150,
 };
 
 struct iwl_cfg iwl6000_3agn_cfg = {
diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c
index 11c6c1169e7..a662adcb2ad 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn.c
@@ -3831,11 +3831,11 @@ static DEFINE_PCI_DEVICE_TABLE(iwl_hw_card_ids) = {
 
 /* 6150 WiFi/WiMax Series */
 	{IWL_PCI_DEVICE(0x0885, 0x1305, iwl6150_bgn_cfg)},
-	{IWL_PCI_DEVICE(0x0885, 0x1306, iwl6150_bgn_cfg)},
+	{IWL_PCI_DEVICE(0x0885, 0x1307, iwl6150_bg_cfg)},
 	{IWL_PCI_DEVICE(0x0885, 0x1325, iwl6150_bgn_cfg)},
-	{IWL_PCI_DEVICE(0x0885, 0x1326, iwl6150_bgn_cfg)},
+	{IWL_PCI_DEVICE(0x0885, 0x1327, iwl6150_bg_cfg)},
 	{IWL_PCI_DEVICE(0x0886, 0x1315, iwl6150_bgn_cfg)},
-	{IWL_PCI_DEVICE(0x0886, 0x1316, iwl6150_bgn_cfg)},
+	{IWL_PCI_DEVICE(0x0886, 0x1317, iwl6150_bg_cfg)},
 
 /* 1000 Series WiFi */
 	{IWL_PCI_DEVICE(0x0083, 0x1205, iwl1000_bgn_cfg)},
diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.h b/drivers/net/wireless/iwlwifi/iwl-agn.h
index 2495fe7a58c..d1716844002 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn.h
+++ b/drivers/net/wireless/iwlwifi/iwl-agn.h
@@ -89,6 +89,7 @@ extern struct iwl_cfg iwl6000_3agn_cfg;
 extern struct iwl_cfg iwl6050_2agn_cfg;
 extern struct iwl_cfg iwl6050_2abg_cfg;
 extern struct iwl_cfg iwl6150_bgn_cfg;
+extern struct iwl_cfg iwl6150_bg_cfg;
 extern struct iwl_cfg iwl1000_bgn_cfg;
 extern struct iwl_cfg iwl1000_bg_cfg;
 extern struct iwl_cfg iwl100_bgn_cfg;
-- 
cgit v1.2.3-70-g09d2


From 48bdf072c3f1f8f739f76d19c74f4c79605cac46 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Sun, 29 May 2011 10:55:44 +0000
Subject: ip_options_compile: properly handle unaligned pointer

The current code takes an unaligned pointer and does htonl() on it to
make it big-endian, then does a memcpy().  The problem is that the
compiler decides that since the pointer is to a __be32, it is legal
to optimize the copy into a processor word store.  However, on an
architecture that does not handled unaligned writes in kernel space,
this produces an unaligned exception fault.

The solution is to track the pointer as a "char *" (which removes a bunch
of unpleasant casts in any case), and then just use put_unaligned_be32()
to write the value to memory.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Signed-off-by: David S. Miller <davem@zippy.davemloft.net>
---
 net/ipv4/ip_options.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index c3118e1cd3b..ec93335901d 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <asm/uaccess.h>
+#include <asm/unaligned.h>
 #include <linux/skbuff.h>
 #include <linux/ip.h>
 #include <linux/icmp.h>
@@ -350,7 +351,7 @@ int ip_options_compile(struct net *net,
 				goto error;
 			}
 			if (optptr[2] <= optlen) {
-				__be32 *timeptr = NULL;
+				unsigned char *timeptr = NULL;
 				if (optptr[2]+3 > optptr[1]) {
 					pp_ptr = optptr + 2;
 					goto error;
@@ -359,7 +360,7 @@ int ip_options_compile(struct net *net,
 				      case IPOPT_TS_TSONLY:
 					opt->ts = optptr - iph;
 					if (skb)
-						timeptr = (__be32*)&optptr[optptr[2]-1];
+						timeptr = &optptr[optptr[2]-1];
 					opt->ts_needtime = 1;
 					optptr[2] += 4;
 					break;
@@ -371,7 +372,7 @@ int ip_options_compile(struct net *net,
 					opt->ts = optptr - iph;
 					if (rt)  {
 						memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
-						timeptr = (__be32*)&optptr[optptr[2]+3];
+						timeptr = &optptr[optptr[2]+3];
 					}
 					opt->ts_needaddr = 1;
 					opt->ts_needtime = 1;
@@ -389,7 +390,7 @@ int ip_options_compile(struct net *net,
 						if (inet_addr_type(net, addr) == RTN_UNICAST)
 							break;
 						if (skb)
-							timeptr = (__be32*)&optptr[optptr[2]+3];
+							timeptr = &optptr[optptr[2]+3];
 					}
 					opt->ts_needtime = 1;
 					optptr[2] += 8;
@@ -403,10 +404,10 @@ int ip_options_compile(struct net *net,
 				}
 				if (timeptr) {
 					struct timespec tv;
-					__be32  midtime;
+					u32  midtime;
 					getnstimeofday(&tv);
-					midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC);
-					memcpy(timeptr, &midtime, sizeof(__be32));
+					midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC;
+					put_unaligned_be32(midtime, timeptr);
 					opt->is_changed = 1;
 				}
 			} else {
-- 
cgit v1.2.3-70-g09d2


From b10cec8a4e8167075b9e1ff3f05419769e7f381a Mon Sep 17 00:00:00 2001
From: Dennis Aberilla <dennis.aberilla@mimomax.com>
Date: Sun, 29 May 2011 11:46:54 +0000
Subject: drivers/net: ks8842 Fix crash on received packet when in PIO mode.

This patch fixes a driver crash during packet reception due to not enough
bytes allocated in the skb. Since the loop reads out 4 bytes at a time, we
need to allow for up to 3 bytes of slack space.

Signed-off-by: Dennis Aberilla <denzzzhome@yahoo.com>
Signed-off-by: David S. Miller <davem@zippy.davemloft.net>
---
 drivers/net/ks8842.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ks8842.c b/drivers/net/ks8842.c
index f0d8346d0fa..9bd0f551a38 100644
--- a/drivers/net/ks8842.c
+++ b/drivers/net/ks8842.c
@@ -662,7 +662,7 @@ static void ks8842_rx_frame(struct net_device *netdev,
 
 	/* check the status */
 	if ((status & RXSR_VALID) && !(status & RXSR_ERROR)) {
-		struct sk_buff *skb = netdev_alloc_skb_ip_align(netdev, len);
+		struct sk_buff *skb = netdev_alloc_skb_ip_align(netdev, len + 3);
 
 		if (skb) {
 
-- 
cgit v1.2.3-70-g09d2


From a000c01e60e40e15304ffe48fff051d17a7bea91 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Sun, 29 May 2011 23:23:36 +0000
Subject: sctp: stop pending timers and purge queues when peer restart asoc

If the peer restart the asoc, we should not only fail any unsent/unacked
data, but also stop the T3-rtx, SACK, T4-rto timers, and teardown ASCONF
queues.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/command.h |  1 +
 include/net/sctp/structs.h |  2 +-
 net/sctp/associola.c       | 23 ++++++++++++++---------
 net/sctp/sm_sideeffect.c   |  3 +++
 net/sctp/sm_statefuns.c    | 14 ++++++++++++--
 5 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h
index 2b447646ce4..dd6847e5d6e 100644
--- a/include/net/sctp/command.h
+++ b/include/net/sctp/command.h
@@ -107,6 +107,7 @@ typedef enum {
 	SCTP_CMD_UPDATE_INITTAG, /* Update peer inittag */
 	SCTP_CMD_SEND_MSG,	 /* Send the whole use message */
 	SCTP_CMD_SEND_NEXT_ASCONF, /* Send the next ASCONF after ACK */
+	SCTP_CMD_PURGE_ASCONF_QUEUE, /* Purge all asconf queues.*/
 	SCTP_CMD_LAST
 } sctp_verb_t;
 
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 795f4886e11..7df327a6d56 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1993,7 +1993,7 @@ void sctp_assoc_clean_asconf_ack_cache(const struct sctp_association *asoc);
 struct sctp_chunk *sctp_assoc_lookup_asconf_ack(
 					const struct sctp_association *asoc,
 					__be32 serial);
-
+void sctp_asconf_queue_teardown(struct sctp_association *asoc);
 
 int sctp_cmp_addr_exact(const union sctp_addr *ss1,
 			const union sctp_addr *ss2);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 525f97c467e..4a62888f2e4 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -444,15 +444,7 @@ void sctp_association_free(struct sctp_association *asoc)
 
 	asoc->peer.transport_count = 0;
 
-	/* Free any cached ASCONF_ACK chunk. */
-	sctp_assoc_free_asconf_acks(asoc);
-
-	/* Free the ASCONF queue. */
-	sctp_assoc_free_asconf_queue(asoc);
-
-	/* Free any cached ASCONF chunk. */
-	if (asoc->addip_last_asconf)
-		sctp_chunk_free(asoc->addip_last_asconf);
+	sctp_asconf_queue_teardown(asoc);
 
 	/* AUTH - Free the endpoint shared keys */
 	sctp_auth_destroy_keys(&asoc->endpoint_shared_keys);
@@ -1646,3 +1638,16 @@ struct sctp_chunk *sctp_assoc_lookup_asconf_ack(
 
 	return NULL;
 }
+
+void sctp_asconf_queue_teardown(struct sctp_association *asoc)
+{
+	/* Free any cached ASCONF_ACK chunk. */
+	sctp_assoc_free_asconf_acks(asoc);
+
+	/* Free the ASCONF queue. */
+	sctp_assoc_free_asconf_queue(asoc);
+
+	/* Free any cached ASCONF chunk. */
+	if (asoc->addip_last_asconf)
+		sctp_chunk_free(asoc->addip_last_asconf);
+}
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index d612ca1ca6c..534c2e5feb0 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1670,6 +1670,9 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 		case SCTP_CMD_SEND_NEXT_ASCONF:
 			sctp_cmd_send_asconf(asoc);
 			break;
+		case SCTP_CMD_PURGE_ASCONF_QUEUE:
+			sctp_asconf_queue_teardown(asoc);
+			break;
 		default:
 			pr_warn("Impossible command: %u, %p\n",
 				cmd->verb, cmd->obj.ptr);
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 7f4a4f8368e..a297283154d 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -1718,11 +1718,21 @@ static sctp_disposition_t sctp_sf_do_dupcook_a(const struct sctp_endpoint *ep,
 		return SCTP_DISPOSITION_CONSUME;
 	}
 
-	/* For now, fail any unsent/unacked data.  Consider the optional
-	 * choice of resending of this data.
+	/* For now, stop pending T3-rtx and SACK timers, fail any unsent/unacked
+	 * data. Consider the optional choice of resending of this data.
 	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_T3_RTX_TIMERS_STOP, SCTP_NULL());
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
 	sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_OUTQUEUE, SCTP_NULL());
 
+	/* Stop pending T4-rto timer, teardown ASCONF queue, ASCONF-ACK queue
+	 * and ASCONF-ACK cache.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+	sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_ASCONF_QUEUE, SCTP_NULL());
+
 	repl = sctp_make_cookie_ack(new_asoc, chunk);
 	if (!repl)
 		goto nomem;
-- 
cgit v1.2.3-70-g09d2


From 930a6eac9f40e692bd9670d89bcd9ac0f4019356 Mon Sep 17 00:00:00 2001
From: Alexey Khoroshilov <khoroshilov@ispras.ru>
Date: Mon, 30 May 2011 07:06:24 +0000
Subject: drivers/net/usb/catc.c: Fix potential deadlock in catc_ctrl_run()

catc_ctrl_run() calls usb_submit_urb() with GFP_KERNEL, while it is called from
catc_ctrl_async() and catc_ctrl_done() with catc->ctrl_lock spinlock held.

The patch replaces GFP_KERNEL with GFP_ATOMIC.

Found by Linux Driver Verification project (linuxtesting.org).

Signed-off-by: Alexey Khoroshilov <khoroshilov@ispras.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/catc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/usb/catc.c b/drivers/net/usb/catc.c
index d7221c4a5dc..8056f8a27c6 100644
--- a/drivers/net/usb/catc.c
+++ b/drivers/net/usb/catc.c
@@ -495,7 +495,7 @@ static void catc_ctrl_run(struct catc *catc)
 	if (!q->dir && q->buf && q->len)
 		memcpy(catc->ctrl_buf, q->buf, q->len);
 
-	if ((status = usb_submit_urb(catc->ctrl_urb, GFP_KERNEL)))
+	if ((status = usb_submit_urb(catc->ctrl_urb, GFP_ATOMIC)))
 		err("submit(ctrl_urb) status %d", status);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 948252cb9e01d65a89ecadf67be5018351eee15e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 31 May 2011 19:27:48 -0700
Subject: Revert "net: fix section mismatches"

This reverts commit e5cb966c0838e4da43a3b0751bdcac7fe719f7b4.

It causes new build regressions with gcc-4.2 which is
pretty common on non-x86 platforms.

Reported-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/3c509.c             | 14 ++++++-------
 drivers/net/3c59x.c             |  4 ++--
 drivers/net/depca.c             | 35 ++++++++++++++++----------------
 drivers/net/hp100.c             | 12 +++++------
 drivers/net/ibmlana.c           |  4 ++--
 drivers/net/irda/smsc-ircc2.c   | 44 ++++++++++++++++++++---------------------
 drivers/net/ne3210.c            | 15 ++++++--------
 drivers/net/smc-mca.c           |  6 +++---
 drivers/net/tokenring/madgemc.c |  2 +-
 drivers/net/tulip/de4x5.c       |  4 ++--
 10 files changed, 68 insertions(+), 72 deletions(-)

diff --git a/drivers/net/3c509.c b/drivers/net/3c509.c
index 5f25889e27e..44b28b2d700 100644
--- a/drivers/net/3c509.c
+++ b/drivers/net/3c509.c
@@ -185,7 +185,7 @@ static int max_interrupt_work = 10;
 static int nopnp;
 #endif
 
-static int el3_common_init(struct net_device *dev);
+static int __devinit el3_common_init(struct net_device *dev);
 static void el3_common_remove(struct net_device *dev);
 static ushort id_read_eeprom(int index);
 static ushort read_eeprom(int ioaddr, int index);
@@ -395,7 +395,7 @@ static struct isa_driver el3_isa_driver = {
 static int isa_registered;
 
 #ifdef CONFIG_PNP
-static const struct pnp_device_id el3_pnp_ids[] __devinitconst = {
+static struct pnp_device_id el3_pnp_ids[] = {
 	{ .id = "TCM5090" }, /* 3Com Etherlink III (TP) */
 	{ .id = "TCM5091" }, /* 3Com Etherlink III */
 	{ .id = "TCM5094" }, /* 3Com Etherlink III (combo) */
@@ -478,7 +478,7 @@ static int pnp_registered;
 #endif /* CONFIG_PNP */
 
 #ifdef CONFIG_EISA
-static const struct eisa_device_id el3_eisa_ids[] __devinitconst = {
+static struct eisa_device_id el3_eisa_ids[] = {
 		{ "TCM5090" },
 		{ "TCM5091" },
 		{ "TCM5092" },
@@ -508,7 +508,7 @@ static int eisa_registered;
 #ifdef CONFIG_MCA
 static int el3_mca_probe(struct device *dev);
 
-static const short el3_mca_adapter_ids[] __devinitconst = {
+static short el3_mca_adapter_ids[] __initdata = {
 		0x627c,
 		0x627d,
 		0x62db,
@@ -517,7 +517,7 @@ static const short el3_mca_adapter_ids[] __devinitconst = {
 		0x0000
 };
 
-static const char *const el3_mca_adapter_names[] __devinitconst = {
+static char *el3_mca_adapter_names[] __initdata = {
 		"3Com 3c529 EtherLink III (10base2)",
 		"3Com 3c529 EtherLink III (10baseT)",
 		"3Com 3c529 EtherLink III (test mode)",
@@ -601,7 +601,7 @@ static void el3_common_remove (struct net_device *dev)
 }
 
 #ifdef CONFIG_MCA
-static int __devinit el3_mca_probe(struct device *device)
+static int __init el3_mca_probe(struct device *device)
 {
 	/* Based on Erik Nygren's (nygren@mit.edu) 3c529 patch,
 	 * heavily modified by Chris Beauregard
@@ -671,7 +671,7 @@ static int __devinit el3_mca_probe(struct device *device)
 #endif /* CONFIG_MCA */
 
 #ifdef CONFIG_EISA
-static int __devinit el3_eisa_probe (struct device *device)
+static int __init el3_eisa_probe (struct device *device)
 {
 	short i;
 	int ioaddr, irq, if_port;
diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c
index 99f43d27544..8cc22568ebd 100644
--- a/drivers/net/3c59x.c
+++ b/drivers/net/3c59x.c
@@ -901,14 +901,14 @@ static const struct dev_pm_ops vortex_pm_ops = {
 #endif /* !CONFIG_PM */
 
 #ifdef CONFIG_EISA
-static const struct eisa_device_id vortex_eisa_ids[] __devinitconst = {
+static struct eisa_device_id vortex_eisa_ids[] = {
 	{ "TCM5920", CH_3C592 },
 	{ "TCM5970", CH_3C597 },
 	{ "" }
 };
 MODULE_DEVICE_TABLE(eisa, vortex_eisa_ids);
 
-static int __devinit vortex_eisa_probe(struct device *device)
+static int __init vortex_eisa_probe(struct device *device)
 {
 	void __iomem *ioaddr;
 	struct eisa_device *edev;
diff --git a/drivers/net/depca.c b/drivers/net/depca.c
index 17654059922..8b0084d17c8 100644
--- a/drivers/net/depca.c
+++ b/drivers/net/depca.c
@@ -331,18 +331,18 @@ static struct {
                          "DE422",\
                          ""}
 
-static const char* const depca_signature[] __devinitconst = DEPCA_SIGNATURE;
+static char* __initdata depca_signature[] = DEPCA_SIGNATURE;
 
 enum depca_type {
 	DEPCA, de100, de101, de200, de201, de202, de210, de212, de422, unknown
 };
 
-static const char depca_string[] = "depca";
+static char depca_string[] = "depca";
 
 static int depca_device_remove (struct device *device);
 
 #ifdef CONFIG_EISA
-static const struct eisa_device_id depca_eisa_ids[] __devinitconst = {
+static struct eisa_device_id depca_eisa_ids[] = {
 	{ "DEC4220", de422 },
 	{ "" }
 };
@@ -367,19 +367,19 @@ static struct eisa_driver depca_eisa_driver = {
 #define DE210_ID 0x628d
 #define DE212_ID 0x6def
 
-static const short depca_mca_adapter_ids[] __devinitconst = {
+static short depca_mca_adapter_ids[] = {
 	DE210_ID,
 	DE212_ID,
 	0x0000
 };
 
-static const char *depca_mca_adapter_name[] = {
+static char *depca_mca_adapter_name[] = {
 	"DEC EtherWORKS MC Adapter (DE210)",
 	"DEC EtherWORKS MC Adapter (DE212)",
 	NULL
 };
 
-static const enum depca_type depca_mca_adapter_type[] = {
+static enum depca_type depca_mca_adapter_type[] = {
 	de210,
 	de212,
 	0
@@ -541,9 +541,10 @@ static void SetMulticastFilter(struct net_device *dev);
 static int load_packet(struct net_device *dev, struct sk_buff *skb);
 static void depca_dbg_open(struct net_device *dev);
 
-static const u_char de1xx_irq[] __devinitconst = { 2, 3, 4, 5, 7, 9, 0 };
-static const u_char de2xx_irq[] __devinitconst = { 5, 9, 10, 11, 15, 0 };
-static const u_char de422_irq[] __devinitconst = { 5, 9, 10, 11, 0 };
+static u_char de1xx_irq[] __initdata = { 2, 3, 4, 5, 7, 9, 0 };
+static u_char de2xx_irq[] __initdata = { 5, 9, 10, 11, 15, 0 };
+static u_char de422_irq[] __initdata = { 5, 9, 10, 11, 0 };
+static u_char *depca_irq;
 
 static int irq;
 static int io;
@@ -579,7 +580,7 @@ static const struct net_device_ops depca_netdev_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
-static int __devinit depca_hw_init (struct net_device *dev, struct device *device)
+static int __init depca_hw_init (struct net_device *dev, struct device *device)
 {
 	struct depca_private *lp;
 	int i, j, offset, netRAM, mem_len, status = 0;
@@ -747,7 +748,6 @@ static int __devinit depca_hw_init (struct net_device *dev, struct device *devic
 	if (dev->irq < 2) {
 		unsigned char irqnum;
 		unsigned long irq_mask, delay;
-		const u_char *depca_irq;
 
 		irq_mask = probe_irq_on();
 
@@ -770,7 +770,6 @@ static int __devinit depca_hw_init (struct net_device *dev, struct device *devic
 			break;
 
 		default:
-			depca_irq = NULL;
 			break;	/* Not reached */
 		}
 
@@ -1303,7 +1302,7 @@ static void SetMulticastFilter(struct net_device *dev)
 	}
 }
 
-static int __devinit depca_common_init (u_long ioaddr, struct net_device **devp)
+static int __init depca_common_init (u_long ioaddr, struct net_device **devp)
 {
 	int status = 0;
 
@@ -1334,7 +1333,7 @@ static int __devinit depca_common_init (u_long ioaddr, struct net_device **devp)
 /*
 ** Microchannel bus I/O device probe
 */
-static int __devinit depca_mca_probe(struct device *device)
+static int __init depca_mca_probe(struct device *device)
 {
 	unsigned char pos[2];
 	unsigned char where;
@@ -1458,7 +1457,7 @@ static int __devinit depca_mca_probe(struct device *device)
 ** ISA bus I/O device probe
 */
 
-static void __devinit depca_platform_probe (void)
+static void __init depca_platform_probe (void)
 {
 	int i;
 	struct platform_device *pldev;
@@ -1498,7 +1497,7 @@ static void __devinit depca_platform_probe (void)
 	}
 }
 
-static enum depca_type __devinit depca_shmem_probe (ulong *mem_start)
+static enum depca_type __init depca_shmem_probe (ulong *mem_start)
 {
 	u_long mem_base[] = DEPCA_RAM_BASE_ADDRESSES;
 	enum depca_type adapter = unknown;
@@ -1559,7 +1558,7 @@ static int __devinit depca_isa_probe (struct platform_device *device)
 */
 
 #ifdef CONFIG_EISA
-static int __devinit depca_eisa_probe (struct device *device)
+static int __init depca_eisa_probe (struct device *device)
 {
 	enum depca_type adapter = unknown;
 	struct eisa_device *edev;
@@ -1630,7 +1629,7 @@ static int __devexit depca_device_remove (struct device *device)
 ** and Boot (readb) ROM. This will also give us a clue to the network RAM
 ** base address.
 */
-static int __devinit DepcaSignature(char *name, u_long base_addr)
+static int __init DepcaSignature(char *name, u_long base_addr)
 {
 	u_int i, j, k;
 	void __iomem *ptr;
diff --git a/drivers/net/hp100.c b/drivers/net/hp100.c
index c52a1df5d92..8e10d2f6a5a 100644
--- a/drivers/net/hp100.c
+++ b/drivers/net/hp100.c
@@ -188,14 +188,14 @@ struct hp100_private {
  *  variables
  */
 #ifdef CONFIG_ISA
-static const char *const hp100_isa_tbl[] __devinitconst = {
+static const char *hp100_isa_tbl[] = {
 	"HWPF150", /* HP J2573 rev A */
 	"HWP1950", /* HP J2573 */
 };
 #endif
 
 #ifdef CONFIG_EISA
-static const struct eisa_device_id hp100_eisa_tbl[] __devinitconst = {
+static struct eisa_device_id hp100_eisa_tbl[] = {
 	{ "HWPF180" }, /* HP J2577 rev A */
 	{ "HWP1920" }, /* HP 27248B */
 	{ "HWP1940" }, /* HP J2577 */
@@ -336,7 +336,7 @@ static __devinit const char *hp100_read_id(int ioaddr)
 }
 
 #ifdef CONFIG_ISA
-static __devinit int hp100_isa_probe1(struct net_device *dev, int ioaddr)
+static __init int hp100_isa_probe1(struct net_device *dev, int ioaddr)
 {
 	const char *sig;
 	int i;
@@ -372,7 +372,7 @@ static __devinit int hp100_isa_probe1(struct net_device *dev, int ioaddr)
  * EISA and PCI are handled by device infrastructure.
  */
 
-static int  __devinit hp100_isa_probe(struct net_device *dev, int addr)
+static int  __init hp100_isa_probe(struct net_device *dev, int addr)
 {
 	int err = -ENODEV;
 
@@ -396,7 +396,7 @@ static int  __devinit hp100_isa_probe(struct net_device *dev, int addr)
 #endif /* CONFIG_ISA */
 
 #if !defined(MODULE) && defined(CONFIG_ISA)
-struct net_device * __devinit hp100_probe(int unit)
+struct net_device * __init hp100_probe(int unit)
 {
 	struct net_device *dev = alloc_etherdev(sizeof(struct hp100_private));
 	int err;
@@ -2843,7 +2843,7 @@ static void cleanup_dev(struct net_device *d)
 }
 
 #ifdef CONFIG_EISA
-static int __devinit hp100_eisa_probe (struct device *gendev)
+static int __init hp100_eisa_probe (struct device *gendev)
 {
 	struct net_device *dev = alloc_etherdev(sizeof(struct hp100_private));
 	struct eisa_device *edev = to_eisa_device(gendev);
diff --git a/drivers/net/ibmlana.c b/drivers/net/ibmlana.c
index 136d7544cc3..a7d6cad3295 100644
--- a/drivers/net/ibmlana.c
+++ b/drivers/net/ibmlana.c
@@ -895,12 +895,12 @@ static int ibmlana_irq;
 static int ibmlana_io;
 static int startslot;		/* counts through slots when probing multiple devices */
 
-static const short ibmlana_adapter_ids[] __devinitconst = {
+static short ibmlana_adapter_ids[] __initdata = {
 	IBM_LANA_ID,
 	0x0000
 };
 
-static const char *const ibmlana_adapter_names[] __devinitconst = {
+static char *ibmlana_adapter_names[] __devinitdata = {
 	"IBM LAN Adapter/A",
 	NULL
 };
diff --git a/drivers/net/irda/smsc-ircc2.c b/drivers/net/irda/smsc-ircc2.c
index 69b5707db36..8800e1fe412 100644
--- a/drivers/net/irda/smsc-ircc2.c
+++ b/drivers/net/irda/smsc-ircc2.c
@@ -222,19 +222,19 @@ static void smsc_ircc_set_transceiver_for_speed(struct smsc_ircc_cb *self, u32 s
 static void smsc_ircc_sir_wait_hw_transmitter_finish(struct smsc_ircc_cb *self);
 
 /* Probing */
-static int smsc_ircc_look_for_chips(void);
-static const struct smsc_chip * smsc_ircc_probe(unsigned short cfg_base, u8 reg, const struct smsc_chip *chip, char *type);
-static int smsc_superio_flat(const struct smsc_chip *chips, unsigned short cfg_base, char *type);
-static int smsc_superio_paged(const struct smsc_chip *chips, unsigned short cfg_base, char *type);
-static int smsc_superio_fdc(unsigned short cfg_base);
-static int smsc_superio_lpc(unsigned short cfg_base);
+static int __init smsc_ircc_look_for_chips(void);
+static const struct smsc_chip * __init smsc_ircc_probe(unsigned short cfg_base, u8 reg, const struct smsc_chip *chip, char *type);
+static int __init smsc_superio_flat(const struct smsc_chip *chips, unsigned short cfg_base, char *type);
+static int __init smsc_superio_paged(const struct smsc_chip *chips, unsigned short cfg_base, char *type);
+static int __init smsc_superio_fdc(unsigned short cfg_base);
+static int __init smsc_superio_lpc(unsigned short cfg_base);
 #ifdef CONFIG_PCI
-static int preconfigure_smsc_chip(struct smsc_ircc_subsystem_configuration *conf);
-static int preconfigure_through_82801(struct pci_dev *dev, struct smsc_ircc_subsystem_configuration *conf);
-static void preconfigure_ali_port(struct pci_dev *dev,
+static int __init preconfigure_smsc_chip(struct smsc_ircc_subsystem_configuration *conf);
+static int __init preconfigure_through_82801(struct pci_dev *dev, struct smsc_ircc_subsystem_configuration *conf);
+static void __init preconfigure_ali_port(struct pci_dev *dev,
 					 unsigned short port);
-static int preconfigure_through_ali(struct pci_dev *dev, struct smsc_ircc_subsystem_configuration *conf);
-static int smsc_ircc_preconfigure_subsystems(unsigned short ircc_cfg,
+static int __init preconfigure_through_ali(struct pci_dev *dev, struct smsc_ircc_subsystem_configuration *conf);
+static int __init smsc_ircc_preconfigure_subsystems(unsigned short ircc_cfg,
 						    unsigned short ircc_fir,
 						    unsigned short ircc_sir,
 						    unsigned char ircc_dma,
@@ -366,7 +366,7 @@ static inline void register_bank(int iobase, int bank)
 }
 
 /* PNP hotplug support */
-static const struct pnp_device_id smsc_ircc_pnp_table[] __devinitconst = {
+static const struct pnp_device_id smsc_ircc_pnp_table[] = {
 	{ .id = "SMCf010", .driver_data = 0 },
 	/* and presumably others */
 	{ }
@@ -515,7 +515,7 @@ static const struct net_device_ops smsc_ircc_netdev_ops = {
  *    Try to open driver instance
  *
  */
-static int __devinit smsc_ircc_open(unsigned int fir_base, unsigned int sir_base, u8 dma, u8 irq)
+static int __init smsc_ircc_open(unsigned int fir_base, unsigned int sir_base, u8 dma, u8 irq)
 {
 	struct smsc_ircc_cb *self;
 	struct net_device *dev;
@@ -2273,7 +2273,7 @@ static int __init smsc_superio_paged(const struct smsc_chip *chips, unsigned sho
 }
 
 
-static int __devinit smsc_access(unsigned short cfg_base, unsigned char reg)
+static int __init smsc_access(unsigned short cfg_base, unsigned char reg)
 {
 	IRDA_DEBUG(1, "%s\n", __func__);
 
@@ -2281,7 +2281,7 @@ static int __devinit smsc_access(unsigned short cfg_base, unsigned char reg)
 	return inb(cfg_base) != reg ? -1 : 0;
 }
 
-static const struct smsc_chip * __devinit smsc_ircc_probe(unsigned short cfg_base, u8 reg, const struct smsc_chip *chip, char *type)
+static const struct smsc_chip * __init smsc_ircc_probe(unsigned short cfg_base, u8 reg, const struct smsc_chip *chip, char *type)
 {
 	u8 devid, xdevid, rev;
 
@@ -2406,7 +2406,7 @@ static int __init smsc_superio_lpc(unsigned short cfg_base)
 #ifdef CONFIG_PCI
 #define PCIID_VENDOR_INTEL 0x8086
 #define PCIID_VENDOR_ALI 0x10b9
-static const struct smsc_ircc_subsystem_configuration subsystem_configurations[] __devinitconst = {
+static struct smsc_ircc_subsystem_configuration subsystem_configurations[] __initdata = {
 	/*
 	 * Subsystems needing entries:
 	 * 0x10b9:0x1533 0x103c:0x0850 HP nx9010 family
@@ -2532,7 +2532,7 @@ static const struct smsc_ircc_subsystem_configuration subsystem_configurations[]
  * (FIR port, SIR port, FIR DMA, FIR IRQ)
  * through the chip configuration port.
  */
-static int __devinit preconfigure_smsc_chip(struct
+static int __init preconfigure_smsc_chip(struct
 					 smsc_ircc_subsystem_configuration
 					 *conf)
 {
@@ -2633,7 +2633,7 @@ static int __devinit preconfigure_smsc_chip(struct
  * or Intel 82801DB/DBL (ICH4/ICH4-L) LPC Interface Bridge.
  * They all work the same way!
  */
-static int __devinit preconfigure_through_82801(struct pci_dev *dev,
+static int __init preconfigure_through_82801(struct pci_dev *dev,
 					     struct
 					     smsc_ircc_subsystem_configuration
 					     *conf)
@@ -2786,7 +2786,7 @@ static int __devinit preconfigure_through_82801(struct pci_dev *dev,
  * This is based on reverse-engineering since ALi does not
  * provide any data sheet for the 1533 chip.
  */
-static void __devinit preconfigure_ali_port(struct pci_dev *dev,
+static void __init preconfigure_ali_port(struct pci_dev *dev,
 					 unsigned short port)
 {
 	unsigned char reg;
@@ -2824,7 +2824,7 @@ static void __devinit preconfigure_ali_port(struct pci_dev *dev,
 	IRDA_MESSAGE("Activated ALi 1533 ISA bridge port 0x%04x.\n", port);
 }
 
-static int __devinit preconfigure_through_ali(struct pci_dev *dev,
+static int __init preconfigure_through_ali(struct pci_dev *dev,
 					   struct
 					   smsc_ircc_subsystem_configuration
 					   *conf)
@@ -2837,7 +2837,7 @@ static int __devinit preconfigure_through_ali(struct pci_dev *dev,
 	return preconfigure_smsc_chip(conf);
 }
 
-static int __devinit smsc_ircc_preconfigure_subsystems(unsigned short ircc_cfg,
+static int __init smsc_ircc_preconfigure_subsystems(unsigned short ircc_cfg,
 						    unsigned short ircc_fir,
 						    unsigned short ircc_sir,
 						    unsigned char ircc_dma,
@@ -2849,7 +2849,7 @@ static int __devinit smsc_ircc_preconfigure_subsystems(unsigned short ircc_cfg,
 	int ret = 0;
 
 	for_each_pci_dev(dev) {
-		const struct smsc_ircc_subsystem_configuration *conf;
+		struct smsc_ircc_subsystem_configuration *conf;
 
 		/*
 		 * Cache the subsystem vendor/device:
diff --git a/drivers/net/ne3210.c b/drivers/net/ne3210.c
index e8984b0ca52..243ed2aee88 100644
--- a/drivers/net/ne3210.c
+++ b/drivers/net/ne3210.c
@@ -80,20 +80,17 @@ static void ne3210_block_output(struct net_device *dev, int count, const unsigne
 
 #define NE3210_DEBUG	0x0
 
-static const unsigned char irq_map[] __devinitconst =
-	{ 15, 12, 11, 10, 9, 7, 5, 3 };
-static const unsigned int shmem_map[] __devinitconst =
-	{ 0xff0, 0xfe0, 0xfff0, 0xd8, 0xffe0, 0xffc0, 0xd0, 0x0 };
-static const char *const ifmap[] __devinitconst =
-	{ "UTP", "?", "BNC", "AUI" };
-static const int ifmap_val[] __devinitconst = {
+static unsigned char irq_map[] __initdata = {15, 12, 11, 10, 9, 7, 5, 3};
+static unsigned int shmem_map[] __initdata = {0xff0, 0xfe0, 0xfff0, 0xd8, 0xffe0, 0xffc0, 0xd0, 0x0};
+static const char *ifmap[] __initdata = {"UTP", "?", "BNC", "AUI"};
+static int ifmap_val[] __initdata = {
 		IF_PORT_10BASET,
 		IF_PORT_UNKNOWN,
 		IF_PORT_10BASE2,
 		IF_PORT_AUI,
 };
 
-static int __devinit ne3210_eisa_probe (struct device *device)
+static int __init ne3210_eisa_probe (struct device *device)
 {
 	unsigned long ioaddr, phys_mem;
 	int i, retval, port_index;
@@ -316,7 +313,7 @@ static void ne3210_block_output(struct net_device *dev, int count,
 	memcpy_toio(shmem, buf, count);
 }
 
-static const struct eisa_device_id ne3210_ids[] __devinitconst = {
+static struct eisa_device_id ne3210_ids[] = {
 	{ "EGL0101" },
 	{ "NVL1801" },
 	{ "" },
diff --git a/drivers/net/smc-mca.c b/drivers/net/smc-mca.c
index 0f29f261fcf..d07c39cb4da 100644
--- a/drivers/net/smc-mca.c
+++ b/drivers/net/smc-mca.c
@@ -156,7 +156,7 @@ static const struct {
    { 14, 15 }
 };
 
-static const short smc_mca_adapter_ids[] __devinitconst = {
+static short smc_mca_adapter_ids[] __initdata = {
 	0x61c8,
 	0x61c9,
 	0x6fc0,
@@ -168,7 +168,7 @@ static const short smc_mca_adapter_ids[] __devinitconst = {
 	0x0000
 };
 
-static const char *const smc_mca_adapter_names[] __devinitconst = {
+static char *smc_mca_adapter_names[] __initdata = {
 	"SMC Ethercard PLUS Elite/A BNC/AUI (WD8013EP/A)",
 	"SMC Ethercard PLUS Elite/A UTP/AUI (WD8013WP/A)",
 	"WD Ethercard PLUS/A (WD8003E/A or WD8003ET/A)",
@@ -199,7 +199,7 @@ static const struct net_device_ops ultramca_netdev_ops = {
 #endif
 };
 
-static int __devinit ultramca_probe(struct device *gen_dev)
+static int __init ultramca_probe(struct device *gen_dev)
 {
 	unsigned short ioaddr;
 	struct net_device *dev;
diff --git a/drivers/net/tokenring/madgemc.c b/drivers/net/tokenring/madgemc.c
index 1313aa1315f..2bedc0ace81 100644
--- a/drivers/net/tokenring/madgemc.c
+++ b/drivers/net/tokenring/madgemc.c
@@ -727,7 +727,7 @@ static int __devexit madgemc_remove(struct device *device)
 	return 0;
 }
 
-static const short madgemc_adapter_ids[] __devinitconst = {
+static short madgemc_adapter_ids[] __initdata = {
 	0x002d,
 	0x0000
 };
diff --git a/drivers/net/tulip/de4x5.c b/drivers/net/tulip/de4x5.c
index 45144d5bd11..efaa1d69b72 100644
--- a/drivers/net/tulip/de4x5.c
+++ b/drivers/net/tulip/de4x5.c
@@ -1995,7 +1995,7 @@ SetMulticastFilter(struct net_device *dev)
 
 static u_char de4x5_irq[] = EISA_ALLOWED_IRQ_LIST;
 
-static int __devinit de4x5_eisa_probe (struct device *gendev)
+static int __init de4x5_eisa_probe (struct device *gendev)
 {
 	struct eisa_device *edev;
 	u_long iobase;
@@ -2097,7 +2097,7 @@ static int __devexit de4x5_eisa_remove (struct device *device)
 	return 0;
 }
 
-static const struct eisa_device_id de4x5_eisa_ids[] __devinitconst = {
+static struct eisa_device_id de4x5_eisa_ids[] = {
         { "DEC4250", 0 },	/* 0 is the board name index... */
         { "" }
 };
-- 
cgit v1.2.3-70-g09d2


From a5b2c5b2ad5853591a6cac6134cd0f599a720865 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees.cook@canonical.com>
Date: Tue, 31 May 2011 11:31:41 -0700
Subject: AppArmor: fix oops in apparmor_setprocattr

When invalid parameters are passed to apparmor_setprocattr a NULL deref
oops occurs when it tries to record an audit message. This is because
it is passing NULL for the profile parameter for aa_audit. But aa_audit
now requires that the profile passed is not NULL.

Fix this by passing the current profile on the task that is trying to
setprocattr.

Signed-off-by: Kees Cook <kees@ubuntu.com>
Signed-off-by: John Johansen <john.johansen@canonical.com>
Cc: stable@kernel.org
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/apparmor/lsm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index ae3a698415e..ec1bcecf2cd 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -593,7 +593,8 @@ static int apparmor_setprocattr(struct task_struct *task, char *name,
 			sa.aad.op = OP_SETPROCATTR;
 			sa.aad.info = name;
 			sa.aad.error = -EINVAL;
-			return aa_audit(AUDIT_APPARMOR_DENIED, NULL, GFP_KERNEL,
+			return aa_audit(AUDIT_APPARMOR_DENIED,
+					__aa_current_profile(), GFP_KERNEL,
 					&sa, NULL);
 		}
 	} else if (strcmp(name, "exec") == 0) {
-- 
cgit v1.2.3-70-g09d2


From 4c49ff3fe128ca68dabd07537415c419ad7f82f9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 1 Jun 2011 08:27:41 +0200
Subject: block: blkdev_get() should access ->bd_disk only after success

d4dc210f69 (block: don't block events on excl write for non-optical
devices) added dereferencing of bdev->bd_disk to test
GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; however, bdev->bd_disk can be
%NULL if open failed which can lead to an oops.

Test the flag after testing open was successful, not before.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: David Miller <davem@davemloft.net>
Tested-by: David Miller <davem@davemloft.net>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/block_dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1f2b1997833..1a2421f908f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1272,8 +1272,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 		 * individual writeable reference is too fragile given the
 		 * way @mode is used in blkdev_get/put().
 		 */
-		if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) &&
-		    !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+		if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+		    (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
 			bdev->bd_write_holder = true;
 			disk_block_events(disk);
 		}
-- 
cgit v1.2.3-70-g09d2


From 603d04b2010976a52f62b7633f9999d104046900 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Sat, 28 May 2011 10:04:25 -0400
Subject: kgdbts: only use new asm-generic/ptrace.h api when needed

The new instruction_pointer_set helper is defined for people who have
converted to asm-generic/ptrace.h, so don't use it generally unless
the arch needs it (in which case it has been converted).  This should
fix building of kgdb tests for arches not yet converted.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/misc/kgdbts.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c
index b0c56313dbb..8cebec5e85e 100644
--- a/drivers/misc/kgdbts.c
+++ b/drivers/misc/kgdbts.c
@@ -304,7 +304,10 @@ static int check_and_rewind_pc(char *put_str, char *arg)
 		return 1;
 	}
 	/* Readjust the instruction pointer if needed */
-	instruction_pointer_set(&kgdbts_regs, ip + offset);
+	ip += offset;
+#ifdef GDB_ADJUSTS_BREAK_OFFSET
+	instruction_pointer_set(&kgdbts_regs, ip);
+#endif
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From ab75950b11e74145ffe61376ac073d56645aab8a Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 26 May 2011 06:51:48 +0300
Subject: UBIFS: supress false error messages

Commit ab51afe05273741f72383529ef488aa1ea598ec6 was a good clean-up, but
it introduced a regression - now UBIFS prints scary error messages during
recovery on all corrupted nodes, even though the corruptions are expected
(due to a power cut). This patch fixes the issue.

Additionally fix a typo in a commentary introduced by the same commit.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/recovery.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 731d9e2e7b5..95e24183b71 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -635,7 +635,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 		 * Scan quietly until there is an error from which we cannot
 		 * recover
 		 */
-		ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
+		ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
 		if (ret == SCANNED_A_NODE) {
 			/* A valid node, and not a padding node */
 			struct ubifs_ch *ch = buf;
@@ -701,7 +701,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 	 * While we are in the middle of the same min. I/O unit keep dropping
 	 * nodes. So basically, what we want is to make sure that the last min.
 	 * I/O unit where we saw the corruption is dropped completely with all
-	 * the uncorrupted node which may possibly sit there.
+	 * the uncorrupted nodes which may possibly sit there.
 	 *
 	 * In other words, let's name the min. I/O unit where the corruption
 	 * starts B, and the previous min. I/O unit A. The below code tries to
-- 
cgit v1.2.3-70-g09d2


From 1a0b06997ceca96db9259e537eb935f9fe59a3de Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 26 May 2011 08:26:05 +0300
Subject: UBIFS: introduce a "grouped" journal head flag

Journal heads are different in a way how UBIFS writes nodes there. All normal
journal heads receive grouped nodes, while the GC journal heads receives
ungrouped nodes. This patch adds a 'grouped' flag to 'struct ubifs_jhead' which
describes this property.

This patch is a preparation to a further recovery fix.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 5 ++++-
 fs/ubifs/ubifs.h | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1ab0d22e4c9..1e40db740da 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -811,15 +811,18 @@ static int alloc_wbufs(struct ubifs_info *c)
 
 		c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
 		c->jheads[i].wbuf.jhead = i;
+		c->jheads[i].grouped = 1;
 	}
 
 	c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM;
 	/*
 	 * Garbage Collector head likely contains long-term data and
-	 * does not need to be synchronized by timer.
+	 * does not need to be synchronized by timer. Also GC head nodes are
+	 * not grouped.
 	 */
 	c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
 	c->jheads[GCHD].wbuf.no_timer = 1;
+	c->jheads[GCHD].grouped = 0;
 
 	return 0;
 }
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a70d7b4ffb2..adeca14c0e9 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -722,12 +722,14 @@ struct ubifs_bud {
  * struct ubifs_jhead - journal head.
  * @wbuf: head's write-buffer
  * @buds_list: list of bud LEBs belonging to this journal head
+ * @grouped: non-zero if UBIFS groups nodes when writing to this journal head
  *
  * Note, the @buds list is protected by the @c->buds_lock.
  */
 struct ubifs_jhead {
 	struct ubifs_wbuf wbuf;
 	struct list_head buds_list;
+	unsigned int grouped:1;
 };
 
 /**
-- 
cgit v1.2.3-70-g09d2


From efcfde54ca68091b164f9aec544c7233a9760aff Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 26 May 2011 08:36:52 +0300
Subject: UBIFS: amend ubifs_recover_leb interface

Instead of passing "grouped" parameter to 'ubifs_recover_leb()' which tells
whether the nodes are grouped in the LEB to recover, pass the journal head
number and let 'ubifs_recover_leb()' look at the journal head's 'grouped' flag.

This patch is a preparation to a further fix where we'll need to know the
journal head number for other purposes.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/orphan.c   |  2 +-
 fs/ubifs/recovery.c | 10 ++++++----
 fs/ubifs/replay.c   |  3 +--
 fs/ubifs/ubifs.h    |  2 +-
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index bd644bf587a..a5422fffbd6 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -674,7 +674,7 @@ static int kill_orphans(struct ubifs_info *c)
 		if (IS_ERR(sleb)) {
 			if (PTR_ERR(sleb) == -EUCLEAN)
 				sleb = ubifs_recover_leb(c, lnum, 0,
-							 c->sbuf, 0);
+							 c->sbuf, -1);
 			if (IS_ERR(sleb)) {
 				err = PTR_ERR(sleb);
 				break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 95e24183b71..6adb5328a01 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -604,7 +604,8 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
  * @lnum: LEB number
  * @offs: offset
  * @sbuf: LEB-sized buffer to use
- * @grouped: nodes may be grouped for recovery
+ * @jhead: journal head number this LEB belongs to (%-1 if the LEB does not
+ *         belong to any journal head)
  *
  * This function does a scan of a LEB, but caters for errors that might have
  * been caused by the unclean unmount from which we are attempting to recover.
@@ -612,13 +613,14 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
  * found, and a negative error code in case of failure.
  */
 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
-					 int offs, void *sbuf, int grouped)
+					 int offs, void *sbuf, int jhead)
 {
 	int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
+	int grouped = jhead == -1 ? 0 : c->jheads[jhead].grouped;
 	struct ubifs_scan_leb *sleb;
 	void *buf = sbuf + offs;
 
-	dbg_rcvry("%d:%d", lnum, offs);
+	dbg_rcvry("%d:%d, jhead %d, grouped %d", lnum, offs, jhead, grouped);
 
 	sleb = ubifs_start_scan(c, lnum, offs, sbuf);
 	if (IS_ERR(sleb))
@@ -881,7 +883,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
 		}
 		ubifs_scan_destroy(sleb);
 	}
-	return ubifs_recover_leb(c, lnum, offs, sbuf, 0);
+	return ubifs_recover_leb(c, lnum, offs, sbuf, -1);
 }
 
 /**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 6617280d167..5e97161ce4d 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -557,8 +557,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
 		 * these LEBs could possibly be written to at the power cut
 		 * time.
 		 */
-		sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf,
-					 b->bud->jhead != GCHD);
+		sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, b->bud->jhead);
 	else
 		sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
 	if (IS_ERR(sleb))
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index adeca14c0e9..f79983d6f86 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1744,7 +1744,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
 int ubifs_recover_master_node(struct ubifs_info *c);
 int ubifs_write_rcvrd_mst_node(struct ubifs_info *c);
 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
-					 int offs, void *sbuf, int grouped);
+					 int offs, void *sbuf, int jhead);
 struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
 					     int offs, void *sbuf);
 int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf);
-- 
cgit v1.2.3-70-g09d2


From da8b94ea61c5d80aae0cc7b7541f1e0fa7459391 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 26 May 2011 08:58:19 +0300
Subject: UBIFS: fix recovery broken by the previous recovery fix

Unfortunately, the recovery fix d1606a59b6be4ea392eabd40d1250aa1eeb19efb
(UBIFS: fix extremely rare mount failure) broke recovery. This commit make
UBIFS drop the last min. I/O unit in all journal heads, but this is needed only
for the GC head. And this does not work for non-GC heads. For example, if
suppose we have min. I/O units A and B, and A contains a valid node X, which
was fsynced, and then a group of nodes Y which spans the rest of A and B. In
this case we'll drop not only Y, but also X, which is obviously incorrect.

This patch fixes the issue and additionally makes recovery to drop last min.
I/O unit only for the GC head, and leave things as they have been for ages for
the other heads - this is safer.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/recovery.c | 152 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 87 insertions(+), 65 deletions(-)

diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 6adb5328a01..783d8e0beb7 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,19 +564,15 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 }
 
 /**
- * drop_last_node - drop the last node or group of nodes.
+ * drop_last_group - drop the last group of nodes.
  * @sleb: scanned LEB information
  * @offs: offset of dropped nodes is returned here
- * @grouped: non-zero if whole group of nodes have to be dropped
  *
  * This is a helper function for 'ubifs_recover_leb()' which drops the last
- * node of the scanned LEB or the last group of nodes if @grouped is not zero.
- * This function returns %1 if a node was dropped and %0 otherwise.
+ * group of nodes of the scanned LEB.
  */
-static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
+static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
 {
-	int dropped = 0;
-
 	while (!list_empty(&sleb->nodes)) {
 		struct ubifs_scan_node *snod;
 		struct ubifs_ch *ch;
@@ -585,17 +581,40 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
 				  list);
 		ch = snod->node;
 		if (ch->group_type != UBIFS_IN_NODE_GROUP)
-			return dropped;
-		dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs);
+			break;
+
+		dbg_rcvry("dropping grouped node at %d:%d",
+			  sleb->lnum, snod->offs);
+		*offs = snod->offs;
+		list_del(&snod->list);
+		kfree(snod);
+		sleb->nodes_cnt -= 1;
+	}
+}
+
+/**
+ * drop_last_node - drop the last node.
+ * @sleb: scanned LEB information
+ * @offs: offset of dropped nodes is returned here
+ * @grouped: non-zero if whole group of nodes have to be dropped
+ *
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB.
+ */
+static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
+{
+	struct ubifs_scan_node *snod;
+
+	if (!list_empty(&sleb->nodes)) {
+		snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+				  list);
+
+		dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs);
 		*offs = snod->offs;
 		list_del(&snod->list);
 		kfree(snod);
 		sleb->nodes_cnt -= 1;
-		dropped = 1;
-		if (!grouped)
-			break;
 	}
-	return dropped;
 }
 
 /**
@@ -697,59 +716,62 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 		 * If nodes are grouped, always drop the incomplete group at
 		 * the end.
 		 */
-		drop_last_node(sleb, &offs, 1);
+		drop_last_group(sleb, &offs);
 
-	/*
-	 * While we are in the middle of the same min. I/O unit keep dropping
-	 * nodes. So basically, what we want is to make sure that the last min.
-	 * I/O unit where we saw the corruption is dropped completely with all
-	 * the uncorrupted nodes which may possibly sit there.
-	 *
-	 * In other words, let's name the min. I/O unit where the corruption
-	 * starts B, and the previous min. I/O unit A. The below code tries to
-	 * deal with a situation when half of B contains valid nodes or the end
-	 * of a valid node, and the second half of B contains corrupted data or
-	 * garbage. This means that UBIFS had been writing to B just before the
-	 * power cut happened. I do not know how realistic is this scenario
-	 * that half of the min. I/O unit had been written successfully and the
-	 * other half not, but this is possible in our 'failure mode emulation'
-	 * infrastructure at least.
-	 *
-	 * So what is the problem, why we need to drop those nodes? Whey can't
-	 * we just clean-up the second half of B by putting a padding node
-	 * there? We can, and this works fine with one exception which was
-	 * reproduced with power cut emulation testing and happens extremely
-	 * rarely. The description follows, but it is worth noting that that is
-	 * only about the GC head, so we could do this trick only if the bud
-	 * belongs to the GC head, but it does not seem to be worth an
-	 * additional "if" statement.
-	 *
-	 * So, imagine the file-system is full, we run GC which is moving valid
-	 * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
-	 * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
-	 * and will try to continue. Imagine that LEB X is currently the
-	 * dirtiest LEB, and the amount of used space in LEB Y is exactly the
-	 * same as amount of free space in LEB X.
-	 *
-	 * And a power cut happens when nodes are moved from LEB X to LEB Y. We
-	 * are here trying to recover LEB Y which is the GC head LEB. We find
-	 * the min. I/O unit B as described above. Then we clean-up LEB Y by
-	 * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
-	 * fails, because it cannot find a dirty LEB which could be GC'd into
-	 * LEB Y! Even LEB X does not match because the amount of valid nodes
-	 * there does not fit the free space in LEB Y any more! And this is
-	 * because of the padding node which we added to LEB Y. The
-	 * user-visible effect of this which I once observed and analysed is
-	 * that we cannot mount the file-system with -ENOSPC error.
-	 *
-	 * So obviously, to make sure that situation does not happen we should
-	 * free min. I/O unit B in LEB Y completely and the last used min. I/O
-	 * unit in LEB Y should be A. This is basically what the below code
-	 * tries to do.
-	 */
-	while (min_io_unit == round_down(offs, c->min_io_size) &&
-	       min_io_unit != offs &&
-	       drop_last_node(sleb, &offs, grouped));
+	if (jhead == GCHD) {
+		/*
+		 * If this LEB belongs to the GC head then while we are in the
+		 * middle of the same min. I/O unit keep dropping nodes. So
+		 * basically, what we want is to make sure that the last min.
+		 * I/O unit where we saw the corruption is dropped completely
+		 * with all the uncorrupted nodes which may possibly sit there.
+		 *
+		 * In other words, let's name the min. I/O unit where the
+		 * corruption starts B, and the previous min. I/O unit A. The
+		 * below code tries to deal with a situation when half of B
+		 * contains valid nodes or the end of a valid node, and the
+		 * second half of B contains corrupted data or garbage. This
+		 * means that UBIFS had been writing to B just before the power
+		 * cut happened. I do not know how realistic is this scenario
+		 * that half of the min. I/O unit had been written successfully
+		 * and the other half not, but this is possible in our 'failure
+		 * mode emulation' infrastructure at least.
+		 *
+		 * So what is the problem, why we need to drop those nodes? Why
+		 * can't we just clean-up the second half of B by putting a
+		 * padding node there? We can, and this works fine with one
+		 * exception which was reproduced with power cut emulation
+		 * testing and happens extremely rarely.
+		 *
+		 * Imagine the file-system is full, we run GC which starts
+		 * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is
+		 * the current GC head LEB). The @c->gc_lnum is -1, which means
+		 * that GC will retain LEB X and will try to continue. Imagine
+		 * that LEB X is currently the dirtiest LEB, and the amount of
+		 * used space in LEB Y is exactly the same as amount of free
+		 * space in LEB X.
+		 *
+		 * And a power cut happens when nodes are moved from LEB X to
+		 * LEB Y. We are here trying to recover LEB Y which is the GC
+		 * head LEB. We find the min. I/O unit B as described above.
+		 * Then we clean-up LEB Y by padding min. I/O unit. And later
+		 * 'ubifs_rcvry_gc_commit()' function fails, because it cannot
+		 * find a dirty LEB which could be GC'd into LEB Y! Even LEB X
+		 * does not match because the amount of valid nodes there does
+		 * not fit the free space in LEB Y any more! And this is
+		 * because of the padding node which we added to LEB Y. The
+		 * user-visible effect of this which I once observed and
+		 * analysed is that we cannot mount the file-system with
+		 * -ENOSPC error.
+		 *
+		 * So obviously, to make sure that situation does not happen we
+		 * should free min. I/O unit B in LEB Y completely and the last
+		 * used min. I/O unit in LEB Y should be A. This is basically
+		 * what the below code tries to do.
+		 */
+		while (offs > min_io_unit)
+			drop_last_node(sleb, &offs);
+	}
 
 	buf = sbuf + offs;
 	len = c->leb_size - offs;
-- 
cgit v1.2.3-70-g09d2


From 63da029015b5255915cd6d61f19ffc276ad4635d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 23 May 2011 11:37:09 -0700
Subject: mtd: fix physmap.h warnings

Fix build warnings in physmap.h:

include/linux/mtd/physmap.h:25: warning: 'struct platform_device' declared inside parameter list
include/linux/mtd/physmap.h:25: warning: its scope is only this definition or declaration, which is probably not what you want
include/linux/mtd/physmap.h:26: warning: 'struct platform_device' declared inside parameter list
include/linux/mtd/physmap.h:27: warning: 'struct platform_device' declared inside parameter list

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/mtd/physmap.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/mtd/physmap.h b/include/linux/mtd/physmap.h
index d40bfa1d9c9..e5f21d293c7 100644
--- a/include/linux/mtd/physmap.h
+++ b/include/linux/mtd/physmap.h
@@ -19,6 +19,7 @@
 #include <linux/mtd/partitions.h>
 
 struct map_info;
+struct platform_device;
 
 struct physmap_flash_data {
 	unsigned int		width;
-- 
cgit v1.2.3-70-g09d2


From 6dd9a7c73761a8a5f5475d5cfdc15368a0f4c06d Mon Sep 17 00:00:00 2001
From: Youquan Song <youquan.song@intel.com>
Date: Wed, 25 May 2011 19:13:49 +0100
Subject: intel-iommu: Enable super page (2MiB, 1GiB, etc.) support

There are no externally-visible changes with this. In the loop in the
internal __domain_mapping() function, we simply detect if we are mapping:
  - size >= 2MiB, and
  - virtual address aligned to 2MiB, and
  - physical address aligned to 2MiB, and
  - on hardware that supports superpages.

(and likewise for larger superpages).

We automatically use a superpage for such mappings. We never have to
worry about *breaking* superpages, since we trust that we will always
*unmap* the same range that was mapped. So all we need to do is ensure
that dma_pte_clear_range() will also cope with superpages.

Adjust pfn_to_dma_pte() to take a superpage 'level' as an argument, so
it can return a PTE at the appropriate level rather than always
extending the page tables all the way down to level 1. Again, this is
simplified by the fact that we should never encounter existing small
pages when we're creating a mapping; any old mapping that used the same
virtual range will have been entirely removed and its obsolete page
tables freed.

Provide an 'intel_iommu=sp_off' argument on the command line as a
chicken bit. Not that it should ever be required.

==

The original commit seen in the iommu-2.6.git was Youquan's
implementation (and completion) of my own half-baked code which I'd
typed into an email. Followed by half a dozen subsequent 'fixes'.

I've taken the unusual step of rewriting history and collapsing the
original commits in order to keep the main history simpler, and make
life easier for the people who are going to have to backport this to
older kernels. And also so I can give it a more coherent commit comment
which (hopefully) gives a better explanation of what's going on.

The original sequence of commits leading to identical code was:

Youquan Song (3):
      intel-iommu: super page support
      intel-iommu: Fix superpage alignment calculation error
      intel-iommu: Fix superpage level calculation error in dma_pfn_level_pte()

David Woodhouse (4):
      intel-iommu: Precalculate superpage support for dmar_domain
      intel-iommu: Fix hardware_largepage_caps()
      intel-iommu: Fix inappropriate use of superpages in __domain_mapping()
      intel-iommu: Fix phys_pfn in __domain_mapping for sglist pages

Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 Documentation/kernel-parameters.txt |   5 +-
 drivers/pci/intel-iommu.c           | 157 +++++++++++++++++++++++++++++++-----
 include/linux/dma_remapping.h       |   4 +
 3 files changed, 147 insertions(+), 19 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index cc85a927819..d005487c1a2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -999,7 +999,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			With this option on every unmap_single operation will
 			result in a hardware IOTLB flush operation as opposed
 			to batching them for performance.
-
+		sp_off [Default Off]
+			By default, super page will be supported if Intel IOMMU
+			has the capability. With this option, super page will
+			not be supported.
 	intremap=	[X86-64, Intel-IOMMU]
 			Format: { on (default) | off | nosid }
 			on	enable Interrupt Remapping (default)
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 395f253c049..e6fe1994f9d 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -115,6 +115,11 @@ static inline unsigned long align_to_level(unsigned long pfn, int level)
 	return (pfn + level_size(level) - 1) & level_mask(level);
 }
 
+static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
+{
+	return  1 << ((lvl - 1) * LEVEL_STRIDE);
+}
+
 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
    are never going to work. */
 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
@@ -343,6 +348,9 @@ struct dmar_domain {
 	int		iommu_coherency;/* indicate coherency of iommu access */
 	int		iommu_snooping; /* indicate snooping control feature*/
 	int		iommu_count;	/* reference count of iommu */
+	int		iommu_superpage;/* Level of superpages supported:
+					   0 == 4KiB (no superpages), 1 == 2MiB,
+					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 	spinlock_t	iommu_lock;	/* protect iommu set in domain */
 	u64		max_addr;	/* maximum mapped address */
 };
@@ -392,6 +400,7 @@ int dmar_disabled = 1;
 static int dmar_map_gfx = 1;
 static int dmar_forcedac;
 static int intel_iommu_strict;
+static int intel_iommu_superpage = 1;
 
 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 static DEFINE_SPINLOCK(device_domain_lock);
@@ -422,6 +431,10 @@ static int __init intel_iommu_setup(char *str)
 			printk(KERN_INFO
 				"Intel-IOMMU: disable batched IOTLB flush\n");
 			intel_iommu_strict = 1;
+		} else if (!strncmp(str, "sp_off", 6)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: disable supported super page\n");
+			intel_iommu_superpage = 0;
 		}
 
 		str += strcspn(str, ",");
@@ -560,11 +573,32 @@ static void domain_update_iommu_snooping(struct dmar_domain *domain)
 	}
 }
 
+static void domain_update_iommu_superpage(struct dmar_domain *domain)
+{
+	int i, mask = 0xf;
+
+	if (!intel_iommu_superpage) {
+		domain->iommu_superpage = 0;
+		return;
+	}
+
+	domain->iommu_superpage = 4; /* 1TiB */
+
+	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
+		mask |= cap_super_page_val(g_iommus[i]->cap);
+		if (!mask) {
+			break;
+		}
+	}
+	domain->iommu_superpage = fls(mask);
+}
+
 /* Some capabilities may be different across iommus */
 static void domain_update_iommu_cap(struct dmar_domain *domain)
 {
 	domain_update_iommu_coherency(domain);
 	domain_update_iommu_snooping(domain);
+	domain_update_iommu_superpage(domain);
 }
 
 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
@@ -694,23 +728,31 @@ out:
 }
 
 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
-				      unsigned long pfn)
+				      unsigned long pfn, int large_level)
 {
 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 	struct dma_pte *parent, *pte = NULL;
 	int level = agaw_to_level(domain->agaw);
-	int offset;
+	int offset, target_level;
 
 	BUG_ON(!domain->pgd);
 	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 	parent = domain->pgd;
 
+	/* Search pte */
+	if (!large_level)
+		target_level = 1;
+	else
+		target_level = large_level;
+
 	while (level > 0) {
 		void *tmp_page;
 
 		offset = pfn_level_offset(pfn, level);
 		pte = &parent[offset];
-		if (level == 1)
+		if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
+			break;
+		if (level == target_level)
 			break;
 
 		if (!dma_pte_present(pte)) {
@@ -738,10 +780,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 	return pte;
 }
 
+
 /* return address's pte at specific level */
 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 					 unsigned long pfn,
-					 int level)
+					 int level, int *large_page)
 {
 	struct dma_pte *parent, *pte = NULL;
 	int total = agaw_to_level(domain->agaw);
@@ -754,8 +797,16 @@ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 		if (level == total)
 			return pte;
 
-		if (!dma_pte_present(pte))
+		if (!dma_pte_present(pte)) {
+			*large_page = total;
 			break;
+		}
+
+		if (pte->val & DMA_PTE_LARGE_PAGE) {
+			*large_page = total;
+			return pte;
+		}
+
 		parent = phys_to_virt(dma_pte_addr(pte));
 		total--;
 	}
@@ -768,6 +819,7 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
 				unsigned long last_pfn)
 {
 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+	unsigned int large_page = 1;
 	struct dma_pte *first_pte, *pte;
 
 	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
@@ -776,14 +828,15 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
 
 	/* we don't need lock here; nobody else touches the iova range */
 	do {
-		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
+		large_page = 1;
+		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 		if (!pte) {
-			start_pfn = align_to_level(start_pfn + 1, 2);
+			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 			continue;
 		}
-		do { 
+		do {
 			dma_clear_pte(pte);
-			start_pfn++;
+			start_pfn += lvl_to_nr_pages(large_page);
 			pte++;
 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 
@@ -803,6 +856,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
 	int total = agaw_to_level(domain->agaw);
 	int level;
 	unsigned long tmp;
+	int large_page = 2;
 
 	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
@@ -818,7 +872,10 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
 			return;
 
 		do {
-			first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
+			large_page = level;
+			first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
+			if (large_page > level)
+				level = large_page + 1;
 			if (!pte) {
 				tmp = align_to_level(tmp + 1, level + 1);
 				continue;
@@ -1402,6 +1459,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
 	else
 		domain->iommu_snooping = 0;
 
+	domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
 	domain->iommu_count = 1;
 	domain->nid = iommu->node;
 
@@ -1657,6 +1715,34 @@ static inline unsigned long aligned_nrpages(unsigned long host_addr,
 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
 }
 
+/* Return largest possible superpage level for a given mapping */
+static inline int hardware_largepage_caps(struct dmar_domain *domain,
+					  unsigned long iov_pfn,
+					  unsigned long phy_pfn,
+					  unsigned long pages)
+{
+	int support, level = 1;
+	unsigned long pfnmerge;
+
+	support = domain->iommu_superpage;
+
+	/* To use a large page, the virtual *and* physical addresses
+	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
+	   of them will mean we have to use smaller pages. So just
+	   merge them and check both at once. */
+	pfnmerge = iov_pfn | phy_pfn;
+
+	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
+		pages >>= VTD_STRIDE_SHIFT;
+		if (!pages)
+			break;
+		pfnmerge >>= VTD_STRIDE_SHIFT;
+		level++;
+		support--;
+	}
+	return level;
+}
+
 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 			    struct scatterlist *sg, unsigned long phys_pfn,
 			    unsigned long nr_pages, int prot)
@@ -1665,6 +1751,8 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 	phys_addr_t uninitialized_var(pteval);
 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 	unsigned long sg_res;
+	unsigned int largepage_lvl = 0;
+	unsigned long lvl_pages = 0;
 
 	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
 
@@ -1680,7 +1768,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
 	}
 
-	while (nr_pages--) {
+	while (nr_pages > 0) {
 		uint64_t tmp;
 
 		if (!sg_res) {
@@ -1688,11 +1776,21 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
 			sg->dma_length = sg->length;
 			pteval = page_to_phys(sg_page(sg)) | prot;
+			phys_pfn = pteval >> VTD_PAGE_SHIFT;
 		}
+
 		if (!pte) {
-			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
+			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
+
+			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
 			if (!pte)
 				return -ENOMEM;
+			/* It is large page*/
+			if (largepage_lvl > 1)
+				pteval |= DMA_PTE_LARGE_PAGE;
+			else
+				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
+
 		}
 		/* We don't need lock here, nobody else
 		 * touches the iova range
@@ -1708,16 +1806,38 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 			}
 			WARN_ON(1);
 		}
+
+		lvl_pages = lvl_to_nr_pages(largepage_lvl);
+
+		BUG_ON(nr_pages < lvl_pages);
+		BUG_ON(sg_res < lvl_pages);
+
+		nr_pages -= lvl_pages;
+		iov_pfn += lvl_pages;
+		phys_pfn += lvl_pages;
+		pteval += lvl_pages * VTD_PAGE_SIZE;
+		sg_res -= lvl_pages;
+
+		/* If the next PTE would be the first in a new page, then we
+		   need to flush the cache on the entries we've just written.
+		   And then we'll need to recalculate 'pte', so clear it and
+		   let it get set again in the if (!pte) block above.
+
+		   If we're done (!nr_pages) we need to flush the cache too.
+
+		   Also if we've been setting superpages, we may need to
+		   recalculate 'pte' and switch back to smaller pages for the
+		   end of the mapping, if the trailing size is not enough to
+		   use another superpage (i.e. sg_res < lvl_pages). */
 		pte++;
-		if (!nr_pages || first_pte_in_page(pte)) {
+		if (!nr_pages || first_pte_in_page(pte) ||
+		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
 			domain_flush_cache(domain, first_pte,
 					   (void *)pte - (void *)first_pte);
 			pte = NULL;
 		}
-		iov_pfn++;
-		pteval += VTD_PAGE_SIZE;
-		sg_res--;
-		if (!sg_res)
+
+		if (!sg_res && nr_pages)
 			sg = sg_next(sg);
 	}
 	return 0;
@@ -3527,6 +3647,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width)
 	domain->iommu_count = 0;
 	domain->iommu_coherency = 0;
 	domain->iommu_snooping = 0;
+	domain->iommu_superpage = 0;
 	domain->max_addr = 0;
 	domain->nid = -1;
 
@@ -3742,7 +3863,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 	struct dma_pte *pte;
 	u64 phys = 0;
 
-	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
+	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
 	if (pte)
 		phys = dma_pte_addr(pte);
 
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 5619f852273..bbd8661b347 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -9,8 +9,12 @@
 #define VTD_PAGE_MASK		(((u64)-1) << VTD_PAGE_SHIFT)
 #define VTD_PAGE_ALIGN(addr)	(((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
 
+#define VTD_STRIDE_SHIFT        (9)
+#define VTD_STRIDE_MASK         (((u64)-1) << VTD_STRIDE_SHIFT)
+
 #define DMA_PTE_READ (1)
 #define DMA_PTE_WRITE (2)
+#define DMA_PTE_LARGE_PAGE (1 << 7)
 #define DMA_PTE_SNP (1 << 11)
 
 #define CONTEXT_TT_MULTI_LEVEL	0
-- 
cgit v1.2.3-70-g09d2


From 9b4554b21ed07e8556405510638171f0c787742a Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Tue, 24 May 2011 12:19:04 -0400
Subject: intel-iommu: Only unlink device domains from iommu

Commit a97590e5 added unlinking domains from iommus to reciprocate the
iommu from domains unlinking that was already done.  We actually want
to only do this for device domains and never for the static
identity map domain or VM domains.  The SI domain is special and
never freed, while VM domain->id lives in their own special address
space, separate from iommu->domain_ids.

In the current code, a VM can get domain->id zero, then mark that
domain unused when unbound from pci-stub.  This leads to DMAR
write faults when the device is re-bound to the host driver.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Cc: stable@kernel.org
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index e6fe1994f9d..4eaec2fa136 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3561,10 +3561,13 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain,
 		domain_update_iommu_cap(domain);
 		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
 
-		spin_lock_irqsave(&iommu->lock, tmp_flags);
-		clear_bit(domain->id, iommu->domain_ids);
-		iommu->domains[domain->id] = NULL;
-		spin_unlock_irqrestore(&iommu->lock, tmp_flags);
+		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
+		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
+			spin_lock_irqsave(&iommu->lock, tmp_flags);
+			clear_bit(domain->id, iommu->domain_ids);
+			iommu->domains[domain->id] = NULL;
+			spin_unlock_irqrestore(&iommu->lock, tmp_flags);
+		}
 	}
 
 	spin_unlock_irqrestore(&device_domain_lock, flags);
-- 
cgit v1.2.3-70-g09d2


From 8fcc5372fbac085199d84a880503ed67aba3fe49 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Sat, 28 May 2011 13:15:02 -0500
Subject: intel-iommu: Check for identity mapping candidate using system dma
 mask

The identity mapping code appears to make the assumption that if the
devices dma_mask is greater than 32bits the device can use identity
mapping.  But that is not true: take the case where we have a 40bit
device in a 44bit architecture. The device can potentially receive a
physical address that it will truncate and cause incorrect addresses
to be used.

Instead check to see if the device's dma_mask is large enough
to address the system's dma_mask.

Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Mike Habeck <habeck@sgi.com>
Cc: stable@kernel.org
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 4eaec2fa136..dcf051d53b7 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -2316,8 +2316,19 @@ static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
 	 * Assume that they will -- if they turn out not to be, then we can 
 	 * take them out of the 1:1 domain later.
 	 */
-	if (!startup)
-		return pdev->dma_mask > DMA_BIT_MASK(32);
+	if (!startup) {
+		/*
+		 * If the device's dma_mask is less than the system's memory
+		 * size then this is not a candidate for identity mapping.
+		 */
+		u64 dma_mask = pdev->dma_mask;
+
+		if (pdev->dev.coherent_dma_mask &&
+		    pdev->dev.coherent_dma_mask < dma_mask)
+			dma_mask = pdev->dev.coherent_dma_mask;
+
+		return dma_mask >= dma_get_required_mask(&pdev->dev);
+	}
 
 	return 1;
 }
-- 
cgit v1.2.3-70-g09d2


From cb452a4040bb051d92e85d6e7eb60c11734c1781 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 28 May 2011 13:15:03 -0500
Subject: intel-iommu: Speed up processing of the identity_mapping function

When there are a large count of PCI devices, and the pass through
option for iommu is set, much time is spent in the identity_mapping
function hunting though the iommu domains to check if a specific
device is "identity mapped".

Speed up the function by checking the cached info to see if
it's mapped to the static identity domain.

Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Mike Habeck <habeck@sgi.com>
Cc: stable@kernel.org
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index dcf051d53b7..98be0b55ac0 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -2235,10 +2235,10 @@ static int identity_mapping(struct pci_dev *pdev)
 	if (likely(!iommu_identity_mapping))
 		return 0;
 
+	info = pdev->dev.archdata.iommu;
+	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
+		return (info->domain == si_domain);
 
-	list_for_each_entry(info, &si_domain->devices, link)
-		if (info->dev == pdev)
-			return 1;
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 1c9fc3d11b84fbd0c4f4aa7855702c2a1f098ebb Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Sat, 28 May 2011 13:15:04 -0500
Subject: intel-iommu: Dont cache iova above 32bit

Mike Travis and Mike Habeck reported an issue where iova allocation
would return a range that was larger than a device's dma mask.

https://lkml.org/lkml/2011/3/29/423

The dmar initialization code will reserve all PCI MMIO regions and copy
those reservations into a domain specific iova tree.  It is possible for
one of those regions to be above the dma mask of a device.  It is typical
to allocate iovas with a 32bit mask (despite device's dma mask possibly
being larger) and cache the result until it exhausts the lower 32bit
address space.  Freeing the iova range that is >= the last iova in the
lower 32bit range when there is still an iova above the 32bit range will
corrupt the cached iova by pointing it to a region that is above 32bit.
If that region is also larger than the device's dma mask, a subsequent
allocation will return an unusable iova and cause dma failure.

Simply don't cache an iova that is above the 32bit caching boundary.

Reported-by: Mike Travis <travis@sgi.com>
Reported-by: Mike Habeck <habeck@sgi.com>
Cc: stable@kernel.org
Acked-by: Mike Travis <travis@sgi.com>
Tested-by: Mike Habeck <habeck@sgi.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/iova.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c
index 9606e599a47..c5c274ab5c5 100644
--- a/drivers/pci/iova.c
+++ b/drivers/pci/iova.c
@@ -63,8 +63,16 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
 	curr = iovad->cached32_node;
 	cached_iova = container_of(curr, struct iova, node);
 
-	if (free->pfn_lo >= cached_iova->pfn_lo)
-		iovad->cached32_node = rb_next(&free->node);
+	if (free->pfn_lo >= cached_iova->pfn_lo) {
+		struct rb_node *node = rb_next(&free->node);
+		struct iova *iova = container_of(node, struct iova, node);
+
+		/* only cache if it's below 32bit pfn */
+		if (node && iova->pfn_lo < iovad->dma_32bit_pfn)
+			iovad->cached32_node = node;
+		else
+			iovad->cached32_node = NULL;
+	}
 }
 
 /* Computes the padding size required, to make the
-- 
cgit v1.2.3-70-g09d2


From c681d0ba1252954208220ad32248a3e8e2fc98e4 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 28 May 2011 13:15:05 -0500
Subject: intel-iommu: Use coherent DMA mask when requested

The __intel_map_single function is not honoring the passed in DMA mask.
This results in not using the coherent DMA mask when called from
intel_alloc_coherent().

Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Chris Wright <chrisw@sous-sol.org>
Reviewed-by: Mike Habeck <habeck@sgi.com>
Cc: stable@kernel.org
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 98be0b55ac0..5cbab7f19ae 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -2732,8 +2732,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
 	iommu = domain_get_iommu(domain);
 	size = aligned_nrpages(paddr, size);
 
-	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
-				pdev->dma_mask);
+	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
 	if (!iova)
 		goto error;
 
-- 
cgit v1.2.3-70-g09d2


From 825507d6d059f1cbe2503e0e5a3926225b983aec Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 28 May 2011 13:15:06 -0500
Subject: intel-iommu: Remove Host Bridge devices from identity mapping

When using the 1:1 (identity) PCI DMA remapping, PCI Host Bridge devices
that do not use the IOMMU causes a kernel panic.  Fix that by not
inserting those devices into the si_domain.

Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Mike Habeck <habeck@sgi.com>
Cc: stable@kernel.org
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 5cbab7f19ae..7f757ce60c5 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -46,6 +46,8 @@
 #define ROOT_SIZE		VTD_PAGE_SIZE
 #define CONTEXT_SIZE		VTD_PAGE_SIZE
 
+#define IS_BRIDGE_HOST_DEVICE(pdev) \
+			    ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
@@ -2343,6 +2345,9 @@ static int __init iommu_prepare_static_identity_mapping(int hw)
 		return -EFAULT;
 
 	for_each_pci_dev(pdev) {
+		/* Skip Host/PCI Bridge devices */
+		if (IS_BRIDGE_HOST_DEVICE(pdev))
+			continue;
 		if (iommu_should_identity_map(pdev, 1)) {
 			printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
 			       hw ? "hardware" : "software", pci_name(pdev));
-- 
cgit v1.2.3-70-g09d2


From 8519dc4401ddf8a5399f979870bbeeadbc111186 Mon Sep 17 00:00:00 2001
From: Mike Habeck <habeck@sgi.com>
Date: Sat, 28 May 2011 13:15:07 -0500
Subject: intel-iommu: Add domain check in domain_remove_one_dev_info

The comment in domain_remove_one_dev_info() states "No need to compare
PCI domain; it has to be the same". But for the si_domain that isn't
going to be true, as it consists of all the PCI devices that are
identity mapped thus multiple PCI domains can be in si_domain.  The
code needs to validate the PCI domain too.

Signed-off-by: Mike Habeck <habeck@sgi.com>
Signed-off-by: Mike Travis <travis@sgi.com>
Cc: stable@kernel.org
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 7f757ce60c5..b0c96d39080 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3537,8 +3537,8 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain,
 	spin_lock_irqsave(&device_domain_lock, flags);
 	list_for_each_safe(entry, tmp, &domain->devices) {
 		info = list_entry(entry, struct device_domain_info, link);
-		/* No need to compare PCI domain; it has to be the same */
-		if (info->bus == pdev->bus->number &&
+		if (info->segment == pci_domain_nr(pdev->bus) &&
+		    info->bus == pdev->bus->number &&
 		    info->devfn == pdev->devfn) {
 			list_del(&info->link);
 			list_del(&info->global);
-- 
cgit v1.2.3-70-g09d2


From 70e535d1e5d1e4317e894d6228b762cf9c3fbc6a Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Tue, 31 May 2011 00:22:52 +0100
Subject: intel-iommu: Fix off-by-one in RMRR setup

We were mapping an extra byte (and hence usually an extra page):
iommu_prepare_identity_map() expects to be given an 'end' argument which
is the last byte to be mapped; not the first byte *not* to be mapped.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index b0c96d39080..a8867bd745e 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -2147,7 +2147,7 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
 	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
 		return 0;
 	return iommu_prepare_identity_map(pdev, rmrr->base_address,
-		rmrr->end_address + 1);
+		rmrr->end_address);
 }
 
 #ifdef CONFIG_DMAR_FLOPPY_WA
@@ -2161,7 +2161,7 @@ static inline void iommu_prepare_isa(void)
 		return;
 
 	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
-	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
+	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
 
 	if (ret)
 		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
-- 
cgit v1.2.3-70-g09d2


From 6464920a6e30604cb71d0ecbaa20e35009bd76fb Mon Sep 17 00:00:00 2001
From: Laszlo Ersek <lersek@redhat.com>
Date: Wed, 25 May 2011 12:24:25 +0200
Subject: xen/blkback: don't call vbd_size() if bd_disk is NULL

...because vbd_size() dereferences bd_disk if bd_part is NULL.

Signed-off-by: Laszlo Ersek<lersek@redhat.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/xenbus.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 34570823355..6cc0db1bf52 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -357,14 +357,13 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
 	}
 
 	vbd->bdev = bdev;
-	vbd->size = vbd_sz(vbd);
-
 	if (vbd->bdev->bd_disk == NULL) {
 		DPRINTK("xen_vbd_create: device %08x doesn't exist.\n",
 			vbd->pdevice);
 		xen_vbd_free(vbd);
 		return -ENOENT;
 	}
+	vbd->size = vbd_sz(vbd);
 
 	if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
 		vbd->type |= VDISK_CDROM;
-- 
cgit v1.2.3-70-g09d2


From 9b83c771214cf6a256ee875050e6eaf320cf7983 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Fri, 27 May 2011 09:27:16 +0300
Subject: xen/blkback: potential null dereference in error handling

blkbk->pending_pages can be NULL here so I added a check for it.

Signed-off-by: Dan Carpenter <error27@gmail.com>
[v1: Redid the loop a bit]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/blkback.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index c73910cc28c..5cf2993a833 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -809,11 +809,13 @@ static int __init xen_blkif_init(void)
  failed_init:
 	kfree(blkbk->pending_reqs);
 	kfree(blkbk->pending_grant_handles);
-	for (i = 0; i < mmap_pages; i++) {
-		if (blkbk->pending_pages[i])
-			__free_page(blkbk->pending_pages[i]);
+	if (blkbk->pending_pages) {
+		for (i = 0; i < mmap_pages; i++) {
+			if (blkbk->pending_pages[i])
+				__free_page(blkbk->pending_pages[i]);
+		}
+		kfree(blkbk->pending_pages);
 	}
-	kfree(blkbk->pending_pages);
 	kfree(blkbk);
 	blkbk = NULL;
 	return rc;
-- 
cgit v1.2.3-70-g09d2


From 614198bbf8d74617381aea82521b261c7f9baaf6 Mon Sep 17 00:00:00 2001
From: Per Dalen <per.dalen@appeartv.com>
Date: Tue, 31 May 2011 06:54:21 -0700
Subject: hwmon: (max6642) Rename temp_fault sysfs attribute to temp2_fault

The temp_fault sysfs attribute is wrong, it should be temp2_fault instead.

Reported-by: Guenter Roeck <guenter.roeck@ericsson.com>
Signed-off-by: Per Dalen <per.dalen@appeartv.com>
Acked-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Guenter Roeck <guenter.roeck@ericsson.com>
---
 drivers/hwmon/max6642.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/hwmon/max6642.c b/drivers/hwmon/max6642.c
index 0f9fc40379c..0f30e1bb366 100644
--- a/drivers/hwmon/max6642.c
+++ b/drivers/hwmon/max6642.c
@@ -246,7 +246,7 @@ static SENSOR_DEVICE_ATTR_2(temp1_max, S_IWUSR | S_IRUGO, show_temp_max,
 			    set_temp_max, 0, MAX6642_REG_W_LOCAL_HIGH);
 static SENSOR_DEVICE_ATTR_2(temp2_max, S_IWUSR | S_IRUGO, show_temp_max,
 			    set_temp_max, 1, MAX6642_REG_W_REMOTE_HIGH);
-static SENSOR_DEVICE_ATTR(temp_fault, S_IRUGO, show_alarm, NULL, 2);
+static SENSOR_DEVICE_ATTR(temp2_fault, S_IRUGO, show_alarm, NULL, 2);
 static SENSOR_DEVICE_ATTR(temp1_max_alarm, S_IRUGO, show_alarm, NULL, 6);
 static SENSOR_DEVICE_ATTR(temp2_max_alarm, S_IRUGO, show_alarm, NULL, 4);
 
@@ -256,7 +256,7 @@ static struct attribute *max6642_attributes[] = {
 	&sensor_dev_attr_temp1_max.dev_attr.attr,
 	&sensor_dev_attr_temp2_max.dev_attr.attr,
 
-	&sensor_dev_attr_temp_fault.dev_attr.attr,
+	&sensor_dev_attr_temp2_fault.dev_attr.attr,
 	&sensor_dev_attr_temp1_max_alarm.dev_attr.attr,
 	&sensor_dev_attr_temp2_max_alarm.dev_attr.attr,
 	NULL
-- 
cgit v1.2.3-70-g09d2


From 4c6e0f8101e62d8b2d01dc94b835a98b191a1454 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Tue, 31 May 2011 15:50:51 -0400
Subject: hwmon: (coretemp) Relax target temperature range check

The current temperature range check of MSR_IA32_TEMPERATURE_TARGET
seems too strict to me, some TjMax values documented in
Documentation/hwmon/coretemp wouldn't pass. Relax the check so that
all the documented values pass.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Carsten Emde <C.Emde@osadl.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Guenter Roeck <guenter.roeck@ericsson.com>
---
 drivers/hwmon/coretemp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index de3d2465fe2..0eeff46c0ea 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -296,7 +296,7 @@ static int get_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev)
 		 * If the TjMax is not plausible, an assumption
 		 * will be used
 		 */
-		if (val > 80 && val < 120) {
+		if (val >= 70 && val <= 125) {
 			dev_info(dev, "TjMax is %d C.\n", val);
 			return val * 1000;
 		}
-- 
cgit v1.2.3-70-g09d2


From 333ba7325213f0a09dfa5ceeddb056d6ad74b3b5 Mon Sep 17 00:00:00 2001
From: Eliad Peller <eliad@wizery.com>
Date: Sun, 29 May 2011 15:53:20 +0300
Subject: cfg80211: don't drop p2p probe responses

Commit 0a35d36 ("cfg80211: Use capability info to detect mesh beacons")
assumed that probe response with both ESS and IBSS bits cleared
means that the frame was sent by a mesh sta.

However, these capabilities are also being used in the p2p_find phase,
and the mesh-validation broke it.

Rename the WLAN_CAPABILITY_IS_MBSS macro, and verify that mesh ies
exist before assuming this frame was sent by a mesh sta.

Signed-off-by: Eliad Peller <eliad@wizery.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/ieee80211.h |  8 ++++++--
 net/wireless/scan.c       | 43 ++++++++++++++++++++++++-------------------
 2 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index b2eee587988..bf56b6f7827 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1003,8 +1003,12 @@ struct ieee80211_ht_info {
 #define WLAN_CAPABILITY_ESS		(1<<0)
 #define WLAN_CAPABILITY_IBSS		(1<<1)
 
-/* A mesh STA sets the ESS and IBSS capability bits to zero */
-#define WLAN_CAPABILITY_IS_MBSS(cap)	\
+/*
+ * A mesh STA sets the ESS and IBSS capability bits to zero.
+ * however, this holds true for p2p probe responses (in the p2p_find
+ * phase) as well.
+ */
+#define WLAN_CAPABILITY_IS_STA_BSS(cap)	\
 	(!((cap) & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS)))
 
 #define WLAN_CAPABILITY_CF_POLLABLE	(1<<2)
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 73a441d237b..7a6c67667d7 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -267,13 +267,35 @@ static bool is_bss(struct cfg80211_bss *a,
 	return memcmp(ssidie + 2, ssid, ssid_len) == 0;
 }
 
+static bool is_mesh_bss(struct cfg80211_bss *a)
+{
+	const u8 *ie;
+
+	if (!WLAN_CAPABILITY_IS_STA_BSS(a->capability))
+		return false;
+
+	ie = cfg80211_find_ie(WLAN_EID_MESH_ID,
+			      a->information_elements,
+			      a->len_information_elements);
+	if (!ie)
+		return false;
+
+	ie = cfg80211_find_ie(WLAN_EID_MESH_CONFIG,
+			      a->information_elements,
+			      a->len_information_elements);
+	if (!ie)
+		return false;
+
+	return true;
+}
+
 static bool is_mesh(struct cfg80211_bss *a,
 		    const u8 *meshid, size_t meshidlen,
 		    const u8 *meshcfg)
 {
 	const u8 *ie;
 
-	if (!WLAN_CAPABILITY_IS_MBSS(a->capability))
+	if (!WLAN_CAPABILITY_IS_STA_BSS(a->capability))
 		return false;
 
 	ie = cfg80211_find_ie(WLAN_EID_MESH_ID,
@@ -311,7 +333,7 @@ static int cmp_bss(struct cfg80211_bss *a,
 	if (a->channel != b->channel)
 		return b->channel->center_freq - a->channel->center_freq;
 
-	if (WLAN_CAPABILITY_IS_MBSS(a->capability | b->capability)) {
+	if (is_mesh_bss(a) && is_mesh_bss(b)) {
 		r = cmp_ies(WLAN_EID_MESH_ID,
 			    a->information_elements,
 			    a->len_information_elements,
@@ -457,7 +479,6 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev,
 		    struct cfg80211_internal_bss *res)
 {
 	struct cfg80211_internal_bss *found = NULL;
-	const u8 *meshid, *meshcfg;
 
 	/*
 	 * The reference to "res" is donated to this function.
@@ -470,22 +491,6 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev,
 
 	res->ts = jiffies;
 
-	if (WLAN_CAPABILITY_IS_MBSS(res->pub.capability)) {
-		/* must be mesh, verify */
-		meshid = cfg80211_find_ie(WLAN_EID_MESH_ID,
-					  res->pub.information_elements,
-					  res->pub.len_information_elements);
-		meshcfg = cfg80211_find_ie(WLAN_EID_MESH_CONFIG,
-					   res->pub.information_elements,
-					   res->pub.len_information_elements);
-		if (!meshid || !meshcfg ||
-		    meshcfg[1] != sizeof(struct ieee80211_meshconf_ie)) {
-			/* bogus mesh */
-			kref_put(&res->ref, bss_release);
-			return NULL;
-		}
-	}
-
 	spin_lock_bh(&dev->bss_lock);
 
 	found = rb_find_bss(dev, res);
-- 
cgit v1.2.3-70-g09d2


From 21fdc87248d1d28492c775e05fa92b3c8c7bc8db Mon Sep 17 00:00:00 2001
From: Daniel Halperin <dhalperi@cs.washington.edu>
Date: Tue, 31 May 2011 11:59:30 -0700
Subject: ath9k: fix two more bugs in tx power

This is the same fix as

   commit 841051602e3fa18ea468fe5a177aa92b6eb44b56
   Author: Matteo Croce <technoboy85@gmail.com>
   Date:   Fri Dec 3 02:25:08 2010 +0100

   The ath9k driver subtracts 3 dBm to the txpower as with two radios the
   signal power is doubled.
   The resulting value is assigned in an u16 which overflows and makes
   the card work at full power.

in two more places. I grepped the ath tree and didn't find any others.

Cc: stable@kernel.org
Signed-off-by: Daniel Halperin <dhalperi@cs.washington.edu>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath/ath9k/ar9003_eeprom.c | 10 ++++++++--
 drivers/net/wireless/ath/ath9k/eeprom_9287.c   | 10 ++++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/ath/ath9k/ar9003_eeprom.c b/drivers/net/wireless/ath/ath9k/ar9003_eeprom.c
index 0ca7635d066..ff8150e46f0 100644
--- a/drivers/net/wireless/ath/ath9k/ar9003_eeprom.c
+++ b/drivers/net/wireless/ath/ath9k/ar9003_eeprom.c
@@ -4645,10 +4645,16 @@ static void ar9003_hw_set_power_per_rate_table(struct ath_hw *ah,
 	case 1:
 		break;
 	case 2:
-		scaledPower -= REDUCE_SCALED_POWER_BY_TWO_CHAIN;
+		if (scaledPower > REDUCE_SCALED_POWER_BY_TWO_CHAIN)
+			scaledPower -= REDUCE_SCALED_POWER_BY_TWO_CHAIN;
+		else
+			scaledPower = 0;
 		break;
 	case 3:
-		scaledPower -= REDUCE_SCALED_POWER_BY_THREE_CHAIN;
+		if (scaledPower > REDUCE_SCALED_POWER_BY_THREE_CHAIN)
+			scaledPower -= REDUCE_SCALED_POWER_BY_THREE_CHAIN;
+		else
+			scaledPower = 0;
 		break;
 	}
 
diff --git a/drivers/net/wireless/ath/ath9k/eeprom_9287.c b/drivers/net/wireless/ath/ath9k/eeprom_9287.c
index 7856f0d4512..343fc9f946d 100644
--- a/drivers/net/wireless/ath/ath9k/eeprom_9287.c
+++ b/drivers/net/wireless/ath/ath9k/eeprom_9287.c
@@ -524,10 +524,16 @@ static void ath9k_hw_set_ar9287_power_per_rate_table(struct ath_hw *ah,
 	case 1:
 		break;
 	case 2:
-		scaledPower -= REDUCE_SCALED_POWER_BY_TWO_CHAIN;
+		if (scaledPower > REDUCE_SCALED_POWER_BY_TWO_CHAIN)
+			scaledPower -= REDUCE_SCALED_POWER_BY_TWO_CHAIN;
+		else
+			scaledPower = 0;
 		break;
 	case 3:
-		scaledPower -= REDUCE_SCALED_POWER_BY_THREE_CHAIN;
+		if (scaledPower > REDUCE_SCALED_POWER_BY_THREE_CHAIN)
+			scaledPower -= REDUCE_SCALED_POWER_BY_THREE_CHAIN;
+		else
+			scaledPower = 0;
 		break;
 	}
 	scaledPower = max((u16)0, scaledPower);
-- 
cgit v1.2.3-70-g09d2


From a7567b2059020bf3fa96c389ec25eed8e28ad4ba Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 1 Jun 2011 08:29:54 +0200
Subject: bluetooth l2cap: fix locking in l2cap_global_chan_by_psm

read_lock() ... read_unlock_bh() is clearly bogus.
This was broken by

commit 23691d75cdc69c3b285211b4d77746aa20a17d18
Author: Gustavo F. Padovan <padovan@profusion.mobi>
Date:   Wed Apr 27 18:26:32 2011 -0300

    Bluetooth: Remove l2cap_sk_list

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/bluetooth/l2cap_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index a86f9ba4f05..e64a1c2df23 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -906,7 +906,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, bdaddr
 		if (c->psm == psm) {
 			/* Exact match. */
 			if (!bacmp(&bt_sk(sk)->src, src)) {
-				read_unlock_bh(&chan_list_lock);
+				read_unlock(&chan_list_lock);
 				return c;
 			}
 
-- 
cgit v1.2.3-70-g09d2


From dfe21582ac5ebc460dda98c67e8589dd506d02cd Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Wed, 1 Jun 2011 17:17:57 +0200
Subject: iwl4965: correctly validate temperature value

In some cases we can read wrong temperature value. If after that
temperature value will not be updated to good one, we badly configure
tx power parameters and device is unable to send a data.

Resolves:
https://bugzilla.kernel.org/show_bug.cgi?id=35932

Cc: stable@kernel.org # 2.6.39+
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/iwlegacy/iwl-4965.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/iwlegacy/iwl-4965.c b/drivers/net/wireless/iwlegacy/iwl-4965.c
index f5433c74b84..f9db25bb35c 100644
--- a/drivers/net/wireless/iwlegacy/iwl-4965.c
+++ b/drivers/net/wireless/iwlegacy/iwl-4965.c
@@ -1543,7 +1543,7 @@ static void iwl4965_temperature_calib(struct iwl_priv *priv)
 	s32 temp;
 
 	temp = iwl4965_hw_get_temperature(priv);
-	if (temp < 0)
+	if (IWL_TX_POWER_TEMPERATURE_OUT_OF_RANGE(temp))
 		return;
 
 	if (priv->temperature != temp) {
-- 
cgit v1.2.3-70-g09d2


From 3cc39b3f061e90f69cb1f65d72c005c56cddd6a6 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Wed, 1 Jun 2011 16:06:04 -0400
Subject: tile: enable CONFIG_BUGVERBOSE

Trivial config change to enable backtraces on panic.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 lib/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 28afa4c5333..dd373c8ee94 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -697,7 +697,7 @@ config DEBUG_BUGVERBOSE
 	bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EXPERT
 	depends on BUG
 	depends on ARM || AVR32 || M32R || M68K || SPARC32 || SPARC64 || \
-		   FRV || SUPERH || GENERIC_BUG || BLACKFIN || MN10300
+		   FRV || SUPERH || GENERIC_BUG || BLACKFIN || MN10300 || TILE
 	default y
 	help
 	  Say Y here to make BUG() panics output the file name and line number
-- 
cgit v1.2.3-70-g09d2


From 0f48f2600911d5de6393829e4a9986d4075558b3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 2 Jun 2011 05:29:19 +0900
Subject: block: fix mismerge of the DISK_EVENT_MEDIA_CHANGE removal

Jens' back-merge commit 698567f3fa79 ("Merge commit 'v2.6.39' into
for-2.6.40/core") was incorrectly done, and re-introduced the
DISK_EVENT_MEDIA_CHANGE lines that had been removed earlier in commits

 - 9fd097b14918 ("block: unexport DISK_EVENT_MEDIA_CHANGE for
   legacy/fringe drivers")

 - 7eec77a1816a ("ide: unexport DISK_EVENT_MEDIA_CHANGE for ide-gd
   and ide-cd")

because of conflicts with the "g->flags" updates near-by by commit
d4dc210f69bc ("block: don't block events on excl write for non-optical
devices")

As a result, we re-introduced the hanging behavior due to infinite disk
media change reports.

Tssk, tssk, people! Don't do back-merges at all, and *definitely* don't
do them to hide merge conflicts from me - especially as I'm likely
better at merging them than you are, since I do so many merges.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Jens Axboe <jaxboe@fusionio.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/paride/pcd.c | 1 -
 drivers/cdrom/viocd.c      | 1 -
 drivers/ide/ide-cd.c       | 1 -
 3 files changed, 3 deletions(-)

diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index a0aabd904a5..46b8136c31b 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -321,7 +321,6 @@ static void pcd_init_units(void)
 		strcpy(disk->disk_name, cd->name);	/* umm... */
 		disk->fops = &pcd_bdops;
 		disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
-		disk->events = DISK_EVENT_MEDIA_CHANGE;
 	}
 }
 
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index ae15a4ddaa9..7878da89d29 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -627,7 +627,6 @@ static int viocd_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	gendisk->fops = &viocd_fops;
 	gendisk->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE |
 			 GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
-	gendisk->events = DISK_EVENT_MEDIA_CHANGE;
 	set_capacity(gendisk, 0);
 	gendisk->private_data = d;
 	d->viocd_disk = gendisk;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 6e5123b1d34..144d27261e4 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1782,7 +1782,6 @@ static int ide_cd_probe(ide_drive_t *drive)
 	ide_cd_read_toc(drive, &sense);
 	g->fops = &idecd_ops;
 	g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
-	g->events = DISK_EVENT_MEDIA_CHANGE;
 	add_disk(g);
 	return 0;
 
-- 
cgit v1.2.3-70-g09d2


From 1fa7b6a29c61358cc2ca6f64cef4aa0e1a7ca74c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 2 Jun 2011 06:11:24 +0900
Subject: Revert "mm: fail GFP_DMA allocations when ZONE_DMA is not configured"

This reverts commit a197b59ae6e8bee56fcef37ea2482dc08414e2ac.

As rmk says:
 "Commit a197b59ae6e8 (mm: fail GFP_DMA allocations when ZONE_DMA is not
  configured) is causing regressions on ARM with various drivers which
  use GFP_DMA.

  The behaviour up until now has been to silently ignore that flag when
  CONFIG_ZONE_DMA is not enabled, and to allocate from the normal zone.
  However, as a result of the above commit, such allocations now fail
  which causes drivers to fail.  These are regressions compared to the
  previous kernel version."

so just revert it.

Requested-by: Russell King <linux@arm.linux.org.uk>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a4e1db3f198..4e8985acdab 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2247,10 +2247,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 
 	if (should_fail_alloc_page(gfp_mask, order))
 		return NULL;
-#ifndef CONFIG_ZONE_DMA
-	if (WARN_ON_ONCE(gfp_mask & __GFP_DMA))
-		return NULL;
-#endif
 
 	/*
 	 * Check the zones suitable for the gfp_mask contain at least one
-- 
cgit v1.2.3-70-g09d2


From 4f5f71a7abe329bdad81ee6a8e4545054a7cc30a Mon Sep 17 00:00:00 2001
From: Guenter Roeck <guenter.roeck@ericsson.com>
Date: Tue, 31 May 2011 06:54:21 -0700
Subject: hwmon: (coretemp) Fix TjMax detection for older CPUs

Commit a321cedb12904114e2ba5041a3673ca24deb09c9 excludes CPU models 0xe, 0xf,
0x16, and 0x1a from TjMax temperature adjustment, even though several of those
CPUs are known to have TiMax other than 100 degrees C, and even though the code
in adjust_tjmax() explicitly handles those CPUs and points to a Web document
listing several of the affected CPU IDs.

Reinstate original TjMax adjustment if TjMax can not be determined using the
IA32_TEMPERATURE_TARGET register.

https://bugzilla.kernel.org/show_bug.cgi?id=32582

Signed-off-by: Guenter Roeck <guenter.roeck@ericsson.com>
Cc: Huaxu Wan <huaxu.wan@linux.intel.com>
Cc: Carsten Emde <C.Emde@osadl.org>
Cc: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Cc: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Cc: Yong Wang <yong.y.wang@linux.intel.com>
Cc: Rudolf Marek <r.marek@assembler.cz>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Tested-by: Jean Delvare <khali@linux-fr.org>
Acked-by: Jean Delvare <khali@linux-fr.org>
Acked-by: Fenghua Yu <fenghua.yu@intel.com>
Cc: <stable@kernel.org> # .35.x .36.x .37.x .38.x .39.x
---
 drivers/hwmon/coretemp.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index 0eeff46c0ea..1680977cfba 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -304,24 +304,9 @@ static int get_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev)
 
 	/*
 	 * An assumption is made for early CPUs and unreadable MSR.
-	 * NOTE: the given value may not be correct.
+	 * NOTE: the calculated value may not be correct.
 	 */
-
-	switch (c->x86_model) {
-	case 0xe:
-	case 0xf:
-	case 0x16:
-	case 0x1a:
-		dev_warn(dev, "TjMax is assumed as 100 C!\n");
-		return 100000;
-	case 0x17:
-	case 0x1c:		/* Atom CPUs */
-		return adjust_tjmax(c, id, dev);
-	default:
-		dev_warn(dev, "CPU (model=0x%x) is not supported yet,"
-			" using default TjMax of 100C.\n", c->x86_model);
-		return 100000;
-	}
+	return adjust_tjmax(c, id, dev);
 }
 
 static void __devinit get_ucode_rev_on_cpu(void *edx)
-- 
cgit v1.2.3-70-g09d2


From bb9973e4e73f43bd86698483d0c3f7a362ff94ce Mon Sep 17 00:00:00 2001
From: Guenter Roeck <guenter.roeck@ericsson.com>
Date: Wed, 1 Jun 2011 11:03:41 -0700
Subject: hwmon: (coretemp) Further relax temperature range checks

Further relax temperature range checks after reading the IA32_TEMPERATURE_TARGET
register. If the register returns a value other than 0 in bits 16..32, assume
that the returned value is correct.

This change applies to both packet and core temperature limits.

Cc: Carsten Emde <C.Emde@osadl.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Guenter Roeck <guenter.roeck@ericsson.com>
Acked-by: Fenghua Yu <fenghua.yu@intel.com>
---
 drivers/hwmon/coretemp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index 1680977cfba..85e937984ff 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -296,7 +296,7 @@ static int get_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev)
 		 * If the TjMax is not plausible, an assumption
 		 * will be used
 		 */
-		if (val >= 70 && val <= 125) {
+		if (val) {
 			dev_info(dev, "TjMax is %d C.\n", val);
 			return val * 1000;
 		}
@@ -326,7 +326,7 @@ static int get_pkg_tjmax(unsigned int cpu, struct device *dev)
 	err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
 	if (!err) {
 		val = (eax >> 16) & 0xff;
-		if (val > 80 && val < 120)
+		if (val)
 			return val * 1000;
 	}
 	dev_warn(dev, "Unable to read Pkg-TjMax from CPU:%u\n", cpu);
-- 
cgit v1.2.3-70-g09d2


From d0733d2e29b652b2e7b1438ececa732e4eed98eb Mon Sep 17 00:00:00 2001
From: Marcus Meissner <meissner@suse.de>
Date: Wed, 1 Jun 2011 21:05:22 -0700
Subject: net/ipv4: Check for mistakenly passed in non-IPv4 address

Check against mistakenly passing in IPv6 addresses (which would result
in an INADDR_ANY bind) or similar incompatible sockaddrs.

Signed-off-by: Marcus Meissner <meissner@suse.de>
Cc: Reinhard Max <max@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index cc1463156cd..9c1926027a2 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -465,6 +465,9 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (addr_len < sizeof(struct sockaddr_in))
 		goto out;
 
+	if (addr->sin_family != AF_INET)
+		goto out;
+
 	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
 
 	/* Not specified by any standard per-se, however it breaks too
-- 
cgit v1.2.3-70-g09d2


From 307f73df2b9829ee5a261d1ed432ff683c426cdf Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 31 May 2011 22:53:19 +0000
Subject: vlan: fix typo in vlan_dev_hard_start_xmit()

commit 4af429d29b341bb1735f04c2fb960178ed5d52e7 (vlan: lockless
transmit path) have a typo in vlan_dev_hard_start_xmit(), using
u64_stats_update_begin() to end the stat update, it should be
u64_stats_update_end().

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Reviewed-by: WANG Cong <xiyou.wangcong@gmail.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index f247f5bff88..7ea5cf9ea08 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -165,7 +165,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 		u64_stats_update_begin(&stats->syncp);
 		stats->tx_packets++;
 		stats->tx_bytes += len;
-		u64_stats_update_begin(&stats->syncp);
+		u64_stats_update_end(&stats->syncp);
 	} else {
 		this_cpu_inc(vlan_dev_info(dev)->vlan_pcpu_stats->tx_dropped);
 	}
-- 
cgit v1.2.3-70-g09d2


From 85e3c65fa3a1d0542c181510a950a2be7733ff29 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Wed, 1 Jun 2011 02:01:41 +0000
Subject: usbnet/cdc_ncm: add missing .reset_resume hook

This avoids messages like this after suspend:

   cdc_ncm 2-1.4:1.6: no reset_resume for driver cdc_ncm?
   cdc_ncm 2-1.4:1.7: no reset_resume for driver cdc_ncm?
   cdc_ncm 2-1.4:1.6: usb0: unregister 'cdc_ncm' usb-0000:00:1d.0-1.4, CDC NCM

This is important for the Ericsson F5521gw GSM/UMTS modem.
Otherwise modemmanager looses the fact that the cdc_ncm and cdc_acm devices
belong together.

The cdc_ether module does the same.

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/cdc_ncm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index cdd3ae48610..f33ca6aa29e 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -54,7 +54,7 @@
 #include <linux/usb/usbnet.h>
 #include <linux/usb/cdc.h>
 
-#define	DRIVER_VERSION				"24-May-2011"
+#define	DRIVER_VERSION				"01-June-2011"
 
 /* CDC NCM subclass 3.2.1 */
 #define USB_CDC_NCM_NDP16_LENGTH_MIN		0x10
@@ -1234,6 +1234,7 @@ static struct usb_driver cdc_ncm_driver = {
 	.disconnect = cdc_ncm_disconnect,
 	.suspend = usbnet_suspend,
 	.resume = usbnet_resume,
+	.reset_resume =	usbnet_resume,
 	.supports_autosuspend = 1,
 };
 
-- 
cgit v1.2.3-70-g09d2


From 41be5a4a3668810bf3687a76c2b017bd437039e0 Mon Sep 17 00:00:00 2001
From: "sjur.brandeland@stericsson.com" <sjur.brandeland@stericsson.com>
Date: Wed, 1 Jun 2011 00:55:37 +0000
Subject: caif: Fix race when conditionally taking rtnl lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Take the RTNL lock unconditionally when calling dev_close.
Taking the lock conditionally may cause race conditions.

Signed-off-by: Sjur Brændeland <sjur.brandeland@stericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/chnl_net.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 649ebacaf6b..adbb424403d 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -139,17 +139,14 @@ static void close_work(struct work_struct *work)
 	struct chnl_net *dev = NULL;
 	struct list_head *list_node;
 	struct list_head *_tmp;
-	/* May be called with or without RTNL lock held */
-	int islocked = rtnl_is_locked();
-	if (!islocked)
-		rtnl_lock();
+
+	rtnl_lock();
 	list_for_each_safe(list_node, _tmp, &chnl_net_list) {
 		dev = list_entry(list_node, struct chnl_net, list_field);
 		if (dev->state == CAIF_SHUTDOWN)
 			dev_close(dev->netdev);
 	}
-	if (!islocked)
-		rtnl_unlock();
+	rtnl_unlock();
 }
 static DECLARE_WORK(close_worker, close_work);
 
-- 
cgit v1.2.3-70-g09d2


From a3bcc23e890a6d49d6763d9eb073d711de2e0469 Mon Sep 17 00:00:00 2001
From: Ben Greear <greearb@candelatech.com>
Date: Wed, 1 Jun 2011 06:49:10 +0000
Subject: af-packet: Add flag to distinguish VID 0 from no-vlan.

Currently, user-space cannot determine if a 0 tcp_vlan_tci
means there is no VLAN tag or the VLAN ID was zero.

Add flag to make this explicit.  User-space can check for
TP_STATUS_VLAN_VALID || tp_vlan_tci > 0, which will be backwards
compatible. Older could would have just checked for tp_vlan_tci,
so it will work no worse than before.

Signed-off-by: Ben Greear <greearb@candelatech.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_packet.h |  1 +
 net/packet/af_packet.c    | 15 ++++++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 72bfa5a034d..6d66ce1791a 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -70,6 +70,7 @@ struct tpacket_auxdata {
 #define TP_STATUS_COPY		0x2
 #define TP_STATUS_LOSING	0x4
 #define TP_STATUS_CSUMNOTREADY	0x8
+#define TP_STATUS_VLAN_VALID   0x10 /* auxdata has valid tp_vlan_tci */
 
 /* Tx ring - header status */
 #define TP_STATUS_AVAILABLE	0x0
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 925f715686a..ba248d93399 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -798,7 +798,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 			getnstimeofday(&ts);
 		h.h2->tp_sec = ts.tv_sec;
 		h.h2->tp_nsec = ts.tv_nsec;
-		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
+		if (vlan_tx_tag_present(skb)) {
+			h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
+			status |= TP_STATUS_VLAN_VALID;
+		} else {
+			h.h2->tp_vlan_tci = 0;
+		}
 		hdrlen = sizeof(*h.h2);
 		break;
 	default:
@@ -1725,8 +1730,12 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		aux.tp_snaplen = skb->len;
 		aux.tp_mac = 0;
 		aux.tp_net = skb_network_offset(skb);
-		aux.tp_vlan_tci = vlan_tx_tag_get(skb);
-
+		if (vlan_tx_tag_present(skb)) {
+			aux.tp_vlan_tci = vlan_tx_tag_get(skb);
+			aux.tp_status |= TP_STATUS_VLAN_VALID;
+		} else {
+			aux.tp_vlan_tci = 0;
+		}
 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From b722dbf176b67c75fe0f5a6b1b31f5ea8aa6117d Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Wed, 1 Jun 2011 07:10:10 +0000
Subject: drivers/net/davinci_emac.c: add missing clk_put

Go to existing error handling code at the end of the function that calls
clk_put.

A simplified version of the semantic match that finds this problem is as
follows: (http://coccinelle.lip6.fr/)

// <smpl>
@r exists@
expression e1,e2;
statement S;
@@

e1 = clk_get@p1(...);
... when != e1 = e2
    when != clk_put(e1)
    when any
if (...) { ... when != clk_put(e1)
               when != if (...) { ... clk_put(e1) ... }
* return@p3 ...;
 } else S
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Acked-by: Kevin Hilman <khilman@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/davinci_emac.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/davinci_emac.c b/drivers/net/davinci_emac.c
index 29a4f06fbfc..dcc4a170b0f 100644
--- a/drivers/net/davinci_emac.c
+++ b/drivers/net/davinci_emac.c
@@ -1781,8 +1781,8 @@ static int __devinit davinci_emac_probe(struct platform_device *pdev)
 	ndev = alloc_etherdev(sizeof(struct emac_priv));
 	if (!ndev) {
 		dev_err(&pdev->dev, "error allocating net_device\n");
-		clk_put(emac_clk);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto free_clk;
 	}
 
 	platform_set_drvdata(pdev, ndev);
@@ -1796,7 +1796,8 @@ static int __devinit davinci_emac_probe(struct platform_device *pdev)
 	pdata = pdev->dev.platform_data;
 	if (!pdata) {
 		dev_err(&pdev->dev, "no platform data\n");
-		return -ENODEV;
+		rc = -ENODEV;
+		goto probe_quit;
 	}
 
 	/* MAC addr and PHY mask , RMII enable info from platform_data */
@@ -1929,8 +1930,9 @@ no_dma:
 	iounmap(priv->remap_addr);
 
 probe_quit:
-	clk_put(emac_clk);
 	free_netdev(ndev);
+free_clk:
+	clk_put(emac_clk);
 	return rc;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 6979d5dd96a4a4975ce240982436e92a3da23315 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Wed, 1 Jun 2011 10:18:09 +0000
Subject: net: dm9000: Get the chip in a known good state before enabling
 interrupts

Currently the DM9000 driver requests the primary interrupt before it
resets the chip and puts it into a known good state. This means that if
the chip is asserting interrupt for some reason we can end up with a
screaming IRQ that the interrupt handler is unable to deal with. Avoid
this by only requesting the interrupt after we've reset the chip so we
know what state it's in.

This started manifesting itself on one of my boards in the past month or
so, I suspect as a result of some core infrastructure changes removing
some form of mitigation against bad behaviour here, even when things boot
it seems that the new code brings the interface up more quickly.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dm9000.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/dm9000.c b/drivers/net/dm9000.c
index fbaff3584bd..ee597e676ee 100644
--- a/drivers/net/dm9000.c
+++ b/drivers/net/dm9000.c
@@ -1157,9 +1157,6 @@ dm9000_open(struct net_device *dev)
 
 	irqflags |= IRQF_SHARED;
 
-	if (request_irq(dev->irq, dm9000_interrupt, irqflags, dev->name, dev))
-		return -EAGAIN;
-
 	/* GPIO0 on pre-activate PHY, Reg 1F is not set by reset */
 	iow(db, DM9000_GPR, 0);	/* REG_1F bit0 activate phyxcer */
 	mdelay(1); /* delay needs by DM9000B */
@@ -1168,6 +1165,9 @@ dm9000_open(struct net_device *dev)
 	dm9000_reset(db);
 	dm9000_init_dm9000(dev);
 
+	if (request_irq(dev->irq, dm9000_interrupt, irqflags, dev->name, dev))
+		return -EAGAIN;
+
 	/* Init driver variable */
 	db->dbug_cnt = 0;
 
-- 
cgit v1.2.3-70-g09d2


From a1b2cc50679c1d2eed44e2885f6178ce907498b7 Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Date: Tue, 31 May 2011 09:25:16 +0000
Subject: dmaengine: shdma: fix a regression: initialise DMA channels for
 memcpy

A recent patch has introduced a regression, where repeating a memcpy
DMA test with shdma module unloading between them skips the DMA channel
configuration. Fix this regression by always configuring the channel
during its allocation.

Signed-off-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/dma/shdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma/shdma.c b/drivers/dma/shdma.c
index 727e76ff13e..2a638f9f09a 100644
--- a/drivers/dma/shdma.c
+++ b/drivers/dma/shdma.c
@@ -343,7 +343,7 @@ static int sh_dmae_alloc_chan_resources(struct dma_chan *chan)
 
 		dmae_set_dmars(sh_chan, cfg->mid_rid);
 		dmae_set_chcr(sh_chan, cfg->chcr);
-	} else if ((sh_dmae_readl(sh_chan, CHCR) & 0xf00) != 0x400) {
+	} else {
 		dmae_init(sh_chan);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 816af7422f5bdbb74a0c9bb09735a9aeb9522c30 Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Date: Wed, 1 Jun 2011 07:32:07 +0000
Subject: ARM: mach-shmobile: add DMAC clock definitions on SH7372

These definitions are needed to let the runtime PM subsystem turn off
DMAC clocks, when it is suspended by the driver.

Signed-off-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/arm/mach-shmobile/clock-sh7372.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/arm/mach-shmobile/clock-sh7372.c b/arch/arm/mach-shmobile/clock-sh7372.c
index d17eb66f4ac..c0800d83971 100644
--- a/arch/arm/mach-shmobile/clock-sh7372.c
+++ b/arch/arm/mach-shmobile/clock-sh7372.c
@@ -509,6 +509,7 @@ enum { MSTP001,
        MSTP118, MSTP117, MSTP116, MSTP113,
        MSTP106, MSTP101, MSTP100,
        MSTP223,
+       MSTP218, MSTP217, MSTP216,
        MSTP207, MSTP206, MSTP204, MSTP203, MSTP202, MSTP201, MSTP200,
        MSTP329, MSTP328, MSTP323, MSTP322, MSTP314, MSTP313, MSTP312,
        MSTP423, MSTP415, MSTP413, MSTP411, MSTP410, MSTP406, MSTP403,
@@ -534,6 +535,9 @@ static struct clk mstp_clks[MSTP_NR] = {
 	[MSTP101] = MSTP(&div4_clks[DIV4_M1], SMSTPCR1, 1, 0), /* VPU */
 	[MSTP100] = MSTP(&div4_clks[DIV4_B], SMSTPCR1, 0, 0), /* LCDC0 */
 	[MSTP223] = MSTP(&div6_clks[DIV6_SPU], SMSTPCR2, 23, 0), /* SPU2 */
+	[MSTP218] = MSTP(&div4_clks[DIV4_HP], SMSTPCR2, 18, 0), /* DMAC1 */
+	[MSTP217] = MSTP(&div4_clks[DIV4_HP], SMSTPCR2, 17, 0), /* DMAC2 */
+	[MSTP216] = MSTP(&div4_clks[DIV4_HP], SMSTPCR2, 16, 0), /* DMAC3 */
 	[MSTP207] = MSTP(&div6_clks[DIV6_SUB], SMSTPCR2, 7, 0), /* SCIFA5 */
 	[MSTP206] = MSTP(&div6_clks[DIV6_SUB], SMSTPCR2, 6, 0), /* SCIFB */
 	[MSTP204] = MSTP(&div6_clks[DIV6_SUB], SMSTPCR2, 4, 0), /* SCIFA0 */
@@ -626,6 +630,9 @@ static struct clk_lookup lookups[] = {
 	CLKDEV_DEV_ID("sh_mobile_lcdc_fb.0", &mstp_clks[MSTP100]), /* LCDC0 */
 	CLKDEV_DEV_ID("uio_pdrv_genirq.6", &mstp_clks[MSTP223]), /* SPU2DSP0 */
 	CLKDEV_DEV_ID("uio_pdrv_genirq.7", &mstp_clks[MSTP223]), /* SPU2DSP1 */
+	CLKDEV_DEV_ID("sh-dma-engine.0", &mstp_clks[MSTP218]), /* DMAC1 */
+	CLKDEV_DEV_ID("sh-dma-engine.1", &mstp_clks[MSTP217]), /* DMAC2 */
+	CLKDEV_DEV_ID("sh-dma-engine.2", &mstp_clks[MSTP216]), /* DMAC3 */
 	CLKDEV_DEV_ID("sh-sci.5", &mstp_clks[MSTP207]), /* SCIFA5 */
 	CLKDEV_DEV_ID("sh-sci.6", &mstp_clks[MSTP206]), /* SCIFB */
 	CLKDEV_DEV_ID("sh-sci.0", &mstp_clks[MSTP204]), /* SCIFA0 */
-- 
cgit v1.2.3-70-g09d2


From 2e4ceec4edaef6e903422792de4f7f37de98cec6 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Wed, 1 Jun 2011 19:48:50 +0000
Subject: drivers/net/can/flexcan.c: add missing clk_put

The failed_get label is used after the call to clk_get has succeeded, so it
should be moved up above the call to clk_put.

The failed_req labels doesn't do anything different than failed_get, so
delete it.

A simplified version of the semantic match that finds this problem is as
follows: (http://coccinelle.lip6.fr/)

// <smpl>
@r exists@
expression e1,e2;
statement S;
@@

e1 = clk_get@p1(...);
... when != e1 = e2
    when != clk_put(e1)
    when any
if (...) { ... when != clk_put(e1)
               when != if (...) { ... clk_put(e1) ... }
* return@p3 ...;
 } else S
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/can/flexcan.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
index d4990568bae..17678117ed6 100644
--- a/drivers/net/can/flexcan.c
+++ b/drivers/net/can/flexcan.c
@@ -923,7 +923,7 @@ static int __devinit flexcan_probe(struct platform_device *pdev)
 	mem_size = resource_size(mem);
 	if (!request_mem_region(mem->start, mem_size, pdev->name)) {
 		err = -EBUSY;
-		goto failed_req;
+		goto failed_get;
 	}
 
 	base = ioremap(mem->start, mem_size);
@@ -977,9 +977,8 @@ static int __devinit flexcan_probe(struct platform_device *pdev)
 	iounmap(base);
  failed_map:
 	release_mem_region(mem->start, mem_size);
- failed_req:
-	clk_put(clk);
  failed_get:
+	clk_put(clk);
  failed_clock:
 	return err;
 }
-- 
cgit v1.2.3-70-g09d2


From e73e079bf128d68284efedeba1fbbc18d78610f9 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Wed, 25 May 2011 15:52:14 -0500
Subject: [SCSI] Fix oops caused by queue refcounting failure

In certain circumstances, we can get an oops from a torn down device.
Most notably this is from CD roms trying to call scsi_ioctl.  The root
cause of the problem is the fact that after scsi_remove_device() has
been called, the queue is fully torn down.  This is actually wrong
since the queue can be used until the sdev release function is called.
Therefore, we add an extra reference to the queue which is released in
sdev->release, so the queue always exists.

Reported-by: Parag Warudkar <parag.lkml@gmail.com>
Cc: stable@kernel.org
Signed-off-by: James Bottomley <jbottomley@parallels.com>
---
 drivers/scsi/scsi_scan.c  | 2 +-
 drivers/scsi/scsi_sysfs.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 58584dc0724..44e8ca398ef 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -297,7 +297,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 		kfree(sdev);
 		goto out;
 	}
-
+	blk_get_queue(sdev->request_queue);
 	sdev->request_queue->queuedata = sdev;
 	scsi_adjust_queue_depth(sdev, 0, sdev->host->cmd_per_lun);
 
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index e63912510fb..e0bd3f790fc 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -322,6 +322,7 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work)
 		kfree(evt);
 	}
 
+	blk_put_queue(sdev->request_queue);
 	/* NULL queue means the device can't be used */
 	sdev->request_queue = NULL;
 
-- 
cgit v1.2.3-70-g09d2


From 28304f485c3627cc5e1665b92e26eb7fcfe98088 Mon Sep 17 00:00:00 2001
From: Paul Bolle <pebolle@tiscali.nl>
Date: Thu, 2 Jun 2011 13:05:02 +0200
Subject: cfq-iosched: Remove bogus check in queue_fail path

queue_fail can only be reached if cic is NULL, so its check for cic must
be bogus.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 8a02c955c61..3c7b537bf90 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3786,9 +3786,6 @@ new_queue:
 	return 0;
 
 queue_fail:
-	if (cic)
-		put_io_context(cic->ioc);
-
 	cfq_schedule_dispatch(cfqd);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 	cfq_log(cfqd, "set_request fail");
-- 
cgit v1.2.3-70-g09d2


From e2bd9678fc0085acf540dc4cb48ff961cd4d88c0 Mon Sep 17 00:00:00 2001
From: Paul Bolle <pebolle@tiscali.nl>
Date: Thu, 2 Jun 2011 13:05:02 +0200
Subject: block: Use hlist_entry() for io_context.cic_list.first

list_entry() and hlist_entry() are both simply aliases for
container_of(), but since io_context.cic_list.first is an hlist_node one
should at least use the correct alias.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-ioc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index c898049dafd..342eae9b0d3 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -21,7 +21,7 @@ static void cfq_dtor(struct io_context *ioc)
 	if (!hlist_empty(&ioc->cic_list)) {
 		struct cfq_io_context *cic;
 
-		cic = list_entry(ioc->cic_list.first, struct cfq_io_context,
+		cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
 								cic_list);
 		cic->dtor(ioc);
 	}
@@ -57,7 +57,7 @@ static void cfq_exit(struct io_context *ioc)
 	if (!hlist_empty(&ioc->cic_list)) {
 		struct cfq_io_context *cic;
 
-		cic = list_entry(ioc->cic_list.first, struct cfq_io_context,
+		cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
 								cic_list);
 		cic->exit(ioc);
 	}
-- 
cgit v1.2.3-70-g09d2


From 4c8cc55b3c0ebe989e727017933945b68b4327cd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 1 Jun 2011 23:22:30 -0400
Subject: ktest: Fix off-by-one in config bisect result

Because in perl the array size returned by $#arr, is the last
index and not the actually size of the array, we end the config
bisect early, thinking there is only one config left when there
are in fact two. Thus the result has a 50% chance of picking
the correct config that caused the problem.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/testing/ktest/ktest.pl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 1fd29b2daa9..8dc8c3cf3ac 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1638,7 +1638,7 @@ sub run_config_bisect {
 	if (!$found) {
 	    # try the other half
 	    doprint "Top half produced no set configs, trying bottom half\n";
-	    @tophalf = @start_list[$half .. $#start_list];
+	    @tophalf = @start_list[$half + 1 .. $#start_list];
 	    create_config @tophalf;
 	    read_current_config \%current_config;
 	    foreach my $config (@tophalf) {
@@ -1690,7 +1690,7 @@ sub run_config_bisect {
 	# remove half the configs we are looking at and see if
 	# they are good.
 	$half = int($#start_list / 2);
-    } while ($half > 0);
+    } while ($#start_list > 0);
 
     # we found a single config, try it again unless we are running manually
 
-- 
cgit v1.2.3-70-g09d2


From 4da46da2d295c0d9f4aaf28dd2b70a1ecb42d972 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 1 Jun 2011 23:25:13 -0400
Subject: ktest: Fix result of rebooting the kernel

The command that is called that reboots the kernel may fail
but the return code is not passed back to the ktest.pl script.
This is because a ';' is used between the two commands and
if the second command fails, only the first command's return
code is returned. Using a '&&' between the two commands fixes
this.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/testing/ktest/ktest.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 8dc8c3cf3ac..6c68259e730 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -788,7 +788,7 @@ sub wait_for_input
 
 sub reboot_to {
     if ($reboot_type eq "grub") {
-	run_ssh "'(echo \"savedefault --default=$grub_number --once\" | grub --batch; reboot)'";
+	run_ssh "'(echo \"savedefault --default=$grub_number --once\" | grub --batch && reboot)'";
 	return;
     }
 
-- 
cgit v1.2.3-70-g09d2


From 9bf7174949aef2f43253956e1f3ab01698abbd79 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 1 Jun 2011 23:27:19 -0400
Subject: ktest: Ignore unset values of the minconfig in config_bisect

By ignoring the unset values of the minconfig in deciding
what to test in the config_bisect can cause the problem
config from being tested too.

Just do not test the configs that are set in the minconfig.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 tools/testing/ktest/ktest.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 6c68259e730..cef28e6632b 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1480,7 +1480,7 @@ sub process_config_ignore {
 	or dodie "Failed to read $config";
 
     while (<IN>) {
-	if (/^(.*?(CONFIG\S*)(=.*| is not set))/) {
+	if (/^((CONFIG\S*)=.*)/) {
 	    $config_ignore{$2} = $1;
 	}
     }
-- 
cgit v1.2.3-70-g09d2


From bf0be0e951cf1c4c9ce38032195cd8095a16d828 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Mon, 30 May 2011 12:49:01 +0200
Subject: ALSA: 6fire: Don't leak firmware in error path

One of the error paths in
sound/usb/6fire/firmware.c::usb6fire_fw_ezusb_upload() neglects to free
the memory allocated for the firmware before returning, thus leaking the
memory.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/6fire/firmware.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/usb/6fire/firmware.c b/sound/usb/6fire/firmware.c
index d47beffedb0..a91719d5918 100644
--- a/sound/usb/6fire/firmware.c
+++ b/sound/usb/6fire/firmware.c
@@ -227,6 +227,7 @@ static int usb6fire_fw_ezusb_upload(
 	ret = usb6fire_fw_ihex_init(fw, rec);
 	if (ret < 0) {
 		kfree(rec);
+		release_firmware(fw);
 		snd_printk(KERN_ERR PREFIX "error validating ezusb "
 				"firmware %s.\n", fwname);
 		return ret;
-- 
cgit v1.2.3-70-g09d2


From b36a968927b789b2dd8de0aaf7a72ef7c1f0d012 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Thu, 2 Jun 2011 14:21:45 -0400
Subject: asm-generic/unistd.h: support sendmmsg syscall

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
---
 include/asm-generic/unistd.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index ae90e0f6399..4f76959397f 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -683,9 +683,11 @@ __SC_COMP(__NR_clock_adjtime, sys_clock_adjtime, compat_sys_clock_adjtime)
 __SYSCALL(__NR_syncfs, sys_syncfs)
 #define __NR_setns 268
 __SYSCALL(__NR_setns, sys_setns)
+#define __NR_sendmmsg 269
+__SC_COMP(__NR_sendmmsg, sys_sendmmsg, compat_sys_sendmmsg)
 
 #undef __NR_syscalls
-#define __NR_syscalls 269
+#define __NR_syscalls 270
 
 /*
  * All syscalls below here should go away really,
-- 
cgit v1.2.3-70-g09d2


From ec764bf083a6ff396234351b51fd236f53c903bf Mon Sep 17 00:00:00 2001
From: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
Date: Mon, 30 May 2011 21:48:34 +0000
Subject: net: tracepoint of net_dev_xmit sees freed skb and causes panic

Because there is a possibility that skb is kfree_skb()ed and zero cleared
after ndo_start_xmit, we should not see the contents of skb like skb->len and
skb->dev->name after ndo_start_xmit. But trace_net_dev_xmit does that
and causes panic by NULL pointer dereference.
This patch fixes trace_net_dev_xmit not to see the contents of skb directly.

If you want to reproduce this panic,

1. Get tracepoint of net_dev_xmit on
2. Create 2 guests on KVM
2. Make 2 guests use virtio_net
4. Execute netperf from one to another for a long time as a network burden
5. host will panic(It takes about 30 minutes)

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/net.h | 12 +++++++-----
 net/core/dev.c             |  7 +++++--
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 5f247f5ffc5..f99645d05a8 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -12,22 +12,24 @@
 TRACE_EVENT(net_dev_xmit,
 
 	TP_PROTO(struct sk_buff *skb,
-		 int rc),
+		 int rc,
+		 struct net_device *dev,
+		 unsigned int skb_len),
 
-	TP_ARGS(skb, rc),
+	TP_ARGS(skb, rc, dev, skb_len),
 
 	TP_STRUCT__entry(
 		__field(	void *,		skbaddr		)
 		__field(	unsigned int,	len		)
 		__field(	int,		rc		)
-		__string(	name,		skb->dev->name	)
+		__string(	name,		dev->name	)
 	),
 
 	TP_fast_assign(
 		__entry->skbaddr = skb;
-		__entry->len = skb->len;
+		__entry->len = skb_len;
 		__entry->rc = rc;
-		__assign_str(name, skb->dev->name);
+		__assign_str(name, dev->name);
 	),
 
 	TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
diff --git a/net/core/dev.c b/net/core/dev.c
index c7e305d13b7..939307891e7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2096,6 +2096,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	int rc = NETDEV_TX_OK;
+	unsigned int skb_len;
 
 	if (likely(!skb->next)) {
 		u32 features;
@@ -2146,8 +2147,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			}
 		}
 
+		skb_len = skb->len;
 		rc = ops->ndo_start_xmit(skb, dev);
-		trace_net_dev_xmit(skb, rc);
+		trace_net_dev_xmit(skb, rc, dev, skb_len);
 		if (rc == NETDEV_TX_OK)
 			txq_trans_update(txq);
 		return rc;
@@ -2167,8 +2169,9 @@ gso:
 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
 			skb_dst_drop(nskb);
 
+		skb_len = nskb->len;
 		rc = ops->ndo_start_xmit(nskb, dev);
-		trace_net_dev_xmit(nskb, rc);
+		trace_net_dev_xmit(nskb, rc, dev, skb_len);
 		if (unlikely(rc != NETDEV_TX_OK)) {
 			if (rc & ~NETDEV_TX_MASK)
 				goto out_kfree_gso_skb;
-- 
cgit v1.2.3-70-g09d2


From 9a2e0fb0893ddf595d0a372e681f5b98017c6d90 Mon Sep 17 00:00:00 2001
From: Matt Carlson <mcarlson@broadcom.com>
Date: Thu, 2 Jun 2011 13:01:39 +0000
Subject: tg3: Fix tg3_skb_error_unmap()

This function attempts to free one fragment beyond the number of
fragments that were actually mapped.  This patch brings back the limit
to the correct spot.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Tested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tg3.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index f4b01c638a3..a1f9f9eef37 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -5774,7 +5774,7 @@ static void tg3_skb_error_unmap(struct tg3_napi *tnapi,
 			 dma_unmap_addr(txb, mapping),
 			 skb_headlen(skb),
 			 PCI_DMA_TODEVICE);
-	for (i = 0; i <= last; i++) {
+	for (i = 0; i < last; i++) {
 		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
 		entry = NEXT_TX(entry);
-- 
cgit v1.2.3-70-g09d2


From 4dffbe03d1e940aba878f9420e67feb8423cdd08 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 3 Jun 2011 10:05:02 +0200
Subject: ALSA: hda - Fix HP and Front pins of ad1988/ad1989 in
 ad198x_power_eapd()

In ad198x_power_eapd(), wrong pin NIDs are used for controlling EAPD for
HP and Front outputs of AD1988/AD1989.  These are actually same with the
ones for AD1984 & co, port-A is 0x11 and port-D 0x12.

Reported-by: Raymond Yau <superquad.vortex2@gmail.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_analog.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/sound/pci/hda/patch_analog.c b/sound/pci/hda/patch_analog.c
index 696ac259030..82c4b2f5359 100644
--- a/sound/pci/hda/patch_analog.c
+++ b/sound/pci/hda/patch_analog.c
@@ -524,6 +524,10 @@ static void ad198x_power_eapd(struct hda_codec *codec)
 	case 0x11d4184a:
 	case 0x11d4194a:
 	case 0x11d4194b:
+	case 0x11d41988:
+	case 0x11d4198b:
+	case 0x11d4989a:
+	case 0x11d4989b:
 		ad198x_power_eapd_write(codec, 0x12, 0x11);
 		break;
 	case 0x11d41981:
@@ -533,12 +537,6 @@ static void ad198x_power_eapd(struct hda_codec *codec)
 	case 0x11d41986:
 		ad198x_power_eapd_write(codec, 0x1b, 0x1a);
 		break;
-	case 0x11d41988:
-	case 0x11d4198b:
-	case 0x11d4989a:
-	case 0x11d4989b:
-		ad198x_power_eapd_write(codec, 0x29, 0x22);
-		break;
 	}
 }
 
-- 
cgit v1.2.3-70-g09d2


From a01ef051d584c12ede5cd5275b008b2ded57f3d9 Mon Sep 17 00:00:00 2001
From: Raymond Yau <superquad.vortex2@gmail.com>
Date: Wed, 1 Jun 2011 15:09:48 +0800
Subject: ALSA: hda - Check pin support EAPD in ad198x_power_eapd_write

Check whether the pin supports EAPD in ad198x_power_eapd_write.

Signed-off-by: Raymond Yau <superquad.vortex2@gmail.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_analog.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sound/pci/hda/patch_analog.c b/sound/pci/hda/patch_analog.c
index 82c4b2f5359..d694e9d4921 100644
--- a/sound/pci/hda/patch_analog.c
+++ b/sound/pci/hda/patch_analog.c
@@ -506,9 +506,11 @@ static void ad198x_power_eapd_write(struct hda_codec *codec, hda_nid_t front,
 				hda_nid_t hp)
 {
 	struct ad198x_spec *spec = codec->spec;
-	snd_hda_codec_write(codec, front, 0, AC_VERB_SET_EAPD_BTLENABLE,
+	if (snd_hda_query_pin_caps(codec, front) & AC_PINCAP_EAPD)
+		snd_hda_codec_write(codec, front, 0, AC_VERB_SET_EAPD_BTLENABLE,
 			    !spec->inv_eapd ? 0x00 : 0x02);
-	snd_hda_codec_write(codec, hp, 0, AC_VERB_SET_EAPD_BTLENABLE,
+	if (snd_hda_query_pin_caps(codec, hp) & AC_PINCAP_EAPD)
+		snd_hda_codec_write(codec, hp, 0, AC_VERB_SET_EAPD_BTLENABLE,
 			    !spec->inv_eapd ? 0x00 : 0x02);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 9676001559fce06e37c7dc230ab275f605556176 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 26 May 2011 11:47:35 +0300
Subject: ALSA: fm801: add error handling if auto-detect fails

In the original code if auto detect failed and tea575x_tuner == 4
then we copy bogus information to chip->tea.card.  I've changed the
autodetect code to cleanup and return -ENODEV on error instead.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/fm801.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/sound/pci/fm801.c b/sound/pci/fm801.c
index eacd4901a30..a7ec7030cf8 100644
--- a/sound/pci/fm801.c
+++ b/sound/pci/fm801.c
@@ -1234,9 +1234,12 @@ static int __devinit snd_fm801_create(struct snd_card *card,
 	sprintf(chip->tea.bus_info, "PCI:%s", pci_name(pci));
 	if ((tea575x_tuner & TUNER_TYPE_MASK) > 0 &&
 	    (tea575x_tuner & TUNER_TYPE_MASK) < 4) {
-		if (snd_tea575x_init(&chip->tea))
+		if (snd_tea575x_init(&chip->tea)) {
 			snd_printk(KERN_ERR "TEA575x radio not found\n");
-	} else if ((tea575x_tuner & TUNER_TYPE_MASK) == 0)
+			snd_fm801_free(chip);
+			return -ENODEV;
+		}
+	} else if ((tea575x_tuner & TUNER_TYPE_MASK) == 0) {
 		/* autodetect tuner connection */
 		for (tea575x_tuner = 1; tea575x_tuner <= 3; tea575x_tuner++) {
 			chip->tea575x_tuner = tea575x_tuner;
@@ -1246,6 +1249,12 @@ static int __devinit snd_fm801_create(struct snd_card *card,
 				break;
 			}
 		}
+		if (tea575x_tuner == 4) {
+			snd_printk(KERN_ERR "TEA575x radio not found\n");
+			snd_fm801_free(chip);
+			return -ENODEV;
+		}
+	}
 	strlcpy(chip->tea.card, snd_fm801_tea575x_gpios[(tea575x_tuner & TUNER_TYPE_MASK) - 1].name, sizeof(chip->tea.card));
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From d50a2fb63643dce8506520dab5ffb8f49cc45cb2 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Fri, 3 Jun 2011 02:28:49 -0700
Subject: ALSA: asihpi: Use angle brackets for system includes

Use the normal include style.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/asihpi/hpidspcd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/pci/asihpi/hpidspcd.c b/sound/pci/asihpi/hpidspcd.c
index fb311d8c05b..5c6ea113d21 100644
--- a/sound/pci/asihpi/hpidspcd.c
+++ b/sound/pci/asihpi/hpidspcd.c
@@ -60,7 +60,7 @@ struct code_header {
 	    HPI_VER_MINOR(HPI_VER) * 100 + HPI_VER_RELEASE(HPI_VER)))
 
 /***********************************************************************/
-#include "linux/pci.h"
+#include <linux/pci.h>
 /*-------------------------------------------------------------------*/
 short hpi_dsp_code_open(u32 adapter, struct dsp_code *ps_dsp_code,
 	u32 *pos_error_code)
-- 
cgit v1.2.3-70-g09d2


From 5ff6197f828d5ea051b3abf77cb61f8a34480e8d Mon Sep 17 00:00:00 2001
From: Steven Miao <realmz6@gmail.com>
Date: Wed, 1 Jun 2011 15:52:41 +0800
Subject: Blackfin: strncpy: fix handling of zero lengths

The jump to 4f will cause the NUL padding loop to run at least one time,
so if string length is zero just jump to the end.  Otherwise we wrongly
write one NUL byte when size==0.

Signed-off-by: Steven Miao <realmz6@gmail.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 arch/blackfin/lib/strncpy.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/blackfin/lib/strncpy.S b/arch/blackfin/lib/strncpy.S
index f3931d50b4a..2c07dddac99 100644
--- a/arch/blackfin/lib/strncpy.S
+++ b/arch/blackfin/lib/strncpy.S
@@ -25,7 +25,7 @@
 
 ENTRY(_strncpy)
 	CC = R2 == 0;
-	if CC JUMP 4f;
+	if CC JUMP 6f;
 
 	P2 = R2 ;       /* size */
 	P0 = R0 ;       /* dst*/
-- 
cgit v1.2.3-70-g09d2


From cf610bf4199770420629d3bc273494bd27ad6c1d Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 31 May 2011 07:03:21 +0300
Subject: UBIFS: fix shrinker object count reports

Sometimes VM asks the shrinker to return amount of objects it can shrink,
and we return the ubifs_clean_zn_cnt in that case. However, it is possible
that this counter is negative for a short period of time, due to the way
UBIFS TNC code updates it. And I can observe the following warnings sometimes:

shrink_slab: ubifs_shrinker+0x0/0x2b7 [ubifs] negative objects to delete nr=-8541616642706119788

This patch makes sure UBIFS never returns negative count of objects.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Cc: stable@kernel.org
---
 fs/ubifs/shrinker.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index ca953a94502..9e1d05666fe 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -284,7 +284,11 @@ int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
 	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
 
 	if (nr == 0)
-		return clean_zn_cnt;
+		/*
+		 * Due to the way UBIFS updates the clean znode counter it may
+		 * temporarily be negative.
+		 */
+		return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
 
 	if (!clean_zn_cnt) {
 		/*
-- 
cgit v1.2.3-70-g09d2


From 812eb258311f89bcd664a34a620f249d54a2cd83 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 31 May 2011 08:40:40 +0300
Subject: UBIFS: fix memory leak on error path

UBIFS leaks memory on error path in 'ubifs_jnl_update()' in case of write
failure because it forgets to free the 'struct ubifs_dent_node *dent' object.
Although the object is small, the alignment can make it large - e.g., 2KiB
if the min. I/O unit is 2KiB.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Cc: stable@kernel.org
---
 fs/ubifs/journal.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 34b1679e6e3..cef0460f4c5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -669,6 +669,7 @@ out_free:
 
 out_release:
 	release_head(c, BASEHD);
+	kfree(dent);
 out_ro:
 	ubifs_ro_mode(c, err);
 	if (last_reference)
-- 
cgit v1.2.3-70-g09d2


From 837072377034d0a0b18b851d1ab95676b245cc0a Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 31 May 2011 14:26:07 +0300
Subject: UBIFS: fix clean znode counter corruption in error cases

UBIFS maintains per-filesystem and global clean znode counters
('c->clean_zn_cnt' and 'ubifs_clean_zn_cnt'). It is important to maintain
correct values there since the shrinker relies on 'ubifs_clean_zn_cnt'.

However, in case of failures during commit the counters were corrupted. E.g.,
if a failure happens in the middle of 'write_index()', then some nodes in the
commit list ('c->cnext') are marked as clean, and some are marked as dirty. And
the 'ubifs_destroy_tnc_subtree()' frees does not retrun correct count, and we
end up with non-zero 'c->clean_zn_cnt' when unmounting. This means that if we
have 2 file-sytem and one of them fails, and we unmount it,
'ubifs_clean_zn_cnt' stays incorrect and confuses the shrinker.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/tnc.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 8119b1fd8d9..91b4213dde8 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2876,12 +2876,13 @@ static void tnc_destroy_cnext(struct ubifs_info *c)
  */
 void ubifs_tnc_close(struct ubifs_info *c)
 {
-	long clean_freed;
-
 	tnc_destroy_cnext(c);
 	if (c->zroot.znode) {
-		clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode);
-		atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt);
+		long n;
+
+		ubifs_destroy_tnc_subtree(c->zroot.znode);
+		n = atomic_long_read(&c->clean_zn_cnt);
+		atomic_long_sub(n, &ubifs_clean_zn_cnt);
 	}
 	kfree(c->gap_lebs);
 	kfree(c->ilebs);
-- 
cgit v1.2.3-70-g09d2


From 4f1ab9b01d34eac9fc958f7150d3bf266dcc1685 Mon Sep 17 00:00:00 2001
From: Ben Gardiner <bengardiner@nanometrics.ca>
Date: Mon, 30 May 2011 14:56:14 -0400
Subject: UBIFS: assert no fixup when writing a node

The current free space fixup can result in some writing to the UBI volume
when the space_fixup flag is set.

To catch instances where UBIFS is writing to the NAND while the space_fixup
flag is set, add an assert to ubifs_write_node().

Artem: tweaked the patch, added similar assertion to the write buffer
       write path.

Signed-off-by: Ben Gardiner <bengardiner@nanometrics.ca>
Reviewed-by: Matthew L. Creech <mlcreech@gmail.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 166951e0dcd..3be645e012c 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -581,6 +581,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	ubifs_assert(wbuf->size % c->min_io_size == 0);
 	ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
 	ubifs_assert(!c->ro_media && !c->ro_mount);
+	ubifs_assert(!c->space_fixup);
 	if (c->leb_size - wbuf->offs >= c->max_write_size)
 		ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
 
@@ -759,6 +760,7 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
 	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
 	ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
 	ubifs_assert(!c->ro_media && !c->ro_mount);
+	ubifs_assert(!c->space_fixup);
 
 	if (c->ro_error)
 		return -EROFS;
-- 
cgit v1.2.3-70-g09d2


From 781c5717a95a74b294beb38b8276943b0f8b5bb4 Mon Sep 17 00:00:00 2001
From: Ben Gardiner <bengardiner@nanometrics.ca>
Date: Mon, 30 May 2011 14:56:15 -0400
Subject: UBIFS: intialize LPT earlier

The current 'mount_ubifs()' implementation does not initialize the LPT until the
the master node is marked dirty. Move the LPT initialization to before marking
the master node dirty. This is a preparation for the next patch which will move
the free-space-fixup check to before marking the master node dirty, because we
have to fix-up the free space before doing any writes.

Artem: massaged the patch and commit message.

Signed-off-by: Ben Gardiner <bengardiner@nanometrics.ca>
Reviewed-by: Matthew L. Creech <mlcreech@gmail.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1e40db740da..6d357fd9c28 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1282,17 +1282,24 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (err)
 		goto out_master;
 
-	init_constants_master(c);
-
 	if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
 		ubifs_msg("recovery needed");
 		c->need_recovery = 1;
-		if (!c->ro_mount) {
-			err = ubifs_recover_inl_heads(c, c->sbuf);
-			if (err)
-				goto out_master;
-		}
-	} else if (!c->ro_mount) {
+	}
+
+	init_constants_master(c);
+
+	if (c->need_recovery && !c->ro_mount) {
+		err = ubifs_recover_inl_heads(c, c->sbuf);
+		if (err)
+			goto out_master;
+	}
+
+	err = ubifs_lpt_init(c, 1, !c->ro_mount);
+	if (err)
+		goto out_master;
+
+	if (!c->ro_mount) {
 		/*
 		 * Set the "dirty" flag so that if we reboot uncleanly we
 		 * will notice this immediately on the next mount.
@@ -1300,13 +1307,9 @@ static int mount_ubifs(struct ubifs_info *c)
 		c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
 		err = ubifs_write_master(c);
 		if (err)
-			goto out_master;
+			goto out_lpt;
 	}
 
-	err = ubifs_lpt_init(c, 1, !c->ro_mount);
-	if (err)
-		goto out_lpt;
-
 	err = dbg_check_idx_size(c, c->bi.old_idx_sz);
 	if (err)
 		goto out_lpt;
-- 
cgit v1.2.3-70-g09d2


From 098011940a2549ae7182db4bf101c3e3d2b4e6df Mon Sep 17 00:00:00 2001
From: Ben Gardiner <bengardiner@nanometrics.ca>
Date: Mon, 30 May 2011 14:56:16 -0400
Subject: UBIFS: fix-up free space earlier

The free space fixup is currently initiated during mount after the call to
ubifs_write_master() which results in a write to PEBs; this has been observed
with the patch 'assert no fixup when writing a node' applied:

Move the free space fixup on mount to before the calls to
ubifs_recover_inl_heads() and ubifs_write_master(). This results in no
assertions with the previously mentioned patch applied.

Artem: tweaked the patch a bit

Signed-off-by: Ben Gardiner <bengardiner@nanometrics>
Reviewed-by: Matthew L. Creech <mlcreech@gmail.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6d357fd9c28..b5aeb5a8ebe 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1282,13 +1282,13 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (err)
 		goto out_master;
 
+	init_constants_master(c);
+
 	if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
 		ubifs_msg("recovery needed");
 		c->need_recovery = 1;
 	}
 
-	init_constants_master(c);
-
 	if (c->need_recovery && !c->ro_mount) {
 		err = ubifs_recover_inl_heads(c, c->sbuf);
 		if (err)
@@ -1299,6 +1299,12 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (err)
 		goto out_master;
 
+	if (!c->ro_mount && c->space_fixup) {
+		err = ubifs_fixup_free_space(c);
+		if (err)
+			goto out_master;
+	}
+
 	if (!c->ro_mount) {
 		/*
 		 * Set the "dirty" flag so that if we reboot uncleanly we
@@ -1402,12 +1408,6 @@ static int mount_ubifs(struct ubifs_info *c)
 	} else
 		ubifs_assert(c->lst.taken_empty_lebs > 0);
 
-	if (!c->ro_mount && c->space_fixup) {
-		err = ubifs_fixup_free_space(c);
-		if (err)
-			goto out_infos;
-	}
-
 	err = dbg_check_filesystem(c);
 	if (err)
 		goto out_infos;
-- 
cgit v1.2.3-70-g09d2


From 157186bc185cdf588fecba0fc5d7466e3e5d49d7 Mon Sep 17 00:00:00 2001
From: Eric Lammerts <alsa-devel@lists.lammerts.org>
Date: Fri, 27 May 2011 18:16:52 -0400
Subject: ALSA: usb - turn off de-emphasis in s/pdif for cm6206

CM6206: Turn off de-emphasis channel status bit in S/PDIF output.

Signed-off-by: Eric Lammerts <eric@lammerts.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/quirks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index 2e969cbb393..090e1930dfd 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -403,7 +403,7 @@ static int snd_usb_cm106_boot_quirk(struct usb_device *dev)
 static int snd_usb_cm6206_boot_quirk(struct usb_device *dev)
 {
 	int err, reg;
-	int val[] = {0x200c, 0x3000, 0xf800, 0x143f, 0x0000, 0x3000};
+	int val[] = {0x2004, 0x3000, 0xf800, 0x143f, 0x0000, 0x3000};
 
 	for (reg = 0; reg < ARRAY_SIZE(val); reg++) {
 		err = snd_usb_cm106_write_int_reg(dev, reg, val[reg]);
-- 
cgit v1.2.3-70-g09d2


From 55db4c64eddf37e31279ec15fe90314713bc9cfa Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 4 Jun 2011 06:33:24 +0900
Subject: Revert "tty: make receive_buf() return the amout of bytes received"

This reverts commit b1c43f82c5aa265442f82dba31ce985ebb7aa71c.

It was broken in so many ways, and results in random odd pty issues.

It re-introduced the buggy schedule_work() in flush_to_ldisc() that can
cause endless work-loops (see commit a5660b41af6a: "tty: fix endless
work loop when the buffer fills up").

It also used an "unsigned int" return value fo the ->receive_buf()
function, but then made multiple functions return a negative error code,
and didn't actually check for the error in the caller.

And it didn't actually work at all.  BenH bisected down odd tty behavior
to it:
  "It looks like the patch is causing some major malfunctions of the X
   server for me, possibly related to PTYs.  For example, cat'ing a
   large file in a gnome terminal hangs the kernel for -minutes- in a
   loop of what looks like flush_to_ldisc/workqueue code, (some ftrace
   data in the quoted bits further down).

   ...

   Some more data: It -looks- like what happens is that the
   flush_to_ldisc work queue entry constantly re-queues itself (because
   the PTY is full ?) and the workqueue thread will basically loop
   forver calling it without ever scheduling, thus starving the consumer
   process that could have emptied the PTY."

which is pretty much exactly the problem we fixed in a5660b41af6a.

Milton Miller pointed out the 'unsigned int' issue.

Reported-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Reported-by: Milton Miller <miltonm@bga.com>
Cc: Stefan Bigler <stefan.bigler@keymile.com>
Cc: Toby Gray <toby.gray@realvnc.com>
Cc: Felipe Balbi <balbi@ti.com>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/bluetooth/hci_ldisc.c      | 17 ++++-------
 drivers/input/serio/serport.c      | 10 ++-----
 drivers/isdn/gigaset/ser-gigaset.c |  8 ++---
 drivers/misc/ti-st/st_core.c       |  6 ++--
 drivers/net/caif/caif_serial.c     |  6 ++--
 drivers/net/can/slcan.c            |  9 ++----
 drivers/net/hamradio/6pack.c       |  8 ++---
 drivers/net/hamradio/mkiss.c       | 11 +++----
 drivers/net/irda/irtty-sir.c       | 16 +++++-----
 drivers/net/ppp_async.c            |  6 ++--
 drivers/net/ppp_synctty.c          |  6 ++--
 drivers/net/slip.c                 | 11 +++----
 drivers/net/wan/x25_asy.c          |  7 ++---
 drivers/tty/n_gsm.c                |  6 ++--
 drivers/tty/n_hdlc.c               | 18 +++++------
 drivers/tty/n_r3964.c              | 10 +++----
 drivers/tty/n_tty.c                | 61 +++++++++++++++++++++++++++++---------
 drivers/tty/tty_buffer.c           | 15 ++++------
 drivers/tty/vt/selection.c         |  3 +-
 include/linux/tty_ldisc.h          |  9 +++---
 20 files changed, 115 insertions(+), 128 deletions(-)

diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index b3f01996318..48ad2a7ab08 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -355,29 +355,24 @@ static void hci_uart_tty_wakeup(struct tty_struct *tty)
  *             flags        pointer to flags for data
  *             count        count of received data in bytes
  *     
- * Return Value:    Number of bytes received
+ * Return Value:    None
  */
-static unsigned int hci_uart_tty_receive(struct tty_struct *tty,
-		const u8 *data, char *flags, int count)
+static void hci_uart_tty_receive(struct tty_struct *tty, const u8 *data, char *flags, int count)
 {
 	struct hci_uart *hu = (void *)tty->disc_data;
-	int received;
 
 	if (!hu || tty != hu->tty)
-		return -ENODEV;
+		return;
 
 	if (!test_bit(HCI_UART_PROTO_SET, &hu->flags))
-		return -EINVAL;
+		return;
 
 	spin_lock(&hu->rx_lock);
-	received = hu->proto->recv(hu, (void *) data, count);
-	if (received > 0)
-		hu->hdev->stat.byte_rx += received;
+	hu->proto->recv(hu, (void *) data, count);
+	hu->hdev->stat.byte_rx += count;
 	spin_unlock(&hu->rx_lock);
 
 	tty_unthrottle(tty);
-
-	return received;
 }
 
 static int hci_uart_register_dev(struct hci_uart *hu)
diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c
index f3698967edf..8755f5f3ad3 100644
--- a/drivers/input/serio/serport.c
+++ b/drivers/input/serio/serport.c
@@ -120,21 +120,17 @@ static void serport_ldisc_close(struct tty_struct *tty)
  * 'interrupt' routine.
  */
 
-static unsigned int serport_ldisc_receive(struct tty_struct *tty,
-		const unsigned char *cp, char *fp, int count)
+static void serport_ldisc_receive(struct tty_struct *tty, const unsigned char *cp, char *fp, int count)
 {
 	struct serport *serport = (struct serport*) tty->disc_data;
 	unsigned long flags;
 	unsigned int ch_flags;
-	int ret = 0;
 	int i;
 
 	spin_lock_irqsave(&serport->lock, flags);
 
-	if (!test_bit(SERPORT_ACTIVE, &serport->flags)) {
-		ret = -EINVAL;
+	if (!test_bit(SERPORT_ACTIVE, &serport->flags))
 		goto out;
-	}
 
 	for (i = 0; i < count; i++) {
 		switch (fp[i]) {
@@ -156,8 +152,6 @@ static unsigned int serport_ldisc_receive(struct tty_struct *tty,
 
 out:
 	spin_unlock_irqrestore(&serport->lock, flags);
-
-	return ret == 0 ? count : ret;
 }
 
 /*
diff --git a/drivers/isdn/gigaset/ser-gigaset.c b/drivers/isdn/gigaset/ser-gigaset.c
index 1d44d470897..86a5c4f7775 100644
--- a/drivers/isdn/gigaset/ser-gigaset.c
+++ b/drivers/isdn/gigaset/ser-gigaset.c
@@ -674,7 +674,7 @@ gigaset_tty_ioctl(struct tty_struct *tty, struct file *file,
  *	cflags	buffer containing error flags for received characters (ignored)
  *	count	number of received characters
  */
-static unsigned int
+static void
 gigaset_tty_receive(struct tty_struct *tty, const unsigned char *buf,
 		    char *cflags, int count)
 {
@@ -683,12 +683,12 @@ gigaset_tty_receive(struct tty_struct *tty, const unsigned char *buf,
 	struct inbuf_t *inbuf;
 
 	if (!cs)
-		return -ENODEV;
+		return;
 	inbuf = cs->inbuf;
 	if (!inbuf) {
 		dev_err(cs->dev, "%s: no inbuf\n", __func__);
 		cs_put(cs);
-		return -EINVAL;
+		return;
 	}
 
 	tail = inbuf->tail;
@@ -725,8 +725,6 @@ gigaset_tty_receive(struct tty_struct *tty, const unsigned char *buf,
 	gig_dbg(DEBUG_INTR, "%s-->BH", __func__);
 	gigaset_schedule_event(cs);
 	cs_put(cs);
-
-	return count;
 }
 
 /*
diff --git a/drivers/misc/ti-st/st_core.c b/drivers/misc/ti-st/st_core.c
index 1a05fe08e2c..f91f82eabda 100644
--- a/drivers/misc/ti-st/st_core.c
+++ b/drivers/misc/ti-st/st_core.c
@@ -747,8 +747,8 @@ static void st_tty_close(struct tty_struct *tty)
 	pr_debug("%s: done ", __func__);
 }
 
-static unsigned int st_tty_receive(struct tty_struct *tty,
-		const unsigned char *data, char *tty_flags, int count)
+static void st_tty_receive(struct tty_struct *tty, const unsigned char *data,
+			   char *tty_flags, int count)
 {
 #ifdef VERBOSE
 	print_hex_dump(KERN_DEBUG, ">in>", DUMP_PREFIX_NONE,
@@ -761,8 +761,6 @@ static unsigned int st_tty_receive(struct tty_struct *tty,
 	 */
 	st_recv(tty->disc_data, data, count);
 	pr_debug("done %s", __func__);
-
-	return count;
 }
 
 /* wake-up function called in from the TTY layer
diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index 73c7e03617e..3df0c0f8b8b 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -167,8 +167,8 @@ static inline void debugfs_tx(struct ser_device *ser, const u8 *data, int size)
 
 #endif
 
-static unsigned int ldisc_receive(struct tty_struct *tty,
-		const u8 *data, char *flags, int count)
+static void ldisc_receive(struct tty_struct *tty, const u8 *data,
+			char *flags, int count)
 {
 	struct sk_buff *skb = NULL;
 	struct ser_device *ser;
@@ -215,8 +215,6 @@ static unsigned int ldisc_receive(struct tty_struct *tty,
 	} else
 		++ser->dev->stats.rx_dropped;
 	update_tty_status(ser);
-
-	return count;
 }
 
 static int handle_tx(struct ser_device *ser)
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index 75622d54581..1b49df6b247 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -425,17 +425,16 @@ static void slc_setup(struct net_device *dev)
  * in parallel
  */
 
-static unsigned int slcan_receive_buf(struct tty_struct *tty,
+static void slcan_receive_buf(struct tty_struct *tty,
 			      const unsigned char *cp, char *fp, int count)
 {
 	struct slcan *sl = (struct slcan *) tty->disc_data;
-	int bytes = count;
 
 	if (!sl || sl->magic != SLCAN_MAGIC || !netif_running(sl->dev))
-		return -ENODEV;
+		return;
 
 	/* Read the characters out of the buffer */
-	while (bytes--) {
+	while (count--) {
 		if (fp && *fp++) {
 			if (!test_and_set_bit(SLF_ERROR, &sl->flags))
 				sl->dev->stats.rx_errors++;
@@ -444,8 +443,6 @@ static unsigned int slcan_receive_buf(struct tty_struct *tty,
 		}
 		slcan_unesc(sl, *cp++);
 	}
-
-	return count;
 }
 
 /************************************
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 992089639ea..3e5d0b6b651 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -456,7 +456,7 @@ out:
  * a block of 6pack data has been received, which can now be decapsulated
  * and sent on to some IP layer for further processing.
  */
-static unsigned int sixpack_receive_buf(struct tty_struct *tty,
+static void sixpack_receive_buf(struct tty_struct *tty,
 	const unsigned char *cp, char *fp, int count)
 {
 	struct sixpack *sp;
@@ -464,11 +464,11 @@ static unsigned int sixpack_receive_buf(struct tty_struct *tty,
 	int count1;
 
 	if (!count)
-		return 0;
+		return;
 
 	sp = sp_get(tty);
 	if (!sp)
-		return -ENODEV;
+		return;
 
 	memcpy(buf, cp, count < sizeof(buf) ? count : sizeof(buf));
 
@@ -487,8 +487,6 @@ static unsigned int sixpack_receive_buf(struct tty_struct *tty,
 
 	sp_put(sp);
 	tty_unthrottle(tty);
-
-	return count1;
 }
 
 /*
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index 0e4f2353114..4c628393c8b 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -923,14 +923,13 @@ static long mkiss_compat_ioctl(struct tty_struct *tty, struct file *file,
  * a block of data has been received, which can now be decapsulated
  * and sent on to the AX.25 layer for further processing.
  */
-static unsigned int mkiss_receive_buf(struct tty_struct *tty,
-		const unsigned char *cp, char *fp, int count)
+static void mkiss_receive_buf(struct tty_struct *tty, const unsigned char *cp,
+	char *fp, int count)
 {
 	struct mkiss *ax = mkiss_get(tty);
-	int bytes = count;
 
 	if (!ax)
-		return -ENODEV;
+		return;
 
 	/*
 	 * Argh! mtu change time! - costs us the packet part received
@@ -940,7 +939,7 @@ static unsigned int mkiss_receive_buf(struct tty_struct *tty,
 		ax_changedmtu(ax);
 
 	/* Read the characters out of the buffer */
-	while (bytes--) {
+	while (count--) {
 		if (fp != NULL && *fp++) {
 			if (!test_and_set_bit(AXF_ERROR, &ax->flags))
 				ax->dev->stats.rx_errors++;
@@ -953,8 +952,6 @@ static unsigned int mkiss_receive_buf(struct tty_struct *tty,
 
 	mkiss_put(ax);
 	tty_unthrottle(tty);
-
-	return count;
 }
 
 /*
diff --git a/drivers/net/irda/irtty-sir.c b/drivers/net/irda/irtty-sir.c
index 035861d8acb..3352b2443e5 100644
--- a/drivers/net/irda/irtty-sir.c
+++ b/drivers/net/irda/irtty-sir.c
@@ -216,23 +216,23 @@ static int irtty_do_write(struct sir_dev *dev, const unsigned char *ptr, size_t
  * usbserial:	urb-complete-interrupt / softint
  */
 
-static unsigned int irtty_receive_buf(struct tty_struct *tty,
-		const unsigned char *cp, char *fp, int count)
+static void irtty_receive_buf(struct tty_struct *tty, const unsigned char *cp,
+			      char *fp, int count) 
 {
 	struct sir_dev *dev;
 	struct sirtty_cb *priv = tty->disc_data;
 	int	i;
 
-	IRDA_ASSERT(priv != NULL, return -ENODEV;);
-	IRDA_ASSERT(priv->magic == IRTTY_MAGIC, return -EINVAL;);
+	IRDA_ASSERT(priv != NULL, return;);
+	IRDA_ASSERT(priv->magic == IRTTY_MAGIC, return;);
 
 	if (unlikely(count==0))		/* yes, this happens */
-		return 0;
+		return;
 
 	dev = priv->dev;
 	if (!dev) {
 		IRDA_WARNING("%s(), not ready yet!\n", __func__);
-		return -ENODEV;
+		return;
 	}
 
 	for (i = 0; i < count; i++) {
@@ -242,13 +242,11 @@ static unsigned int irtty_receive_buf(struct tty_struct *tty,
  		if (fp && *fp++) { 
 			IRDA_DEBUG(0, "Framing or parity error!\n");
 			sirdev_receive(dev, NULL, 0);	/* notify sir_dev (updating stats) */
-			return -EINVAL;
+			return;
  		}
 	}
 
 	sirdev_receive(dev, cp, count);
-
-	return count;
 }
 
 /*
diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c
index 53872d7d738..a1b82c9c67d 100644
--- a/drivers/net/ppp_async.c
+++ b/drivers/net/ppp_async.c
@@ -340,7 +340,7 @@ ppp_asynctty_poll(struct tty_struct *tty, struct file *file, poll_table *wait)
 }
 
 /* May sleep, don't call from interrupt level or with interrupts disabled */
-static unsigned int
+static void
 ppp_asynctty_receive(struct tty_struct *tty, const unsigned char *buf,
 		  char *cflags, int count)
 {
@@ -348,7 +348,7 @@ ppp_asynctty_receive(struct tty_struct *tty, const unsigned char *buf,
 	unsigned long flags;
 
 	if (!ap)
-		return -ENODEV;
+		return;
 	spin_lock_irqsave(&ap->recv_lock, flags);
 	ppp_async_input(ap, buf, cflags, count);
 	spin_unlock_irqrestore(&ap->recv_lock, flags);
@@ -356,8 +356,6 @@ ppp_asynctty_receive(struct tty_struct *tty, const unsigned char *buf,
 		tasklet_schedule(&ap->tsk);
 	ap_put(ap);
 	tty_unthrottle(tty);
-
-	return count;
 }
 
 static void
diff --git a/drivers/net/ppp_synctty.c b/drivers/net/ppp_synctty.c
index 0815790a5cf..2573f525f11 100644
--- a/drivers/net/ppp_synctty.c
+++ b/drivers/net/ppp_synctty.c
@@ -381,7 +381,7 @@ ppp_sync_poll(struct tty_struct *tty, struct file *file, poll_table *wait)
 }
 
 /* May sleep, don't call from interrupt level or with interrupts disabled */
-static unsigned int
+static void
 ppp_sync_receive(struct tty_struct *tty, const unsigned char *buf,
 		  char *cflags, int count)
 {
@@ -389,7 +389,7 @@ ppp_sync_receive(struct tty_struct *tty, const unsigned char *buf,
 	unsigned long flags;
 
 	if (!ap)
-		return -ENODEV;
+		return;
 	spin_lock_irqsave(&ap->recv_lock, flags);
 	ppp_sync_input(ap, buf, cflags, count);
 	spin_unlock_irqrestore(&ap->recv_lock, flags);
@@ -397,8 +397,6 @@ ppp_sync_receive(struct tty_struct *tty, const unsigned char *buf,
 		tasklet_schedule(&ap->tsk);
 	sp_put(ap);
 	tty_unthrottle(tty);
-
-	return count;
 }
 
 static void
diff --git a/drivers/net/slip.c b/drivers/net/slip.c
index 584809c656d..8ec1a9a0bb9 100644
--- a/drivers/net/slip.c
+++ b/drivers/net/slip.c
@@ -670,17 +670,16 @@ static void sl_setup(struct net_device *dev)
  * in parallel
  */
 
-static unsigned int slip_receive_buf(struct tty_struct *tty,
-		const unsigned char *cp, char *fp, int count)
+static void slip_receive_buf(struct tty_struct *tty, const unsigned char *cp,
+							char *fp, int count)
 {
 	struct slip *sl = tty->disc_data;
-	int bytes = count;
 
 	if (!sl || sl->magic != SLIP_MAGIC || !netif_running(sl->dev))
-		return -ENODEV;
+		return;
 
 	/* Read the characters out of the buffer */
-	while (bytes--) {
+	while (count--) {
 		if (fp && *fp++) {
 			if (!test_and_set_bit(SLF_ERROR, &sl->flags))
 				sl->dev->stats.rx_errors++;
@@ -694,8 +693,6 @@ static unsigned int slip_receive_buf(struct tty_struct *tty,
 #endif
 			slip_unesc(sl, *cp++);
 	}
-
-	return count;
 }
 
 /************************************
diff --git a/drivers/net/wan/x25_asy.c b/drivers/net/wan/x25_asy.c
index 40398bf7d03..24297b274cd 100644
--- a/drivers/net/wan/x25_asy.c
+++ b/drivers/net/wan/x25_asy.c
@@ -517,18 +517,17 @@ static int x25_asy_close(struct net_device *dev)
  * and sent on to some IP layer for further processing.
  */
 
-static unsigned int x25_asy_receive_buf(struct tty_struct *tty,
+static void x25_asy_receive_buf(struct tty_struct *tty,
 				const unsigned char *cp, char *fp, int count)
 {
 	struct x25_asy *sl = tty->disc_data;
-	int bytes = count;
 
 	if (!sl || sl->magic != X25_ASY_MAGIC || !netif_running(sl->dev))
 		return;
 
 
 	/* Read the characters out of the buffer */
-	while (bytes--) {
+	while (count--) {
 		if (fp && *fp++) {
 			if (!test_and_set_bit(SLF_ERROR, &sl->flags))
 				sl->dev->stats.rx_errors++;
@@ -537,8 +536,6 @@ static unsigned int x25_asy_receive_buf(struct tty_struct *tty,
 		}
 		x25_asy_unesc(sl, *cp++);
 	}
-
-	return count;
 }
 
 /*
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index a4c42a75a3b..09e8c7d53af 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -2128,8 +2128,8 @@ static void gsmld_detach_gsm(struct tty_struct *tty, struct gsm_mux *gsm)
 	gsm->tty = NULL;
 }
 
-static unsigned int gsmld_receive_buf(struct tty_struct *tty,
-		const unsigned char *cp, char *fp, int count)
+static void gsmld_receive_buf(struct tty_struct *tty, const unsigned char *cp,
+			      char *fp, int count)
 {
 	struct gsm_mux *gsm = tty->disc_data;
 	const unsigned char *dp;
@@ -2162,8 +2162,6 @@ static unsigned int gsmld_receive_buf(struct tty_struct *tty,
 	}
 	/* FASYNC if needed ? */
 	/* If clogged call tty_throttle(tty); */
-
-	return count;
 }
 
 /**
diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c
index cac666314ae..cea56033b34 100644
--- a/drivers/tty/n_hdlc.c
+++ b/drivers/tty/n_hdlc.c
@@ -188,8 +188,8 @@ static unsigned int n_hdlc_tty_poll(struct tty_struct *tty, struct file *filp,
 				    poll_table *wait);
 static int n_hdlc_tty_open(struct tty_struct *tty);
 static void n_hdlc_tty_close(struct tty_struct *tty);
-static unsigned int n_hdlc_tty_receive(struct tty_struct *tty,
-		const __u8 *cp, char *fp, int count);
+static void n_hdlc_tty_receive(struct tty_struct *tty, const __u8 *cp,
+			       char *fp, int count);
 static void n_hdlc_tty_wakeup(struct tty_struct *tty);
 
 #define bset(p,b)	((p)[(b) >> 5] |= (1 << ((b) & 0x1f)))
@@ -509,8 +509,8 @@ static void n_hdlc_tty_wakeup(struct tty_struct *tty)
  * Called by tty low level driver when receive data is available. Data is
  * interpreted as one HDLC frame.
  */
-static unsigned int n_hdlc_tty_receive(struct tty_struct *tty,
-		const __u8 *data, char *flags, int count)
+static void n_hdlc_tty_receive(struct tty_struct *tty, const __u8 *data,
+			       char *flags, int count)
 {
 	register struct n_hdlc *n_hdlc = tty2n_hdlc (tty);
 	register struct n_hdlc_buf *buf;
@@ -521,20 +521,20 @@ static unsigned int n_hdlc_tty_receive(struct tty_struct *tty,
 		
 	/* This can happen if stuff comes in on the backup tty */
 	if (!n_hdlc || tty != n_hdlc->tty)
-		return -ENODEV;
+		return;
 		
 	/* verify line is using HDLC discipline */
 	if (n_hdlc->magic != HDLC_MAGIC) {
 		printk("%s(%d) line not using HDLC discipline\n",
 			__FILE__,__LINE__);
-		return -EINVAL;
+		return;
 	}
 	
 	if ( count>maxframe ) {
 		if (debuglevel >= DEBUG_LEVEL_INFO)	
 			printk("%s(%d) rx count>maxframesize, data discarded\n",
 			       __FILE__,__LINE__);
-		return -EINVAL;
+		return;
 	}
 
 	/* get a free HDLC buffer */	
@@ -550,7 +550,7 @@ static unsigned int n_hdlc_tty_receive(struct tty_struct *tty,
 		if (debuglevel >= DEBUG_LEVEL_INFO)	
 			printk("%s(%d) no more rx buffers, data discarded\n",
 			       __FILE__,__LINE__);
-		return -EINVAL;
+		return;
 	}
 		
 	/* copy received data to HDLC buffer */
@@ -565,8 +565,6 @@ static unsigned int n_hdlc_tty_receive(struct tty_struct *tty,
 	if (n_hdlc->tty->fasync != NULL)
 		kill_fasync (&n_hdlc->tty->fasync, SIGIO, POLL_IN);
 
-	return count;
-
 }	/* end of n_hdlc_tty_receive() */
 
 /**
diff --git a/drivers/tty/n_r3964.c b/drivers/tty/n_r3964.c
index a4bc39c21a4..5c6c31459a2 100644
--- a/drivers/tty/n_r3964.c
+++ b/drivers/tty/n_r3964.c
@@ -139,8 +139,8 @@ static int r3964_ioctl(struct tty_struct *tty, struct file *file,
 static void r3964_set_termios(struct tty_struct *tty, struct ktermios *old);
 static unsigned int r3964_poll(struct tty_struct *tty, struct file *file,
 		struct poll_table_struct *wait);
-static unsigned int r3964_receive_buf(struct tty_struct *tty,
-		const unsigned char *cp, char *fp, int count);
+static void r3964_receive_buf(struct tty_struct *tty, const unsigned char *cp,
+		char *fp, int count);
 
 static struct tty_ldisc_ops tty_ldisc_N_R3964 = {
 	.owner = THIS_MODULE,
@@ -1239,8 +1239,8 @@ static unsigned int r3964_poll(struct tty_struct *tty, struct file *file,
 	return result;
 }
 
-static unsigned int r3964_receive_buf(struct tty_struct *tty,
-		const unsigned char *cp, char *fp, int count)
+static void r3964_receive_buf(struct tty_struct *tty, const unsigned char *cp,
+			char *fp, int count)
 {
 	struct r3964_info *pInfo = tty->disc_data;
 	const unsigned char *p;
@@ -1257,8 +1257,6 @@ static unsigned int r3964_receive_buf(struct tty_struct *tty,
 		}
 
 	}
-
-	return count;
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 95d0a9c2dd1..0ad32888091 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -81,6 +81,38 @@ static inline int tty_put_user(struct tty_struct *tty, unsigned char x,
 	return put_user(x, ptr);
 }
 
+/**
+ *	n_tty_set__room	-	receive space
+ *	@tty: terminal
+ *
+ *	Called by the driver to find out how much data it is
+ *	permitted to feed to the line discipline without any being lost
+ *	and thus to manage flow control. Not serialized. Answers for the
+ *	"instant".
+ */
+
+static void n_tty_set_room(struct tty_struct *tty)
+{
+	/* tty->read_cnt is not read locked ? */
+	int	left = N_TTY_BUF_SIZE - tty->read_cnt - 1;
+	int old_left;
+
+	/*
+	 * If we are doing input canonicalization, and there are no
+	 * pending newlines, let characters through without limit, so
+	 * that erase characters will be handled.  Other excess
+	 * characters will be beeped.
+	 */
+	if (left <= 0)
+		left = tty->icanon && !tty->canon_data;
+	old_left = tty->receive_room;
+	tty->receive_room = left;
+
+	/* Did this open up the receive buffer? We may need to flip */
+	if (left && !old_left)
+		schedule_work(&tty->buf.work);
+}
+
 static void put_tty_queue_nolock(unsigned char c, struct tty_struct *tty)
 {
 	if (tty->read_cnt < N_TTY_BUF_SIZE) {
@@ -152,6 +184,7 @@ static void reset_buffer_flags(struct tty_struct *tty)
 
 	tty->canon_head = tty->canon_data = tty->erasing = 0;
 	memset(&tty->read_flags, 0, sizeof tty->read_flags);
+	n_tty_set_room(tty);
 	check_unthrottle(tty);
 }
 
@@ -1327,19 +1360,17 @@ static void n_tty_write_wakeup(struct tty_struct *tty)
  *	calls one at a time and in order (or using flush_to_ldisc)
  */
 
-static unsigned int n_tty_receive_buf(struct tty_struct *tty,
-		const unsigned char *cp, char *fp, int count)
+static void n_tty_receive_buf(struct tty_struct *tty, const unsigned char *cp,
+			      char *fp, int count)
 {
 	const unsigned char *p;
 	char *f, flags = TTY_NORMAL;
 	int	i;
 	char	buf[64];
 	unsigned long cpuflags;
-	int left;
-	int ret = 0;
 
 	if (!tty->read_buf)
-		return 0;
+		return;
 
 	if (tty->real_raw) {
 		spin_lock_irqsave(&tty->read_lock, cpuflags);
@@ -1349,7 +1380,6 @@ static unsigned int n_tty_receive_buf(struct tty_struct *tty,
 		memcpy(tty->read_buf + tty->read_head, cp, i);
 		tty->read_head = (tty->read_head + i) & (N_TTY_BUF_SIZE-1);
 		tty->read_cnt += i;
-		ret += i;
 		cp += i;
 		count -= i;
 
@@ -1359,10 +1389,8 @@ static unsigned int n_tty_receive_buf(struct tty_struct *tty,
 		memcpy(tty->read_buf + tty->read_head, cp, i);
 		tty->read_head = (tty->read_head + i) & (N_TTY_BUF_SIZE-1);
 		tty->read_cnt += i;
-		ret += i;
 		spin_unlock_irqrestore(&tty->read_lock, cpuflags);
 	} else {
-		ret = count;
 		for (i = count, p = cp, f = fp; i; i--, p++) {
 			if (f)
 				flags = *f++;
@@ -1390,6 +1418,8 @@ static unsigned int n_tty_receive_buf(struct tty_struct *tty,
 			tty->ops->flush_chars(tty);
 	}
 
+	n_tty_set_room(tty);
+
 	if ((!tty->icanon && (tty->read_cnt >= tty->minimum_to_wake)) ||
 		L_EXTPROC(tty)) {
 		kill_fasync(&tty->fasync, SIGIO, POLL_IN);
@@ -1402,12 +1432,8 @@ static unsigned int n_tty_receive_buf(struct tty_struct *tty,
 	 * mode.  We don't want to throttle the driver if we're in
 	 * canonical mode and don't have a newline yet!
 	 */
-	left = N_TTY_BUF_SIZE - tty->read_cnt - 1;
-
-	if (left < TTY_THRESHOLD_THROTTLE)
+	if (tty->receive_room < TTY_THRESHOLD_THROTTLE)
 		tty_throttle(tty);
-
-	return ret;
 }
 
 int is_ignored(int sig)
@@ -1451,6 +1477,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
 	if (test_bit(TTY_HW_COOK_IN, &tty->flags)) {
 		tty->raw = 1;
 		tty->real_raw = 1;
+		n_tty_set_room(tty);
 		return;
 	}
 	if (I_ISTRIP(tty) || I_IUCLC(tty) || I_IGNCR(tty) ||
@@ -1503,6 +1530,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
 		else
 			tty->real_raw = 0;
 	}
+	n_tty_set_room(tty);
 	/* The termios change make the tty ready for I/O */
 	wake_up_interruptible(&tty->write_wait);
 	wake_up_interruptible(&tty->read_wait);
@@ -1784,6 +1812,8 @@ do_it_again:
 				retval = -ERESTARTSYS;
 				break;
 			}
+			/* FIXME: does n_tty_set_room need locking ? */
+			n_tty_set_room(tty);
 			timeout = schedule_timeout(timeout);
 			continue;
 		}
@@ -1855,8 +1885,10 @@ do_it_again:
 		 * longer than TTY_THRESHOLD_UNTHROTTLE in canonical mode,
 		 * we won't get any more characters.
 		 */
-		if (n_tty_chars_in_buffer(tty) <= TTY_THRESHOLD_UNTHROTTLE)
+		if (n_tty_chars_in_buffer(tty) <= TTY_THRESHOLD_UNTHROTTLE) {
+			n_tty_set_room(tty);
 			check_unthrottle(tty);
+		}
 
 		if (b - buf >= minimum)
 			break;
@@ -1878,6 +1910,7 @@ do_it_again:
 	} else if (test_and_clear_bit(TTY_PUSH, &tty->flags))
 		 goto do_it_again;
 
+	n_tty_set_room(tty);
 	return retval;
 }
 
diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
index 46de2e075da..f1a7918d71a 100644
--- a/drivers/tty/tty_buffer.c
+++ b/drivers/tty/tty_buffer.c
@@ -416,7 +416,6 @@ static void flush_to_ldisc(struct work_struct *work)
 		struct tty_buffer *head, *tail = tty->buf.tail;
 		int seen_tail = 0;
 		while ((head = tty->buf.head) != NULL) {
-			int copied;
 			int count;
 			char *char_buf;
 			unsigned char *flag_buf;
@@ -443,19 +442,17 @@ static void flush_to_ldisc(struct work_struct *work)
 			   line discipline as we want to empty the queue */
 			if (test_bit(TTY_FLUSHPENDING, &tty->flags))
 				break;
+			if (!tty->receive_room || seen_tail)
+				break;
+			if (count > tty->receive_room)
+				count = tty->receive_room;
 			char_buf = head->char_buf_ptr + head->read;
 			flag_buf = head->flag_buf_ptr + head->read;
+			head->read += count;
 			spin_unlock_irqrestore(&tty->buf.lock, flags);
-			copied = disc->ops->receive_buf(tty, char_buf,
+			disc->ops->receive_buf(tty, char_buf,
 							flag_buf, count);
 			spin_lock_irqsave(&tty->buf.lock, flags);
-
-			head->read += copied;
-
-			if (copied == 0 || seen_tail) {
-				schedule_work(&tty->buf.work);
-				break;
-			}
 		}
 		clear_bit(TTY_FLUSHING, &tty->flags);
 	}
diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c
index 67b1d0d7c8a..fb864e7fcd1 100644
--- a/drivers/tty/vt/selection.c
+++ b/drivers/tty/vt/selection.c
@@ -332,7 +332,8 @@ int paste_selection(struct tty_struct *tty)
 			continue;
 		}
 		count = sel_buffer_lth - pasted;
-		count = tty->ldisc->ops->receive_buf(tty, sel_buffer + pasted,
+		count = min(count, tty->receive_room);
+		tty->ldisc->ops->receive_buf(tty, sel_buffer + pasted,
 								NULL, count);
 		pasted += count;
 	}
diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index 5b07792ccb4..ff7dc08696a 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -76,7 +76,7 @@
  * 	tty device.  It is solely the responsibility of the line
  * 	discipline to handle poll requests.
  *
- * unsigned int (*receive_buf)(struct tty_struct *, const unsigned char *cp,
+ * void	(*receive_buf)(struct tty_struct *, const unsigned char *cp,
  * 		       char *fp, int count);
  *
  * 	This function is called by the low-level tty driver to send
@@ -84,8 +84,7 @@
  * 	processing.  <cp> is a pointer to the buffer of input
  * 	character received by the device.  <fp> is a pointer to a
  * 	pointer of flag bytes which indicate whether a character was
- * 	received with a parity error, etc. Returns the amount of bytes
- * 	received.
+ * 	received with a parity error, etc.
  * 
  * void	(*write_wakeup)(struct tty_struct *);
  *
@@ -141,8 +140,8 @@ struct tty_ldisc_ops {
 	/*
 	 * The following routines are called from below.
 	 */
-	unsigned int (*receive_buf)(struct tty_struct *,
-			const unsigned char *cp, char *fp, int count);
+	void	(*receive_buf)(struct tty_struct *, const unsigned char *cp,
+			       char *fp, int count);
 	void	(*write_wakeup)(struct tty_struct *);
 	void	(*dcd_change)(struct tty_struct *, unsigned int,
 				struct pps_event_time *);
-- 
cgit v1.2.3-70-g09d2


From bb3d6bf1919a20c56b3257b4ec09e67a9226cfb2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 4 Jun 2011 07:00:50 +0900
Subject: Revert "ASoC: Update cx20442 for TTY API change"

This reverts commit ed0bd2333cffc3d856db9beb829543c1dfc00982.

Since we reverted the TTY API change, we should revert the ASoC update
to it too.

Cc: Mark Brown <broonie@opensource.wolfsonmicro.com>
Cc: Liam Girdwood <lrg@ti.com>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 sound/soc/codecs/cx20442.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/sound/soc/codecs/cx20442.c b/sound/soc/codecs/cx20442.c
index f8c663dcff0..d68ea532cc7 100644
--- a/sound/soc/codecs/cx20442.c
+++ b/sound/soc/codecs/cx20442.c
@@ -262,14 +262,14 @@ static int v253_hangup(struct tty_struct *tty)
 }
 
 /* Line discipline .receive_buf() */
-static unsigned int v253_receive(struct tty_struct *tty,
-				 const unsigned char *cp, char *fp, int count)
+static void v253_receive(struct tty_struct *tty,
+				const unsigned char *cp, char *fp, int count)
 {
 	struct snd_soc_codec *codec = tty->disc_data;
 	struct cx20442_priv *cx20442;
 
 	if (!codec)
-		return count;
+		return;
 
 	cx20442 = snd_soc_codec_get_drvdata(codec);
 
@@ -281,8 +281,6 @@ static unsigned int v253_receive(struct tty_struct *tty,
 		codec->hw_write = (hw_write_t)tty->ops->write;
 		codec->card->pop_time = 1;
 	}
-
-	return count;
 }
 
 /* Line discipline .write_wakeup() */
-- 
cgit v1.2.3-70-g09d2


From 1bc8779349d6278e2713a1ff94418c2a6746a791 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Sat, 28 May 2011 21:57:55 +0200
Subject: btrfs: scrub: don't reuse bios and pages

The current scrub implementation reuses bios and pages as often as possible,
allocating them only on start and releasing them when finished. This leads
to more problems with the block layer than it's worth. The elevator gets
confused when there are more pages added to the bio than bi_size suggests.
This patch completely rips out the reuse of bios and pages and allocates
them freshly for each submit.

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Chris Maosn <chris.mason@oracle.com>
---
 fs/btrfs/scrub.c | 114 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 65 insertions(+), 49 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6dfed0c27ac..2d1f8909a8e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -117,33 +117,37 @@ static void scrub_free_csums(struct scrub_dev *sdev)
 	}
 }
 
+static void scrub_free_bio(struct bio *bio)
+{
+	int i;
+	struct page *last_page = NULL;
+
+	if (!bio)
+		return;
+
+	for (i = 0; i < bio->bi_vcnt; ++i) {
+		if (bio->bi_io_vec[i].bv_page == last_page)
+			continue;
+		last_page = bio->bi_io_vec[i].bv_page;
+		__free_page(last_page);
+	}
+	bio_put(bio);
+}
+
 static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
 {
 	int i;
-	int j;
-	struct page *last_page;
 
 	if (!sdev)
 		return;
 
 	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
 		struct scrub_bio *sbio = sdev->bios[i];
-		struct bio *bio;
 
 		if (!sbio)
 			break;
 
-		bio = sbio->bio;
-		if (bio) {
-			last_page = NULL;
-			for (j = 0; j < bio->bi_vcnt; ++j) {
-				if (bio->bi_io_vec[j].bv_page == last_page)
-					continue;
-				last_page = bio->bi_io_vec[j].bv_page;
-				__free_page(last_page);
-			}
-			bio_put(bio);
-		}
+		scrub_free_bio(sbio->bio);
 		kfree(sbio);
 	}
 
@@ -156,8 +160,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
 {
 	struct scrub_dev *sdev;
 	int		i;
-	int		j;
-	int		ret;
 	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
 
 	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
@@ -165,7 +167,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
 		goto nomem;
 	sdev->dev = dev;
 	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
-		struct bio *bio;
 		struct scrub_bio *sbio;
 
 		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
@@ -173,32 +174,10 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
 			goto nomem;
 		sdev->bios[i] = sbio;
 
-		bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
-		if (!bio)
-			goto nomem;
-
 		sbio->index = i;
 		sbio->sdev = sdev;
-		sbio->bio = bio;
 		sbio->count = 0;
 		sbio->work.func = scrub_checksum;
-		bio->bi_private = sdev->bios[i];
-		bio->bi_end_io = scrub_bio_end_io;
-		bio->bi_sector = 0;
-		bio->bi_bdev = dev->bdev;
-		bio->bi_size = 0;
-
-		for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) {
-			struct page *page;
-			page = alloc_page(GFP_NOFS);
-			if (!page)
-				goto nomem;
-
-			ret = bio_add_page(bio, page, PAGE_SIZE, 0);
-			if (!ret)
-				goto nomem;
-		}
-		WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO);
 
 		if (i != SCRUB_BIOS_PER_DEV-1)
 			sdev->bios[i]->next_free = i + 1;
@@ -394,6 +373,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
 	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 
 	sbio->err = err;
+	sbio->bio = bio;
 
 	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
 }
@@ -453,6 +433,8 @@ static void scrub_checksum(struct btrfs_work *work)
 	}
 
 out:
+	scrub_free_bio(sbio->bio);
+	sbio->bio = NULL;
 	spin_lock(&sdev->list_lock);
 	sbio->next_free = sdev->first_free;
 	sdev->first_free = sbio->index;
@@ -583,25 +565,50 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
 static int scrub_submit(struct scrub_dev *sdev)
 {
 	struct scrub_bio *sbio;
+	struct bio *bio;
+	int i;
 
 	if (sdev->curr == -1)
 		return 0;
 
 	sbio = sdev->bios[sdev->curr];
 
-	sbio->bio->bi_sector = sbio->physical >> 9;
-	sbio->bio->bi_size = sbio->count * PAGE_SIZE;
-	sbio->bio->bi_next = NULL;
-	sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
-	sbio->bio->bi_comp_cpu = -1;
-	sbio->bio->bi_bdev = sdev->dev->bdev;
+	bio = bio_alloc(GFP_NOFS, sbio->count);
+	if (!bio)
+		goto nomem;
+
+	bio->bi_private = sbio;
+	bio->bi_end_io = scrub_bio_end_io;
+	bio->bi_bdev = sdev->dev->bdev;
+	bio->bi_sector = sbio->physical >> 9;
+
+	for (i = 0; i < sbio->count; ++i) {
+		struct page *page;
+		int ret;
+
+		page = alloc_page(GFP_NOFS);
+		if (!page)
+			goto nomem;
+
+		ret = bio_add_page(bio, page, PAGE_SIZE, 0);
+		if (!ret) {
+			__free_page(page);
+			goto nomem;
+		}
+	}
+
 	sbio->err = 0;
 	sdev->curr = -1;
 	atomic_inc(&sdev->in_flight);
 
-	submit_bio(0, sbio->bio);
+	submit_bio(READ, bio);
 
 	return 0;
+
+nomem:
+	scrub_free_bio(bio);
+
+	return -ENOMEM;
 }
 
 static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
@@ -633,7 +640,11 @@ again:
 		sbio->logical = logical;
 	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
 		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
-		scrub_submit(sdev);
+		int ret;
+
+		ret = scrub_submit(sdev);
+		if (ret)
+			return ret;
 		goto again;
 	}
 	sbio->spag[sbio->count].flags = flags;
@@ -645,8 +656,13 @@ again:
 		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
 	}
 	++sbio->count;
-	if (sbio->count == SCRUB_PAGES_PER_BIO || force)
-		scrub_submit(sdev);
+	if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
+		int ret;
+
+		ret = scrub_submit(sdev);
+		if (ret)
+			return ret;
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 17aca1c987cff89dc4279371857035da902c8854 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 3 Jun 2011 01:13:45 -0400
Subject: Btrfs: fix uninit variable in the delayed inode code

The nitems counter needs to start at zero

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/delayed-inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b46d94d1dea..c61c32cf0f7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -678,6 +678,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
 	INIT_LIST_HEAD(&head);
 
 	next = item;
+	nitems = 0;
 
 	/*
 	 * count the number of the continuous items that we can insert in batch
-- 
cgit v1.2.3-70-g09d2


From 211f96c24f117fcc6e9e2431e40d92f4de22625e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 3 Jun 2011 01:26:53 -0400
Subject: Btrfs: make sure we don't overflow the free space cache crc page

The free space cache uses only one page for crcs right now,
which means we can't have a cache file bigger than the
crcs we can fit in the first page.  This adds a check to
enforce that restriction.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index dd38d4c3a59..1cb72394498 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -590,10 +590,25 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 	num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
 		PAGE_CACHE_SHIFT;
+
+	/* Since the first page has all of our checksums and our generation we
+	 * need to calculate the offset into the page that we can start writing
+	 * our entries.
+	 */
+	first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
+
 	filemap_write_and_wait(inode->i_mapping);
 	btrfs_wait_ordered_range(inode, inode->i_size &
 				 ~(root->sectorsize - 1), (u64)-1);
 
+	/* make sure we don't overflow that first page */
+	if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) {
+		/* this is really the same as running out of space, where we also return 0 */
+		printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n");
+		ret = 0;
+		goto out_update;
+	}
+
 	/* We need a checksum per page. */
 	crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
 	if (!crc)
@@ -605,12 +620,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 		return -1;
 	}
 
-	/* Since the first page has all of our checksums and our generation we
-	 * need to calculate the offset into the page that we can start writing
-	 * our entries.
-	 */
-	first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
-
 	/* Get the cluster for this block_group if it exists */
 	if (block_group && !list_empty(&block_group->cluster_list))
 		cluster = list_entry(block_group->cluster_list.next,
@@ -872,12 +881,14 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	ret = 1;
 
 out_free:
+	kfree(checksums);
+	kfree(pages);
+
+out_update:
 	if (ret != 1) {
 		invalidate_inode_pages2_range(inode->i_mapping, 0, index);
 		BTRFS_I(inode)->generation = 0;
 	}
-	kfree(checksums);
-	kfree(pages);
 	btrfs_update_inode(trans, root, inode);
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From ca456ae280c0646e1e571c3b9a3834c55e90adfe Mon Sep 17 00:00:00 2001
From: liubo <liubo2009@cn.fujitsu.com>
Date: Wed, 1 Jun 2011 09:42:49 +0000
Subject: Btrfs: don't save the inode cache in non-FS roots

This adds extra checks to make sure the inode map we are caching really
belongs to a FS root instead of a special relocation tree.  It
prevents crashes during balancing operations.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode-map.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 3262cd17a12..04f7199facb 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -388,6 +388,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 	int prealloc;
 	bool retry = false;
 
+	/* only fs tree and subvol/snap needs ino cache */
+	if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
+	    (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
+	     root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
+		return 0;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-- 
cgit v1.2.3-70-g09d2


From 5f3f302a6f4cb74906c05fad1d03fc5e95c7e5af Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Mon, 30 May 2011 08:36:16 +0000
Subject: btrfs: false BUG_ON when degraded

In degraded mode the struct btrfs_device of missing devs don't have
device->name set. A kstrdup of NULL correctly returns NULL. Don't
BUG in this case.

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c48214ef5c0..da541dfca2e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -504,7 +504,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		BUG_ON(!new_device);
 		memcpy(new_device, device, sizeof(*new_device));
 		new_device->name = kstrdup(device->name, GFP_NOFS);
-		BUG_ON(!new_device->name);
+		BUG_ON(device->name && !new_device->name);
 		new_device->bdev = NULL;
 		new_device->writeable = 0;
 		new_device->in_fs_metadata = 0;
-- 
cgit v1.2.3-70-g09d2


From d132a538d258f8f52fd0cd8b5017755f4e915386 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 31 May 2011 19:33:33 +0000
Subject: Btrfs: don't save the inode cache if we are deleting this root

With xfstest 254 I can panic the box every time with the inode number caching
stuff on.  This is because we clean the inodes out when we delete the subvolume,
but then we write out the inode cache which adds an inode to the subvolume inode
tree, and then when it gets evicted again the root gets added back on the dead
roots list and is deleted again, so we have a double free.  To stop this from
happening just return 0 if refs is 0 (and we're not the tree root since tree
root always has refs of 0).  With this fix 254 no longer panics.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Tested-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode-map.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 04f7199facb..2d0d50067a7 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -394,6 +394,11 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 	     root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
 		return 0;
 
+	/* Don't save inode cache if we are deleting this root */
+	if (btrfs_root_refs(&root->root_item) == 0 &&
+	    root != root->fs_info->tree_root)
+		return 0;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-- 
cgit v1.2.3-70-g09d2


From a4689d2bd3b00dcf5c4320f06e0ab88810fbff9c Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Tue, 31 May 2011 17:08:14 +0000
Subject: btrfs: use btrfs_ino to access inode number

commit 4cb5300bc ("Btrfs: add mount -o auto_defrag") accesses inode
number directly while it should use the helper with the new inode
number allocator.

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c  | 2 +-
 fs/btrfs/ioctl.c | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e3a1b0c2394..982b5ea9762 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -144,7 +144,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 	if (!defrag)
 		return -ENOMEM;
 
-	defrag->ino = inode->i_ino;
+	defrag->ino = btrfs_ino(inode);
 	defrag->transid = transid;
 	defrag->root = root->root_key.objectid;
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 74c80595d70..ac37040e426 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -706,16 +706,17 @@ static int find_new_extents(struct btrfs_root *root,
 	struct btrfs_file_extent_item *extent;
 	int type;
 	int ret;
+	u64 ino = btrfs_ino(inode);
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	min_key.objectid = inode->i_ino;
+	min_key.objectid = ino;
 	min_key.type = BTRFS_EXTENT_DATA_KEY;
 	min_key.offset = *off;
 
-	max_key.objectid = inode->i_ino;
+	max_key.objectid = ino;
 	max_key.type = (u8)-1;
 	max_key.offset = (u64)-1;
 
@@ -726,7 +727,7 @@ static int find_new_extents(struct btrfs_root *root,
 					   path, 0, newer_than);
 		if (ret != 0)
 			goto none;
-		if (min_key.objectid != inode->i_ino)
+		if (min_key.objectid != ino)
 			goto none;
 		if (min_key.type != BTRFS_EXTENT_DATA_KEY)
 			goto none;
-- 
cgit v1.2.3-70-g09d2


From e7786c3ae517b2c433edc91714e86be770e9f1ce Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Sat, 28 May 2011 20:58:38 +0000
Subject: btrfs: scrub: add explicit plugging

With the removal of the implicit plugging scrub ends up doing more and
smaller I/O than necessary. This patch adds explicit plugging per chunk.

Signed-off-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/scrub.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2d1f8909a8e..1204eab9402 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -348,9 +348,6 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
 	int ret;
 	DECLARE_COMPLETION_ONSTACK(complete);
 
-	/* we are going to wait on this IO */
-	rw |= REQ_SYNC;
-
 	bio = bio_alloc(GFP_NOFS, 1);
 	bio->bi_bdev = bdev;
 	bio->bi_sector = sector;
@@ -359,6 +356,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
 	bio->bi_private = &complete;
 	submit_bio(rw, bio);
 
+	/* this will also unplug the queue */
 	wait_for_completion(&complete);
 
 	ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -743,6 +741,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 	struct btrfs_root *root = fs_info->extent_root;
 	struct btrfs_root *csum_root = fs_info->csum_root;
 	struct btrfs_extent_item *extent;
+	struct blk_plug plug;
 	u64 flags;
 	int ret;
 	int slot;
@@ -847,6 +846,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 	 * the scrub. This might currently (crc32) end up to be about 1MB
 	 */
 	start_stripe = 0;
+	blk_start_plug(&plug);
 again:
 	logical = base + offset + start_stripe * increment;
 	for (i = start_stripe; i < nstripes; ++i) {
@@ -988,6 +988,7 @@ next:
 	scrub_submit(sdev);
 
 out:
+	blk_finish_plug(&plug);
 	btrfs_free_path(path);
 	return ret < 0 ? ret : 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 4b9465cb9e3859186eefa1ca3b990a5849386320 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 3 Jun 2011 09:36:29 -0400
Subject: Btrfs: add mount -o inode_cache

This makes the inode map cache default to off until we
fix the overflow problem when the free space crcs don't fit
inside a single page.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h            |  1 +
 fs/btrfs/free-space-cache.c |  6 ++++++
 fs/btrfs/inode-map.c        | 20 ++++++++++++++++++++
 fs/btrfs/super.c            |  8 +++++++-
 4 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8f98c200571..4958ef5417d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1340,6 +1340,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
 #define BTRFS_MOUNT_ENOSPC_DEBUG	 (1 << 15)
 #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
+#define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 1cb72394498..bffa5c4a633 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2536,6 +2536,9 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 	int ret = 0;
 	u64 root_gen = btrfs_root_generation(&root->root_item);
 
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return 0;
+
 	/*
 	 * If we're unmounting then just return, since this does a search on the
 	 * normal root and not the commit root and we could deadlock.
@@ -2575,6 +2578,9 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
 	struct inode *inode;
 	int ret;
 
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return 0;
+
 	inode = lookup_free_ino_inode(root, path);
 	if (IS_ERR(inode))
 		return 0;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2d0d50067a7..cb79b8975c9 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -38,6 +38,9 @@ static int caching_kthread(void *data)
 	int slot;
 	int ret;
 
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return 0;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -141,6 +144,9 @@ static void start_caching(struct btrfs_root *root)
 	int ret;
 	u64 objectid;
 
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return;
+
 	spin_lock(&root->cache_lock);
 	if (root->cached != BTRFS_CACHE_NO) {
 		spin_unlock(&root->cache_lock);
@@ -178,6 +184,9 @@ static void start_caching(struct btrfs_root *root)
 
 int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
 {
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return btrfs_find_free_objectid(root, objectid);
+
 again:
 	*objectid = btrfs_find_ino_for_alloc(root);
 
@@ -201,6 +210,10 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
 {
 	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
 	struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return;
+
 again:
 	if (root->cached == BTRFS_CACHE_FINISHED) {
 		__btrfs_add_free_space(ctl, objectid, 1);
@@ -250,6 +263,9 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
 	struct rb_node *n;
 	u64 count;
 
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return;
+
 	while (1) {
 		n = rb_first(rbroot);
 		if (!n)
@@ -399,9 +415,13 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 	    root != root->fs_info->tree_root)
 		return 0;
 
+	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+		return 0;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+
 again:
 	inode = lookup_free_ino_inode(root, path);
 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 28e3cb2607f..3559d0b3518 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -160,7 +160,8 @@ enum {
 	Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
 	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
 	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err,
+	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
+	Opt_inode_cache, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -192,6 +193,7 @@ static match_table_t tokens = {
 	{Opt_enospc_debug, "enospc_debug"},
 	{Opt_subvolrootid, "subvolrootid=%d"},
 	{Opt_defrag, "autodefrag"},
+	{Opt_inode_cache, "inode_cache"},
 	{Opt_err, NULL},
 };
 
@@ -360,6 +362,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			printk(KERN_INFO "btrfs: enabling disk space caching\n");
 			btrfs_set_opt(info->mount_opt, SPACE_CACHE);
 			break;
+		case Opt_inode_cache:
+			printk(KERN_INFO "btrfs: enabling inode map caching\n");
+			btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
+			break;
 		case Opt_clear_cache:
 			printk(KERN_INFO "btrfs: force clearing of disk cache\n");
 			btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
-- 
cgit v1.2.3-70-g09d2


From 7841cb2898f66a73062c64d0ef5733dde7279e46 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Tue, 31 May 2011 18:07:27 +0200
Subject: btrfs: add helper for fs_info->closing

wrap checking of filesystem 'closing' flag and fix a few missing memory
barriers.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/ctree.h            |  9 +++++++++
 fs/btrfs/extent-tree.c      |  3 +--
 fs/btrfs/file.c             |  4 ++--
 fs/btrfs/free-space-cache.c | 10 ++++------
 fs/btrfs/inode-map.c        |  3 +--
 fs/btrfs/inode.c            |  3 +--
 fs/btrfs/scrub.c            |  2 +-
 fs/btrfs/transaction.c      |  2 +-
 8 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4958ef5417d..8490ee06370 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2354,6 +2354,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct extent_buffer *node,
 			struct extent_buffer *parent);
+static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+{
+	/*
+	 * Get synced with close_ctree()
+	 */
+	smp_mb();
+	return fs_info->closing;
+}
+
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
 			struct btrfs_path *path,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c9173a7827b..5b9b6b6df24 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -366,8 +366,7 @@ again:
 	nritems = btrfs_header_nritems(leaf);
 
 	while (1) {
-		smp_mb();
-		if (fs_info->closing > 1) {
+		if (btrfs_fs_closing(fs_info) > 1) {
 			last = (u64)-1;
 			break;
 		}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 982b5ea9762..fa4ef18b66b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -129,7 +129,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 	if (!btrfs_test_opt(root, AUTO_DEFRAG))
 		return 0;
 
-	if (root->fs_info->closing)
+	if (btrfs_fs_closing(root->fs_info))
 		return 0;
 
 	if (BTRFS_I(inode)->in_defrag)
@@ -229,7 +229,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 		first_ino = defrag->ino + 1;
 		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
 
-		if (fs_info->closing)
+		if (btrfs_fs_closing(fs_info))
 			goto next_free;
 
 		spin_unlock(&fs_info->defrag_inodes_lock);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index bffa5c4a633..ad144736a5f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -98,7 +98,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
 		return inode;
 
 	spin_lock(&block_group->lock);
-	if (!root->fs_info->closing) {
+	if (!btrfs_fs_closing(root->fs_info)) {
 		block_group->inode = igrab(inode);
 		block_group->iref = 1;
 	}
@@ -493,8 +493,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
 	 * If we're unmounting then just return, since this does a search on the
 	 * normal root and not the commit root and we could deadlock.
 	 */
-	smp_mb();
-	if (fs_info->closing)
+	if (btrfs_fs_closing(fs_info))
 		return 0;
 
 	/*
@@ -2513,7 +2512,7 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
 		return inode;
 
 	spin_lock(&root->cache_lock);
-	if (!root->fs_info->closing)
+	if (!btrfs_fs_closing(root->fs_info))
 		root->cache_inode = igrab(inode);
 	spin_unlock(&root->cache_lock);
 
@@ -2543,8 +2542,7 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 	 * If we're unmounting then just return, since this does a search on the
 	 * normal root and not the commit root and we could deadlock.
 	 */
-	smp_mb();
-	if (fs_info->closing)
+	if (btrfs_fs_closing(fs_info))
 		return 0;
 
 	path = btrfs_alloc_path();
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index cb79b8975c9..b4087e0fa87 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -62,8 +62,7 @@ again:
 		goto out;
 
 	while (1) {
-		smp_mb();
-		if (fs_info->closing)
+		if (btrfs_fs_closing(fs_info))
 			goto out;
 
 		leaf = path->nodes[0];
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a83e44bf320..02ff4a1b968 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4266,8 +4266,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	if (BTRFS_I(inode)->dummy_inode)
 		return 0;
 
-	smp_mb();
-	if (root->fs_info->closing && is_free_space_inode(root, inode))
+	if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode))
 		nolock = true;
 
 	if (wbc->sync_mode == WB_SYNC_ALL) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 1204eab9402..df50fd1eca8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1183,7 +1183,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
 	int ret;
 	struct btrfs_device *dev;
 
-	if (root->fs_info->closing)
+	if (btrfs_fs_closing(root->fs_info))
 		return -EINVAL;
 
 	/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2d5c6d2aa4e..dd719662340 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -817,7 +817,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 		btrfs_btree_balance_dirty(info->tree_root, nr);
 		cond_resched();
 
-		if (root->fs_info->closing || ret != -EAGAIN)
+		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
 			break;
 	}
 	root->defrag_running = 0;
-- 
cgit v1.2.3-70-g09d2


From aa0467d8d2a00e75b2bb6a56a4ee6d70c5d1928f Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 3 Jun 2011 16:29:08 +0200
Subject: btrfs: fix uninitialized variable warning

With Linus' tree, today's linux-next build (powercp ppc64_defconfig)
produced this warning:

fs/btrfs/delayed-inode.c: In function 'btrfs_delayed_update_inode':
fs/btrfs/delayed-inode.c:1598:6: warning: 'ret' may be used
uninitialized in this function

Introduced by commit 16cdcec736cd ("btrfs: implement delayed inode items
operation").

This fixes a bug in btrfs_update_inode(): if the returned value from
btrfs_delayed_update_inode is a nonzero garbage, inode stat data are not
updated and several call paths may hit a BUG_ON or fail with strange
code.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/delayed-inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index c61c32cf0f7..6462c29d2d3 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1595,7 +1595,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root, struct inode *inode)
 {
 	struct btrfs_delayed_node *delayed_node;
-	int ret;
+	int ret = 0;
 
 	delayed_node = btrfs_get_or_create_delayed_node(inode);
 	if (IS_ERR(delayed_node))
-- 
cgit v1.2.3-70-g09d2


From 942c1a927bf296fd64fd49f04c5a8f66bb14446b Mon Sep 17 00:00:00 2001
From: Per Dalén <per.dalen@appeartv.com>
Date: Thu, 26 May 2011 09:08:53 -0400
Subject: hwmon: (max6642): Better chip detection schema

Improve detection of MAX6642 by reading non existing registers (0x04, 0x06
and 0xff). Reading those registers returns the previously read value.

Signed-off-by: Per Dalen <per.dalen@appeartv.com>
[guenter.roeck@ericsson.com: added second set of register reads]
Signed-off-by: Guenter Roeck <guenter.roeck@ericsson.com>
---
 drivers/hwmon/max6642.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/hwmon/max6642.c b/drivers/hwmon/max6642.c
index 0f30e1bb366..e855d3b0bd1 100644
--- a/drivers/hwmon/max6642.c
+++ b/drivers/hwmon/max6642.c
@@ -136,15 +136,29 @@ static int max6642_detect(struct i2c_client *client,
 	if (man_id != 0x4D)
 		return -ENODEV;
 
+	/* sanity check */
+	if (i2c_smbus_read_byte_data(client, 0x04) != 0x4D
+	    || i2c_smbus_read_byte_data(client, 0x06) != 0x4D
+	    || i2c_smbus_read_byte_data(client, 0xff) != 0x4D)
+		return -ENODEV;
+
 	/*
 	 * We read the config and status register, the 4 lower bits in the
 	 * config register should be zero and bit 5, 3, 1 and 0 should be
 	 * zero in the status register.
 	 */
 	reg_config = i2c_smbus_read_byte_data(client, MAX6642_REG_R_CONFIG);
+	if ((reg_config & 0x0f) != 0x00)
+		return -ENODEV;
+
+	/* in between, another round of sanity checks */
+	if (i2c_smbus_read_byte_data(client, 0x04) != reg_config
+	    || i2c_smbus_read_byte_data(client, 0x06) != reg_config
+	    || i2c_smbus_read_byte_data(client, 0xff) != reg_config)
+		return -ENODEV;
+
 	reg_status = i2c_smbus_read_byte_data(client, MAX6642_REG_R_STATUS);
-	if (((reg_config & 0x0f) != 0x00) ||
-	    ((reg_status & 0x2b) != 0x00))
+	if ((reg_status & 0x2b) != 0x00)
 		return -ENODEV;
 
 	strlcpy(info->type, "max6642", I2C_NAME_SIZE);
-- 
cgit v1.2.3-70-g09d2


From f2a4d8ae4d40f6f5482d207f47fd4d53b3ae0ed4 Mon Sep 17 00:00:00 2001
From: Stephen Warren <swarren@nvidia.com>
Date: Tue, 31 May 2011 15:14:07 -0600
Subject: ARM: Tegra: Harmony: Fix conflicting GPIO numbering

Currently, both the WM8903 and TPS6586x chips attempt to register with
gpiolib using the same GPIO numbers. This causes the audio driver to
fail to initialize.

To solve this, add a define to board-harmony.h for the TPS6586x, and make
board-harmony-power.c use this define, instead of directly referencing
TEGRA_NR_GPIOS.

This fixes a regression introduced by commit
6f168f2fa60f87e85e0df25e87e2372f22f5eb7c.
ARM: tegra: harmony: initialize the TPS65862 PMIC

Signed-off-by: Stephen Warren <swarren@nvidia.com>
Signed-off-by: Colin Cross <ccross@android.com>
---
 arch/arm/mach-tegra/board-harmony-power.c | 4 +++-
 arch/arm/mach-tegra/board-harmony.h       | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/arm/mach-tegra/board-harmony-power.c b/arch/arm/mach-tegra/board-harmony-power.c
index c84442cabe0..5ad8b2f94f8 100644
--- a/arch/arm/mach-tegra/board-harmony-power.c
+++ b/arch/arm/mach-tegra/board-harmony-power.c
@@ -24,6 +24,8 @@
 
 #include <mach/irqs.h>
 
+#include "board-harmony.h"
+
 #define PMC_CTRL		0x0
 #define PMC_CTRL_INTR_LOW	(1 << 17)
 
@@ -98,7 +100,7 @@ static struct tps6586x_platform_data tps_platform = {
 	.irq_base	= TEGRA_NR_IRQS,
 	.num_subdevs	= ARRAY_SIZE(tps_devs),
 	.subdevs	= tps_devs,
-	.gpio_base	= TEGRA_NR_GPIOS,
+	.gpio_base	= HARMONY_GPIO_TPS6586X(0),
 };
 
 static struct i2c_board_info __initdata harmony_regulators[] = {
diff --git a/arch/arm/mach-tegra/board-harmony.h b/arch/arm/mach-tegra/board-harmony.h
index 1e57b071f52..d85142edaf6 100644
--- a/arch/arm/mach-tegra/board-harmony.h
+++ b/arch/arm/mach-tegra/board-harmony.h
@@ -17,7 +17,8 @@
 #ifndef _MACH_TEGRA_BOARD_HARMONY_H
 #define _MACH_TEGRA_BOARD_HARMONY_H
 
-#define HARMONY_GPIO_WM8903(_x_)	(TEGRA_NR_GPIOS + (_x_))
+#define HARMONY_GPIO_TPS6586X(_x_)	(TEGRA_NR_GPIOS + (_x_))
+#define HARMONY_GPIO_WM8903(_x_)	(HARMONY_GPIO_TPS6586X(4) + (_x_))
 
 #define TEGRA_GPIO_SD2_CD		TEGRA_GPIO_PI5
 #define TEGRA_GPIO_SD2_WP		TEGRA_GPIO_PH1
-- 
cgit v1.2.3-70-g09d2


From e0dcd8a05be438b3d2e49ef61441ea3a463663f8 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 5 Jun 2011 22:03:13 -0700
Subject: mm: fix ENOSPC returned by handle_mm_fault()

Al Viro observes that in the hugetlb case, handle_mm_fault() may return
a value of the kind ENOSPC when its caller is expecting a value of the
kind VM_FAULT_SIGBUS: fix alloc_huge_page()'s failure returns.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f33bb319b73..6402458fee3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1033,10 +1033,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
-		return ERR_PTR(chg);
+		return ERR_PTR(-VM_FAULT_OOM);
 	if (chg)
 		if (hugetlb_get_quota(inode->i_mapping, chg))
-			return ERR_PTR(-ENOSPC);
+			return ERR_PTR(-VM_FAULT_SIGBUS);
 
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
-- 
cgit v1.2.3-70-g09d2


From 59c5f46fbe01a00eedf54a23789634438bb80603 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 6 Jun 2011 18:06:33 +0900
Subject: Linux 3.0-rc2

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index afb8e0d26f2..0f1db8d9074 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 3
 PATCHLEVEL = 0
 SUBLEVEL = 0
-EXTRAVERSION = -rc1
+EXTRAVERSION = -rc2
 NAME = Sneaky Weasel
 
 # *DOCUMENTATION*
-- 
cgit v1.2.3-70-g09d2


From c8fb13d04ad0d08772f637bee873e620fcc064c2 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Fri, 27 May 2011 13:56:12 -0700
Subject: OMAP: PM debug: fix section mismatch warnings

WARNING: arch/arm/mach-omap2/built-in.o(.text+0x423c): Section mismatch in reference from the function pm_dbg_regset_init() to the function .init.text:pm_dbg_init()
The function pm_dbg_regset_init() references
the function __init pm_dbg_init().
This is often because pm_dbg_regset_init lacks a __init
annotation or the annotation of pm_dbg_init is wrong.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Kevin Hilman <khilman@ti.com>
---
 arch/arm/mach-omap2/pm-debug.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/mach-omap2/pm-debug.c b/arch/arm/mach-omap2/pm-debug.c
index a5a83b358dd..e01da45c053 100644
--- a/arch/arm/mach-omap2/pm-debug.c
+++ b/arch/arm/mach-omap2/pm-debug.c
@@ -189,7 +189,7 @@ static struct dentry *pm_dbg_dir;
 
 static int pm_dbg_init_done;
 
-static int __init pm_dbg_init(void);
+static int pm_dbg_init(void);
 
 enum {
 	DEBUG_FILE_COUNTERS = 0,
@@ -595,7 +595,7 @@ static int option_set(void *data, u64 val)
 
 DEFINE_SIMPLE_ATTRIBUTE(pm_dbg_option_fops, option_get, option_set, "%llu\n");
 
-static int __init pm_dbg_init(void)
+static int pm_dbg_init(void)
 {
 	int i;
 	struct dentry *d;
-- 
cgit v1.2.3-70-g09d2


From 345f79b3de7f6d651e4dba794af7c7303bdfd649 Mon Sep 17 00:00:00 2001
From: Kevin Hilman <khilman@ti.com>
Date: Tue, 31 May 2011 16:08:09 -0700
Subject: OMAP: PM: omap_device: fix device power domain callbacks

After commit 4d27e9dcff00a6425d779b065ec8892e4f391661 (PM: Make power
domain callbacks take precedence over subsystem ones), the power
domain callbacks need to call the driver callbacks instead of relying
on the default subsystem (in this case, platform_bus) to handle the
driver callbacks.

Validated on 3430/n900, 3530/Overo.

Signed-off-by: Kevin Hilman <khilman@ti.com>
---
 arch/arm/plat-omap/omap_device.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/arm/plat-omap/omap_device.c b/arch/arm/plat-omap/omap_device.c
index a37b8eb65b7..49fc0df0c21 100644
--- a/arch/arm/plat-omap/omap_device.c
+++ b/arch/arm/plat-omap/omap_device.c
@@ -84,6 +84,7 @@
 #include <linux/io.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/pm_runtime.h>
 
 #include <plat/omap_device.h>
 #include <plat/omap_hwmod.h>
@@ -539,20 +540,34 @@ int omap_early_device_register(struct omap_device *od)
 static int _od_runtime_suspend(struct device *dev)
 {
 	struct platform_device *pdev = to_platform_device(dev);
+	int ret;
+
+	ret = pm_generic_runtime_suspend(dev);
+
+	if (!ret)
+		omap_device_idle(pdev);
+
+	return ret;
+}
 
-	return omap_device_idle(pdev);
+static int _od_runtime_idle(struct device *dev)
+{
+	return pm_generic_runtime_idle(dev);
 }
 
 static int _od_runtime_resume(struct device *dev)
 {
 	struct platform_device *pdev = to_platform_device(dev);
 
-	return omap_device_enable(pdev);
+	omap_device_enable(pdev);
+
+	return pm_generic_runtime_resume(dev);
 }
 
 static struct dev_power_domain omap_device_power_domain = {
 	.ops = {
 		.runtime_suspend = _od_runtime_suspend,
+		.runtime_idle = _od_runtime_idle,
 		.runtime_resume = _od_runtime_resume,
 		USE_PLATFORM_PM_SLEEP_OPS
 	}
-- 
cgit v1.2.3-70-g09d2