From 6af2d180f82151cf3d58952e35a4f96e45bc453a Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Thu, 26 Jul 2012 16:24:50 +0200 Subject: drm/i915: fix forcewake related hangs on snb ... by adding seemingly redudant posting reads. This little dragon lair exploded the first time around when we've refactored the code a bit to use the common wait_for_atomic_us in "drm/i915: Group the GT routines together in both code and vtable", which caused QA to file fdo bug #51738. Chris Wilson entertained a few approaches to fixing #51738: Replacing the udelay(1) with the previously-used udelay(10) (or any other "sufficiently larger" delay), adding a posting read, or ditching the delay completely and using cpu_relax. We went with the cpu_relax and "915: Workaround hang with BSD and forcewake on SandyBridge". Which blew up in fdo bug #52424, but adding the posting read while still using cpu_relax seems to also fix that, it looks like the posting read is the important ingriedient to fix these rc6 related hangs on snb. Popular theories as to why this is like it is include: - A herd of pink elephants got royally angered somehow. - The gpu has internally different functional units and judging by the register offsets, the forcewake request register and the forcewake ack registers are _not_ in the same functional unit (or at least aren't reached through the same routes). Hence the posting read syncs up with the wrong block and gets the entire gpu confused. - ... As a minimal ducttape fix for 3.6, let's just put these posting reads into place again. We can try fancier approaches (like adding back the cpu_relax instead of the udelay) in -next. This (re-)fixes a regression introduced in commit 990bbdadabaa51828e475eda86ee5720a4910cc3 Author: Chris Wilson Date: Mon Jul 2 11:51:02 2012 -0300 drm/i915: Group the GT routines together in both code and vtable Cc: Chris Wilson Tested-by: Chris Wilson Tested-by: Du Yan Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=52424 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=51738u Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_pm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_pm.c') diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 94aabcaa3a6..58c07cdafb7 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -3963,6 +3963,7 @@ static void __gen6_gt_force_wake_get(struct drm_i915_private *dev_priv) DRM_ERROR("Force wake wait timed out\n"); I915_WRITE_NOTRACE(FORCEWAKE, 1); + POSTING_READ(FORCEWAKE); if (wait_for_atomic_us((I915_READ_NOTRACE(forcewake_ack) & 1), 500)) DRM_ERROR("Force wake wait timed out\n"); @@ -3983,6 +3984,7 @@ static void __gen6_gt_force_wake_mt_get(struct drm_i915_private *dev_priv) DRM_ERROR("Force wake wait timed out\n"); I915_WRITE_NOTRACE(FORCEWAKE_MT, _MASKED_BIT_ENABLE(1)); + POSTING_READ(FORCEWAKE_MT); if (wait_for_atomic_us((I915_READ_NOTRACE(forcewake_ack) & 1), 500)) DRM_ERROR("Force wake wait timed out\n"); @@ -4018,14 +4020,14 @@ void gen6_gt_check_fifodbg(struct drm_i915_private *dev_priv) static void __gen6_gt_force_wake_put(struct drm_i915_private *dev_priv) { I915_WRITE_NOTRACE(FORCEWAKE, 0); - /* The below doubles as a POSTING_READ */ + POSTING_READ(FORCEWAKE); gen6_gt_check_fifodbg(dev_priv); } static void __gen6_gt_force_wake_mt_put(struct drm_i915_private *dev_priv) { I915_WRITE_NOTRACE(FORCEWAKE_MT, _MASKED_BIT_DISABLE(1)); - /* The below doubles as a POSTING_READ */ + POSTING_READ(FORCEWAKE_MT); gen6_gt_check_fifodbg(dev_priv); } -- cgit v1.2.3-70-g09d2 From 1ee9ae3244c4789f3184c5123f3b2d7e405b3f4c Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 15 Aug 2012 10:41:45 +0200 Subject: drm/i915: use hsw rps tuning values everywhere on gen6+ James Bottomley reported [1] a massive power regression, due to the enabling of semaphores by default in 3.5. A workaround for him is to again disable semaphores. And indeed, his system has a very hard time to enter rc6 with semaphores enabled. Ben Widawsky run around with a kill-a-watt a lot and noticed: - There are indeed a few rare systems that seem to have a hard time entering rc6 when desktop-idle. - One machine, The Indestructible Toshiba regressed in this behaviour between 3.5 and 3.6 in a merge commit! So rc6 behaviour with the current setting seems to be highly timing dependent and not robust at all. - The behaviour James reported wrt semaphores seems to be a freak timing thing that only happens on his specific machine, confirming that enabling semaphores shouldn't reduce rc6 residency. Now furthermore the Google ChromeOS guys reported [2] a while ago that at least on some machines a simply a blinking cursor can keep the gpu turbo at the highest frequency. This is because the current rps limits used on snb/ivb are highly asymmetric. On the theory that gpu turbo and rc6 tuning values are related, we've tried whether the much saner looking (since much less asymmetric) rps tuning values used for hsw would also help entering rc6 more robustly. And it seems to mostly work, and we don't really have the resources to through-roughly tune things in any better way: The values from the ChromeOS ppl seem to fare a bit worse for James' machine, so I guess we better stick with something vpg (the gpu hw/windows group) provided, hoping that they've done their jobs. Reference[1]: http://lists.freedesktop.org/archives/dri-devel/2012-July/025675.html Reference[2]: http://lists.freedesktop.org/archives/intel-gfx/2012-July/018692.html Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=53393 Tested-by: Ben Widawsky Cc: stable@vger.kernel.org Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_pm.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/i915/intel_pm.c') diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 58c07cdafb7..1881c8c83f0 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -2441,17 +2441,10 @@ static void gen6_enable_rps(struct drm_device *dev) dev_priv->max_delay << 24 | dev_priv->min_delay << 16); - if (IS_HASWELL(dev)) { - I915_WRITE(GEN6_RP_UP_THRESHOLD, 59400); - I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 245000); - I915_WRITE(GEN6_RP_UP_EI, 66000); - I915_WRITE(GEN6_RP_DOWN_EI, 350000); - } else { - I915_WRITE(GEN6_RP_UP_THRESHOLD, 10000); - I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 1000000); - I915_WRITE(GEN6_RP_UP_EI, 100000); - I915_WRITE(GEN6_RP_DOWN_EI, 5000000); - } + I915_WRITE(GEN6_RP_UP_THRESHOLD, 59400); + I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 245000); + I915_WRITE(GEN6_RP_UP_EI, 66000); + I915_WRITE(GEN6_RP_DOWN_EI, 350000); I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10); I915_WRITE(GEN6_RP_CONTROL, -- cgit v1.2.3-70-g09d2