From 2e7ba01494ea1ec33ae7d7f5f124975818ddb825 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 9 Aug 2016 08:37:00 +0100 Subject: [PATCH 001/141] drm/i915: Remove unused i915_gem_active_peek_rcu() This was originally introduced to be used by the busy-ioctl, but in the end busy ioctl performed a different dance. Since there are no users, and no likely users, remove an unwanted chunk of the API. Suggested-by: Daniel Vetter Signed-off-by: Chris Wilson Cc: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1470728222-10243-1-git-send-email-chris@chris-wilson.co.uk Reviewed-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_request.h | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index 3496e28785e7..583e237b98d1 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -380,27 +380,6 @@ i915_gem_active_peek(const struct i915_gem_active *active, struct mutex *mutex) return request; } -/** - * i915_gem_active_peek_rcu - report the active request being monitored - * @active - the active tracker - * - * i915_gem_active_peek_rcu() returns the current request being tracked if - * still active, or NULL. It does not obtain a reference on the request - * for the caller, and inspection of the request is only valid under - * the RCU lock. - */ -static inline struct drm_i915_gem_request * -i915_gem_active_peek_rcu(const struct i915_gem_active *active) -{ - struct drm_i915_gem_request *request; - - request = rcu_dereference(active->request); - if (!request || i915_gem_request_completed(request)) - return NULL; - - return request; -} - /** * i915_gem_active_get - return a reference to the active request * @active - the active tracker From 385384a82cb4d9d1725330fde293877c36c1dba2 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 9 Aug 2016 08:37:01 +0100 Subject: [PATCH 002/141] drm/i915: Wrap the protected active RCU dereference in a helper As we do the lockdep protected RCU lookup in a couple of places, refactor that code to a common helper i915_gem_active_raw(). Signed-off-by: Chris Wilson Reviewed-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1470728222-10243-2-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_request.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index 583e237b98d1..f6661f31a348 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -359,6 +359,21 @@ __i915_gem_active_peek(const struct i915_gem_active *active) return rcu_dereference_protected(active->request, 1); } +/** + * i915_gem_active_raw - return the active request + * @active - the active tracker + * + * i915_gem_active_raw() returns the current request being tracked, or NULL. + * It does not obtain a reference on the request for the caller, so the caller + * must hold struct_mutex. 
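+ * (The mutex is not acquired here; it is passed in only so that
+ * lockdep can assert that the caller already holds it.)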
+ */ +static inline struct drm_i915_gem_request * +i915_gem_active_raw(const struct i915_gem_active *active, struct mutex *mutex) +{ + return rcu_dereference_protected(active->request, + lockdep_is_held(mutex)); +} + /** * i915_gem_active_peek - report the active request being monitored * @active - the active tracker @@ -372,8 +387,7 @@ i915_gem_active_peek(const struct i915_gem_active *active, struct mutex *mutex) { struct drm_i915_gem_request *request; - request = rcu_dereference_protected(active->request, - lockdep_is_held(mutex)); + request = i915_gem_active_raw(active, mutex); if (!request || i915_gem_request_completed(request)) return NULL; @@ -614,8 +628,7 @@ i915_gem_active_retire(struct i915_gem_active *active, struct drm_i915_gem_request *request; int ret; - request = rcu_dereference_protected(active->request, - lockdep_is_held(mutex)); + request = i915_gem_active_raw(active, mutex); if (!request) return 0; From 87b723a16de9ff95e2b7d61dbd86bddd3c1716d9 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 9 Aug 2016 08:37:02 +0100 Subject: [PATCH 003/141] drm/i915: Don't check for idleness before retiring after a GPU hang When we force the cleanup after a GPU hang, we want to retire all requests, or else we may leak them if truly wedged (and the GPU never advances again). Converting to the active request helpers had the issue of doing the check against busyness before reporting the request, so if we claim the GPU had hung but this engine hadn't we could potential skip the request cleanup - triggering the self-check BUG. Fixes: dcff85c8443e ("drm/i915: Enable i915_gem_wait_for_idle() ...") Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Reviewed-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1470728222-10243-3-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 7a00678ae729..bb83069ce016 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2423,15 +2423,11 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine) struct drm_i915_gem_request *request; struct intel_ring *ring; - request = i915_gem_active_peek(&engine->last_request, - &engine->i915->drm.struct_mutex); - /* Mark all pending requests as complete so that any concurrent * (lockless) lookup doesn't try and wait upon the request as we * reset it. */ - if (request) - intel_engine_init_seqno(engine, request->fence.seqno); + intel_engine_init_seqno(engine, engine->last_submitted_seqno); /* * Clear the execlists queue up before freeing the requests, as those @@ -2453,6 +2449,8 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine) * implicit references on things like e.g. ppgtt address spaces through * the request. */ + request = i915_gem_active_raw(&engine->last_request, + &engine->i915->drm.struct_mutex); if (request) i915_gem_request_retire_upto(request); GEM_BUG_ON(intel_engine_is_active(engine)); From edf6b76f64a2f62b81ed796fe2ce6dd664351d64 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 9 Aug 2016 09:23:33 +0100 Subject: [PATCH 004/141] drm/i915: Add smp_rmb() to busy ioctl's RCU dance In the debate as to whether the second read of active->request is ordered after the dependent reads of the first read of active->request, just give in and throw a smp_rmb() in there so that ordering of loads is assured. 
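To make the ordering hazard concrete, here is a condensed sketch of the
loop being fixed (names as in __busy_set_if_active() in the hunk below;
illustrative only, and it assumes the caller holds rcu_read_lock()):

	do {
		struct drm_i915_gem_request *request;
		unsigned int id;

		request = rcu_dereference(active->request);
		if (!request || i915_gem_request_completed(request))
			return 0;

		/* Loads through the request (engine->exec_id here) must
		 * complete before the confirming re-read below, otherwise
		 * the re-read tells us nothing about the value we read.
		 */
		id = request->engine->exec_id;

		smp_rmb(); /* see the full comment in the hunk below */

		if (request == rcu_access_pointer(active->request))
			return flag(id);
	} while (1);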
v2: Explain the manual smp_rmb() Signed-off-by: Chris Wilson Cc: Daniel Vetter Reviewed-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1470731014-6894-1-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 36 +++++++++++++++++++++---- drivers/gpu/drm/i915/i915_gem_request.h | 3 +++ 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index bb83069ce016..373136f57d7b 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3733,7 +3733,7 @@ i915_gem_object_ggtt_unpin_view(struct drm_i915_gem_object *obj, i915_vma_unpin(i915_gem_obj_to_ggtt_view(obj, view)); } -static __always_inline unsigned __busy_read_flag(unsigned int id) +static __always_inline unsigned int __busy_read_flag(unsigned int id) { /* Note that we could alias engines in the execbuf API, but * that would be very unwise as it prevents userspace from @@ -3751,7 +3751,7 @@ static __always_inline unsigned int __busy_write_id(unsigned int id) return id; } -static __always_inline unsigned +static __always_inline unsigned int __busy_set_if_active(const struct i915_gem_active *active, unsigned int (*flag)(unsigned int id)) { @@ -3768,19 +3768,45 @@ __busy_set_if_active(const struct i915_gem_active *active, id = request->engine->exec_id; - /* Check that the pointer wasn't reassigned and overwritten. */ + /* Check that the pointer wasn't reassigned and overwritten. + * + * In __i915_gem_active_get_rcu(), we enforce ordering between + * the first rcu pointer dereference (imposing a + * read-dependency only on access through the pointer) and + * the second lockless access through the memory barrier + * following a successful atomic_inc_not_zero(). Here there + * is no such barrier, and so we must manually insert an + * explicit read barrier to ensure that the following + * access occurs after all the loads through the first + * pointer. + * + * It is worth comparing this sequence with + * raw_write_seqcount_latch() which operates very similarly. + * The challenge here is the visibility of the other CPU + * writes to the reallocated request vs the local CPU ordering. + * Before the other CPU can overwrite the request, it will + * have updated our active->request and gone through a wmb. + * During the read here, we want to make sure that the values + * we see have not been overwritten as we do so - and we do + * that by serialising the second pointer check with the writes + * on other other CPUs. + * + * The corresponding write barrier is part of + * rcu_assign_pointer(). + */ + smp_rmb(); if (request == rcu_access_pointer(active->request)) return flag(id); } while (1); } -static inline unsigned +static __always_inline unsigned int busy_check_reader(const struct i915_gem_active *active) { return __busy_set_if_active(active, __busy_read_flag); } -static inline unsigned +static __always_inline unsigned int busy_check_writer(const struct i915_gem_active *active) { return __busy_set_if_active(active, __busy_write_id); diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index f6661f31a348..6dd83b172c7a 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -490,6 +490,9 @@ __i915_gem_active_get_rcu(const struct i915_gem_active *active) * incremented) then the following read for rcu_access_pointer() * must occur after the atomic operation and so confirm * that this request is the one currently being tracked. 
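+ * (rcu_pointer_handoff() below is purely documentary: it marks the
+ * point at which responsibility for the reference passes to the
+ * caller, and compiles down to the bare pointer.)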
+ * + * The corresponding write barrier is part of + * rcu_assign_pointer(). */ if (!request || request == rcu_access_pointer(active->request)) return rcu_pointer_handoff(request); From 5a198b8c53e68b6e4737b6890995e676d856c9ed Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 9 Aug 2016 09:23:34 +0100 Subject: [PATCH 005/141] drm/i915: Do not overwrite the request with zero on reallocation When using RCU lookup for the request, commit 0eafec6d3244 ("drm/i915: Enable lockless lookup of request tracking via RCU"), we acknowledge that we may race with another thread that could have reallocated the request. In order for the first thread not to blow up, the second thread must not clear the request completed before overwriting it. In the RCU lookup, we allow for the engine/seqno to be replaced but we do not allow for it to be zeroed. The choice we make is to either add extra checking to the RCU lookup, or embrace the inherent races (as intended). It is more complicated as we need to manually clear everything we depend upon being zero initialised, but we benefit from not emiting the memset() to clear the entire frequently allocated structure (that memset turns up in throughput profiles). And at the same time, the lookup remains flexible for future adjustments. v2: Old style LRC requires another variable to be initialize. (The danger inherent in not zeroing everything.) v3: request->batch also needs to be cleared v4: signaling.tsk is no long used unset, but pid still exists Fixes: 0eafec6d3244 ("drm/i915: Enable lockless lookup of request...") Signed-off-by: Chris Wilson Cc: "Goel, Akash" Cc: Daniel Vetter Cc: Joonas Lahtinen Reviewed-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1470731014-6894-2-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_request.c | 37 ++++++++++++++++++++++++- drivers/gpu/drm/i915/i915_gem_request.h | 11 ++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 6a1661643d3d..06d1267e733d 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -355,7 +355,35 @@ i915_gem_request_alloc(struct intel_engine_cs *engine, if (req && i915_gem_request_completed(req)) i915_gem_request_retire(req); - req = kmem_cache_zalloc(dev_priv->requests, GFP_KERNEL); + /* Beware: Dragons be flying overhead. + * + * We use RCU to look up requests in flight. The lookups may + * race with the request being allocated from the slab freelist. + * That is the request we are writing to here, may be in the process + * of being read by __i915_gem_active_get_request_rcu(). As such, + * we have to be very careful when overwriting the contents. During + * the RCU lookup, we change chase the request->engine pointer, + * read the request->fence.seqno and increment the reference count. + * + * The reference count is incremented atomically. If it is zero, + * the lookup knows the request is unallocated and complete. Otherwise, + * it is either still in use, or has been reallocated and reset + * with fence_init(). This increment is safe for release as we check + * that the request we have a reference to and matches the active + * request. + * + * Before we increment the refcount, we chase the request->engine + * pointer. We must not call kmem_cache_zalloc() or else we set + * that pointer to NULL and cause a crash during the lookup. 
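+ * (The request cache is created with SLAB_DESTROY_BY_RCU by commit
+ * 0eafec6d3244 cited above, so a freed request can only be reused
+ * as another request while a reader sits in its RCU read section.)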
If + * we see the request is completed (based on the value of the + * old engine and seqno), the lookup is complete and reports NULL. + * If we decide the request is not completed (new engine or seqno), + * then we grab a reference and double check that it is still the + * active request - which it won't be and restart the lookup. + * + * Do not use kmem_cache_zalloc() here! + */ + req = kmem_cache_alloc(dev_priv->requests, GFP_KERNEL); if (!req) return ERR_PTR(-ENOMEM); @@ -375,6 +403,13 @@ i915_gem_request_alloc(struct intel_engine_cs *engine, req->engine = engine; req->ctx = i915_gem_context_get(ctx); + /* No zalloc, must clear what we need by hand */ + req->previous_context = NULL; + req->file_priv = NULL; + req->batch_obj = NULL; + req->pid = NULL; + req->elsp_submitted = 0; + /* * Reserve space in the ring buffer for all the commands required to * eventually emit this request. This is to guarantee that the diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index 6dd83b172c7a..4e91d4912443 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -51,6 +51,13 @@ struct intel_signal_node { * emission time to be associated with the request for tracking how far ahead * of the GPU the submission is. * + * When modifying this structure be very aware that we perform a lockless + * RCU lookup of it that may race against reallocation of the struct + * from the slab freelist. We intentionally do not zero the structure on + * allocation so that the lookup can use the dangling pointers (and is + * cogniscent that those pointers may be wrong). Instead, everything that + * needs to be initialised must be done so explicitly. + * * The requests are reference counted. */ struct drm_i915_gem_request { @@ -458,6 +465,10 @@ __i915_gem_active_get_rcu(const struct i915_gem_active *active) * just report the active tracker is idle. If the new request is * incomplete, then we acquire a reference on it and check that * it remained the active request. + * + * It is then imperative that we do not zero the request on + * reallocation, so that we can chase the dangling pointers! + * See i915_gem_request_alloc(). */ do { struct drm_i915_gem_request *request; From 09fa8bb9094569941661ed2a1dc0ff4bd37a0d7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Aug 2016 20:41:34 +0300 Subject: [PATCH 006/141] drm/i915: Add some curly braces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit intel_enable_pipe() looks rather confusing when one side doesn't have the curly braces, and the other one does. And what's even worse, there's another if-else inside the braceless side. Let's put braces around it to make it clear which branch goes where. Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470418894-1249-1-git-send-email-ville.syrjala@linux.intel.com Reviewed-by: Chris Wilson --- drivers/gpu/drm/i915/intel_display.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 9cbf5431c1e3..ddae54ad4ac8 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -1959,12 +1959,12 @@ static void intel_enable_pipe(struct intel_crtc *crtc) * a plane. On ILK+ the pipe PLLs are integrated, so we don't * need the check. 
*/ - if (HAS_GMCH_DISPLAY(dev_priv)) + if (HAS_GMCH_DISPLAY(dev_priv)) { if (intel_crtc_has_type(crtc->config, INTEL_OUTPUT_DSI)) assert_dsi_pll_enabled(dev_priv); else assert_pll_enabled(dev_priv, pipe); - else { + } else { if (crtc->config->has_pch_encoder) { /* if driving the PCH, we need FDI enabled */ assert_fdi_rx_pll_enabled(dev_priv, pch_transcoder); From a168f5b3f1797d6704d2edb36a3674e663154f80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Aug 2016 20:00:17 +0300 Subject: [PATCH 007/141] drm/i915: Don't mark PCH underrun reporting as disabled for transcoder B/C on LPT-H MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Marking PCH transcoder FIFO underrun reporting as disabled for transcoder B/C on LPT-H will block us from enabling the south error interrupt. So let's only mark transcoder A underrun reporting as disabled initially. This is a little tricky to hit since you need a machine with LPT-H, and the BIOS must enable either pipe B or C at boot. Then i915 would mark the "transcoder B/C" underrun reporting as disabled and never enable it again, meaning south interrupts would never get enabled either. The only other interrupt in there is actually the poison interrupt which, if we could ever trigger it, would just result in a little error in dmesg. Here's the resulting change in SDEIMR on my HSW when I boot it with multiple displays attached: - (0x000c4004): 0xf115ffff + (0x000c4004): 0xf114ffff My previous attempt [1] tried to fix this a little differently, but Daniel requested I do this instead. [1] https://lists.freedesktop.org/archives/intel-gfx/2015-November/081420.html Cc: Daniel Vetter Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470416417-15021-1-git-send-email-ville.syrjala@linux.intel.com Reviewed-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_display.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index ddae54ad4ac8..47ad6b3368e3 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -15771,6 +15771,13 @@ static bool intel_encoder_has_connectors(struct intel_encoder *encoder) return false; } +static bool has_pch_trancoder(struct drm_i915_private *dev_priv, + enum transcoder pch_transcoder) +{ + return HAS_PCH_IBX(dev_priv) || HAS_PCH_CPT(dev_priv) || + (HAS_PCH_LPT_H(dev_priv) && pch_transcoder == TRANSCODER_A); +} + static void intel_sanitize_crtc(struct intel_crtc *crtc) { struct drm_device *dev = crtc->base.dev; @@ -15849,7 +15856,17 @@ static void intel_sanitize_crtc(struct intel_crtc *crtc) * worst a fifo underrun happens which also sets this to false. */ crtc->cpu_fifo_underrun_disabled = true; - crtc->pch_fifo_underrun_disabled = true; + /* + * We track the PCH trancoder underrun reporting state + * within the crtc. With crtc for pipe A housing the underrun + * reporting state for PCH transcoder A, crtc for pipe B housing + * it for PCH transcoder B, etc. LPT-H has only PCH transcoder A, + * and marking underrun reporting as disabled for the non-existing + * PCH transcoders B and C would prevent enabling the south + * error interrupt (see cpt_can_enable_serr_int()). 
+ */ + if (has_pch_trancoder(dev_priv, (enum transcoder)crtc->pipe)) + crtc->pch_fifo_underrun_disabled = true; } } From 739748939974791b84629a8790527a16f76873a4 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Fri, 5 Aug 2016 23:28:27 +0300 Subject: [PATCH 008/141] drm/i915: Fix modeset handling during gpu reset, v5. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function would call drm_modeset_lock_all, while the suspend/resume functions already have their own locking. Fix this by factoring out __intel_display_resume, and calling the atomic helpers for duplicating atomic state and disabling all crtc's during suspend. Changes since v1: - Deal with -EDEADLK right after lock_all and clean up calls to hw readout. - Always take all modeset locks so updates during gpu reset are blocked. Changes since v2: - Fix deadlock in intel_update_primary_planes. - Move WARN_ON(EDEADLK) to __intel_display_resume. - pctx -> ctx - only call __intel_display_resume on success in intel_display_resume. Changes since v3: - Rebase on top of dev_priv -> dev change. - Use drm_modeset_lock_all_ctx instead of drm_modeset_lock_all. Changes since v4 [by vsyrjala]: - Deal with skip_intermediate_wm - Update comment w.r.t. mode_config.mutex vs. ->detect() - Rebase due to INTEL_GEN() etc. Signed-off-by: Maarten Lankhorst Fixes: e2c8b8701e2d ("drm/i915: Use atomic helpers for suspend, v2.") Cc: stable@vger.kernel.org Tested-by: Ville Syrjälä Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470428910-12125-2-git-send-email-ville.syrjala@linux.intel.com --- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/intel_display.c | 178 +++++++++++++++++---------- 2 files changed, 115 insertions(+), 64 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index c36d17659ebe..54f789c75aa1 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1840,6 +1840,7 @@ struct drm_i915_private { enum modeset_restore modeset_restore; struct mutex modeset_restore_lock; struct drm_atomic_state *modeset_restore_state; + struct drm_modeset_acquire_ctx reset_ctx; struct list_head vm_list; /* Global list of all address spaces */ struct i915_ggtt ggtt; /* VM representing the global address space */ diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 47ad6b3368e3..a95139575324 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -3093,40 +3093,110 @@ static void intel_update_primary_planes(struct drm_device *dev) for_each_crtc(dev, crtc) { struct intel_plane *plane = to_intel_plane(crtc->primary); - struct intel_plane_state *plane_state; - - drm_modeset_lock_crtc(crtc, &plane->base); - plane_state = to_intel_plane_state(plane->base.state); + struct intel_plane_state *plane_state = + to_intel_plane_state(plane->base.state); if (plane_state->visible) plane->update_plane(&plane->base, to_intel_crtc_state(crtc->state), plane_state); - - drm_modeset_unlock_crtc(crtc); } } +static int +__intel_display_resume(struct drm_device *dev, + struct drm_atomic_state *state) +{ + struct drm_crtc_state *crtc_state; + struct drm_crtc *crtc; + int i, ret; + + intel_modeset_setup_hw_state(dev); + i915_redisable_vga(dev); + + if (!state) + return 0; + + for_each_crtc_in_state(state, crtc, crtc_state, i) { + /* + * Force recalculation even if we restore + * current state. 
With fast modeset this may not result + * in a modeset when the state is compatible. + */ + crtc_state->mode_changed = true; + } + + /* ignore any reset values/BIOS leftovers in the WM registers */ + to_intel_atomic_state(state)->skip_intermediate_wm = true; + + ret = drm_atomic_commit(state); + + WARN_ON(ret == -EDEADLK); + return ret; +} + void intel_prepare_reset(struct drm_i915_private *dev_priv) { + struct drm_device *dev = &dev_priv->drm; + struct drm_modeset_acquire_ctx *ctx = &dev_priv->reset_ctx; + struct drm_atomic_state *state; + int ret; + /* no reset support for gen2 */ if (IS_GEN2(dev_priv)) return; - /* reset doesn't touch the display */ + /* + * Need mode_config.mutex so that we don't + * trample ongoing ->detect() and whatnot. + */ + mutex_lock(&dev->mode_config.mutex); + drm_modeset_acquire_init(ctx, 0); + while (1) { + ret = drm_modeset_lock_all_ctx(dev, ctx); + if (ret != -EDEADLK) + break; + + drm_modeset_backoff(ctx); + } + + /* reset doesn't touch the display, but flips might get nuked anyway, */ if (INTEL_GEN(dev_priv) >= 5 || IS_G4X(dev_priv)) return; - drm_modeset_lock_all(&dev_priv->drm); /* * Disabling the crtcs gracefully seems nicer. Also the * g33 docs say we should at least disable all the planes. */ - intel_display_suspend(&dev_priv->drm); + state = drm_atomic_helper_duplicate_state(dev, ctx); + if (IS_ERR(state)) { + ret = PTR_ERR(state); + state = NULL; + DRM_ERROR("Duplicating state failed with %i\n", ret); + goto err; + } + + ret = drm_atomic_helper_disable_all(dev, ctx); + if (ret) { + DRM_ERROR("Suspending crtc's failed with %i\n", ret); + goto err; + } + + dev_priv->modeset_restore_state = state; + state->acquire_ctx = ctx; + return; + +err: + drm_atomic_state_free(state); } void intel_finish_reset(struct drm_i915_private *dev_priv) { + struct drm_device *dev = &dev_priv->drm; + struct drm_modeset_acquire_ctx *ctx = &dev_priv->reset_ctx; + struct drm_atomic_state *state = dev_priv->modeset_restore_state; + int ret; + /* * Flips in the rings will be nuked by the reset, * so complete all pending flips so that user space @@ -3138,6 +3208,8 @@ void intel_finish_reset(struct drm_i915_private *dev_priv) if (IS_GEN2(dev_priv)) return; + dev_priv->modeset_restore_state = NULL; + /* reset doesn't touch the display */ if (INTEL_GEN(dev_priv) >= 5 || IS_G4X(dev_priv)) { /* @@ -3149,29 +3221,32 @@ void intel_finish_reset(struct drm_i915_private *dev_priv) * FIXME: Atomic will make this obsolete since we won't schedule * CS-based flips (which might get lost in gpu resets) any more. */ - intel_update_primary_planes(&dev_priv->drm); - return; + intel_update_primary_planes(dev); + } else { + /* + * The display has been reset as well, + * so need a full re-initialization. + */ + intel_runtime_pm_disable_interrupts(dev_priv); + intel_runtime_pm_enable_interrupts(dev_priv); + + intel_modeset_init_hw(dev); + + spin_lock_irq(&dev_priv->irq_lock); + if (dev_priv->display.hpd_irq_setup) + dev_priv->display.hpd_irq_setup(dev_priv); + spin_unlock_irq(&dev_priv->irq_lock); + + ret = __intel_display_resume(dev, state); + if (ret) + DRM_ERROR("Restoring old state failed with %i\n", ret); + + intel_hpd_init(dev_priv); } - /* - * The display has been reset as well, - * so need a full re-initialization. 
- */ - intel_runtime_pm_disable_interrupts(dev_priv); - intel_runtime_pm_enable_interrupts(dev_priv); - - intel_modeset_init_hw(&dev_priv->drm); - - spin_lock_irq(&dev_priv->irq_lock); - if (dev_priv->display.hpd_irq_setup) - dev_priv->display.hpd_irq_setup(dev_priv); - spin_unlock_irq(&dev_priv->irq_lock); - - intel_display_resume(&dev_priv->drm); - - intel_hpd_init(dev_priv); - - drm_modeset_unlock_all(&dev_priv->drm); + drm_modeset_drop_locks(ctx); + drm_modeset_acquire_fini(ctx); + mutex_unlock(&dev->mode_config.mutex); } static bool intel_crtc_has_pending_flip(struct drm_crtc *crtc) @@ -16180,9 +16255,10 @@ void intel_display_resume(struct drm_device *dev) struct drm_atomic_state *state = dev_priv->modeset_restore_state; struct drm_modeset_acquire_ctx ctx; int ret; - bool setup = false; dev_priv->modeset_restore_state = NULL; + if (state) + state->acquire_ctx = &ctx; /* * This is a cludge because with real atomic modeset mode_config.mutex @@ -16193,43 +16269,17 @@ void intel_display_resume(struct drm_device *dev) mutex_lock(&dev->mode_config.mutex); drm_modeset_acquire_init(&ctx, 0); -retry: - ret = drm_modeset_lock_all_ctx(dev, &ctx); + while (1) { + ret = drm_modeset_lock_all_ctx(dev, &ctx); + if (ret != -EDEADLK) + break; - if (ret == 0 && !setup) { - setup = true; - - intel_modeset_setup_hw_state(dev); - i915_redisable_vga(dev); - } - - if (ret == 0 && state) { - struct drm_crtc_state *crtc_state; - struct drm_crtc *crtc; - int i; - - state->acquire_ctx = &ctx; - - /* ignore any reset values/BIOS leftovers in the WM registers */ - to_intel_atomic_state(state)->skip_intermediate_wm = true; - - for_each_crtc_in_state(state, crtc, crtc_state, i) { - /* - * Force recalculation even if we restore - * current state. With fast modeset this may not result - * in a modeset when the state is compatible. - */ - crtc_state->mode_changed = true; - } - - ret = drm_atomic_commit(state); - } - - if (ret == -EDEADLK) { drm_modeset_backoff(&ctx); - goto retry; } + if (!ret) + ret = __intel_display_resume(dev, state); + drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); mutex_unlock(&dev->mode_config.mutex); From 522a63de18a12a3c74609323f984790e21d4a47e Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Fri, 5 Aug 2016 23:28:28 +0300 Subject: [PATCH 009/141] drm/i915: Add a way to test the modeset done during gpu reset, v3. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add force_reset_modeset_test as a parameter to force the modeset path during gpu reset. This allows a IGT test to set the knob and trigger a hang to force the gpu reset, even on platforms that wouldn't otherwise require it. Changes since v1: - Split out fix to separate commit. Changes since v2: - This commit is purely about force_reset_modeset_test now. 
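Since the parameter is registered with mode 0600 (see the
module_param_named_unsafe() hunk below), a test should be able to flip
it at runtime by writing to
/sys/module/i915/parameters/force_reset_modeset_test as root before
provoking a hang, without reloading the module.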
Signed-off-by: Maarten Lankhorst Testcase: drv_hangman.reset-with-forced-modeset Tested-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470428910-12125-3-git-send-email-ville.syrjala@linux.intel.com Signed-off-by: Ville Syrjälä --- drivers/gpu/drm/i915/i915_params.c | 6 ++++++ drivers/gpu/drm/i915/i915_params.h | 1 + drivers/gpu/drm/i915/intel_display.c | 29 +++++++++++++++++----------- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_params.c b/drivers/gpu/drm/i915/i915_params.c index b6e404c91eed..768ad89d9cd4 100644 --- a/drivers/gpu/drm/i915/i915_params.c +++ b/drivers/gpu/drm/i915/i915_params.c @@ -45,6 +45,7 @@ struct i915_params i915 __read_mostly = { .fastboot = 0, .prefault_disable = 0, .load_detect_test = 0, + .force_reset_modeset_test = 0, .reset = true, .invert_brightness = 0, .disable_display = 0, @@ -161,6 +162,11 @@ MODULE_PARM_DESC(load_detect_test, "Force-enable the VGA load detect code for testing (default:false). " "For developers only."); +module_param_named_unsafe(force_reset_modeset_test, i915.force_reset_modeset_test, bool, 0600); +MODULE_PARM_DESC(force_reset_modeset_test, + "Force a modeset during gpu reset for testing (default:false). " + "For developers only."); + module_param_named_unsafe(invert_brightness, i915.invert_brightness, int, 0600); MODULE_PARM_DESC(invert_brightness, "Invert backlight brightness " diff --git a/drivers/gpu/drm/i915/i915_params.h b/drivers/gpu/drm/i915/i915_params.h index 0ad020b4a925..3a0dd78ddb38 100644 --- a/drivers/gpu/drm/i915/i915_params.h +++ b/drivers/gpu/drm/i915/i915_params.h @@ -57,6 +57,7 @@ struct i915_params { bool fastboot; bool prefault_disable; bool load_detect_test; + bool force_reset_modeset_test; bool reset; bool disable_display; bool verbose_state_checks; diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index a95139575324..8fa200bfc2c1 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -3161,7 +3161,8 @@ void intel_prepare_reset(struct drm_i915_private *dev_priv) } /* reset doesn't touch the display, but flips might get nuked anyway, */ - if (INTEL_GEN(dev_priv) >= 5 || IS_G4X(dev_priv)) + if (!i915.force_reset_modeset_test && + (INTEL_GEN(dev_priv) >= 5 || IS_G4X(dev_priv))) return; /* @@ -3212,16 +3213,22 @@ void intel_finish_reset(struct drm_i915_private *dev_priv) /* reset doesn't touch the display */ if (INTEL_GEN(dev_priv) >= 5 || IS_G4X(dev_priv)) { - /* - * Flips in the rings have been nuked by the reset, - * so update the base address of all primary - * planes to the the last fb to make sure we're - * showing the correct fb after a reset. - * - * FIXME: Atomic will make this obsolete since we won't schedule - * CS-based flips (which might get lost in gpu resets) any more. - */ - intel_update_primary_planes(dev); + if (!state) { + /* + * Flips in the rings have been nuked by the reset, + * so update the base address of all primary + * planes to the the last fb to make sure we're + * showing the correct fb after a reset. + * + * FIXME: Atomic will make this obsolete since we won't schedule + * CS-based flips (which might get lost in gpu resets) any more. 
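+ * (When i915.force_reset_modeset_test is set,
+ * intel_prepare_reset() now duplicates the state even
+ * though the reset does not clobber the display, so
+ * state is non-NULL here and we take the restore
+ * branch below instead.)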
+ */ + intel_update_primary_planes(dev); + } else { + ret = __intel_display_resume(dev, state); + if (ret) + DRM_ERROR("Restoring old state failed with %i\n", ret); + } } else { /* * The display has been reset as well, From 4ac2ba2f8c4b7bb33d32fc1be290daf25d39802a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Aug 2016 23:28:29 +0300 Subject: [PATCH 010/141] drm/i915: Introduce gpu_reset_clobbers_display() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Factor out the "does the GPU reset clobber the display?" check into a small helper. Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470428910-12125-4-git-send-email-ville.syrjala@linux.intel.com Reviewed-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_display.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 8fa200bfc2c1..5d3766105b33 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -3135,6 +3135,11 @@ __intel_display_resume(struct drm_device *dev, return ret; } +static bool gpu_reset_clobbers_display(struct drm_i915_private *dev_priv) +{ + return INTEL_GEN(dev_priv) < 5 && !IS_G4X(dev_priv); +} + void intel_prepare_reset(struct drm_i915_private *dev_priv) { struct drm_device *dev = &dev_priv->drm; @@ -3162,7 +3167,7 @@ void intel_prepare_reset(struct drm_i915_private *dev_priv) /* reset doesn't touch the display, but flips might get nuked anyway, */ if (!i915.force_reset_modeset_test && - (INTEL_GEN(dev_priv) >= 5 || IS_G4X(dev_priv))) + !gpu_reset_clobbers_display(dev_priv)) return; /* @@ -3212,7 +3217,7 @@ void intel_finish_reset(struct drm_i915_private *dev_priv) dev_priv->modeset_restore_state = NULL; /* reset doesn't touch the display */ - if (INTEL_GEN(dev_priv) >= 5 || IS_G4X(dev_priv)) { + if (!gpu_reset_clobbers_display(dev_priv)) { if (!state) { /* * Flips in the rings have been nuked by the reset, From ae98104bec5bbe9543764a78f37a421d45dc65af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 5 Aug 2016 23:28:30 +0300 Subject: [PATCH 011/141] drm/i915: Use the g4x+ approach on gen2 for handling display stuff around GPU reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't have GPU reset support for gen2, which means the display hardware is unaffected when a GPU hang is handled. However as the ring has in fact stopped, any flips still in the ring will never complete, and thus the display base address updates will never happen. So we really need to fix that up manually just like we do on g4x+. In fact, let's just use intel_has_gpu_reset() instead of IS_GEN2() since that'll also handle cases where someone would disable the GPU reset support on gen3/4 for whatever reason. 
Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470428910-12125-5-git-send-email-ville.syrjala@linux.intel.com Reviewed-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_display.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 5d3766105b33..ecd154347cc1 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -3137,7 +3137,8 @@ __intel_display_resume(struct drm_device *dev, static bool gpu_reset_clobbers_display(struct drm_i915_private *dev_priv) { - return INTEL_GEN(dev_priv) < 5 && !IS_G4X(dev_priv); + return intel_has_gpu_reset(dev_priv) && + INTEL_GEN(dev_priv) < 5 && !IS_G4X(dev_priv); } void intel_prepare_reset(struct drm_i915_private *dev_priv) @@ -3147,10 +3148,6 @@ void intel_prepare_reset(struct drm_i915_private *dev_priv) struct drm_atomic_state *state; int ret; - /* no reset support for gen2 */ - if (IS_GEN2(dev_priv)) - return; - /* * Need mode_config.mutex so that we don't * trample ongoing ->detect() and whatnot. @@ -3210,10 +3207,6 @@ void intel_finish_reset(struct drm_i915_private *dev_priv) */ intel_complete_page_flips(dev_priv); - /* no reset support for gen2 */ - if (IS_GEN2(dev_priv)) - return; - dev_priv->modeset_restore_state = NULL; /* reset doesn't touch the display */ From 1426f7157efe12bb40455e84dd7083917d21860f Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 9 Aug 2016 17:03:22 +0100 Subject: [PATCH 012/141] drm/i915: Correct typo for __i915_gem_active_get_rcu in a comment I mistyped and added an extra _request_ to __i915_gem_active_get_rcu() Also, the same happened to another comment for i915_gem_active_get_rcu() Reported-by: Mika Kuoppala Signed-off-by: Chris Wilson Reviewed-by: Mika Kuoppala Link: http://patchwork.freedesktop.org/patch/msgid/1470758602-1338-1-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_request.c | 2 +- drivers/gpu/drm/i915/intel_ringbuffer.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 06d1267e733d..76314e527cfd 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -360,7 +360,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine, * We use RCU to look up requests in flight. The lookups may * race with the request being allocated from the slab freelist. * That is the request we are writing to here, may be in the process - * of being read by __i915_gem_active_get_request_rcu(). As such, + * of being read by __i915_gem_active_get_rcu(). As such, * we have to be very careful when overwriting the contents. During * the RCU lookup, we change chase the request->engine pointer, * read the request->fence.seqno and increment the reference count. diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 43e545e44352..bf9a6e56e719 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -310,7 +310,7 @@ struct intel_engine_cs { /* An RCU guarded pointer to the last request. No reference is * held to the request, users must carefully acquire a reference to - * the request using i915_gem_active_get_request_rcu(), or hold the + * the request using i915_gem_active_get_rcu(), or hold the * struct_mutex. 
*/ struct i915_gem_active last_request; From aac440ff22a27155ae08f4d0fd6ac2971bfc5f3d Mon Sep 17 00:00:00 2001 From: Dave Gordon Date: Tue, 9 Aug 2016 18:35:10 +0100 Subject: [PATCH 013/141] drm: avoid "possible bad bitmask?" warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent versions of gcc say this: include/drm/i915_drm.h:96:34: warning: result of ‘65535 << 20’ requires 37 bits to represent, but ‘int’ only has 32 bits [-Wshift-overflow=] Reported-by: David Binderman Signed-off-by: Dave Gordon Cc: Dave Airlie Reviewed-by: Chris Wilson Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1470764110-23855-1-git-send-email-david.s.gordon@intel.com --- include/drm/i915_drm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h index b1755f8db36b..4e1b274e1164 100644 --- a/include/drm/i915_drm.h +++ b/include/drm/i915_drm.h @@ -93,6 +93,6 @@ extern bool i915_gpu_turbo_disable(void); #define I845_TSEG_SIZE_1M (3 << 1) #define INTEL_BSM 0x5c -#define INTEL_BSM_MASK (0xFFFF << 20) +#define INTEL_BSM_MASK (-(1u << 20)) #endif /* _I915_DRM_H_ */ From 70cb472c6d7a3575a9b3fc7e0188384b7ca3d705 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 9 Aug 2016 18:08:25 +0100 Subject: [PATCH 014/141] drm/i915: Always mark the writer as also a read for busy ioctl One of the few guarantees we want the busy ioctl to provide is that the reported busy writer is included in the set of busy read engines. This should be provided by the ordering of setting and retiring the active trackers, but we can do better by explicitly setting the busy read engine flag for the last writer. v2: More comments inside __busy_write_id() to explain why both fields are set. Fixes: 3fdc13c7a3cb ("drm/i915: Remove (struct_mutex) locking for busy-ioctl") Signed-off-by: Chris Wilson Cc: Daniel Vetter Cc: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1470762505-12799-1-git-send-email-chris@chris-wilson.co.uk Reviewed-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 373136f57d7b..3fb574974343 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3748,7 +3748,15 @@ static __always_inline unsigned int __busy_read_flag(unsigned int id) static __always_inline unsigned int __busy_write_id(unsigned int id) { - return id; + /* The uABI guarantees an active writer is also amongst the read + * engines. This would be true if we accessed the activity tracking + * under the lock, but as we perform the lookup of the object and + * its activity locklessly we can not guarantee that the last_write + * being active implies that we have set the same engine flag from + * last_read - hence we always set both read and write busy for + * last_write. + */ + return id | __busy_read_flag(id); } static __always_inline unsigned int @@ -3857,9 +3865,11 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data, args->busy |= busy_check_reader(&obj->last_read[idx]); /* For ABI sanity, we only care that the write engine is in - * the set of read engines. This is ensured by the ordering - * of setting last_read/last_write in i915_vma_move_to_active, - * and then in reverse in retire. + * the set of read engines. 
This should be ensured by the + * ordering of setting last_read/last_write in + * i915_vma_move_to_active(), and then in reverse in retire. + * However, for good measure, we always report the last_write + * request as a busy read as well as being a busy write. * * We don't care that the set of active read/write engines * may change during construction of the result, as it is From 83348ba84ee0d5d4d982e5382bfbc8b2a2d05e75 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 9 Aug 2016 17:47:51 +0100 Subject: [PATCH 015/141] drm/i915: Move missed interrupt detection from hangcheck to breadcrumbs In commit 2529d57050af ("drm/i915: Drop racy markup of missed-irqs from idle-worker") the racy detection of missed interrupts was removed when we went idle. This however opened up the issue that the stuck waiters were not being reported, causing a test case failure. If we move the stuck waiter detection out of hangcheck and into the breadcrumb mechanims (i.e. the waiter) itself, we can avoid this issue entirely. This leaves hangcheck looking for a stuck GPU (inspecting for request advancement and HEAD motion), and breadcrumbs looking for a stuck waiter - hopefully make both easier to understand by their segregation. v2: Reduce the error message as we now run independently of hangcheck, and the hanging batch used by igt also counts as a stuck waiter causing extra warnings in dmesg. v3: Move the breadcrumb's hangcheck kickstart to the first missed wait. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=97104 Fixes: 2529d57050af (waiter"drm/i915: Drop racy markup of missed-irqs...") Testcase: igt/drv_missed_irq Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Cc: Mika Kuoppala Reviewed-by: Mika Kuoppala Link: http://patchwork.freedesktop.org/patch/msgid/1470761272-1245-2-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 11 ++-- drivers/gpu/drm/i915/i915_gem.c | 10 ---- drivers/gpu/drm/i915/i915_irq.c | 26 +-------- drivers/gpu/drm/i915/intel_breadcrumbs.c | 69 ++++++++++++++++-------- drivers/gpu/drm/i915/intel_engine_cs.c | 1 + drivers/gpu/drm/i915/intel_ringbuffer.h | 6 +-- 6 files changed, 56 insertions(+), 67 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 9bd41581b592..0627e170ea25 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -787,8 +787,6 @@ static void i915_ring_seqno_info(struct seq_file *m, seq_printf(m, "Current sequence (%s): %x\n", engine->name, intel_engine_get_seqno(engine)); - seq_printf(m, "Current user interrupts (%s): %lx\n", - engine->name, READ_ONCE(engine->breadcrumbs.irq_wakeups)); spin_lock(&b->lock); for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) { @@ -1434,11 +1432,10 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) engine->hangcheck.seqno, seqno[id], engine->last_submitted_seqno); - seq_printf(m, "\twaiters? %d\n", - intel_engine_has_waiter(engine)); - seq_printf(m, "\tuser interrupts = %lx [current %lx]\n", - engine->hangcheck.user_interrupts, - READ_ONCE(engine->breadcrumbs.irq_wakeups)); + seq_printf(m, "\twaiters? %s, fake irq active? 
%s\n", + yesno(intel_engine_has_waiter(engine)), + yesno(test_bit(engine->id, + &dev_priv->gpu_error.missed_irq_rings))); seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n", (long long)engine->hangcheck.acthd, (long long)acthd[id]); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 3fb574974343..e68b4c62be88 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2524,7 +2524,6 @@ i915_gem_idle_work_handler(struct work_struct *work) container_of(work, typeof(*dev_priv), gt.idle_work.work); struct drm_device *dev = &dev_priv->drm; struct intel_engine_cs *engine; - unsigned int stuck_engines; bool rearm_hangcheck; if (!READ_ONCE(dev_priv->gt.awake)) @@ -2554,15 +2553,6 @@ i915_gem_idle_work_handler(struct work_struct *work) dev_priv->gt.awake = false; rearm_hangcheck = false; - /* As we have disabled hangcheck, we need to unstick any waiters still - * hanging around. However, as we may be racing against the interrupt - * handler or the waiters themselves, we skip enabling the fake-irq. - */ - stuck_engines = intel_kick_waiters(dev_priv); - if (unlikely(stuck_engines)) - DRM_DEBUG_DRIVER("kicked stuck waiters (%x)...missed irq?\n", - stuck_engines); - if (INTEL_GEN(dev_priv) >= 6) gen6_rps_idle(dev_priv); intel_runtime_pm_put(dev_priv); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 591f452ece68..ebb83d5a448b 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -972,10 +972,8 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv) static void notify_ring(struct intel_engine_cs *engine) { smp_store_mb(engine->breadcrumbs.irq_posted, true); - if (intel_engine_wakeup(engine)) { + if (intel_engine_wakeup(engine)) trace_i915_gem_request_notify(engine); - engine->breadcrumbs.irq_wakeups++; - } } static void vlv_c0_read(struct drm_i915_private *dev_priv, @@ -3044,22 +3042,6 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) return HANGCHECK_HUNG; } -static unsigned long kick_waiters(struct intel_engine_cs *engine) -{ - struct drm_i915_private *i915 = engine->i915; - unsigned long irq_count = READ_ONCE(engine->breadcrumbs.irq_wakeups); - - if (engine->hangcheck.user_interrupts == irq_count && - !test_and_set_bit(engine->id, &i915->gpu_error.missed_irq_rings)) { - if (!test_bit(engine->id, &i915->gpu_error.test_irq_rings)) - DRM_ERROR("Hangcheck timer elapsed... %s idle\n", - engine->name); - - intel_engine_enable_fake_irq(engine); - } - - return irq_count; -} /* * This is called when the chip hasn't reported back with completed * batchbuffers in a long time. 
We keep track per ring seqno progress and @@ -3097,7 +3079,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work) bool busy = intel_engine_has_waiter(engine); u64 acthd; u32 seqno; - unsigned user_interrupts; semaphore_clear_deadlocks(dev_priv); @@ -3114,15 +3095,11 @@ static void i915_hangcheck_elapsed(struct work_struct *work) acthd = intel_engine_get_active_head(engine); seqno = intel_engine_get_seqno(engine); - /* Reset stuck interrupts between batch advances */ - user_interrupts = 0; - if (engine->hangcheck.seqno == seqno) { if (!intel_engine_is_active(engine)) { engine->hangcheck.action = HANGCHECK_IDLE; if (busy) { /* Safeguard against driver failure */ - user_interrupts = kick_waiters(engine); engine->hangcheck.score += BUSY; } } else { @@ -3185,7 +3162,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work) engine->hangcheck.seqno = seqno; engine->hangcheck.acthd = acthd; - engine->hangcheck.user_interrupts = user_interrupts; busy_count += busy; } diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c index 90867446f1a5..7be9af1d5424 100644 --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c @@ -26,6 +26,40 @@ #include "i915_drv.h" +static void intel_breadcrumbs_hangcheck(unsigned long data) +{ + struct intel_engine_cs *engine = (struct intel_engine_cs *)data; + struct intel_breadcrumbs *b = &engine->breadcrumbs; + + if (!b->irq_enabled) + return; + + if (time_before(jiffies, b->timeout)) { + mod_timer(&b->hangcheck, b->timeout); + return; + } + + DRM_DEBUG("Hangcheck timer elapsed... %s idle\n", engine->name); + set_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings); + mod_timer(&engine->breadcrumbs.fake_irq, jiffies + 1); + + /* Ensure that even if the GPU hangs, we get woken up. + * + * However, note that if no one is waiting, we never notice + * a gpu hang. Eventually, we will have to wait for a resource + * held by the GPU and so trigger a hangcheck. In the most + * pathological case, this will be upon memory starvation! To + * prevent this, we also queue the hangcheck from the retire + * worker. + */ + i915_queue_hangcheck(engine->i915); +} + +static unsigned long wait_timeout(void) +{ + return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES); +} + static void intel_breadcrumbs_fake_irq(unsigned long data) { struct intel_engine_cs *engine = (struct intel_engine_cs *)data; @@ -51,13 +85,6 @@ static void irq_enable(struct intel_engine_cs *engine) */ engine->breadcrumbs.irq_posted = true; - /* Make sure the current hangcheck doesn't falsely accuse a just - * started irq handler from missing an interrupt (because the - * interrupt count still matches the stale value from when - * the irq handler was disabled, many hangchecks ago). - */ - engine->breadcrumbs.irq_wakeups++; - spin_lock_irq(&engine->i915->irq_lock); engine->irq_enable(engine); spin_unlock_irq(&engine->i915->irq_lock); @@ -98,17 +125,13 @@ static void __intel_breadcrumbs_enable_irq(struct intel_breadcrumbs *b) } if (!b->irq_enabled || - test_bit(engine->id, &i915->gpu_error.missed_irq_rings)) + test_bit(engine->id, &i915->gpu_error.missed_irq_rings)) { mod_timer(&b->fake_irq, jiffies + 1); - - /* Ensure that even if the GPU hangs, we get woken up. - * - * However, note that if no one is waiting, we never notice - * a gpu hang. Eventually, we will have to wait for a resource - * held by the GPU and so trigger a hangcheck. In the most - * pathological case, this will be upon memory starvation! 
- */ - i915_queue_hangcheck(i915); + } else { + /* Ensure we never sleep indefinitely */ + GEM_BUG_ON(!time_after(b->timeout, jiffies)); + mod_timer(&b->hangcheck, b->timeout); + } } static void __intel_breadcrumbs_disable_irq(struct intel_breadcrumbs *b) @@ -219,6 +242,7 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine, GEM_BUG_ON(!next && !first); if (next && next != &wait->node) { GEM_BUG_ON(first); + b->timeout = wait_timeout(); b->first_wait = to_wait(next); smp_store_mb(b->irq_seqno_bh, b->first_wait->tsk); /* As there is a delay between reading the current @@ -245,6 +269,7 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine, if (first) { GEM_BUG_ON(rb_first(&b->waiters) != &wait->node); + b->timeout = wait_timeout(); b->first_wait = wait; smp_store_mb(b->irq_seqno_bh, wait->tsk); /* After assigning ourselves as the new bottom-half, we must @@ -277,11 +302,6 @@ bool intel_engine_add_wait(struct intel_engine_cs *engine, return first; } -void intel_engine_enable_fake_irq(struct intel_engine_cs *engine) -{ - mod_timer(&engine->breadcrumbs.fake_irq, jiffies + 1); -} - static inline bool chain_wakeup(struct rb_node *rb, int priority) { return rb && to_wait(rb)->tsk->prio <= priority; @@ -359,6 +379,7 @@ void intel_engine_remove_wait(struct intel_engine_cs *engine, * the interrupt, or if we have to handle an * exception rather than a seqno completion. */ + b->timeout = wait_timeout(); b->first_wait = to_wait(next); smp_store_mb(b->irq_seqno_bh, b->first_wait->tsk); if (b->first_wait->seqno != wait->seqno) @@ -536,6 +557,9 @@ int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine) setup_timer(&b->fake_irq, intel_breadcrumbs_fake_irq, (unsigned long)engine); + setup_timer(&b->hangcheck, + intel_breadcrumbs_hangcheck, + (unsigned long)engine); /* Spawn a thread to provide a common bottom-half for all signals. 
* As this is an asynchronous interface we cannot steal the current @@ -560,6 +584,7 @@ void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine) if (!IS_ERR_OR_NULL(b->signaler)) kthread_stop(b->signaler); + del_timer_sync(&b->hangcheck); del_timer_sync(&b->fake_irq); } diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index e9b301ae2d0c..0dd3d1de18aa 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -164,6 +164,7 @@ cleanup: void intel_engine_init_hangcheck(struct intel_engine_cs *engine) { memset(&engine->hangcheck, 0, sizeof(engine->hangcheck)); + clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings); } static void intel_engine_init_requests(struct intel_engine_cs *engine) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index bf9a6e56e719..c7a6db310f9f 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -75,7 +75,6 @@ enum intel_engine_hangcheck_action { struct intel_engine_hangcheck { u64 acthd; - unsigned long user_interrupts; u32 seqno; int score; enum intel_engine_hangcheck_action action; @@ -173,7 +172,6 @@ struct intel_engine_cs { */ struct intel_breadcrumbs { struct task_struct *irq_seqno_bh; /* bh for user interrupts */ - unsigned long irq_wakeups; bool irq_posted; spinlock_t lock; /* protects the lists of requests */ @@ -183,6 +181,9 @@ struct intel_engine_cs { struct task_struct *signaler; /* used for fence signalling */ struct drm_i915_gem_request *first_signal; struct timer_list fake_irq; /* used after a missed interrupt */ + struct timer_list hangcheck; /* detect missed interrupts */ + + unsigned long timeout; bool irq_enabled : 1; bool rpm_wakelock : 1; @@ -560,7 +561,6 @@ static inline bool intel_engine_wakeup(struct intel_engine_cs *engine) return wakeup; } -void intel_engine_enable_fake_irq(struct intel_engine_cs *engine); void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine); unsigned int intel_kick_waiters(struct drm_i915_private *i915); unsigned int intel_kick_signalers(struct drm_i915_private *i915); From dbd6ef29a74e6db22e037fc3d481c1dbca596f80 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 9 Aug 2016 17:47:52 +0100 Subject: [PATCH 016/141] drm/i915: Use RCU to annotate and enforce protection for breadcrumb's bh The bottom-half we use for processing the breadcrumb interrupt is a task, which is an RCU protected struct. When accessing this struct, we need to be holding the RCU read lock to prevent it disappearing beneath us. We can use the RCU annotation to mark our irq_seqno_bh pointer as being under RCU guard and then use the RCU accessors to both provide correct ordering of access through the pointer. Most notably, this fixes the access from hard irq context to use the RCU read lock, which both Daniel and Tvrtko complained about. 
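A minimal sketch of the convention this patch enforces (all names are
taken from the patch; the snippet is illustrative, not a new API):

	struct intel_breadcrumbs {
		struct task_struct __rcu *irq_seqno_bh;
		/* ... */
	};

	/* writer, serialised by b->lock */
	rcu_assign_pointer(b->irq_seqno_bh, wait->tsk);

	/* lockless reader, e.g. from hard irq context */
	rcu_read_lock();
	tsk = rcu_dereference(b->irq_seqno_bh);
	if (tsk)
		wake_up_process(tsk);
	rcu_read_unlock();

Holding the RCU read lock across the dereference and the wake-up is
what guarantees the task_struct cannot be freed underneath us, and the
__rcu annotation lets sparse flag any access that bypasses the RCU
accessors.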
Signed-off-by: Chris Wilson Cc: Daniel Vetter Cc: Tvrtko Ursulin Reviewed-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1470761272-1245-3-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 4 ++-- drivers/gpu/drm/i915/intel_breadcrumbs.c | 22 +++++++++------------ drivers/gpu/drm/i915/intel_ringbuffer.c | 2 -- drivers/gpu/drm/i915/intel_ringbuffer.h | 25 +++++++++++++++--------- 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 54f789c75aa1..2cb91ffbbfa2 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3849,7 +3849,7 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req) * is woken. */ if (engine->irq_seqno_barrier && - READ_ONCE(engine->breadcrumbs.irq_seqno_bh) == current && + rcu_access_pointer(engine->breadcrumbs.irq_seqno_bh) == current && cmpxchg_relaxed(&engine->breadcrumbs.irq_posted, 1, 0)) { struct task_struct *tsk; @@ -3874,7 +3874,7 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req) * irq_posted == false but we are still running). */ rcu_read_lock(); - tsk = READ_ONCE(engine->breadcrumbs.irq_seqno_bh); + tsk = rcu_dereference(engine->breadcrumbs.irq_seqno_bh); if (tsk && tsk != current) /* Note that if the bottom-half is changed as we * are sending the wake-up, the new bottom-half will diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c index 7be9af1d5424..2491e4c1eaf0 100644 --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c @@ -71,10 +71,8 @@ static void intel_breadcrumbs_fake_irq(unsigned long data) * every jiffie in order to kick the oldest waiter to do the * coherent seqno check. */ - rcu_read_lock(); if (intel_engine_wakeup(engine)) mod_timer(&engine->breadcrumbs.fake_irq, jiffies + 1); - rcu_read_unlock(); } static void irq_enable(struct intel_engine_cs *engine) @@ -234,7 +232,7 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine, } rb_link_node(&wait->node, parent, p); rb_insert_color(&wait->node, &b->waiters); - GEM_BUG_ON(!first && !b->irq_seqno_bh); + GEM_BUG_ON(!first && !rcu_access_pointer(b->irq_seqno_bh)); if (completed) { struct rb_node *next = rb_next(completed); @@ -244,7 +242,7 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine, GEM_BUG_ON(first); b->timeout = wait_timeout(); b->first_wait = to_wait(next); - smp_store_mb(b->irq_seqno_bh, b->first_wait->tsk); + rcu_assign_pointer(b->irq_seqno_bh, b->first_wait->tsk); /* As there is a delay between reading the current * seqno, processing the completed tasks and selecting * the next waiter, we may have missed the interrupt @@ -271,7 +269,7 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine, GEM_BUG_ON(rb_first(&b->waiters) != &wait->node); b->timeout = wait_timeout(); b->first_wait = wait; - smp_store_mb(b->irq_seqno_bh, wait->tsk); + rcu_assign_pointer(b->irq_seqno_bh, wait->tsk); /* After assigning ourselves as the new bottom-half, we must * perform a cursory check to prevent a missed interrupt. 
* Either we miss the interrupt whilst programming the hardware, @@ -282,7 +280,7 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine, */ __intel_breadcrumbs_enable_irq(b); } - GEM_BUG_ON(!b->irq_seqno_bh); + GEM_BUG_ON(!rcu_access_pointer(b->irq_seqno_bh)); GEM_BUG_ON(!b->first_wait); GEM_BUG_ON(rb_first(&b->waiters) != &b->first_wait->node); @@ -337,7 +335,7 @@ void intel_engine_remove_wait(struct intel_engine_cs *engine, const int priority = wakeup_priority(b, wait->tsk); struct rb_node *next; - GEM_BUG_ON(b->irq_seqno_bh != wait->tsk); + GEM_BUG_ON(rcu_access_pointer(b->irq_seqno_bh) != wait->tsk); /* We are the current bottom-half. Find the next candidate, * the first waiter in the queue on the remaining oldest @@ -381,13 +379,13 @@ void intel_engine_remove_wait(struct intel_engine_cs *engine, */ b->timeout = wait_timeout(); b->first_wait = to_wait(next); - smp_store_mb(b->irq_seqno_bh, b->first_wait->tsk); + rcu_assign_pointer(b->irq_seqno_bh, b->first_wait->tsk); if (b->first_wait->seqno != wait->seqno) __intel_breadcrumbs_enable_irq(b); - wake_up_process(b->irq_seqno_bh); + wake_up_process(b->first_wait->tsk); } else { b->first_wait = NULL; - WRITE_ONCE(b->irq_seqno_bh, NULL); + rcu_assign_pointer(b->irq_seqno_bh, NULL); __intel_breadcrumbs_disable_irq(b); } } else { @@ -401,7 +399,7 @@ out_unlock: GEM_BUG_ON(b->first_wait == wait); GEM_BUG_ON(rb_first(&b->waiters) != (b->first_wait ? &b->first_wait->node : NULL)); - GEM_BUG_ON(!b->irq_seqno_bh ^ RB_EMPTY_ROOT(&b->waiters)); + GEM_BUG_ON(!rcu_access_pointer(b->irq_seqno_bh) ^ RB_EMPTY_ROOT(&b->waiters)); spin_unlock(&b->lock); } @@ -598,11 +596,9 @@ unsigned int intel_kick_waiters(struct drm_i915_private *i915) * RCU lock, i.e. as we call wake_up_process() we must be holding the * rcu_read_lock(). */ - rcu_read_lock(); for_each_engine(engine, i915) if (unlikely(intel_engine_wakeup(engine))) mask |= intel_engine_flag(engine); - rcu_read_unlock(); return mask; } diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index e08a1e1b04e4..16b726fe33eb 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -2410,9 +2410,7 @@ void intel_engine_init_seqno(struct intel_engine_cs *engine, u32 seqno) /* After manually advancing the seqno, fake the interrupt in case * there are any waiters for that seqno. */ - rcu_read_lock(); intel_engine_wakeup(engine); - rcu_read_unlock(); } static void gen6_bsd_submit_request(struct drm_i915_gem_request *request) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index c7a6db310f9f..ea2735144b2a 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -171,7 +171,7 @@ struct intel_engine_cs { * the overhead of waking that client is much preferred. 
*/ struct intel_breadcrumbs { - struct task_struct *irq_seqno_bh; /* bh for user interrupts */ + struct task_struct __rcu *irq_seqno_bh; /* bh for interrupts */ bool irq_posted; spinlock_t lock; /* protects the lists of requests */ @@ -539,25 +539,32 @@ void intel_engine_remove_wait(struct intel_engine_cs *engine, struct intel_wait *wait); void intel_engine_enable_signaling(struct drm_i915_gem_request *request); -static inline bool intel_engine_has_waiter(struct intel_engine_cs *engine) +static inline bool intel_engine_has_waiter(const struct intel_engine_cs *engine) { - return READ_ONCE(engine->breadcrumbs.irq_seqno_bh); + return rcu_access_pointer(engine->breadcrumbs.irq_seqno_bh); } -static inline bool intel_engine_wakeup(struct intel_engine_cs *engine) +static inline bool intel_engine_wakeup(const struct intel_engine_cs *engine) { bool wakeup = false; - struct task_struct *tsk = READ_ONCE(engine->breadcrumbs.irq_seqno_bh); + /* Note that for this not to dangerously chase a dangling pointer, - * the caller is responsible for ensure that the task remain valid for - * wake_up_process() i.e. that the RCU grace period cannot expire. + * we must hold the rcu_read_lock here. * * Also note that tsk is likely to be in !TASK_RUNNING state so an * early test for tsk->state != TASK_RUNNING before wake_up_process() * is unlikely to be beneficial. */ - if (tsk) - wakeup = wake_up_process(tsk); + if (intel_engine_has_waiter(engine)) { + struct task_struct *tsk; + + rcu_read_lock(); + tsk = rcu_dereference(engine->breadcrumbs.irq_seqno_bh); + if (tsk) + wakeup = wake_up_process(tsk); + rcu_read_unlock(); + } + return wakeup; } From 8888cd015458811d568a152238b0614c586df751 Mon Sep 17 00:00:00 2001 From: Dave Gordon Date: Tue, 9 Aug 2016 15:19:19 +0100 Subject: [PATCH 017/141] drm/i915/guc: doorbell reset should avoid used doorbells guc_init_doorbell_hw() borrows the (currently single) GuC client to use in reinitialising ALL the doorbell registers (as the hardware doesn't reset them when the GuC is reset). As a prerequisite for accommodating multiple clients, it should only reset doorbells that are supposed to be disabled, avoiding those that are marked as in use by any client. Signed-off-by: Dave Gordon Reviewed-by: Tvrtko Ursulin Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_guc_submission.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index 03a5cef353eb..d5db0d1df298 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -697,7 +697,7 @@ guc_client_free(struct drm_i915_private *dev_priv, } /* - * Borrow the first client to set up & tear down every doorbell + * Borrow the first client to set up & tear down each unused doorbell * in turn, to ensure that all doorbell h/w is (re)initialised. 
*/ static void guc_init_doorbell_hw(struct intel_guc *guc) @@ -713,6 +713,9 @@ static void guc_init_doorbell_hw(struct intel_guc *guc) i915_reg_t drbreg = GEN8_DRBREGL(i); u32 value = I915_READ(drbreg); + if (test_bit(i, guc->doorbell_bitmap)) + continue; + err = guc_update_doorbell_id(guc, client, i); /* Report update failure or unexpectedly active doorbell */ @@ -731,6 +734,9 @@ static void guc_init_doorbell_hw(struct intel_guc *guc) i915_reg_t drbreg = GEN8_DRBREGL(i); u32 value = I915_READ(drbreg); + if (test_bit(i, guc->doorbell_bitmap)) + continue; + if (i != db_id && (value & GUC_DOORBELL_ENABLED)) DRM_DEBUG_DRIVER("Doorbell %d (reg 0x%x) finally 0x%x\n", i, drbreg.reg, value); From 84b7f88235dfe56048c39229481a010e9754eb6f Mon Sep 17 00:00:00 2001 From: Dave Gordon Date: Tue, 9 Aug 2016 15:19:20 +0100 Subject: [PATCH 018/141] drm/i915/guc: refactor guc_init_doorbell_hw() We have essentially the same code in each of two different loops, so we can refactor it into a little helper function. This also reduces the amount of work done during startup, as we now only reprogram h/w found to be in a state other than that expected, and so avoid the overhead of setting doorbell registers to the state they're already in. Signed-off-by: Dave Gordon Reviewed-by: Tvrtko Ursulin Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_guc_submission.c | 54 ++++++++++++---------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index d5db0d1df298..af5c4cf8c113 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -696,32 +696,47 @@ guc_client_free(struct drm_i915_private *dev_priv, kfree(client); } +/* Check that a doorbell register is in the expected state */ +static bool guc_doorbell_check(struct intel_guc *guc, uint16_t db_id) +{ + struct drm_i915_private *dev_priv = guc_to_i915(guc); + i915_reg_t drbreg = GEN8_DRBREGL(db_id); + uint32_t value = I915_READ(drbreg); + bool enabled = (value & GUC_DOORBELL_ENABLED) != 0; + bool expected = test_bit(db_id, guc->doorbell_bitmap); + + if (enabled == expected) + return true; + + DRM_DEBUG_DRIVER("Doorbell %d (reg 0x%x) 0x%x, should be %s\n", + db_id, drbreg.reg, value, + expected ? "active" : "inactive"); + + return false; +} + /* * Borrow the first client to set up & tear down each unused doorbell * in turn, to ensure that all doorbell h/w is (re)initialised. 
*/ static void guc_init_doorbell_hw(struct intel_guc *guc) { - struct drm_i915_private *dev_priv = guc_to_i915(guc); struct i915_guc_client *client = guc->execbuf_client; - uint16_t db_id, i; - int err; + uint16_t db_id; + int i, err; + /* Save client's original doorbell selection */ db_id = client->doorbell_id; for (i = 0; i < GUC_MAX_DOORBELLS; ++i) { - i915_reg_t drbreg = GEN8_DRBREGL(i); - u32 value = I915_READ(drbreg); - - if (test_bit(i, guc->doorbell_bitmap)) + /* Skip if doorbell is OK */ + if (guc_doorbell_check(guc, i)) continue; err = guc_update_doorbell_id(guc, client, i); - - /* Report update failure or unexpectedly active doorbell */ - if (err || (i != db_id && (value & GUC_DOORBELL_ENABLED))) - DRM_DEBUG_DRIVER("Doorbell %d (reg 0x%x) was 0x%x, err %d\n", - i, drbreg.reg, value, err); + if (err) + DRM_DEBUG_DRIVER("Doorbell %d update failed, err %d\n", + i, err); } /* Restore to original value */ @@ -730,18 +745,9 @@ static void guc_init_doorbell_hw(struct intel_guc *guc) DRM_ERROR("Failed to restore doorbell to %d, err %d\n", db_id, err); - for (i = 0; i < GUC_MAX_DOORBELLS; ++i) { - i915_reg_t drbreg = GEN8_DRBREGL(i); - u32 value = I915_READ(drbreg); - - if (test_bit(i, guc->doorbell_bitmap)) - continue; - - if (i != db_id && (value & GUC_DOORBELL_ENABLED)) - DRM_DEBUG_DRIVER("Doorbell %d (reg 0x%x) finally 0x%x\n", - i, drbreg.reg, value); - - } + /* Read back & verify all doorbell registers */ + for (i = 0; i < GUC_MAX_DOORBELLS; ++i) + (void)guc_doorbell_check(guc, i); } /** From e02757d91f7481093742bb32385567499a59354c Mon Sep 17 00:00:00 2001 From: Dave Gordon Date: Tue, 9 Aug 2016 15:19:21 +0100 Subject: [PATCH 019/141] drm/i915/guc: add engine mask to GuC client & pass to GuC The Context Descriptor passed by the kernel to the GuC contains a field specifying which engine(s) the context will use. Historically, this was always set to "all of them", but if we had a separate client for each engine, we could be more precise, and set only the bit for the engine that the client was associated with. So this patch enables this usage, in preparation for having multiple clients, though at this point there is still only a single client used for all supported engines. 
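To make the new plumbing concrete, here is a small stand-alone sketch of how a host-side engine bitmap translates into the GuC-side engines_used mask, mirroring the for_each_engine_masked() loop in the diff below. The id table and values are hypothetical, and this compiles as ordinary userspace C rather than driver code.

#include <stdint.h>
#include <stdio.h>

#define NUM_ENGINES 5

/* Hypothetical host-id -> GuC-id table; the real mapping lives in the
 * driver's engine definitions.
 */
static const int guc_id_for[NUM_ENGINES] = { 3, 0, 1, 2, 4 };

static uint32_t guc_engines_used(uint32_t client_engines)
{
	uint32_t used = 0;
	int id;

	for (id = 0; id < NUM_ENGINES; id++) {
		if (!(client_engines & (1u << id)))
			continue;	/* client does not use this engine */
		used |= 1u << guc_id_for[id];
	}
	return used;
}

int main(void)
{
	/* A client bound to host engines 0 and 2 only. */
	printf("engines_used = 0x%x\n", (unsigned)guc_engines_used(0x5));
	return 0;
}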
Signed-off-by: Dave Gordon Reviewed-by: Tvrtko Ursulin Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_guc_submission.c | 15 ++++++++++----- drivers/gpu/drm/i915/intel_guc.h | 3 ++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index af5c4cf8c113..405e445fba72 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -340,7 +340,7 @@ static void guc_init_ctx_desc(struct intel_guc *guc, desc.priority = client->priority; desc.db_id = client->doorbell_id; - for_each_engine(engine, dev_priv) { + for_each_engine_masked(engine, dev_priv, client->engines) { struct intel_context *ce = &ctx->engine[engine->id]; struct guc_execlist_context *lrc = &desc.lrc[engine->guc_id]; struct drm_i915_gem_object *obj; @@ -374,6 +374,8 @@ static void guc_init_ctx_desc(struct intel_guc *guc, desc.engines_used |= (1 << engine->guc_id); } + DRM_DEBUG_DRIVER("Host engines 0x%x => GuC engines used 0x%x\n", + client->engines, desc.engines_used); WARN_ON(desc.engines_used == 0); /* @@ -764,6 +766,7 @@ static void guc_init_doorbell_hw(struct intel_guc *guc) */ static struct i915_guc_client * guc_client_alloc(struct drm_i915_private *dev_priv, + uint32_t engines, uint32_t priority, struct i915_gem_context *ctx) { @@ -776,10 +779,11 @@ guc_client_alloc(struct drm_i915_private *dev_priv, if (!client) return NULL; - client->doorbell_id = GUC_INVALID_DOORBELL_ID; - client->priority = priority; client->owner = ctx; client->guc = guc; + client->engines = engines; + client->priority = priority; + client->doorbell_id = GUC_INVALID_DOORBELL_ID; client->ctx_index = (uint32_t)ida_simple_get(&guc->ctx_ids, 0, GUC_MAX_GPU_CONTEXTS, GFP_KERNEL); @@ -821,8 +825,8 @@ guc_client_alloc(struct drm_i915_private *dev_priv, if (guc_init_doorbell(guc, client, db_id)) goto err; - DRM_DEBUG_DRIVER("new priority %u client %p: ctx_index %u\n", - priority, client, client->ctx_index); + DRM_DEBUG_DRIVER("new priority %u client %p for engine(s) 0x%x: ctx_index %u\n", + priority, client, client->engines, client->ctx_index); DRM_DEBUG_DRIVER("doorbell id %u, cacheline offset 0x%x\n", client->doorbell_id, client->doorbell_offset); @@ -1006,6 +1010,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv) /* client for execbuf submission */ client = guc_client_alloc(dev_priv, + INTEL_INFO(dev_priv)->ring_mask, GUC_CTX_PRIORITY_KMD_NORMAL, dev_priv->kernel_context); if (!client) { diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h index 623cf26cd784..e57661ff0079 100644 --- a/drivers/gpu/drm/i915/intel_guc.h +++ b/drivers/gpu/drm/i915/intel_guc.h @@ -67,6 +67,8 @@ struct i915_guc_client { void *client_base; /* first page (only) of above */ struct i915_gem_context *owner; struct intel_guc *guc; + + uint32_t engines; /* bitmap of (host) engine ids */ uint32_t priority; uint32_t ctx_index; @@ -79,7 +81,6 @@ struct i915_guc_client { uint32_t wq_offset; uint32_t wq_size; uint32_t wq_tail; - uint32_t unused; /* Was 'wq_head' */ uint32_t no_wq_space; uint32_t q_fail; /* No longer used */ From c18468c4b2dbf08854dba4f041c8c1845bf0cc9c Mon Sep 17 00:00:00 2001 From: Dave Gordon Date: Tue, 9 Aug 2016 15:19:22 +0100 Subject: [PATCH 020/141] drm/i915/guc: use for_each_engine_id() where appropriate Now that host structures are indexed by host engine-id rather than guc_id, we can usefully convert some for_each_engine() loops to use for_each_engine_id() and avoid 
multiple dereferences of engine->id. Also a few related tweaks to cache structure members locally wherever they're used more than once or twice, hopefully eliminating memory references. Signed-off-by: Dave Gordon Reviewed-by: Tvrtko Ursulin Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_debugfs.c | 18 ++++++++++-------- drivers/gpu/drm/i915/i915_guc_submission.c | 13 +++++++------ 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 0627e170ea25..b84fa93124ce 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2544,6 +2544,7 @@ static void i915_guc_client_info(struct seq_file *m, struct i915_guc_client *client) { struct intel_engine_cs *engine; + enum intel_engine_id id; uint64_t tot = 0; seq_printf(m, "\tPriority %d, GuC ctx index: %u, PD offset 0x%x\n", @@ -2558,11 +2559,11 @@ static void i915_guc_client_info(struct seq_file *m, seq_printf(m, "\tFailed doorbell: %u\n", client->b_fail); seq_printf(m, "\tLast submission result: %d\n", client->retcode); - for_each_engine(engine, dev_priv) { + for_each_engine_id(engine, dev_priv, id) { + u64 submissions = client->submissions[id]; + tot += submissions; seq_printf(m, "\tSubmissions: %llu %s\n", - client->submissions[engine->id], - engine->name); - tot += client->submissions[engine->id]; + submissions, engine->name); } seq_printf(m, "\tTotal: %llu\n", tot); } @@ -2575,6 +2576,7 @@ static int i915_guc_info(struct seq_file *m, void *data) struct intel_guc guc; struct i915_guc_client client = {}; struct intel_engine_cs *engine; + enum intel_engine_id id; u64 total = 0; if (!HAS_GUC_SCHED(dev_priv)) @@ -2601,11 +2603,11 @@ static int i915_guc_info(struct seq_file *m, void *data) seq_printf(m, "GuC last action error code: %d\n", guc.action_err); seq_printf(m, "\nGuC submissions:\n"); - for_each_engine(engine, dev_priv) { + for_each_engine_id(engine, dev_priv, id) { + u64 submissions = guc.submissions[id]; + total += submissions; seq_printf(m, "\t%-24s: %10llu, last seqno 0x%08x\n", - engine->name, guc.submissions[engine->id], - guc.last_seqno[engine->id]); - total += guc.submissions[engine->id]; + engine->name, submissions, guc.last_seqno[id]); } seq_printf(m, "\t%s: %llu\n", "Total", total); diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index 405e445fba72..6831321a9c8c 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -342,7 +342,8 @@ static void guc_init_ctx_desc(struct intel_guc *guc, for_each_engine_masked(engine, dev_priv, client->engines) { struct intel_context *ce = &ctx->engine[engine->id]; - struct guc_execlist_context *lrc = &desc.lrc[engine->guc_id]; + uint32_t guc_engine_id = engine->guc_id; + struct guc_execlist_context *lrc = &desc.lrc[guc_engine_id]; struct drm_i915_gem_object *obj; /* TODO: We have a design issue to be solved here. 
Only when we @@ -361,7 +362,7 @@ static void guc_init_ctx_desc(struct intel_guc *guc, gfx_addr = i915_gem_obj_ggtt_offset(ce->state); lrc->ring_lcra = gfx_addr + LRC_STATE_PN * PAGE_SIZE; lrc->context_id = (client->ctx_index << GUC_ELC_CTXID_OFFSET) | - (engine->guc_id << GUC_ELC_ENGINE_OFFSET); + (guc_engine_id << GUC_ELC_ENGINE_OFFSET); obj = ce->ring->obj; gfx_addr = i915_gem_obj_ggtt_offset(obj); @@ -371,7 +372,7 @@ static void guc_init_ctx_desc(struct intel_guc *guc, lrc->ring_next_free_location = gfx_addr; lrc->ring_current_tail_pointer_value = 0; - desc.engines_used |= (1 << engine->guc_id); + desc.engines_used |= (1 << guc_engine_id); } DRM_DEBUG_DRIVER("Host engines 0x%x => GuC engines used 0x%x\n", @@ -459,6 +460,7 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc, /* wqi_len is in DWords, and does not include the one-word header */ const size_t wqi_size = sizeof(struct guc_wq_item); const u32 wqi_len = wqi_size/sizeof(u32) - 1; + struct intel_engine_cs *engine = rq->engine; struct guc_process_desc *desc; struct guc_wq_item *wqi; void *base; @@ -500,12 +502,11 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc, /* Now fill in the 4-word work queue item */ wqi->header = WQ_TYPE_INORDER | (wqi_len << WQ_LEN_SHIFT) | - (rq->engine->guc_id << WQ_TARGET_SHIFT) | + (engine->guc_id << WQ_TARGET_SHIFT) | WQ_NO_WCFLUSH_WAIT; /* The GuC wants only the low-order word of the context descriptor */ - wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, - rq->engine); + wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, engine); wqi->ring_tail = tail << WQ_RING_TAIL_SHIFT; wqi->fence_id = rq->fence.seqno; From 774439e12b53daff3c97d71669a233d6e5290e79 Mon Sep 17 00:00:00 2001 From: Dave Gordon Date: Tue, 9 Aug 2016 15:19:23 +0100 Subject: [PATCH 021/141] drm/i915/guc: re-optimise i915_guc_client layout As we're tweaking the GuC-related code in debugfs, we can drop the no-longer-used 'q_fail' and repack the structure. 
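The repack works because the 16-bit doorbell_id plus three 16-bit padding slots round the struct back up to a 32-bit boundary, keeping every later u32 naturally aligned. A quick stand-alone check is below; the field names mirror the patch, but this is an illustration rather than the driver struct.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct client_layout {
	uint32_t engines;
	uint32_t priority;
	uint32_t ctx_index;
	uint32_t proc_desc_offset;

	uint32_t doorbell_offset;
	uint32_t cookie;
	uint16_t doorbell_id;
	uint16_t padding[3];	/* 2 + 6 bytes keep the next u32 aligned */

	uint32_t wq_offset;
};

int main(void)
{
	/* Expect offset 32: doorbell_id (2) plus padding (6) fill a
	 * whole 8-byte slot, so no implicit holes are introduced.
	 */
	printf("wq_offset at byte %zu, struct size %zu\n",
	       offsetof(struct client_layout, wq_offset),
	       sizeof(struct client_layout));
	return 0;
}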
Signed-off-by: Dave Gordon Reviewed-by: Tvrtko Ursulin Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_debugfs.c | 1 - drivers/gpu/drm/i915/intel_guc.h | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index b84fa93124ce..ade2446d924a 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2555,7 +2555,6 @@ static void i915_guc_client_info(struct seq_file *m, client->wq_size, client->wq_offset, client->wq_tail); seq_printf(m, "\tWork queue full: %u\n", client->no_wq_space); - seq_printf(m, "\tFailed to queue: %u\n", client->q_fail); seq_printf(m, "\tFailed doorbell: %u\n", client->b_fail); seq_printf(m, "\tLast submission result: %d\n", client->retcode); diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h index e57661ff0079..26f3d9c08120 100644 --- a/drivers/gpu/drm/i915/intel_guc.h +++ b/drivers/gpu/drm/i915/intel_guc.h @@ -71,19 +71,17 @@ struct i915_guc_client { uint32_t engines; /* bitmap of (host) engine ids */ uint32_t priority; uint32_t ctx_index; - uint32_t proc_desc_offset; + uint32_t doorbell_offset; uint32_t cookie; uint16_t doorbell_id; - uint16_t padding; /* Maintain alignment */ + uint16_t padding[3]; /* Maintain alignment */ uint32_t wq_offset; uint32_t wq_size; uint32_t wq_tail; - uint32_t no_wq_space; - uint32_t q_fail; /* No longer used */ uint32_t b_fail; int retcode; From 44cb734cd2ea09e4fb8166fdd1f0ef65af3a95de Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 10 Aug 2016 14:07:29 +0300 Subject: [PATCH 022/141] drm/i915: Merge the PPS register definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PPS registers are pretty much the same everywhere, the differences being: - Register fields appearing, disappearing from one platform to the next: panel-reset-on-powerdown, backlight-on, panel-port, register-unlock - Different register base addresses - Different number of PPS instances: 2 on VLV/CHV/BXT, 1 everywhere else. We can merge the separate set of PPS definitions by extending the PPS instance argument to all platforms and using instance 0 on platforms with a single instance. This means we'll need to calculate the register addresses dynamically based on the given platform and PPS instance. v2: - Simplify if ladder in intel_pps_get_registers(). 
(Ville) Signed-off-by: Imre Deak Reviewed-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470827254-21954-1-git-send-email-imre.deak@intel.com --- drivers/gpu/drm/i915/i915_drv.h | 2 + drivers/gpu/drm/i915/i915_reg.h | 145 +++++++++++---------------- drivers/gpu/drm/i915/i915_suspend.c | 30 ++---- drivers/gpu/drm/i915/intel_display.c | 22 +++- drivers/gpu/drm/i915/intel_dp.c | 45 ++++----- drivers/gpu/drm/i915/intel_lvds.c | 43 ++------ 6 files changed, 117 insertions(+), 170 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 2cb91ffbbfa2..0ffab295a702 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1749,6 +1749,8 @@ struct drm_i915_private { uint32_t psr_mmio_base; + uint32_t pps_mmio_base; + wait_queue_head_t gmbus_wait_queue; struct pci_dev *bridge_dev; diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index f38a5e20bbee..b65fe502bb1a 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -3660,8 +3660,17 @@ enum { #define VIDEO_DIP_ENABLE_SPD_HSW (1 << 0) /* Panel power sequencing */ -#define PP_STATUS _MMIO(0x61200) -#define PP_ON (1 << 31) +#define PPS_BASE 0x61200 +#define VLV_PPS_BASE (VLV_DISPLAY_BASE + PPS_BASE) +#define PCH_PPS_BASE 0xC7200 + +#define _MMIO_PPS(pps_idx, reg) _MMIO(dev_priv->pps_mmio_base - \ + PPS_BASE + (reg) + \ + (pps_idx) * 0x100) + +#define _PP_STATUS 0x61200 +#define PP_STATUS(pps_idx) _MMIO_PPS(pps_idx, _PP_STATUS) +#define PP_ON (1 << 31) /* * Indicates that all dependencies of the panel are on: * @@ -3669,14 +3678,14 @@ enum { * - pipe enabled * - LVDS/DVOB/DVOC on */ -#define PP_READY (1 << 30) -#define PP_SEQUENCE_NONE (0 << 28) -#define PP_SEQUENCE_POWER_UP (1 << 28) -#define PP_SEQUENCE_POWER_DOWN (2 << 28) -#define PP_SEQUENCE_MASK (3 << 28) -#define PP_SEQUENCE_SHIFT 28 -#define PP_CYCLE_DELAY_ACTIVE (1 << 27) -#define PP_SEQUENCE_STATE_MASK 0x0000000f +#define PP_READY (1 << 30) +#define PP_SEQUENCE_NONE (0 << 28) +#define PP_SEQUENCE_POWER_UP (1 << 28) +#define PP_SEQUENCE_POWER_DOWN (2 << 28) +#define PP_SEQUENCE_MASK (3 << 28) +#define PP_SEQUENCE_SHIFT 28 +#define PP_CYCLE_DELAY_ACTIVE (1 << 27) +#define PP_SEQUENCE_STATE_MASK 0x0000000f #define PP_SEQUENCE_STATE_OFF_IDLE (0x0 << 0) #define PP_SEQUENCE_STATE_OFF_S0_1 (0x1 << 0) #define PP_SEQUENCE_STATE_OFF_S0_2 (0x2 << 0) @@ -3686,11 +3695,46 @@ enum { #define PP_SEQUENCE_STATE_ON_S1_2 (0xa << 0) #define PP_SEQUENCE_STATE_ON_S1_3 (0xb << 0) #define PP_SEQUENCE_STATE_RESET (0xf << 0) -#define PP_CONTROL _MMIO(0x61204) -#define POWER_TARGET_ON (1 << 0) -#define PP_ON_DELAYS _MMIO(0x61208) -#define PP_OFF_DELAYS _MMIO(0x6120c) -#define PP_DIVISOR _MMIO(0x61210) + +#define _PP_CONTROL 0x61204 +#define PP_CONTROL(pps_idx) _MMIO_PPS(pps_idx, _PP_CONTROL) +#define PANEL_UNLOCK_REGS (0xabcd << 16) +#define PANEL_UNLOCK_MASK (0xffff << 16) +#define BXT_POWER_CYCLE_DELAY_MASK 0x1f0 +#define BXT_POWER_CYCLE_DELAY_SHIFT 4 +#define EDP_FORCE_VDD (1 << 3) +#define EDP_BLC_ENABLE (1 << 2) +#define PANEL_POWER_RESET (1 << 1) +#define PANEL_POWER_OFF (0 << 0) +#define PANEL_POWER_ON (1 << 0) +#define POWER_TARGET_ON (1 << 0) + +#define _PP_ON_DELAYS 0x61208 +#define PP_ON_DELAYS(pps_idx) _MMIO_PPS(pps_idx, _PP_ON_DELAYS) +#define PANEL_PORT_SELECT_MASK (3 << 30) +#define PANEL_PORT_SELECT_LVDS (0 << 30) +#define PANEL_PORT_SELECT_DPA (1 << 30) +#define PANEL_PORT_SELECT_DPC (2 << 30) +#define PANEL_PORT_SELECT_DPD (3 << 30) +#define 
PANEL_PORT_SELECT_VLV(port) ((port) << 30) +#define PANEL_POWER_UP_DELAY_MASK 0x1fff0000 +#define PANEL_POWER_UP_DELAY_SHIFT 16 +#define PANEL_LIGHT_ON_DELAY_MASK 0x1fff +#define PANEL_LIGHT_ON_DELAY_SHIFT 0 + +#define _PP_OFF_DELAYS 0x6120C +#define PP_OFF_DELAYS(pps_idx) _MMIO_PPS(pps_idx, _PP_OFF_DELAYS) +#define PANEL_POWER_DOWN_DELAY_MASK 0x1fff0000 +#define PANEL_POWER_DOWN_DELAY_SHIFT 16 +#define PANEL_LIGHT_OFF_DELAY_MASK 0x1fff +#define PANEL_LIGHT_OFF_DELAY_SHIFT 0 + +#define _PP_DIVISOR 0x61210 +#define PP_DIVISOR(pps_idx) _MMIO_PPS(pps_idx, _PP_DIVISOR) +#define PP_REFERENCE_DIVIDER_MASK 0xffffff00 +#define PP_REFERENCE_DIVIDER_SHIFT 8 +#define PANEL_POWER_CYCLE_DELAY_MASK 0x1f +#define PANEL_POWER_CYCLE_DELAY_SHIFT 0 /* Panel fitting */ #define PFIT_CONTROL _MMIO(dev_priv->info.display_mmio_offset + 0x61230) @@ -6750,77 +6794,6 @@ enum { #define PCH_LVDS _MMIO(0xe1180) #define LVDS_DETECTED (1 << 1) -/* vlv has 2 sets of panel control regs. */ -#define _PIPEA_PP_STATUS (VLV_DISPLAY_BASE + 0x61200) -#define _PIPEA_PP_CONTROL (VLV_DISPLAY_BASE + 0x61204) -#define _PIPEA_PP_ON_DELAYS (VLV_DISPLAY_BASE + 0x61208) -#define PANEL_PORT_SELECT_VLV(port) ((port) << 30) -#define _PIPEA_PP_OFF_DELAYS (VLV_DISPLAY_BASE + 0x6120c) -#define _PIPEA_PP_DIVISOR (VLV_DISPLAY_BASE + 0x61210) - -#define _PIPEB_PP_STATUS (VLV_DISPLAY_BASE + 0x61300) -#define _PIPEB_PP_CONTROL (VLV_DISPLAY_BASE + 0x61304) -#define _PIPEB_PP_ON_DELAYS (VLV_DISPLAY_BASE + 0x61308) -#define _PIPEB_PP_OFF_DELAYS (VLV_DISPLAY_BASE + 0x6130c) -#define _PIPEB_PP_DIVISOR (VLV_DISPLAY_BASE + 0x61310) - -#define VLV_PIPE_PP_STATUS(pipe) _MMIO_PIPE(pipe, _PIPEA_PP_STATUS, _PIPEB_PP_STATUS) -#define VLV_PIPE_PP_CONTROL(pipe) _MMIO_PIPE(pipe, _PIPEA_PP_CONTROL, _PIPEB_PP_CONTROL) -#define VLV_PIPE_PP_ON_DELAYS(pipe) _MMIO_PIPE(pipe, _PIPEA_PP_ON_DELAYS, _PIPEB_PP_ON_DELAYS) -#define VLV_PIPE_PP_OFF_DELAYS(pipe) _MMIO_PIPE(pipe, _PIPEA_PP_OFF_DELAYS, _PIPEB_PP_OFF_DELAYS) -#define VLV_PIPE_PP_DIVISOR(pipe) _MMIO_PIPE(pipe, _PIPEA_PP_DIVISOR, _PIPEB_PP_DIVISOR) - -#define _PCH_PP_STATUS 0xc7200 -#define _PCH_PP_CONTROL 0xc7204 -#define PANEL_UNLOCK_REGS (0xabcd << 16) -#define PANEL_UNLOCK_MASK (0xffff << 16) -#define BXT_POWER_CYCLE_DELAY_MASK (0x1f0) -#define BXT_POWER_CYCLE_DELAY_SHIFT 4 -#define EDP_FORCE_VDD (1 << 3) -#define EDP_BLC_ENABLE (1 << 2) -#define PANEL_POWER_RESET (1 << 1) -#define PANEL_POWER_OFF (0 << 0) -#define PANEL_POWER_ON (1 << 0) -#define _PCH_PP_ON_DELAYS 0xc7208 -#define PANEL_PORT_SELECT_MASK (3 << 30) -#define PANEL_PORT_SELECT_LVDS (0 << 30) -#define PANEL_PORT_SELECT_DPA (1 << 30) -#define PANEL_PORT_SELECT_DPC (2 << 30) -#define PANEL_PORT_SELECT_DPD (3 << 30) -#define PANEL_POWER_UP_DELAY_MASK (0x1fff0000) -#define PANEL_POWER_UP_DELAY_SHIFT 16 -#define PANEL_LIGHT_ON_DELAY_MASK (0x1fff) -#define PANEL_LIGHT_ON_DELAY_SHIFT 0 - -#define _PCH_PP_OFF_DELAYS 0xc720c -#define PANEL_POWER_DOWN_DELAY_MASK (0x1fff0000) -#define PANEL_POWER_DOWN_DELAY_SHIFT 16 -#define PANEL_LIGHT_OFF_DELAY_MASK (0x1fff) -#define PANEL_LIGHT_OFF_DELAY_SHIFT 0 - -#define _PCH_PP_DIVISOR 0xc7210 -#define PP_REFERENCE_DIVIDER_MASK (0xffffff00) -#define PP_REFERENCE_DIVIDER_SHIFT 8 -#define PANEL_POWER_CYCLE_DELAY_MASK (0x1f) -#define PANEL_POWER_CYCLE_DELAY_SHIFT 0 - -#define PCH_PP_STATUS _MMIO(_PCH_PP_STATUS) -#define PCH_PP_CONTROL _MMIO(_PCH_PP_CONTROL) -#define PCH_PP_ON_DELAYS _MMIO(_PCH_PP_ON_DELAYS) -#define PCH_PP_OFF_DELAYS _MMIO(_PCH_PP_OFF_DELAYS) -#define PCH_PP_DIVISOR _MMIO(_PCH_PP_DIVISOR) - -/* BXT PPS 
changes - 2nd set of PPS registers */ -#define _BXT_PP_STATUS2 0xc7300 -#define _BXT_PP_CONTROL2 0xc7304 -#define _BXT_PP_ON_DELAYS2 0xc7308 -#define _BXT_PP_OFF_DELAYS2 0xc730c - -#define BXT_PP_STATUS(n) _MMIO_PIPE(n, _PCH_PP_STATUS, _BXT_PP_STATUS2) -#define BXT_PP_CONTROL(n) _MMIO_PIPE(n, _PCH_PP_CONTROL, _BXT_PP_CONTROL2) -#define BXT_PP_ON_DELAYS(n) _MMIO_PIPE(n, _PCH_PP_ON_DELAYS, _BXT_PP_ON_DELAYS2) -#define BXT_PP_OFF_DELAYS(n) _MMIO_PIPE(n, _PCH_PP_OFF_DELAYS, _BXT_PP_OFF_DELAYS2) - #define _PCH_DP_B 0xe4100 #define PCH_DP_B _MMIO(_PCH_DP_B) #define _PCH_DPB_AUX_CH_CTL 0xe4110 diff --git a/drivers/gpu/drm/i915/i915_suspend.c b/drivers/gpu/drm/i915/i915_suspend.c index 5cfe4c7716b4..c826b69ba0aa 100644 --- a/drivers/gpu/drm/i915/i915_suspend.c +++ b/drivers/gpu/drm/i915/i915_suspend.c @@ -44,16 +44,11 @@ static void i915_save_display(struct drm_device *dev) dev_priv->regfile.saveLVDS = I915_READ(LVDS); /* Panel power sequencer */ - if (HAS_PCH_SPLIT(dev)) { - dev_priv->regfile.savePP_CONTROL = I915_READ(PCH_PP_CONTROL); - dev_priv->regfile.savePP_ON_DELAYS = I915_READ(PCH_PP_ON_DELAYS); - dev_priv->regfile.savePP_OFF_DELAYS = I915_READ(PCH_PP_OFF_DELAYS); - dev_priv->regfile.savePP_DIVISOR = I915_READ(PCH_PP_DIVISOR); - } else if (INTEL_INFO(dev)->gen <= 4) { - dev_priv->regfile.savePP_CONTROL = I915_READ(PP_CONTROL); - dev_priv->regfile.savePP_ON_DELAYS = I915_READ(PP_ON_DELAYS); - dev_priv->regfile.savePP_OFF_DELAYS = I915_READ(PP_OFF_DELAYS); - dev_priv->regfile.savePP_DIVISOR = I915_READ(PP_DIVISOR); + if (HAS_PCH_SPLIT(dev_priv) || INTEL_GEN(dev_priv) <= 4) { + dev_priv->regfile.savePP_CONTROL = I915_READ(PP_CONTROL(0)); + dev_priv->regfile.savePP_ON_DELAYS = I915_READ(PP_ON_DELAYS(0)); + dev_priv->regfile.savePP_OFF_DELAYS = I915_READ(PP_OFF_DELAYS(0)); + dev_priv->regfile.savePP_DIVISOR = I915_READ(PP_DIVISOR(0)); } /* save FBC interval */ @@ -79,16 +74,11 @@ static void i915_restore_display(struct drm_device *dev) I915_WRITE(LVDS, dev_priv->regfile.saveLVDS & mask); /* Panel power sequencer */ - if (HAS_PCH_SPLIT(dev)) { - I915_WRITE(PCH_PP_ON_DELAYS, dev_priv->regfile.savePP_ON_DELAYS); - I915_WRITE(PCH_PP_OFF_DELAYS, dev_priv->regfile.savePP_OFF_DELAYS); - I915_WRITE(PCH_PP_DIVISOR, dev_priv->regfile.savePP_DIVISOR); - I915_WRITE(PCH_PP_CONTROL, dev_priv->regfile.savePP_CONTROL); - } else if (INTEL_INFO(dev)->gen <= 4) { - I915_WRITE(PP_ON_DELAYS, dev_priv->regfile.savePP_ON_DELAYS); - I915_WRITE(PP_OFF_DELAYS, dev_priv->regfile.savePP_OFF_DELAYS); - I915_WRITE(PP_DIVISOR, dev_priv->regfile.savePP_DIVISOR); - I915_WRITE(PP_CONTROL, dev_priv->regfile.savePP_CONTROL); + if (HAS_PCH_SPLIT(dev_priv) || INTEL_GEN(dev_priv) <= 4) { + I915_WRITE(PP_ON_DELAYS(0), dev_priv->regfile.savePP_ON_DELAYS); + I915_WRITE(PP_OFF_DELAYS(0), dev_priv->regfile.savePP_OFF_DELAYS); + I915_WRITE(PP_DIVISOR(0), dev_priv->regfile.savePP_DIVISOR); + I915_WRITE(PP_CONTROL(0), dev_priv->regfile.savePP_CONTROL); } /* only restore FBC info on the platform that supports FBC*/ diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index ecd154347cc1..dc0d1b61dae4 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -1202,8 +1202,8 @@ void assert_panel_unlocked(struct drm_i915_private *dev_priv, if (HAS_PCH_SPLIT(dev)) { u32 port_sel; - pp_reg = PCH_PP_CONTROL; - port_sel = I915_READ(PCH_PP_ON_DELAYS) & PANEL_PORT_SELECT_MASK; + pp_reg = PP_CONTROL(0); + port_sel = I915_READ(PP_ON_DELAYS(0)) & PANEL_PORT_SELECT_MASK; if 
(port_sel == PANEL_PORT_SELECT_LVDS && I915_READ(PCH_LVDS) & LVDS_PIPEB_SELECT) @@ -1211,10 +1211,10 @@ void assert_panel_unlocked(struct drm_i915_private *dev_priv, /* XXX: else fix for eDP */ } else if (IS_VALLEYVIEW(dev) || IS_CHERRYVIEW(dev)) { /* presumably write lock depends on pipe, not port select */ - pp_reg = VLV_PIPE_PP_CONTROL(pipe); + pp_reg = PP_CONTROL(pipe); panel_pipe = pipe; } else { - pp_reg = PP_CONTROL; + pp_reg = PP_CONTROL(0); if (I915_READ(LVDS) & LVDS_PIPEB_SELECT) panel_pipe = PIPE_B; } @@ -9491,7 +9491,7 @@ static void assert_can_disable_lcpll(struct drm_i915_private *dev_priv) I915_STATE_WARN(I915_READ(SPLL_CTL) & SPLL_PLL_ENABLE, "SPLL enabled\n"); I915_STATE_WARN(I915_READ(WRPLL_CTL(0)) & WRPLL_PLL_ENABLE, "WRPLL1 enabled\n"); I915_STATE_WARN(I915_READ(WRPLL_CTL(1)) & WRPLL_PLL_ENABLE, "WRPLL2 enabled\n"); - I915_STATE_WARN(I915_READ(PCH_PP_STATUS) & PP_ON, "Panel power on\n"); + I915_STATE_WARN(I915_READ(PP_STATUS(0)) & PP_ON, "Panel power on\n"); I915_STATE_WARN(I915_READ(BLC_PWM_CPU_CTL2) & BLM_PWM_ENABLE, "CPU PWM1 enabled\n"); if (IS_HASWELL(dev)) @@ -14729,12 +14729,24 @@ static bool intel_crt_present(struct drm_device *dev) return true; } +static void intel_pps_init(struct drm_i915_private *dev_priv) +{ + if (HAS_PCH_SPLIT(dev_priv) || IS_BROXTON(dev_priv)) + dev_priv->pps_mmio_base = PCH_PPS_BASE; + else if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) + dev_priv->pps_mmio_base = VLV_PPS_BASE; + else + dev_priv->pps_mmio_base = PPS_BASE; +} + static void intel_setup_outputs(struct drm_device *dev) { struct drm_i915_private *dev_priv = to_i915(dev); struct intel_encoder *encoder; bool dpd_is_edp = false; + intel_pps_init(dev_priv); + /* * intel_edp_init_connector() depends on this completing first, to * prevent the registeration of both eDP and LVDS and the incorrect diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 8fe2afa5439e..15aff93d3bc5 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -463,13 +463,13 @@ typedef bool (*vlv_pipe_check)(struct drm_i915_private *dev_priv, static bool vlv_pipe_has_pp_on(struct drm_i915_private *dev_priv, enum pipe pipe) { - return I915_READ(VLV_PIPE_PP_STATUS(pipe)) & PP_ON; + return I915_READ(PP_STATUS(pipe)) & PP_ON; } static bool vlv_pipe_has_vdd_on(struct drm_i915_private *dev_priv, enum pipe pipe) { - return I915_READ(VLV_PIPE_PP_CONTROL(pipe)) & EDP_FORCE_VDD; + return I915_READ(PP_CONTROL(pipe)) & EDP_FORCE_VDD; } static bool vlv_pipe_any(struct drm_i915_private *dev_priv, @@ -486,7 +486,7 @@ vlv_initial_pps_pipe(struct drm_i915_private *dev_priv, enum pipe pipe; for (pipe = PIPE_A; pipe <= PIPE_B; pipe++) { - u32 port_sel = I915_READ(VLV_PIPE_PP_ON_DELAYS(pipe)) & + u32 port_sel = I915_READ(PP_ON_DELAYS(pipe)) & PANEL_PORT_SELECT_MASK; if (port_sel != PANEL_PORT_SELECT_VLV(port)) @@ -583,30 +583,21 @@ static void intel_pps_get_registers(struct drm_i915_private *dev_priv, struct intel_dp *intel_dp, struct pps_registers *regs) { + int pps_idx = 0; + memset(regs, 0, sizeof(*regs)); - if (IS_BROXTON(dev_priv)) { - int idx = bxt_power_sequencer_idx(intel_dp); + if (IS_BROXTON(dev_priv)) + pps_idx = bxt_power_sequencer_idx(intel_dp); + else if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) + pps_idx = vlv_power_sequencer_pipe(intel_dp); - regs->pp_ctrl = BXT_PP_CONTROL(idx); - regs->pp_stat = BXT_PP_STATUS(idx); - regs->pp_on = BXT_PP_ON_DELAYS(idx); - regs->pp_off = BXT_PP_OFF_DELAYS(idx); - } else if (HAS_PCH_SPLIT(dev_priv)) { - 
regs->pp_ctrl = PCH_PP_CONTROL; - regs->pp_stat = PCH_PP_STATUS; - regs->pp_on = PCH_PP_ON_DELAYS; - regs->pp_off = PCH_PP_OFF_DELAYS; - regs->pp_div = PCH_PP_DIVISOR; - } else { - enum pipe pipe = vlv_power_sequencer_pipe(intel_dp); - - regs->pp_ctrl = VLV_PIPE_PP_CONTROL(pipe); - regs->pp_stat = VLV_PIPE_PP_STATUS(pipe); - regs->pp_on = VLV_PIPE_PP_ON_DELAYS(pipe); - regs->pp_off = VLV_PIPE_PP_OFF_DELAYS(pipe); - regs->pp_div = VLV_PIPE_PP_DIVISOR(pipe); - } + regs->pp_ctrl = PP_CONTROL(pps_idx); + regs->pp_stat = PP_STATUS(pps_idx); + regs->pp_on = PP_ON_DELAYS(pps_idx); + regs->pp_off = PP_OFF_DELAYS(pps_idx); + if (!IS_BROXTON(dev_priv)) + regs->pp_div = PP_DIVISOR(pps_idx); } static i915_reg_t @@ -651,8 +642,8 @@ static int edp_notify_handler(struct notifier_block *this, unsigned long code, i915_reg_t pp_ctrl_reg, pp_div_reg; u32 pp_div; - pp_ctrl_reg = VLV_PIPE_PP_CONTROL(pipe); - pp_div_reg = VLV_PIPE_PP_DIVISOR(pipe); + pp_ctrl_reg = PP_CONTROL(pipe); + pp_div_reg = PP_DIVISOR(pipe); pp_div = I915_READ(pp_div_reg); pp_div &= PP_REFERENCE_DIVIDER_MASK; @@ -2729,7 +2720,7 @@ static void vlv_detach_power_sequencer(struct intel_dp *intel_dp) struct intel_digital_port *intel_dig_port = dp_to_dig_port(intel_dp); struct drm_i915_private *dev_priv = to_i915(intel_dig_port->base.base.dev); enum pipe pipe = intel_dp->pps_pipe; - i915_reg_t pp_on_reg = VLV_PIPE_PP_ON_DELAYS(pipe); + i915_reg_t pp_on_reg = PP_ON_DELAYS(pipe); edp_panel_vdd_off_sync(intel_dp); diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index 49550470483e..413e729d2499 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -217,21 +217,12 @@ static void intel_enable_lvds(struct intel_encoder *encoder) struct intel_connector *intel_connector = &lvds_encoder->attached_connector->base; struct drm_i915_private *dev_priv = to_i915(dev); - i915_reg_t ctl_reg, stat_reg; - - if (HAS_PCH_SPLIT(dev)) { - ctl_reg = PCH_PP_CONTROL; - stat_reg = PCH_PP_STATUS; - } else { - ctl_reg = PP_CONTROL; - stat_reg = PP_STATUS; - } I915_WRITE(lvds_encoder->reg, I915_READ(lvds_encoder->reg) | LVDS_PORT_EN); - I915_WRITE(ctl_reg, I915_READ(ctl_reg) | POWER_TARGET_ON); + I915_WRITE(PP_CONTROL(0), I915_READ(PP_CONTROL(0)) | POWER_TARGET_ON); POSTING_READ(lvds_encoder->reg); - if (intel_wait_for_register(dev_priv, stat_reg, PP_ON, PP_ON, 1000)) + if (intel_wait_for_register(dev_priv, PP_STATUS(0), PP_ON, PP_ON, 1000)) DRM_ERROR("timed out waiting for panel to power on\n"); intel_panel_enable_backlight(intel_connector); @@ -242,18 +233,9 @@ static void intel_disable_lvds(struct intel_encoder *encoder) struct drm_device *dev = encoder->base.dev; struct intel_lvds_encoder *lvds_encoder = to_lvds_encoder(&encoder->base); struct drm_i915_private *dev_priv = to_i915(dev); - i915_reg_t ctl_reg, stat_reg; - if (HAS_PCH_SPLIT(dev)) { - ctl_reg = PCH_PP_CONTROL; - stat_reg = PCH_PP_STATUS; - } else { - ctl_reg = PP_CONTROL; - stat_reg = PP_STATUS; - } - - I915_WRITE(ctl_reg, I915_READ(ctl_reg) & ~POWER_TARGET_ON); - if (intel_wait_for_register(dev_priv, stat_reg, PP_ON, 0, 1000)) + I915_WRITE(PP_CONTROL(0), I915_READ(PP_CONTROL(0)) & ~POWER_TARGET_ON); + if (intel_wait_for_register(dev_priv, PP_STATUS(0), PP_ON, 0, 1000)) DRM_ERROR("timed out waiting for panel to power off\n"); I915_WRITE(lvds_encoder->reg, I915_READ(lvds_encoder->reg) & ~LVDS_PORT_EN); @@ -904,13 +886,10 @@ void intel_lvds_init(struct drm_device *dev) * Unlock registers and just leave them unlocked. 
Do this before * checking quirk lists to avoid bogus WARNINGs. */ - if (HAS_PCH_SPLIT(dev)) { - I915_WRITE(PCH_PP_CONTROL, - I915_READ(PCH_PP_CONTROL) | PANEL_UNLOCK_REGS); - } else if (INTEL_INFO(dev_priv)->gen < 5) { - I915_WRITE(PP_CONTROL, - I915_READ(PP_CONTROL) | PANEL_UNLOCK_REGS); - } + if (HAS_PCH_SPLIT(dev_priv) || INTEL_GEN(dev_priv) <= 4) + I915_WRITE(PP_CONTROL(0), + I915_READ(PP_CONTROL(0)) | PANEL_UNLOCK_REGS); + if (!intel_lvds_supported(dev)) return; @@ -945,12 +924,12 @@ void intel_lvds_init(struct drm_device *dev) /* Set the Panel Power On/Off timings if uninitialized. */ if (INTEL_INFO(dev_priv)->gen < 5 && - I915_READ(PP_ON_DELAYS) == 0 && I915_READ(PP_OFF_DELAYS) == 0) { + I915_READ(PP_ON_DELAYS(0)) == 0 && I915_READ(PP_OFF_DELAYS(0)) == 0) { /* Set T2 to 40ms and T5 to 200ms */ - I915_WRITE(PP_ON_DELAYS, 0x019007d0); + I915_WRITE(PP_ON_DELAYS(0), 0x019007d0); /* Set T3 to 35ms and Tx to 200ms */ - I915_WRITE(PP_OFF_DELAYS, 0x015e07d0); + I915_WRITE(PP_OFF_DELAYS(0), 0x015e07d0); DRM_DEBUG_KMS("Panel power timings uninitialized, setting defaults\n"); } From 5a162e229a0472bb3e953aa6fc1bb6fcf52843c8 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 10 Aug 2016 14:07:30 +0300 Subject: [PATCH 023/141] drm/i915: Merge TARGET_POWER_ON and PANEL_POWER_ON flag definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These two flags mean the same thing, so remove the duplication. Signed-off-by: Imre Deak Reviewed-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470827254-21954-2-git-send-email-imre.deak@intel.com --- drivers/gpu/drm/i915/i915_reg.h | 1 - drivers/gpu/drm/i915/intel_dp.c | 6 +++--- drivers/gpu/drm/i915/intel_lvds.c | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index b65fe502bb1a..889508fdbe98 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -3707,7 +3707,6 @@ enum { #define PANEL_POWER_RESET (1 << 1) #define PANEL_POWER_OFF (0 << 0) #define PANEL_POWER_ON (1 << 0) -#define POWER_TARGET_ON (1 << 0) #define _PP_ON_DELAYS 0x61208 #define PP_ON_DELAYS(pps_idx) _MMIO_PPS(pps_idx, _PP_ON_DELAYS) diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 15aff93d3bc5..a20faa05ef3d 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -1948,7 +1948,7 @@ static void edp_panel_vdd_off_sync(struct intel_dp *intel_dp) DRM_DEBUG_KMS("PP_STATUS: 0x%08x PP_CONTROL: 0x%08x\n", I915_READ(pp_stat_reg), I915_READ(pp_ctrl_reg)); - if ((pp & POWER_TARGET_ON) == 0) + if ((pp & PANEL_POWER_ON) == 0) intel_dp->panel_power_off_time = ktime_get_boottime(); power_domain = intel_display_port_aux_power_domain(intel_encoder); @@ -2035,7 +2035,7 @@ static void edp_panel_on(struct intel_dp *intel_dp) POSTING_READ(pp_ctrl_reg); } - pp |= POWER_TARGET_ON; + pp |= PANEL_POWER_ON; if (!IS_GEN5(dev)) pp |= PANEL_POWER_RESET; @@ -2087,7 +2087,7 @@ static void edp_panel_off(struct intel_dp *intel_dp) pp = ironlake_get_pp_control(intel_dp); /* We need to switch off panel power _and_ force vdd, for otherwise some * panels get very unhappy and cease to work. 
*/ - pp &= ~(POWER_TARGET_ON | PANEL_POWER_RESET | EDP_FORCE_VDD | + pp &= ~(PANEL_POWER_ON | PANEL_POWER_RESET | EDP_FORCE_VDD | EDP_BLC_ENABLE); pp_ctrl_reg = _pp_ctrl_reg(intel_dp); diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index 413e729d2499..c5739fc9e5fe 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -220,7 +220,7 @@ static void intel_enable_lvds(struct intel_encoder *encoder) I915_WRITE(lvds_encoder->reg, I915_READ(lvds_encoder->reg) | LVDS_PORT_EN); - I915_WRITE(PP_CONTROL(0), I915_READ(PP_CONTROL(0)) | POWER_TARGET_ON); + I915_WRITE(PP_CONTROL(0), I915_READ(PP_CONTROL(0)) | PANEL_POWER_ON); POSTING_READ(lvds_encoder->reg); if (intel_wait_for_register(dev_priv, PP_STATUS(0), PP_ON, PP_ON, 1000)) DRM_ERROR("timed out waiting for panel to power on\n"); @@ -234,7 +234,7 @@ static void intel_disable_lvds(struct intel_encoder *encoder) struct intel_lvds_encoder *lvds_encoder = to_lvds_encoder(&encoder->base); struct drm_i915_private *dev_priv = to_i915(dev); - I915_WRITE(PP_CONTROL(0), I915_READ(PP_CONTROL(0)) & ~POWER_TARGET_ON); + I915_WRITE(PP_CONTROL(0), I915_READ(PP_CONTROL(0)) & ~PANEL_POWER_ON); if (intel_wait_for_register(dev_priv, PP_STATUS(0), PP_ON, 0, 1000)) DRM_ERROR("timed out waiting for panel to power off\n"); From ed6143b8f7537db98e94feeb50e7b7f698c7cb05 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 10 Aug 2016 14:07:31 +0300 Subject: [PATCH 024/141] drm/i915/lvds: Restore initial HW state during encoder enabling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Atm the LVDS encoder depends on the PPS HW context being saved/restored from generic suspend/resume code. Since the PPS is specific to the LVDS and eDP encoders a cleaner way is to reinitialize it during encoder enabling, so do this here for LVDS. Follow-up patches will init the PPS for the eDP encoder similarly and remove the suspend/resume time save / restore. v2: - Apply BSpec +1 offset and use DIV_ROUND_UP() when programming the power cycle delay. (Ville) v3: (Ville) - Fix +1 vs. round-up order. 
- s/reset_on_powerdown/powerdown_on_reset/ Signed-off-by: Imre Deak Reviewed-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470827254-21954-3-git-send-email-imre.deak@intel.com --- drivers/gpu/drm/i915/i915_reg.h | 1 + drivers/gpu/drm/i915/intel_lvds.c | 114 ++++++++++++++++++++++++++---- 2 files changed, 102 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 889508fdbe98..da82744f3a3b 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -3710,6 +3710,7 @@ enum { #define _PP_ON_DELAYS 0x61208 #define PP_ON_DELAYS(pps_idx) _MMIO_PPS(pps_idx, _PP_ON_DELAYS) +#define PANEL_PORT_SELECT_SHIFT 30 #define PANEL_PORT_SELECT_MASK (3 << 30) #define PANEL_PORT_SELECT_LVDS (0 << 30) #define PANEL_PORT_SELECT_DPA (1 << 30) diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index c5739fc9e5fe..e79fae4bbc1f 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -48,6 +48,20 @@ struct intel_lvds_connector { struct notifier_block lid_notifier; }; +struct intel_lvds_pps { + /* 100us units */ + int t1_t2; + int t3; + int t4; + int t5; + int tx; + + int divider; + + int port; + bool powerdown_on_reset; +}; + struct intel_lvds_encoder { struct intel_encoder base; @@ -55,6 +69,9 @@ struct intel_lvds_encoder { i915_reg_t reg; u32 a3_power; + struct intel_lvds_pps init_pps; + u32 init_lvds_val; + struct intel_lvds_connector *attached_connector; }; @@ -136,6 +153,83 @@ static void intel_lvds_get_config(struct intel_encoder *encoder, pipe_config->base.adjusted_mode.crtc_clock = pipe_config->port_clock; } +static void intel_lvds_pps_get_hw_state(struct drm_i915_private *dev_priv, + struct intel_lvds_pps *pps) +{ + u32 val; + + pps->powerdown_on_reset = I915_READ(PP_CONTROL(0)) & PANEL_POWER_RESET; + + val = I915_READ(PP_ON_DELAYS(0)); + pps->port = (val & PANEL_PORT_SELECT_MASK) >> + PANEL_PORT_SELECT_SHIFT; + pps->t1_t2 = (val & PANEL_POWER_UP_DELAY_MASK) >> + PANEL_POWER_UP_DELAY_SHIFT; + pps->t5 = (val & PANEL_LIGHT_ON_DELAY_MASK) >> + PANEL_LIGHT_ON_DELAY_SHIFT; + + val = I915_READ(PP_OFF_DELAYS(0)); + pps->t3 = (val & PANEL_POWER_DOWN_DELAY_MASK) >> + PANEL_POWER_DOWN_DELAY_SHIFT; + pps->tx = (val & PANEL_LIGHT_OFF_DELAY_MASK) >> + PANEL_LIGHT_OFF_DELAY_SHIFT; + + val = I915_READ(PP_DIVISOR(0)); + pps->divider = (val & PP_REFERENCE_DIVIDER_MASK) >> + PP_REFERENCE_DIVIDER_SHIFT; + val = (val & PANEL_POWER_CYCLE_DELAY_MASK) >> + PANEL_POWER_CYCLE_DELAY_SHIFT; + /* + * Remove the BSpec specified +1 (100ms) offset that accounts for a + * too short power-cycle delay due to the asynchronous programming of + * the register. 
+ */ + if (val) + val--; + /* Convert from 100ms to 100us units */ + pps->t4 = val * 1000; + + if (INTEL_INFO(dev_priv)->gen <= 4 && + pps->t1_t2 == 0 && pps->t5 == 0 && pps->t3 == 0 && pps->tx == 0) { + DRM_DEBUG_KMS("Panel power timings uninitialized, " + "setting defaults\n"); + /* Set T2 to 40ms and T5 to 200ms in 100 usec units */ + pps->t1_t2 = 40 * 10; + pps->t5 = 200 * 10; + /* Set T3 to 35ms and Tx to 200ms in 100 usec units */ + pps->t3 = 35 * 10; + pps->tx = 200 * 10; + } + + DRM_DEBUG_DRIVER("LVDS PPS:t1+t2 %d t3 %d t4 %d t5 %d tx %d " + "divider %d port %d powerdown_on_reset %d\n", + pps->t1_t2, pps->t3, pps->t4, pps->t5, pps->tx, + pps->divider, pps->port, pps->powerdown_on_reset); +} + +static void intel_lvds_pps_init_hw(struct drm_i915_private *dev_priv, + struct intel_lvds_pps *pps) +{ + u32 val; + + val = I915_READ(PP_CONTROL(0)); + WARN_ON((val & PANEL_UNLOCK_MASK) != PANEL_UNLOCK_REGS); + if (pps->powerdown_on_reset) + val |= PANEL_POWER_RESET; + I915_WRITE(PP_CONTROL(0), val); + + I915_WRITE(PP_ON_DELAYS(0), (pps->port << PANEL_PORT_SELECT_SHIFT) | + (pps->t1_t2 << PANEL_POWER_UP_DELAY_SHIFT) | + (pps->t5 << PANEL_LIGHT_ON_DELAY_SHIFT)); + I915_WRITE(PP_OFF_DELAYS(0), (pps->t3 << PANEL_POWER_DOWN_DELAY_SHIFT) | + (pps->tx << PANEL_LIGHT_OFF_DELAY_SHIFT)); + + val = pps->divider << PP_REFERENCE_DIVIDER_SHIFT; + val |= (DIV_ROUND_UP(pps->t4, 1000) + 1) << + PANEL_POWER_CYCLE_DELAY_SHIFT; + I915_WRITE(PP_DIVISOR(0), val); +} + static void intel_pre_enable_lvds(struct intel_encoder *encoder) { struct intel_lvds_encoder *lvds_encoder = to_lvds_encoder(&encoder->base); @@ -154,7 +248,9 @@ static void intel_pre_enable_lvds(struct intel_encoder *encoder) assert_pll_disabled(dev_priv, pipe); } - temp = I915_READ(lvds_encoder->reg); + intel_lvds_pps_init_hw(dev_priv, &lvds_encoder->init_pps); + + temp = lvds_encoder->init_lvds_val; temp |= LVDS_PORT_EN | LVDS_A0A2_CLKA_POWER_UP; if (HAS_PCH_CPT(dev)) { @@ -922,18 +1018,6 @@ void intel_lvds_init(struct drm_device *dev) DRM_DEBUG_KMS("LVDS is not present in VBT, but enabled anyway\n"); } - /* Set the Panel Power On/Off timings if uninitialized. */ - if (INTEL_INFO(dev_priv)->gen < 5 && - I915_READ(PP_ON_DELAYS(0)) == 0 && I915_READ(PP_OFF_DELAYS(0)) == 0) { - /* Set T2 to 40ms and T5 to 200ms */ - I915_WRITE(PP_ON_DELAYS(0), 0x019007d0); - - /* Set T3 to 35ms and Tx to 200ms */ - I915_WRITE(PP_OFF_DELAYS(0), 0x015e07d0); - - DRM_DEBUG_KMS("Panel power timings uninitialized, setting defaults\n"); - } - lvds_encoder = kzalloc(sizeof(*lvds_encoder), GFP_KERNEL); if (!lvds_encoder) return; @@ -999,6 +1083,10 @@ void intel_lvds_init(struct drm_device *dev) dev->mode_config.scaling_mode_property, DRM_MODE_SCALE_ASPECT); intel_connector->panel.fitting_mode = DRM_MODE_SCALE_ASPECT; + + intel_lvds_pps_get_hw_state(dev_priv, &lvds_encoder->init_pps); + lvds_encoder->init_lvds_val = lvds; + /* * LVDS discovery: * 1) check for EDID on DDC From 335f752ba948f30e8ebbf3c4217e1714a885be44 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 10 Aug 2016 14:07:32 +0300 Subject: [PATCH 025/141] drm/i915/dp: Restore PPS HW state from the encoder resume hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similarly to the previous patch, initialize the PPS from the DP encoder's resume hook. Note that as opposed to LVDS we can't do this during encoder enabling, since we need the PPS for DP detection as well. 
The PPS init code is now the same for init and resume, so factor out a new intel_dp_pps_init() helper for this. v2: - Factor out intel_dp_pps_init() (Ville). Signed-off-by: Imre Deak Reviewed-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470827254-21954-4-git-send-email-imre.deak@intel.com --- drivers/gpu/drm/i915/intel_dp.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index a20faa05ef3d..2ef7b14cc36e 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -256,6 +256,8 @@ intel_dp_init_panel_power_sequencer(struct drm_device *dev, static void intel_dp_init_panel_power_sequencer_registers(struct drm_device *dev, struct intel_dp *intel_dp); +static void +intel_dp_pps_init(struct drm_device *dev, struct intel_dp *intel_dp); static void pps_lock(struct intel_dp *intel_dp) { @@ -4657,13 +4659,8 @@ void intel_dp_encoder_reset(struct drm_encoder *encoder) pps_lock(intel_dp); - /* - * Read out the current power sequencer assignment, - * in case the BIOS did something with it. - */ - if (IS_VALLEYVIEW(encoder->dev) || IS_CHERRYVIEW(encoder->dev)) - vlv_initial_power_sequencer_setup(intel_dp); - + /* Reinit the power sequencer, in case BIOS did something with it. */ + intel_dp_pps_init(encoder->dev, intel_dp); intel_edp_panel_vdd_sanitize(intel_dp); pps_unlock(intel_dp); @@ -5011,6 +5008,17 @@ intel_dp_init_panel_power_sequencer_registers(struct drm_device *dev, I915_READ(regs.pp_div)); } +static void intel_dp_pps_init(struct drm_device *dev, + struct intel_dp *intel_dp) +{ + if (IS_VALLEYVIEW(dev) || IS_CHERRYVIEW(dev)) { + vlv_initial_power_sequencer_setup(intel_dp); + } else { + intel_dp_init_panel_power_sequencer(dev, intel_dp); + intel_dp_init_panel_power_sequencer_registers(dev, intel_dp); + } +} + /** * intel_dp_set_drrs_state - program registers for RR switch to take effect * @dev: DRM device @@ -5425,14 +5433,7 @@ static bool intel_edp_init_connector(struct intel_dp *intel_dp, pps_lock(intel_dp); intel_dp_init_panel_power_timestamps(intel_dp); - - if (IS_VALLEYVIEW(dev) || IS_CHERRYVIEW(dev)) { - vlv_initial_power_sequencer_setup(intel_dp); - } else { - intel_dp_init_panel_power_sequencer(dev, intel_dp); - intel_dp_init_panel_power_sequencer_registers(dev, intel_dp); - } - + intel_dp_pps_init(dev, intel_dp); intel_edp_panel_vdd_sanitize(intel_dp); pps_unlock(intel_dp); From 8090ba8c216ff75c32ecb85c41adf3c5126d8a92 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 10 Aug 2016 14:07:33 +0300 Subject: [PATCH 026/141] drm/i915: Apply the PPS register unlock workaround more consistently MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Atm, we apply this workaround somewhat inconsistently at the following points: driver loading, LVDS init, eDP PPS init, system resume. As this workaround also affects registers other than PPS (timing, PLL) a more consistent way is to apply it early after the PPS HW context is known to be lost: driver loading, system resume and on VLV/CHV/BXT when turning on power domains. This is needed by the next patch that removes saving/restoring of the PP_CONTROL register. This also removes the incorrect programming of the workaround on HSW+ PCH platforms which don't have the register locking mechanism. v2: (Ville) - Don't apply the workaround on BXT. - Simplify platform checks using HAS_DDI(). 
v3: - Move the call of intel_pps_unlock_regs_wa() to the more logical vlv_display_power_well_init() (also fixing CHV) (Ville). Signed-off-by: Imre Deak Reviewed-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470827254-21954-5-git-send-email-imre.deak@intel.com --- drivers/gpu/drm/i915/i915_drv.c | 1 + drivers/gpu/drm/i915/intel_display.c | 26 +++++++++++++++++++++++++ drivers/gpu/drm/i915/intel_dp.c | 3 ++- drivers/gpu/drm/i915/intel_drv.h | 1 + drivers/gpu/drm/i915/intel_lvds.c | 8 -------- drivers/gpu/drm/i915/intel_runtime_pm.c | 4 ++++ 6 files changed, 34 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 8cfc264ec9f6..0fcd1c0f67bb 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1560,6 +1560,7 @@ static int i915_drm_resume(struct drm_device *dev) i915_gem_resume(dev); i915_restore_state(dev); + intel_pps_unlock_regs_wa(dev_priv); intel_opregion_setup(dev_priv); intel_init_pch_refclk(dev); diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index dc0d1b61dae4..c0509061f321 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -14729,6 +14729,30 @@ static bool intel_crt_present(struct drm_device *dev) return true; } +void intel_pps_unlock_regs_wa(struct drm_i915_private *dev_priv) +{ + int pps_num; + int pps_idx; + + if (HAS_DDI(dev_priv)) + return; + /* + * This w/a is needed at least on CPT/PPT, but to be sure apply it + * everywhere where registers can be write protected. + */ + if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) + pps_num = 2; + else + pps_num = 1; + + for (pps_idx = 0; pps_idx < pps_num; pps_idx++) { + u32 val = I915_READ(PP_CONTROL(pps_idx)); + + val = (val & ~PANEL_UNLOCK_MASK) | PANEL_UNLOCK_REGS; + I915_WRITE(PP_CONTROL(pps_idx), val); + } +} + static void intel_pps_init(struct drm_i915_private *dev_priv) { if (HAS_PCH_SPLIT(dev_priv) || IS_BROXTON(dev_priv)) @@ -14737,6 +14761,8 @@ static void intel_pps_init(struct drm_i915_private *dev_priv) dev_priv->pps_mmio_base = VLV_PPS_BASE; else dev_priv->pps_mmio_base = PPS_BASE; + + intel_pps_unlock_regs_wa(dev_priv); } static void intel_setup_outputs(struct drm_device *dev) diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 2ef7b14cc36e..364db908c191 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -1829,7 +1829,8 @@ static u32 ironlake_get_pp_control(struct intel_dp *intel_dp) lockdep_assert_held(&dev_priv->pps_mutex); control = I915_READ(_pp_ctrl_reg(intel_dp)); - if (!IS_BROXTON(dev)) { + if (WARN_ON(!HAS_DDI(dev_priv) && + (control & PANEL_UNLOCK_MASK) != PANEL_UNLOCK_REGS)) { control &= ~PANEL_UNLOCK_MASK; control |= PANEL_UNLOCK_REGS; } diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index b1fc67ed8bc2..65c97a3eef1d 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -1162,6 +1162,7 @@ void intel_mark_busy(struct drm_i915_private *dev_priv); void intel_mark_idle(struct drm_i915_private *dev_priv); void intel_crtc_restore_mode(struct drm_crtc *crtc); int intel_display_suspend(struct drm_device *dev); +void intel_pps_unlock_regs_wa(struct drm_i915_private *dev_priv); void intel_encoder_destroy(struct drm_encoder *encoder); int intel_connector_init(struct intel_connector *); struct intel_connector *intel_connector_alloc(void); diff --git 
a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index e79fae4bbc1f..668eabb0ba1b 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -978,14 +978,6 @@ void intel_lvds_init(struct drm_device *dev) int pipe; u8 pin; - /* - * Unlock registers and just leave them unlocked. Do this before - * checking quirk lists to avoid bogus WARNINGs. - */ - if (HAS_PCH_SPLIT(dev_priv) || INTEL_GEN(dev_priv) <= 4) - I915_WRITE(PP_CONTROL(0), - I915_READ(PP_CONTROL(0)) | PANEL_UNLOCK_REGS); - if (!intel_lvds_supported(dev)) return; diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index 1c603bbe5784..d659d6f5b8d3 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -592,6 +592,8 @@ void bxt_disable_dc9(struct drm_i915_private *dev_priv) DRM_DEBUG_KMS("Disabling DC9\n"); gen9_set_dc_state(dev_priv, DC_STATE_DISABLE); + + intel_pps_unlock_regs_wa(dev_priv); } static void assert_csr_loaded(struct drm_i915_private *dev_priv) @@ -1121,6 +1123,8 @@ static void vlv_display_power_well_init(struct drm_i915_private *dev_priv) } i915_redisable_vga_power_on(&dev_priv->drm); + + intel_pps_unlock_regs_wa(dev_priv); } static void vlv_display_power_well_deinit(struct drm_i915_private *dev_priv) From eebb40e0814fe20dba2ccb899557521cbf423711 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 10 Aug 2016 14:07:34 +0300 Subject: [PATCH 027/141] drm/i915: Remove LVDS and PPS suspend time save/restore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the preceding patches we made sure that: - the LVDS encoder takes care of reiniting both the LVDS register and its PPS - the eDP encoder takes care of reiniting its PPS - the PPS register unlocking workaround is applied explicitly whenever the PPS context is lost Based on the above we can safely remove the opaque LVDS and PPS save / restore from generic code. 
Signed-off-by: Imre Deak Reviewed-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470827254-21954-6-git-send-email-imre.deak@intel.com
--- drivers/gpu/drm/i915/i915_drv.h | 7 ------- drivers/gpu/drm/i915/i915_suspend.c | 31 ----------------------------- 2 files changed, 38 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 0ffab295a702..7f2754a070a5 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1061,13 +1061,6 @@ struct intel_gmbus { struct i915_suspend_saved_registers { u32 saveDSPARB; - u32 saveLVDS; - u32 savePP_ON_DELAYS; - u32 savePP_OFF_DELAYS; - u32 savePP_ON; - u32 savePP_OFF; - u32 savePP_CONTROL; - u32 savePP_DIVISOR; u32 saveFBC_CONTROL; u32 saveCACHE_MODE_0; u32 saveMI_ARB_STATE;
diff --git a/drivers/gpu/drm/i915/i915_suspend.c b/drivers/gpu/drm/i915/i915_suspend.c index c826b69ba0aa..4f272777b4f4 100644 --- a/drivers/gpu/drm/i915/i915_suspend.c +++ b/drivers/gpu/drm/i915/i915_suspend.c
@@ -37,20 +37,6 @@ static void i915_save_display(struct drm_device *dev) if (INTEL_INFO(dev)->gen <= 4) dev_priv->regfile.saveDSPARB = I915_READ(DSPARB); - /* LVDS state */ - if (HAS_PCH_IBX(dev) || HAS_PCH_CPT(dev)) - dev_priv->regfile.saveLVDS = I915_READ(PCH_LVDS); - else if (INTEL_INFO(dev)->gen <= 4 && IS_MOBILE(dev) && !IS_I830(dev)) - dev_priv->regfile.saveLVDS = I915_READ(LVDS); - - /* Panel power sequencer */ - if (HAS_PCH_SPLIT(dev_priv) || INTEL_GEN(dev_priv) <= 4) { - dev_priv->regfile.savePP_CONTROL = I915_READ(PP_CONTROL(0)); - dev_priv->regfile.savePP_ON_DELAYS = I915_READ(PP_ON_DELAYS(0)); - dev_priv->regfile.savePP_OFF_DELAYS = I915_READ(PP_OFF_DELAYS(0)); - dev_priv->regfile.savePP_DIVISOR = I915_READ(PP_DIVISOR(0)); - } - /* save FBC interval */ if (HAS_FBC(dev) && INTEL_INFO(dev)->gen <= 4 && !IS_G4X(dev)) dev_priv->regfile.saveFBC_CONTROL = I915_READ(FBC_CONTROL);
@@ -59,28 +45,11 @@ static void i915_save_display(struct drm_device *dev) static void i915_restore_display(struct drm_device *dev) { struct drm_i915_private *dev_priv = to_i915(dev); - u32 mask = 0xffffffff; /* Display arbitration */ if (INTEL_INFO(dev)->gen <= 4) I915_WRITE(DSPARB, dev_priv->regfile.saveDSPARB); - mask = ~LVDS_PORT_EN; - - /* LVDS state */ - if (HAS_PCH_IBX(dev) || HAS_PCH_CPT(dev)) - I915_WRITE(PCH_LVDS, dev_priv->regfile.saveLVDS & mask); - else if (INTEL_INFO(dev)->gen <= 4 && IS_MOBILE(dev) && !IS_I830(dev)) - I915_WRITE(LVDS, dev_priv->regfile.saveLVDS & mask); - - /* Panel power sequencer */ - if (HAS_PCH_SPLIT(dev_priv) || INTEL_GEN(dev_priv) <= 4) { - I915_WRITE(PP_ON_DELAYS(0), dev_priv->regfile.savePP_ON_DELAYS); - I915_WRITE(PP_OFF_DELAYS(0), dev_priv->regfile.savePP_OFF_DELAYS); - I915_WRITE(PP_DIVISOR(0), dev_priv->regfile.savePP_DIVISOR); - I915_WRITE(PP_CONTROL(0), dev_priv->regfile.savePP_CONTROL); - } - /* only restore FBC info on the platform that supports FBC*/ intel_fbc_global_disable(dev_priv);
From b5163dbb17fd0648600d2c840e53b76f5ca49a5b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 10 Aug 2016 13:58:24 +0100 Subject: [PATCH 028/141] drm/i915: Fix nesting of rps.mutex and struct_mutex during powersave init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
During intel_init_gt_powersave() we take the RPS mutex to ensure that all locking requirements are met as we talk to the punit, but we also require the struct_mutex for allocating a slice of the global GTT for a power context on Valleyview.
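For illustration, the nesting this patch enforces can be sketched in plain userspace C (a toy model only, not driver code; pthread mutexes stand in for struct_mutex and rps.hw_lock):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t struct_mutex = PTHREAD_MUTEX_INITIALIZER; /* outer */
static pthread_mutex_t rps_hw_lock = PTHREAD_MUTEX_INITIALIZER;  /* inner */

int main(void)
{
        pthread_mutex_lock(&struct_mutex);      /* always taken first */
        pthread_mutex_lock(&rps_hw_lock);       /* nested inside */
        puts("powersave init runs here");
        pthread_mutex_unlock(&rps_hw_lock);
        pthread_mutex_unlock(&struct_mutex);
        return 0;
}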
struct_mutex must be the outer lock here, as we nest rps.mutex inside later on.
Reported-by: Ville Syrjälä Fixes: 773ea9a80132 ("drm/i915: Perform static RPS frequency setup before...") Signed-off-by: Chris Wilson Cc: Ville Syrjälä Cc: Mika Kuoppala Link: http://patchwork.freedesktop.org/patch/msgid/1470833904-29886-1-git-send-email-chris@chris-wilson.co.uk Reviewed-by: Ville Syrjälä
--- drivers/gpu/drm/i915/intel_pm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index aef0b105eb58..f9591b99d8ca 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -5675,8 +5675,6 @@ static void valleyview_setup_pctx(struct drm_i915_private *dev_priv) u32 pcbr; int pctx_size = 24*1024; - mutex_lock(&dev_priv->drm.struct_mutex); - pcbr = I915_READ(VLV_PCBR); if (pcbr) { /* BIOS set it up already, grab the pre-alloc'd space */
@@ -5712,7 +5710,6 @@ static void valleyview_setup_pctx(struct drm_i915_private *dev_priv) out: DRM_DEBUG_DRIVER("PCBR: 0x%08x\n", I915_READ(VLV_PCBR)); dev_priv->vlv_pctx = pctx; - mutex_unlock(&dev_priv->drm.struct_mutex); } static void valleyview_cleanup_pctx(struct drm_i915_private *dev_priv)
@@ -6488,6 +6485,7 @@ void intel_init_gt_powersave(struct drm_i915_private *dev_priv) intel_runtime_pm_get(dev_priv); } + mutex_lock(&dev_priv->drm.struct_mutex); mutex_lock(&dev_priv->rps.hw_lock); /* Initialize RPS limits (for userspace) */
@@ -6529,6 +6527,7 @@ void intel_init_gt_powersave(struct drm_i915_private *dev_priv) dev_priv->rps.boost_freq = dev_priv->rps.max_freq; mutex_unlock(&dev_priv->rps.hw_lock); + mutex_unlock(&dev_priv->drm.struct_mutex); intel_autoenable_gt_powersave(dev_priv); }
From 737aac2465e9f44f8ab4f3b75f3220b524ae2a1b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 10 Aug 2016 13:41:45 +0100 Subject: [PATCH 029/141] drm/i915: Mark unmappable GGTT entries as PIN_HIGH
We allocate a few objects into the GGTT that we never need to access via the mappable aperture (such as contexts, status pages). We can request that these are bound high in the VM to increase the amount of mappable aperture available. However, for anything that may be frequently pinned (such as logical contexts) we want to use the fast search & insert.
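The effect of PIN_HIGH can be pictured with a toy range allocator (illustrative only; the numbers and helper below are made up, and this is not the GGTT allocator's real API):

#include <stdio.h>

#define GGTT_SIZE (256u << 20) /* hypothetical 256 MiB global GTT */

static unsigned int bottom, top = GGTT_SIZE;

/* PIN_HIGH-style allocations are carved downwards from the top, so the
 * low, CPU-mappable end of the aperture stays free for scanout etc. */
static unsigned int toy_alloc(unsigned int size, int pin_high)
{
        if (pin_high)
                return top -= size;
        bottom += size;
        return bottom - size;
}

int main(void)
{
        printf("context @ 0x%08x (high)\n", toy_alloc(4096, 1));
        printf("scanout @ 0x%08x (low, mappable)\n", toy_alloc(8u << 20, 0));
        return 0;
}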
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1470832906-13972-1-git-send-email-chris@chris-wilson.co.uk
--- drivers/gpu/drm/i915/intel_lrc.c | 2 +- drivers/gpu/drm/i915/intel_ringbuffer.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 309c5d9b1c57..c7f4b64b16f6 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1182,7 +1182,7 @@ static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *engine, u32 size) } ret = i915_gem_object_ggtt_pin(engine->wa_ctx.obj, NULL, - 0, PAGE_SIZE, 0); + 0, PAGE_SIZE, PIN_HIGH); if (ret) { DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n", ret);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 16b726fe33eb..09f01c641c14 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2093,7 +2093,7 @@ static int intel_ring_context_pin(struct i915_gem_context *ctx, if (ce->state) { ret = i915_gem_object_ggtt_pin(ce->state, NULL, 0, - ctx->ggtt_alignment, 0); + ctx->ggtt_alignment, PIN_HIGH); if (ret) goto error; }
@@ -2629,7 +2629,8 @@ static void intel_ring_init_semaphores(struct drm_i915_private *dev_priv, i915.semaphores = 0; } else { i915_gem_object_set_cache_level(obj, I915_CACHE_LLC); - ret = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0); + ret = i915_gem_object_ggtt_pin(obj, NULL, + 0, 0, PIN_HIGH); if (ret != 0) { i915_gem_object_put(obj); DRM_ERROR("Failed to pin semaphore bo. Disabling semaphores\n");
From 17f298cf5458045562157a8bb7a592dbdace3d95 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 10 Aug 2016 13:41:46 +0100 Subject: [PATCH 030/141] drm/i915: Move setting of request->batch into its single callsite
request->batch_obj is only set by execbuffer for the convenience of debugging hangs. By moving that operation to the callsite, we can simplify all other callers and future patches. We also move the complications of reference handling of the request->batch_obj next to where the active tracking is set up for the request.
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1470832906-13972-2-git-send-email-chris@chris-wilson.co.uk
--- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 10 +++++++++- drivers/gpu/drm/i915/i915_gem_request.c | 12 +----------- drivers/gpu/drm/i915/i915_gem_request.h | 8 +++----- 3 files changed, 13 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index c494b79ded20..c8d13fea4b25 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1702,6 +1702,14 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, goto err_batch_unpin; } + /* Whilst this request exists, batch_obj will be on the + * active_list, and so will hold the active reference. Only when this + * request is retired will the batch_obj be moved onto the + * inactive_list and lose its active reference. Hence we do not need + * to explicitly hold another reference here.
+ */ + params->request->batch_obj = params->batch->obj; + ret = i915_gem_request_add_to_client(params->request, file); if (ret) goto err_request; @@ -1720,7 +1728,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, ret = execbuf_submit(params, args, &eb->vmas); err_request: - __i915_add_request(params->request, params->batch->obj, ret == 0); + __i915_add_request(params->request, ret == 0); err_batch_unpin: /* diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 76314e527cfd..b764c1d440c8 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -461,9 +461,7 @@ static void i915_gem_mark_busy(const struct intel_engine_cs *engine) * request is not being tracked for completion but the work itself is * going to happen on the hardware. This would be a Bad Thing(tm). */ -void __i915_add_request(struct drm_i915_gem_request *request, - struct drm_i915_gem_object *obj, - bool flush_caches) +void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches) { struct intel_engine_cs *engine; struct intel_ring *ring; @@ -504,14 +502,6 @@ void __i915_add_request(struct drm_i915_gem_request *request, request->head = request_start; - /* Whilst this request exists, batch_obj will be on the - * active_list, and so will hold the active reference. Only when this - * request is retired will the the batch_obj be moved onto the - * inactive_list and lose its active reference. Hence we do not need - * to explicitly hold another reference here. - */ - request->batch_obj = obj; - /* Seal the request and mark it as pending execution. Note that * we may inspect this state, without holding any locks, during * hangcheck. Hence we apply the barrier to ensure that we do not diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index 4e91d4912443..85393be09c27 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -225,13 +225,11 @@ static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst, *pdst = src; } -void __i915_add_request(struct drm_i915_gem_request *req, - struct drm_i915_gem_object *batch_obj, - bool flush_caches); +void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches); #define i915_add_request(req) \ - __i915_add_request(req, NULL, true) + __i915_add_request(req, true) #define i915_add_request_no_flush(req) \ - __i915_add_request(req, NULL, false) + __i915_add_request(req, false) struct intel_rps_client; #define NO_WAITBOOST ERR_PTR(-1) From c1bb11451ed93aae53efcf461f936a54675dcbea Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Wed, 10 Aug 2016 16:22:10 +0100 Subject: [PATCH 031/141] drm/i915: Store number of active engines in device info Until now code was calling hweight32 to figure out the number from device_info->ring_mask at runtime. Instead we can cache it at engine init time and use directly. 
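The caching amounts to a one-time popcount; as a standalone sketch (with __builtin_popcount standing in for the kernel's hweight32() and a made-up mask value):

#include <stdio.h>

struct toy_device_info {
        unsigned int ring_mask;
        unsigned int num_rings; /* cached popcount of ring_mask */
};

int main(void)
{
        struct toy_device_info info = { .ring_mask = 0x1b }; /* 4 bits set */

        /* computed once at engine init instead of hweight32() on every use */
        info.num_rings = __builtin_popcount(info.ring_mask);
        printf("num_rings = %u\n", info.num_rings); /* prints 4 */
        return 0;
}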
Signed-off-by: Tvrtko Ursulin Reviewed-by: Dave Gordon Reviewed-by: Chris Wilson Link: http://patchwork.freedesktop.org/patch/msgid/1470842530-35854-1-git-send-email-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/i915_gem_context.c | 2 +- drivers/gpu/drm/i915/intel_engine_cs.c | 10 +++++----- drivers/gpu/drm/i915/intel_ringbuffer.c | 6 +++--- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index ade2446d924a..2a3d6d2817d8 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -3226,7 +3226,7 @@ static int i915_semaphore_status(struct seq_file *m, void *unused) struct drm_device *dev = node->minor->dev; struct drm_i915_private *dev_priv = to_i915(dev); struct intel_engine_cs *engine; - int num_rings = hweight32(INTEL_INFO(dev)->ring_mask); + int num_rings = INTEL_INFO(dev)->num_rings; enum intel_engine_id id; int j, ret; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 7f2754a070a5..7971c76852df 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -793,6 +793,7 @@ struct intel_device_info { u8 gen; u16 gen_mask; u8 ring_mask; /* Rings supported by the HW */ + u8 num_rings; DEV_INFO_FOR_EACH_FLAG(DEFINE_FLAG, SEP_SEMICOLON); /* Register offsets for the various display pipes and transcoders */ int pipe_offsets[I915_MAX_TRANSCODERS]; diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index bb72af5320b0..547caf26a6b9 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -568,7 +568,7 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags) const int num_rings = /* Use an extended w/a on ivb+ if signalling from other rings */ i915.semaphores ? - hweight32(INTEL_INFO(dev_priv)->ring_mask) - 1 : + INTEL_INFO(dev_priv)->num_rings - 1 : 0; int len, ret; diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 0dd3d1de18aa..186c12d07f99 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -109,6 +109,7 @@ intel_engine_setup(struct drm_i915_private *dev_priv, int intel_engines_init(struct drm_device *dev) { struct drm_i915_private *dev_priv = to_i915(dev); + struct intel_device_info *device_info = mkwrite_device_info(dev_priv); unsigned int mask = 0; int (*init)(struct intel_engine_cs *engine); unsigned int i; @@ -142,11 +143,10 @@ int intel_engines_init(struct drm_device *dev) * are added to the driver by a warning and disabling the forgotten * engines. 
*/ - if (WARN_ON(mask != INTEL_INFO(dev_priv)->ring_mask)) { - struct intel_device_info *info = - (struct intel_device_info *)&dev_priv->info; - info->ring_mask = mask; - } + if (WARN_ON(mask != INTEL_INFO(dev_priv)->ring_mask)) + device_info->ring_mask = mask; + + device_info->num_rings = hweight32(mask); return 0; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 09f01c641c14..ed19868df9c6 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1317,7 +1317,7 @@ static int gen8_rcs_signal(struct drm_i915_gem_request *req) enum intel_engine_id id; int ret, num_rings; - num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask); + num_rings = INTEL_INFO(dev_priv)->num_rings; ret = intel_ring_begin(req, (num_rings-1) * 8); if (ret) return ret; @@ -1354,7 +1354,7 @@ static int gen8_xcs_signal(struct drm_i915_gem_request *req) enum intel_engine_id id; int ret, num_rings; - num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask); + num_rings = INTEL_INFO(dev_priv)->num_rings; ret = intel_ring_begin(req, (num_rings-1) * 6); if (ret) return ret; @@ -1389,7 +1389,7 @@ static int gen6_signal(struct drm_i915_gem_request *req) enum intel_engine_id id; int ret, num_rings; - num_rings = hweight32(INTEL_INFO(dev_priv)->ring_mask); + num_rings = INTEL_INFO(dev_priv)->num_rings; ret = intel_ring_begin(req, round_up((num_rings-1) * 3, 2)); if (ret) return ret; From 5e334c199a258b8309f1073b3a6432cd0256f9b1 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Wed, 10 Aug 2016 16:16:46 +0100 Subject: [PATCH 032/141] drm/i915/guc: Consolidate firmware major-minor to one place Currently to change the firmware one has to update the exported module firmware string and the major-minor versions used for verification after load. Consolidate that to a single place defining correct major and minor versions per platform. v2: Rebased for KBL. 
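To see what the consolidated macro expands to, here is a small userspace demo (the two-level __stringify follows the kernel idiom, so macro arguments expand before stringification):

#include <stdio.h>

#define __stringify_1(x) #x
#define __stringify(x) __stringify_1(x)

#define GUC_FW_PATH(platform, major, minor) \
        "i915/" __stringify(platform) "_guc_ver" \
        __stringify(major) "_" __stringify(minor) ".bin"

#define SKL_FW_MAJOR 6
#define SKL_FW_MINOR 1

int main(void)
{
        /* prints "i915/skl_guc_ver6_1.bin", matching the old literal */
        puts(GUC_FW_PATH(skl, SKL_FW_MAJOR, SKL_FW_MINOR));
        return 0;
}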
Signed-off-by: Tvrtko Ursulin Cc: Dave Gordon Cc: Rodrigo Vivi Cc: Peter Antoine Cc: Michel Thierry Reviewed-by: Dave Gordon Link: http://patchwork.freedesktop.org/patch/msgid/1470842206-35685-1-git-send-email-tvrtko.ursulin@linux.intel.com
--- drivers/gpu/drm/i915/intel_guc_loader.c | 30 +++++++++++++++++-------- 1 file changed, 21 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c index 3763e30cc165..bfcf6b5d29ad 100644 --- a/drivers/gpu/drm/i915/intel_guc_loader.c +++ b/drivers/gpu/drm/i915/intel_guc_loader.c
@@ -59,13 +59,25 @@ * */ -#define I915_SKL_GUC_UCODE "i915/skl_guc_ver6_1.bin" +#define SKL_FW_MAJOR 6 +#define SKL_FW_MINOR 1 + +#define BXT_FW_MAJOR 8 +#define BXT_FW_MINOR 7 + +#define KBL_FW_MAJOR 9 +#define KBL_FW_MINOR 14 + +#define GUC_FW_PATH(platform, major, minor) \ + "i915/" __stringify(platform) "_guc_ver" __stringify(major) "_" __stringify(minor) ".bin" + +#define I915_SKL_GUC_UCODE GUC_FW_PATH(skl, SKL_FW_MAJOR, SKL_FW_MINOR) MODULE_FIRMWARE(I915_SKL_GUC_UCODE); -#define I915_BXT_GUC_UCODE "i915/bxt_guc_ver8_7.bin" +#define I915_BXT_GUC_UCODE GUC_FW_PATH(bxt, BXT_FW_MAJOR, BXT_FW_MINOR) MODULE_FIRMWARE(I915_BXT_GUC_UCODE); -#define I915_KBL_GUC_UCODE "i915/kbl_guc_ver9_14.bin" +#define I915_KBL_GUC_UCODE GUC_FW_PATH(kbl, KBL_FW_MAJOR, KBL_FW_MINOR) MODULE_FIRMWARE(I915_KBL_GUC_UCODE); /* User-friendly representation of an enum */
@@ -697,16 +709,16 @@ void intel_guc_init(struct drm_device *dev) fw_path = NULL; } else if (IS_SKYLAKE(dev)) { fw_path = I915_SKL_GUC_UCODE; - guc_fw->guc_fw_major_wanted = 6; - guc_fw->guc_fw_minor_wanted = 1; + guc_fw->guc_fw_major_wanted = SKL_FW_MAJOR; + guc_fw->guc_fw_minor_wanted = SKL_FW_MINOR; } else if (IS_BROXTON(dev)) { fw_path = I915_BXT_GUC_UCODE; - guc_fw->guc_fw_major_wanted = 8; - guc_fw->guc_fw_minor_wanted = 7; + guc_fw->guc_fw_major_wanted = BXT_FW_MAJOR; + guc_fw->guc_fw_minor_wanted = BXT_FW_MINOR; } else if (IS_KABYLAKE(dev)) { fw_path = I915_KBL_GUC_UCODE; - guc_fw->guc_fw_major_wanted = 9; - guc_fw->guc_fw_minor_wanted = 14; + guc_fw->guc_fw_major_wanted = KBL_FW_MAJOR; + guc_fw->guc_fw_minor_wanted = KBL_FW_MINOR; } else { fw_path = ""; /* unknown device */ }
From d721b02fd00bf133580f431b82ef37f3b746dfb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 8 Aug 2016 13:58:39 +0300 Subject: [PATCH 033/141] drm/i915: Account for TSEG size when determining 865G stolen base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
Looks like the TSEG lives just above TOUD, and stolen comes after TSEG. The spec seems somewhat self-contradictory in places; in the ESMRAMC register description it says: TSEG Size: 10 = (TOUD + 512 KB) to TOUD, 11 = (TOUD + 1 MB) to TOUD so that agrees with TSEG being at TOUD. But the example given elsewhere in the spec says: TOUD equals 62.5 MB = 03E7FFFFh TSEG selected as 512 KB in size, Graphics local memory selected as 1 MB in size General System RAM available in system = 62.5 MB General system RAM range 00000000h to 03E7FFFFh TSEG address range 03F80000h to 03FFFFFFh TSEG pre-allocated from 03F80000h to 03FFFFFFh Graphics local memory pre-allocated from 03E80000h to 03F7FFFFh so here we have TSEG above stolen. Real world evidence agrees with the TOUD->TSEG->stolen order however, so let's fix up the code to account for the TSEG size. Cc: Taketo Kabe Cc: Chris Wilson Cc: Daniel Vetter Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H.
Peter Anvin" Cc: x86@kernel.org Cc: stable@vger.kernel.org Fixes: 0ad98c74e093 ("drm/i915: Determine the stolen memory base address on gen2") Fixes: a4dff76924fe ("x86/gpu: Add Intel graphics stolen memory quirk for gen2 platforms") Reported-by: Taketo Kabe Tested-by: Taketo Kabe Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=96473 Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470653919-27251-1-git-send-email-ville.syrjala@linux.intel.com Link: http://download.intel.com/design/chipsets/datashts/25251405.pdf Reviewed-by: Chris Wilson --- arch/x86/kernel/early-quirks.c | 9 ++------- drivers/gpu/drm/i915/i915_gem_stolen.c | 23 +++++++++++++++++------ 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index de7501edb21c..8b8852bc2f4a 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -317,16 +317,11 @@ static phys_addr_t __init i85x_stolen_base(int num, int slot, int func, static phys_addr_t __init i865_stolen_base(int num, int slot, int func, size_t stolen_size) { - u16 toud; + u16 toud = 0; - /* - * FIXME is the graphics stolen memory region - * always at TOUD? Ie. is it always the last - * one to be allocated by the BIOS? - */ toud = read_pci_config_16(0, 0, 0, I865_TOUD); - return (phys_addr_t)toud << 16; + return (phys_addr_t)(toud << 16) + i845_tseg_size(); } static phys_addr_t __init gen3_stolen_base(int num, int slot, int func, diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c index 13279610eeec..f7633585ea38 100644 --- a/drivers/gpu/drm/i915/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/i915_gem_stolen.c @@ -115,17 +115,28 @@ static unsigned long i915_stolen_to_physical(struct drm_device *dev) base = bsm & INTEL_BSM_MASK; } else if (IS_I865G(dev)) { + u32 tseg_size = 0; u16 toud = 0; + u8 tmp; + + pci_bus_read_config_byte(dev->pdev->bus, PCI_DEVFN(0, 0), + I845_ESMRAMC, &tmp); + + if (tmp & TSEG_ENABLE) { + switch (tmp & I845_TSEG_SIZE_MASK) { + case I845_TSEG_SIZE_512K: + tseg_size = KB(512); + break; + case I845_TSEG_SIZE_1M: + tseg_size = MB(1); + break; + } + } - /* - * FIXME is the graphics stolen memory region - * always at TOUD? Ie. is it always the last - * one to be allocated by the BIOS? - */ pci_bus_read_config_word(dev->pdev->bus, PCI_DEVFN(0, 0), I865_TOUD, &toud); - base = toud << 16; + base = (toud << 16) + tseg_size; } else if (IS_I85X(dev)) { u32 tseg_size = 0; u32 tom; From 6687c9062c46c83e5a07df65015eb4fc9dc76524 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 15 Sep 2015 13:16:41 +0300 Subject: [PATCH 034/141] drm/i915: Rewrite fb rotation GTT handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Redo the fb rotation handling in order to: - eliminate the NV12 special casing - handle fb->offsets[] properly - make the rotation handling easier for the plane code To achieve these goals we reduce intel_rotation_info to only contain (for each plane) the rotated view width,height,stride in tile units, and the page offset into the object where the plane starts. Each plane is handled exactly the same way, no special casing for NV12 or other formats. We then store the computed rotation_info under intel_framebuffer so that we don't have to recompute it again. 
To handle fb->offsets[] we treat them as linear offsets and convert them to x/y offsets from the start of the relevant GTT mapping (either normal or rotated). We store the x/y offsets under intel_framebuffer, and for some extra convenience we also store the rotated pitch (i.e. the tile aligned plane height). So for each plane we have the normal x/y offsets, rotated x/y offsets, and the rotated pitch. The normal pitch is available already in fb->pitches[]. While we're gathering up all that extra information, we can also easily compute the storage requirements for the framebuffer, so that we can check that the object is big enough to hold it. When it comes time to deal with the plane source coordinates, we first rotate the clipped src coordinates to match the relevant GTT view orientation, then add to them the fb x/y offsets. Next we compute the aligned surface page offset, and as a result we're left with some residual x/y offsets. Finally, if required by the hardware, we convert the remaining x/y offsets into a linear offset. For gen2/3 we simply skip computing the final page offset, and just convert the src+fb x/y offsets directly into a linear offset since that's what the hardware wants. After this all platforms, including SKL+, compute these things in exactly the same way (excluding alignment differences).
v2: Use BIT(DRM_ROTATE_270) instead of ROTATE_270 when rotating plane src coordinates Drop some spurious changes that got left behind during development
v3: Split out more changes to prep patches (Daniel) s/intel_fb->plane[].foo.bar/intel_fb->foo[].bar/ for brevity Rename intel_surf_gtt_offset to intel_fb_gtt_offset Kill the pointless 'plane' parameter from intel_fb_gtt_offset()
v4: Fix alignment vs. alignment-1 when calling _intel_compute_tile_offset() from intel_fill_fb_info() Pass the pitch in tiles instead of pixels to intel_adjust_tile_offset() from intel_fill_fb_info() Pass the full width/height of the rotated area to drm_rect_rotate() for clarity Use u32 for more offsets
v5: Preserve the upper_32_bits()/lower_32_bits() handling for the fb ggtt offset (Sivakumar)
v6: Rebase due to drm_plane_state src/dst rects
Cc: Sivakumar Thulasimani Signed-off-by: Ville Syrjälä Reviewed-by: Sivakumar Thulasimani Link: http://patchwork.freedesktop.org/patch/msgid/1470821001-25272-2-git-send-email-ville.syrjala@linux.intel.com Acked-by: Daniel Vetter
--- drivers/gpu/drm/i915/i915_gem_gtt.c | 51 +--- drivers/gpu/drm/i915/i915_gem_gtt.h | 5 +- drivers/gpu/drm/i915/intel_display.c | 368 ++++++++++++++++++--------- drivers/gpu/drm/i915/intel_drv.h | 19 +- drivers/gpu/drm/i915/intel_sprite.c | 95 +++---- 5 files changed, 330 insertions(+), 208 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 18c7c9644761..d876501694c6 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -3449,18 +3449,16 @@ rotate_pages(const dma_addr_t *in, unsigned int offset, } static struct sg_table * -intel_rotate_fb_obj_pages(struct intel_rotation_info *rot_info, +intel_rotate_fb_obj_pages(const struct intel_rotation_info *rot_info, struct drm_i915_gem_object *obj) { const size_t n_pages = obj->base.size / PAGE_SIZE; - unsigned int size_pages = rot_info->plane[0].width * rot_info->plane[0].height; - unsigned int size_pages_uv; + unsigned int size = intel_rotation_info_size(rot_info); struct sgt_iter sgt_iter; dma_addr_t dma_addr; unsigned long i; dma_addr_t *page_addr_list; struct sg_table *st; - unsigned int uv_start_page; struct
scatterlist *sg; int ret = -ENOMEM; @@ -3471,18 +3469,12 @@ intel_rotate_fb_obj_pages(struct intel_rotation_info *rot_info, if (!page_addr_list) return ERR_PTR(ret); - /* Account for UV plane with NV12. */ - if (rot_info->pixel_format == DRM_FORMAT_NV12) - size_pages_uv = rot_info->plane[1].width * rot_info->plane[1].height; - else - size_pages_uv = 0; - /* Allocate target SG list. */ st = kmalloc(sizeof(*st), GFP_KERNEL); if (!st) goto err_st_alloc; - ret = sg_alloc_table(st, size_pages + size_pages_uv, GFP_KERNEL); + ret = sg_alloc_table(st, size, GFP_KERNEL); if (ret) goto err_sg_alloc; @@ -3495,32 +3487,14 @@ intel_rotate_fb_obj_pages(struct intel_rotation_info *rot_info, st->nents = 0; sg = st->sgl; - /* Rotate the pages. */ - sg = rotate_pages(page_addr_list, 0, - rot_info->plane[0].width, rot_info->plane[0].height, - rot_info->plane[0].width, - st, sg); - - /* Append the UV plane if NV12. */ - if (rot_info->pixel_format == DRM_FORMAT_NV12) { - uv_start_page = size_pages; - - /* Check for tile-row un-alignment. */ - if (offset_in_page(rot_info->uv_offset)) - uv_start_page--; - - rot_info->uv_start_page = uv_start_page; - - sg = rotate_pages(page_addr_list, rot_info->uv_start_page, - rot_info->plane[1].width, rot_info->plane[1].height, - rot_info->plane[1].width, - st, sg); + for (i = 0 ; i < ARRAY_SIZE(rot_info->plane); i++) { + sg = rotate_pages(page_addr_list, rot_info->plane[i].offset, + rot_info->plane[i].width, rot_info->plane[i].height, + rot_info->plane[i].stride, st, sg); } - DRM_DEBUG_KMS("Created rotated page mapping for object size %zu (%ux%u tiles, %u pages (%u plane 0)).\n", - obj->base.size, rot_info->plane[0].width, - rot_info->plane[0].height, size_pages + size_pages_uv, - size_pages); + DRM_DEBUG_KMS("Created rotated page mapping for object size %zu (%ux%u tiles, %u pages)\n", + obj->base.size, rot_info->plane[0].width, rot_info->plane[0].height, size); drm_free_large(page_addr_list); @@ -3531,10 +3505,9 @@ err_sg_alloc: err_st_alloc: drm_free_large(page_addr_list); - DRM_DEBUG_KMS("Failed to create rotated mapping for object size %zu! (%d) (%ux%u tiles, %u pages (%u plane 0))\n", - obj->base.size, ret, rot_info->plane[0].width, - rot_info->plane[0].height, size_pages + size_pages_uv, - size_pages); + DRM_DEBUG_KMS("Failed to create rotated mapping for object size %zu! 
(%ux%u tiles, %u pages)\n", + obj->base.size, rot_info->plane[0].width, rot_info->plane[0].height, size); + return ERR_PTR(ret); } diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h index cc56206a1600..56e64a5355e8 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.h +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h @@ -139,12 +139,9 @@ enum i915_ggtt_view_type { }; struct intel_rotation_info { - unsigned int uv_offset; - uint32_t pixel_format; - unsigned int uv_start_page; struct { /* tiles */ - unsigned int width, height; + unsigned int width, height, stride, offset; } plane[2]; }; diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index c0509061f321..7c61ab4e5c64 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2147,33 +2147,6 @@ intel_fill_fb_ggtt_view(struct i915_ggtt_view *view, } } -static void -intel_fill_fb_info(struct drm_i915_private *dev_priv, - struct drm_framebuffer *fb) -{ - struct intel_rotation_info *info = &to_intel_framebuffer(fb)->rot_info; - unsigned int tile_size, tile_width, tile_height, cpp; - - tile_size = intel_tile_size(dev_priv); - - cpp = drm_format_plane_cpp(fb->pixel_format, 0); - intel_tile_dims(dev_priv, &tile_width, &tile_height, - fb->modifier[0], cpp); - - info->plane[0].width = DIV_ROUND_UP(fb->pitches[0], tile_width * cpp); - info->plane[0].height = DIV_ROUND_UP(fb->height, tile_height); - - if (info->pixel_format == DRM_FORMAT_NV12) { - cpp = drm_format_plane_cpp(fb->pixel_format, 1); - intel_tile_dims(dev_priv, &tile_width, &tile_height, - fb->modifier[1], cpp); - - info->uv_offset = fb->offsets[1]; - info->plane[1].width = DIV_ROUND_UP(fb->pitches[1], tile_width * cpp); - info->plane[1].height = DIV_ROUND_UP(fb->height / 2, tile_height); - } -} - static unsigned int intel_linear_alignment(const struct drm_i915_private *dev_priv) { if (INTEL_INFO(dev_priv)->gen >= 9) @@ -2294,6 +2267,42 @@ void intel_unpin_fb_obj(struct drm_framebuffer *fb, unsigned int rotation) i915_gem_object_unpin_from_display_plane(obj, &view); } +/* + * Convert the x/y offsets into a linear offset. + * Only valid with 0/180 degree rotation, which is fine since linear + * offset is only used with linear buffers on pre-hsw and tiled buffers + * with gen2/3, and 90/270 degree rotations isn't supported on any of them. + */ +u32 intel_fb_xy_to_linear(int x, int y, + const struct drm_framebuffer *fb, int plane) +{ + unsigned int cpp = drm_format_plane_cpp(fb->pixel_format, plane); + unsigned int pitch = fb->pitches[plane]; + + return y * pitch + x * cpp; +} + +/* + * Add the x/y offsets derived from fb->offsets[] to the user + * specified plane src x/y offsets. The resulting x/y offsets + * specify the start of scanout from the beginning of the gtt mapping. + */ +void intel_add_fb_offsets(int *x, int *y, + const struct drm_framebuffer *fb, int plane, + unsigned int rotation) + +{ + const struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); + + if (intel_rotation_90_or_270(rotation)) { + *x += intel_fb->rotated[plane].x; + *y += intel_fb->rotated[plane].y; + } else { + *x += intel_fb->normal[plane].x; + *y += intel_fb->normal[plane].y; + } +} + /* * Adjust the tile offset by moving the difference into * the x/y offsets. @@ -2330,18 +2339,24 @@ static u32 intel_adjust_tile_offset(int *x, int *y, * In the 90/270 rotated case, x and y are assumed * to be already rotated to match the rotated GTT view, and * pitch is the tile_height aligned framebuffer height. 
+ * + * This function is used when computing the derived information + * under intel_framebuffer, so using any of that information + * here is not allowed. Anything under drm_framebuffer can be + * used. This is why the user has to pass in the pitch since it + * is specified in the rotated orientation. */ -u32 intel_compute_tile_offset(int *x, int *y, - const struct drm_framebuffer *fb, int plane, - unsigned int pitch, - unsigned int rotation) +static u32 _intel_compute_tile_offset(const struct drm_i915_private *dev_priv, + int *x, int *y, + const struct drm_framebuffer *fb, int plane, + unsigned int pitch, + unsigned int rotation, + u32 alignment) { - const struct drm_i915_private *dev_priv = to_i915(fb->dev); uint64_t fb_modifier = fb->modifier[plane]; unsigned int cpp = drm_format_plane_cpp(fb->pixel_format, plane); - u32 offset, offset_aligned, alignment; + u32 offset, offset_aligned; - alignment = intel_surf_alignment(dev_priv, fb_modifier); if (alignment) alignment--; @@ -2383,6 +2398,141 @@ u32 intel_compute_tile_offset(int *x, int *y, return offset_aligned; } +u32 intel_compute_tile_offset(int *x, int *y, + const struct drm_framebuffer *fb, int plane, + unsigned int pitch, + unsigned int rotation) +{ + const struct drm_i915_private *dev_priv = to_i915(fb->dev); + u32 alignment = intel_surf_alignment(dev_priv, fb->modifier[plane]); + + return _intel_compute_tile_offset(dev_priv, x, y, fb, plane, pitch, + rotation, alignment); +} + +/* Convert the fb->offset[] linear offset into x/y offsets */ +static void intel_fb_offset_to_xy(int *x, int *y, + const struct drm_framebuffer *fb, int plane) +{ + unsigned int cpp = drm_format_plane_cpp(fb->pixel_format, plane); + unsigned int pitch = fb->pitches[plane]; + u32 linear_offset = fb->offsets[plane]; + + *y = linear_offset / pitch; + *x = linear_offset % pitch / cpp; +} + +static int +intel_fill_fb_info(struct drm_i915_private *dev_priv, + struct drm_framebuffer *fb) +{ + struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); + struct intel_rotation_info *rot_info = &intel_fb->rot_info; + u32 gtt_offset_rotated = 0; + unsigned int max_size = 0; + uint32_t format = fb->pixel_format; + int i, num_planes = drm_format_num_planes(format); + unsigned int tile_size = intel_tile_size(dev_priv); + + for (i = 0; i < num_planes; i++) { + unsigned int width, height; + unsigned int cpp, size; + u32 offset; + int x, y; + + cpp = drm_format_plane_cpp(format, i); + width = drm_format_plane_width(fb->width, format, i); + height = drm_format_plane_height(fb->height, format, i); + + intel_fb_offset_to_xy(&x, &y, fb, i); + + /* + * First pixel of the framebuffer from + * the start of the normal gtt mapping. 
+ */ + intel_fb->normal[i].x = x; + intel_fb->normal[i].y = y; + + offset = _intel_compute_tile_offset(dev_priv, &x, &y, + fb, 0, fb->pitches[i], + BIT(DRM_ROTATE_0), tile_size); + offset /= tile_size; + + if (fb->modifier[i] != DRM_FORMAT_MOD_NONE) { + unsigned int tile_width, tile_height; + unsigned int pitch_tiles; + struct drm_rect r; + + intel_tile_dims(dev_priv, &tile_width, &tile_height, + fb->modifier[i], cpp); + + rot_info->plane[i].offset = offset; + rot_info->plane[i].stride = DIV_ROUND_UP(fb->pitches[i], tile_width * cpp); + rot_info->plane[i].width = DIV_ROUND_UP(x + width, tile_width); + rot_info->plane[i].height = DIV_ROUND_UP(y + height, tile_height); + + intel_fb->rotated[i].pitch = + rot_info->plane[i].height * tile_height; + + /* how many tiles does this plane need */ + size = rot_info->plane[i].stride * rot_info->plane[i].height; + /* + * If the plane isn't horizontally tile aligned, + * we need one more tile. + */ + if (x != 0) + size++; + + /* rotate the x/y offsets to match the GTT view */ + r.x1 = x; + r.y1 = y; + r.x2 = x + width; + r.y2 = y + height; + drm_rect_rotate(&r, + rot_info->plane[i].width * tile_width, + rot_info->plane[i].height * tile_height, + BIT(DRM_ROTATE_270)); + x = r.x1; + y = r.y1; + + /* rotate the tile dimensions to match the GTT view */ + pitch_tiles = intel_fb->rotated[i].pitch / tile_height; + swap(tile_width, tile_height); + + /* + * We only keep the x/y offsets, so push all of the + * gtt offset into the x/y offsets. + */ + intel_adjust_tile_offset(&x, &y, tile_size, + tile_width, tile_height, pitch_tiles, + gtt_offset_rotated * tile_size, 0); + + gtt_offset_rotated += rot_info->plane[i].width * rot_info->plane[i].height; + + /* + * First pixel of the framebuffer from + * the start of the rotated gtt mapping. 
+ */ + intel_fb->rotated[i].x = x; + intel_fb->rotated[i].y = y; + } else { + size = DIV_ROUND_UP((y + height) * fb->pitches[i] + + x * cpp, tile_size); + } + + /* how many tiles in total needed in the bo */ + max_size = max(max_size, offset + size); + } + + if (max_size * tile_size > to_intel_framebuffer(fb)->obj->base.size) { + DRM_DEBUG("fb too big for bo (need %u bytes, have %zu bytes)\n", + max_size * tile_size, to_intel_framebuffer(fb)->obj->base.size); + return -EINVAL; + } + + return 0; +} + static int i9xx_format_to_fourcc(int format) { switch (format) { @@ -2618,7 +2768,6 @@ static void i9xx_update_primary_plane(struct drm_plane *primary, u32 dspcntr; i915_reg_t reg = DSPCNTR(plane); unsigned int rotation = plane_state->base.rotation; - int cpp = drm_format_plane_cpp(fb->pixel_format, 0); int x = plane_state->src.x1 >> 16; int y = plane_state->src.y1 >> 16; @@ -2677,30 +2826,25 @@ static void i9xx_update_primary_plane(struct drm_plane *primary, if (IS_G4X(dev)) dspcntr |= DISPPLANE_TRICKLE_FEED_DISABLE; - linear_offset = y * fb->pitches[0] + x * cpp; + intel_add_fb_offsets(&x, &y, fb, 0, rotation); - if (INTEL_INFO(dev)->gen >= 4) { + if (INTEL_INFO(dev)->gen >= 4) intel_crtc->dspaddr_offset = intel_compute_tile_offset(&x, &y, fb, 0, fb->pitches[0], rotation); - linear_offset -= intel_crtc->dspaddr_offset; - } else { - intel_crtc->dspaddr_offset = linear_offset; - } if (rotation == BIT(DRM_ROTATE_180)) { dspcntr |= DISPPLANE_ROTATE_180; x += (crtc_state->pipe_src_w - 1); y += (crtc_state->pipe_src_h - 1); - - /* Finding the last pixel of the last line of the display - data and adding to linear_offset*/ - linear_offset += - (crtc_state->pipe_src_h - 1) * fb->pitches[0] + - (crtc_state->pipe_src_w - 1) * cpp; } + linear_offset = intel_fb_xy_to_linear(x, y, fb, 0); + + if (INTEL_INFO(dev)->gen < 4) + intel_crtc->dspaddr_offset = linear_offset; + intel_crtc->adjusted_x = x; intel_crtc->adjusted_y = y; @@ -2709,7 +2853,8 @@ static void i9xx_update_primary_plane(struct drm_plane *primary, I915_WRITE(DSPSTRIDE(plane), fb->pitches[0]); if (INTEL_INFO(dev)->gen >= 4) { I915_WRITE(DSPSURF(plane), - i915_gem_obj_ggtt_offset(obj) + intel_crtc->dspaddr_offset); + intel_fb_gtt_offset(fb, rotation) + + intel_crtc->dspaddr_offset); I915_WRITE(DSPTILEOFF(plane), (y << 16) | x); I915_WRITE(DSPLINOFF(plane), linear_offset); } else @@ -2747,7 +2892,6 @@ static void ironlake_update_primary_plane(struct drm_plane *primary, u32 dspcntr; i915_reg_t reg = DSPCNTR(plane); unsigned int rotation = plane_state->base.rotation; - int cpp = drm_format_plane_cpp(fb->pixel_format, 0); int x = plane_state->src.x1 >> 16; int y = plane_state->src.y1 >> 16; @@ -2786,26 +2930,23 @@ static void ironlake_update_primary_plane(struct drm_plane *primary, if (!IS_HASWELL(dev) && !IS_BROADWELL(dev)) dspcntr |= DISPPLANE_TRICKLE_FEED_DISABLE; - linear_offset = y * fb->pitches[0] + x * cpp; + intel_add_fb_offsets(&x, &y, fb, 0, rotation); + intel_crtc->dspaddr_offset = intel_compute_tile_offset(&x, &y, fb, 0, fb->pitches[0], rotation); - linear_offset -= intel_crtc->dspaddr_offset; + if (rotation == BIT(DRM_ROTATE_180)) { dspcntr |= DISPPLANE_ROTATE_180; if (!IS_HASWELL(dev) && !IS_BROADWELL(dev)) { x += (crtc_state->pipe_src_w - 1); y += (crtc_state->pipe_src_h - 1); - - /* Finding the last pixel of the last line of the display - data and adding to linear_offset*/ - linear_offset += - (crtc_state->pipe_src_h - 1) * fb->pitches[0] + - (crtc_state->pipe_src_w - 1) * cpp; } } + linear_offset = intel_fb_xy_to_linear(x, y, fb, 0); 
+ intel_crtc->adjusted_x = x; intel_crtc->adjusted_y = y; @@ -2813,7 +2954,8 @@ static void ironlake_update_primary_plane(struct drm_plane *primary, I915_WRITE(DSPSTRIDE(plane), fb->pitches[0]); I915_WRITE(DSPSURF(plane), - i915_gem_obj_ggtt_offset(obj) + intel_crtc->dspaddr_offset); + intel_fb_gtt_offset(fb, rotation) + + intel_crtc->dspaddr_offset); if (IS_HASWELL(dev) || IS_BROADWELL(dev)) { I915_WRITE(DSPOFFSET(plane), (y << 16) | x); } else { @@ -2835,28 +2977,16 @@ u32 intel_fb_stride_alignment(const struct drm_i915_private *dev_priv, } } -u32 intel_plane_obj_offset(struct intel_plane *intel_plane, - struct drm_i915_gem_object *obj, - unsigned int plane) +u32 intel_fb_gtt_offset(struct drm_framebuffer *fb, + unsigned int rotation) { + struct drm_i915_gem_object *obj = intel_fb_obj(fb); struct i915_ggtt_view view; - struct i915_vma *vma; u64 offset; - intel_fill_fb_ggtt_view(&view, intel_plane->base.state->fb, - intel_plane->base.state->rotation); + intel_fill_fb_ggtt_view(&view, fb, rotation); - vma = i915_gem_obj_to_ggtt_view(obj, &view); - if (WARN(!vma, "ggtt vma for display object not found! (view=%u)\n", - view.type)) - return -1; - - offset = vma->node.start; - - if (plane == 1) { - offset += vma->ggtt_view.params.rotated.uv_start_page * - PAGE_SIZE; - } + offset = i915_gem_obj_ggtt_offset_view(obj, &view); WARN_ON(upper_32_bits(offset)); @@ -2979,12 +3109,9 @@ static void skylake_update_primary_plane(struct drm_plane *plane, struct drm_i915_private *dev_priv = to_i915(dev); struct intel_crtc *intel_crtc = to_intel_crtc(crtc_state->base.crtc); struct drm_framebuffer *fb = plane_state->base.fb; - struct drm_i915_gem_object *obj = intel_fb_obj(fb); int pipe = intel_crtc->pipe; u32 plane_ctl, stride_div, stride; - u32 tile_height, plane_offset, plane_size; unsigned int rotation = plane_state->base.rotation; - int x_offset, y_offset; u32 surf_addr; int scaler_id = plane_state->scaler_id; int src_x = plane_state->src.x1 >> 16; @@ -3005,36 +3132,49 @@ static void skylake_update_primary_plane(struct drm_plane *plane, plane_ctl |= PLANE_CTL_PLANE_GAMMA_DISABLE; plane_ctl |= skl_plane_ctl_rotation(rotation); - stride_div = intel_fb_stride_alignment(dev_priv, fb->modifier[0], - fb->pixel_format); - surf_addr = intel_plane_obj_offset(to_intel_plane(plane), obj, 0); - - WARN_ON(drm_rect_width(&plane_state->src) == 0); - if (intel_rotation_90_or_270(rotation)) { + struct drm_rect r = { + .x1 = src_x, + .x2 = src_x + src_w, + .y1 = src_y, + .y2 = src_y + src_h, + }; int cpp = drm_format_plane_cpp(fb->pixel_format, 0); + struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); - /* stride = Surface height in tiles */ - tile_height = intel_tile_height(dev_priv, fb->modifier[0], cpp); - stride = DIV_ROUND_UP(fb->height, tile_height); - x_offset = stride * tile_height - src_y - src_h; - y_offset = src_x; - plane_size = (src_w - 1) << 16 | (src_h - 1); + /* Rotate src coordinates to match rotated GTT view */ + drm_rect_rotate(&r, fb->width, fb->height, BIT(DRM_ROTATE_270)); + + src_x = r.x1; + src_y = r.y1; + src_w = drm_rect_width(&r); + src_h = drm_rect_height(&r); + + stride_div = intel_tile_height(dev_priv, fb->modifier[0], cpp); + stride = intel_fb->rotated[0].pitch; } else { - stride = fb->pitches[0] / stride_div; - x_offset = src_x; - y_offset = src_y; - plane_size = (src_h - 1) << 16 | (src_w - 1); + stride_div = intel_fb_stride_alignment(dev_priv, fb->modifier[0], + fb->pixel_format); + stride = fb->pitches[0]; } - plane_offset = y_offset << 16 | x_offset; - intel_crtc->adjusted_x 
= x_offset; - intel_crtc->adjusted_y = y_offset; + intel_add_fb_offsets(&src_x, &src_y, fb, 0, rotation); + surf_addr = intel_compute_tile_offset(&src_x, &src_y, fb, 0, + stride, rotation); + + /* Sizes are 0 based */ + src_w--; + src_h--; + dst_w--; + dst_h--; + + intel_crtc->adjusted_x = src_x; + intel_crtc->adjusted_y = src_y; I915_WRITE(PLANE_CTL(pipe, 0), plane_ctl); - I915_WRITE(PLANE_OFFSET(pipe, 0), plane_offset); - I915_WRITE(PLANE_SIZE(pipe, 0), plane_size); - I915_WRITE(PLANE_STRIDE(pipe, 0), stride); + I915_WRITE(PLANE_OFFSET(pipe, 0), (src_y << 16) | src_x); + I915_WRITE(PLANE_STRIDE(pipe, 0), stride / stride_div); + I915_WRITE(PLANE_SIZE(pipe, 0), (src_h << 16) | src_w); if (scaler_id >= 0) { uint32_t ps_ctrl = 0; @@ -3051,7 +3191,8 @@ static void skylake_update_primary_plane(struct drm_plane *plane, I915_WRITE(PLANE_POS(pipe, 0), (dst_y << 16) | dst_x); } - I915_WRITE(PLANE_SURF(pipe, 0), surf_addr); + I915_WRITE(PLANE_SURF(pipe, 0), + intel_fb_gtt_offset(fb, rotation) + surf_addr); POSTING_READ(PLANE_SURF(pipe, 0)); } @@ -11462,7 +11603,7 @@ static void skl_do_mmio_flip(struct intel_crtc *intel_crtc, struct drm_i915_private *dev_priv = to_i915(dev); struct drm_framebuffer *fb = intel_crtc->base.primary->fb; const enum pipe pipe = intel_crtc->pipe; - u32 ctl, stride, tile_height; + u32 ctl, stride; ctl = I915_READ(PLANE_CTL(pipe, 0)); ctl &= ~PLANE_CTL_TILED_MASK; @@ -11487,9 +11628,11 @@ static void skl_do_mmio_flip(struct intel_crtc *intel_crtc, * linear buffers or in number of tiles for tiled buffers. */ if (intel_rotation_90_or_270(rotation)) { - /* stride = Surface height in tiles */ - tile_height = intel_tile_height(dev_priv, fb->modifier[0], 0); - stride = DIV_ROUND_UP(fb->height, tile_height); + int cpp = drm_format_plane_cpp(fb->pixel_format, 0); + struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); + + stride = intel_fb->rotated[0].pitch / + intel_tile_height(dev_priv, fb->modifier[0], cpp); } else { stride = fb->pitches[0] / intel_fb_stride_alignment(dev_priv, fb->modifier[0], @@ -11769,8 +11912,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc, if (ret) goto cleanup_pending; - work->gtt_offset = intel_plane_obj_offset(to_intel_plane(primary), - obj, 0); + work->gtt_offset = intel_fb_gtt_offset(fb, primary->state->rotation); work->gtt_offset += intel_crtc->dspaddr_offset; work->rotation = crtc->primary->state->rotation; @@ -15040,7 +15182,6 @@ static int intel_framebuffer_init(struct drm_device *dev, struct drm_i915_gem_object *obj) { struct drm_i915_private *dev_priv = to_i915(dev); - unsigned int aligned_height; int ret; u32 pitch_limit, stride_alignment; @@ -15166,17 +15307,12 @@ static int intel_framebuffer_init(struct drm_device *dev, if (mode_cmd->offsets[0] != 0) return -EINVAL; - aligned_height = intel_fb_align_height(dev, mode_cmd->height, - mode_cmd->pixel_format, - mode_cmd->modifier[0]); - /* FIXME drm helper for size checks (especially planar formats)? 
*/ - if (obj->base.size < aligned_height * mode_cmd->pitches[0]) - return -EINVAL; - drm_helper_mode_fill_fb_struct(&intel_fb->base, mode_cmd); intel_fb->obj = obj; - intel_fill_fb_info(dev_priv, &intel_fb->base); + ret = intel_fill_fb_info(dev_priv, &intel_fb->base); + if (ret) + return ret; ret = drm_framebuffer_init(dev, &intel_fb->base, &intel_fb_funcs); if (ret) { diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 65c97a3eef1d..a21ebc5232f6 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -178,6 +178,16 @@ struct intel_framebuffer { struct drm_framebuffer base; struct drm_i915_gem_object *obj; struct intel_rotation_info rot_info; + + /* for each plane in the normal GTT view */ + struct { + unsigned int x, y; + } normal[2]; + /* for each plane in the rotated GTT view */ + struct { + unsigned int x, y; + unsigned int pitch; /* pixels */ + } rotated[2]; }; struct intel_fbdev { @@ -1156,6 +1166,11 @@ int vlv_get_cck_clock(struct drm_i915_private *dev_priv, const char *name, u32 reg, int ref_freq); extern const struct drm_plane_funcs intel_plane_funcs; void intel_init_display_hooks(struct drm_i915_private *dev_priv); +unsigned int intel_fb_xy_to_linear(int x, int y, + const struct drm_framebuffer *fb, int plane); +void intel_add_fb_offsets(int *x, int *y, + const struct drm_framebuffer *fb, int plane, + unsigned int rotation); unsigned int intel_rotation_info_size(const struct intel_rotation_info *rot_info); bool intel_has_pending_fb_unpin(struct drm_device *dev); void intel_mark_busy(struct drm_i915_private *dev_priv); @@ -1326,9 +1341,7 @@ void intel_mode_from_pipe_config(struct drm_display_mode *mode, int skl_update_scaler_crtc(struct intel_crtc_state *crtc_state); int skl_max_scale(struct intel_crtc *crtc, struct intel_crtc_state *crtc_state); -u32 intel_plane_obj_offset(struct intel_plane *intel_plane, - struct drm_i915_gem_object *obj, - unsigned int plane); +u32 intel_fb_gtt_offset(struct drm_framebuffer *fb, unsigned int rotation); u32 skl_plane_ctl_format(uint32_t pixel_format); u32 skl_plane_ctl_tiling(uint64_t fb_modifier); diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c index 9ed7ad32cffd..1fd99d39e89d 100644 --- a/drivers/gpu/drm/i915/intel_sprite.c +++ b/drivers/gpu/drm/i915/intel_sprite.c @@ -203,15 +203,13 @@ skl_update_plane(struct drm_plane *drm_plane, struct drm_i915_private *dev_priv = to_i915(dev); struct intel_plane *intel_plane = to_intel_plane(drm_plane); struct drm_framebuffer *fb = plane_state->base.fb; - struct drm_i915_gem_object *obj = intel_fb_obj(fb); + struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); const int pipe = intel_plane->pipe; const int plane = intel_plane->plane + 1; u32 plane_ctl, stride_div, stride; const struct drm_intel_sprite_colorkey *key = &plane_state->ckey; u32 surf_addr; - u32 tile_height, plane_offset, plane_size; unsigned int rotation = plane_state->base.rotation; - int x_offset, y_offset; int crtc_x = plane_state->dst.x1; int crtc_y = plane_state->dst.y1; uint32_t crtc_w = drm_rect_width(&plane_state->dst); @@ -230,15 +228,6 @@ skl_update_plane(struct drm_plane *drm_plane, plane_ctl |= skl_plane_ctl_rotation(rotation); - stride_div = intel_fb_stride_alignment(dev_priv, fb->modifier[0], - fb->pixel_format); - - /* Sizes are 0 based */ - src_w--; - src_h--; - crtc_w--; - crtc_h--; - if (key->flags) { I915_WRITE(PLANE_KEYVAL(pipe, plane), key->min_value); I915_WRITE(PLANE_KEYMAX(pipe, plane), key->max_value); @@ 
-250,28 +239,44 @@ skl_update_plane(struct drm_plane *drm_plane, else if (key->flags & I915_SET_COLORKEY_SOURCE) plane_ctl |= PLANE_CTL_KEY_ENABLE_SOURCE; - surf_addr = intel_plane_obj_offset(intel_plane, obj, 0); - if (intel_rotation_90_or_270(rotation)) { - int cpp = drm_format_plane_cpp(fb->pixel_format, 0); + struct drm_rect r = { + .x1 = x, + .x2 = x + src_w, + .y1 = y, + .y2 = y + src_h, + }; + unsigned int cpp = drm_format_plane_cpp(fb->pixel_format, 0); - /* stride: Surface height in tiles */ - tile_height = intel_tile_height(dev_priv, fb->modifier[0], cpp); - stride = DIV_ROUND_UP(fb->height, tile_height); - plane_size = (src_w << 16) | src_h; - x_offset = stride * tile_height - y - (src_h + 1); - y_offset = x; + /* Rotate src coordinates to match rotated GTT view */ + drm_rect_rotate(&r, fb->width, fb->height, BIT(DRM_ROTATE_270)); + + x = r.x1; + y = r.y1; + src_w = drm_rect_width(&r); + src_h = drm_rect_height(&r); + + stride_div = intel_tile_height(dev_priv, fb->modifier[0], cpp); + stride = intel_fb->rotated[0].pitch; } else { - stride = fb->pitches[0] / stride_div; - plane_size = (src_h << 16) | src_w; - x_offset = x; - y_offset = y; + stride_div = intel_fb_stride_alignment(dev_priv, fb->modifier[0], + fb->pixel_format); + stride = fb->pitches[0]; } - plane_offset = y_offset << 16 | x_offset; - I915_WRITE(PLANE_OFFSET(pipe, plane), plane_offset); - I915_WRITE(PLANE_STRIDE(pipe, plane), stride); - I915_WRITE(PLANE_SIZE(pipe, plane), plane_size); + intel_add_fb_offsets(&x, &y, fb, 0, rotation); + surf_addr = intel_compute_tile_offset(&x, &y, fb, 0, + stride, rotation); + + /* Sizes are 0 based */ + src_w--; + src_h--; + crtc_w--; + crtc_h--; + + I915_WRITE(PLANE_OFFSET(pipe, plane), (y << 16) | x); + I915_WRITE(PLANE_STRIDE(pipe, plane), stride / stride_div); + I915_WRITE(PLANE_SIZE(pipe, plane), (src_h << 16) | src_w); /* program plane scaler */ if (plane_state->scaler_id >= 0) { @@ -296,7 +301,8 @@ skl_update_plane(struct drm_plane *drm_plane, } I915_WRITE(PLANE_CTL(pipe, plane), plane_ctl); - I915_WRITE(PLANE_SURF(pipe, plane), surf_addr); + I915_WRITE(PLANE_SURF(pipe, plane), + intel_fb_gtt_offset(fb, rotation) + surf_addr); POSTING_READ(PLANE_SURF(pipe, plane)); } @@ -369,7 +375,6 @@ vlv_update_plane(struct drm_plane *dplane, u32 sprctl; u32 sprsurf_offset, linear_offset; unsigned int rotation = dplane->state->rotation; - int cpp = drm_format_plane_cpp(fb->pixel_format, 0); const struct drm_intel_sprite_colorkey *key = &plane_state->ckey; int crtc_x = plane_state->dst.x1; int crtc_y = plane_state->dst.y1; @@ -440,19 +445,19 @@ vlv_update_plane(struct drm_plane *dplane, crtc_w--; crtc_h--; - linear_offset = y * fb->pitches[0] + x * cpp; + intel_add_fb_offsets(&x, &y, fb, 0, rotation); sprsurf_offset = intel_compute_tile_offset(&x, &y, fb, 0, fb->pitches[0], rotation); - linear_offset -= sprsurf_offset; if (rotation == BIT(DRM_ROTATE_180)) { sprctl |= SP_ROTATE_180; x += src_w; y += src_h; - linear_offset += src_h * fb->pitches[0] + src_w * cpp; } + linear_offset = intel_fb_xy_to_linear(x, y, fb, 0); + if (key->flags) { I915_WRITE(SPKEYMINVAL(pipe, plane), key->min_value); I915_WRITE(SPKEYMAXVAL(pipe, plane), key->max_value); @@ -477,8 +482,8 @@ vlv_update_plane(struct drm_plane *dplane, I915_WRITE(SPSIZE(pipe, plane), (crtc_h << 16) | crtc_w); I915_WRITE(SPCNTR(pipe, plane), sprctl); - I915_WRITE(SPSURF(pipe, plane), i915_gem_obj_ggtt_offset(obj) + - sprsurf_offset); + I915_WRITE(SPSURF(pipe, plane), + intel_fb_gtt_offset(fb, rotation) + sprsurf_offset); 
	POSTING_READ(SPSURF(pipe, plane));
 }
 
@@ -511,7 +516,6 @@ ivb_update_plane(struct drm_plane *plane,
 	u32 sprctl, sprscale = 0;
 	u32 sprsurf_offset, linear_offset;
 	unsigned int rotation = plane_state->base.rotation;
-	int cpp = drm_format_plane_cpp(fb->pixel_format, 0);
 	const struct drm_intel_sprite_colorkey *key = &plane_state->ckey;
 	int crtc_x = plane_state->dst.x1;
 	int crtc_y = plane_state->dst.y1;
@@ -573,10 +577,9 @@ ivb_update_plane(struct drm_plane *plane,
 	if (crtc_w != src_w || crtc_h != src_h)
 		sprscale = SPRITE_SCALE_ENABLE | (src_w << 16) | src_h;
 
-	linear_offset = y * fb->pitches[0] + x * cpp;
+	intel_add_fb_offsets(&x, &y, fb, 0, rotation);
 	sprsurf_offset = intel_compute_tile_offset(&x, &y, fb, 0,
 						   fb->pitches[0], rotation);
-	linear_offset -= sprsurf_offset;
 
 	if (rotation == BIT(DRM_ROTATE_180)) {
 		sprctl |= SPRITE_ROTATE_180;
@@ -585,10 +588,11 @@ ivb_update_plane(struct drm_plane *plane,
 		if (!IS_HASWELL(dev) && !IS_BROADWELL(dev)) {
 			x += src_w;
 			y += src_h;
-			linear_offset += src_h * fb->pitches[0] + src_w * cpp;
 		}
 	}
 
+	linear_offset = intel_fb_xy_to_linear(x, y, fb, 0);
+
 	if (key->flags) {
 		I915_WRITE(SPRKEYVAL(pipe), key->min_value);
 		I915_WRITE(SPRKEYMAX(pipe), key->max_value);
@@ -617,7 +621,7 @@ ivb_update_plane(struct drm_plane *plane,
 	I915_WRITE(SPRSCALE(pipe), sprscale);
 	I915_WRITE(SPRCTL(pipe), sprctl);
 	I915_WRITE(SPRSURF(pipe),
-		   i915_gem_obj_ggtt_offset(obj) + sprsurf_offset);
+		   intel_fb_gtt_offset(fb, rotation) + sprsurf_offset);
 	POSTING_READ(SPRSURF(pipe));
 }
 
@@ -652,7 +656,6 @@ ilk_update_plane(struct drm_plane *plane,
 	u32 dvscntr, dvsscale;
 	u32 dvssurf_offset, linear_offset;
 	unsigned int rotation = plane_state->base.rotation;
-	int cpp = drm_format_plane_cpp(fb->pixel_format, 0);
 	const struct drm_intel_sprite_colorkey *key = &plane_state->ckey;
 	int crtc_x = plane_state->dst.x1;
 	int crtc_y = plane_state->dst.y1;
@@ -710,19 +713,19 @@ ilk_update_plane(struct drm_plane *plane,
 	if (crtc_w != src_w || crtc_h != src_h)
 		dvsscale = DVS_SCALE_ENABLE | (src_w << 16) | src_h;
 
-	linear_offset = y * fb->pitches[0] + x * cpp;
+	intel_add_fb_offsets(&x, &y, fb, 0, rotation);
 	dvssurf_offset = intel_compute_tile_offset(&x, &y, fb, 0,
 						   fb->pitches[0], rotation);
-	linear_offset -= dvssurf_offset;
 
 	if (rotation == BIT(DRM_ROTATE_180)) {
 		dvscntr |= DVS_ROTATE_180;
 
 		x += src_w;
 		y += src_h;
-		linear_offset += src_h * fb->pitches[0] + src_w * cpp;
 	}
 
+	linear_offset = intel_fb_xy_to_linear(x, y, fb, 0);
+
 	if (key->flags) {
 		I915_WRITE(DVSKEYVAL(pipe), key->min_value);
 		I915_WRITE(DVSKEYMAX(pipe), key->max_value);
@@ -746,7 +749,7 @@ ilk_update_plane(struct drm_plane *plane,
 	I915_WRITE(DVSSCALE(pipe), dvsscale);
 	I915_WRITE(DVSCNTR(pipe), dvscntr);
 	I915_WRITE(DVSSURF(pipe),
-		   i915_gem_obj_ggtt_offset(obj) + dvssurf_offset);
+		   intel_fb_gtt_offset(fb, rotation) + dvssurf_offset);
 	POSTING_READ(DVSSURF(pipe));
 }

From ef78ec9423b9ee9f8f3dfe92dffb900cb83365ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?=
Date: Tue, 13 Oct 2015 22:48:39 +0300
Subject: [PATCH 035/141] drm/i915: Don't pass pitch to intel_compute_tile_offset()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

intel_compute_tile_offset() can dig up the correct pitch from the fb
itself, so there is no need for the caller to pass it in. A bit of extra
care is needed for the lower level _intel_compute_tile_offset() since
that one gets called before the rotated pitch under intel_fb is
populated.
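For reference, the pitch that gets dug up depends on the view; a condensed
sketch of the intel_fb_pitch() lookup this patch extracts (note that the
rotated pitch is stored in pixels, while the normal pitch is in bytes):

	if (intel_rotation_90_or_270(rotation))
		pitch = to_intel_framebuffer(fb)->rotated[plane].pitch; /* pixels */
	else
		pitch = fb->pitches[plane]; /* bytes */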
Note that we don't actually call _intel_compute_tile_offset() with
anything but DRM_ROTATE_0 there, so the rotated pitch would never be
looked up anyway; but still, leave the pitch as something the caller has
to pass to _intel_compute_tile_offset(), as an indicator that something
is a bit special.

This leaves 'stride_div' in the skl plane update hooks as a mostly
useless variable, so just get rid of it.

v2: Add a note why stride_div got nuked
v3: Extract intel_fb_pitch() since it can be useful later

Signed-off-by: Ville Syrjälä
Reviewed-by: Daniel Vetter (v2)
Reviewed-by: Sivakumar Thulasimani
Link: http://patchwork.freedesktop.org/patch/msgid/1470821001-25272-3-git-send-email-ville.syrjala@linux.intel.com
---
 drivers/gpu/drm/i915/intel_display.c | 34 ++++++++++++++++------------
 drivers/gpu/drm/i915/intel_drv.h     |  1 -
 drivers/gpu/drm/i915/intel_sprite.c  | 26 +++++++++------------
 3 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 7c61ab4e5c64..05e6ec3e0029 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -2267,6 +2267,15 @@ void intel_unpin_fb_obj(struct drm_framebuffer *fb, unsigned int rotation)
 	i915_gem_object_unpin_from_display_plane(obj, &view);
 }
 
+static int intel_fb_pitch(const struct drm_framebuffer *fb, int plane,
+			  unsigned int rotation)
+{
+	if (intel_rotation_90_or_270(rotation))
+		return to_intel_framebuffer(fb)->rotated[plane].pitch;
+	else
+		return fb->pitches[plane];
+}
+
 /*
  * Convert the x/y offsets into a linear offset.
  * Only valid with 0/180 degree rotation, which is fine since linear
@@ -2400,11 +2409,11 @@ static u32 _intel_compute_tile_offset(const struct drm_i915_private *dev_priv,
 
 u32 intel_compute_tile_offset(int *x, int *y,
 			      const struct drm_framebuffer *fb, int plane,
-			      unsigned int pitch,
 			      unsigned int rotation)
 {
 	const struct drm_i915_private *dev_priv = to_i915(fb->dev);
 	u32 alignment = intel_surf_alignment(dev_priv, fb->modifier[plane]);
+	int pitch = intel_fb_pitch(fb, plane, rotation);
 
 	return _intel_compute_tile_offset(dev_priv, x, y, fb, plane, pitch,
 					  rotation, alignment);
@@ -2830,8 +2839,7 @@ static void i9xx_update_primary_plane(struct drm_plane *primary,
 
 	if (INTEL_INFO(dev)->gen >= 4)
 		intel_crtc->dspaddr_offset =
-			intel_compute_tile_offset(&x, &y, fb, 0,
-						  fb->pitches[0], rotation);
+			intel_compute_tile_offset(&x, &y, fb, 0, rotation);
 
 	if (rotation == BIT(DRM_ROTATE_180)) {
 		dspcntr |= DISPPLANE_ROTATE_180;
@@ -2933,8 +2941,7 @@ static void ironlake_update_primary_plane(struct drm_plane *primary,
 
 	intel_add_fb_offsets(&x, &y, fb, 0, rotation);
 	intel_crtc->dspaddr_offset =
-		intel_compute_tile_offset(&x, &y, fb, 0,
-					  fb->pitches[0], rotation);
+		intel_compute_tile_offset(&x, &y, fb, 0, rotation);
 
 	if (rotation == BIT(DRM_ROTATE_180)) {
 		dspcntr |= DISPPLANE_ROTATE_180;
@@ -3110,7 +3117,7 @@ static void skylake_update_primary_plane(struct drm_plane *plane,
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc_state->base.crtc);
 	struct drm_framebuffer *fb = plane_state->base.fb;
 	int pipe = intel_crtc->pipe;
-	u32 plane_ctl, stride_div, stride;
+	u32 plane_ctl, stride;
 	unsigned int rotation = plane_state->base.rotation;
 	u32 surf_addr;
 	int scaler_id = plane_state->scaler_id;
@@ -3150,17 +3157,16 @@ static void skylake_update_primary_plane(struct drm_plane *plane,
 		src_w = drm_rect_width(&r);
 		src_h = drm_rect_height(&r);
 
-		stride_div = intel_tile_height(dev_priv, fb->modifier[0], cpp);
-		stride = intel_fb->rotated[0].pitch;
+		stride =
intel_fb->rotated[0].pitch / + intel_tile_height(dev_priv, fb->modifier[0], cpp); } else { - stride_div = intel_fb_stride_alignment(dev_priv, fb->modifier[0], - fb->pixel_format); - stride = fb->pitches[0]; + stride = fb->pitches[0] / + intel_fb_stride_alignment(dev_priv, fb->modifier[0], + fb->pixel_format); } intel_add_fb_offsets(&src_x, &src_y, fb, 0, rotation); - surf_addr = intel_compute_tile_offset(&src_x, &src_y, fb, 0, - stride, rotation); + surf_addr = intel_compute_tile_offset(&src_x, &src_y, fb, 0, rotation); /* Sizes are 0 based */ src_w--; @@ -3173,7 +3179,7 @@ static void skylake_update_primary_plane(struct drm_plane *plane, I915_WRITE(PLANE_CTL(pipe, 0), plane_ctl); I915_WRITE(PLANE_OFFSET(pipe, 0), (src_y << 16) | src_x); - I915_WRITE(PLANE_STRIDE(pipe, 0), stride / stride_div); + I915_WRITE(PLANE_STRIDE(pipe, 0), stride); I915_WRITE(PLANE_SIZE(pipe, 0), (src_h << 16) | src_w); if (scaler_id >= 0) { diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index a21ebc5232f6..127e343368cc 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -1297,7 +1297,6 @@ void assert_pipe(struct drm_i915_private *dev_priv, enum pipe pipe, bool state); #define assert_pipe_disabled(d, p) assert_pipe(d, p, false) u32 intel_compute_tile_offset(int *x, int *y, const struct drm_framebuffer *fb, int plane, - unsigned int pitch, unsigned int rotation); void intel_prepare_reset(struct drm_i915_private *dev_priv); void intel_finish_reset(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c index 1fd99d39e89d..75bce3b48c2a 100644 --- a/drivers/gpu/drm/i915/intel_sprite.c +++ b/drivers/gpu/drm/i915/intel_sprite.c @@ -206,7 +206,7 @@ skl_update_plane(struct drm_plane *drm_plane, struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); const int pipe = intel_plane->pipe; const int plane = intel_plane->plane + 1; - u32 plane_ctl, stride_div, stride; + u32 plane_ctl, stride; const struct drm_intel_sprite_colorkey *key = &plane_state->ckey; u32 surf_addr; unsigned int rotation = plane_state->base.rotation; @@ -256,17 +256,16 @@ skl_update_plane(struct drm_plane *drm_plane, src_w = drm_rect_width(&r); src_h = drm_rect_height(&r); - stride_div = intel_tile_height(dev_priv, fb->modifier[0], cpp); - stride = intel_fb->rotated[0].pitch; + stride = intel_fb->rotated[0].pitch / + intel_tile_height(dev_priv, fb->modifier[0], cpp); } else { - stride_div = intel_fb_stride_alignment(dev_priv, fb->modifier[0], - fb->pixel_format); - stride = fb->pitches[0]; + stride = fb->pitches[0] / + intel_fb_stride_alignment(dev_priv, fb->modifier[0], + fb->pixel_format); } intel_add_fb_offsets(&x, &y, fb, 0, rotation); - surf_addr = intel_compute_tile_offset(&x, &y, fb, 0, - stride, rotation); + surf_addr = intel_compute_tile_offset(&x, &y, fb, 0, rotation); /* Sizes are 0 based */ src_w--; @@ -275,7 +274,7 @@ skl_update_plane(struct drm_plane *drm_plane, crtc_h--; I915_WRITE(PLANE_OFFSET(pipe, plane), (y << 16) | x); - I915_WRITE(PLANE_STRIDE(pipe, plane), stride / stride_div); + I915_WRITE(PLANE_STRIDE(pipe, plane), stride); I915_WRITE(PLANE_SIZE(pipe, plane), (src_h << 16) | src_w); /* program plane scaler */ @@ -446,8 +445,7 @@ vlv_update_plane(struct drm_plane *dplane, crtc_h--; intel_add_fb_offsets(&x, &y, fb, 0, rotation); - sprsurf_offset = intel_compute_tile_offset(&x, &y, fb, 0, - fb->pitches[0], rotation); + sprsurf_offset = intel_compute_tile_offset(&x, &y, fb, 0, rotation); if 
(rotation == BIT(DRM_ROTATE_180)) {
 		sprctl |= SP_ROTATE_180;
@@ -578,8 +576,7 @@ ivb_update_plane(struct drm_plane *plane,
 		sprscale = SPRITE_SCALE_ENABLE | (src_w << 16) | src_h;
 
 	intel_add_fb_offsets(&x, &y, fb, 0, rotation);
-	sprsurf_offset = intel_compute_tile_offset(&x, &y, fb, 0,
-						   fb->pitches[0], rotation);
+	sprsurf_offset = intel_compute_tile_offset(&x, &y, fb, 0, rotation);
 
 	if (rotation == BIT(DRM_ROTATE_180)) {
 		sprctl |= SPRITE_ROTATE_180;
@@ -714,8 +711,7 @@ ilk_update_plane(struct drm_plane *plane,
 		dvsscale = DVS_SCALE_ENABLE | (src_w << 16) | src_h;
 
 	intel_add_fb_offsets(&x, &y, fb, 0, rotation);
-	dvssurf_offset = intel_compute_tile_offset(&x, &y, fb, 0,
-						   fb->pitches[0], rotation);
+	dvssurf_offset = intel_compute_tile_offset(&x, &y, fb, 0, rotation);
 
 	if (rotation == BIT(DRM_ROTATE_180)) {
 		dvscntr |= DVS_ROTATE_180;

From d21967740f4b7d7af794badac91b2c718b8f17c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?=
Date: Thu, 28 Jan 2016 18:33:11 +0200
Subject: [PATCH 036/141] drm/i915: Move SKL hw stride calculation into a helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We repeat the SKL stride register value calculation in several places.
Move it into a small helper function.

v2: Rebase due to drm_plane_state src/dst rects

Signed-off-by: Ville Syrjälä
Reviewed-by: Matthew Auld
Reviewed-by: Sivakumar Thulasimani
Link: http://patchwork.freedesktop.org/patch/msgid/1470821001-25272-4-git-send-email-ville.syrjala@linux.intel.com
---
 drivers/gpu/drm/i915/intel_display.c | 52 +++++++++++++---------------
 drivers/gpu/drm/i915/intel_drv.h     |  2 ++
 drivers/gpu/drm/i915/intel_sprite.c  | 12 ++-----
 3 files changed, 29 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 05e6ec3e0029..cb82fa83aef1 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -3027,6 +3027,28 @@ static void skl_detach_scalers(struct intel_crtc *intel_crtc)
 	}
 }
 
+u32 skl_plane_stride(const struct drm_framebuffer *fb, int plane,
+		     unsigned int rotation)
+{
+	const struct drm_i915_private *dev_priv = to_i915(fb->dev);
+	u32 stride = intel_fb_pitch(fb, plane, rotation);
+
+	/*
+	 * The stride is either expressed as a multiple of 64 bytes chunks for
+	 * linear buffers or in number of tiles for tiled buffers.
+ */ + if (intel_rotation_90_or_270(rotation)) { + int cpp = drm_format_plane_cpp(fb->pixel_format, plane); + + stride /= intel_tile_height(dev_priv, fb->modifier[0], cpp); + } else { + stride /= intel_fb_stride_alignment(dev_priv, fb->modifier[0], + fb->pixel_format); + } + + return stride; +} + u32 skl_plane_ctl_format(uint32_t pixel_format) { switch (pixel_format) { @@ -3117,8 +3139,9 @@ static void skylake_update_primary_plane(struct drm_plane *plane, struct intel_crtc *intel_crtc = to_intel_crtc(crtc_state->base.crtc); struct drm_framebuffer *fb = plane_state->base.fb; int pipe = intel_crtc->pipe; - u32 plane_ctl, stride; + u32 plane_ctl; unsigned int rotation = plane_state->base.rotation; + u32 stride = skl_plane_stride(fb, 0, rotation); u32 surf_addr; int scaler_id = plane_state->scaler_id; int src_x = plane_state->src.x1 >> 16; @@ -3146,8 +3169,6 @@ static void skylake_update_primary_plane(struct drm_plane *plane, .y1 = src_y, .y2 = src_y + src_h, }; - int cpp = drm_format_plane_cpp(fb->pixel_format, 0); - struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); /* Rotate src coordinates to match rotated GTT view */ drm_rect_rotate(&r, fb->width, fb->height, BIT(DRM_ROTATE_270)); @@ -3156,13 +3177,6 @@ static void skylake_update_primary_plane(struct drm_plane *plane, src_y = r.y1; src_w = drm_rect_width(&r); src_h = drm_rect_height(&r); - - stride = intel_fb->rotated[0].pitch / - intel_tile_height(dev_priv, fb->modifier[0], cpp); - } else { - stride = fb->pitches[0] / - intel_fb_stride_alignment(dev_priv, fb->modifier[0], - fb->pixel_format); } intel_add_fb_offsets(&src_x, &src_y, fb, 0, rotation); @@ -11609,7 +11623,7 @@ static void skl_do_mmio_flip(struct intel_crtc *intel_crtc, struct drm_i915_private *dev_priv = to_i915(dev); struct drm_framebuffer *fb = intel_crtc->base.primary->fb; const enum pipe pipe = intel_crtc->pipe; - u32 ctl, stride; + u32 ctl, stride = skl_plane_stride(fb, 0, rotation); ctl = I915_READ(PLANE_CTL(pipe, 0)); ctl &= ~PLANE_CTL_TILED_MASK; @@ -11629,22 +11643,6 @@ static void skl_do_mmio_flip(struct intel_crtc *intel_crtc, MISSING_CASE(fb->modifier[0]); } - /* - * The stride is either expressed as a multiple of 64 bytes chunks for - * linear buffers or in number of tiles for tiled buffers. - */ - if (intel_rotation_90_or_270(rotation)) { - int cpp = drm_format_plane_cpp(fb->pixel_format, 0); - struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); - - stride = intel_fb->rotated[0].pitch / - intel_tile_height(dev_priv, fb->modifier[0], cpp); - } else { - stride = fb->pitches[0] / - intel_fb_stride_alignment(dev_priv, fb->modifier[0], - fb->pixel_format); - } - /* * Both PLANE_CTL and PLANE_STRIDE are not updated on vblank but on * PLANE_SURF updates, the update is then guaranteed to be atomic. 
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 127e343368cc..34992c2ac584 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -1345,6 +1345,8 @@ u32 intel_fb_gtt_offset(struct drm_framebuffer *fb, unsigned int rotation);
 u32 skl_plane_ctl_format(uint32_t pixel_format);
 u32 skl_plane_ctl_tiling(uint64_t fb_modifier);
 u32 skl_plane_ctl_rotation(unsigned int rotation);
+u32 skl_plane_stride(const struct drm_framebuffer *fb, int plane,
+		     unsigned int rotation);
 
 /* intel_csr.c */
 void intel_csr_ucode_init(struct drm_i915_private *);
diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
index 75bce3b48c2a..2d24c74ea06d 100644
--- a/drivers/gpu/drm/i915/intel_sprite.c
+++ b/drivers/gpu/drm/i915/intel_sprite.c
@@ -203,13 +203,13 @@ skl_update_plane(struct drm_plane *drm_plane,
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct intel_plane *intel_plane = to_intel_plane(drm_plane);
 	struct drm_framebuffer *fb = plane_state->base.fb;
-	struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb);
 	const int pipe = intel_plane->pipe;
 	const int plane = intel_plane->plane + 1;
-	u32 plane_ctl, stride;
+	u32 plane_ctl;
 	const struct drm_intel_sprite_colorkey *key = &plane_state->ckey;
 	u32 surf_addr;
 	unsigned int rotation = plane_state->base.rotation;
+	u32 stride = skl_plane_stride(fb, 0, rotation);
 	int crtc_x = plane_state->dst.x1;
 	int crtc_y = plane_state->dst.y1;
 	uint32_t crtc_w = drm_rect_width(&plane_state->dst);
@@ -246,7 +246,6 @@ skl_update_plane(struct drm_plane *drm_plane,
 			.y1 = y,
 			.y2 = y + src_h,
 		};
-		unsigned int cpp = drm_format_plane_cpp(fb->pixel_format, 0);
 
 		/* Rotate src coordinates to match rotated GTT view */
 		drm_rect_rotate(&r, fb->width, fb->height, BIT(DRM_ROTATE_270));
@@ -255,13 +254,6 @@ skl_update_plane(struct drm_plane *drm_plane,
 		y = r.y1;
 		src_w = drm_rect_width(&r);
 		src_h = drm_rect_height(&r);
-
-		stride = intel_fb->rotated[0].pitch /
-			intel_tile_height(dev_priv, fb->modifier[0], cpp);
-	} else {
-		stride = fb->pitches[0] /
-			intel_fb_stride_alignment(dev_priv, fb->modifier[0],
-						  fb->pixel_format);
 	}
 
 	intel_add_fb_offsets(&x, &y, fb, 0, rotation);

From 2949056c862275064ac4fc0bbb5fa68a4d3026bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?=
Date: Wed, 20 Jan 2016 18:02:50 +0200
Subject: [PATCH 037/141] drm/i915: Pass around plane_state instead of fb+rotation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

intel_compute_tile_offset() and intel_add_fb_offsets() get passed the fb
and the rotation. As both of those come from the plane state, we can
just pass that in instead. For extra consistency, pass the plane state
to intel_fb_xy_to_linear() as well, even though it only really needs
the fb.
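For illustration, a typical call site shrinks like this (sketch; the
names match the diffs below):

	/* before: the caller digs out fb and rotation itself */
	intel_add_fb_offsets(&x, &y, fb, 0, rotation);
	surf_addr = intel_compute_tile_offset(&x, &y, fb, 0, rotation);

	/* after: the plane state already carries both */
	intel_add_fb_offsets(&x, &y, plane_state, 0);
	surf_addr = intel_compute_tile_offset(&x, &y, plane_state, 0);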
Signed-off-by: Ville Syrjälä Reviewed-by: Daniel Vetter Reviewed-by: Sivakumar Thulasimani Link: http://patchwork.freedesktop.org/patch/msgid/1470821001-25272-5-git-send-email-ville.syrjala@linux.intel.com --- drivers/gpu/drm/i915/intel_display.c | 35 ++++++++++++++++------------ drivers/gpu/drm/i915/intel_drv.h | 9 ++++--- drivers/gpu/drm/i915/intel_sprite.c | 22 ++++++++--------- 3 files changed, 35 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index cb82fa83aef1..0863f2a2d31c 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2283,8 +2283,10 @@ static int intel_fb_pitch(const struct drm_framebuffer *fb, int plane, * with gen2/3, and 90/270 degree rotations isn't supported on any of them. */ u32 intel_fb_xy_to_linear(int x, int y, - const struct drm_framebuffer *fb, int plane) + const struct intel_plane_state *state, + int plane) { + const struct drm_framebuffer *fb = state->base.fb; unsigned int cpp = drm_format_plane_cpp(fb->pixel_format, plane); unsigned int pitch = fb->pitches[plane]; @@ -2297,11 +2299,12 @@ u32 intel_fb_xy_to_linear(int x, int y, * specify the start of scanout from the beginning of the gtt mapping. */ void intel_add_fb_offsets(int *x, int *y, - const struct drm_framebuffer *fb, int plane, - unsigned int rotation) + const struct intel_plane_state *state, + int plane) { - const struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); + const struct intel_framebuffer *intel_fb = to_intel_framebuffer(state->base.fb); + unsigned int rotation = state->base.rotation; if (intel_rotation_90_or_270(rotation)) { *x += intel_fb->rotated[plane].x; @@ -2408,10 +2411,12 @@ static u32 _intel_compute_tile_offset(const struct drm_i915_private *dev_priv, } u32 intel_compute_tile_offset(int *x, int *y, - const struct drm_framebuffer *fb, int plane, - unsigned int rotation) + const struct intel_plane_state *state, + int plane) { - const struct drm_i915_private *dev_priv = to_i915(fb->dev); + const struct drm_i915_private *dev_priv = to_i915(state->base.plane->dev); + const struct drm_framebuffer *fb = state->base.fb; + unsigned int rotation = state->base.rotation; u32 alignment = intel_surf_alignment(dev_priv, fb->modifier[plane]); int pitch = intel_fb_pitch(fb, plane, rotation); @@ -2835,11 +2840,11 @@ static void i9xx_update_primary_plane(struct drm_plane *primary, if (IS_G4X(dev)) dspcntr |= DISPPLANE_TRICKLE_FEED_DISABLE; - intel_add_fb_offsets(&x, &y, fb, 0, rotation); + intel_add_fb_offsets(&x, &y, plane_state, 0); if (INTEL_INFO(dev)->gen >= 4) intel_crtc->dspaddr_offset = - intel_compute_tile_offset(&x, &y, fb, 0, rotation); + intel_compute_tile_offset(&x, &y, plane_state, 0); if (rotation == BIT(DRM_ROTATE_180)) { dspcntr |= DISPPLANE_ROTATE_180; @@ -2848,7 +2853,7 @@ static void i9xx_update_primary_plane(struct drm_plane *primary, y += (crtc_state->pipe_src_h - 1); } - linear_offset = intel_fb_xy_to_linear(x, y, fb, 0); + linear_offset = intel_fb_xy_to_linear(x, y, plane_state, 0); if (INTEL_INFO(dev)->gen < 4) intel_crtc->dspaddr_offset = linear_offset; @@ -2938,10 +2943,10 @@ static void ironlake_update_primary_plane(struct drm_plane *primary, if (!IS_HASWELL(dev) && !IS_BROADWELL(dev)) dspcntr |= DISPPLANE_TRICKLE_FEED_DISABLE; - intel_add_fb_offsets(&x, &y, fb, 0, rotation); + intel_add_fb_offsets(&x, &y, plane_state, 0); intel_crtc->dspaddr_offset = - intel_compute_tile_offset(&x, &y, fb, 0, rotation); + intel_compute_tile_offset(&x, &y, 
plane_state, 0); if (rotation == BIT(DRM_ROTATE_180)) { dspcntr |= DISPPLANE_ROTATE_180; @@ -2952,7 +2957,7 @@ static void ironlake_update_primary_plane(struct drm_plane *primary, } } - linear_offset = intel_fb_xy_to_linear(x, y, fb, 0); + linear_offset = intel_fb_xy_to_linear(x, y, plane_state, 0); intel_crtc->adjusted_x = x; intel_crtc->adjusted_y = y; @@ -3179,8 +3184,8 @@ static void skylake_update_primary_plane(struct drm_plane *plane, src_h = drm_rect_height(&r); } - intel_add_fb_offsets(&src_x, &src_y, fb, 0, rotation); - surf_addr = intel_compute_tile_offset(&src_x, &src_y, fb, 0, rotation); + intel_add_fb_offsets(&src_x, &src_y, plane_state, 0); + surf_addr = intel_compute_tile_offset(&src_x, &src_y, plane_state, 0); /* Sizes are 0 based */ src_w--; diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 34992c2ac584..9b27bd7f7283 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -1167,10 +1167,10 @@ int vlv_get_cck_clock(struct drm_i915_private *dev_priv, extern const struct drm_plane_funcs intel_plane_funcs; void intel_init_display_hooks(struct drm_i915_private *dev_priv); unsigned int intel_fb_xy_to_linear(int x, int y, - const struct drm_framebuffer *fb, int plane); + const struct intel_plane_state *state, + int plane); void intel_add_fb_offsets(int *x, int *y, - const struct drm_framebuffer *fb, int plane, - unsigned int rotation); + const struct intel_plane_state *state, int plane); unsigned int intel_rotation_info_size(const struct intel_rotation_info *rot_info); bool intel_has_pending_fb_unpin(struct drm_device *dev); void intel_mark_busy(struct drm_i915_private *dev_priv); @@ -1296,8 +1296,7 @@ void assert_pipe(struct drm_i915_private *dev_priv, enum pipe pipe, bool state); #define assert_pipe_enabled(d, p) assert_pipe(d, p, true) #define assert_pipe_disabled(d, p) assert_pipe(d, p, false) u32 intel_compute_tile_offset(int *x, int *y, - const struct drm_framebuffer *fb, int plane, - unsigned int rotation); + const struct intel_plane_state *state, int plane); void intel_prepare_reset(struct drm_i915_private *dev_priv); void intel_finish_reset(struct drm_i915_private *dev_priv); void hsw_enable_pc8(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c index 2d24c74ea06d..67770c2ee183 100644 --- a/drivers/gpu/drm/i915/intel_sprite.c +++ b/drivers/gpu/drm/i915/intel_sprite.c @@ -256,8 +256,8 @@ skl_update_plane(struct drm_plane *drm_plane, src_h = drm_rect_height(&r); } - intel_add_fb_offsets(&x, &y, fb, 0, rotation); - surf_addr = intel_compute_tile_offset(&x, &y, fb, 0, rotation); + intel_add_fb_offsets(&x, &y, plane_state, 0); + surf_addr = intel_compute_tile_offset(&x, &y, plane_state, 0); /* Sizes are 0 based */ src_w--; @@ -436,8 +436,8 @@ vlv_update_plane(struct drm_plane *dplane, crtc_w--; crtc_h--; - intel_add_fb_offsets(&x, &y, fb, 0, rotation); - sprsurf_offset = intel_compute_tile_offset(&x, &y, fb, 0, rotation); + intel_add_fb_offsets(&x, &y, plane_state, 0); + sprsurf_offset = intel_compute_tile_offset(&x, &y, plane_state, 0); if (rotation == BIT(DRM_ROTATE_180)) { sprctl |= SP_ROTATE_180; @@ -446,7 +446,7 @@ vlv_update_plane(struct drm_plane *dplane, y += src_h; } - linear_offset = intel_fb_xy_to_linear(x, y, fb, 0); + linear_offset = intel_fb_xy_to_linear(x, y, plane_state, 0); if (key->flags) { I915_WRITE(SPKEYMINVAL(pipe, plane), key->min_value); @@ -567,8 +567,8 @@ ivb_update_plane(struct drm_plane *plane, if (crtc_w != 
src_w || crtc_h != src_h) sprscale = SPRITE_SCALE_ENABLE | (src_w << 16) | src_h; - intel_add_fb_offsets(&x, &y, fb, 0, rotation); - sprsurf_offset = intel_compute_tile_offset(&x, &y, fb, 0, rotation); + intel_add_fb_offsets(&x, &y, plane_state, 0); + sprsurf_offset = intel_compute_tile_offset(&x, &y, plane_state, 0); if (rotation == BIT(DRM_ROTATE_180)) { sprctl |= SPRITE_ROTATE_180; @@ -580,7 +580,7 @@ ivb_update_plane(struct drm_plane *plane, } } - linear_offset = intel_fb_xy_to_linear(x, y, fb, 0); + linear_offset = intel_fb_xy_to_linear(x, y, plane_state, 0); if (key->flags) { I915_WRITE(SPRKEYVAL(pipe), key->min_value); @@ -702,8 +702,8 @@ ilk_update_plane(struct drm_plane *plane, if (crtc_w != src_w || crtc_h != src_h) dvsscale = DVS_SCALE_ENABLE | (src_w << 16) | src_h; - intel_add_fb_offsets(&x, &y, fb, 0, rotation); - dvssurf_offset = intel_compute_tile_offset(&x, &y, fb, 0, rotation); + intel_add_fb_offsets(&x, &y, plane_state, 0); + dvssurf_offset = intel_compute_tile_offset(&x, &y, plane_state, 0); if (rotation == BIT(DRM_ROTATE_180)) { dvscntr |= DVS_ROTATE_180; @@ -712,7 +712,7 @@ ilk_update_plane(struct drm_plane *plane, y += src_h; } - linear_offset = intel_fb_xy_to_linear(x, y, fb, 0); + linear_offset = intel_fb_xy_to_linear(x, y, plane_state, 0); if (key->flags) { I915_WRITE(DVSKEYVAL(pipe), key->min_value); From 72618ebfde9e3ff74e03eb6c0bc1fdb33d1d1fa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 4 Feb 2016 20:38:20 +0200 Subject: [PATCH 038/141] drm/i915: Use fb modifiers for display tiling decisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Soon the fence tiling mode may not always match the fb modifier even for X tiled buffers. So let's use the fb modifier consistently for all display tiling decisions. v2: Rebased due to s/ring/engine/ v3: Rebased due to s/engine/ring/ O_o v4: Rebase due to i915_gem_object_get_tiling() & co. 
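The pattern used throughout is to test the fb modifier directly, e.g.
(sketch of the plane control update below):

	if (fb->modifier[0] == I915_FORMAT_MOD_X_TILED)
		dspcntr |= DISPPLANE_TILED;

while the flip paths use a small intel_fb_modifier_to_tiling() helper to
map the modifier back to the equivalent I915_TILING_* value.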
Reviewed-by: Matthew Auld (v1) Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470821001-25272-6-git-send-email-ville.syrjala@linux.intel.com --- drivers/gpu/drm/i915/intel_display.c | 35 ++++++++++++++++++---------- drivers/gpu/drm/i915/intel_sprite.c | 15 +++++------- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 0863f2a2d31c..b1d5fca5e14d 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2436,6 +2436,18 @@ static void intel_fb_offset_to_xy(int *x, int *y, *x = linear_offset % pitch / cpp; } +static unsigned int intel_fb_modifier_to_tiling(uint64_t fb_modifier) +{ + switch (fb_modifier) { + case I915_FORMAT_MOD_X_TILED: + return I915_TILING_X; + case I915_FORMAT_MOD_Y_TILED: + return I915_TILING_Y; + default: + return I915_TILING_NONE; + } +} + static int intel_fill_fb_info(struct drm_i915_private *dev_priv, struct drm_framebuffer *fb) @@ -2834,7 +2846,8 @@ static void i9xx_update_primary_plane(struct drm_plane *primary, BUG(); } - if (INTEL_INFO(dev)->gen >= 4 && i915_gem_object_is_tiled(obj)) + if (INTEL_GEN(dev_priv) >= 4 && + fb->modifier[0] == I915_FORMAT_MOD_X_TILED) dspcntr |= DISPPLANE_TILED; if (IS_G4X(dev)) @@ -2899,7 +2912,6 @@ static void ironlake_update_primary_plane(struct drm_plane *primary, struct drm_i915_private *dev_priv = to_i915(dev); struct intel_crtc *intel_crtc = to_intel_crtc(crtc_state->base.crtc); struct drm_framebuffer *fb = plane_state->base.fb; - struct drm_i915_gem_object *obj = intel_fb_obj(fb); int plane = intel_crtc->plane; u32 linear_offset; u32 dspcntr; @@ -2937,7 +2949,7 @@ static void ironlake_update_primary_plane(struct drm_plane *primary, BUG(); } - if (i915_gem_object_is_tiled(obj)) + if (fb->modifier[0] == I915_FORMAT_MOD_X_TILED) dspcntr |= DISPPLANE_TILED; if (!IS_HASWELL(dev) && !IS_BROADWELL(dev)) @@ -11444,7 +11456,7 @@ static int intel_gen4_queue_flip(struct drm_device *dev, MI_DISPLAY_FLIP_PLANE(intel_crtc->plane)); intel_ring_emit(ring, fb->pitches[0]); intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset | - i915_gem_object_get_tiling(obj)); + intel_fb_modifier_to_tiling(fb->modifier[0])); /* XXX Enabling the panel-fitter across page-flip is so far * untested on non-native modes, so ignore it for now. 
@@ -11476,7 +11488,8 @@ static int intel_gen6_queue_flip(struct drm_device *dev, intel_ring_emit(ring, MI_DISPLAY_FLIP | MI_DISPLAY_FLIP_PLANE(intel_crtc->plane)); - intel_ring_emit(ring, fb->pitches[0] | i915_gem_object_get_tiling(obj)); + intel_ring_emit(ring, fb->pitches[0] | + intel_fb_modifier_to_tiling(fb->modifier[0])); intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset); /* Contrary to the suggestions in the documentation, @@ -11579,7 +11592,8 @@ static int intel_gen7_queue_flip(struct drm_device *dev, } intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 | plane_bit); - intel_ring_emit(ring, fb->pitches[0] | i915_gem_object_get_tiling(obj)); + intel_ring_emit(ring, fb->pitches[0] | + intel_fb_modifier_to_tiling(fb->modifier[0])); intel_ring_emit(ring, intel_crtc->flip_work->gtt_offset); intel_ring_emit(ring, (MI_NOOP)); @@ -11664,15 +11678,13 @@ static void ilk_do_mmio_flip(struct intel_crtc *intel_crtc, { struct drm_device *dev = intel_crtc->base.dev; struct drm_i915_private *dev_priv = to_i915(dev); - struct intel_framebuffer *intel_fb = - to_intel_framebuffer(intel_crtc->base.primary->fb); - struct drm_i915_gem_object *obj = intel_fb->obj; + struct drm_framebuffer *fb = intel_crtc->base.primary->fb; i915_reg_t reg = DSPCNTR(intel_crtc->plane); u32 dspcntr; dspcntr = I915_READ(reg); - if (i915_gem_object_is_tiled(obj)) + if (fb->modifier[0] == I915_FORMAT_MOD_X_TILED) dspcntr |= DISPPLANE_TILED; else dspcntr &= ~DISPPLANE_TILED; @@ -11900,8 +11912,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc, if (IS_VALLEYVIEW(dev) || IS_CHERRYVIEW(dev)) { engine = &dev_priv->engine[BCS]; - if (i915_gem_object_get_tiling(obj) != - i915_gem_object_get_tiling(intel_fb_obj(work->old_fb))) + if (fb->modifier[0] != old_fb->modifier[0]) /* vlv: DISPLAY_FLIP fails to change tiling */ engine = NULL; } else if (IS_IVYBRIDGE(dev) || IS_HASWELL(dev)) { diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c index 67770c2ee183..ccab808ffc6b 100644 --- a/drivers/gpu/drm/i915/intel_sprite.c +++ b/drivers/gpu/drm/i915/intel_sprite.c @@ -360,7 +360,6 @@ vlv_update_plane(struct drm_plane *dplane, struct drm_i915_private *dev_priv = to_i915(dev); struct intel_plane *intel_plane = to_intel_plane(dplane); struct drm_framebuffer *fb = plane_state->base.fb; - struct drm_i915_gem_object *obj = intel_fb_obj(fb); int pipe = intel_plane->pipe; int plane = intel_plane->plane; u32 sprctl; @@ -427,7 +426,7 @@ vlv_update_plane(struct drm_plane *dplane, */ sprctl |= SP_GAMMA_ENABLE; - if (i915_gem_object_is_tiled(obj)) + if (fb->modifier[0] == I915_FORMAT_MOD_X_TILED) sprctl |= SP_TILED; /* Sizes are 0 based */ @@ -463,7 +462,7 @@ vlv_update_plane(struct drm_plane *dplane, I915_WRITE(SPSTRIDE(pipe, plane), fb->pitches[0]); I915_WRITE(SPPOS(pipe, plane), (crtc_y << 16) | crtc_x); - if (i915_gem_object_is_tiled(obj)) + if (fb->modifier[0] == I915_FORMAT_MOD_X_TILED) I915_WRITE(SPTILEOFF(pipe, plane), (y << 16) | x); else I915_WRITE(SPLINOFF(pipe, plane), linear_offset); @@ -501,7 +500,6 @@ ivb_update_plane(struct drm_plane *plane, struct drm_i915_private *dev_priv = to_i915(dev); struct intel_plane *intel_plane = to_intel_plane(plane); struct drm_framebuffer *fb = plane_state->base.fb; - struct drm_i915_gem_object *obj = intel_fb_obj(fb); enum pipe pipe = intel_plane->pipe; u32 sprctl, sprscale = 0; u32 sprsurf_offset, linear_offset; @@ -547,7 +545,7 @@ ivb_update_plane(struct drm_plane *plane, */ sprctl |= SPRITE_GAMMA_ENABLE; - if (i915_gem_object_is_tiled(obj)) + if (fb->modifier[0] 
== I915_FORMAT_MOD_X_TILED) sprctl |= SPRITE_TILED; if (IS_HASWELL(dev) || IS_BROADWELL(dev)) @@ -600,7 +598,7 @@ ivb_update_plane(struct drm_plane *plane, * register */ if (IS_HASWELL(dev) || IS_BROADWELL(dev)) I915_WRITE(SPROFFSET(pipe), (y << 16) | x); - else if (i915_gem_object_is_tiled(obj)) + else if (fb->modifier[0] == I915_FORMAT_MOD_X_TILED) I915_WRITE(SPRTILEOFF(pipe), (y << 16) | x); else I915_WRITE(SPRLINOFF(pipe), linear_offset); @@ -640,7 +638,6 @@ ilk_update_plane(struct drm_plane *plane, struct drm_i915_private *dev_priv = to_i915(dev); struct intel_plane *intel_plane = to_intel_plane(plane); struct drm_framebuffer *fb = plane_state->base.fb; - struct drm_i915_gem_object *obj = intel_fb_obj(fb); int pipe = intel_plane->pipe; u32 dvscntr, dvsscale; u32 dvssurf_offset, linear_offset; @@ -686,7 +683,7 @@ ilk_update_plane(struct drm_plane *plane, */ dvscntr |= DVS_GAMMA_ENABLE; - if (i915_gem_object_is_tiled(obj)) + if (fb->modifier[0] == I915_FORMAT_MOD_X_TILED) dvscntr |= DVS_TILED; if (IS_GEN6(dev)) @@ -728,7 +725,7 @@ ilk_update_plane(struct drm_plane *plane, I915_WRITE(DVSSTRIDE(pipe), fb->pitches[0]); I915_WRITE(DVSPOS(pipe), (crtc_y << 16) | crtc_x); - if (i915_gem_object_is_tiled(obj)) + if (fb->modifier[0] == I915_FORMAT_MOD_X_TILED) I915_WRITE(DVSTILEOFF(pipe), (y << 16) | x); else I915_WRITE(DVSLINOFF(pipe), linear_offset); From c2ff7370aee6ba750859d7008d05c68e873d7c7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 11 Feb 2016 19:16:37 +0200 Subject: [PATCH 039/141] drm/i915: Adjust obj tiling vs. fb modifier rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we require the object to be X tiled if the fb is X tiled. The argument is supposedly FBC GTT tracking. But actually that no longer holds water since FBC supports Y tiling as well on SKL+. A better rule IMO is to require that if there is a fence, the fb modifier match the object tiling mode. But if the object is linear, we can allow the fb modifier to be anything. The idea being that if the user set the tiling mode on the object, presumably the intention is to actually use the fence for CPU access. But if the tiling mode is not set, the user has no intention of using a fence (and can't actually since we disallow tiling mode changes when there are framebuffers associated with the object). On gen2/3 we must keep to the rule that the object and fb must be either both linear or both X tiled. No mixing allowed since the display engine itself will use the fence if it's present. v2: Fix typos v3: Rebase due to i915_gem_object_get_tiling() & co. 
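Expressed as code, the modifiers-case rule boils down to (condensed from
the addfb check below):

	unsigned int tiling = i915_gem_object_get_tiling(obj);

	/* fenced (tiled) object: the fb modifier must match the tiling */
	if (tiling != I915_TILING_NONE &&
	    tiling != intel_fb_modifier_to_tiling(mode_cmd->modifier[0]))
		return -EINVAL;

with gen2/3 additionally demanding an exact match even for linear
objects.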
Reviewed-by: Matthew Auld
Signed-off-by: Ville Syrjälä
Link: http://patchwork.freedesktop.org/patch/msgid/1470821001-25272-7-git-send-email-ville.syrjala@linux.intel.com
Reviewed-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_display.c | 31 +++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index b1d5fca5e14d..d7e30eea7f0a 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -15202,23 +15202,26 @@ static int intel_framebuffer_init(struct drm_device *dev,
 				  struct drm_i915_gem_object *obj)
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
+	unsigned int tiling = i915_gem_object_get_tiling(obj);
 	int ret;
 	u32 pitch_limit, stride_alignment;
 
 	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
 
 	if (mode_cmd->flags & DRM_MODE_FB_MODIFIERS) {
-		/* Enforce that fb modifier and tiling mode match, but only for
-		 * X-tiled. This is needed for FBC. */
-		if (!!(i915_gem_object_get_tiling(obj) == I915_TILING_X) !=
-		    !!(mode_cmd->modifier[0] == I915_FORMAT_MOD_X_TILED)) {
+		/*
+		 * If there's a fence, enforce that
+		 * the fb modifier and tiling mode match.
+		 */
+		if (tiling != I915_TILING_NONE &&
+		    tiling != intel_fb_modifier_to_tiling(mode_cmd->modifier[0])) {
 			DRM_DEBUG("tiling_mode doesn't match fb modifier\n");
 			return -EINVAL;
 		}
 	} else {
-		if (i915_gem_object_get_tiling(obj) == I915_TILING_X)
+		if (tiling == I915_TILING_X) {
 			mode_cmd->modifier[0] = I915_FORMAT_MOD_X_TILED;
-		else if (i915_gem_object_get_tiling(obj) == I915_TILING_Y) {
+		} else if (tiling == I915_TILING_Y) {
 			DRM_DEBUG("No Y tiling for legacy addfb\n");
 			return -EINVAL;
 		}
@@ -15242,6 +15245,16 @@ static int intel_framebuffer_init(struct drm_device *dev,
 		return -EINVAL;
 	}
 
+	/*
+	 * gen2/3 display engine uses the fence if present,
+	 * so the tiling mode must match the fb modifier exactly.
+	 */
+	if (INTEL_INFO(dev_priv)->gen < 4 &&
+	    tiling != intel_fb_modifier_to_tiling(mode_cmd->modifier[0])) {
+		DRM_DEBUG("tiling_mode must match fb modifier exactly on gen2/3\n");
+		return -EINVAL;
+	}
+
 	stride_alignment = intel_fb_stride_alignment(dev_priv,
 						     mode_cmd->modifier[0],
 						     mode_cmd->pixel_format);
@@ -15261,7 +15274,11 @@ static int intel_framebuffer_init(struct drm_device *dev,
 		return -EINVAL;
 	}
 
-	if (mode_cmd->modifier[0] == I915_FORMAT_MOD_X_TILED &&
+	/*
+	 * If there's a fence, enforce that
+	 * the fb pitch and fence stride match.
+	 */
+	if (tiling != I915_TILING_NONE &&
 	    mode_cmd->pitches[0] != i915_gem_object_get_stride(obj)) {
 		DRM_DEBUG("pitch (%d) must match tiling stride (%d)\n",
 			  mode_cmd->pitches[0],

From 60d5f2a45b5f714ad9a421d2f22c15e7beaef979 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?=
Date: Fri, 22 Jan 2016 18:41:24 +0200
Subject: [PATCH 040/141] drm/i915: Limit fb x offset due to fences
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If there's a fence on the object it will be aligned to the start of the
object, and hence CPU rendering to any fb that straddles the fence edge
will come out wrong due to lines wrapping at the wrong place. We have no
API to manage fences on a sub-object level, so we can't really fix this
in any way. Additionally, gen2/3 fences are rather coarse-grained, so
adjusting the offset might not even be possible. Avoid these problems by
requiring the fb layout to agree with the fence layout (if present).

v2: Rebase due to i915_gem_object_get_tiling() & co.
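A worked example of the check added below (illustrative numbers only):
with cpp = 4 and pitches[0] = 8192, each fenced row holds 2048 pixels, so
an fb plane whose offset yields x = 1536 with width = 1024 gives
(1536 + 1024) * 4 = 10240 > 8192, i.e. its lines would wrap across the
fence edge, and it is rejected:

	if (i915_gem_object_is_tiled(intel_fb->obj) &&
	    (x + width) * cpp > fb->pitches[i])
		return -EINVAL;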
Signed-off-by: Ville Syrjälä
Reviewed-by: Sivakumar Thulasimani
Link: http://patchwork.freedesktop.org/patch/msgid/1470821001-25272-8-git-send-email-ville.syrjala@linux.intel.com
---
 drivers/gpu/drm/i915/intel_display.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index d7e30eea7f0a..e9f065ff618f 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -2472,6 +2472,22 @@ intel_fill_fb_info(struct drm_i915_private *dev_priv,
 
 		intel_fb_offset_to_xy(&x, &y, fb, i);
 
+		/*
+		 * The fence (if used) is aligned to the start of the object
+		 * so having the framebuffer wrap around across the edge of the
+		 * fenced region doesn't really work. We have no API to configure
+		 * the fence start offset within the object (nor could we probably
+		 * on gen2/3). So it's just easier if we just require that the
+		 * fb layout agrees with the fence layout. We already check that the
+		 * fb stride matches the fence stride elsewhere.
+		 */
+		if (i915_gem_object_is_tiled(intel_fb->obj) &&
+		    (x + width) * cpp > fb->pitches[i]) {
+			DRM_DEBUG("bad fb plane %d offset: 0x%x\n",
+				  i, fb->offsets[i]);
+			return -EINVAL;
+		}
+
 		/*
 		 * First pixel of the framebuffer from
 		 * the start of the normal gtt mapping.

From b9b2403845f11cae20af5f6b5fd5cf7c6c9346ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?=
Date: Mon, 8 Feb 2016 18:28:00 +0200
Subject: [PATCH 041/141] drm/i915: Allow calling intel_adjust_tile_offset() multiple times
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Minimize the resulting X coordinate after intel_adjust_tile_offset() is
done with its offset adjustment. This allows calling
intel_adjust_tile_offset() multiple times in case we need to adjust the
offset several times.

Signed-off-by: Ville Syrjälä
Link: http://patchwork.freedesktop.org/patch/msgid/1470821001-25272-9-git-send-email-ville.syrjala@linux.intel.com
Reviewed-by: Chris Wilson
Reviewed-by: Daniel Vetter
---
 drivers/gpu/drm/i915/intel_display.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index e9f065ff618f..976159ab818c 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -2330,6 +2330,7 @@ static u32 intel_adjust_tile_offset(int *x, int *y,
 					u32 old_offset,
 					u32 new_offset)
 {
+	unsigned int pitch_pixels = pitch_tiles * tile_width;
 	unsigned int tiles;
 
 	WARN_ON(old_offset & (tile_size - 1));
@@ -2341,6 +2342,10 @@ static u32 intel_adjust_tile_offset(int *x, int *y,
 	*y += tiles / pitch_tiles * tile_height;
 	*x += tiles % pitch_tiles * tile_width;
 
+	/* minimize x in case it got needlessly big */
+	*y += *x / pitch_pixels * tile_height;
+	*x %= pitch_pixels;
+
 	return new_offset;
 }

From 66a2d927cb0eb6767529454316ac36511cdffdac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?=
Date: Fri, 5 Feb 2016 18:44:05 +0200
Subject: [PATCH 042/141] drm/i915: Make intel_adjust_tile_offset() work for linear buffers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To make life less surprising we can make intel_adjust_tile_offset()
deal with linear buffers as well. Currently it doesn't seem like there's
a real need for this, since only X tiling and NV12 (which would
currently always be tiled) should need it. But I've used it for some
debug hacks already, so it seems like a reasonable thing to have.
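For the linear case the adjustment is plain byte arithmetic; a worked
example (illustrative numbers, mirroring the code below) with
pitch = 8192 bytes, cpp = 4, x/y = (16, 2), old_offset = 12288 and
new_offset = 0:

	old_offset += *y * pitch + *x * cpp;	/* 12288 + 16384 + 64 = 28736 */
	*y = (old_offset - new_offset) / pitch;	/* 28736 / 8192 = 3 */
	*x = ((old_offset - new_offset) - *y * pitch) / cpp;	/* 4160 / 4 = 1040 */

so the same byte is still addressed, with the offset difference folded
into the x/y coordinates.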
Signed-off-by: Ville Syrjälä Reviewed-by: Sivakumar Thulasimani Link: http://patchwork.freedesktop.org/patch/msgid/1470821001-25272-10-git-send-email-ville.syrjala@linux.intel.com --- drivers/gpu/drm/i915/intel_display.c | 73 ++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 976159ab818c..06ce3911b8df 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2316,19 +2316,16 @@ void intel_add_fb_offsets(int *x, int *y, } /* - * Adjust the tile offset by moving the difference into - * the x/y offsets. - * * Input tile dimensions and pitch must already be * rotated to match x and y, and in pixel units. */ -static u32 intel_adjust_tile_offset(int *x, int *y, - unsigned int tile_width, - unsigned int tile_height, - unsigned int tile_size, - unsigned int pitch_tiles, - u32 old_offset, - u32 new_offset) +static u32 _intel_adjust_tile_offset(int *x, int *y, + unsigned int tile_width, + unsigned int tile_height, + unsigned int tile_size, + unsigned int pitch_tiles, + u32 old_offset, + u32 new_offset) { unsigned int pitch_pixels = pitch_tiles * tile_width; unsigned int tiles; @@ -2349,6 +2346,50 @@ static u32 intel_adjust_tile_offset(int *x, int *y, return new_offset; } +/* + * Adjust the tile offset by moving the difference into + * the x/y offsets. + */ +static u32 intel_adjust_tile_offset(int *x, int *y, + const struct intel_plane_state *state, int plane, + u32 old_offset, u32 new_offset) +{ + const struct drm_i915_private *dev_priv = to_i915(state->base.plane->dev); + const struct drm_framebuffer *fb = state->base.fb; + unsigned int cpp = drm_format_plane_cpp(fb->pixel_format, plane); + unsigned int rotation = state->base.rotation; + unsigned int pitch = intel_fb_pitch(fb, plane, rotation); + + WARN_ON(new_offset > old_offset); + + if (fb->modifier[plane] != DRM_FORMAT_MOD_NONE) { + unsigned int tile_size, tile_width, tile_height; + unsigned int pitch_tiles; + + tile_size = intel_tile_size(dev_priv); + intel_tile_dims(dev_priv, &tile_width, &tile_height, + fb->modifier[plane], cpp); + + if (intel_rotation_90_or_270(rotation)) { + pitch_tiles = pitch / tile_height; + swap(tile_width, tile_height); + } else { + pitch_tiles = pitch / (tile_width * cpp); + } + + _intel_adjust_tile_offset(x, y, tile_width, tile_height, + tile_size, pitch_tiles, + old_offset, new_offset); + } else { + old_offset += *y * pitch + *x * cpp; + + *y = (old_offset - new_offset) / pitch; + *x = ((old_offset - new_offset) - *y * pitch) / cpp; + } + + return new_offset; +} + /* * Computes the linear offset to the base tile and adjusts * x, y. bytes per pixel is assumed to be a power-of-two. @@ -2401,9 +2442,9 @@ static u32 _intel_compute_tile_offset(const struct drm_i915_private *dev_priv, offset = (tile_rows * pitch_tiles + tiles) * tile_size; offset_aligned = offset & ~alignment; - intel_adjust_tile_offset(x, y, tile_width, tile_height, - tile_size, pitch_tiles, - offset, offset_aligned); + _intel_adjust_tile_offset(x, y, tile_width, tile_height, + tile_size, pitch_tiles, + offset, offset_aligned); } else { offset = *y * pitch + *x * cpp; offset_aligned = offset & ~alignment; @@ -2550,9 +2591,9 @@ intel_fill_fb_info(struct drm_i915_private *dev_priv, * We only keep the x/y offsets, so push all of the * gtt offset into the x/y offsets. 
*/ - intel_adjust_tile_offset(&x, &y, tile_size, - tile_width, tile_height, pitch_tiles, - gtt_offset_rotated * tile_size, 0); + _intel_adjust_tile_offset(&x, &y, tile_size, + tile_width, tile_height, pitch_tiles, + gtt_offset_rotated * tile_size, 0); gtt_offset_rotated += rot_info->plane[i].width * rot_info->plane[i].height; From b63a16f6cd8911e74816a307f8078e29564eadc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 28 Jan 2016 16:53:54 +0200 Subject: [PATCH 043/141] drm/i915: Compute display surface offset in the plane check hook for SKL+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SKL has nasty limitations with the display surface offsets: * source x offset + width must be less than the stride for X tiled surfaces or the display engine falls over * the surface offset requires lots of alignment (256K or 1M) These facts mean that we can't just pick any suitably aligned tile boundary as the offset and expect the resulting x offset to be useable. The solution is to start with the closest boundary as before, but then keep searching backwards until we find one that works, or don't. This means we must be prepared to fail, hence the whole surface offset calculation needs to be moved to the .check_plane() hook from the .update_plane() hook. While at it we can check that the source width/height don't exceed maximum plane size limits. We'll store the results of the computation in the plane state to make it easy for the .update_plane() hook to do its thing. v2: Replace for+break loop with while loop Rebase due to drm_plane_state src/dst rects Rebase due to plane_check_state() Signed-off-by: Ville Syrjälä Reviewed-by: Sivakumar Thulasimani Link: http://patchwork.freedesktop.org/patch/msgid/1470821001-25272-11-git-send-email-ville.syrjala@linux.intel.com --- drivers/gpu/drm/i915/intel_display.c | 166 ++++++++++++++++++++++----- drivers/gpu/drm/i915/intel_drv.h | 6 + drivers/gpu/drm/i915/intel_sprite.c | 33 ++---- 3 files changed, 152 insertions(+), 53 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 06ce3911b8df..ffb74196322a 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2842,6 +2842,117 @@ valid_fb: &obj->frontbuffer_bits); } +static int skl_max_plane_width(const struct drm_framebuffer *fb, int plane, + unsigned int rotation) +{ + int cpp = drm_format_plane_cpp(fb->pixel_format, plane); + + switch (fb->modifier[plane]) { + case DRM_FORMAT_MOD_NONE: + case I915_FORMAT_MOD_X_TILED: + switch (cpp) { + case 8: + return 4096; + case 4: + case 2: + case 1: + return 8192; + default: + MISSING_CASE(cpp); + break; + } + break; + case I915_FORMAT_MOD_Y_TILED: + case I915_FORMAT_MOD_Yf_TILED: + switch (cpp) { + case 8: + return 2048; + case 4: + return 4096; + case 2: + case 1: + return 8192; + default: + MISSING_CASE(cpp); + break; + } + break; + default: + MISSING_CASE(fb->modifier[plane]); + } + + return 2048; +} + +static int skl_check_main_surface(struct intel_plane_state *plane_state) +{ + const struct drm_i915_private *dev_priv = to_i915(plane_state->base.plane->dev); + const struct drm_framebuffer *fb = plane_state->base.fb; + unsigned int rotation = plane_state->base.rotation; + int x = plane_state->src.x1 >> 16; + int y = plane_state->src.y1 >> 16; + int w = drm_rect_width(&plane_state->src) >> 16; + int h = drm_rect_height(&plane_state->src) >> 16; + int max_width = skl_max_plane_width(fb, 0, rotation); + int 
max_height = 4096; + u32 alignment, offset; + + if (w > max_width || h > max_height) { + DRM_DEBUG_KMS("requested Y/RGB source size %dx%d too big (limit %dx%d)\n", + w, h, max_width, max_height); + return -EINVAL; + } + + intel_add_fb_offsets(&x, &y, plane_state, 0); + offset = intel_compute_tile_offset(&x, &y, plane_state, 0); + + alignment = intel_surf_alignment(dev_priv, fb->modifier[0]); + + /* + * When using an X-tiled surface, the plane blows up + * if the x offset + width exceed the stride. + * + * TODO: linear and Y-tiled seem fine, Yf untested, + */ + if (fb->modifier[0] == I915_FORMAT_MOD_X_TILED) { + int cpp = drm_format_plane_cpp(fb->pixel_format, 0); + + while ((x + w) * cpp > fb->pitches[0]) { + if (offset == 0) { + DRM_DEBUG_KMS("Unable to find suitable display surface offset\n"); + return -EINVAL; + } + + offset = intel_adjust_tile_offset(&x, &y, plane_state, 0, + offset, offset - alignment); + } + } + + plane_state->main.offset = offset; + plane_state->main.x = x; + plane_state->main.y = y; + + return 0; +} + +int skl_check_plane_surface(struct intel_plane_state *plane_state) +{ + const struct drm_framebuffer *fb = plane_state->base.fb; + unsigned int rotation = plane_state->base.rotation; + int ret; + + /* Rotate src coordinates to match rotated GTT view */ + if (intel_rotation_90_or_270(rotation)) + drm_rect_rotate(&plane_state->src, + fb->width, fb->height, BIT(DRM_ROTATE_270)); + + ret = skl_check_main_surface(plane_state); + if (ret) + return ret; + + return 0; +} + static void i9xx_update_primary_plane(struct drm_plane *primary, const struct intel_crtc_state *crtc_state, const struct intel_plane_state *plane_state) @@ -3221,10 +3332,10 @@ static void skylake_update_primary_plane(struct drm_plane *plane, u32 plane_ctl; unsigned int rotation = plane_state->base.rotation; u32 stride = skl_plane_stride(fb, 0, rotation); - u32 surf_addr; + u32 surf_addr = plane_state->main.offset; int scaler_id = plane_state->scaler_id; - int src_x = plane_state->src.x1 >> 16; - int src_y = plane_state->src.y1 >> 16; + int src_x = plane_state->main.x; + int src_y = plane_state->main.y; int src_w = drm_rect_width(&plane_state->src) >> 16; int src_h = drm_rect_height(&plane_state->src) >> 16; int dst_x = plane_state->dst.x1; @@ -3241,26 +3352,6 @@ static void skylake_update_primary_plane(struct drm_plane *plane, plane_ctl |= PLANE_CTL_PLANE_GAMMA_DISABLE; plane_ctl |= skl_plane_ctl_rotation(rotation); - if (intel_rotation_90_or_270(rotation)) { - struct drm_rect r = { - .x1 = src_x, - .x2 = src_x + src_w, - .y1 = src_y, - .y2 = src_y + src_h, - }; - - /* Rotate src coordinates to match rotated GTT view */ - drm_rect_rotate(&r, fb->width, fb->height, BIT(DRM_ROTATE_270)); - - src_x = r.x1; - src_y = r.y1; - src_w = drm_rect_width(&r); - src_h = drm_rect_height(&r); - } - - intel_add_fb_offsets(&src_x, &src_y, plane_state, 0); - surf_addr = intel_compute_tile_offset(&src_x, &src_y, plane_state, 0); - /* Sizes are 0 based */ src_w--; src_h--; @@ -14419,13 +14510,15 @@ intel_check_primary_plane(struct drm_plane *plane, struct intel_crtc_state *crtc_state, struct intel_plane_state *state) { + struct drm_i915_private *dev_priv = to_i915(plane->dev); struct drm_crtc *crtc = state->base.crtc; struct drm_framebuffer *fb = state->base.fb; int min_scale = DRM_PLANE_HELPER_NO_SCALING; int max_scale = DRM_PLANE_HELPER_NO_SCALING; bool can_position = false; + int ret; - if (INTEL_INFO(plane->dev)->gen >= 9) { + if (INTEL_GEN(dev_priv) >= 9) { /* use scaler when colorkey is not required */ if 
(state->ckey.flags == I915_SET_COLORKEY_NONE) { min_scale = 1; @@ -14434,12 +14527,25 @@ intel_check_primary_plane(struct drm_plane *plane, can_position = true; } - return drm_plane_helper_check_update(plane, crtc, fb, &state->src, - &state->dst, &state->clip, - state->base.rotation, - min_scale, max_scale, - can_position, true, - &state->visible); + ret = drm_plane_helper_check_update(plane, crtc, fb, &state->src, + &state->dst, &state->clip, + state->base.rotation, + min_scale, max_scale, + can_position, true, + &state->visible); + if (ret) + return ret; + + if (!fb) + return 0; + + if (INTEL_GEN(dev_priv) >= 9) { + ret = skl_check_plane_surface(state); + if (ret) + return ret; + } + + return 0; } static void intel_begin_crtc_commit(struct drm_crtc *crtc, diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 9b27bd7f7283..2b980f7855d4 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -353,6 +353,11 @@ struct intel_plane_state { struct drm_rect clip; bool visible; + struct { + u32 offset; + int x, y; + } main; + /* * scaler_id * = -1 : not using a scaler @@ -1346,6 +1351,7 @@ u32 skl_plane_ctl_tiling(uint64_t fb_modifier); u32 skl_plane_ctl_rotation(unsigned int rotation); u32 skl_plane_stride(const struct drm_framebuffer *fb, int plane, unsigned int rotation); +int skl_check_plane_surface(struct intel_plane_state *plane_state); /* intel_csr.c */ void intel_csr_ucode_init(struct drm_i915_private *); diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c index ccab808ffc6b..c2e2c31dc972 100644 --- a/drivers/gpu/drm/i915/intel_sprite.c +++ b/drivers/gpu/drm/i915/intel_sprite.c @@ -207,15 +207,15 @@ skl_update_plane(struct drm_plane *drm_plane, const int plane = intel_plane->plane + 1; u32 plane_ctl; const struct drm_intel_sprite_colorkey *key = &plane_state->ckey; - u32 surf_addr; + u32 surf_addr = plane_state->main.offset; unsigned int rotation = plane_state->base.rotation; u32 stride = skl_plane_stride(fb, 0, rotation); int crtc_x = plane_state->dst.x1; int crtc_y = plane_state->dst.y1; uint32_t crtc_w = drm_rect_width(&plane_state->dst); uint32_t crtc_h = drm_rect_height(&plane_state->dst); - uint32_t x = plane_state->src.x1 >> 16; - uint32_t y = plane_state->src.y1 >> 16; + uint32_t x = plane_state->main.x; + uint32_t y = plane_state->main.y; uint32_t src_w = drm_rect_width(&plane_state->src) >> 16; uint32_t src_h = drm_rect_height(&plane_state->src) >> 16; @@ -239,26 +239,6 @@ skl_update_plane(struct drm_plane *drm_plane, else if (key->flags & I915_SET_COLORKEY_SOURCE) plane_ctl |= PLANE_CTL_KEY_ENABLE_SOURCE; - if (intel_rotation_90_or_270(rotation)) { - struct drm_rect r = { - .x1 = x, - .x2 = x + src_w, - .y1 = y, - .y2 = y + src_h, - }; - - /* Rotate src coordinates to match rotated GTT view */ - drm_rect_rotate(&r, fb->width, fb->height, BIT(DRM_ROTATE_270)); - - x = r.x1; - y = r.y1; - src_w = drm_rect_width(&r); - src_h = drm_rect_height(&r); - } - - intel_add_fb_offsets(&x, &y, plane_state, 0); - surf_addr = intel_compute_tile_offset(&x, &y, plane_state, 0); - /* Sizes are 0 based */ src_w--; src_h--; @@ -773,6 +753,7 @@ intel_check_sprite_plane(struct drm_plane *plane, int hscale, vscale; int max_scale, min_scale; bool can_scale; + int ret; if (!fb) { state->visible = false; @@ -927,6 +908,12 @@ intel_check_sprite_plane(struct drm_plane *plane, dst->y1 = crtc_y; dst->y2 = crtc_y + crtc_h; + if (INTEL_GEN(dev) >= 9) { + ret = skl_check_plane_surface(state); + if (ret) + return 
ret; + } + + return 0; } From 8d970654b767ebe8aeb524d30e27b37c0cb8eaed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 28 Jan 2016 16:30:28 +0200 Subject: [PATCH 044/141] drm/i915: Deal with NV12 CbCr plane AUX surface on SKL+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With NV12 we have two color planes to deal with so we must compute the surface and x/y offsets for the second plane as well. What makes this a bit nasty is that the hardware expects the surface offset to be specified as a distance from the main surface offset. What's worse, the distance must be non-negative (no neat wraparound or anything). So we must make sure that the main surface offset is always less than or equal to the AUX surface offset. We do that by computing the AUX offset first and the main surface offset second. If the main surface offset ends up being above the AUX offset, we just push it down as far as is required while still maintaining the required alignment etc. Fortunately the AUX offset only requires 4K alignment, so we don't need to do any of the backwards searching for an acceptable offset that we must do for the main surface. And X tiled + NV12 isn't a supported combination anyway. Note that this just computes aux surface offsets; we do not yet program them into the actual hardware registers, and hence we can't yet expose NV12. v2: Rebase due to drm_plane_state src/dst rects s/TODO.../something else/ in the commit message/ (Daniel) Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1470821001-25272-12-git-send-email-ville.syrjala@linux.intel.com Reviewed-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_display.c | 62 +++++++++++++++++++++++++++- drivers/gpu/drm/i915/intel_drv.h | 4 ++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index ffb74196322a..21ec74ea77d2 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2463,8 +2463,14 @@ u32 intel_compute_tile_offset(int *x, int *y, const struct drm_i915_private *dev_priv = to_i915(state->base.plane->dev); const struct drm_framebuffer *fb = state->base.fb; unsigned int rotation = state->base.rotation; - u32 alignment = intel_surf_alignment(dev_priv, fb->modifier[plane]); int pitch = intel_fb_pitch(fb, plane, rotation); + u32 alignment; + + /* AUX_DIST needs only 4K alignment */ + if (fb->pixel_format == DRM_FORMAT_NV12 && plane == 1) + alignment = 4096; + else + alignment = intel_surf_alignment(dev_priv, fb->modifier[plane]); return _intel_compute_tile_offset(dev_priv, x, y, fb, plane, pitch, rotation, alignment); @@ -2895,7 +2901,7 @@ static int skl_check_main_surface(struct intel_plane_state *plane_state) int h = drm_rect_height(&plane_state->src) >> 16; int max_width = skl_max_plane_width(fb, 0, rotation); int max_height = 4096; - u32 alignment, offset; + u32 alignment, offset, aux_offset = plane_state->aux.offset; if (w > max_width || h > max_height) { DRM_DEBUG_KMS("requested Y/RGB source size %dx%d too big (limit %dx%d)\n", @@ -2908,6 +2914,15 @@ static int skl_check_main_surface(struct intel_plane_state *plane_state) alignment = intel_surf_alignment(dev_priv, fb->modifier[0]); + /* + * AUX surface offset is specified as the distance from the + * main surface offset, and it must be non-negative. Make + * sure that is what we will get.
+ */ + if (offset > aux_offset) + offset = intel_adjust_tile_offset(&x, &y, plane_state, 0, + offset, aux_offset & ~(alignment - 1)); + /* * When using an X-tiled surface, the plane blows up * if the x offset + width exceed the stride. @@ -2935,6 +2950,35 @@ static int skl_check_main_surface(struct intel_plane_state *plane_state) return 0; } +static int skl_check_nv12_aux_surface(struct intel_plane_state *plane_state) +{ + const struct drm_framebuffer *fb = plane_state->base.fb; + unsigned int rotation = plane_state->base.rotation; + int max_width = skl_max_plane_width(fb, 1, rotation); + int max_height = 4096; + int x = plane_state->src.x1 >> 17; + int y = plane_state->src.y1 >> 17; + int w = drm_rect_width(&plane_state->src) >> 17; + int h = drm_rect_height(&plane_state->src) >> 17; + u32 offset; + + intel_add_fb_offsets(&x, &y, plane_state, 1); + offset = intel_compute_tile_offset(&x, &y, plane_state, 1); + + /* FIXME not quite sure how/if these apply to the chroma plane */ + if (w > max_width || h > max_height) { + DRM_DEBUG_KMS("CbCr source size %dx%d too big (limit %dx%d)\n", + w, h, max_width, max_height); + return -EINVAL; + } + + plane_state->aux.offset = offset; + plane_state->aux.x = x; + plane_state->aux.y = y; + + return 0; +} + int skl_check_plane_surface(struct intel_plane_state *plane_state) { const struct drm_framebuffer *fb = plane_state->base.fb; @@ -2946,6 +2990,20 @@ int skl_check_plane_surface(struct intel_plane_state *plane_state) drm_rect_rotate(&plane_state->src, fb->width, fb->height, BIT(DRM_ROTATE_270)); + /* + * Handle the AUX surface first since + * the main surface setup depends on it. + */ + if (fb->pixel_format == DRM_FORMAT_NV12) { + ret = skl_check_nv12_aux_surface(plane_state); + if (ret) + return ret; + } else { + plane_state->aux.offset = ~0xfff; + plane_state->aux.x = 0; + plane_state->aux.y = 0; + } + ret = skl_check_main_surface(plane_state); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 2b980f7855d4..bb34d5470cba 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -357,6 +357,10 @@ struct intel_plane_state { u32 offset; int x, y; } main; + struct { + u32 offset; + int x, y; + } aux; /* * scaler_id From d31d7cb1460c2942e23bfd0c0f4c8066b60353dc Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 12 Aug 2016 12:39:58 +0100 Subject: [PATCH 045/141] drm/i915: Support for creating write combined type vmaps vmap has a provision for controlling the page protection bits, which we can use to control the mapping type, e.g. WB, WC, UC or even WT. To allow the caller to choose their mapping type, we add a parameter to i915_gem_object_pin_map - but we still only allow one vmap to be cached per object. If the object is currently not pinned, then we recreate the previous vmap with the new access type, but if it was pinned we report an error. This effectively limits the access via i915_gem_object_pin_map to a single mapping type for the lifetime of the object. Not usually a problem, but something to be aware of when setting up the object's vmap. We will want to vary the access type to enable WC mappings of ringbuffer and context objects on !llc platforms, as well as other objects where we need coherent access to the GPU's pages without going through the GTT. v2: Remove the redundant braces around pin count check and fix the marker in documentation (Chris) v3: - Add a new enum for the vmalloc mapping type & pass that as an argument to i915_gem_object_pin_map.
(Tvrtko) - Use PAGE_MASK to extract or filter the mapping type info and remove a superfluous BUG_ON.(Tvrtko) v4: - Rename the enums and clean up the pin_map function. (Chris) v5: Drop the VM_NO_GUARD, minor cosmetics. Signed-off-by: Chris Wilson Signed-off-by: Akash Goel Reviewed-by: Tvrtko Ursulin Link: http://patchwork.freedesktop.org/patch/msgid/1471001999-17787-1-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 21 ++++++++- drivers/gpu/drm/i915/i915_gem.c | 58 ++++++++++++++++++++----- drivers/gpu/drm/i915/i915_gem_dmabuf.c | 2 +- drivers/gpu/drm/i915/intel_lrc.c | 8 ++-- drivers/gpu/drm/i915/intel_ringbuffer.c | 2 +- 5 files changed, 73 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 7971c76852df..654aabe76efc 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3144,13 +3144,20 @@ static inline void i915_gem_object_unpin_pages(struct drm_i915_gem_object *obj) obj->pages_pin_count--; } +enum i915_map_type { + I915_MAP_WB = 0, + I915_MAP_WC, +}; + /** * i915_gem_object_pin_map - return a contiguous mapping of the entire object * @obj - the object to map into kernel address space + * @type - the type of mapping, used to select pgprot_t * * Calls i915_gem_object_pin_pages() to prevent reaping of the object's * pages and then returns a contiguous mapping of the backing storage into - * the kernel address space. + * the kernel address space. Based on the @type of mapping, the PTE will be + * set to either WriteBack or WriteCombine (via pgprot_t). * * The caller must hold the struct_mutex, and is responsible for calling * i915_gem_object_unpin_map() when the mapping is no longer required. @@ -3158,7 +3165,8 @@ static inline void i915_gem_object_unpin_pages(struct drm_i915_gem_object *obj) * Returns the pointer through which to access the mapped object, or an * ERR_PTR() on error. 
*/ -void *__must_check i915_gem_object_pin_map(struct drm_i915_gem_object *obj); +void *__must_check i915_gem_object_pin_map(struct drm_i915_gem_object *obj, + enum i915_map_type type); /** * i915_gem_object_unpin_map - releases an earlier mapping @@ -3899,4 +3907,13 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req) return false; } +#define ptr_unpack_bits(ptr, bits) ({ \ + unsigned long __v = (unsigned long)(ptr); \ + (bits) = __v & ~PAGE_MASK; \ + (typeof(ptr))(__v & PAGE_MASK); \ +}) + +#define ptr_pack_bits(ptr, bits) \ + ((typeof(ptr))((unsigned long)(ptr) | (bits))) + #endif diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index e68b4c62be88..75a57e2da86e 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2077,6 +2077,7 @@ i915_gem_object_put_pages(struct drm_i915_gem_object *obj) list_del(&obj->global_list); if (obj->mapping) { + /* low bits are ignored by is_vmalloc_addr and kmap_to_page */ if (is_vmalloc_addr(obj->mapping)) vunmap(obj->mapping); else @@ -2253,7 +2254,8 @@ i915_gem_object_get_pages(struct drm_i915_gem_object *obj) } /* The 'mapping' part of i915_gem_object_pin_map() below */ -static void *i915_gem_object_map(const struct drm_i915_gem_object *obj) +static void *i915_gem_object_map(const struct drm_i915_gem_object *obj, + enum i915_map_type type) { unsigned long n_pages = obj->base.size >> PAGE_SHIFT; struct sg_table *sgt = obj->pages; @@ -2262,10 +2264,11 @@ static void *i915_gem_object_map(const struct drm_i915_gem_object *obj) struct page *stack_pages[32]; struct page **pages = stack_pages; unsigned long i = 0; + pgprot_t pgprot; void *addr; /* A single page can always be kmapped */ - if (n_pages == 1) + if (n_pages == 1 && type == I915_MAP_WB) return kmap(sg_page(sgt->sgl)); if (n_pages > ARRAY_SIZE(stack_pages)) { @@ -2281,7 +2284,15 @@ static void *i915_gem_object_map(const struct drm_i915_gem_object *obj) /* Check that we have the expected number of pages */ GEM_BUG_ON(i != n_pages); - addr = vmap(pages, n_pages, 0, PAGE_KERNEL); + switch (type) { + case I915_MAP_WB: + pgprot = PAGE_KERNEL; + break; + case I915_MAP_WC: + pgprot = pgprot_writecombine(PAGE_KERNEL_IO); + break; + } + addr = vmap(pages, n_pages, 0, pgprot); if (pages != stack_pages) drm_free_large(pages); @@ -2290,27 +2301,54 @@ static void *i915_gem_object_map(const struct drm_i915_gem_object *obj) } /* get, pin, and map the pages of the object into kernel space */ -void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj) +void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj, + enum i915_map_type type) { + enum i915_map_type has_type; + bool pinned; + void *ptr; int ret; lockdep_assert_held(&obj->base.dev->struct_mutex); + GEM_BUG_ON(!i915_gem_object_has_struct_page(obj)); ret = i915_gem_object_get_pages(obj); if (ret) return ERR_PTR(ret); i915_gem_object_pin_pages(obj); + pinned = obj->pages_pin_count > 1; - if (!obj->mapping) { - obj->mapping = i915_gem_object_map(obj); - if (!obj->mapping) { - i915_gem_object_unpin_pages(obj); - return ERR_PTR(-ENOMEM); + ptr = ptr_unpack_bits(obj->mapping, has_type); + if (ptr && has_type != type) { + if (pinned) { + ret = -EBUSY; + goto err; } + + if (is_vmalloc_addr(ptr)) + vunmap(ptr); + else + kunmap(kmap_to_page(ptr)); + + ptr = obj->mapping = NULL; } - return obj->mapping; + if (!ptr) { + ptr = i915_gem_object_map(obj, type); + if (!ptr) { + ret = -ENOMEM; + goto err; + } + + obj->mapping = ptr_pack_bits(ptr, type); + } + + return ptr; + 
+err: + i915_gem_object_unpin_pages(obj); + return ERR_PTR(ret); } static void diff --git a/drivers/gpu/drm/i915/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/i915_gem_dmabuf.c index c60a8d5bbad0..10265bb35604 100644 --- a/drivers/gpu/drm/i915/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/i915_gem_dmabuf.c @@ -119,7 +119,7 @@ static void *i915_gem_dmabuf_vmap(struct dma_buf *dma_buf) if (ret) return ERR_PTR(ret); - addr = i915_gem_object_pin_map(obj); + addr = i915_gem_object_pin_map(obj, I915_MAP_WB); mutex_unlock(&dev->struct_mutex); return addr; diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index c7f4b64b16f6..c24ac39d51f6 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -780,7 +780,7 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx, if (ret) goto err; - vaddr = i915_gem_object_pin_map(ce->state); + vaddr = i915_gem_object_pin_map(ce->state, I915_MAP_WB); if (IS_ERR(vaddr)) { ret = PTR_ERR(vaddr); goto unpin_ctx_obj; @@ -1755,7 +1755,7 @@ lrc_setup_hws(struct intel_engine_cs *engine, /* The HWSP is part of the default context object in LRC mode. */ engine->status_page.gfx_addr = i915_gem_obj_ggtt_offset(dctx_obj) + LRC_PPHWSP_PN * PAGE_SIZE; - hws = i915_gem_object_pin_map(dctx_obj); + hws = i915_gem_object_pin_map(dctx_obj, I915_MAP_WB); if (IS_ERR(hws)) return PTR_ERR(hws); engine->status_page.page_addr = hws + LRC_PPHWSP_PN * PAGE_SIZE; @@ -1968,7 +1968,7 @@ populate_lr_context(struct i915_gem_context *ctx, return ret; } - vaddr = i915_gem_object_pin_map(ctx_obj); + vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); if (IS_ERR(vaddr)) { ret = PTR_ERR(vaddr); DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret); @@ -2189,7 +2189,7 @@ void intel_lr_context_reset(struct drm_i915_private *dev_priv, if (!ctx_obj) continue; - vaddr = i915_gem_object_pin_map(ctx_obj); + vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); if (WARN_ON(IS_ERR(vaddr))) continue; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index ed19868df9c6..75a9f635c3dc 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1951,7 +1951,7 @@ int intel_ring_pin(struct intel_ring *ring) if (ret) goto err_unpin; - addr = i915_gem_object_pin_map(obj); + addr = i915_gem_object_pin_map(obj, I915_MAP_WB); if (IS_ERR(addr)) { ret = PTR_ERR(addr); goto err_unpin; From 0b1de5d58e1972a88cfa5d1ea709a7b16934783a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 12 Aug 2016 12:39:59 +0100 Subject: [PATCH 046/141] drm/i915: Use SSE4.1 movntdqa to accelerate reads from WC memory This patch provides the infrastructure for performing a 16-byte aligned read from WC memory using non-temporal instructions introduced with sse4.1. Using movntdqa we can bypass the CPU caches and read directly from memory, ignoring the page attributes set on the CPU PTE, i.e. negating the impact of an otherwise UC access. Copying using movntdqa from WC is almost as fast as reading from WB memory, modulo the possibility of both hitting the CPU cache or leaving the data in the CPU cache for the next consumer. (The CPU cache itself may be flushed for the region of the movntdqa and on later access the movntdqa reads from a separate internal buffer for the cacheline.) The write back to the memory is however cached. This will be used in later patches to accelerate accessing WC memory. v2: Report whether the accelerated copy is successful/possible.
v3: Function alignment override was only necessary when using the function target("sse4.1") - which is not necessary for emitting movntdqa from __asm__. v4: Improve notes on CPU cache behaviour vs non-temporal stores. v5: Fix byte offsets for unrolled moves. v6: Find all remaining typos of "movntqda", use kernel_fpu_begin. Signed-off-by: Chris Wilson Cc: Akash Goel Cc: Damien Lespiau Cc: Mika Kuoppala Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: http://patchwork.freedesktop.org/patch/msgid/1471001999-17787-2-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/Makefile | 3 + drivers/gpu/drm/i915/i915_drv.c | 2 + drivers/gpu/drm/i915/i915_drv.h | 3 + drivers/gpu/drm/i915/i915_memcpy.c | 101 +++++++++++++++++++++++++++++ 4 files changed, 109 insertions(+) create mode 100644 drivers/gpu/drm/i915/i915_memcpy.c diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index dda724f04445..3412413408c0 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -3,12 +3,15 @@ # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher. subdir-ccflags-$(CONFIG_DRM_I915_WERROR) := -Werror +subdir-ccflags-y += \ + $(call as-instr,movntdqa (%eax)$(comma)%xmm0,-DCONFIG_AS_MOVNTDQA) # Please keep these build lists sorted! # core driver code i915-y := i915_drv.o \ i915_irq.o \ + i915_memcpy.o \ i915_params.o \ i915_pci.o \ i915_suspend.o \ diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 0fcd1c0f67bb..9b9ea8d3ecdc 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -827,6 +827,8 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv, mutex_init(&dev_priv->wm.wm_mutex); mutex_init(&dev_priv->pps_mutex); + i915_memcpy_init_early(dev_priv); + ret = i915_workqueues_init(dev_priv); if (ret < 0) return ret; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 654aabe76efc..bf193ba1574e 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3907,6 +3907,9 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req) return false; } +void i915_memcpy_init_early(struct drm_i915_private *dev_priv); +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len); + #define ptr_unpack_bits(ptr, bits) ({ \ unsigned long __v = (unsigned long)(ptr); \ (bits) = __v & ~PAGE_MASK; \ diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c new file mode 100644 index 000000000000..6f1df0ec8a81 --- /dev/null +++ b/drivers/gpu/drm/i915/i915_memcpy.c @@ -0,0 +1,101 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <linux/kernel.h> +#include <asm/fpu/api.h> + +#include "i915_drv.h" + +DEFINE_STATIC_KEY_FALSE(has_movntdqa); + +#ifdef CONFIG_AS_MOVNTDQA +static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len) +{ + kernel_fpu_begin(); + + len >>= 4; + while (len >= 4) { + asm("movntdqa (%0), %%xmm0\n" + "movntdqa 16(%0), %%xmm1\n" + "movntdqa 32(%0), %%xmm2\n" + "movntdqa 48(%0), %%xmm3\n" + "movaps %%xmm0, (%1)\n" + "movaps %%xmm1, 16(%1)\n" + "movaps %%xmm2, 32(%1)\n" + "movaps %%xmm3, 48(%1)\n" + :: "r" (src), "r" (dst) : "memory"); + src += 64; + dst += 64; + len -= 4; + } + while (len--) { + asm("movntdqa (%0), %%xmm0\n" + "movaps %%xmm0, (%1)\n" + :: "r" (src), "r" (dst) : "memory"); + src += 16; + dst += 16; + } + + kernel_fpu_end(); +} +#endif + +/** + * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC + * @dst: destination pointer + * @src: source pointer + * @len: how many bytes to copy + * + * i915_memcpy_from_wc copies @len bytes from @src to @dst using + * non-temporal instructions where available. Note that all arguments + * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple + * of 16. + * + * To test whether accelerated reads from WC are supported, use + * i915_memcpy_from_wc(NULL, NULL, 0); + * + * Returns true if the copy was successful, false if the preconditions + * are not met. + */ +bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len) +{ + if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15)) + return false; + +#ifdef CONFIG_AS_MOVNTDQA + if (static_branch_likely(&has_movntdqa)) { + if (likely(len)) + __memcpy_ntdqa(dst, src, len); + return true; + } +#endif + + return false; +} + +void i915_memcpy_init_early(struct drm_i915_private *dev_priv) +{ + if (static_cpu_has(X86_FEATURE_XMM4_1)) + static_branch_enable(&has_movntdqa); +} From 35a9611ca0e719557d9a198cb01021b4003198f2 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sun, 14 Aug 2016 18:44:40 +0100 Subject: [PATCH 047/141] drm/i915: Initialize return value for empty i915_gem_object_unbind() If the obj->vma_list is empty, we immediately return ret. However, we are doing so having never set it to any value; it should be zero!
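As an aside for anyone wanting to experiment with the non-temporal copy from the SSE4.1 patch above outside the kernel, a minimal, hedged userspace sketch of the same streaming-load loop follows. The helper name copy16_ntdqa is illustrative only and not part of the i915 API; it assumes an x86-64 GCC-style toolchain, SSE4.1-capable hardware and 16-byte-aligned buffers, and unlike __memcpy_ntdqa() it needs no kernel_fpu_begin()/end(), but it must declare the xmm clobber itself:

#include <stdio.h>

/* One streaming 16-byte load per iteration, mirroring the tail loop
 * of __memcpy_ntdqa(): movntdqa bypasses the cache hierarchy on the
 * load side, movaps performs an ordinary (cached) aligned store.
 */
static void copy16_ntdqa(void *dst, const void *src, unsigned long len)
{
	for (len >>= 4; len; len--) {
		__asm__("movntdqa (%0), %%xmm0\n"
			"movaps %%xmm0, (%1)\n"
			:: "r" (src), "r" (dst)
			: "memory", "xmm0");
		src = (const char *)src + 16;
		dst = (char *)dst + 16;
	}
}

int main(void)
{
	_Alignas(16) char src[32] = "0123456789abcdef0123456789abcde";
	_Alignas(16) char dst[32] = { 0 };

	copy16_ntdqa(dst, src, sizeof(src));
	printf("%s\n", dst); /* prints the copied string */
	return 0;
}

A real user would also gate the fast path on a runtime CPUID test for SSE4.1, which is what i915_memcpy_init_early() does via static_cpu_has(X86_FEATURE_XMM4_1).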
Reported-by: Matthew Auld References: https://bugs.freedesktop.org/show_bug.cgi?id=97343 Fixes: aa653a685d81 ("drm/i915: Be more careful when unbinding vma") Signed-off-by: Chris Wilson Cc: Matthew Auld Link: http://patchwork.freedesktop.org/patch/msgid/1471196681-30043-1-git-send-email-chris@chris-wilson.co.uk Reviewed-by: Matthew Auld --- drivers/gpu/drm/i915/i915_gem.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 75a57e2da86e..ac706f31af7d 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -279,12 +279,11 @@ static const struct drm_i915_gem_object_ops i915_gem_phys_ops = { .release = i915_gem_object_release_phys, }; -int -i915_gem_object_unbind(struct drm_i915_gem_object *obj) +int i915_gem_object_unbind(struct drm_i915_gem_object *obj) { struct i915_vma *vma; LIST_HEAD(still_in_list); - int ret; + int ret = 0; /* The vma will only be freed if it is marked as closed, and if we wait * upon rendering to the vma, we may unbind anything in the list. From 02bef8f98d26e9774b49c78a42b29b168e8876fc Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sun, 14 Aug 2016 18:44:41 +0100 Subject: [PATCH 048/141] drm/i915: Unbind closed vma for i915_gem_object_unbind() Closed vma are removed from the obj->vma_list so that they cannot be found by userspace. However, this means that when forcibly unbinding an object, we have to wait upon all rendering to that object first in order for the closed, but active, vma to be reaped and their bindings removed. Reported-by: Matthew Auld Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=97343 Fixes: aa653a685d81 ("drm/i915: Be more careful when unbinding vma") Fixes: 8a3b3d576c93 ("drm/i915: Convert non-blocking userptr waits...") Signed-off-by: Chris Wilson Cc: Matthew Auld Link: http://patchwork.freedesktop.org/patch/msgid/1471196681-30043-2-git-send-email-chris@chris-wilson.co.uk Reviewed-by: Matthew Auld Tested-by: Matthew Auld --- drivers/gpu/drm/i915/i915_gem.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index ac706f31af7d..3b16e92e9b51 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -283,11 +283,21 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj) { struct i915_vma *vma; LIST_HEAD(still_in_list); - int ret = 0; + int ret; - /* The vma will only be freed if it is marked as closed, and if we wait - * upon rendering to the vma, we may unbind anything in the list. + lockdep_assert_held(&obj->base.dev->struct_mutex); + + /* Closed vma are removed from the obj->vma_list - but they may + * still have an active binding on the object. To remove those we + * must wait for all rendering to complete to the object (as unbinding + * must anyway), and retire the requests. */ + ret = i915_gem_object_wait_rendering(obj, false); + if (ret) + return ret; + + i915_gem_retire_requests(to_i915(obj->base.dev)); + while ((vma = list_first_entry_or_null(&obj->vma_list, struct i915_vma, obj_link))) { From 7466c291b1d2fa38e3140717c857c86e0eac1b6b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 09:49:33 +0100 Subject: [PATCH 049/141] drm/i915: Show RPS autotuning thresholds along with waitboost For convenience when debugging user issues, show the autotuning RPS parameters in debugfs/i915_rps_boost_info.
v2: Refine the presentation v3: Style Signed-off-by: Chris Wilson Cc: fritsch@kodi.tv Link: http://patchwork.freedesktop.org/patch/msgid/1471181336-27523-1-git-send-email-chris@chris-wilson.co.uk Reviewed-by: David Weinehall Link: http://patchwork.freedesktop.org/patch/msgid/1471250973-31277-1-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 48 +++++++++++++++++++++++++++-- drivers/gpu/drm/i915/i915_reg.h | 7 +++-- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index c461072da142..a7ee7d6c8a66 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2441,6 +2441,20 @@ static int count_irq_waiters(struct drm_i915_private *i915) return count; } +static const char *rps_power_to_str(unsigned int power) +{ + static const char * const strings[] = { + [LOW_POWER] = "low power", + [BETWEEN] = "mixed", + [HIGH_POWER] = "high power", + }; + + if (power >= ARRAY_SIZE(strings) || !strings[power]) + return "unknown"; + + return strings[power]; +} + static int i915_rps_boost_info(struct seq_file *m, void *data) { struct drm_info_node *node = m->private; @@ -2452,12 +2466,17 @@ static int i915_rps_boost_info(struct seq_file *m, void *data) seq_printf(m, "GPU busy? %s [%x]\n", yesno(dev_priv->gt.awake), dev_priv->gt.active_engines); seq_printf(m, "CPU waiting? %d\n", count_irq_waiters(dev_priv)); - seq_printf(m, "Frequency requested %d; min hard:%d, soft:%d; max soft:%d, hard:%d\n", - intel_gpu_freq(dev_priv, dev_priv->rps.cur_freq), + seq_printf(m, "Frequency requested %d\n", + intel_gpu_freq(dev_priv, dev_priv->rps.cur_freq)); + seq_printf(m, " min hard:%d, soft:%d; max soft:%d, hard:%d\n", intel_gpu_freq(dev_priv, dev_priv->rps.min_freq), intel_gpu_freq(dev_priv, dev_priv->rps.min_freq_softlimit), intel_gpu_freq(dev_priv, dev_priv->rps.max_freq_softlimit), intel_gpu_freq(dev_priv, dev_priv->rps.max_freq)); + seq_printf(m, " idle:%d, efficient:%d, boost:%d\n", + intel_gpu_freq(dev_priv, dev_priv->rps.idle_freq), + intel_gpu_freq(dev_priv, dev_priv->rps.efficient_freq), + intel_gpu_freq(dev_priv, dev_priv->rps.boost_freq)); mutex_lock(&dev->filelist_mutex); spin_lock(&dev_priv->rps.client_lock); @@ -2478,6 +2497,31 @@ static int i915_rps_boost_info(struct seq_file *m, void *data) spin_unlock(&dev_priv->rps.client_lock); mutex_unlock(&dev->filelist_mutex); + if (INTEL_GEN(dev_priv) >= 6 && + dev_priv->rps.enabled && + dev_priv->gt.active_engines) { + u32 rpup, rpupei; + u32 rpdown, rpdownei; + + intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); + rpup = I915_READ_FW(GEN6_RP_CUR_UP) & GEN6_RP_EI_MASK; + rpupei = I915_READ_FW(GEN6_RP_CUR_UP_EI) & GEN6_RP_EI_MASK; + rpdown = I915_READ_FW(GEN6_RP_CUR_DOWN) & GEN6_RP_EI_MASK; + rpdownei = I915_READ_FW(GEN6_RP_CUR_DOWN_EI) & GEN6_RP_EI_MASK; + intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); + + seq_printf(m, "\nRPS Autotuning (current \"%s\" window):\n", + rps_power_to_str(dev_priv->rps.power)); + seq_printf(m, " Avg. up: %d%% [above threshold? %d%%]\n", + 100 * rpup / rpupei, + dev_priv->rps.up_threshold); + seq_printf(m, " Avg. down: %d%% [below threshold? 
%d%%]\n", + 100 * rpdown / rpdownei, + dev_priv->rps.down_threshold); + } else { + seq_puts(m, "\nRPS Autotuning inactive\n"); + } + return 0; } diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index da82744f3a3b..d4adf2806c50 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -7036,12 +7036,13 @@ enum { #define GEN6_RP_UP_THRESHOLD _MMIO(0xA02C) #define GEN6_RP_DOWN_THRESHOLD _MMIO(0xA030) #define GEN6_RP_CUR_UP_EI _MMIO(0xA050) -#define GEN6_CURICONT_MASK 0xffffff +#define GEN6_RP_EI_MASK 0xffffff +#define GEN6_CURICONT_MASK GEN6_RP_EI_MASK #define GEN6_RP_CUR_UP _MMIO(0xA054) -#define GEN6_CURBSYTAVG_MASK 0xffffff +#define GEN6_CURBSYTAVG_MASK GEN6_RP_EI_MASK #define GEN6_RP_PREV_UP _MMIO(0xA058) #define GEN6_RP_CUR_DOWN_EI _MMIO(0xA05C) -#define GEN6_CURIAVG_MASK 0xffffff +#define GEN6_CURIAVG_MASK GEN6_RP_EI_MASK #define GEN6_RP_CUR_DOWN _MMIO(0xA060) #define GEN6_RP_PREV_DOWN _MMIO(0xA064) #define GEN6_RP_UP_EI _MMIO(0xA068) From d045446df196e3af341aa7623de5d21049ec4f67 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:40 +0100 Subject: [PATCH 050/141] drm/i915: Record the position of the start of the request Not only does it make for good documentation and debugging aide, but it is also vital for when we want to unwind requests - such as when throwing away an incomplete request. Signed-off-by: Chris Wilson Link: http://patchwork.freedesktop.org/patch/msgid/1470414607-32453-2-git-send-email-arun.siluvery@linux.intel.com Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-1-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/i915_gem_request.c | 13 +++++++++---- drivers/gpu/drm/i915/i915_gpu_error.c | 6 ++++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index bf193ba1574e..b1017950087b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -557,6 +557,7 @@ struct drm_i915_error_state { struct drm_i915_error_request { long jiffies; u32 seqno; + u32 head; u32 tail; } *requests; diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index b764c1d440c8..8a9e9bfeea09 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -426,6 +426,13 @@ i915_gem_request_alloc(struct intel_engine_cs *engine, if (ret) goto err_ctx; + /* Record the position of the start of the request so that + * should we detect the updated seqno part-way through the + * GPU processing the request, we never over-estimate the + * position of the head. + */ + req->head = req->ring->tail; + return req; err_ctx: @@ -500,8 +507,6 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches) trace_i915_gem_request_add(request); - request->head = request_start; - /* Seal the request and mark it as pending execution. Note that * we may inspect this state, without holding any locks, during * hangcheck. 
Hence we apply the barrier to ensure that we do not @@ -514,10 +519,10 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches) list_add_tail(&request->link, &engine->request_list); list_add_tail(&request->ring_link, &ring->request_list); - /* Record the position of the start of the request so that + /* Record the position of the start of the breadcrumb so that * should we detect the updated seqno part-way through the * GPU processing the request, we never over-estimate the - * position of the head. + * position of the ring's HEAD. */ request->postfix = ring->tail; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index eecb87063c88..d54848f5f246 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -455,9 +455,10 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, dev_priv->engine[i].name, ee->num_requests); for (j = 0; j < ee->num_requests; j++) { - err_printf(m, " seqno 0x%08x, emitted %ld, tail 0x%08x\n", + err_printf(m, " seqno 0x%08x, emitted %ld, head 0x%08x, tail 0x%08x\n", ee->requests[j].seqno, ee->requests[j].jiffies, + ee->requests[j].head, ee->requests[j].tail); } } @@ -1205,7 +1206,8 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, erq = &ee->requests[count++]; erq->seqno = request->fence.seqno; erq->jiffies = request->emitted_jiffies; - erq->tail = request->postfix; + erq->head = request->head; + erq->tail = request->tail; } } } From c0ce4663611e09182fdb81b980f6eaa5560b5b1d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:41 +0100 Subject: [PATCH 051/141] drm/i915: Reduce amount of duplicate buffer information captured on error When capturing the error state, we do not need to know about every address space - just those that are related to the error. We know which context is active at the time, therefore we know which VMs are implicated in the error. We can then restrict the VMs which we report to the relevant subset.
v2: s/i/count_active/ (and similar) Rewrite label generation for "Buffers" Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-2-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 9 +- drivers/gpu/drm/i915/i915_gpu_error.c | 216 ++++++++++++-------------- 2 files changed, 101 insertions(+), 124 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index b1017950087b..7eb911e47904 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -517,6 +517,7 @@ struct drm_i915_error_state { int num_waiters; int hangcheck_score; enum intel_engine_hangcheck_action hangcheck_action; + struct i915_address_space *vm; int num_requests; /* our own tracking of ring head and tail */ @@ -587,17 +588,15 @@ struct drm_i915_error_state { u32 read_domains; u32 write_domain; s32 fence_reg:I915_MAX_NUM_FENCE_BITS; - s32 pinned:2; u32 tiling:2; u32 dirty:1; u32 purgeable:1; u32 userptr:1; s32 engine:4; u32 cache_level:3; - } **active_bo, **pinned_bo; - - u32 *active_bo_count, *pinned_bo_count; - u32 vm_count; + } *active_bo[I915_NUM_ENGINES], *pinned_bo; + u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count; + struct i915_address_space *active_vm[I915_NUM_ENGINES]; }; struct intel_connector; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index d54848f5f246..1c098fa65fbe 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -42,16 +42,6 @@ static const char *engine_str(int engine) } } -static const char *pin_flag(int pinned) -{ - if (pinned > 0) - return " P"; - else if (pinned < 0) - return " p"; - else - return ""; -} - static const char *tiling_flag(int tiling) { switch (tiling) { @@ -189,7 +179,7 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m, { int i; - err_printf(m, " %s [%d]:\n", name, count); + err_printf(m, "%s [%d]:\n", name, count); while (count--) { err_printf(m, " %08x_%08x %8u %02x %02x [ ", @@ -202,7 +192,6 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m, err_printf(m, "%02x ", err->rseqno[i]); err_printf(m, "] %02x", err->wseqno); - err_puts(m, pin_flag(err->pinned)); err_puts(m, tiling_flag(err->tiling)); err_puts(m, dirty_flag(err->dirty)); err_puts(m, purgeable_flag(err->purgeable)); @@ -414,18 +403,33 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, error_print_engine(m, &error->engine[i]); } - for (i = 0; i < error->vm_count; i++) { - err_printf(m, "vm[%d]\n", i); + for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) { + char buf[128]; + int len, first = 1; - print_error_buffers(m, "Active", + if (!error->active_vm[i]) + break; + + len = scnprintf(buf, sizeof(buf), "Active ("); + for (j = 0; j < ARRAY_SIZE(error->engine); j++) { + if (error->engine[j].vm != error->active_vm[i]) + continue; + + len += scnprintf(buf + len, sizeof(buf), "%s%s", + first ? 
"" : ", ", + dev_priv->engine[j].name); + first = 0; + } + scnprintf(buf + len, sizeof(buf), ")"); + print_error_buffers(m, buf, error->active_bo[i], error->active_bo_count[i]); - - print_error_buffers(m, "Pinned", - error->pinned_bo[i], - error->pinned_bo_count[i]); } + print_error_buffers(m, "Pinned (global)", + error->pinned_bo, + error->pinned_bo_count); + for (i = 0; i < ARRAY_SIZE(error->engine); i++) { struct drm_i915_error_engine *ee = &error->engine[i]; @@ -627,13 +631,10 @@ static void i915_error_state_free(struct kref *error_ref) i915_error_object_free(error->semaphore_obj); - for (i = 0; i < error->vm_count; i++) + for (i = 0; i < ARRAY_SIZE(error->active_bo); i++) kfree(error->active_bo[i]); - - kfree(error->active_bo); - kfree(error->active_bo_count); kfree(error->pinned_bo); - kfree(error->pinned_bo_count); + kfree(error->overlay); kfree(error->display); kfree(error); @@ -779,9 +780,6 @@ static void capture_bo(struct drm_i915_error_buffer *err, err->read_domains = obj->base.read_domains; err->write_domain = obj->base.write_domain; err->fence_reg = obj->fence_reg; - err->pinned = 0; - if (i915_gem_obj_is_pinned(obj)) - err->pinned = 1; err->tiling = i915_gem_object_get_tiling(obj); err->dirty = obj->dirty; err->purgeable = obj->madv != I915_MADV_WILLNEED; @@ -789,13 +787,17 @@ static void capture_bo(struct drm_i915_error_buffer *err, err->cache_level = obj->cache_level; } -static u32 capture_active_bo(struct drm_i915_error_buffer *err, - int count, struct list_head *head) +static u32 capture_error_bo(struct drm_i915_error_buffer *err, + int count, struct list_head *head, + bool pinned_only) { struct i915_vma *vma; int i = 0; list_for_each_entry(vma, head, vm_link) { + if (pinned_only && !i915_vma_is_pinned(vma)) + continue; + capture_bo(err++, vma); if (++i == count) break; @@ -804,28 +806,6 @@ static u32 capture_active_bo(struct drm_i915_error_buffer *err, return i; } -static u32 capture_pinned_bo(struct drm_i915_error_buffer *err, - int count, struct list_head *head, - struct i915_address_space *vm) -{ - struct drm_i915_gem_object *obj; - struct drm_i915_error_buffer * const first = err; - struct drm_i915_error_buffer * const last = err + count; - - list_for_each_entry(obj, head, global_list) { - struct i915_vma *vma; - - if (err == last) - break; - - list_for_each_entry(vma, &obj->vma_list, obj_link) - if (vma->vm == vm && i915_vma_is_pinned(vma)) - capture_bo(err++, vma); - } - - return err - first; -} - /* Generate a semi-unique error code. The code is not meant to have meaning, The * code's only purpose is to try to prevent false duplicated bug reports by * grossly estimating a GPU error state. @@ -1063,7 +1043,6 @@ static void error_record_engine_registers(struct drm_i915_error_state *error, } } - static void i915_gem_record_active_context(struct intel_engine_cs *engine, struct drm_i915_error_state *error, struct drm_i915_error_engine *ee) @@ -1116,10 +1095,9 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, request = i915_gem_find_active_request(engine); if (request) { - struct i915_address_space *vm; struct intel_ring *ring; - vm = request->ctx->ppgtt ? + ee->vm = request->ctx->ppgtt ? 
&request->ctx->ppgtt->base : &ggtt->base; /* We need to copy these to an anonymous buffer @@ -1129,7 +1107,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, ee->batchbuffer = i915_error_object_create(dev_priv, request->batch_obj, - vm); + ee->vm); if (HAS_BROKEN_CS_TLB(dev_priv)) ee->wa_batchbuffer = @@ -1212,89 +1190,88 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, } } -/* FIXME: Since pin count/bound list is global, we duplicate what we capture per - * VM. - */ static void i915_gem_capture_vm(struct drm_i915_private *dev_priv, struct drm_i915_error_state *error, struct i915_address_space *vm, - const int ndx) + int idx) { - struct drm_i915_error_buffer *active_bo = NULL, *pinned_bo = NULL; - struct drm_i915_gem_object *obj; + struct drm_i915_error_buffer *active_bo; struct i915_vma *vma; - int i; + int count; - i = 0; + count = 0; list_for_each_entry(vma, &vm->active_list, vm_link) - i++; - error->active_bo_count[ndx] = i; - - list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) { - list_for_each_entry(vma, &obj->vma_list, obj_link) - if (vma->vm == vm && i915_vma_is_pinned(vma)) - i++; - } - error->pinned_bo_count[ndx] = i - error->active_bo_count[ndx]; - - if (i) { - active_bo = kcalloc(i, sizeof(*active_bo), GFP_ATOMIC); - if (active_bo) - pinned_bo = active_bo + error->active_bo_count[ndx]; - } + count++; + active_bo = NULL; + if (count) + active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC); if (active_bo) - error->active_bo_count[ndx] = - capture_active_bo(active_bo, - error->active_bo_count[ndx], - &vm->active_list); + count = capture_error_bo(active_bo, count, &vm->active_list, false); + else + count = 0; - if (pinned_bo) - error->pinned_bo_count[ndx] = - capture_pinned_bo(pinned_bo, - error->pinned_bo_count[ndx], - &dev_priv->mm.bound_list, vm); - error->active_bo[ndx] = active_bo; - error->pinned_bo[ndx] = pinned_bo; + error->active_vm[idx] = vm; + error->active_bo[idx] = active_bo; + error->active_bo_count[idx] = count; } -static void i915_gem_capture_buffers(struct drm_i915_private *dev_priv, - struct drm_i915_error_state *error) +static void i915_capture_active_buffers(struct drm_i915_private *dev_priv, + struct drm_i915_error_state *error) { - struct i915_address_space *vm; - int cnt = 0, i = 0; + int cnt = 0, i, j; - list_for_each_entry(vm, &dev_priv->vm_list, global_link) - cnt++; + BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo)); + BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm)); + BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count)); - error->active_bo = kcalloc(cnt, sizeof(*error->active_bo), GFP_ATOMIC); - error->pinned_bo = kcalloc(cnt, sizeof(*error->pinned_bo), GFP_ATOMIC); - error->active_bo_count = kcalloc(cnt, sizeof(*error->active_bo_count), - GFP_ATOMIC); - error->pinned_bo_count = kcalloc(cnt, sizeof(*error->pinned_bo_count), - GFP_ATOMIC); + /* Scan each engine looking for unique active contexts/vm */ + for (i = 0; i < ARRAY_SIZE(error->engine); i++) { + struct drm_i915_error_engine *ee = &error->engine[i]; + bool found; - if (error->active_bo == NULL || - error->pinned_bo == NULL || - error->active_bo_count == NULL || - error->pinned_bo_count == NULL) { - kfree(error->active_bo); - kfree(error->active_bo_count); - kfree(error->pinned_bo); - kfree(error->pinned_bo_count); + if (!ee->vm) + continue; - error->active_bo = NULL; - error->active_bo_count = NULL; - error->pinned_bo = NULL; - error->pinned_bo_count = NULL; 
- } else { - list_for_each_entry(vm, &dev_priv->vm_list, global_link) - i915_gem_capture_vm(dev_priv, error, vm, i++); - - error->vm_count = cnt; + found = false; + for (j = 0; j < i && !found; j++) + found = error->engine[j].vm == ee->vm; + if (!found) + i915_gem_capture_vm(dev_priv, error, ee->vm, cnt++); } } +static void i915_capture_pinned_buffers(struct drm_i915_private *dev_priv, + struct drm_i915_error_state *error) +{ + struct i915_address_space *vm = &dev_priv->ggtt.base; + struct drm_i915_error_buffer *bo; + struct i915_vma *vma; + int count_inactive, count_active; + + count_inactive = 0; + list_for_each_entry(vma, &vm->active_list, vm_link) + count_inactive++; + + count_active = 0; + list_for_each_entry(vma, &vm->inactive_list, vm_link) + count_active++; + + bo = NULL; + if (count_inactive + count_active) + bo = kcalloc(count_inactive + count_active, + sizeof(*bo), GFP_ATOMIC); + if (!bo) + return; + + count_inactive = capture_error_bo(bo, count_inactive, + &vm->active_list, true); + count_active = capture_error_bo(bo + count_inactive, count_active, + &vm->inactive_list, true); + error->pinned_bo_count = count_inactive + count_active; + error->pinned_bo = bo; +} + /* Capture all registers which don't fit into another category. */ static void i915_capture_reg_state(struct drm_i915_private *dev_priv, struct drm_i915_error_state *error) @@ -1438,9 +1415,10 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, i915_capture_gen_state(dev_priv, error); i915_capture_reg_state(dev_priv, error); - i915_gem_capture_buffers(dev_priv, error); i915_gem_record_fences(dev_priv, error); i915_gem_record_rings(dev_priv, error); + i915_capture_active_buffers(dev_priv, error); + i915_capture_pinned_buffers(dev_priv, error); do_gettimeofday(&error->time); From 546b1b6a40999b6fb23cb45aaffa72d02076306c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:42 +0100 Subject: [PATCH 052/141] drm/i915: Store the active context object on all engines upon error With execlists, we have context objects everywhere, not just RCS. So store them for post-mortem debugging. This also has a secondary effect of removing one more unsafe list iteration by using preserved state from the hanging request. And now we can cross-reference the request's context state with that loaded by the GPU.
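The capture path above builds on the per-engine VM tracking added by the duplicate-buffer patch earlier in the series: each engine notes the vm that was active at the hang, and each distinct vm is captured exactly once. A standalone, hedged sketch of that deduplication scan (hypothetical types and names, not kernel code):

#include <stdio.h>

#define NUM_ENGINES 5

/* Stand-in for drm_i915_error_engine: just the vm pointer we key on. */
struct engine_error { const void *vm; };

static void capture_vm(const void *vm, int idx)
{
	printf("capturing vm %p as active_vm[%d]\n", vm, idx);
}

/* Mirrors i915_capture_active_buffers(): for each engine with a vm,
 * scan the engines already visited and capture only unseen vms.
 */
static void capture_unique_vms(const struct engine_error *ee, int n)
{
	int count = 0;

	for (int i = 0; i < n; i++) {
		int found = 0;

		if (!ee[i].vm)
			continue;

		for (int j = 0; j < i && !found; j++)
			found = ee[j].vm == ee[i].vm;
		if (!found)
			capture_vm(ee[i].vm, count++);
	}
}

int main(void)
{
	int a, b;
	const struct engine_error ee[NUM_ENGINES] = {
		{ &a }, { &b }, { &a }, { NULL }, { &b },
	};

	capture_unique_vms(ee, NUM_ENGINES); /* captures &a, &b once each */
	return 0;
}

The quadratic scan is fine here: with only a handful of engines it is cheaper and simpler than any auxiliary data structure, especially in the error-capture path where allocations must be GFP_ATOMIC.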
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-3-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gpu_error.c | 28 ++++----------------------- 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 1c098fa65fbe..d11630bac188 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1043,28 +1043,6 @@ static void error_record_engine_registers(struct drm_i915_error_state *error, } } -static void i915_gem_record_active_context(struct intel_engine_cs *engine, - struct drm_i915_error_state *error, - struct drm_i915_error_engine *ee) -{ - struct drm_i915_private *dev_priv = engine->i915; - struct drm_i915_gem_object *obj; - - /* Currently render ring is the only HW context user */ - if (engine->id != RCS || !error->ccid) - return; - - list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) { - if (!i915_gem_obj_ggtt_bound(obj)) - continue; - - if ((error->ccid & PAGE_MASK) == i915_gem_obj_ggtt_offset(obj)) { - ee->ctx = i915_error_ggtt_object_create(dev_priv, obj); - break; - } - } -} - static void i915_gem_record_rings(struct drm_i915_private *dev_priv, struct drm_i915_error_state *error) { @@ -1114,6 +1092,10 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, i915_error_ggtt_object_create(dev_priv, engine->scratch.obj); + ee->ctx = + i915_error_ggtt_object_create(dev_priv, + request->ctx->engine[i].state); + if (request->pid) { struct task_struct *task; @@ -1144,8 +1126,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, ee->wa_ctx = i915_error_ggtt_object_create(dev_priv, engine->wa_ctx.obj); - i915_gem_record_active_context(engine, error, ee); - count = 0; list_for_each_entry(request, &engine->request_list, link) count++; From 61fb00d6ea1289215331e6547773eab885f5d9fa Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:43 +0100 Subject: [PATCH 053/141] drm/i915: Remove inactive/active list from debugfs These two files (i915_gem_active, i915_gem_inactive) no longer give pertinent information since active/inactive tracking is per-vm and so we need the information per-vm. They are obsolete so remove them. 
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-4-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 49 ----------------------------- 1 file changed, 49 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index a7ee7d6c8a66..470512d5e5d7 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -210,53 +210,6 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj) seq_printf(m, " (frontbuffer: 0x%03x)", frontbuffer_bits); } -static int i915_gem_object_list_info(struct seq_file *m, void *data) -{ - struct drm_info_node *node = m->private; - uintptr_t list = (uintptr_t) node->info_ent->data; - struct list_head *head; - struct drm_device *dev = node->minor->dev; - struct drm_i915_private *dev_priv = to_i915(dev); - struct i915_ggtt *ggtt = &dev_priv->ggtt; - struct i915_vma *vma; - u64 total_obj_size, total_gtt_size; - int count, ret; - - ret = mutex_lock_interruptible(&dev->struct_mutex); - if (ret) - return ret; - - /* FIXME: the user of this interface might want more than just GGTT */ - switch (list) { - case ACTIVE_LIST: - seq_puts(m, "Active:\n"); - head = &ggtt->base.active_list; - break; - case INACTIVE_LIST: - seq_puts(m, "Inactive:\n"); - head = &ggtt->base.inactive_list; - break; - default: - mutex_unlock(&dev->struct_mutex); - return -EINVAL; - } - - total_obj_size = total_gtt_size = count = 0; - list_for_each_entry(vma, head, vm_link) { - seq_printf(m, " "); - describe_obj(m, vma->obj); - seq_printf(m, "\n"); - total_obj_size += vma->obj->base.size; - total_gtt_size += vma->node.size; - count++; - } - mutex_unlock(&dev->struct_mutex); - - seq_printf(m, "Total %d objects, %llu bytes, %llu GTT size\n", - count, total_obj_size, total_gtt_size); - return 0; -} - static int obj_rank_by_stolen(void *priv, struct list_head *A, struct list_head *B) { @@ -5420,8 +5373,6 @@ static const struct drm_info_list i915_debugfs_list[] = { {"i915_gem_objects", i915_gem_object_info, 0}, {"i915_gem_gtt", i915_gem_gtt_info, 0}, {"i915_gem_pinned", i915_gem_gtt_info, 0, (void *) PINNED_LIST}, - {"i915_gem_active", i915_gem_object_list_info, 0, (void *) ACTIVE_LIST}, - {"i915_gem_inactive", i915_gem_object_list_info, 0, (void *) INACTIVE_LIST}, {"i915_gem_stolen", i915_gem_stolen_list_info }, {"i915_gem_pageflip", i915_gem_pageflip_info, 0}, {"i915_gem_request", i915_gem_request_info, 0}, From 6da8482936c75a3d31cb730642171284042f3199 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:44 +0100 Subject: [PATCH 054/141] drm/i915: Focus debugfs/i915_gem_pinned to show only display pins Only those objects pinned to the display have semi-permanent pins of a global nature (other pins are transient within their local vm). Simplify i915_gem_pinned to only show the pertinent information about the pinned objects within the GGTT. 
v2: i915_gem_gtt_info is still shared with debugfs/i915_gem_gtt, rename i915_gem_pinned to i915_gem_pin_display to better reflect its contents Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-5-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 470512d5e5d7..859dc0e032f9 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -40,12 +40,6 @@ #include <drm/i915_drm.h> #include "i915_drv.h" -enum { - ACTIVE_LIST, - INACTIVE_LIST, - PINNED_LIST, -}; - /* As the drm_debugfs_init() routines are called before dev->dev_private is * allocated we need to hook into the minor for release. */ static int @@ -537,8 +531,8 @@ static int i915_gem_gtt_info(struct seq_file *m, void *data) { struct drm_info_node *node = m->private; struct drm_device *dev = node->minor->dev; - uintptr_t list = (uintptr_t) node->info_ent->data; struct drm_i915_private *dev_priv = to_i915(dev); + bool show_pin_display_only = !!data; struct drm_i915_gem_object *obj; u64 total_obj_size, total_gtt_size; int count, ret; @@ -549,7 +543,7 @@ static int i915_gem_gtt_info(struct seq_file *m, void *data) total_obj_size = total_gtt_size = count = 0; list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) { - if (list == PINNED_LIST && !i915_gem_obj_is_pinned(obj)) + if (show_pin_display_only && !obj->pin_display) continue; seq_puts(m, " "); @@ -5372,7 +5366,7 @@ static const struct drm_info_list i915_debugfs_list[] = { {"i915_capabilities", i915_capabilities, 0}, {"i915_gem_objects", i915_gem_object_info, 0}, {"i915_gem_gtt", i915_gem_gtt_info, 0}, - {"i915_gem_pinned", i915_gem_gtt_info, 0, (void *) PINNED_LIST}, + {"i915_gem_pin_display", i915_gem_gtt_info, 0, (void *)1}, {"i915_gem_stolen", i915_gem_stolen_list_info }, {"i915_gem_pageflip", i915_gem_pageflip_info, 0}, {"i915_gem_request", i915_gem_request_info, 0}, From 2bd160a131ac617fc2441bfb4a02964c964a5da6 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:45 +0100 Subject: [PATCH 055/141] drm/i915: Reduce i915_gem_objects to only show object information Knowing how much of the GTT (both mappable aperture and beyond) is in use is no longer relevant, and the output clutters the real information - that is, how many objects are allocated and bound (and by whom) - so that we can quickly grasp if there is a leak. v2: Relent, and rename pinned to indicate display only. Since the display objects are semi-static and are of variable size, they are the interesting objects to watch over time for aperture leaking. The other pins are either static (such as the scratch page) or very short lived (such as execbuf) and not part of the precious GGTT.
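The reworked i915_gem_object_info() in the diff that follows is a single pass over each object list, bucketing every object into running totals. A hedged, self-contained sketch of that aggregation pattern (hypothetical struct and fields, not the driver's types):

#include <stdio.h>
#include <stddef.h>

/* Stand-in object record mirroring the fields the new statistics
 * aggregate: size, a purgeable (DONTNEED) flag, a kernel mapping and
 * a display pin.
 */
struct obj {
	unsigned long long size;
	unsigned purgeable:1, mapped:1, pin_display:1;
};

int main(void)
{
	const struct obj objs[] = {
		{ 4096, .pin_display = 1 },
		{ 8192, .purgeable = 1 },
		{ 4096, .mapped = 1, .purgeable = 1 },
	};
	unsigned long long size = 0, purgeable = 0, mapped = 0, dpy = 0;
	unsigned int count = 0;
	size_t i;

	/* One pass, one (count, bytes) bucket per statistic, as in the
	 * reworked i915_gem_object_info().
	 */
	for (i = 0; i < sizeof(objs) / sizeof(objs[0]); i++) {
		size += objs[i].size;
		count++;
		if (objs[i].purgeable)
			purgeable += objs[i].size;
		if (objs[i].mapped)
			mapped += objs[i].size;
		if (objs[i].pin_display)
			dpy += objs[i].size;
	}

	printf("%u objects, %llu bytes (%llu purgeable, %llu mapped, %llu display)\n",
	       count, size, purgeable, mapped, dpy);
	return 0;
}

Keeping each statistic as an independent bucket keyed off a flag is what lets the debugfs output stay at one summary line per category.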
Signed-off-by: Chris Wilson Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-6-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 114 ++++-------- drivers/gpu/drm/i915/i915_drv.h | 249 +++++++++++++------------- drivers/gpu/drm/i915/i915_gpu_error.c | 15 ++ 3 files changed, 175 insertions(+), 203 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 859dc0e032f9..4cb4c7aa8163 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -269,17 +269,6 @@ static int i915_gem_stolen_list_info(struct seq_file *m, void *data) return 0; } -#define count_objects(list, member) do { \ - list_for_each_entry(obj, list, member) { \ - size += i915_gem_obj_total_ggtt_size(obj); \ - ++count; \ - if (obj->map_and_fenceable) { \ - mappable_size += i915_gem_obj_ggtt_size(obj); \ - ++mappable_count; \ - } \ - } \ -} while (0) - struct file_stats { struct drm_i915_file_private *file_priv; unsigned long count; @@ -394,30 +383,16 @@ static void print_context_stats(struct seq_file *m, print_file_stats(m, "[k]contexts", stats); } -#define count_vmas(list, member) do { \ - list_for_each_entry(vma, list, member) { \ - size += i915_gem_obj_total_ggtt_size(vma->obj); \ - ++count; \ - if (vma->obj->map_and_fenceable) { \ - mappable_size += i915_gem_obj_ggtt_size(vma->obj); \ - ++mappable_count; \ - } \ - } \ -} while (0) - static int i915_gem_object_info(struct seq_file *m, void* data) { struct drm_info_node *node = m->private; struct drm_device *dev = node->minor->dev; struct drm_i915_private *dev_priv = to_i915(dev); struct i915_ggtt *ggtt = &dev_priv->ggtt; - u32 count, mappable_count, purgeable_count; - u64 size, mappable_size, purgeable_size; - unsigned long pin_mapped_count = 0, pin_mapped_purgeable_count = 0; - u64 pin_mapped_size = 0, pin_mapped_purgeable_size = 0; + u32 count, mapped_count, purgeable_count, dpy_count; + u64 size, mapped_size, purgeable_size, dpy_size; struct drm_i915_gem_object *obj; struct drm_file *file; - struct i915_vma *vma; int ret; ret = mutex_lock_interruptible(&dev->struct_mutex); @@ -428,70 +403,51 @@ static int i915_gem_object_info(struct seq_file *m, void* data) dev_priv->mm.object_count, dev_priv->mm.object_memory); - size = count = mappable_size = mappable_count = 0; - count_objects(&dev_priv->mm.bound_list, global_list); - seq_printf(m, "%u [%u] objects, %llu [%llu] bytes in gtt\n", - count, mappable_count, size, mappable_size); - - size = count = mappable_size = mappable_count = 0; - count_vmas(&ggtt->base.active_list, vm_link); - seq_printf(m, " %u [%u] active objects, %llu [%llu] bytes\n", - count, mappable_count, size, mappable_size); - - size = count = mappable_size = mappable_count = 0; - count_vmas(&ggtt->base.inactive_list, vm_link); - seq_printf(m, " %u [%u] inactive objects, %llu [%llu] bytes\n", - count, mappable_count, size, mappable_size); - size = count = purgeable_size = purgeable_count = 0; list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_list) { - size += obj->base.size, ++count; - if (obj->madv == I915_MADV_DONTNEED) - purgeable_size += obj->base.size, ++purgeable_count; - if (obj->mapping) { - pin_mapped_count++; - pin_mapped_size += obj->base.size; - if (obj->pages_pin_count == 0) { - pin_mapped_purgeable_count++; - pin_mapped_purgeable_size += obj->base.size; - } - } - } - seq_printf(m, "%u unbound objects, %llu bytes\n", count, size); + size += obj->base.size; + ++count; - size = count = mappable_size = 
mappable_count = 0; - list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) { - if (obj->fault_mappable) { - size += i915_gem_obj_ggtt_size(obj); - ++count; - } - if (obj->pin_display) { - mappable_size += i915_gem_obj_ggtt_size(obj); - ++mappable_count; - } if (obj->madv == I915_MADV_DONTNEED) { purgeable_size += obj->base.size; ++purgeable_count; } + if (obj->mapping) { - pin_mapped_count++; - pin_mapped_size += obj->base.size; - if (obj->pages_pin_count == 0) { - pin_mapped_purgeable_count++; - pin_mapped_purgeable_size += obj->base.size; - } + mapped_count++; + mapped_size += obj->base.size; } } + seq_printf(m, "%u unbound objects, %llu bytes\n", count, size); + + size = count = dpy_size = dpy_count = 0; + list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) { + size += obj->base.size; + ++count; + + if (obj->pin_display) { + dpy_size += obj->base.size; + ++dpy_count; + } + + if (obj->madv == I915_MADV_DONTNEED) { + purgeable_size += obj->base.size; + ++purgeable_count; + } + + if (obj->mapping) { + mapped_count++; + mapped_size += obj->base.size; + } + } + seq_printf(m, "%u bound objects, %llu bytes\n", + count, size); seq_printf(m, "%u purgeable objects, %llu bytes\n", purgeable_count, purgeable_size); - seq_printf(m, "%u pinned mappable objects, %llu bytes\n", - mappable_count, mappable_size); - seq_printf(m, "%u fault mappable objects, %llu bytes\n", - count, size); - seq_printf(m, - "%lu [%lu] pin mapped objects, %llu [%llu] bytes [purgeable]\n", - pin_mapped_count, pin_mapped_purgeable_count, - pin_mapped_size, pin_mapped_purgeable_size); + seq_printf(m, "%u mapped objects, %llu bytes\n", + mapped_count, mapped_size); + seq_printf(m, "%u display objects (pinned), %llu bytes\n", + dpy_count, dpy_size); seq_printf(m, "%llu [%llu] gtt total\n", ggtt->base.total, ggtt->mappable_end - ggtt->base.start); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 7eb911e47904..25b1e6c010d5 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -475,130 +475,6 @@ struct sdvo_device_mapping { u8 ddc_pin; }; -struct intel_display_error_state; - -struct drm_i915_error_state { - struct kref ref; - struct timeval time; - - char error_msg[128]; - bool simulated; - int iommu; - u32 reset_count; - u32 suspend_count; - - /* Generic register state */ - u32 eir; - u32 pgtbl_er; - u32 ier; - u32 gtier[4]; - u32 ccid; - u32 derrmr; - u32 forcewake; - u32 error; /* gen6+ */ - u32 err_int; /* gen7 */ - u32 fault_data0; /* gen8, gen9 */ - u32 fault_data1; /* gen8, gen9 */ - u32 done_reg; - u32 gac_eco; - u32 gam_ecochk; - u32 gab_ctl; - u32 gfx_mode; - u32 extra_instdone[I915_NUM_INSTDONE_REG]; - u64 fence[I915_MAX_NUM_FENCES]; - struct intel_overlay_error_state *overlay; - struct intel_display_error_state *display; - struct drm_i915_error_object *semaphore_obj; - - struct drm_i915_error_engine { - int engine_id; - /* Software tracked state */ - bool waiting; - int num_waiters; - int hangcheck_score; - enum intel_engine_hangcheck_action hangcheck_action; - struct i915_address_space *vm; - int num_requests; - - /* our own tracking of ring head and tail */ - u32 cpu_ring_head; - u32 cpu_ring_tail; - - u32 last_seqno; - u32 semaphore_seqno[I915_NUM_ENGINES - 1]; - - /* Register state */ - u32 start; - u32 tail; - u32 head; - u32 ctl; - u32 hws; - u32 ipeir; - u32 ipehr; - u32 instdone; - u32 bbstate; - u32 instpm; - u32 instps; - u32 seqno; - u64 bbaddr; - u64 acthd; - u32 fault_reg; - u64 faddr; - u32 rc_psmi; /* sleep state */ 
- u32 semaphore_mboxes[I915_NUM_ENGINES - 1]; - - struct drm_i915_error_object { - int page_count; - u64 gtt_offset; - u32 *pages[0]; - } *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page; - - struct drm_i915_error_object *wa_ctx; - - struct drm_i915_error_request { - long jiffies; - u32 seqno; - u32 head; - u32 tail; - } *requests; - - struct drm_i915_error_waiter { - char comm[TASK_COMM_LEN]; - pid_t pid; - u32 seqno; - } *waiters; - - struct { - u32 gfx_mode; - union { - u64 pdp[4]; - u32 pp_dir_base; - }; - } vm_info; - - pid_t pid; - char comm[TASK_COMM_LEN]; - } engine[I915_NUM_ENGINES]; - - struct drm_i915_error_buffer { - u32 size; - u32 name; - u32 rseqno[I915_NUM_ENGINES], wseqno; - u64 gtt_offset; - u32 read_domains; - u32 write_domain; - s32 fence_reg:I915_MAX_NUM_FENCE_BITS; - u32 tiling:2; - u32 dirty:1; - u32 purgeable:1; - u32 userptr:1; - s32 engine:4; - u32 cache_level:3; - } *active_bo[I915_NUM_ENGINES], *pinned_bo; - u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count; - struct i915_address_space *active_vm[I915_NUM_ENGINES]; -}; - struct intel_connector; struct intel_encoder; struct intel_crtc_state; @@ -823,6 +699,131 @@ struct intel_device_info { #undef DEFINE_FLAG #undef SEP_SEMICOLON +struct intel_display_error_state; + +struct drm_i915_error_state { + struct kref ref; + struct timeval time; + + char error_msg[128]; + bool simulated; + int iommu; + u32 reset_count; + u32 suspend_count; + struct intel_device_info device_info; + + /* Generic register state */ + u32 eir; + u32 pgtbl_er; + u32 ier; + u32 gtier[4]; + u32 ccid; + u32 derrmr; + u32 forcewake; + u32 error; /* gen6+ */ + u32 err_int; /* gen7 */ + u32 fault_data0; /* gen8, gen9 */ + u32 fault_data1; /* gen8, gen9 */ + u32 done_reg; + u32 gac_eco; + u32 gam_ecochk; + u32 gab_ctl; + u32 gfx_mode; + u32 extra_instdone[I915_NUM_INSTDONE_REG]; + u64 fence[I915_MAX_NUM_FENCES]; + struct intel_overlay_error_state *overlay; + struct intel_display_error_state *display; + struct drm_i915_error_object *semaphore_obj; + + struct drm_i915_error_engine { + int engine_id; + /* Software tracked state */ + bool waiting; + int num_waiters; + int hangcheck_score; + enum intel_engine_hangcheck_action hangcheck_action; + struct i915_address_space *vm; + int num_requests; + + /* our own tracking of ring head and tail */ + u32 cpu_ring_head; + u32 cpu_ring_tail; + + u32 last_seqno; + u32 semaphore_seqno[I915_NUM_ENGINES - 1]; + + /* Register state */ + u32 start; + u32 tail; + u32 head; + u32 ctl; + u32 hws; + u32 ipeir; + u32 ipehr; + u32 instdone; + u32 bbstate; + u32 instpm; + u32 instps; + u32 seqno; + u64 bbaddr; + u64 acthd; + u32 fault_reg; + u64 faddr; + u32 rc_psmi; /* sleep state */ + u32 semaphore_mboxes[I915_NUM_ENGINES - 1]; + + struct drm_i915_error_object { + int page_count; + u64 gtt_offset; + u32 *pages[0]; + } *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page; + + struct drm_i915_error_object *wa_ctx; + + struct drm_i915_error_request { + long jiffies; + u32 seqno; + u32 head; + u32 tail; + } *requests; + + struct drm_i915_error_waiter { + char comm[TASK_COMM_LEN]; + pid_t pid; + u32 seqno; + } *waiters; + + struct { + u32 gfx_mode; + union { + u64 pdp[4]; + u32 pp_dir_base; + }; + } vm_info; + + pid_t pid; + char comm[TASK_COMM_LEN]; + } engine[I915_NUM_ENGINES]; + + struct drm_i915_error_buffer { + u32 size; + u32 name; + u32 rseqno[I915_NUM_ENGINES], wseqno; + u64 gtt_offset; + u32 read_domains; + u32 write_domain; + s32 fence_reg:I915_MAX_NUM_FENCE_BITS; + u32 tiling:2; + u32 dirty:1; + 
u32 purgeable:1; + u32 userptr:1; + s32 engine:4; + u32 cache_level:3; + } *active_bo[I915_NUM_ENGINES], *pinned_bo; + u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count; + struct i915_address_space *active_vm[I915_NUM_ENGINES]; +}; + enum i915_cache_level { I915_CACHE_NONE = 0, I915_CACHE_LLC, /* also used for snoopable memory on non-LLC */ diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index d11630bac188..9ba71dd4a312 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -312,6 +312,16 @@ static void print_error_obj(struct drm_i915_error_state_buf *m, } } +static void err_print_capabilities(struct drm_i915_error_state_buf *m, + const struct intel_device_info *info) +{ +#define PRINT_FLAG(x) err_printf(m, #x ": %s\n", yesno(info->x)) +#define SEP_SEMICOLON ; + DEV_INFO_FOR_EACH_FLAG(PRINT_FLAG, SEP_SEMICOLON); +#undef PRINT_FLAG +#undef SEP_SEMICOLON +} + int i915_error_state_to_str(struct drm_i915_error_state_buf *m, const struct i915_error_state_file_priv *error_priv) { @@ -331,6 +341,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec, error->time.tv_usec); err_printf(m, "Kernel: " UTS_RELEASE "\n"); + err_print_capabilities(m, &error->device_info); max_hangcheck_score = 0; for (i = 0; i < ARRAY_SIZE(error->engine); i++) { if (error->engine[i].hangcheck_score > max_hangcheck_score) @@ -1362,6 +1373,10 @@ static void i915_capture_gen_state(struct drm_i915_private *dev_priv, #endif error->reset_count = i915_reset_count(&dev_priv->gpu_error); error->suspend_count = dev_priv->suspend_count; + + memcpy(&error->device_info, + INTEL_INFO(dev_priv), + sizeof(error->device_info)); } /** From 95b2ab56a53dc926eb6949ebd40a4708e32eb4f0 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:46 +0100 Subject: [PATCH 056/141] drm/i915: Remove redundant WARN_ON from __i915_add_request() It's an outright programming error, so explode if it is ever hit. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-7-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_request.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 8a9e9bfeea09..4c5b7e104f2f 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -470,18 +470,12 @@ static void i915_gem_mark_busy(const struct intel_engine_cs *engine) */ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches) { - struct intel_engine_cs *engine; - struct intel_ring *ring; + struct intel_engine_cs *engine = request->engine; + struct intel_ring *ring = request->ring; u32 request_start; u32 reserved_tail; int ret; - if (WARN_ON(!request)) - return; - - engine = request->engine; - ring = request->ring; - /* * To ensure that this call will not fail, space for its emissions * should already have been reserved in the ring buffer. Let the ring From 247177ddd5170d81a0ffb6f4ec2f32c504a4a33d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:47 +0100 Subject: [PATCH 057/141] drm/i915: Always set the vma->pages Previously, we would only set the vma->pages pointer for GGTT entries. 
However, if we always set it, we can use it to prettify some code that may want to access the backing store associated with the VMA (as assigned to the VMA). Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-8-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 8 ++++---- drivers/gpu/drm/i915/i915_gem_gtt.c | 30 ++++++++++++++--------------- drivers/gpu/drm/i915/i915_gem_gtt.h | 3 +-- 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index f48c45080a65..45c45d3a6e31 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2868,12 +2868,12 @@ int i915_vma_unbind(struct i915_vma *vma) if (i915_vma_is_ggtt(vma)) { if (vma->ggtt_view.type == I915_GGTT_VIEW_NORMAL) { obj->map_and_fenceable = false; - } else if (vma->ggtt_view.pages) { - sg_free_table(vma->ggtt_view.pages); - kfree(vma->ggtt_view.pages); + } else if (vma->pages) { + sg_free_table(vma->pages); + kfree(vma->pages); } - vma->ggtt_view.pages = NULL; } + vma->pages = NULL; /* Since the unbound list is global, only move to that list if * no more VMAs exist. */ diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index d876501694c6..9c178b0c40b5 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -170,11 +170,13 @@ static int ppgtt_bind_vma(struct i915_vma *vma, { u32 pte_flags = 0; + vma->pages = vma->obj->pages; + /* Currently applicable only to VLV */ if (vma->obj->gt_ro) pte_flags |= PTE_READ_ONLY; - vma->vm->insert_entries(vma->vm, vma->obj->pages, vma->node.start, + vma->vm->insert_entries(vma->vm, vma->pages, vma->node.start, cache_level, pte_flags); return 0; @@ -2618,8 +2620,7 @@ static int ggtt_bind_vma(struct i915_vma *vma, if (obj->gt_ro) pte_flags |= PTE_READ_ONLY; - vma->vm->insert_entries(vma->vm, vma->ggtt_view.pages, - vma->node.start, + vma->vm->insert_entries(vma->vm, vma->pages, vma->node.start, cache_level, pte_flags); /* @@ -2651,8 +2652,7 @@ static int aliasing_gtt_bind_vma(struct i915_vma *vma, if (flags & I915_VMA_GLOBAL_BIND) { vma->vm->insert_entries(vma->vm, - vma->ggtt_view.pages, - vma->node.start, + vma->pages, vma->node.start, cache_level, pte_flags); } @@ -2660,8 +2660,7 @@ static int aliasing_gtt_bind_vma(struct i915_vma *vma, struct i915_hw_ppgtt *appgtt = to_i915(vma->vm->dev)->mm.aliasing_ppgtt; appgtt->base.insert_entries(&appgtt->base, - vma->ggtt_view.pages, - vma->node.start, + vma->pages, vma->node.start, cache_level, pte_flags); } @@ -3557,28 +3556,27 @@ i915_get_ggtt_vma_pages(struct i915_vma *vma) { int ret = 0; - if (vma->ggtt_view.pages) + if (vma->pages) return 0; if (vma->ggtt_view.type == I915_GGTT_VIEW_NORMAL) - vma->ggtt_view.pages = vma->obj->pages; + vma->pages = vma->obj->pages; else if (vma->ggtt_view.type == I915_GGTT_VIEW_ROTATED) - vma->ggtt_view.pages = + vma->pages = intel_rotate_fb_obj_pages(&vma->ggtt_view.params.rotated, vma->obj); else if (vma->ggtt_view.type == I915_GGTT_VIEW_PARTIAL) - vma->ggtt_view.pages = - intel_partial_pages(&vma->ggtt_view, vma->obj); + vma->pages = intel_partial_pages(&vma->ggtt_view, vma->obj); else WARN_ONCE(1, "GGTT view %u not implemented!\n", vma->ggtt_view.type); - if (!vma->ggtt_view.pages) { + if (!vma->pages) { DRM_ERROR("Failed to get pages for GGTT view type %u!\n", vma->ggtt_view.type); ret = -EINVAL; - } else if (IS_ERR(vma->ggtt_view.pages)) { - ret = 
PTR_ERR(vma->ggtt_view.pages); - vma->ggtt_view.pages = NULL; + } else if (IS_ERR(vma->pages)) { + ret = PTR_ERR(vma->pages); + vma->pages = NULL; DRM_ERROR("Failed to get pages for VMA view type %u (%d)!\n", vma->ggtt_view.type, ret); } diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h index 56e64a5355e8..b580e8a013ce 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.h +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h @@ -155,8 +155,6 @@ struct i915_ggtt_view { } partial; struct intel_rotation_info rotated; } params; - - struct sg_table *pages; }; extern const struct i915_ggtt_view i915_ggtt_view_normal; @@ -176,6 +174,7 @@ struct i915_vma { struct drm_mm_node node; struct drm_i915_gem_object *obj; struct i915_address_space *vm; + struct sg_table *pages; void __iomem *iomap; u64 size; From 81a8aa4a6c8535955c8b795c762489d0edf9a648 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:48 +0100 Subject: [PATCH 058/141] drm/i915: Create a VMA for an object In many places, we wish to store the VMA in preference to the object itself and so being able to create the persistent VMA is useful. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-9-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_gtt.c | 11 +++++++++++ drivers/gpu/drm/i915/i915_gem_gtt.h | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 9c178b0c40b5..1bec50bd651b 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -3386,6 +3386,17 @@ __i915_gem_vma_create(struct drm_i915_gem_object *obj, return vma; } +struct i915_vma * +i915_vma_create(struct drm_i915_gem_object *obj, + struct i915_address_space *vm, + const struct i915_ggtt_view *view) +{ + GEM_BUG_ON(view && !i915_is_ggtt(vm)); + GEM_BUG_ON(view ? i915_gem_obj_to_ggtt_view(obj, view) : i915_gem_obj_to_vma(obj, vm)); + + return __i915_gem_vma_create(obj, vm, view ?: &i915_ggtt_view_normal); +} + struct i915_vma * i915_gem_obj_lookup_or_create_vma(struct drm_i915_gem_object *obj, struct i915_address_space *vm) diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h index b580e8a013ce..f2769e01cc8c 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.h +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h @@ -228,6 +228,11 @@ struct i915_vma { struct drm_i915_gem_exec_object2 *exec_entry; }; +struct i915_vma * +i915_vma_create(struct drm_i915_gem_object *obj, + struct i915_address_space *vm, + const struct i915_ggtt_view *view); + static inline bool i915_vma_is_ggtt(const struct i915_vma *vma) { return vma->flags & I915_VMA_GGTT; From 78ef2d9abad6da3f94489148b4e08359dd2fb4fe Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:49 +0100 Subject: [PATCH 059/141] drm/i915: Add fetch_and_zero() macro A simple little macro to clear a pointer and return the old value. This is useful for writing value = *ptr; if (!value) return; *ptr = 0; ... free(value); in a slightly more concise form: value = fetch_and_zero(ptr); if (!value) return; ... free(value); with the idea that this establishes a pattern that may be extended for atomic use (using xchg or cmpxchg) i.e. atomic_fetch_and_zero() and similar to llist. 
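As a hedged sketch of how fetch_and_zero() composes with a release helper: guc_release_vma() and the guc->ads_vma/log_vma fields are taken from a later patch in this series, while the wrapper function itself is hypothetical:

    static void guc_fini_example(struct intel_guc *guc)
    {
        /* Read each pointer and clear it in a single expression, so the
         * teardown releases every resource exactly once and is safe to
         * call again on an already-cleaned structure (the release helper
         * ignores NULL).
         */
        guc_release_vma(fetch_and_zero(&guc->ads_vma));
        guc_release_vma(fetch_and_zero(&guc->log_vma));
    }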
Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Daniel Vetter Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-10-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 25b1e6c010d5..855833a6306a 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3920,4 +3920,10 @@ bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len); #define ptr_pack_bits(ptr, bits) \ ((typeof(ptr))((unsigned long)(ptr) | (bits))) +#define fetch_and_zero(ptr) ({ \ + typeof(*ptr) __T = *(ptr); \ + *(ptr) = (typeof(*ptr))0; \ + __T; \ +}) + #endif From 624192cfd37138f0e7a4e077ca9b19b080741294 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:50 +0100 Subject: [PATCH 060/141] drm/i915: Add convenience wrappers for vma's object get/put The VMAs are unreferenced; they belong to the object and live until they are closed. However, if we want to use the VMA as a cookie and use it to keep the object alive, we want to hold onto a reference to the object for the lifetime of the VMA cookie. To facilitate this, add a couple of simple wrappers for managing the reference count on the object owning the VMA. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-11-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 12 ++++++++++++ drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 855833a6306a..3285c8e2c87a 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2373,6 +2373,18 @@ i915_gem_object_get_stride(struct drm_i915_gem_object *obj) return obj->tiling_and_stride & STRIDE_MASK; } +static inline struct i915_vma *i915_vma_get(struct i915_vma *vma) +{ + i915_gem_object_get(vma->obj); + return vma; +} + +static inline void i915_vma_put(struct i915_vma *vma) +{ + lockdep_assert_held(&vma->vm->dev->struct_mutex); + i915_gem_object_put(vma->obj); +} + /* * Optimised SGL iterator for GEM objects */ diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index c8d13fea4b25..ced05878b405 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -271,7 +271,7 @@ static void eb_destroy(struct eb_vmas *eb) exec_list); list_del_init(&vma->exec_list); i915_gem_execbuffer_unreserve_vma(vma); - i915_gem_object_put(vma->obj); + i915_vma_put(vma); } kfree(eb); } @@ -900,7 +900,7 @@ i915_gem_execbuffer_relocate_slow(struct drm_device *dev, vma = list_first_entry(&eb->vmas, struct i915_vma, exec_list); list_del_init(&vma->exec_list); i915_gem_execbuffer_unreserve_vma(vma); - i915_gem_object_put(vma->obj); + i915_vma_put(vma); } mutex_unlock(&dev->struct_mutex); From 8b797af189c917a821221ddc258f50a14552177a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:51 +0100 Subject: [PATCH 061/141] drm/i915: Track pinned vma inside guc Since the guc allocates and pins an object into the GGTT for its usage, it is more natural to use that pinned VMA as our resource cookie.
v2: Embrace naming tautology v3: Rewrite comments for guc_allocate_vma() Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-12-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 10 +- drivers/gpu/drm/i915/i915_gem_gtt.h | 6 + drivers/gpu/drm/i915/i915_guc_submission.c | 144 ++++++++++----------- drivers/gpu/drm/i915/intel_guc.h | 9 +- drivers/gpu/drm/i915/intel_guc_loader.c | 7 +- 5 files changed, 90 insertions(+), 86 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 4cb4c7aa8163..311225249563 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2570,15 +2570,15 @@ static int i915_guc_log_dump(struct seq_file *m, void *data) struct drm_info_node *node = m->private; struct drm_device *dev = node->minor->dev; struct drm_i915_private *dev_priv = to_i915(dev); - struct drm_i915_gem_object *log_obj = dev_priv->guc.log_obj; - u32 *log; + struct drm_i915_gem_object *obj; int i = 0, pg; - if (!log_obj) + if (!dev_priv->guc.log_vma) return 0; - for (pg = 0; pg < log_obj->base.size / PAGE_SIZE; pg++) { - log = kmap_atomic(i915_gem_object_get_page(log_obj, pg)); + obj = dev_priv->guc.log_vma->obj; + for (pg = 0; pg < obj->base.size / PAGE_SIZE; pg++) { + u32 *log = kmap_atomic(i915_gem_object_get_page(obj, pg)); for (i = 0; i < PAGE_SIZE / sizeof(u32); i += 4) seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n", diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h index f2769e01cc8c..a2691943a404 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.h +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h @@ -716,4 +716,10 @@ static inline void i915_vma_unpin_iomap(struct i915_vma *vma) i915_vma_unpin(vma); } +static inline struct page *i915_vma_first_page(struct i915_vma *vma) +{ + GEM_BUG_ON(!vma->pages); + return sg_page(vma->pages->sgl); +} + #endif diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index 6831321a9c8c..29de8cec1b58 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -183,7 +183,7 @@ static int guc_update_doorbell_id(struct intel_guc *guc, struct i915_guc_client *client, u16 new_id) { - struct sg_table *sg = guc->ctx_pool_obj->pages; + struct sg_table *sg = guc->ctx_pool_vma->pages; void *doorbell_bitmap = guc->doorbell_bitmap; struct guc_doorbell_info *doorbell; struct guc_context_desc desc; @@ -325,7 +325,6 @@ static void guc_init_proc_desc(struct intel_guc *guc, static void guc_init_ctx_desc(struct intel_guc *guc, struct i915_guc_client *client) { - struct drm_i915_gem_object *client_obj = client->client_obj; struct drm_i915_private *dev_priv = guc_to_i915(guc); struct intel_engine_cs *engine; struct i915_gem_context *ctx = client->owner; @@ -383,8 +382,8 @@ static void guc_init_ctx_desc(struct intel_guc *guc, * The doorbell, process descriptor, and workqueue are all parts * of the client object, which the GuC will reference via the GGTT */ - gfx_addr = i915_gem_obj_ggtt_offset(client_obj); - desc.db_trigger_phy = sg_dma_address(client_obj->pages->sgl) + + gfx_addr = client->vma->node.start; + desc.db_trigger_phy = sg_dma_address(client->vma->pages->sgl) + client->doorbell_offset; desc.db_trigger_cpu = (uintptr_t)client->client_base + client->doorbell_offset; @@ -400,7 +399,7 @@ static void guc_init_ctx_desc(struct intel_guc *guc, desc.desc_private = 
(uintptr_t)client; /* Pool context is pinned already */ - sg = guc->ctx_pool_obj->pages; + sg = guc->ctx_pool_vma->pages; sg_pcopy_from_buffer(sg->sgl, sg->nents, &desc, sizeof(desc), sizeof(desc) * client->ctx_index); } @@ -413,7 +412,7 @@ static void guc_fini_ctx_desc(struct intel_guc *guc, memset(&desc, 0, sizeof(desc)); - sg = guc->ctx_pool_obj->pages; + sg = guc->ctx_pool_vma->pages; sg_pcopy_from_buffer(sg->sgl, sg->nents, &desc, sizeof(desc), sizeof(desc) * client->ctx_index); } @@ -496,7 +495,7 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc, /* WQ starts from the page after doorbell / process_desc */ wq_page = (wq_off + GUC_DB_SIZE) >> PAGE_SHIFT; wq_off &= PAGE_SIZE - 1; - base = kmap_atomic(i915_gem_object_get_page(gc->client_obj, wq_page)); + base = kmap_atomic(i915_gem_object_get_page(gc->vma->obj, wq_page)); wqi = (struct guc_wq_item *)((char *)base + wq_off); /* Now fill in the 4-word work queue item */ @@ -614,55 +613,61 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq) */ /** - * gem_allocate_guc_obj() - Allocate gem object for GuC usage - * @dev_priv: driver private data structure - * @size: size of object + * guc_allocate_vma() - Allocate a GGTT VMA for GuC usage + * @guc: the guc + * @size: size of area to allocate (both virtual space and memory) * - * This is a wrapper to create a gem obj. In order to use it inside GuC, the - * object needs to be pinned lifetime. Also we must pin it to gtt space other - * than [0, GUC_WOPCM_TOP) because this range is reserved inside GuC. + * This is a wrapper to create an object for use with the GuC. In order to + * use it inside the GuC, an object needs to be pinned lifetime, so we allocate + * both some backing storage and a range inside the Global GTT. We must pin + * it in the GGTT somewhere other than than [0, GUC_WOPCM_TOP) because that + * range is reserved inside GuC. * - * Return: A drm_i915_gem_object if successful, otherwise NULL. + * Return: A i915_vma if successful, otherwise an ERR_PTR. */ -static struct drm_i915_gem_object * -gem_allocate_guc_obj(struct drm_i915_private *dev_priv, u32 size) +static struct i915_vma *guc_allocate_vma(struct intel_guc *guc, u32 size) { + struct drm_i915_private *dev_priv = guc_to_i915(guc); struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int ret; obj = i915_gem_object_create(&dev_priv->drm, size); if (IS_ERR(obj)) - return NULL; + return ERR_CAST(obj); - if (i915_gem_object_get_pages(obj)) { - i915_gem_object_put(obj); - return NULL; - } + vma = i915_vma_create(obj, &dev_priv->ggtt.base, NULL); + if (IS_ERR(vma)) + goto err; - if (i915_gem_object_ggtt_pin(obj, NULL, 0, PAGE_SIZE, - PIN_OFFSET_BIAS | GUC_WOPCM_TOP)) { - i915_gem_object_put(obj); - return NULL; + ret = i915_vma_pin(vma, 0, PAGE_SIZE, + PIN_GLOBAL | PIN_OFFSET_BIAS | GUC_WOPCM_TOP); + if (ret) { + vma = ERR_PTR(ret); + goto err; } /* Invalidate GuC TLB to let GuC take the latest updates to GTT. 
*/ I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE); - return obj; + return vma; + +err: + i915_gem_object_put(obj); + return vma; } /** - * gem_release_guc_obj() - Release gem object allocated for GuC usage - * @obj: gem obj to be released + * guc_release_vma() - Release gem object allocated for GuC usage + * @vma: gem obj to be released */ -static void gem_release_guc_obj(struct drm_i915_gem_object *obj) +static void guc_release_vma(struct i915_vma *vma) { - if (!obj) + if (!vma) return; - if (i915_gem_obj_is_pinned(obj)) - i915_gem_object_ggtt_unpin(obj); - - i915_gem_object_put(obj); + i915_vma_unpin(vma); + i915_vma_put(vma); } static void @@ -689,7 +694,7 @@ guc_client_free(struct drm_i915_private *dev_priv, kunmap(kmap_to_page(client->client_base)); } - gem_release_guc_obj(client->client_obj); + guc_release_vma(client->vma); if (client->ctx_index != GUC_INVALID_CTX_ID) { guc_fini_ctx_desc(guc, client); @@ -773,7 +778,7 @@ guc_client_alloc(struct drm_i915_private *dev_priv, { struct i915_guc_client *client; struct intel_guc *guc = &dev_priv->guc; - struct drm_i915_gem_object *obj; + struct i915_vma *vma; uint16_t db_id; client = kzalloc(sizeof(*client), GFP_KERNEL); @@ -794,13 +799,13 @@ guc_client_alloc(struct drm_i915_private *dev_priv, } /* The first page is doorbell/proc_desc. Two followed pages are wq. */ - obj = gem_allocate_guc_obj(dev_priv, GUC_DB_SIZE + GUC_WQ_SIZE); - if (!obj) + vma = guc_allocate_vma(guc, GUC_DB_SIZE + GUC_WQ_SIZE); + if (IS_ERR(vma)) goto err; /* We'll keep just the first (doorbell/proc) page permanently kmap'd. */ - client->client_obj = obj; - client->client_base = kmap(i915_gem_object_get_page(obj, 0)); + client->vma = vma; + client->client_base = kmap(i915_vma_first_page(vma)); client->wq_offset = GUC_DB_SIZE; client->wq_size = GUC_WQ_SIZE; @@ -842,8 +847,7 @@ err: static void guc_create_log(struct intel_guc *guc) { - struct drm_i915_private *dev_priv = guc_to_i915(guc); - struct drm_i915_gem_object *obj; + struct i915_vma *vma; unsigned long offset; uint32_t size, flags; @@ -859,16 +863,16 @@ static void guc_create_log(struct intel_guc *guc) GUC_LOG_ISR_PAGES + 1 + GUC_LOG_CRASH_PAGES + 1) << PAGE_SHIFT; - obj = guc->log_obj; - if (!obj) { - obj = gem_allocate_guc_obj(dev_priv, size); - if (!obj) { + vma = guc->log_vma; + if (!vma) { + vma = guc_allocate_vma(guc, size); + if (IS_ERR(vma)) { /* logging will be off */ i915.guc_log_level = -1; return; } - guc->log_obj = obj; + guc->log_vma = vma; } /* each allocated unit is a page */ @@ -877,7 +881,7 @@ static void guc_create_log(struct intel_guc *guc) (GUC_LOG_ISR_PAGES << GUC_LOG_ISR_SHIFT) | (GUC_LOG_CRASH_PAGES << GUC_LOG_CRASH_SHIFT); - offset = i915_gem_obj_ggtt_offset(obj) >> PAGE_SHIFT; /* in pages */ + offset = vma->node.start >> PAGE_SHIFT; /* in pages */ guc->log_flags = (offset << GUC_LOG_BUF_ADDR_SHIFT) | flags; } @@ -906,7 +910,7 @@ static void init_guc_policies(struct guc_policies *policies) static void guc_create_ads(struct intel_guc *guc) { struct drm_i915_private *dev_priv = guc_to_i915(guc); - struct drm_i915_gem_object *obj; + struct i915_vma *vma; struct guc_ads *ads; struct guc_policies *policies; struct guc_mmio_reg_state *reg_state; @@ -919,16 +923,16 @@ static void guc_create_ads(struct intel_guc *guc) sizeof(struct guc_mmio_reg_state) + GUC_S3_SAVE_SPACE_PAGES * PAGE_SIZE; - obj = guc->ads_obj; - if (!obj) { - obj = gem_allocate_guc_obj(dev_priv, PAGE_ALIGN(size)); - if (!obj) + vma = guc->ads_vma; + if (!vma) { + vma = guc_allocate_vma(guc, PAGE_ALIGN(size)); + if (IS_ERR(vma)) 
return; - guc->ads_obj = obj; + guc->ads_vma = vma; } - page = i915_gem_object_get_page(obj, 0); + page = i915_vma_first_page(vma); ads = kmap(page); /* @@ -948,8 +952,7 @@ static void guc_create_ads(struct intel_guc *guc) policies = (void *)ads + sizeof(struct guc_ads); init_guc_policies(policies); - ads->scheduler_policies = i915_gem_obj_ggtt_offset(obj) + - sizeof(struct guc_ads); + ads->scheduler_policies = vma->node.start + sizeof(struct guc_ads); /* MMIO reg state */ reg_state = (void *)policies + sizeof(struct guc_policies); @@ -977,10 +980,9 @@ static void guc_create_ads(struct intel_guc *guc) */ int i915_guc_submission_init(struct drm_i915_private *dev_priv) { - const size_t ctxsize = sizeof(struct guc_context_desc); - const size_t poolsize = GUC_MAX_GPU_CONTEXTS * ctxsize; - const size_t gemsize = round_up(poolsize, PAGE_SIZE); struct intel_guc *guc = &dev_priv->guc; + struct i915_vma *vma; + u32 size; /* Wipe bitmap & delete client in case of reinitialisation */ bitmap_clear(guc->doorbell_bitmap, 0, GUC_MAX_DOORBELLS); @@ -989,13 +991,15 @@ int i915_guc_submission_init(struct drm_i915_private *dev_priv) if (!i915.enable_guc_submission) return 0; /* not enabled */ - if (guc->ctx_pool_obj) + if (guc->ctx_pool_vma) return 0; /* already allocated */ - guc->ctx_pool_obj = gem_allocate_guc_obj(dev_priv, gemsize); - if (!guc->ctx_pool_obj) - return -ENOMEM; + size = PAGE_ALIGN(GUC_MAX_GPU_CONTEXTS*sizeof(struct guc_context_desc)); + vma = guc_allocate_vma(guc, size); + if (IS_ERR(vma)) + return PTR_ERR(vma); + guc->ctx_pool_vma = vma; ida_init(&guc->ctx_ids); guc_create_log(guc); guc_create_ads(guc); @@ -1048,16 +1052,12 @@ void i915_guc_submission_fini(struct drm_i915_private *dev_priv) { struct intel_guc *guc = &dev_priv->guc; - gem_release_guc_obj(dev_priv->guc.ads_obj); - guc->ads_obj = NULL; + guc_release_vma(fetch_and_zero(&guc->ads_vma)); + guc_release_vma(fetch_and_zero(&guc->log_vma)); - gem_release_guc_obj(dev_priv->guc.log_obj); - guc->log_obj = NULL; - - if (guc->ctx_pool_obj) + if (guc->ctx_pool_vma) ida_destroy(&guc->ctx_ids); - gem_release_guc_obj(guc->ctx_pool_obj); - guc->ctx_pool_obj = NULL; + guc_release_vma(fetch_and_zero(&guc->ctx_pool_vma)); } /** diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h index 26f3d9c08120..c97326269588 100644 --- a/drivers/gpu/drm/i915/intel_guc.h +++ b/drivers/gpu/drm/i915/intel_guc.h @@ -63,7 +63,7 @@ struct drm_i915_gem_request; * retcode: errno from last guc_submit() */ struct i915_guc_client { - struct drm_i915_gem_object *client_obj; + struct i915_vma *vma; void *client_base; /* first page (only) of above */ struct i915_gem_context *owner; struct intel_guc *guc; @@ -124,11 +124,10 @@ struct intel_guc_fw { struct intel_guc { struct intel_guc_fw guc_fw; uint32_t log_flags; - struct drm_i915_gem_object *log_obj; + struct i915_vma *log_vma; - struct drm_i915_gem_object *ads_obj; - - struct drm_i915_gem_object *ctx_pool_obj; + struct i915_vma *ads_vma; + struct i915_vma *ctx_pool_vma; struct ida ctx_ids; struct i915_guc_client *execbuf_client; diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c index bfcf6b5d29ad..1e2d9f2ad4f6 100644 --- a/drivers/gpu/drm/i915/intel_guc_loader.c +++ b/drivers/gpu/drm/i915/intel_guc_loader.c @@ -193,16 +193,15 @@ static void set_guc_init_params(struct drm_i915_private *dev_priv) i915.guc_log_level << GUC_LOG_VERBOSITY_SHIFT; } - if (guc->ads_obj) { - u32 ads = (u32)i915_gem_obj_ggtt_offset(guc->ads_obj) - >> PAGE_SHIFT; + if 
(guc->ads_vma) { + u32 ads = (u32)guc->ads_vma->node.start >> PAGE_SHIFT; params[GUC_CTL_DEBUG] |= ads << GUC_ADS_ADDR_SHIFT; params[GUC_CTL_DEBUG] |= GUC_ADS_ENABLED; } /* If GuC submission is enabled, set up additional parameters here */ if (i915.enable_guc_submission) { - u32 pgs = i915_gem_obj_ggtt_offset(dev_priv->guc.ctx_pool_obj); + u32 pgs = dev_priv->guc.ctx_pool_vma->node.start; u32 ctx_in_16 = GUC_MAX_GPU_CONTEXTS / 16; pgs >>= PAGE_SHIFT; From a83718b6819d99d01a34ed1ef1a01598eb45c61b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:52 +0100 Subject: [PATCH 062/141] drm/i915: Convert fence computations to use vma directly Lookup the GGTT vma once for the object assigned to the fence, and then derive everything from that vma. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-13-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_fence.c | 55 +++++++++++++-------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_fence.c b/drivers/gpu/drm/i915/i915_gem_fence.c index 9e8173fe2a09..d99fc5734cf1 100644 --- a/drivers/gpu/drm/i915/i915_gem_fence.c +++ b/drivers/gpu/drm/i915/i915_gem_fence.c @@ -85,22 +85,19 @@ static void i965_write_fence_reg(struct drm_device *dev, int reg, POSTING_READ(fence_reg_lo); if (obj) { - u32 size = i915_gem_obj_ggtt_size(obj); + struct i915_vma *vma = i915_gem_obj_to_ggtt(obj); unsigned int tiling = i915_gem_object_get_tiling(obj); unsigned int stride = i915_gem_object_get_stride(obj); - uint64_t val; + u32 size = vma->node.size; + u32 row_size = stride * (tiling == I915_TILING_Y ? 32 : 8); + u64 val; /* Adjust fence size to match tiled area */ - if (tiling != I915_TILING_NONE) { - uint32_t row_size = stride * - (tiling == I915_TILING_Y ? 32 : 8); - size = (size / row_size) * row_size; - } + size = rounddown(size, row_size); - val = (uint64_t)((i915_gem_obj_ggtt_offset(obj) + size - 4096) & - 0xfffff000) << 32; - val |= i915_gem_obj_ggtt_offset(obj) & 0xfffff000; - val |= (uint64_t)((stride / 128) - 1) << fence_pitch_shift; + val = ((vma->node.start + size - 4096) & 0xfffff000) << 32; + val |= vma->node.start & 0xfffff000; + val |= (u64)((stride / 128) - 1) << fence_pitch_shift; if (tiling == I915_TILING_Y) val |= 1 << I965_FENCE_TILING_Y_SHIFT; val |= I965_FENCE_REG_VALID; @@ -123,17 +120,17 @@ static void i915_write_fence_reg(struct drm_device *dev, int reg, u32 val; if (obj) { - u32 size = i915_gem_obj_ggtt_size(obj); + struct i915_vma *vma = i915_gem_obj_to_ggtt(obj); unsigned int tiling = i915_gem_object_get_tiling(obj); unsigned int stride = i915_gem_object_get_stride(obj); int pitch_val; int tile_width; - WARN((i915_gem_obj_ggtt_offset(obj) & ~I915_FENCE_START_MASK) || - (size & -size) != size || - (i915_gem_obj_ggtt_offset(obj) & (size - 1)), - "object 0x%08llx [fenceable? %d] not 1M or pot-size (0x%08x) aligned\n", - i915_gem_obj_ggtt_offset(obj), obj->map_and_fenceable, size); + WARN((vma->node.start & ~I915_FENCE_START_MASK) || + !is_power_of_2(vma->node.size) || + (vma->node.start & (vma->node.size - 1)), + "object 0x%08llx [fenceable? 
%d] not 1M or pot-size (0x%08llx) aligned\n", + vma->node.start, obj->map_and_fenceable, vma->node.size); if (tiling == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev)) tile_width = 128; @@ -144,10 +141,10 @@ static void i915_write_fence_reg(struct drm_device *dev, int reg, pitch_val = stride / tile_width; pitch_val = ffs(pitch_val) - 1; - val = i915_gem_obj_ggtt_offset(obj); + val = vma->node.start; if (tiling == I915_TILING_Y) val |= 1 << I830_FENCE_TILING_Y_SHIFT; - val |= I915_FENCE_SIZE_BITS(size); + val |= I915_FENCE_SIZE_BITS(vma->node.size); val |= pitch_val << I830_FENCE_PITCH_SHIFT; val |= I830_FENCE_REG_VALID; } else @@ -161,27 +158,27 @@ static void i830_write_fence_reg(struct drm_device *dev, int reg, struct drm_i915_gem_object *obj) { struct drm_i915_private *dev_priv = to_i915(dev); - uint32_t val; + u32 val; if (obj) { - u32 size = i915_gem_obj_ggtt_size(obj); + struct i915_vma *vma = i915_gem_obj_to_ggtt(obj); unsigned int tiling = i915_gem_object_get_tiling(obj); unsigned int stride = i915_gem_object_get_stride(obj); - uint32_t pitch_val; + u32 pitch_val; - WARN((i915_gem_obj_ggtt_offset(obj) & ~I830_FENCE_START_MASK) || - (size & -size) != size || - (i915_gem_obj_ggtt_offset(obj) & (size - 1)), - "object 0x%08llx not 512K or pot-size 0x%08x aligned\n", - i915_gem_obj_ggtt_offset(obj), size); + WARN((vma->node.start & ~I830_FENCE_START_MASK) || + !is_power_of_2(vma->node.size) || + (vma->node.start & (vma->node.size - 1)), + "object 0x%08llx not 512K or pot-size 0x%08llx aligned\n", + vma->node.start, vma->node.size); pitch_val = stride / 128; pitch_val = ffs(pitch_val) - 1; - val = i915_gem_obj_ggtt_offset(obj); + val = vma->node.start; if (tiling == I915_TILING_Y) val |= 1 << I830_FENCE_TILING_Y_SHIFT; - val |= I830_FENCE_SIZE_BITS(size); + val |= I830_FENCE_SIZE_BITS(vma->node.size); val |= pitch_val << I830_FENCE_PITCH_SHIFT; val |= I830_FENCE_REG_VALID; } else From f23eda8cb444b4c45a9bd61768f8bcce9adee8a0 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:53 +0100 Subject: [PATCH 063/141] drm/i915: Use VMA directly for checking tiling parameters v2: Rename functions to suit their more active role Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-14-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_tiling.c | 51 +++++++++++++++----------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c index f4b984de83b5..b2b0cb7199ac 100644 --- a/drivers/gpu/drm/i915/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/i915_gem_tiling.c @@ -116,35 +116,46 @@ i915_tiling_ok(struct drm_device *dev, int stride, int size, int tiling_mode) return true; } -/* Is the current GTT allocation valid for the change in tiling? */ -static bool -i915_gem_object_fence_ok(struct drm_i915_gem_object *obj, int tiling_mode) +/* Make the current GTT allocation valid for the change in tiling. 
*/ +static int +i915_gem_object_fence_prepare(struct drm_i915_gem_object *obj, int tiling_mode) { struct drm_i915_private *dev_priv = to_i915(obj->base.dev); + struct i915_vma *vma; u32 size; if (tiling_mode == I915_TILING_NONE) - return true; + return 0; if (INTEL_GEN(dev_priv) >= 4) - return true; + return 0; + + vma = i915_gem_obj_to_ggtt(obj); + if (!vma) + return 0; + + if (!obj->map_and_fenceable) + return 0; if (IS_GEN3(dev_priv)) { - if (i915_gem_obj_ggtt_offset(obj) & ~I915_FENCE_START_MASK) - return false; + if (vma->node.start & ~I915_FENCE_START_MASK) + goto bad; } else { - if (i915_gem_obj_ggtt_offset(obj) & ~I830_FENCE_START_MASK) - return false; + if (vma->node.start & ~I830_FENCE_START_MASK) + goto bad; } size = i915_gem_get_ggtt_size(dev_priv, obj->base.size, tiling_mode); - if (i915_gem_obj_ggtt_size(obj) != size) - return false; + if (vma->node.size < size) + goto bad; - if (i915_gem_obj_ggtt_offset(obj) & (size - 1)) - return false; + if (vma->node.start & (size - 1)) + goto bad; - return true; + return 0; + +bad: + return i915_vma_unbind(vma); } /** @@ -168,7 +179,7 @@ i915_gem_set_tiling(struct drm_device *dev, void *data, struct drm_i915_gem_set_tiling *args = data; struct drm_i915_private *dev_priv = to_i915(dev); struct drm_i915_gem_object *obj; - int ret = 0; + int err = 0; /* Make sure we don't cross-contaminate obj->tiling_and_stride */ BUILD_BUG_ON(I915_TILING_LAST & STRIDE_MASK); @@ -187,7 +198,7 @@ i915_gem_set_tiling(struct drm_device *dev, void *data, mutex_lock(&dev->struct_mutex); if (obj->pin_display || obj->framebuffer_references) { - ret = -EBUSY; + err = -EBUSY; goto err; } @@ -234,11 +245,9 @@ i915_gem_set_tiling(struct drm_device *dev, void *data, * has to also include the unfenced register the GPU uses * whilst executing a fenced command for an untiled object. */ - if (obj->map_and_fenceable && - !i915_gem_object_fence_ok(obj, args->tiling_mode)) - ret = i915_vma_unbind(i915_gem_obj_to_ggtt(obj)); - if (ret == 0) { + err = i915_gem_object_fence_prepare(obj, args->tiling_mode); + if (!err) { if (obj->pages && obj->madv == I915_MADV_WILLNEED && dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) { @@ -281,7 +290,7 @@ err: intel_runtime_pm_put(dev_priv); - return ret; + return err; } /** From bf3783e52a8929c738c5c8c5fa1df7e267b5271d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:54 +0100 Subject: [PATCH 064/141] drm/i915: Use VMA as the primary object for context state When working with contexts, we most frequently want the GGTT VMA for the context state, first and foremost. Since the object is available via the VMA, we need only then store the VMA. 
v2: Formatting tweaks to debugfs output, restored some comments removed in the next patch Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-15-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 34 ++++++------ drivers/gpu/drm/i915/i915_drv.h | 3 +- drivers/gpu/drm/i915/i915_gem_context.c | 51 +++++++++-------- drivers/gpu/drm/i915/i915_gpu_error.c | 7 ++- drivers/gpu/drm/i915/i915_guc_submission.c | 6 +- drivers/gpu/drm/i915/intel_lrc.c | 64 +++++++++++----------- drivers/gpu/drm/i915/intel_ringbuffer.c | 6 +- 7 files changed, 86 insertions(+), 85 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 311225249563..eb8753fb107a 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -354,7 +354,7 @@ static int per_file_ctx_stats(int id, void *ptr, void *data) for (n = 0; n < ARRAY_SIZE(ctx->engine); n++) { if (ctx->engine[n].state) - per_file_stats(0, ctx->engine[n].state, data); + per_file_stats(0, ctx->engine[n].state->obj, data); if (ctx->engine[n].ring) per_file_stats(0, ctx->engine[n].ring->obj, data); } @@ -1977,7 +1977,7 @@ static int i915_context_status(struct seq_file *m, void *unused) seq_printf(m, "%s: ", engine->name); seq_putc(m, ce->initialised ? 'I' : 'i'); if (ce->state) - describe_obj(m, ce->state); + describe_obj(m, ce->state->obj); if (ce->ring) describe_ctx_ring(m, ce->ring); seq_putc(m, '\n'); @@ -1995,36 +1995,34 @@ static void i915_dump_lrc_obj(struct seq_file *m, struct i915_gem_context *ctx, struct intel_engine_cs *engine) { - struct drm_i915_gem_object *ctx_obj = ctx->engine[engine->id].state; + struct i915_vma *vma = ctx->engine[engine->id].state; struct page *page; - uint32_t *reg_state; int j; - unsigned long ggtt_offset = 0; seq_printf(m, "CONTEXT: %s %u\n", engine->name, ctx->hw_id); - if (ctx_obj == NULL) { - seq_puts(m, "\tNot allocated\n"); + if (!vma) { + seq_puts(m, "\tFake context\n"); return; } - if (!i915_gem_obj_ggtt_bound(ctx_obj)) - seq_puts(m, "\tNot bound in GGTT\n"); - else - ggtt_offset = i915_gem_obj_ggtt_offset(ctx_obj); + if (vma->flags & I915_VMA_GLOBAL_BIND) + seq_printf(m, "\tBound in GGTT at 0x%08x\n", + lower_32_bits(vma->node.start)); - if (i915_gem_object_get_pages(ctx_obj)) { - seq_puts(m, "\tFailed to get pages for context object\n"); + if (i915_gem_object_get_pages(vma->obj)) { + seq_puts(m, "\tFailed to get pages for context object\n\n"); return; } - page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN); - if (!WARN_ON(page == NULL)) { - reg_state = kmap_atomic(page); + page = i915_gem_object_get_page(vma->obj, LRC_STATE_PN); + if (page) { + u32 *reg_state = kmap_atomic(page); for (j = 0; j < 0x600 / sizeof(u32) / 4; j += 4) { - seq_printf(m, "\t[0x%08lx] 0x%08x 0x%08x 0x%08x 0x%08x\n", - ggtt_offset + 4096 + (j * 4), + seq_printf(m, + "\t[0x%04x] 0x%08x 0x%08x 0x%08x 0x%08x\n", + j * 4, reg_state[j], reg_state[j + 1], reg_state[j + 2], reg_state[j + 3]); } diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 3285c8e2c87a..259425d99e17 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -893,9 +893,8 @@ struct i915_gem_context { u32 ggtt_alignment; struct intel_context { - struct drm_i915_gem_object *state; + struct i915_vma *state; struct intel_ring *ring; - struct i915_vma *lrc_vma; uint32_t *lrc_reg_state; u64 lrc_desc; int pin_count; diff --git 
a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 547caf26a6b9..3857ce097c84 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -155,7 +155,7 @@ void i915_gem_context_free(struct kref *ctx_ref) if (ce->ring) intel_ring_free(ce->ring); - i915_gem_object_put(ce->state); + i915_vma_put(ce->state); } list_del(&ctx->link); @@ -281,13 +281,24 @@ __create_hw_context(struct drm_device *dev, ctx->ggtt_alignment = get_context_alignment(dev_priv); if (dev_priv->hw_context_size) { - struct drm_i915_gem_object *obj = - i915_gem_alloc_context_obj(dev, dev_priv->hw_context_size); + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + + obj = i915_gem_alloc_context_obj(dev, + dev_priv->hw_context_size); if (IS_ERR(obj)) { ret = PTR_ERR(obj); goto err_out; } - ctx->engine[RCS].state = obj; + + vma = i915_vma_create(obj, &dev_priv->ggtt.base, NULL); + if (IS_ERR(vma)) { + i915_gem_object_put(obj); + ret = PTR_ERR(vma); + goto err_out; + } + + ctx->engine[RCS].state = vma; } /* Default context will never have a file_priv */ @@ -399,7 +410,7 @@ static void i915_gem_context_unpin(struct i915_gem_context *ctx, struct intel_context *ce = &ctx->engine[engine->id]; if (ce->state) - i915_gem_object_ggtt_unpin(ce->state); + i915_vma_unpin(ce->state); i915_gem_context_put(ctx); } @@ -620,9 +631,7 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags) intel_ring_emit(ring, MI_NOOP); intel_ring_emit(ring, MI_SET_CONTEXT); - intel_ring_emit(ring, - i915_gem_obj_ggtt_offset(req->ctx->engine[RCS].state) | - flags); + intel_ring_emit(ring, req->ctx->engine[RCS].state->node.start | flags); /* * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP * WaMiSetContext_Hang:snb,ivb,vlv @@ -755,6 +764,7 @@ static int do_rcs_switch(struct drm_i915_gem_request *req) struct i915_gem_context *to = req->ctx; struct intel_engine_cs *engine = req->engine; struct i915_hw_ppgtt *ppgtt = to->ppgtt ?: req->i915->mm.aliasing_ppgtt; + struct i915_vma *vma = to->engine[RCS].state; struct i915_gem_context *from; u32 hw_flags; int ret, i; @@ -763,8 +773,7 @@ static int do_rcs_switch(struct drm_i915_gem_request *req) return 0; /* Trying to pin first makes error handling easier. */ - ret = i915_gem_object_ggtt_pin(to->engine[RCS].state, NULL, 0, - to->ggtt_alignment, 0); + ret = i915_vma_pin(vma, 0, to->ggtt_alignment, PIN_GLOBAL); if (ret) return ret; @@ -785,9 +794,9 @@ static int do_rcs_switch(struct drm_i915_gem_request *req) * * XXX: We need a real interface to do this instead of trickery. */ - ret = i915_gem_object_set_to_gtt_domain(to->engine[RCS].state, false); + ret = i915_gem_object_set_to_gtt_domain(vma->obj, false); if (ret) - goto unpin_out; + goto err; if (needs_pd_load_pre(ppgtt, engine, to)) { /* Older GENs and non render rings still want the load first, @@ -797,7 +806,7 @@ static int do_rcs_switch(struct drm_i915_gem_request *req) trace_switch_mm(engine, to); ret = ppgtt->switch_mm(ppgtt, req); if (ret) - goto unpin_out; + goto err; } if (!to->engine[RCS].initialised || i915_gem_context_is_default(to)) @@ -814,7 +823,7 @@ static int do_rcs_switch(struct drm_i915_gem_request *req) if (to != from || (hw_flags & MI_FORCE_RESTORE)) { ret = mi_set_context(req, hw_flags); if (ret) - goto unpin_out; + goto err; } /* The backing object for the context is done after switching to the @@ -824,8 +833,6 @@ static int do_rcs_switch(struct drm_i915_gem_request *req) * MI_SET_CONTEXT instead of when the next seqno has completed. 
*/ if (from != NULL) { - struct drm_i915_gem_object *obj = from->engine[RCS].state; - /* As long as MI_SET_CONTEXT is serializing, ie. it flushes the * whole damn pipeline, we don't need to explicitly mark the * object dirty. The only exception is that the context must be @@ -833,11 +840,9 @@ static int do_rcs_switch(struct drm_i915_gem_request *req) * able to defer doing this until we know the object would be * swapped, but there is no way to do that yet. */ - obj->base.read_domains = I915_GEM_DOMAIN_INSTRUCTION; - i915_vma_move_to_active(i915_gem_obj_to_ggtt(obj), req, 0); - - /* obj is kept alive until the next request by its active ref */ - i915_gem_object_ggtt_unpin(obj); + i915_vma_move_to_active(from->engine[RCS].state, req, 0); + /* state is kept alive until the next request */ + i915_vma_unpin(from->engine[RCS].state); i915_gem_context_put(from); } engine->last_context = i915_gem_context_get(to); @@ -882,8 +887,8 @@ static int do_rcs_switch(struct drm_i915_gem_request *req) return 0; -unpin_out: - i915_gem_object_ggtt_unpin(to->engine[RCS].state); +err: + i915_vma_unpin(vma); return ret; } diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 9ba71dd4a312..61708faebf79 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1103,9 +1103,10 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, i915_error_ggtt_object_create(dev_priv, engine->scratch.obj); - ee->ctx = - i915_error_ggtt_object_create(dev_priv, - request->ctx->engine[i].state); + if (request->ctx->engine[i].state) { + ee->ctx = i915_error_ggtt_object_create(dev_priv, + request->ctx->engine[i].state->obj); + } if (request->pid) { struct task_struct *task; diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index 29de8cec1b58..4f0f173f9754 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -358,7 +358,7 @@ static void guc_init_ctx_desc(struct intel_guc *guc, lrc->context_desc = lower_32_bits(ce->lrc_desc); /* The state page is after PPHWSP */ - gfx_addr = i915_gem_obj_ggtt_offset(ce->state); + gfx_addr = ce->state->node.start; lrc->ring_lcra = gfx_addr + LRC_STATE_PN * PAGE_SIZE; lrc->context_id = (client->ctx_index << GUC_ELC_CTXID_OFFSET) | (guc_engine_id << GUC_ELC_ENGINE_OFFSET); @@ -1080,7 +1080,7 @@ int intel_guc_suspend(struct drm_device *dev) /* any value greater than GUC_POWER_D0 */ data[1] = GUC_POWER_D1; /* first page is shared data with GuC */ - data[2] = i915_gem_obj_ggtt_offset(ctx->engine[RCS].state); + data[2] = ctx->engine[RCS].state->node.start; return host2guc_action(guc, data, ARRAY_SIZE(data)); } @@ -1105,7 +1105,7 @@ int intel_guc_resume(struct drm_device *dev) data[0] = HOST2GUC_ACTION_EXIT_S_STATE; data[1] = GUC_POWER_D0; /* first page is shared data with GuC */ - data[2] = i915_gem_obj_ggtt_offset(ctx->engine[RCS].state); + data[2] = ctx->engine[RCS].state->node.start; return host2guc_action(guc, data, ARRAY_SIZE(data)); } diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index c24ac39d51f6..5538e5c541fe 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -315,7 +315,7 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx, desc = ctx->desc_template; /* bits 3-4 */ desc |= engine->ctx_desc_template; /* bits 0-11 */ - desc |= ce->lrc_vma->node.start + LRC_PPHWSP_PN * PAGE_SIZE; + desc |= ce->state->node.start + 
LRC_PPHWSP_PN * PAGE_SIZE; /* bits 12-31 */ desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT; /* bits 32-52 */ @@ -763,7 +763,6 @@ void intel_execlists_cancel_requests(struct intel_engine_cs *engine) static int intel_lr_context_pin(struct i915_gem_context *ctx, struct intel_engine_cs *engine) { - struct drm_i915_private *dev_priv = ctx->i915; struct intel_context *ce = &ctx->engine[engine->id]; void *vaddr; u32 *lrc_reg_state; @@ -774,16 +773,15 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx, if (ce->pin_count++) return 0; - ret = i915_gem_object_ggtt_pin(ce->state, NULL, - 0, GEN8_LR_CONTEXT_ALIGN, - PIN_OFFSET_BIAS | GUC_WOPCM_TOP); + ret = i915_vma_pin(ce->state, 0, GEN8_LR_CONTEXT_ALIGN, + PIN_OFFSET_BIAS | GUC_WOPCM_TOP | PIN_GLOBAL); if (ret) goto err; - vaddr = i915_gem_object_pin_map(ce->state, I915_MAP_WB); + vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB); if (IS_ERR(vaddr)) { ret = PTR_ERR(vaddr); - goto unpin_ctx_obj; + goto unpin_vma; } lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; @@ -792,24 +790,25 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx, if (ret) goto unpin_map; - ce->lrc_vma = i915_gem_obj_to_ggtt(ce->state); intel_lr_context_descriptor_update(ctx, engine); lrc_reg_state[CTX_RING_BUFFER_START+1] = ce->ring->vma->node.start; ce->lrc_reg_state = lrc_reg_state; - ce->state->dirty = true; + ce->state->obj->dirty = true; /* Invalidate GuC TLB. */ - if (i915.enable_guc_submission) + if (i915.enable_guc_submission) { + struct drm_i915_private *dev_priv = ctx->i915; I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE); + } i915_gem_context_get(ctx); return 0; unpin_map: - i915_gem_object_unpin_map(ce->state); -unpin_ctx_obj: - i915_gem_object_ggtt_unpin(ce->state); + i915_gem_object_unpin_map(ce->state->obj); +unpin_vma: + __i915_vma_unpin(ce->state); err: ce->pin_count = 0; return ret; @@ -828,12 +827,8 @@ void intel_lr_context_unpin(struct i915_gem_context *ctx, intel_ring_unpin(ce->ring); - i915_gem_object_unpin_map(ce->state); - i915_gem_object_ggtt_unpin(ce->state); - - ce->lrc_vma = NULL; - ce->lrc_desc = 0; - ce->lrc_reg_state = NULL; + i915_gem_object_unpin_map(ce->state->obj); + i915_vma_unpin(ce->state); i915_gem_context_put(ctx); } @@ -1747,19 +1742,18 @@ logical_ring_default_irqs(struct intel_engine_cs *engine) } static int -lrc_setup_hws(struct intel_engine_cs *engine, - struct drm_i915_gem_object *dctx_obj) +lrc_setup_hws(struct intel_engine_cs *engine, struct i915_vma *vma) { void *hws; /* The HWSP is part of the default context object in LRC mode. 
*/ - engine->status_page.gfx_addr = i915_gem_obj_ggtt_offset(dctx_obj) + - LRC_PPHWSP_PN * PAGE_SIZE; - hws = i915_gem_object_pin_map(dctx_obj, I915_MAP_WB); + engine->status_page.gfx_addr = + vma->node.start + LRC_PPHWSP_PN * PAGE_SIZE; + hws = i915_gem_object_pin_map(vma->obj, I915_MAP_WB); if (IS_ERR(hws)) return PTR_ERR(hws); engine->status_page.page_addr = hws + LRC_PPHWSP_PN * PAGE_SIZE; - engine->status_page.obj = dctx_obj; + engine->status_page.obj = vma->obj; return 0; } @@ -2131,6 +2125,7 @@ static int execlists_context_deferred_alloc(struct i915_gem_context *ctx, { struct drm_i915_gem_object *ctx_obj; struct intel_context *ce = &ctx->engine[engine->id]; + struct i915_vma *vma; uint32_t context_size; struct intel_ring *ring; int ret; @@ -2148,6 +2143,12 @@ static int execlists_context_deferred_alloc(struct i915_gem_context *ctx, return PTR_ERR(ctx_obj); } + vma = i915_vma_create(ctx_obj, &ctx->i915->ggtt.base, NULL); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto error_deref_obj; + } + ring = intel_engine_create_ring(engine, ctx->ring_size); if (IS_ERR(ring)) { ret = PTR_ERR(ring); @@ -2161,7 +2162,7 @@ static int execlists_context_deferred_alloc(struct i915_gem_context *ctx, } ce->ring = ring; - ce->state = ctx_obj; + ce->state = vma; ce->initialised = engine->init_context == NULL; return 0; @@ -2170,8 +2171,6 @@ error_ring_free: intel_ring_free(ring); error_deref_obj: i915_gem_object_put(ctx_obj); - ce->ring = NULL; - ce->state = NULL; return ret; } @@ -2182,24 +2181,23 @@ void intel_lr_context_reset(struct drm_i915_private *dev_priv, for_each_engine(engine, dev_priv) { struct intel_context *ce = &ctx->engine[engine->id]; - struct drm_i915_gem_object *ctx_obj = ce->state; void *vaddr; uint32_t *reg_state; - if (!ctx_obj) + if (!ce->state) continue; - vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); + vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB); if (WARN_ON(IS_ERR(vaddr))) continue; reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; - ctx_obj->dirty = true; reg_state[CTX_RING_HEAD+1] = 0; reg_state[CTX_RING_TAIL+1] = 0; - i915_gem_object_unpin_map(ctx_obj); + ce->state->obj->dirty = true; + i915_gem_object_unpin_map(ce->state->obj); ce->ring->head = 0; ce->ring->tail = 0; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 75a9f635c3dc..2318a27341c8 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -2092,8 +2092,8 @@ static int intel_ring_context_pin(struct i915_gem_context *ctx, return 0; if (ce->state) { - ret = i915_gem_object_ggtt_pin(ce->state, NULL, 0, - ctx->ggtt_alignment, PIN_HIGH); + ret = i915_vma_pin(ce->state, 0, ctx->ggtt_alignment, + PIN_GLOBAL | PIN_HIGH); if (ret) goto error; } @@ -2127,7 +2127,7 @@ static void intel_ring_context_unpin(struct i915_gem_context *ctx, return; if (ce->state) - i915_gem_object_ggtt_unpin(ce->state); + i915_vma_unpin(ce->state); i915_gem_context_put(ctx); } From 7abc98fadfdd060e54fdb7172e2753a2f9808366 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:55 +0100 Subject: [PATCH 065/141] drm/i915: Only change the context object's domain when binding We know that the only access to the context object is via the GPU, and the only time when it can be out of the GPU domain is when it is swapped out and unbound. Therefore we only need to clflush the object when binding, thus avoiding any potential stall on touching the domain on an active context. 
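Condensed to a minimal sketch (simplified from the do_rcs_switch() hunk in the diff below, with error paths trimmed), the flush is now gated on an am-I-bound check, so pinning an already-bound, active context never touches the domain:

    /* Clear the page out of the CPU caches only when the context is
     * about to be (re)bound into the GGTT; by the argument above, a
     * context that is still bound is already in the GPU domain.
     */
    if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
        ret = i915_gem_object_set_to_gtt_domain(vma->obj, false);
        if (ret)
            return ret;
    }

    ret = i915_vma_pin(vma, 0, to->ggtt_alignment, PIN_GLOBAL);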
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-16-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_context.c | 19 +++++++------------ drivers/gpu/drm/i915/intel_ringbuffer.c | 4 ++++ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 3857ce097c84..824dfe14bcd0 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -772,6 +772,13 @@ static int do_rcs_switch(struct drm_i915_gem_request *req) if (skip_rcs_switch(ppgtt, engine, to)) return 0; + /* Clear this page out of any CPU caches for coherent swap-in/out. */ + if (!(vma->flags & I915_VMA_GLOBAL_BIND)) { + ret = i915_gem_object_set_to_gtt_domain(vma->obj, false); + if (ret) + return ret; + } + /* Trying to pin first makes error handling easier. */ ret = i915_vma_pin(vma, 0, to->ggtt_alignment, PIN_GLOBAL); if (ret) @@ -786,18 +793,6 @@ static int do_rcs_switch(struct drm_i915_gem_request *req) */ from = engine->last_context; - /* - * Clear this page out of any CPU caches for coherent swap-in/out. Note - * that thanks to write = false in this call and us not setting any gpu - * write domains when putting a context object onto the active list - * (when switching away from it), this won't block. - * - * XXX: We need a real interface to do this instead of trickery. - */ - ret = i915_gem_object_set_to_gtt_domain(vma->obj, false); - if (ret) - goto err; - if (needs_pd_load_pre(ppgtt, engine, to)) { /* Older GENs and non render rings still want the load first, * "PP_DCLV followed by PP_DIR_BASE register through Load diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 2318a27341c8..81dc69d1ff05 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -2092,6 +2092,10 @@ static int intel_ring_context_pin(struct i915_gem_context *ctx, return 0; if (ce->state) { + ret = i915_gem_object_set_to_gtt_domain(ce->state->obj, false); + if (ret) + goto error; + ret = i915_vma_pin(ce->state, 0, ctx->ggtt_alignment, PIN_GLOBAL | PIN_HIGH); if (ret) From e5cdb22b2799f2729930ef6394378570c66da251 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:56 +0100 Subject: [PATCH 066/141] drm/i915: Move assertion for iomap access to i915_vma_pin_iomap Access through the GTT requires the device to be awake. Ideally i915_vma_pin_iomap() is short-lived and the pinning demarcates the access through the iomap. This is not entirely true, we have a mixture of long lived pins that exceed the wakelock (such as legacy ringbuffers) and short lived pin that do live within the wakelock (such as execlist ringbuffers). Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-17-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_gtt.c | 3 +++ drivers/gpu/drm/i915/intel_ringbuffer.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 1bec50bd651b..738a474c5afa 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -3650,6 +3650,9 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma) { void __iomem *ptr; + /* Access through the GTT requires the device to be awake. 
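+	 * Note that the pin itself does not take that wakelock: long-lived
+	 * pins (such as the legacy ringbuffers) outlive any single wakelock
+	 * and must arrange their own runtime-pm reference around GTT access,
+	 * while short-lived pins (such as the execlist ringbuffers) live
+	 * entirely within one.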
*/ + assert_rpm_wakelock_held(to_i915(vma->vm->dev)); + lockdep_assert_held(&vma->vm->dev->struct_mutex); if (WARN_ON(!vma->obj->map_and_fenceable)) return IO_ERR_PTR(-ENODEV); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 81dc69d1ff05..4a614e567353 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1966,9 +1966,6 @@ int intel_ring_pin(struct intel_ring *ring) if (ret) goto err_unpin; - /* Access through the GTT requires the device to be awake. */ - assert_rpm_wakelock_held(dev_priv); - addr = (void __force *) i915_vma_pin_iomap(i915_gem_obj_to_ggtt(obj)); if (IS_ERR(addr)) { From 57e8853181198065bfd96b3690f6dee68d744745 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:57 +0100 Subject: [PATCH 067/141] drm/i915: Use VMA for ringbuffer tracking Use the GGTT VMA as the primary cookie for handing ring objects as the most common action upon the ring is mapping and unmapping which act upon the VMA itself. By restructuring the code to work with the ring VMA, we can shrink the code and remove a few cycles from context pinning. v2: Move the flush of the object back to before the first pin. We use the am-I-bound? query to only have to check the flush on the first bind and so avoid stalling on active rings. Lots of little renames and small hoops. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-18-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- drivers/gpu/drm/i915/i915_gpu_error.c | 4 +- drivers/gpu/drm/i915/i915_guc_submission.c | 16 +- drivers/gpu/drm/i915/intel_lrc.c | 17 +- drivers/gpu/drm/i915/intel_ringbuffer.c | 253 ++++++++++----------- drivers/gpu/drm/i915/intel_ringbuffer.h | 14 +- 6 files changed, 144 insertions(+), 162 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index eb8753fb107a..6e7cfbaff224 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -356,7 +356,7 @@ static int per_file_ctx_stats(int id, void *ptr, void *data) if (ctx->engine[n].state) per_file_stats(0, ctx->engine[n].state->obj, data); if (ctx->engine[n].ring) - per_file_stats(0, ctx->engine[n].ring->obj, data); + per_file_stats(0, ctx->engine[n].ring->vma->obj, data); } return 0; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 61708faebf79..27f973fbe80f 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1128,12 +1128,12 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, ee->cpu_ring_tail = ring->tail; ee->ringbuffer = i915_error_ggtt_object_create(dev_priv, - ring->obj); + ring->vma->obj); } ee->hws_page = i915_error_ggtt_object_create(dev_priv, - engine->status_page.obj); + engine->status_page.vma->obj); ee->wa_ctx = i915_error_ggtt_object_create(dev_priv, engine->wa_ctx.obj); diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index 4f0f173f9754..c40b92e212fa 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -343,7 +343,6 @@ static void guc_init_ctx_desc(struct intel_guc *guc, struct intel_context *ce = &ctx->engine[engine->id]; uint32_t guc_engine_id = engine->guc_id; struct guc_execlist_context *lrc = &desc.lrc[guc_engine_id]; - struct drm_i915_gem_object *obj; 
/* TODO: We have a design issue to be solved here. Only when we * receive the first batch, we know which engine is used by the @@ -358,17 +357,14 @@ static void guc_init_ctx_desc(struct intel_guc *guc, lrc->context_desc = lower_32_bits(ce->lrc_desc); /* The state page is after PPHWSP */ - gfx_addr = ce->state->node.start; - lrc->ring_lcra = gfx_addr + LRC_STATE_PN * PAGE_SIZE; + lrc->ring_lcra = + ce->state->node.start + LRC_STATE_PN * PAGE_SIZE; lrc->context_id = (client->ctx_index << GUC_ELC_CTXID_OFFSET) | (guc_engine_id << GUC_ELC_ENGINE_OFFSET); - obj = ce->ring->obj; - gfx_addr = i915_gem_obj_ggtt_offset(obj); - - lrc->ring_begin = gfx_addr; - lrc->ring_end = gfx_addr + obj->base.size - 1; - lrc->ring_next_free_location = gfx_addr; + lrc->ring_begin = ce->ring->vma->node.start; + lrc->ring_end = lrc->ring_begin + ce->ring->size - 1; + lrc->ring_next_free_location = lrc->ring_begin; lrc->ring_current_tail_pointer_value = 0; desc.engines_used |= (1 << guc_engine_id); @@ -943,7 +939,7 @@ static void guc_create_ads(struct intel_guc *guc) * to find it. */ engine = &dev_priv->engine[RCS]; - ads->golden_context_lrca = engine->status_page.gfx_addr; + ads->golden_context_lrca = engine->status_page.ggtt_offset; for_each_engine(engine, dev_priv) ads->eng_state_size[engine->guc_id] = intel_lr_context_size(engine); diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 5538e5c541fe..73dd2f9e0547 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1273,7 +1273,7 @@ static void lrc_init_hws(struct intel_engine_cs *engine) struct drm_i915_private *dev_priv = engine->i915; I915_WRITE(RING_HWS_PGA(engine->mmio_base), - (u32)engine->status_page.gfx_addr); + engine->status_page.ggtt_offset); POSTING_READ(RING_HWS_PGA(engine->mmio_base)); } @@ -1695,9 +1695,9 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine) intel_engine_cleanup_common(engine); - if (engine->status_page.obj) { - i915_gem_object_unpin_map(engine->status_page.obj); - engine->status_page.obj = NULL; + if (engine->status_page.vma) { + i915_gem_object_unpin_map(engine->status_page.vma->obj); + engine->status_page.vma = NULL; } intel_lr_context_unpin(dev_priv->kernel_context, engine); @@ -1744,16 +1744,17 @@ logical_ring_default_irqs(struct intel_engine_cs *engine) static int lrc_setup_hws(struct intel_engine_cs *engine, struct i915_vma *vma) { + const int hws_offset = LRC_PPHWSP_PN * PAGE_SIZE; void *hws; /* The HWSP is part of the default context object in LRC mode. 
*/ - engine->status_page.gfx_addr = - vma->node.start + LRC_PPHWSP_PN * PAGE_SIZE; hws = i915_gem_object_pin_map(vma->obj, I915_MAP_WB); if (IS_ERR(hws)) return PTR_ERR(hws); - engine->status_page.page_addr = hws + LRC_PPHWSP_PN * PAGE_SIZE; - engine->status_page.obj = vma->obj; + + engine->status_page.page_addr = hws + hws_offset; + engine->status_page.ggtt_offset = vma->node.start + hws_offset; + engine->status_page.vma = vma; return 0; } diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 4a614e567353..bdb1ab98b4f2 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -466,7 +466,7 @@ static void intel_ring_setup_status_page(struct intel_engine_cs *engine) mmio = RING_HWS_PGA(engine->mmio_base); } - I915_WRITE(mmio, (u32)engine->status_page.gfx_addr); + I915_WRITE(mmio, engine->status_page.ggtt_offset); POSTING_READ(mmio); /* @@ -531,7 +531,6 @@ static int init_ring_common(struct intel_engine_cs *engine) { struct drm_i915_private *dev_priv = engine->i915; struct intel_ring *ring = engine->buffer; - struct drm_i915_gem_object *obj = ring->obj; int ret = 0; intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); @@ -571,7 +570,7 @@ static int init_ring_common(struct intel_engine_cs *engine) * registers with the above sequence (the readback of the HEAD registers * also enforces ordering), otherwise the hw might lose the new ring * register values. */ - I915_WRITE_START(engine, i915_gem_obj_ggtt_offset(obj)); + I915_WRITE_START(engine, ring->vma->node.start); /* WaClearRingBufHeadRegAtInit:ctg,elk */ if (I915_READ_HEAD(engine)) @@ -586,16 +585,16 @@ static int init_ring_common(struct intel_engine_cs *engine) /* If the head is still not zero, the ring is dead */ if (wait_for((I915_READ_CTL(engine) & RING_VALID) != 0 && - I915_READ_START(engine) == i915_gem_obj_ggtt_offset(obj) && + I915_READ_START(engine) == ring->vma->node.start && (I915_READ_HEAD(engine) & HEAD_ADDR) == 0, 50)) { DRM_ERROR("%s initialization failed " - "ctl %08x (valid? %d) head %08x tail %08x start %08x [expected %08lx]\n", + "ctl %08x (valid? 
%d) head %08x tail %08x start %08x [expected %08llx]\n", engine->name, I915_READ_CTL(engine), I915_READ_CTL(engine) & RING_VALID, I915_READ_HEAD(engine), I915_READ_TAIL(engine), I915_READ_START(engine), - (unsigned long)i915_gem_obj_ggtt_offset(obj)); + ring->vma->node.start); ret = -EIO; goto out; } @@ -1853,79 +1852,79 @@ static void cleanup_phys_status_page(struct intel_engine_cs *engine) static void cleanup_status_page(struct intel_engine_cs *engine) { - struct drm_i915_gem_object *obj; + struct i915_vma *vma; - obj = engine->status_page.obj; - if (obj == NULL) + vma = fetch_and_zero(&engine->status_page.vma); + if (!vma) return; - kunmap(sg_page(obj->pages->sgl)); - i915_gem_object_ggtt_unpin(obj); - i915_gem_object_put(obj); - engine->status_page.obj = NULL; + i915_vma_unpin(vma); + i915_gem_object_unpin_map(vma->obj); + i915_vma_put(vma); } static int init_status_page(struct intel_engine_cs *engine) { - struct drm_i915_gem_object *obj = engine->status_page.obj; + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + unsigned int flags; + int ret; - if (obj == NULL) { - unsigned flags; - int ret; - - obj = i915_gem_object_create(&engine->i915->drm, 4096); - if (IS_ERR(obj)) { - DRM_ERROR("Failed to allocate status page\n"); - return PTR_ERR(obj); - } - - ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC); - if (ret) - goto err_unref; - - flags = 0; - if (!HAS_LLC(engine->i915)) - /* On g33, we cannot place HWS above 256MiB, so - * restrict its pinning to the low mappable arena. - * Though this restriction is not documented for - * gen4, gen5, or byt, they also behave similarly - * and hang if the HWS is placed at the top of the - * GTT. To generalise, it appears that all !llc - * platforms have issues with us placing the HWS - * above the mappable region (even though we never - * actualy map it). - */ - flags |= PIN_MAPPABLE; - ret = i915_gem_object_ggtt_pin(obj, NULL, 0, 4096, flags); - if (ret) { -err_unref: - i915_gem_object_put(obj); - return ret; - } - - engine->status_page.obj = obj; + obj = i915_gem_object_create(&engine->i915->drm, 4096); + if (IS_ERR(obj)) { + DRM_ERROR("Failed to allocate status page\n"); + return PTR_ERR(obj); } - engine->status_page.gfx_addr = i915_gem_obj_ggtt_offset(obj); - engine->status_page.page_addr = kmap(sg_page(obj->pages->sgl)); - memset(engine->status_page.page_addr, 0, PAGE_SIZE); + ret = i915_gem_object_set_cache_level(obj, I915_CACHE_LLC); + if (ret) + goto err; - DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n", - engine->name, engine->status_page.gfx_addr); + vma = i915_vma_create(obj, &engine->i915->ggtt.base, NULL); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err; + } + flags = PIN_GLOBAL; + if (!HAS_LLC(engine->i915)) + /* On g33, we cannot place HWS above 256MiB, so + * restrict its pinning to the low mappable arena. + * Though this restriction is not documented for + * gen4, gen5, or byt, they also behave similarly + * and hang if the HWS is placed at the top of the + * GTT. To generalise, it appears that all !llc + * platforms have issues with us placing the HWS + * above the mappable region (even though we never + * actualy map it). 
+ */ + flags |= PIN_MAPPABLE; + ret = i915_vma_pin(vma, 0, 4096, flags); + if (ret) + goto err; + + engine->status_page.vma = vma; + engine->status_page.ggtt_offset = vma->node.start; + engine->status_page.page_addr = + i915_gem_object_pin_map(obj, I915_MAP_WB); + + DRM_DEBUG_DRIVER("%s hws offset: 0x%08llx\n", + engine->name, vma->node.start); return 0; + +err: + i915_gem_object_put(obj); + return ret; } static int init_phys_status_page(struct intel_engine_cs *engine) { struct drm_i915_private *dev_priv = engine->i915; - if (!dev_priv->status_page_dmah) { - dev_priv->status_page_dmah = - drm_pci_alloc(&dev_priv->drm, PAGE_SIZE, PAGE_SIZE); - if (!dev_priv->status_page_dmah) - return -ENOMEM; - } + dev_priv->status_page_dmah = + drm_pci_alloc(&dev_priv->drm, PAGE_SIZE, PAGE_SIZE); + if (!dev_priv->status_page_dmah) + return -ENOMEM; engine->status_page.page_addr = dev_priv->status_page_dmah->vaddr; memset(engine->status_page.page_addr, 0, PAGE_SIZE); @@ -1935,52 +1934,43 @@ static int init_phys_status_page(struct intel_engine_cs *engine) int intel_ring_pin(struct intel_ring *ring) { - struct drm_i915_private *dev_priv = ring->engine->i915; - struct drm_i915_gem_object *obj = ring->obj; /* Ring wraparound at offset 0 sometimes hangs. No idea why. */ - unsigned flags = PIN_OFFSET_BIAS | 4096; + unsigned int flags = PIN_GLOBAL | PIN_OFFSET_BIAS | 4096; + struct i915_vma *vma = ring->vma; void *addr; int ret; - if (HAS_LLC(dev_priv) && !obj->stolen) { - ret = i915_gem_object_ggtt_pin(obj, NULL, 0, PAGE_SIZE, flags); - if (ret) + GEM_BUG_ON(ring->vaddr); + + if (ring->needs_iomap) + flags |= PIN_MAPPABLE; + + if (!(vma->flags & I915_VMA_GLOBAL_BIND)) { + if (flags & PIN_MAPPABLE) + ret = i915_gem_object_set_to_gtt_domain(vma->obj, true); + else + ret = i915_gem_object_set_to_cpu_domain(vma->obj, true); + if (unlikely(ret)) return ret; - - ret = i915_gem_object_set_to_cpu_domain(obj, true); - if (ret) - goto err_unpin; - - addr = i915_gem_object_pin_map(obj, I915_MAP_WB); - if (IS_ERR(addr)) { - ret = PTR_ERR(addr); - goto err_unpin; - } - } else { - ret = i915_gem_object_ggtt_pin(obj, NULL, 0, PAGE_SIZE, - flags | PIN_MAPPABLE); - if (ret) - return ret; - - ret = i915_gem_object_set_to_gtt_domain(obj, true); - if (ret) - goto err_unpin; - - addr = (void __force *) - i915_vma_pin_iomap(i915_gem_obj_to_ggtt(obj)); - if (IS_ERR(addr)) { - ret = PTR_ERR(addr); - goto err_unpin; - } } + ret = i915_vma_pin(vma, 0, PAGE_SIZE, flags); + if (unlikely(ret)) + return ret; + + if (flags & PIN_MAPPABLE) + addr = (void __force *)i915_vma_pin_iomap(vma); + else + addr = i915_gem_object_pin_map(vma->obj, I915_MAP_WB); + if (IS_ERR(addr)) + goto err; + ring->vaddr = addr; - ring->vma = i915_gem_obj_to_ggtt(obj); return 0; -err_unpin: - i915_gem_object_ggtt_unpin(obj); - return ret; +err: + i915_vma_unpin(vma); + return PTR_ERR(addr); } void intel_ring_unpin(struct intel_ring *ring) @@ -1988,60 +1978,56 @@ void intel_ring_unpin(struct intel_ring *ring) GEM_BUG_ON(!ring->vma); GEM_BUG_ON(!ring->vaddr); - if (HAS_LLC(ring->engine->i915) && !ring->obj->stolen) - i915_gem_object_unpin_map(ring->obj); - else + if (ring->needs_iomap) i915_vma_unpin_iomap(ring->vma); + else + i915_gem_object_unpin_map(ring->vma->obj); ring->vaddr = NULL; - i915_gem_object_ggtt_unpin(ring->obj); - ring->vma = NULL; + i915_vma_unpin(ring->vma); } -static void intel_destroy_ringbuffer_obj(struct intel_ring *ring) -{ - i915_gem_object_put(ring->obj); - ring->obj = NULL; -} - -static int intel_alloc_ringbuffer_obj(struct drm_device *dev, - 
struct intel_ring *ring) +static struct i915_vma * +intel_ring_create_vma(struct drm_i915_private *dev_priv, int size) { struct drm_i915_gem_object *obj; + struct i915_vma *vma; - obj = NULL; - if (!HAS_LLC(dev)) - obj = i915_gem_object_create_stolen(dev, ring->size); - if (obj == NULL) - obj = i915_gem_object_create(dev, ring->size); + obj = ERR_PTR(-ENODEV); + if (!HAS_LLC(dev_priv)) + obj = i915_gem_object_create_stolen(&dev_priv->drm, size); if (IS_ERR(obj)) - return PTR_ERR(obj); + obj = i915_gem_object_create(&dev_priv->drm, size); + if (IS_ERR(obj)) + return ERR_CAST(obj); /* mark ring buffers as read-only from GPU side by default */ obj->gt_ro = 1; - ring->obj = obj; + vma = i915_vma_create(obj, &dev_priv->ggtt.base, NULL); + if (IS_ERR(vma)) + goto err; - return 0; + return vma; + +err: + i915_gem_object_put(obj); + return vma; } struct intel_ring * intel_engine_create_ring(struct intel_engine_cs *engine, int size) { struct intel_ring *ring; - int ret; + struct i915_vma *vma; GEM_BUG_ON(!is_power_of_2(size)); ring = kzalloc(sizeof(*ring), GFP_KERNEL); - if (ring == NULL) { - DRM_DEBUG_DRIVER("Failed to allocate ringbuffer %s\n", - engine->name); + if (!ring) return ERR_PTR(-ENOMEM); - } ring->engine = engine; - list_add(&ring->link, &engine->buffers); INIT_LIST_HEAD(&ring->request_list); @@ -2057,22 +2043,23 @@ intel_engine_create_ring(struct intel_engine_cs *engine, int size) ring->last_retired_head = -1; intel_ring_update_space(ring); - ret = intel_alloc_ringbuffer_obj(&engine->i915->drm, ring); - if (ret) { - DRM_DEBUG_DRIVER("Failed to allocate ringbuffer %s: %d\n", - engine->name, ret); - list_del(&ring->link); + vma = intel_ring_create_vma(engine->i915, size); + if (IS_ERR(vma)) { kfree(ring); - return ERR_PTR(ret); + return ERR_CAST(vma); } + ring->vma = vma; + if (!HAS_LLC(engine->i915) || vma->obj->stolen) + ring->needs_iomap = true; + list_add(&ring->link, &engine->buffers); return ring; } void intel_ring_free(struct intel_ring *ring) { - intel_destroy_ringbuffer_obj(ring); + i915_vma_put(ring->vma); list_del(&ring->link); kfree(ring); } @@ -2166,7 +2153,6 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine) ret = PTR_ERR(ring); goto error; } - engine->buffer = ring; if (I915_NEED_GFX_HWS(dev_priv)) { ret = init_status_page(engine); @@ -2181,11 +2167,10 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine) ret = intel_ring_pin(ring); if (ret) { - DRM_ERROR("Failed to pin and map ringbuffer %s: %d\n", - engine->name, ret); - intel_destroy_ringbuffer_obj(ring); + intel_ring_free(ring); goto error; } + engine->buffer = ring; return 0; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index ea2735144b2a..bc692d519a72 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -26,10 +26,10 @@ */ #define I915_RING_FREE_SPACE 64 -struct intel_hw_status_page { - u32 *page_addr; - unsigned int gfx_addr; - struct drm_i915_gem_object *obj; +struct intel_hw_status_page { + struct i915_vma *vma; + u32 *page_addr; + u32 ggtt_offset; }; #define I915_READ_TAIL(engine) I915_READ(RING_TAIL((engine)->mmio_base)) @@ -83,9 +83,8 @@ struct intel_engine_hangcheck { }; struct intel_ring { - struct drm_i915_gem_object *obj; - void *vaddr; struct i915_vma *vma; + void *vaddr; struct intel_engine_cs *engine; struct list_head link; @@ -97,6 +96,7 @@ struct intel_ring { int space; int size; int effective_size; + bool needs_iomap; /** We track the position of the requests in the ring 
buffer, and * when each is retired we increment last_retired_head as the GPU @@ -516,7 +516,7 @@ int init_workarounds_ring(struct intel_engine_cs *engine); static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine) { - return engine->status_page.gfx_addr + I915_GEM_HWS_INDEX_ADDR; + return engine->status_page.ggtt_offset + I915_GEM_HWS_INDEX_ADDR; } /* intel_breadcrumbs.c -- user interrupt bottom-half for waiters */ From 56c0f1a7c1ae68cb719fc9c8aba35d1f86149b29 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:58 +0100 Subject: [PATCH 068/141] drm/i915: Use VMA for scratch page tracking Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-19-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_context.c | 2 +- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- drivers/gpu/drm/i915/intel_display.c | 2 +- drivers/gpu/drm/i915/intel_lrc.c | 18 ++++---- drivers/gpu/drm/i915/intel_ringbuffer.c | 55 ++++++++++++++----------- drivers/gpu/drm/i915/intel_ringbuffer.h | 10 ++--- 6 files changed, 46 insertions(+), 43 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 824dfe14bcd0..e566167d9441 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -660,7 +660,7 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags) MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT); intel_ring_emit_reg(ring, last_reg); - intel_ring_emit(ring, engine->scratch.gtt_offset); + intel_ring_emit(ring, engine->scratch->node.start); intel_ring_emit(ring, MI_NOOP); } intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_ENABLE); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 27f973fbe80f..c327733e6735 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1101,7 +1101,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, if (HAS_BROKEN_CS_TLB(dev_priv)) ee->wa_batchbuffer = i915_error_ggtt_object_create(dev_priv, - engine->scratch.obj); + engine->scratch->obj); if (request->ctx->engine[i].state) { ee->ctx = i915_error_ggtt_object_create(dev_priv, diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index c5c0c35d4f6e..2e7d03c5bf5c 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -11795,7 +11795,7 @@ static int intel_gen7_queue_flip(struct drm_device *dev, intel_ring_emit(ring, MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT); intel_ring_emit_reg(ring, DERRMR); - intel_ring_emit(ring, req->engine->scratch.gtt_offset + 256); + intel_ring_emit(ring, req->engine->scratch->node.start + 256); if (IS_GEN8(dev)) { intel_ring_emit(ring, 0); intel_ring_emit(ring, MI_NOOP); diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 73dd2f9e0547..42999ba02152 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -914,7 +914,7 @@ static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT)); wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4); - wa_ctx_emit(batch, index, engine->scratch.gtt_offset + 256); + wa_ctx_emit(batch, index, engine->scratch->node.start + 256); wa_ctx_emit(batch, index, 0); wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1)); @@ -932,7 +932,7 @@ 
static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT)); wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4); - wa_ctx_emit(batch, index, engine->scratch.gtt_offset + 256); + wa_ctx_emit(batch, index, engine->scratch->node.start + 256); wa_ctx_emit(batch, index, 0); return index; @@ -993,7 +993,7 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *engine, /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ /* Actual scratch location is at 128 bytes offset */ - scratch_addr = engine->scratch.gtt_offset + 2*CACHELINE_BYTES; + scratch_addr = engine->scratch->node.start + 2 * CACHELINE_BYTES; wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 | @@ -1072,8 +1072,8 @@ static int gen9_init_indirectctx_bb(struct intel_engine_cs *engine, /* WaClearSlmSpaceAtContextSwitch:kbl */ /* Actual scratch location is at 128 bytes offset */ if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_A0)) { - uint32_t scratch_addr - = engine->scratch.gtt_offset + 2*CACHELINE_BYTES; + u32 scratch_addr = + engine->scratch->node.start + 2 * CACHELINE_BYTES; wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 | @@ -1215,7 +1215,7 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine) } /* some WA perform writes to scratch page, ensure it is valid */ - if (engine->scratch.obj == NULL) { + if (!engine->scratch) { DRM_ERROR("scratch page not allocated for %s\n", engine->name); return -EINVAL; } @@ -1483,7 +1483,7 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request, { struct intel_ring *ring = request->ring; struct intel_engine_cs *engine = request->engine; - u32 scratch_addr = engine->scratch.gtt_offset + 2 * CACHELINE_BYTES; + u32 scratch_addr = engine->scratch->node.start + 2 * CACHELINE_BYTES; bool vf_flush_wa = false, dc_flush_wa = false; u32 flags = 0; int ret; @@ -1844,11 +1844,11 @@ int logical_render_ring_init(struct intel_engine_cs *engine) else engine->init_hw = gen8_init_render_ring; engine->init_context = gen8_init_rcs_context; - engine->cleanup = intel_fini_pipe_control; + engine->cleanup = intel_engine_cleanup_scratch; engine->emit_flush = gen8_emit_flush_render; engine->emit_request = gen8_emit_request_render; - ret = intel_init_pipe_control(engine, 4096); + ret = intel_engine_create_scratch(engine, 4096); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index bdb1ab98b4f2..7ce912f8d96c 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -176,7 +176,7 @@ intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req) { struct intel_ring *ring = req->ring; u32 scratch_addr = - req->engine->scratch.gtt_offset + 2 * CACHELINE_BYTES; + req->engine->scratch->node.start + 2 * CACHELINE_BYTES; int ret; ret = intel_ring_begin(req, 6); @@ -212,7 +212,7 @@ gen6_render_ring_flush(struct drm_i915_gem_request *req, u32 mode) { struct intel_ring *ring = req->ring; u32 scratch_addr = - req->engine->scratch.gtt_offset + 2 * CACHELINE_BYTES; + req->engine->scratch->node.start + 2 * CACHELINE_BYTES; u32 flags = 0; int ret; @@ -286,7 +286,7 @@ gen7_render_ring_flush(struct drm_i915_gem_request *req, u32 mode) { struct intel_ring *ring = req->ring; u32 scratch_addr = - req->engine->scratch.gtt_offset + 2 * CACHELINE_BYTES; + req->engine->scratch->node.start + 2 * CACHELINE_BYTES; u32 flags 
= 0; int ret; @@ -370,7 +370,8 @@ gen8_emit_pipe_control(struct drm_i915_gem_request *req, static int gen8_render_ring_flush(struct drm_i915_gem_request *req, u32 mode) { - u32 scratch_addr = req->engine->scratch.gtt_offset + 2 * CACHELINE_BYTES; + u32 scratch_addr = + req->engine->scratch->node.start + 2 * CACHELINE_BYTES; u32 flags = 0; int ret; @@ -612,45 +613,51 @@ out: return ret; } -void intel_fini_pipe_control(struct intel_engine_cs *engine) +void intel_engine_cleanup_scratch(struct intel_engine_cs *engine) { - if (engine->scratch.obj == NULL) + struct i915_vma *vma; + + vma = fetch_and_zero(&engine->scratch); + if (!vma) return; - i915_gem_object_ggtt_unpin(engine->scratch.obj); - i915_gem_object_put(engine->scratch.obj); - engine->scratch.obj = NULL; + i915_vma_unpin(vma); + i915_vma_put(vma); } -int intel_init_pipe_control(struct intel_engine_cs *engine, int size) +int intel_engine_create_scratch(struct intel_engine_cs *engine, int size) { struct drm_i915_gem_object *obj; + struct i915_vma *vma; int ret; - WARN_ON(engine->scratch.obj); + WARN_ON(engine->scratch); obj = i915_gem_object_create_stolen(&engine->i915->drm, size); if (!obj) obj = i915_gem_object_create(&engine->i915->drm, size); if (IS_ERR(obj)) { DRM_ERROR("Failed to allocate scratch page\n"); - ret = PTR_ERR(obj); - goto err; + return PTR_ERR(obj); } - ret = i915_gem_object_ggtt_pin(obj, NULL, 0, 4096, PIN_HIGH); + vma = i915_vma_create(obj, &engine->i915->ggtt.base, NULL); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err_unref; + } + + ret = i915_vma_pin(vma, 0, 4096, PIN_GLOBAL | PIN_HIGH); if (ret) goto err_unref; - engine->scratch.obj = obj; - engine->scratch.gtt_offset = i915_gem_obj_ggtt_offset(obj); - DRM_DEBUG_DRIVER("%s pipe control offset: 0x%08x\n", - engine->name, engine->scratch.gtt_offset); + engine->scratch = vma; + DRM_DEBUG_DRIVER("%s pipe control offset: 0x%08llx\n", + engine->name, vma->node.start); return 0; err_unref: - i915_gem_object_put(engine->scratch.obj); -err: + i915_gem_object_put(obj); return ret; } @@ -1305,7 +1312,7 @@ static void render_ring_cleanup(struct intel_engine_cs *engine) dev_priv->semaphore_obj = NULL; } - intel_fini_pipe_control(engine); + intel_engine_cleanup_scratch(engine); } static int gen8_rcs_signal(struct drm_i915_gem_request *req) @@ -1763,7 +1770,7 @@ i830_emit_bb_start(struct drm_i915_gem_request *req, unsigned int dispatch_flags) { struct intel_ring *ring = req->ring; - u32 cs_offset = req->engine->scratch.gtt_offset; + u32 cs_offset = req->engine->scratch->node.start; int ret; ret = intel_ring_begin(req, 6); @@ -2793,11 +2800,11 @@ int intel_init_render_ring_buffer(struct intel_engine_cs *engine) return ret; if (INTEL_GEN(dev_priv) >= 6) { - ret = intel_init_pipe_control(engine, 4096); + ret = intel_engine_create_scratch(engine, 4096); if (ret) return ret; } else if (HAS_BROKEN_CS_TLB(dev_priv)) { - ret = intel_init_pipe_control(engine, I830_WA_SIZE); + ret = intel_engine_create_scratch(engine, I830_WA_SIZE); if (ret) return ret; } diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index bc692d519a72..c6024db75f93 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -198,6 +198,7 @@ struct intel_engine_cs { struct intel_hw_status_page status_page; struct i915_ctx_workarounds wa_ctx; + struct i915_vma *scratch; u32 irq_keep_mask; /* always keep these interrupts */ u32 irq_enable_mask; /* bitmask to enable ring interrupt */ @@ -320,11 +321,6 @@ struct intel_engine_cs 
{ struct intel_engine_hangcheck hangcheck; - struct { - struct drm_i915_gem_object *obj; - u32 gtt_offset; - } scratch; - bool needs_cmd_parser; /* @@ -476,8 +472,8 @@ void intel_ring_update_space(struct intel_ring *ring); void intel_engine_init_seqno(struct intel_engine_cs *engine, u32 seqno); -int intel_init_pipe_control(struct intel_engine_cs *engine, int size); -void intel_fini_pipe_control(struct intel_engine_cs *engine); +int intel_engine_create_scratch(struct intel_engine_cs *engine, int size); +void intel_engine_cleanup_scratch(struct intel_engine_cs *engine); void intel_engine_setup_common(struct intel_engine_cs *engine); int intel_engine_init_common(struct intel_engine_cs *engine); From adc320c4b70916c7bed1420c912b83eb0c2b2c24 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:48:59 +0100 Subject: [PATCH 069/141] drm/i915: Move common scratch allocation/destroy to intel_engine_cs.c Since the scratch allocation and cleanup is shared by all engine submission backends, move it out of the legacy intel_ringbuffer.c and into the new home for common routines, intel_engine_cs.c Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-20-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/intel_engine_cs.c | 50 +++++++++++++++++++++++++ drivers/gpu/drm/i915/intel_lrc.c | 1 - drivers/gpu/drm/i915/intel_ringbuffer.c | 50 ------------------------- drivers/gpu/drm/i915/intel_ringbuffer.h | 4 +- 4 files changed, 51 insertions(+), 54 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 186c12d07f99..7104dec5e893 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -195,6 +195,54 @@ void intel_engine_setup_common(struct intel_engine_cs *engine) i915_gem_batch_pool_init(engine, &engine->batch_pool); } +int intel_engine_create_scratch(struct intel_engine_cs *engine, int size) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int ret; + + WARN_ON(engine->scratch); + + obj = i915_gem_object_create_stolen(&engine->i915->drm, size); + if (!obj) + obj = i915_gem_object_create(&engine->i915->drm, size); + if (IS_ERR(obj)) { + DRM_ERROR("Failed to allocate scratch page\n"); + return PTR_ERR(obj); + } + + vma = i915_vma_create(obj, &engine->i915->ggtt.base, NULL); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err_unref; + } + + ret = i915_vma_pin(vma, 0, 4096, PIN_GLOBAL | PIN_HIGH); + if (ret) + goto err_unref; + + engine->scratch = vma; + DRM_DEBUG_DRIVER("%s pipe control offset: 0x%08llx\n", + engine->name, vma->node.start); + return 0; + +err_unref: + i915_gem_object_put(obj); + return ret; +} + +static void intel_engine_cleanup_scratch(struct intel_engine_cs *engine) +{ + struct i915_vma *vma; + + vma = fetch_and_zero(&engine->scratch); + if (!vma) + return; + + i915_vma_unpin(vma); + i915_vma_put(vma); +} + /** * intel_engines_init_common - initialize cengine state which might require hw access * @engine: Engine to initialize. 
@@ -226,6 +274,8 @@ int intel_engine_init_common(struct intel_engine_cs *engine) */ void intel_engine_cleanup_common(struct intel_engine_cs *engine) { + intel_engine_cleanup_scratch(engine); + intel_engine_cleanup_cmd_parser(engine); intel_engine_fini_breadcrumbs(engine); i915_gem_batch_pool_fini(&engine->batch_pool); diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 42999ba02152..56c904e2dc98 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1844,7 +1844,6 @@ int logical_render_ring_init(struct intel_engine_cs *engine) else engine->init_hw = gen8_init_render_ring; engine->init_context = gen8_init_rcs_context; - engine->cleanup = intel_engine_cleanup_scratch; engine->emit_flush = gen8_emit_flush_render; engine->emit_request = gen8_emit_request_render; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 7ce912f8d96c..c89aea55bc10 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -613,54 +613,6 @@ out: return ret; } -void intel_engine_cleanup_scratch(struct intel_engine_cs *engine) -{ - struct i915_vma *vma; - - vma = fetch_and_zero(&engine->scratch); - if (!vma) - return; - - i915_vma_unpin(vma); - i915_vma_put(vma); -} - -int intel_engine_create_scratch(struct intel_engine_cs *engine, int size) -{ - struct drm_i915_gem_object *obj; - struct i915_vma *vma; - int ret; - - WARN_ON(engine->scratch); - - obj = i915_gem_object_create_stolen(&engine->i915->drm, size); - if (!obj) - obj = i915_gem_object_create(&engine->i915->drm, size); - if (IS_ERR(obj)) { - DRM_ERROR("Failed to allocate scratch page\n"); - return PTR_ERR(obj); - } - - vma = i915_vma_create(obj, &engine->i915->ggtt.base, NULL); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - goto err_unref; - } - - ret = i915_vma_pin(vma, 0, 4096, PIN_GLOBAL | PIN_HIGH); - if (ret) - goto err_unref; - - engine->scratch = vma; - DRM_DEBUG_DRIVER("%s pipe control offset: 0x%08llx\n", - engine->name, vma->node.start); - return 0; - -err_unref: - i915_gem_object_put(obj); - return ret; -} - static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req) { struct intel_ring *ring = req->ring; @@ -1311,8 +1263,6 @@ static void render_ring_cleanup(struct intel_engine_cs *engine) i915_gem_object_put(dev_priv->semaphore_obj); dev_priv->semaphore_obj = NULL; } - - intel_engine_cleanup_scratch(engine); } static int gen8_rcs_signal(struct drm_i915_gem_request *req) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index c6024db75f93..dca84258be16 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -472,11 +472,9 @@ void intel_ring_update_space(struct intel_ring *ring); void intel_engine_init_seqno(struct intel_engine_cs *engine, u32 seqno); -int intel_engine_create_scratch(struct intel_engine_cs *engine, int size); -void intel_engine_cleanup_scratch(struct intel_engine_cs *engine); - void intel_engine_setup_common(struct intel_engine_cs *engine); int intel_engine_init_common(struct intel_engine_cs *engine); +int intel_engine_create_scratch(struct intel_engine_cs *engine, int size); void intel_engine_cleanup_common(struct intel_engine_cs *engine); static inline int intel_engine_idle(struct intel_engine_cs *engine, From 57f275a22b0812b140bf2e581f5e9855f27b78f1 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:49:00 +0100 Subject: [PATCH 070/141] drm/i915: Move common 
seqno reset to intel_engine_cs.c Since the intel_engine_init_seqno() is shared by all engine submission backends, move it out of the legacy intel_ringbuffer.c and into the new home for common routines, intel_engine_cs.c Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-21-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/intel_engine_cs.c | 42 +++++++++++++++++++++++++ drivers/gpu/drm/i915/intel_ringbuffer.c | 42 ------------------------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 7104dec5e893..829624571ca4 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -161,6 +161,48 @@ cleanup: return ret; } +void intel_engine_init_seqno(struct intel_engine_cs *engine, u32 seqno) +{ + struct drm_i915_private *dev_priv = engine->i915; + + /* Our semaphore implementation is strictly monotonic (i.e. we proceed + * so long as the semaphore value in the register/page is greater + * than the sync value), so whenever we reset the seqno, + * so long as we reset the tracking semaphore value to 0, it will + * always be before the next request's seqno. If we don't reset + * the semaphore value, then when the seqno moves backwards all + * future waits will complete instantly (causing rendering corruption). + */ + if (IS_GEN6(dev_priv) || IS_GEN7(dev_priv)) { + I915_WRITE(RING_SYNC_0(engine->mmio_base), 0); + I915_WRITE(RING_SYNC_1(engine->mmio_base), 0); + if (HAS_VEBOX(dev_priv)) + I915_WRITE(RING_SYNC_2(engine->mmio_base), 0); + } + if (dev_priv->semaphore_obj) { + struct drm_i915_gem_object *obj = dev_priv->semaphore_obj; + struct page *page = i915_gem_object_get_dirty_page(obj, 0); + void *semaphores = kmap(page); + memset(semaphores + GEN8_SEMAPHORE_OFFSET(engine->id, 0), + 0, I915_NUM_ENGINES * gen8_semaphore_seqno_size); + kunmap(page); + } + memset(engine->semaphore.sync_seqno, 0, + sizeof(engine->semaphore.sync_seqno)); + + intel_write_status_page(engine, I915_GEM_HWS_INDEX, seqno); + if (engine->irq_seqno_barrier) + engine->irq_seqno_barrier(engine); + engine->last_submitted_seqno = seqno; + + engine->hangcheck.seqno = seqno; + + /* After manually advancing the seqno, fake the interrupt in case + * there are any waiters for that seqno. + */ + intel_engine_wakeup(engine); +} + void intel_engine_init_hangcheck(struct intel_engine_cs *engine) { memset(&engine->hangcheck, 0, sizeof(engine->hangcheck)); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index c89aea55bc10..6008d54b9152 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -2314,48 +2314,6 @@ int intel_ring_cacheline_align(struct drm_i915_gem_request *req) return 0; } -void intel_engine_init_seqno(struct intel_engine_cs *engine, u32 seqno) -{ - struct drm_i915_private *dev_priv = engine->i915; - - /* Our semaphore implementation is strictly monotonic (i.e. we proceed - * so long as the semaphore value in the register/page is greater - * than the sync value), so whenever we reset the seqno, - * so long as we reset the tracking semaphore value to 0, it will - * always be before the next request's seqno. If we don't reset - * the semaphore value, then when the seqno moves backwards all - * future waits will complete instantly (causing rendering corruption). 
- */ - if (IS_GEN6(dev_priv) || IS_GEN7(dev_priv)) { - I915_WRITE(RING_SYNC_0(engine->mmio_base), 0); - I915_WRITE(RING_SYNC_1(engine->mmio_base), 0); - if (HAS_VEBOX(dev_priv)) - I915_WRITE(RING_SYNC_2(engine->mmio_base), 0); - } - if (dev_priv->semaphore_obj) { - struct drm_i915_gem_object *obj = dev_priv->semaphore_obj; - struct page *page = i915_gem_object_get_dirty_page(obj, 0); - void *semaphores = kmap(page); - memset(semaphores + GEN8_SEMAPHORE_OFFSET(engine->id, 0), - 0, I915_NUM_ENGINES * gen8_semaphore_seqno_size); - kunmap(page); - } - memset(engine->semaphore.sync_seqno, 0, - sizeof(engine->semaphore.sync_seqno)); - - intel_write_status_page(engine, I915_GEM_HWS_INDEX, seqno); - if (engine->irq_seqno_barrier) - engine->irq_seqno_barrier(engine); - engine->last_submitted_seqno = seqno; - - engine->hangcheck.seqno = seqno; - - /* After manually advancing the seqno, fake the interrupt in case - * there are any waiters for that seqno. - */ - intel_engine_wakeup(engine); -} - static void gen6_bsd_submit_request(struct drm_i915_gem_request *request) { struct drm_i915_private *dev_priv = request->i915; From 9b3b7841b86d54e95a43a747ee401883d218ebca Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:49:01 +0100 Subject: [PATCH 071/141] drm/i915/overlay: Use VMA as the primary tracker for images Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-22-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/intel_overlay.c | 39 ++++++++++++++++------------ 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c index 90f3ab424e01..d930e3a4a9cd 100644 --- a/drivers/gpu/drm/i915/intel_overlay.c +++ b/drivers/gpu/drm/i915/intel_overlay.c @@ -171,8 +171,8 @@ struct overlay_registers { struct intel_overlay { struct drm_i915_private *i915; struct intel_crtc *crtc; - struct drm_i915_gem_object *vid_bo; - struct drm_i915_gem_object *old_vid_bo; + struct i915_vma *vma; + struct i915_vma *old_vma; bool active; bool pfit_active; u32 pfit_vscale_ratio; /* shifted-point number, (1<<12) == 1.0 */ @@ -317,15 +317,17 @@ static void intel_overlay_release_old_vid_tail(struct i915_gem_active *active, { struct intel_overlay *overlay = container_of(active, typeof(*overlay), last_flip); - struct drm_i915_gem_object *obj = overlay->old_vid_bo; + struct i915_vma *vma; - i915_gem_track_fb(obj, NULL, + vma = fetch_and_zero(&overlay->old_vma); + if (WARN_ON(!vma)) + return; + + i915_gem_track_fb(vma->obj, NULL, INTEL_FRONTBUFFER_OVERLAY(overlay->crtc->pipe)); - i915_gem_object_ggtt_unpin(obj); - i915_gem_object_put(obj); - - overlay->old_vid_bo = NULL; + i915_gem_object_unpin_from_display_plane(vma->obj, &i915_ggtt_view_normal); + i915_vma_put(vma); } static void intel_overlay_off_tail(struct i915_gem_active *active, @@ -333,15 +335,15 @@ static void intel_overlay_off_tail(struct i915_gem_active *active, { struct intel_overlay *overlay = container_of(active, typeof(*overlay), last_flip); - struct drm_i915_gem_object *obj = overlay->vid_bo; + struct i915_vma *vma; /* never have the overlay hw on without showing a frame */ - if (WARN_ON(!obj)) + vma = fetch_and_zero(&overlay->vma); + if (WARN_ON(!vma)) return; - i915_gem_object_ggtt_unpin(obj); - i915_gem_object_put(obj); - overlay->vid_bo = NULL; + i915_gem_object_unpin_from_display_plane(vma->obj, &i915_ggtt_view_normal); + i915_vma_put(vma); overlay->crtc->overlay = NULL; 
overlay->crtc = NULL; @@ -421,7 +423,7 @@ static int intel_overlay_release_old_vid(struct intel_overlay *overlay) /* Only wait if there is actually an old frame to release to * guarantee forward progress. */ - if (!overlay->old_vid_bo) + if (!overlay->old_vma) return 0; if (I915_READ(ISR) & I915_OVERLAY_PLANE_FLIP_PENDING_INTERRUPT) { @@ -744,6 +746,7 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, struct drm_i915_private *dev_priv = overlay->i915; u32 swidth, swidthsw, sheight, ostride; enum pipe pipe = overlay->crtc->pipe; + struct i915_vma *vma; lockdep_assert_held(&dev_priv->drm.struct_mutex); WARN_ON(!drm_modeset_is_locked(&dev_priv->drm.mode_config.connection_mutex)); @@ -757,6 +760,8 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, if (ret != 0) return ret; + vma = i915_gem_obj_to_ggtt_view(new_bo, &i915_ggtt_view_normal); + ret = i915_gem_object_put_fence(new_bo); if (ret) goto out_unpin; @@ -834,11 +839,11 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, if (ret) goto out_unpin; - i915_gem_track_fb(overlay->vid_bo, new_bo, + i915_gem_track_fb(overlay->vma->obj, new_bo, INTEL_FRONTBUFFER_OVERLAY(pipe)); - overlay->old_vid_bo = overlay->vid_bo; - overlay->vid_bo = new_bo; + overlay->old_vma = overlay->vma; + overlay->vma = vma; intel_frontbuffer_flip(dev_priv, INTEL_FRONTBUFFER_OVERLAY(pipe)); From 51d545d0268f3e1b68aa71c906377c2098462e4f Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:49:02 +0100 Subject: [PATCH 072/141] drm/i915: Use VMA as the primary tracker for semaphore page Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-23-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 4 +- drivers/gpu/drm/i915/i915_gpu_error.c | 16 +++---- drivers/gpu/drm/i915/intel_engine_cs.c | 12 +++-- drivers/gpu/drm/i915/intel_ringbuffer.c | 60 ++++++++++++++----------- drivers/gpu/drm/i915/intel_ringbuffer.h | 4 +- 6 files changed, 55 insertions(+), 43 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 6e7cfbaff224..a922b78350d1 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -3189,7 +3189,7 @@ static int i915_semaphore_status(struct seq_file *m, void *unused) struct page *page; uint64_t *seqno; - page = i915_gem_object_get_page(dev_priv->semaphore_obj, 0); + page = i915_gem_object_get_page(dev_priv->semaphore->obj, 0); seqno = (uint64_t *)kmap_atomic(page); for_each_engine_id(engine, dev_priv, id) { diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 259425d99e17..50dc3613c61c 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -733,7 +733,7 @@ struct drm_i915_error_state { u64 fence[I915_MAX_NUM_FENCES]; struct intel_overlay_error_state *overlay; struct intel_display_error_state *display; - struct drm_i915_error_object *semaphore_obj; + struct drm_i915_error_object *semaphore; struct drm_i915_error_engine { int engine_id; @@ -1750,7 +1750,7 @@ struct drm_i915_private { struct pci_dev *bridge_dev; struct i915_gem_context *kernel_context; struct intel_engine_cs engine[I915_NUM_ENGINES]; - struct drm_i915_gem_object *semaphore_obj; + struct i915_vma *semaphore; u32 next_seqno; struct drm_dma_handle *status_page_dmah; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c index c327733e6735..4068630bfc68 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -549,7 +549,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, } } - if ((obj = error->semaphore_obj)) { + if ((obj = error->semaphore)) { err_printf(m, "Semaphore page = 0x%08x\n", lower_32_bits(obj->gtt_offset)); for (elt = 0; elt < PAGE_SIZE/16; elt += 4) { @@ -640,7 +640,7 @@ static void i915_error_state_free(struct kref *error_ref) kfree(ee->waiters); } - i915_error_object_free(error->semaphore_obj); + i915_error_object_free(error->semaphore); for (i = 0; i < ARRAY_SIZE(error->active_bo); i++) kfree(error->active_bo[i]); @@ -876,7 +876,7 @@ static void gen8_record_semaphore_state(struct drm_i915_error_state *error, struct intel_engine_cs *to; enum intel_engine_id id; - if (!error->semaphore_obj) + if (!error->semaphore) return; for_each_engine_id(to, dev_priv, id) { @@ -889,7 +889,7 @@ static void gen8_record_semaphore_state(struct drm_i915_error_state *error, signal_offset = (GEN8_SIGNAL_OFFSET(engine, id) & (PAGE_SIZE - 1)) / 4; - tmp = error->semaphore_obj->pages[0]; + tmp = error->semaphore->pages[0]; idx = intel_engine_sync_index(engine, to); ee->semaphore_mboxes[idx] = tmp[signal_offset]; @@ -1061,11 +1061,9 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, struct drm_i915_gem_request *request; int i, count; - if (dev_priv->semaphore_obj) { - error->semaphore_obj = - i915_error_ggtt_object_create(dev_priv, - dev_priv->semaphore_obj); - } + error->semaphore = + i915_error_ggtt_object_create(dev_priv, + dev_priv->semaphore->obj); for (i = 0; i < I915_NUM_ENGINES; i++) { struct intel_engine_cs *engine = &dev_priv->engine[i]; diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 829624571ca4..573f642a74f8 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -179,12 +179,16 @@ void intel_engine_init_seqno(struct intel_engine_cs *engine, u32 seqno) if (HAS_VEBOX(dev_priv)) I915_WRITE(RING_SYNC_2(engine->mmio_base), 0); } - if (dev_priv->semaphore_obj) { - struct drm_i915_gem_object *obj = dev_priv->semaphore_obj; - struct page *page = i915_gem_object_get_dirty_page(obj, 0); - void *semaphores = kmap(page); + if (dev_priv->semaphore) { + struct page *page = i915_vma_first_page(dev_priv->semaphore); + void *semaphores; + + /* Semaphores are in noncoherent memory, flush to be safe */ + semaphores = kmap(page); memset(semaphores + GEN8_SEMAPHORE_OFFSET(engine->id, 0), 0, I915_NUM_ENGINES * gen8_semaphore_seqno_size); + drm_clflush_virt_range(semaphores + GEN8_SEMAPHORE_OFFSET(engine->id, 0), + I915_NUM_ENGINES * gen8_semaphore_seqno_size); kunmap(page); } memset(engine->semaphore.sync_seqno, 0, diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 6008d54b9152..30b066140b0c 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1257,12 +1257,14 @@ static int init_render_ring(struct intel_engine_cs *engine) static void render_ring_cleanup(struct intel_engine_cs *engine) { struct drm_i915_private *dev_priv = engine->i915; + struct i915_vma *vma; - if (dev_priv->semaphore_obj) { - i915_gem_object_ggtt_unpin(dev_priv->semaphore_obj); - i915_gem_object_put(dev_priv->semaphore_obj); - dev_priv->semaphore_obj = NULL; - } + vma = fetch_and_zero(&dev_priv->semaphore); + if (!vma) + return; + + 
i915_vma_unpin(vma); + i915_vma_put(vma); } static int gen8_rcs_signal(struct drm_i915_gem_request *req) @@ -2523,30 +2525,30 @@ static void intel_ring_init_semaphores(struct drm_i915_private *dev_priv, if (!i915.semaphores) return; - if (INTEL_GEN(dev_priv) >= 8 && !dev_priv->semaphore_obj) { + if (INTEL_GEN(dev_priv) >= 8 && !dev_priv->semaphore) { + struct i915_vma *vma; + obj = i915_gem_object_create(&dev_priv->drm, 4096); - if (IS_ERR(obj)) { - DRM_ERROR("Failed to allocate semaphore bo. Disabling semaphores\n"); - i915.semaphores = 0; - } else { - i915_gem_object_set_cache_level(obj, I915_CACHE_LLC); - ret = i915_gem_object_ggtt_pin(obj, NULL, - 0, 0, PIN_HIGH); - if (ret != 0) { - i915_gem_object_put(obj); - DRM_ERROR("Failed to pin semaphore bo. Disabling semaphores\n"); - i915.semaphores = 0; - } else { - dev_priv->semaphore_obj = obj; - } - } + if (IS_ERR(obj)) + goto err; + + vma = i915_vma_create(obj, &dev_priv->ggtt.base, NULL); + if (IS_ERR(vma)) + goto err_obj; + + ret = i915_gem_object_set_to_gtt_domain(obj, false); + if (ret) + goto err_obj; + + ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); + if (ret) + goto err_obj; + + dev_priv->semaphore = vma; } - if (!i915.semaphores) - return; - if (INTEL_GEN(dev_priv) >= 8) { - u64 offset = i915_gem_obj_ggtt_offset(dev_priv->semaphore_obj); + u64 offset = dev_priv->semaphore->node.start; engine->semaphore.sync_to = gen8_ring_sync_to; engine->semaphore.signal = gen8_xcs_signal; @@ -2613,6 +2615,14 @@ static void intel_ring_init_semaphores(struct drm_i915_private *dev_priv, engine->semaphore.mbox.signal[i] = mbox_reg; } } + + return; + +err_obj: + i915_gem_object_put(obj); +err: + DRM_DEBUG_DRIVER("Failed to allocate space for semaphores, disabling\n"); + i915.semaphores = 0; } static void intel_ring_init_irq(struct drm_i915_private *dev_priv, diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index dca84258be16..cb40785e7677 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -57,10 +57,10 @@ struct intel_hw_status_page { #define GEN8_SEMAPHORE_OFFSET(__from, __to) \ (((__from) * I915_NUM_ENGINES + (__to)) * gen8_semaphore_seqno_size) #define GEN8_SIGNAL_OFFSET(__ring, to) \ - (i915_gem_obj_ggtt_offset(dev_priv->semaphore_obj) + \ + (dev_priv->semaphore->node.start + \ GEN8_SEMAPHORE_OFFSET((__ring)->id, (to))) #define GEN8_WAIT_OFFSET(__ring, from) \ - (i915_gem_obj_ggtt_offset(dev_priv->semaphore_obj) + \ + (dev_priv->semaphore->node.start + \ GEN8_SEMAPHORE_OFFSET(from, (__ring)->id)) enum intel_engine_hangcheck_action { From a5e85c8a4dca07f80647133f2e84cd4a6da98df6 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:49:03 +0100 Subject: [PATCH 073/141] drm/i915: Use VMA for render state page tracking Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-24-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_render_state.c | 40 +++++++++++--------- drivers/gpu/drm/i915/i915_gem_render_state.h | 2 +- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.c b/drivers/gpu/drm/i915/i915_gem_render_state.c index 57fd767a2d79..95b7e9afd5f8 100644 --- a/drivers/gpu/drm/i915/i915_gem_render_state.c +++ b/drivers/gpu/drm/i915/i915_gem_render_state.c @@ -30,8 +30,7 @@ struct render_state { const struct intel_renderstate_rodata *rodata; - struct drm_i915_gem_object *obj; - u64 
ggtt_offset; + struct i915_vma *vma; u32 aux_batch_size; u32 aux_batch_offset; }; @@ -73,7 +72,7 @@ render_state_get_rodata(const struct drm_i915_gem_request *req) static int render_state_setup(struct render_state *so) { - struct drm_device *dev = so->obj->base.dev; + struct drm_device *dev = so->vma->vm->dev; const struct intel_renderstate_rodata *rodata = so->rodata; const bool has_64bit_reloc = INTEL_GEN(dev) >= 8; unsigned int i = 0, reloc_index = 0; @@ -81,18 +80,18 @@ static int render_state_setup(struct render_state *so) u32 *d; int ret; - ret = i915_gem_object_set_to_cpu_domain(so->obj, true); + ret = i915_gem_object_set_to_cpu_domain(so->vma->obj, true); if (ret) return ret; - page = i915_gem_object_get_dirty_page(so->obj, 0); + page = i915_gem_object_get_dirty_page(so->vma->obj, 0); d = kmap(page); while (i < rodata->batch_items) { u32 s = rodata->batch[i]; if (i * 4 == rodata->reloc[reloc_index]) { - u64 r = s + so->ggtt_offset; + u64 r = s + so->vma->node.start; s = lower_32_bits(r); if (has_64bit_reloc) { if (i + 1 >= rodata->batch_items || @@ -154,7 +153,7 @@ static int render_state_setup(struct render_state *so) kunmap(page); - ret = i915_gem_object_set_to_gtt_domain(so->obj, false); + ret = i915_gem_object_set_to_gtt_domain(so->vma->obj, false); if (ret) return ret; @@ -175,6 +174,7 @@ err_out: int i915_gem_render_state_init(struct drm_i915_gem_request *req) { struct render_state so; + struct drm_i915_gem_object *obj; int ret; if (WARN_ON(req->engine->id != RCS)) @@ -187,21 +187,25 @@ int i915_gem_render_state_init(struct drm_i915_gem_request *req) if (so.rodata->batch_items * 4 > 4096) return -EINVAL; - so.obj = i915_gem_object_create(&req->i915->drm, 4096); - if (IS_ERR(so.obj)) - return PTR_ERR(so.obj); + obj = i915_gem_object_create(&req->i915->drm, 4096); + if (IS_ERR(obj)) + return PTR_ERR(obj); - ret = i915_gem_object_ggtt_pin(so.obj, NULL, 0, 0, 0); + so.vma = i915_vma_create(obj, &req->i915->ggtt.base, NULL); + if (IS_ERR(so.vma)) { + ret = PTR_ERR(so.vma); + goto err_obj; + } + + ret = i915_vma_pin(so.vma, 0, 0, PIN_GLOBAL); if (ret) goto err_obj; - so.ggtt_offset = i915_gem_obj_ggtt_offset(so.obj); - ret = render_state_setup(&so); if (ret) goto err_unpin; - ret = req->engine->emit_bb_start(req, so.ggtt_offset, + ret = req->engine->emit_bb_start(req, so.vma->node.start, so.rodata->batch_items * 4, I915_DISPATCH_SECURE); if (ret) @@ -209,7 +213,7 @@ int i915_gem_render_state_init(struct drm_i915_gem_request *req) if (so.aux_batch_size > 8) { ret = req->engine->emit_bb_start(req, - (so.ggtt_offset + + (so.vma->node.start + so.aux_batch_offset), so.aux_batch_size, I915_DISPATCH_SECURE); @@ -217,10 +221,10 @@ int i915_gem_render_state_init(struct drm_i915_gem_request *req) goto err_unpin; } - i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), req, 0); + i915_vma_move_to_active(so.vma, req, 0); err_unpin: - i915_gem_object_ggtt_unpin(so.obj); + i915_vma_unpin(so.vma); err_obj: - i915_gem_object_put(so.obj); + i915_gem_object_put(obj); return ret; } diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.h b/drivers/gpu/drm/i915/i915_gem_render_state.h index c44fca8599bb..18cce3f06e9c 100644 --- a/drivers/gpu/drm/i915/i915_gem_render_state.h +++ b/drivers/gpu/drm/i915/i915_gem_render_state.h @@ -24,7 +24,7 @@ #ifndef _I915_GEM_RENDER_STATE_H_ #define _I915_GEM_RENDER_STATE_H_ -#include +struct drm_i915_gem_request; int i915_gem_render_state_init(struct drm_i915_gem_request *req); From 48bb74e48bc2cd106d7ed7697377c08d149f2633 Mon Sep 17 00:00:00 2001 From: Chris 
Wilson Date: Mon, 15 Aug 2016 10:49:04 +0100 Subject: [PATCH 074/141] drm/i915: Use VMA for wa_ctx tracking Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-25-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- drivers/gpu/drm/i915/intel_lrc.c | 58 ++++++++++++++----------- drivers/gpu/drm/i915/intel_ringbuffer.h | 4 +- 3 files changed, 35 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 4068630bfc68..5e7734ca4579 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1134,7 +1134,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, engine->status_page.vma->obj); ee->wa_ctx = i915_error_ggtt_object_create(dev_priv, - engine->wa_ctx.obj); + engine->wa_ctx.vma->obj); count = 0; list_for_each_entry(request, &engine->request_list, link) diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 56c904e2dc98..64cb04e63512 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1165,45 +1165,51 @@ static int gen9_init_perctx_bb(struct intel_engine_cs *engine, static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *engine, u32 size) { - int ret; + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int err; - engine->wa_ctx.obj = i915_gem_object_create(&engine->i915->drm, - PAGE_ALIGN(size)); - if (IS_ERR(engine->wa_ctx.obj)) { - DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n"); - ret = PTR_ERR(engine->wa_ctx.obj); - engine->wa_ctx.obj = NULL; - return ret; + obj = i915_gem_object_create(&engine->i915->drm, PAGE_ALIGN(size)); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + vma = i915_vma_create(obj, &engine->i915->ggtt.base, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err; } - ret = i915_gem_object_ggtt_pin(engine->wa_ctx.obj, NULL, - 0, PAGE_SIZE, PIN_HIGH); - if (ret) { - DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n", - ret); - i915_gem_object_put(engine->wa_ctx.obj); - return ret; - } + err = i915_vma_pin(vma, 0, PAGE_SIZE, PIN_GLOBAL | PIN_HIGH); + if (err) + goto err; + engine->wa_ctx.vma = vma; return 0; + +err: + i915_gem_object_put(obj); + return err; } static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *engine) { - if (engine->wa_ctx.obj) { - i915_gem_object_ggtt_unpin(engine->wa_ctx.obj); - i915_gem_object_put(engine->wa_ctx.obj); - engine->wa_ctx.obj = NULL; - } + struct i915_vma *vma; + + vma = fetch_and_zero(&engine->wa_ctx.vma); + if (!vma) + return; + + i915_vma_unpin(vma); + i915_vma_put(vma); } static int intel_init_workaround_bb(struct intel_engine_cs *engine) { - int ret; + struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; uint32_t *batch; uint32_t offset; struct page *page; - struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; + int ret; WARN_ON(engine->id != RCS); @@ -1226,7 +1232,7 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine) return ret; } - page = i915_gem_object_get_dirty_page(wa_ctx->obj, 0); + page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); batch = kmap_atomic(page); offset = 0; @@ -2019,9 +2025,9 @@ populate_lr_context(struct i915_gem_context *ctx, RING_INDIRECT_CTX(engine->mmio_base), 0); ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX_OFFSET, RING_INDIRECT_CTX_OFFSET(engine->mmio_base), 0); - if (engine->wa_ctx.obj) { + if (engine->wa_ctx.vma) { struct 
i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; - uint32_t ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx->obj); + u32 ggtt_offset = wa_ctx->vma->node.start; reg_state[CTX_RCS_INDIRECT_CTX+1] = (ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) | diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index cb40785e7677..e3777572c70e 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -123,12 +123,12 @@ struct drm_i915_reg_table; * an option for future use. * size: size of the batch in DWORDS */ -struct i915_ctx_workarounds { +struct i915_ctx_workarounds { struct i915_wa_ctx_bb { u32 offset; u32 size; } indirect_ctx, per_ctx; - struct drm_i915_gem_object *obj; + struct i915_vma *vma; }; struct drm_i915_gem_request; From 19880c4a3f19a8ff116e992c2f79459b7c2d15c7 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:49:05 +0100 Subject: [PATCH 075/141] drm/i915: Consolidate i915_vma_unpin_and_release() In a few places, we repeat a call to clear a pointer to a vma whilst unpinning and releasing a reference to its owner. Refactor those into a common function. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-26-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_gtt.c | 12 ++++++++++++ drivers/gpu/drm/i915/i915_gem_gtt.h | 1 + drivers/gpu/drm/i915/i915_guc_submission.c | 21 ++++----------------- drivers/gpu/drm/i915/intel_engine_cs.c | 9 +-------- drivers/gpu/drm/i915/intel_lrc.c | 9 +-------- drivers/gpu/drm/i915/intel_ringbuffer.c | 8 +------- 6 files changed, 20 insertions(+), 40 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 738a474c5afa..d15eb1d71341 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -3674,3 +3674,15 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma) __i915_vma_pin(vma); return ptr; } + +void i915_vma_unpin_and_release(struct i915_vma **p_vma) +{ + struct i915_vma *vma; + + vma = fetch_and_zero(p_vma); + if (!vma) + return; + + i915_vma_unpin(vma); + i915_vma_put(vma); +} diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h index a2691943a404..ec538fcc9c20 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.h +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h @@ -232,6 +232,7 @@ struct i915_vma * i915_vma_create(struct drm_i915_gem_object *obj, struct i915_address_space *vm, const struct i915_ggtt_view *view); +void i915_vma_unpin_and_release(struct i915_vma **p_vma); static inline bool i915_vma_is_ggtt(const struct i915_vma *vma) { diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index c40b92e212fa..e7dbc64ec1da 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -653,19 +653,6 @@ err: return vma; } -/** - * guc_release_vma() - Release gem object allocated for GuC usage - * @vma: gem obj to be released - */ -static void guc_release_vma(struct i915_vma *vma) -{ - if (!vma) - return; - - i915_vma_unpin(vma); - i915_vma_put(vma); -} - static void guc_client_free(struct drm_i915_private *dev_priv, struct i915_guc_client *client) @@ -690,7 +677,7 @@ guc_client_free(struct drm_i915_private *dev_priv, kunmap(kmap_to_page(client->client_base)); } - guc_release_vma(client->vma); + i915_vma_unpin_and_release(&client->vma); if 
(client->ctx_index != GUC_INVALID_CTX_ID) { guc_fini_ctx_desc(guc, client); @@ -1048,12 +1035,12 @@ void i915_guc_submission_fini(struct drm_i915_private *dev_priv) { struct intel_guc *guc = &dev_priv->guc; - guc_release_vma(fetch_and_zero(&guc->ads_vma)); - guc_release_vma(fetch_and_zero(&guc->log_vma)); + i915_vma_unpin_and_release(&guc->ads_vma); + i915_vma_unpin_and_release(&guc->log_vma); if (guc->ctx_pool_vma) ida_destroy(&guc->ctx_ids); - guc_release_vma(fetch_and_zero(&guc->ctx_pool_vma)); + i915_vma_unpin_and_release(&guc->ctx_pool_vma); } /** diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 573f642a74f8..f02d66bbec4b 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -279,14 +279,7 @@ err_unref: static void intel_engine_cleanup_scratch(struct intel_engine_cs *engine) { - struct i915_vma *vma; - - vma = fetch_and_zero(&engine->scratch); - if (!vma) - return; - - i915_vma_unpin(vma); - i915_vma_put(vma); + i915_vma_unpin_and_release(&engine->scratch); } /** diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 64cb04e63512..2673fb4f817b 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1193,14 +1193,7 @@ err: static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *engine) { - struct i915_vma *vma; - - vma = fetch_and_zero(&engine->wa_ctx.vma); - if (!vma) - return; - - i915_vma_unpin(vma); - i915_vma_put(vma); + i915_vma_unpin_and_release(&engine->wa_ctx.vma); } static int intel_init_workaround_bb(struct intel_engine_cs *engine) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 30b066140b0c..65ef172e8761 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1257,14 +1257,8 @@ static int init_render_ring(struct intel_engine_cs *engine) static void render_ring_cleanup(struct intel_engine_cs *engine) { struct drm_i915_private *dev_priv = engine->i915; - struct i915_vma *vma; - vma = fetch_and_zero(&dev_priv->semaphore); - if (!vma) - return; - - i915_vma_unpin(vma); - i915_vma_put(vma); + i915_vma_unpin_and_release(&dev_priv->semaphore); } static int gen8_rcs_signal(struct drm_i915_gem_request *req) From 058d88c4330f963033a5d11b269c8f86677494d1 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:49:06 +0100 Subject: [PATCH 076/141] drm/i915: Track pinned VMA Treat the VMA as the primary struct responsible for tracking bindings into the GPU's VM. That is, we want to treat the VMA returned after we pin an object into the VM as the cookie we hold and eventually release when unpinning. Doing so eliminates the ambiguity in pinning the object and then searching for the relevant pin later. v2: Joonas' stylistic nitpicks, a fun rebase.
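To make the new calling convention concrete, here is a minimal sketch of the cookie pattern this patch converges on; the caller example_use_pinned_vma and its choice of pin flags are illustrative only, not part of the patch, while the i915_gem_object_ggtt_pin()/i915_vma_unpin() signatures are the ones introduced below:

	/* Sketch: pinning now hands back the VMA as an ERR_PTR-encoded
	 * cookie; the caller keeps it and releases that exact binding,
	 * instead of re-finding the VMA from the object at unpin time.
	 */
	static int example_use_pinned_vma(struct drm_i915_gem_object *obj)
	{
		struct i915_vma *vma;

		vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE);
		if (IS_ERR(vma))
			return PTR_ERR(vma);

		/* ... use vma->node.start as the GGTT address ... */

		i915_vma_unpin(vma); /* drop the pin we took; no lookup */
		return 0;
	}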
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-27-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 60 ++---- drivers/gpu/drm/i915/i915_gem.c | 233 ++++++--------------- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 65 +++--- drivers/gpu/drm/i915/i915_gem_fence.c | 14 +- drivers/gpu/drm/i915/i915_gem_gtt.c | 85 +++++--- drivers/gpu/drm/i915/i915_gem_gtt.h | 14 -- drivers/gpu/drm/i915/i915_gem_request.c | 2 +- drivers/gpu/drm/i915/i915_gem_request.h | 2 +- drivers/gpu/drm/i915/i915_gem_stolen.c | 2 +- drivers/gpu/drm/i915/i915_gem_tiling.c | 2 +- drivers/gpu/drm/i915/i915_gpu_error.c | 58 +++-- drivers/gpu/drm/i915/intel_display.c | 57 +++-- drivers/gpu/drm/i915/intel_drv.h | 5 +- drivers/gpu/drm/i915/intel_fbc.c | 2 +- drivers/gpu/drm/i915/intel_fbdev.c | 19 +- drivers/gpu/drm/i915/intel_guc_loader.c | 21 +- drivers/gpu/drm/i915/intel_overlay.c | 32 +-- 18 files changed, 272 insertions(+), 403 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index a922b78350d1..73f3466402c5 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -105,7 +105,7 @@ static char get_tiling_flag(struct drm_i915_gem_object *obj) static char get_global_flag(struct drm_i915_gem_object *obj) { - return i915_gem_obj_to_ggtt(obj) ? 'g' : ' '; + return i915_gem_object_to_ggtt(obj, NULL) ? 'g' : ' '; } static char get_pin_mapped_flag(struct drm_i915_gem_object *obj) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 50dc3613c61c..bbee45acedeb 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3075,7 +3075,7 @@ struct drm_i915_gem_object *i915_gem_object_create_from_data( void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file); void i915_gem_free_object(struct drm_gem_object *obj); -int __must_check +struct i915_vma * __must_check i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, const struct i915_ggtt_view *view, u64 size, @@ -3279,12 +3279,11 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write); int __must_check i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write); -int __must_check +struct i915_vma * __must_check i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, u32 alignment, const struct i915_ggtt_view *view); -void i915_gem_object_unpin_from_display_plane(struct drm_i915_gem_object *obj, - const struct i915_ggtt_view *view); +void i915_gem_object_unpin_from_display_plane(struct i915_vma *vma); int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align); int i915_gem_open(struct drm_device *dev, struct drm_file *file); @@ -3304,63 +3303,34 @@ struct drm_gem_object *i915_gem_prime_import(struct drm_device *dev, struct dma_buf *i915_gem_prime_export(struct drm_device *dev, struct drm_gem_object *gem_obj, int flags); -u64 i915_gem_obj_ggtt_offset_view(struct drm_i915_gem_object *o, - const struct i915_ggtt_view *view); -u64 i915_gem_obj_offset(struct drm_i915_gem_object *o, - struct i915_address_space *vm); -static inline u64 -i915_gem_obj_ggtt_offset(struct drm_i915_gem_object *o) -{ - return i915_gem_obj_ggtt_offset_view(o, &i915_ggtt_view_normal); -} - -bool i915_gem_obj_ggtt_bound_view(struct drm_i915_gem_object *o, - const struct i915_ggtt_view *view); -bool i915_gem_obj_bound(struct 
drm_i915_gem_object *o, - struct i915_address_space *vm); - struct i915_vma * i915_gem_obj_to_vma(struct drm_i915_gem_object *obj, - struct i915_address_space *vm); -struct i915_vma * -i915_gem_obj_to_ggtt_view(struct drm_i915_gem_object *obj, - const struct i915_ggtt_view *view); + struct i915_address_space *vm, + const struct i915_ggtt_view *view); struct i915_vma * i915_gem_obj_lookup_or_create_vma(struct drm_i915_gem_object *obj, - struct i915_address_space *vm); -struct i915_vma * -i915_gem_obj_lookup_or_create_ggtt_vma(struct drm_i915_gem_object *obj, - const struct i915_ggtt_view *view); + struct i915_address_space *vm, + const struct i915_ggtt_view *view); -static inline struct i915_vma * -i915_gem_obj_to_ggtt(struct drm_i915_gem_object *obj) -{ - return i915_gem_obj_to_ggtt_view(obj, &i915_ggtt_view_normal); -} -bool i915_gem_obj_is_pinned(struct drm_i915_gem_object *obj); - -/* Some GGTT VM helpers */ static inline struct i915_hw_ppgtt * i915_vm_to_ppgtt(struct i915_address_space *vm) { return container_of(vm, struct i915_hw_ppgtt, base); } -static inline bool i915_gem_obj_ggtt_bound(struct drm_i915_gem_object *obj) +static inline struct i915_vma * +i915_gem_object_to_ggtt(struct drm_i915_gem_object *obj, + const struct i915_ggtt_view *view) { - return i915_gem_obj_ggtt_bound_view(obj, &i915_ggtt_view_normal); + return i915_gem_obj_to_vma(obj, &to_i915(obj->base.dev)->ggtt.base, view); } -unsigned long -i915_gem_obj_ggtt_size(struct drm_i915_gem_object *obj); - -void i915_gem_object_ggtt_unpin_view(struct drm_i915_gem_object *obj, - const struct i915_ggtt_view *view); -static inline void -i915_gem_object_ggtt_unpin(struct drm_i915_gem_object *obj) +static inline unsigned long +i915_gem_object_ggtt_offset(struct drm_i915_gem_object *o, + const struct i915_ggtt_view *view) { - i915_gem_object_ggtt_unpin_view(obj, &i915_ggtt_view_normal); + return i915_gem_object_to_ggtt(o, view)->node.start; } /* i915_gem_fence.c */ diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 45c45d3a6e31..685253a1323b 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -746,14 +746,15 @@ i915_gem_gtt_pread(struct drm_device *dev, { struct drm_i915_private *dev_priv = to_i915(dev); struct i915_ggtt *ggtt = &dev_priv->ggtt; + struct i915_vma *vma; struct drm_mm_node node; char __user *user_data; uint64_t remain; uint64_t offset; int ret; - ret = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE); - if (ret) { + vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE); + if (IS_ERR(vma)) { ret = insert_mappable_node(dev_priv, &node, PAGE_SIZE); if (ret) goto out; @@ -766,7 +767,7 @@ i915_gem_gtt_pread(struct drm_device *dev, i915_gem_object_pin_pages(obj); } else { - node.start = i915_gem_obj_ggtt_offset(obj); + node.start = vma->node.start; node.allocated = false; ret = i915_gem_object_put_fence(obj); if (ret) @@ -847,7 +848,7 @@ out_unpin: i915_gem_object_unpin_pages(obj); remove_mappable_node(&node); } else { - i915_gem_object_ggtt_unpin(obj); + i915_vma_unpin(vma); } out: return ret; @@ -1045,6 +1046,7 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_private *i915, { struct i915_ggtt *ggtt = &i915->ggtt; struct drm_device *dev = obj->base.dev; + struct i915_vma *vma; struct drm_mm_node node; uint64_t remain, offset; char __user *user_data; @@ -1054,9 +1056,9 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_private *i915, if (i915_gem_object_is_tiled(obj)) return -EFAULT; - ret = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, + vma = 
i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE | PIN_NONBLOCK); - if (ret) { + if (IS_ERR(vma)) { ret = insert_mappable_node(i915, &node, PAGE_SIZE); if (ret) goto out; @@ -1069,7 +1071,7 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_private *i915, i915_gem_object_pin_pages(obj); } else { - node.start = i915_gem_obj_ggtt_offset(obj); + node.start = vma->node.start; node.allocated = false; ret = i915_gem_object_put_fence(obj); if (ret) @@ -1157,7 +1159,7 @@ out_unpin: i915_gem_object_unpin_pages(obj); remove_mappable_node(&node); } else { - i915_gem_object_ggtt_unpin(obj); + i915_vma_unpin(vma); } out: return ret; @@ -1625,7 +1627,7 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data, /** * i915_gem_fault - fault a page into the GTT - * @vma: VMA in question + * @area: CPU VMA in question * @vmf: fault info * * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped @@ -1639,20 +1641,21 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data, * suffer if the GTT working set is large or there are few fence registers * left. */ -int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) { - struct drm_i915_gem_object *obj = to_intel_bo(vma->vm_private_data); + struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data); struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = to_i915(dev); struct i915_ggtt *ggtt = &dev_priv->ggtt; struct i915_ggtt_view view = i915_ggtt_view_normal; bool write = !!(vmf->flags & FAULT_FLAG_WRITE); + struct i915_vma *vma; pgoff_t page_offset; unsigned long pfn; int ret; /* We don't use vmf->pgoff since that has the fake offset */ - page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> + page_offset = ((unsigned long)vmf->virtual_address - area->vm_start) >> PAGE_SHIFT; trace_i915_gem_object_fault(obj, page_offset, true, write); @@ -1689,14 +1692,16 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) view.params.partial.size = min_t(unsigned int, chunk_size, - (vma->vm_end - vma->vm_start)/PAGE_SIZE - + (area->vm_end - area->vm_start) / PAGE_SIZE - view.params.partial.offset); } /* Now pin it into the GTT if needed */ - ret = i915_gem_object_ggtt_pin(obj, &view, 0, 0, PIN_MAPPABLE); - if (ret) + vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, PIN_MAPPABLE); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto err_unlock; + } ret = i915_gem_object_set_to_gtt_domain(obj, write); if (ret) @@ -1707,8 +1712,7 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) goto err_unpin; /* Finally, remap it using the new GTT offset */ - pfn = ggtt->mappable_base + - i915_gem_obj_ggtt_offset_view(obj, &view); + pfn = ggtt->mappable_base + vma->node.start; pfn >>= PAGE_SHIFT; if (unlikely(view.type == I915_GGTT_VIEW_PARTIAL)) { @@ -1717,12 +1721,14 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * is due to userspace losing part of the mapping or never * having accessed it before (at this partials' range). 
*/ - unsigned long base = vma->vm_start + + unsigned long base = area->vm_start + (view.params.partial.offset << PAGE_SHIFT); unsigned int i; for (i = 0; i < view.params.partial.size; i++) { - ret = vm_insert_pfn(vma, base + i * PAGE_SIZE, pfn + i); + ret = vm_insert_pfn(area, + base + i * PAGE_SIZE, + pfn + i); if (ret) break; } @@ -1730,14 +1736,16 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) obj->fault_mappable = true; } else { if (!obj->fault_mappable) { - unsigned long size = min_t(unsigned long, - vma->vm_end - vma->vm_start, - obj->base.size); + unsigned long size = + min_t(unsigned long, + area->vm_end - area->vm_start, + obj->base.size) >> PAGE_SHIFT; + unsigned long base = area->vm_start; int i; - for (i = 0; i < size >> PAGE_SHIFT; i++) { - ret = vm_insert_pfn(vma, - (unsigned long)vma->vm_start + i * PAGE_SIZE, + for (i = 0; i < size; i++) { + ret = vm_insert_pfn(area, + base + i * PAGE_SIZE, pfn + i); if (ret) break; @@ -1745,12 +1753,12 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) obj->fault_mappable = true; } else - ret = vm_insert_pfn(vma, + ret = vm_insert_pfn(area, (unsigned long)vmf->virtual_address, pfn + page_offset); } err_unpin: - i915_gem_object_ggtt_unpin_view(obj, &view); + __i915_vma_unpin(vma); err_unlock: mutex_unlock(&dev->struct_mutex); err_rpm: @@ -3235,7 +3243,7 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write) old_write_domain); /* And bump the LRU for this access */ - vma = i915_gem_obj_to_ggtt(obj); + vma = i915_gem_object_to_ggtt(obj, NULL); if (vma && drm_mm_node_allocated(&vma->node) && !i915_vma_is_active(vma)) @@ -3459,11 +3467,12 @@ rpm_put: * Can be called from an uninterruptible phase (modesetting) and allows * any flushes to be pipelined (for pageflips). */ -int +struct i915_vma * i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, u32 alignment, const struct i915_ggtt_view *view) { + struct i915_vma *vma; u32 old_read_domains, old_write_domain; int ret; @@ -3483,19 +3492,23 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, */ ret = i915_gem_object_set_cache_level(obj, HAS_WT(obj->base.dev) ? I915_CACHE_WT : I915_CACHE_NONE); - if (ret) + if (ret) { + vma = ERR_PTR(ret); goto err_unpin_display; + } /* As the user may map the buffer once pinned in the display plane * (e.g. libkms for the bootup splash), we have to ensure that we * always use map_and_fenceable for all scanout buffers. */ - ret = i915_gem_object_ggtt_pin(obj, view, 0, alignment, + vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, view->type == I915_GGTT_VIEW_NORMAL ? 
PIN_MAPPABLE : 0); - if (ret) + if (IS_ERR(vma)) goto err_unpin_display; + WARN_ON(obj->pin_display > i915_vma_pin_count(vma)); + i915_gem_object_flush_cpu_write_domain(obj); old_write_domain = obj->base.write_domain; @@ -3511,23 +3524,23 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, old_read_domains, old_write_domain); - return 0; + return vma; err_unpin_display: obj->pin_display--; - return ret; + return vma; } void -i915_gem_object_unpin_from_display_plane(struct drm_i915_gem_object *obj, - const struct i915_ggtt_view *view) +i915_gem_object_unpin_from_display_plane(struct i915_vma *vma) { - if (WARN_ON(obj->pin_display == 0)) + if (WARN_ON(vma->obj->pin_display == 0)) return; - i915_gem_object_ggtt_unpin_view(obj, view); + vma->obj->pin_display--; - obj->pin_display--; + i915_vma_unpin(vma); + WARN_ON(vma->obj->pin_display > i915_vma_pin_count(vma)); } /** @@ -3724,27 +3737,25 @@ err: return ret; } -int +struct i915_vma * i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, const struct i915_ggtt_view *view, u64 size, u64 alignment, u64 flags) { + struct i915_address_space *vm = &to_i915(obj->base.dev)->ggtt.base; struct i915_vma *vma; int ret; - if (!view) - view = &i915_ggtt_view_normal; - - vma = i915_gem_obj_lookup_or_create_ggtt_vma(obj, view); + vma = i915_gem_obj_lookup_or_create_vma(obj, vm, view); if (IS_ERR(vma)) - return PTR_ERR(vma); + return vma; if (i915_vma_misplaced(vma, size, alignment, flags)) { if (flags & PIN_NONBLOCK && (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))) - return -ENOSPC; + return ERR_PTR(-ENOSPC); WARN(i915_vma_is_pinned(vma), "bo is already pinned in ggtt with incorrect alignment:" @@ -3757,17 +3768,14 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, obj->map_and_fenceable); ret = i915_vma_unbind(vma); if (ret) - return ret; + return ERR_PTR(ret); } - return i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL); -} + ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL); + if (ret) + return ERR_PTR(ret); -void -i915_gem_object_ggtt_unpin_view(struct drm_i915_gem_object *obj, - const struct i915_ggtt_view *view) -{ - i915_vma_unpin(i915_gem_obj_to_ggtt_view(obj, view)); + return vma; } static __always_inline unsigned int __busy_read_flag(unsigned int id) @@ -4153,32 +4161,6 @@ void i915_gem_free_object(struct drm_gem_object *gem_obj) intel_runtime_pm_put(dev_priv); } -struct i915_vma *i915_gem_obj_to_vma(struct drm_i915_gem_object *obj, - struct i915_address_space *vm) -{ - struct i915_vma *vma; - list_for_each_entry(vma, &obj->vma_list, obj_link) { - if (vma->ggtt_view.type == I915_GGTT_VIEW_NORMAL && - vma->vm == vm) - return vma; - } - return NULL; -} - -struct i915_vma *i915_gem_obj_to_ggtt_view(struct drm_i915_gem_object *obj, - const struct i915_ggtt_view *view) -{ - struct i915_vma *vma; - - GEM_BUG_ON(!view); - - list_for_each_entry(vma, &obj->vma_list, obj_link) - if (i915_vma_is_ggtt(vma) && - i915_ggtt_view_equal(&vma->ggtt_view, view)) - return vma; - return NULL; -} - int i915_gem_suspend(struct drm_device *dev) { struct drm_i915_private *dev_priv = to_i915(dev); @@ -4646,97 +4628,6 @@ void i915_gem_track_fb(struct drm_i915_gem_object *old, } } -/* All the new VM stuff */ -u64 i915_gem_obj_offset(struct drm_i915_gem_object *o, - struct i915_address_space *vm) -{ - struct drm_i915_private *dev_priv = to_i915(o->base.dev); - struct i915_vma *vma; - - WARN_ON(vm == &dev_priv->mm.aliasing_ppgtt->base); - - list_for_each_entry(vma, &o->vma_list, obj_link) { - if (i915_vma_is_ggtt(vma) 
&& - vma->ggtt_view.type != I915_GGTT_VIEW_NORMAL) - continue; - if (vma->vm == vm) - return vma->node.start; - } - - WARN(1, "%s vma for this object not found.\n", - i915_is_ggtt(vm) ? "global" : "ppgtt"); - return -1; -} - -u64 i915_gem_obj_ggtt_offset_view(struct drm_i915_gem_object *o, - const struct i915_ggtt_view *view) -{ - struct i915_vma *vma; - - list_for_each_entry(vma, &o->vma_list, obj_link) - if (i915_vma_is_ggtt(vma) && - i915_ggtt_view_equal(&vma->ggtt_view, view)) - return vma->node.start; - - WARN(1, "global vma for this object not found. (view=%u)\n", view->type); - return -1; -} - -bool i915_gem_obj_bound(struct drm_i915_gem_object *o, - struct i915_address_space *vm) -{ - struct i915_vma *vma; - - list_for_each_entry(vma, &o->vma_list, obj_link) { - if (i915_vma_is_ggtt(vma) && - vma->ggtt_view.type != I915_GGTT_VIEW_NORMAL) - continue; - if (vma->vm == vm && drm_mm_node_allocated(&vma->node)) - return true; - } - - return false; -} - -bool i915_gem_obj_ggtt_bound_view(struct drm_i915_gem_object *o, - const struct i915_ggtt_view *view) -{ - struct i915_vma *vma; - - list_for_each_entry(vma, &o->vma_list, obj_link) - if (i915_vma_is_ggtt(vma) && - i915_ggtt_view_equal(&vma->ggtt_view, view) && - drm_mm_node_allocated(&vma->node)) - return true; - - return false; -} - -unsigned long i915_gem_obj_ggtt_size(struct drm_i915_gem_object *o) -{ - struct i915_vma *vma; - - GEM_BUG_ON(list_empty(&o->vma_list)); - - list_for_each_entry(vma, &o->vma_list, obj_link) { - if (i915_vma_is_ggtt(vma) && - vma->ggtt_view.type == I915_GGTT_VIEW_NORMAL) - return vma->node.size; - } - - return 0; -} - -bool i915_gem_obj_is_pinned(struct drm_i915_gem_object *obj) -{ - struct i915_vma *vma; - list_for_each_entry(vma, &obj->vma_list, obj_link) - if (i915_vma_is_pinned(vma)) - return true; - - return false; -} - /* Like i915_gem_object_get_page(), but mark the returned page dirty */ struct page * i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj, int n) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index ced05878b405..699315304748 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -180,8 +180,8 @@ eb_lookup_vmas(struct eb_vmas *eb, * from the (obj, vm) we don't run the risk of creating * duplicated vmas for the same vm. */ - vma = i915_gem_obj_lookup_or_create_vma(obj, vm); - if (IS_ERR(vma)) { + vma = i915_gem_obj_lookup_or_create_vma(obj, vm, NULL); + if (unlikely(IS_ERR(vma))) { DRM_DEBUG("Failed to lookup VMA\n"); ret = PTR_ERR(vma); goto err; @@ -349,30 +349,33 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj, struct drm_i915_gem_relocation_entry *reloc, uint64_t target_offset) { - struct drm_device *dev = obj->base.dev; - struct drm_i915_private *dev_priv = to_i915(dev); + struct drm_i915_private *dev_priv = to_i915(obj->base.dev); struct i915_ggtt *ggtt = &dev_priv->ggtt; + struct i915_vma *vma; uint64_t delta = relocation_target(reloc, target_offset); uint64_t offset; void __iomem *reloc_page; int ret; + vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE); + if (IS_ERR(vma)) + return PTR_ERR(vma); + ret = i915_gem_object_set_to_gtt_domain(obj, true); if (ret) - return ret; + goto unpin; ret = i915_gem_object_put_fence(obj); if (ret) - return ret; + goto unpin; /* Map the page containing the relocation we're going to perform. 
*/ - offset = i915_gem_obj_ggtt_offset(obj); - offset += reloc->offset; + offset = vma->node.start + reloc->offset; reloc_page = io_mapping_map_atomic_wc(ggtt->mappable, offset & PAGE_MASK); iowrite32(lower_32_bits(delta), reloc_page + offset_in_page(offset)); - if (INTEL_INFO(dev)->gen >= 8) { + if (INTEL_GEN(dev_priv) >= 8) { offset += sizeof(uint32_t); if (offset_in_page(offset) == 0) { @@ -388,7 +391,9 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj, io_mapping_unmap_atomic(reloc_page); - return 0; +unpin: + i915_vma_unpin(vma); + return ret; } static void @@ -1281,7 +1286,7 @@ i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req) return 0; } -static struct i915_vma* +static struct i915_vma * i915_gem_execbuffer_parse(struct intel_engine_cs *engine, struct drm_i915_gem_exec_object2 *shadow_exec_entry, struct drm_i915_gem_object *batch_obj, @@ -1305,31 +1310,28 @@ i915_gem_execbuffer_parse(struct intel_engine_cs *engine, batch_start_offset, batch_len, is_master); - if (ret) - goto err; + if (ret) { + if (ret == -EACCES) /* unhandled chained batch */ + vma = NULL; + else + vma = ERR_PTR(ret); + goto out; + } - ret = i915_gem_object_ggtt_pin(shadow_batch_obj, NULL, 0, 0, 0); - if (ret) - goto err; - - i915_gem_object_unpin_pages(shadow_batch_obj); + vma = i915_gem_object_ggtt_pin(shadow_batch_obj, NULL, 0, 0, 0); + if (IS_ERR(vma)) + goto out; memset(shadow_exec_entry, 0, sizeof(*shadow_exec_entry)); - vma = i915_gem_obj_to_ggtt(shadow_batch_obj); vma->exec_entry = shadow_exec_entry; vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN; i915_gem_object_get(shadow_batch_obj); list_add_tail(&vma->exec_list, &eb->vmas); - return vma; - -err: +out: i915_gem_object_unpin_pages(shadow_batch_obj); - if (ret == -EACCES) /* unhandled chained batch */ - return NULL; - else - return ERR_PTR(ret); + return vma; } static int @@ -1677,6 +1679,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, * hsw should have this fixed, but bdw mucks it up again. */ if (dispatch_flags & I915_DISPATCH_SECURE) { struct drm_i915_gem_object *obj = params->batch->obj; + struct i915_vma *vma; /* * So on first glance it looks freaky that we pin the batch here @@ -1688,11 +1691,13 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, * fitting due to fragmentation. * So this is actually safe. */ - ret = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0); - if (ret) + vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto err; + } - params->batch = i915_gem_obj_to_ggtt(obj); + params->batch = vma; } /* Allocate a request for this batch buffer nice and early. */ @@ -1708,7 +1713,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, * inactive_list and lose its active reference. Hence we do not need * to explicitly hold another reference here. 
*/ - params->request->batch_obj = params->batch->obj; + params->request->batch = params->batch; ret = i915_gem_request_add_to_client(params->request, file); if (ret) diff --git a/drivers/gpu/drm/i915/i915_gem_fence.c b/drivers/gpu/drm/i915/i915_gem_fence.c index d99fc5734cf1..334c3c4e8357 100644 --- a/drivers/gpu/drm/i915/i915_gem_fence.c +++ b/drivers/gpu/drm/i915/i915_gem_fence.c @@ -85,7 +85,7 @@ static void i965_write_fence_reg(struct drm_device *dev, int reg, POSTING_READ(fence_reg_lo); if (obj) { - struct i915_vma *vma = i915_gem_obj_to_ggtt(obj); + struct i915_vma *vma = i915_gem_object_to_ggtt(obj, NULL); unsigned int tiling = i915_gem_object_get_tiling(obj); unsigned int stride = i915_gem_object_get_stride(obj); u32 size = vma->node.size; @@ -120,7 +120,7 @@ static void i915_write_fence_reg(struct drm_device *dev, int reg, u32 val; if (obj) { - struct i915_vma *vma = i915_gem_obj_to_ggtt(obj); + struct i915_vma *vma = i915_gem_object_to_ggtt(obj, NULL); unsigned int tiling = i915_gem_object_get_tiling(obj); unsigned int stride = i915_gem_object_get_stride(obj); int pitch_val; @@ -161,7 +161,7 @@ static void i830_write_fence_reg(struct drm_device *dev, int reg, u32 val; if (obj) { - struct i915_vma *vma = i915_gem_obj_to_ggtt(obj); + struct i915_vma *vma = i915_gem_object_to_ggtt(obj, NULL); unsigned int tiling = i915_gem_object_get_tiling(obj); unsigned int stride = i915_gem_object_get_stride(obj); u32 pitch_val; @@ -432,13 +432,7 @@ bool i915_gem_object_pin_fence(struct drm_i915_gem_object *obj) { if (obj->fence_reg != I915_FENCE_REG_NONE) { - struct drm_i915_private *dev_priv = to_i915(obj->base.dev); - struct i915_vma *ggtt_vma = i915_gem_obj_to_ggtt(obj); - - WARN_ON(!ggtt_vma || - dev_priv->fence_regs[obj->fence_reg].pin_count > - i915_vma_pin_count(ggtt_vma)); - dev_priv->fence_regs[obj->fence_reg].pin_count++; + to_i915(obj->base.dev)->fence_regs[obj->fence_reg].pin_count++; return true; } else return false; diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index d15eb1d71341..3631944ac2d9 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -3341,23 +3341,19 @@ void i915_vma_close(struct i915_vma *vma) } static struct i915_vma * -__i915_gem_vma_create(struct drm_i915_gem_object *obj, - struct i915_address_space *vm, - const struct i915_ggtt_view *view) +__i915_vma_create(struct drm_i915_gem_object *obj, + struct i915_address_space *vm, + const struct i915_ggtt_view *view) { struct i915_vma *vma; int i; GEM_BUG_ON(vm->closed); - if (WARN_ON(i915_is_ggtt(vm) != !!view)) - return ERR_PTR(-EINVAL); - vma = kmem_cache_zalloc(to_i915(obj->base.dev)->vmas, GFP_KERNEL); if (vma == NULL) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&vma->obj_link); INIT_LIST_HEAD(&vma->exec_list); for (i = 0; i < ARRAY_SIZE(vma->last_read); i++) init_request_active(&vma->last_read[i], i915_vma_retire); @@ -3366,8 +3362,7 @@ __i915_gem_vma_create(struct drm_i915_gem_object *obj, vma->obj = obj; vma->size = obj->base.size; - if (i915_is_ggtt(vm)) { - vma->flags |= I915_VMA_GGTT; + if (view) { vma->ggtt_view = *view; if (view->type == I915_GGTT_VIEW_PARTIAL) { vma->size = view->params.partial.size; @@ -3377,57 +3372,79 @@ __i915_gem_vma_create(struct drm_i915_gem_object *obj, intel_rotation_info_size(&view->params.rotated); vma->size <<= PAGE_SHIFT; } + } + + if (i915_is_ggtt(vm)) { + vma->flags |= I915_VMA_GGTT; } else { i915_ppgtt_get(i915_vm_to_ppgtt(vm)); } list_add_tail(&vma->obj_link, &obj->vma_list); - return vma; } 
+static inline bool vma_matches(struct i915_vma *vma, + struct i915_address_space *vm, + const struct i915_ggtt_view *view) +{ + if (vma->vm != vm) + return false; + + if (!i915_vma_is_ggtt(vma)) + return true; + + if (!view) + return vma->ggtt_view.type == 0; + + if (vma->ggtt_view.type != view->type) + return false; + + return memcmp(&vma->ggtt_view.params, + &view->params, + sizeof(view->params)) == 0; +} + struct i915_vma * i915_vma_create(struct drm_i915_gem_object *obj, struct i915_address_space *vm, const struct i915_ggtt_view *view) { GEM_BUG_ON(view && !i915_is_ggtt(vm)); - GEM_BUG_ON(view ? i915_gem_obj_to_ggtt_view(obj, view) : i915_gem_obj_to_vma(obj, vm)); + GEM_BUG_ON(i915_gem_obj_to_vma(obj, vm, view)); - return __i915_gem_vma_create(obj, vm, view ?: &i915_ggtt_view_normal); + return __i915_vma_create(obj, vm, view); +} + +struct i915_vma * +i915_gem_obj_to_vma(struct drm_i915_gem_object *obj, + struct i915_address_space *vm, + const struct i915_ggtt_view *view) +{ + struct i915_vma *vma; + + list_for_each_entry_reverse(vma, &obj->vma_list, obj_link) + if (vma_matches(vma, vm, view)) + return vma; + + return NULL; } struct i915_vma * i915_gem_obj_lookup_or_create_vma(struct drm_i915_gem_object *obj, - struct i915_address_space *vm) + struct i915_address_space *vm, + const struct i915_ggtt_view *view) { struct i915_vma *vma; - vma = i915_gem_obj_to_vma(obj, vm); + GEM_BUG_ON(view && !i915_is_ggtt(vm)); + + vma = i915_gem_obj_to_vma(obj, vm, view); if (!vma) - vma = __i915_gem_vma_create(obj, vm, - i915_is_ggtt(vm) ? &i915_ggtt_view_normal : NULL); - - return vma; -} - -struct i915_vma * -i915_gem_obj_lookup_or_create_ggtt_vma(struct drm_i915_gem_object *obj, - const struct i915_ggtt_view *view) -{ - struct drm_device *dev = obj->base.dev; - struct drm_i915_private *dev_priv = to_i915(dev); - struct i915_ggtt *ggtt = &dev_priv->ggtt; - struct i915_vma *vma = i915_gem_obj_to_ggtt_view(obj, view); - - GEM_BUG_ON(!view); - - if (!vma) - vma = __i915_gem_vma_create(obj, &ggtt->base, view); + vma = __i915_vma_create(obj, vm, view); GEM_BUG_ON(i915_vma_is_closed(vma)); return vma; - } static struct scatterlist * diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h index ec538fcc9c20..71513b13ca94 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.h +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h @@ -610,20 +610,6 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev); int __must_check i915_gem_gtt_prepare_object(struct drm_i915_gem_object *obj); void i915_gem_gtt_finish_object(struct drm_i915_gem_object *obj); -static inline bool -i915_ggtt_view_equal(const struct i915_ggtt_view *a, - const struct i915_ggtt_view *b) -{ - if (WARN_ON(!a || !b)) - return false; - - if (a->type != b->type) - return false; - if (a->type != I915_GGTT_VIEW_NORMAL) - return !memcmp(&a->params, &b->params, sizeof(a->params)); - return true; -} - /* Flags used by pin/bind&friends. 
*/ #define PIN_NONBLOCK BIT(0) #define PIN_MAPPABLE BIT(1) diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 4c5b7e104f2f..a0756b553a19 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -406,7 +406,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine, /* No zalloc, must clear what we need by hand */ req->previous_context = NULL; req->file_priv = NULL; - req->batch_obj = NULL; + req->batch = NULL; req->pid = NULL; req->elsp_submitted = 0; diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index 85393be09c27..a66243a6f791 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -118,7 +118,7 @@ struct drm_i915_gem_request { /** Batch buffer related to this request if any (used for * error state dump only). */ - struct drm_i915_gem_object *batch_obj; + struct i915_vma *batch; struct list_head active_list; /** Time at which this request was emitted, in jiffies. */ diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c index f7633585ea38..9a9b3cfbac7f 100644 --- a/drivers/gpu/drm/i915/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/i915_gem_stolen.c @@ -696,7 +696,7 @@ i915_gem_object_create_stolen_for_preallocated(struct drm_device *dev, if (gtt_offset == I915_GTT_OFFSET_NONE) return obj; - vma = i915_gem_obj_lookup_or_create_vma(obj, &ggtt->base); + vma = i915_gem_obj_lookup_or_create_vma(obj, &ggtt->base, NULL); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto err; diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c index b2b0cb7199ac..bfefb63a55ef 100644 --- a/drivers/gpu/drm/i915/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/i915_gem_tiling.c @@ -130,7 +130,7 @@ i915_gem_object_fence_prepare(struct drm_i915_gem_object *obj, int tiling_mode) if (INTEL_GEN(dev_priv) >= 4) return 0; - vma = i915_gem_obj_to_ggtt(obj); + vma = i915_gem_object_to_ggtt(obj, NULL); if (!vma) return 0; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 5e7734ca4579..8fc21a9c0abd 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -653,46 +653,42 @@ static void i915_error_state_free(struct kref *error_ref) static struct drm_i915_error_object * i915_error_object_create(struct drm_i915_private *dev_priv, - struct drm_i915_gem_object *src, - struct i915_address_space *vm) + struct i915_vma *vma) { struct i915_ggtt *ggtt = &dev_priv->ggtt; + struct drm_i915_gem_object *src; struct drm_i915_error_object *dst; - struct i915_vma *vma = NULL; int num_pages; bool use_ggtt; int i = 0; u64 reloc_offset; - if (src == NULL || src->pages == NULL) + if (!vma) + return NULL; + + src = vma->obj; + if (!src->pages) return NULL; num_pages = src->base.size >> PAGE_SHIFT; dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), GFP_ATOMIC); - if (dst == NULL) + if (!dst) return NULL; - if (i915_gem_obj_bound(src, vm)) - dst->gtt_offset = i915_gem_obj_offset(src, vm); - else - dst->gtt_offset = -1; - - reloc_offset = dst->gtt_offset; - if (i915_is_ggtt(vm)) - vma = i915_gem_obj_to_ggtt(src); + reloc_offset = dst->gtt_offset = vma->node.start; use_ggtt = (src->cache_level == I915_CACHE_NONE && - vma && (vma->flags & I915_VMA_GLOBAL_BIND) && + (vma->flags & I915_VMA_GLOBAL_BIND) && reloc_offset + num_pages * PAGE_SIZE <= ggtt->mappable_end); /* Cannot access stolen address directly, try 
to use the aperture */ if (src->stolen) { use_ggtt = true; - if (!(vma && vma->flags & I915_VMA_GLOBAL_BIND)) + if (!(vma->flags & I915_VMA_GLOBAL_BIND)) goto unwind; - reloc_offset = i915_gem_obj_ggtt_offset(src); + reloc_offset = vma->node.start; if (reloc_offset + num_pages * PAGE_SIZE > ggtt->mappable_end) goto unwind; } @@ -752,8 +748,6 @@ unwind: kfree(dst); return NULL; } -#define i915_error_ggtt_object_create(dev_priv, src) \ - i915_error_object_create((dev_priv), (src), &(dev_priv)->ggtt.base) /* The error capture is special as tries to run underneath the normal * locking rules - so we use the raw version of the i915_gem_active lookup. @@ -1062,8 +1056,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, int i, count; error->semaphore = - i915_error_ggtt_object_create(dev_priv, - dev_priv->semaphore->obj); + i915_error_object_create(dev_priv, dev_priv->semaphore); for (i = 0; i < I915_NUM_ENGINES; i++) { struct intel_engine_cs *engine = &dev_priv->engine[i]; @@ -1093,18 +1086,16 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, */ ee->batchbuffer = i915_error_object_create(dev_priv, - request->batch_obj, - ee->vm); + request->batch); if (HAS_BROKEN_CS_TLB(dev_priv)) ee->wa_batchbuffer = - i915_error_ggtt_object_create(dev_priv, - engine->scratch->obj); + i915_error_object_create(dev_priv, + engine->scratch); - if (request->ctx->engine[i].state) { - ee->ctx = i915_error_ggtt_object_create(dev_priv, - request->ctx->engine[i].state->obj); - } + ee->ctx = + i915_error_object_create(dev_priv, + request->ctx->engine[i].state); if (request->pid) { struct task_struct *task; @@ -1125,16 +1116,15 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, ee->cpu_ring_head = ring->head; ee->cpu_ring_tail = ring->tail; ee->ringbuffer = - i915_error_ggtt_object_create(dev_priv, - ring->vma->obj); + i915_error_object_create(dev_priv, ring->vma); } ee->hws_page = - i915_error_ggtt_object_create(dev_priv, - engine->status_page.vma->obj); + i915_error_object_create(dev_priv, + engine->status_page.vma); - ee->wa_ctx = i915_error_ggtt_object_create(dev_priv, - engine->wa_ctx.vma->obj); + ee->wa_ctx = + i915_error_object_create(dev_priv, engine->wa_ctx.vma); count = 0; list_for_each_entry(request, &engine->request_list, link) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 2e7d03c5bf5c..e582b4267036 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2179,14 +2179,14 @@ static unsigned int intel_surf_alignment(const struct drm_i915_private *dev_priv } } -int -intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, - unsigned int rotation) +struct i915_vma * +intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, unsigned int rotation) { struct drm_device *dev = fb->dev; struct drm_i915_private *dev_priv = to_i915(dev); struct drm_i915_gem_object *obj = intel_fb_obj(fb); struct i915_ggtt_view view; + struct i915_vma *vma; u32 alignment; int ret; @@ -2213,10 +2213,11 @@ intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, */ intel_runtime_pm_get(dev_priv); - ret = i915_gem_object_pin_to_display_plane(obj, alignment, - &view); - if (ret) + vma = i915_gem_object_pin_to_display_plane(obj, alignment, &view); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto err_pm; + } /* Install a fence for tiled scan-out. 
Pre-i965 always needs a * fence, whereas 965+ only requires a fence if using @@ -2243,19 +2244,20 @@ intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, } intel_runtime_pm_put(dev_priv); - return 0; + return vma; err_unpin: - i915_gem_object_unpin_from_display_plane(obj, &view); + i915_gem_object_unpin_from_display_plane(vma); err_pm: intel_runtime_pm_put(dev_priv); - return ret; + return ERR_PTR(ret); } void intel_unpin_fb_obj(struct drm_framebuffer *fb, unsigned int rotation) { struct drm_i915_gem_object *obj = intel_fb_obj(fb); struct i915_ggtt_view view; + struct i915_vma *vma; WARN_ON(!mutex_is_locked(&obj->base.dev->struct_mutex)); @@ -2264,7 +2266,8 @@ void intel_unpin_fb_obj(struct drm_framebuffer *fb, unsigned int rotation) if (view.type == I915_GGTT_VIEW_NORMAL) i915_gem_object_unpin_fence(obj); - i915_gem_object_unpin_from_display_plane(obj, &view); + vma = i915_gem_object_to_ggtt(obj, &view); + i915_gem_object_unpin_from_display_plane(vma); } static int intel_fb_pitch(const struct drm_framebuffer *fb, int plane, @@ -2796,7 +2799,7 @@ intel_find_initial_plane_obj(struct intel_crtc *intel_crtc, continue; obj = intel_fb_obj(fb); - if (i915_gem_obj_ggtt_offset(obj) == plane_config->base) { + if (i915_gem_object_ggtt_offset(obj, NULL) == plane_config->base) { drm_framebuffer_reference(fb); goto valid_fb; } @@ -3115,7 +3118,7 @@ static void i9xx_update_primary_plane(struct drm_plane *primary, I915_WRITE(DSPTILEOFF(plane), (y << 16) | x); I915_WRITE(DSPLINOFF(plane), linear_offset); } else - I915_WRITE(DSPADDR(plane), i915_gem_obj_ggtt_offset(obj) + linear_offset); + I915_WRITE(DSPADDR(plane), i915_gem_object_ggtt_offset(obj, NULL) + linear_offset); POSTING_READ(reg); } @@ -3237,11 +3240,17 @@ u32 intel_fb_gtt_offset(struct drm_framebuffer *fb, { struct drm_i915_gem_object *obj = intel_fb_obj(fb); struct i915_ggtt_view view; + struct i915_vma *vma; u64 offset; intel_fill_fb_ggtt_view(&view, fb, rotation); - offset = i915_gem_obj_ggtt_offset_view(obj, &view); + vma = i915_gem_object_to_ggtt(obj, &view); + if (WARN(!vma, "ggtt vma for display object not found! 
(view=%u)\n", + view.type)) + return -1; + + offset = vma->node.start; WARN_ON(upper_32_bits(offset)); @@ -12032,6 +12041,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc, struct intel_engine_cs *engine; bool mmio_flip; struct drm_i915_gem_request *request; + struct i915_vma *vma; int ret; /* @@ -12139,9 +12149,11 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc, mmio_flip = use_mmio_flip(engine, obj); - ret = intel_pin_and_fence_fb_obj(fb, primary->state->rotation); - if (ret) + vma = intel_pin_and_fence_fb_obj(fb, primary->state->rotation); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto cleanup_pending; + } work->gtt_offset = intel_fb_gtt_offset(fb, primary->state->rotation); work->gtt_offset += intel_crtc->dspaddr_offset; @@ -14484,7 +14496,11 @@ intel_prepare_plane_fb(struct drm_plane *plane, if (ret) DRM_DEBUG_KMS("failed to attach phys object\n"); } else { - ret = intel_pin_and_fence_fb_obj(fb, new_state->rotation); + struct i915_vma *vma; + + vma = intel_pin_and_fence_fb_obj(fb, new_state->rotation); + if (IS_ERR(vma)) + ret = PTR_ERR(vma); } if (ret == 0) { @@ -14850,7 +14866,7 @@ intel_update_cursor_plane(struct drm_plane *plane, if (!obj) addr = 0; else if (!INTEL_INFO(dev)->cursor_needs_physical) - addr = i915_gem_obj_ggtt_offset(obj); + addr = i915_gem_object_ggtt_offset(obj, NULL); else addr = obj->phys_handle->busaddr; @@ -16723,7 +16739,6 @@ void intel_modeset_gem_init(struct drm_device *dev) struct drm_i915_private *dev_priv = to_i915(dev); struct drm_crtc *c; struct drm_i915_gem_object *obj; - int ret; intel_init_gt_powersave(dev_priv); @@ -16737,15 +16752,17 @@ void intel_modeset_gem_init(struct drm_device *dev) * for this. */ for_each_crtc(dev, c) { + struct i915_vma *vma; + obj = intel_fb_obj(c->primary->fb); if (obj == NULL) continue; mutex_lock(&dev->struct_mutex); - ret = intel_pin_and_fence_fb_obj(c->primary->fb, + vma = intel_pin_and_fence_fb_obj(c->primary->fb, c->primary->state->rotation); mutex_unlock(&dev->struct_mutex); - if (ret) { + if (IS_ERR(vma)) { DRM_ERROR("failed to pin boot fb on pipe %d\n", to_intel_crtc(c)->pipe); drm_framebuffer_unreference(c->primary->fb); diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 9539f0e63fa5..1c700b0c3cea 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -193,6 +193,7 @@ struct intel_framebuffer { struct intel_fbdev { struct drm_fb_helper helper; struct intel_framebuffer *fb; + struct i915_vma *vma; async_cookie_t cookie; int preferred_bpp; }; @@ -1239,8 +1240,8 @@ bool intel_get_load_detect_pipe(struct drm_connector *connector, void intel_release_load_detect_pipe(struct drm_connector *connector, struct intel_load_detect_pipe *old, struct drm_modeset_acquire_ctx *ctx); -int intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, - unsigned int rotation); +struct i915_vma * +intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, unsigned int rotation); void intel_unpin_fb_obj(struct drm_framebuffer *fb, unsigned int rotation); struct drm_framebuffer * __intel_framebuffer_create(struct drm_device *dev, diff --git a/drivers/gpu/drm/i915/intel_fbc.c b/drivers/gpu/drm/i915/intel_fbc.c index e67b09a3328c..e122052c4081 100644 --- a/drivers/gpu/drm/i915/intel_fbc.c +++ b/drivers/gpu/drm/i915/intel_fbc.c @@ -737,7 +737,7 @@ static void intel_fbc_update_state_cache(struct intel_crtc *crtc, /* FIXME: We lack the proper locking here, so only run this on the * platforms that need. 
*/ if (IS_GEN(dev_priv, 5, 6)) - cache->fb.ilk_ggtt_offset = i915_gem_obj_ggtt_offset(obj); + cache->fb.ilk_ggtt_offset = i915_gem_object_ggtt_offset(obj, NULL); cache->fb.pixel_format = fb->pixel_format; cache->fb.stride = fb->pitches[0]; cache->fb.fence_reg = obj->fence_reg; diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c index 2c14dfc5e4f0..096e84dabace 100644 --- a/drivers/gpu/drm/i915/intel_fbdev.c +++ b/drivers/gpu/drm/i915/intel_fbdev.c @@ -187,7 +187,6 @@ static int intelfb_create(struct drm_fb_helper *helper, struct fb_info *info; struct drm_framebuffer *fb; struct i915_vma *vma; - struct drm_i915_gem_object *obj; bool prealloc = false; void __iomem *vaddr; int ret; @@ -215,17 +214,17 @@ static int intelfb_create(struct drm_fb_helper *helper, sizes->fb_height = intel_fb->base.height; } - obj = intel_fb->obj; - mutex_lock(&dev->struct_mutex); /* Pin the GGTT vma for our access via info->screen_base. * This also validates that any existing fb inherited from the * BIOS is suitable for own access. */ - ret = intel_pin_and_fence_fb_obj(&ifbdev->fb->base, DRM_ROTATE_0); - if (ret) + vma = intel_pin_and_fence_fb_obj(&ifbdev->fb->base, DRM_ROTATE_0); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto out_unlock; + } info = drm_fb_helper_alloc_fbi(helper); if (IS_ERR(info)) { @@ -245,8 +244,6 @@ static int intelfb_create(struct drm_fb_helper *helper, info->flags = FBINFO_DEFAULT | FBINFO_CAN_FORCE_OUTPUT; info->fbops = &intelfb_ops; - vma = i915_gem_obj_to_ggtt(obj); - /* setup aperture base/size for vesafb takeover */ info->apertures->ranges[0].base = dev->mode_config.fb_base; info->apertures->ranges[0].size = ggtt->mappable_end; @@ -273,14 +270,14 @@ static int intelfb_create(struct drm_fb_helper *helper, * If the object is stolen however, it will be full of whatever * garbage was left in there. */ - if (ifbdev->fb->obj->stolen && !prealloc) + if (intel_fb->obj->stolen && !prealloc) memset_io(info->screen_base, 0, info->screen_size); /* Use default scratch pixmap (info->pixmap.flags = FB_PIXMAP_SYSTEM) */ - DRM_DEBUG_KMS("allocated %dx%d fb: 0x%08llx, bo %p\n", - fb->width, fb->height, - i915_gem_obj_ggtt_offset(obj), obj); + DRM_DEBUG_KMS("allocated %dx%d fb: 0x%08llx\n", + fb->width, fb->height, vma->node.start); + ifbdev->vma = vma; mutex_unlock(&dev->struct_mutex); vga_switcheroo_client_fb_set(dev->pdev, info); diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c index 1e2d9f2ad4f6..7dd745dfeffb 100644 --- a/drivers/gpu/drm/i915/intel_guc_loader.c +++ b/drivers/gpu/drm/i915/intel_guc_loader.c @@ -249,12 +249,12 @@ static inline bool guc_ucode_response(struct drm_i915_private *dev_priv, * Note that GuC needs the CSS header plus uKernel code to be copied by the * DMA engine in one operation, whereas the RSA signature is loaded via MMIO. 
*/ -static int guc_ucode_xfer_dma(struct drm_i915_private *dev_priv) +static int guc_ucode_xfer_dma(struct drm_i915_private *dev_priv, + struct i915_vma *vma) { struct intel_guc_fw *guc_fw = &dev_priv->guc.guc_fw; - struct drm_i915_gem_object *fw_obj = guc_fw->guc_fw_obj; unsigned long offset; - struct sg_table *sg = fw_obj->pages; + struct sg_table *sg = vma->pages; u32 status, rsa[UOS_RSA_SCRATCH_MAX_COUNT]; int i, ret = 0; @@ -271,7 +271,7 @@ static int guc_ucode_xfer_dma(struct drm_i915_private *dev_priv) I915_WRITE(DMA_COPY_SIZE, guc_fw->header_size + guc_fw->ucode_size); /* Set the source address for the new blob */ - offset = i915_gem_obj_ggtt_offset(fw_obj) + guc_fw->header_offset; + offset = vma->node.start + guc_fw->header_offset; I915_WRITE(DMA_ADDR_0_LOW, lower_32_bits(offset)); I915_WRITE(DMA_ADDR_0_HIGH, upper_32_bits(offset) & 0xFFFF); @@ -326,6 +326,7 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv) { struct intel_guc_fw *guc_fw = &dev_priv->guc.guc_fw; struct drm_device *dev = &dev_priv->drm; + struct i915_vma *vma; int ret; ret = i915_gem_object_set_to_gtt_domain(guc_fw->guc_fw_obj, false); @@ -334,10 +335,10 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv) return ret; } - ret = i915_gem_object_ggtt_pin(guc_fw->guc_fw_obj, NULL, 0, 0, 0); - if (ret) { - DRM_DEBUG_DRIVER("pin failed %d\n", ret); - return ret; + vma = i915_gem_object_ggtt_pin(guc_fw->guc_fw_obj, NULL, 0, 0, 0); + if (IS_ERR(vma)) { + DRM_DEBUG_DRIVER("pin failed %d\n", (int)PTR_ERR(vma)); + return PTR_ERR(vma); } /* Invalidate GuC TLB to let GuC take the latest updates to GTT. */ @@ -380,7 +381,7 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv) set_guc_init_params(dev_priv); - ret = guc_ucode_xfer_dma(dev_priv); + ret = guc_ucode_xfer_dma(dev_priv, vma); intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); @@ -388,7 +389,7 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv) * We keep the object pages for reuse during resume. But we can unpin it * now that DMA has completed, so it doesn't continue to take up space. 
*/ - i915_gem_object_ggtt_unpin(guc_fw->guc_fw_obj); + i915_vma_unpin(vma); return ret; } diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c index d930e3a4a9cd..402e05f2f1de 100644 --- a/drivers/gpu/drm/i915/intel_overlay.c +++ b/drivers/gpu/drm/i915/intel_overlay.c @@ -326,7 +326,7 @@ static void intel_overlay_release_old_vid_tail(struct i915_gem_active *active, i915_gem_track_fb(vma->obj, NULL, INTEL_FRONTBUFFER_OVERLAY(overlay->crtc->pipe)); - i915_gem_object_unpin_from_display_plane(vma->obj, &i915_ggtt_view_normal); + i915_gem_object_unpin_from_display_plane(vma); i915_vma_put(vma); } @@ -342,7 +342,7 @@ static void intel_overlay_off_tail(struct i915_gem_active *active, if (WARN_ON(!vma)) return; - i915_gem_object_unpin_from_display_plane(vma->obj, &i915_ggtt_view_normal); + i915_gem_object_unpin_from_display_plane(vma); i915_vma_put(vma); overlay->crtc->overlay = NULL; @@ -755,12 +755,10 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, if (ret != 0) return ret; - ret = i915_gem_object_pin_to_display_plane(new_bo, 0, + vma = i915_gem_object_pin_to_display_plane(new_bo, 0, &i915_ggtt_view_normal); - if (ret != 0) - return ret; - - vma = i915_gem_obj_to_ggtt_view(new_bo, &i915_ggtt_view_normal); + if (IS_ERR(vma)) + return PTR_ERR(vma); ret = i915_gem_object_put_fence(new_bo); if (ret) @@ -803,7 +801,7 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, swidth = params->src_w; swidthsw = calc_swidthsw(dev_priv, params->offset_Y, tmp_width); sheight = params->src_h; - iowrite32(i915_gem_obj_ggtt_offset(new_bo) + params->offset_Y, ®s->OBUF_0Y); + iowrite32(vma->node.start + params->offset_Y, ®s->OBUF_0Y); ostride = params->stride_Y; if (params->format & I915_OVERLAY_YUV_PLANAR) { @@ -817,8 +815,8 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, params->src_w/uv_hscale); swidthsw |= max_t(u32, tmp_U, tmp_V) << 16; sheight |= (params->src_h/uv_vscale) << 16; - iowrite32(i915_gem_obj_ggtt_offset(new_bo) + params->offset_U, ®s->OBUF_0U); - iowrite32(i915_gem_obj_ggtt_offset(new_bo) + params->offset_V, ®s->OBUF_0V); + iowrite32(vma->node.start + params->offset_U, ®s->OBUF_0U); + iowrite32(vma->node.start + params->offset_V, ®s->OBUF_0V); ostride |= params->stride_UV << 16; } @@ -850,7 +848,7 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, return 0; out_unpin: - i915_gem_object_ggtt_unpin(new_bo); + i915_gem_object_unpin_from_display_plane(vma); return ret; } @@ -1373,6 +1371,7 @@ void intel_setup_overlay(struct drm_i915_private *dev_priv) struct intel_overlay *overlay; struct drm_i915_gem_object *reg_bo; struct overlay_registers __iomem *regs; + struct i915_vma *vma = NULL; int ret; if (!HAS_OVERLAY(dev_priv)) @@ -1406,13 +1405,14 @@ void intel_setup_overlay(struct drm_i915_private *dev_priv) } overlay->flip_addr = reg_bo->phys_handle->busaddr; } else { - ret = i915_gem_object_ggtt_pin(reg_bo, NULL, + vma = i915_gem_object_ggtt_pin(reg_bo, NULL, 0, PAGE_SIZE, PIN_MAPPABLE); - if (ret) { + if (IS_ERR(vma)) { DRM_ERROR("failed to pin overlay register bo\n"); + ret = PTR_ERR(vma); goto out_free_bo; } - overlay->flip_addr = i915_gem_obj_ggtt_offset(reg_bo); + overlay->flip_addr = vma->node.start; ret = i915_gem_object_set_to_gtt_domain(reg_bo, true); if (ret) { @@ -1444,8 +1444,8 @@ void intel_setup_overlay(struct drm_i915_private *dev_priv) return; out_unpin_bo: - if (!OVERLAY_NEEDS_PHYSICAL(dev_priv)) - i915_gem_object_ggtt_unpin(reg_bo); + if (vma) + 
i915_vma_unpin(vma); out_free_bo: i915_gem_object_put(reg_bo); out_free: From bde13ebdab0778b758b267ff9e38d6c10a42bdc3 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:49:07 +0100 Subject: [PATCH 077/141] drm/i915: Introduce i915_ggtt_offset() This little helper only exists to safely discard the unused upper 32 bits of the general 64-bit VMA address - as we know that all Global GTTs are currently less than 4GiB in size, the upper bits must be zero. In many places, we use a u32 for the global GTT offset and we want to document where we are discarding the full VMA offset. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-28-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 11 ++++----- drivers/gpu/drm/i915/i915_gem_context.c | 6 +++-- drivers/gpu/drm/i915/i915_gem_gtt.h | 9 +++++++ drivers/gpu/drm/i915/i915_guc_submission.c | 15 ++++++------ drivers/gpu/drm/i915/intel_display.c | 10 +++----- drivers/gpu/drm/i915/intel_engine_cs.c | 4 ++-- drivers/gpu/drm/i915/intel_fbdev.c | 6 ++--- drivers/gpu/drm/i915/intel_guc_loader.c | 6 ++--- drivers/gpu/drm/i915/intel_lrc.c | 20 +++++++++------- drivers/gpu/drm/i915/intel_overlay.c | 10 ++++---- drivers/gpu/drm/i915/intel_ringbuffer.c | 28 +++++++++++----------- 13 files changed, 70 insertions(+), 59 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 73f3466402c5..019df75be5ac 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2008,7 +2008,7 @@ static void i915_dump_lrc_obj(struct seq_file *m, if (vma->flags & I915_VMA_GLOBAL_BIND) seq_printf(m, "\tBound in GGTT at 0x%08x\n", - lower_32_bits(vma->node.start)); + i915_ggtt_offset(vma)); if (i915_gem_object_get_pages(vma->obj)) { seq_puts(m, "\tFailed to get pages for context object\n\n"); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index bbee45acedeb..bd58878de77b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3330,7 +3330,7 @@ static inline unsigned long i915_gem_object_ggtt_offset(struct drm_i915_gem_object *o, const struct i915_ggtt_view *view) { - return i915_gem_object_to_ggtt(o, view)->node.start; + return i915_ggtt_offset(i915_gem_object_to_ggtt(o, view)); } /* i915_gem_fence.c */ diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 685253a1323b..7e08c774a1aa 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -767,7 +767,7 @@ i915_gem_gtt_pread(struct drm_device *dev, i915_gem_object_pin_pages(obj); } else { - node.start = vma->node.start; + node.start = i915_ggtt_offset(vma); node.allocated = false; ret = i915_gem_object_put_fence(obj); if (ret) @@ -1071,7 +1071,7 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_private *i915, i915_gem_object_pin_pages(obj); } else { - node.start = vma->node.start; + node.start = i915_ggtt_offset(vma); node.allocated = false; ret = i915_gem_object_put_fence(obj); if (ret) @@ -1712,7 +1712,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) goto err_unpin; /* Finally, remap it using the new GTT offset */ - pfn = ggtt->mappable_base + vma->node.start; + pfn = ggtt->mappable_base + i915_ggtt_offset(vma); pfn >>= PAGE_SHIFT; if (unlikely(view.type == I915_GGTT_VIEW_PARTIAL)) { @@ -3759,10 +3759,9 @@
i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, WARN(i915_vma_is_pinned(vma), "bo is already pinned in ggtt with incorrect alignment:" - " offset=%08x %08x, req.alignment=%llx, req.map_and_fenceable=%d," + " offset=%08x, req.alignment=%llx, req.map_and_fenceable=%d," " obj->map_and_fenceable=%d\n", - upper_32_bits(vma->node.start), - lower_32_bits(vma->node.start), + i915_ggtt_offset(vma), alignment, !!(flags & PIN_MAPPABLE), obj->map_and_fenceable); diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index e566167d9441..98d2956f91f4 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -631,7 +631,8 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags) intel_ring_emit(ring, MI_NOOP); intel_ring_emit(ring, MI_SET_CONTEXT); - intel_ring_emit(ring, req->ctx->engine[RCS].state->node.start | flags); + intel_ring_emit(ring, + i915_ggtt_offset(req->ctx->engine[RCS].state) | flags); /* * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP * WaMiSetContext_Hang:snb,ivb,vlv @@ -660,7 +661,8 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags) MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT); intel_ring_emit_reg(ring, last_reg); - intel_ring_emit(ring, engine->scratch->node.start); + intel_ring_emit(ring, + i915_ggtt_offset(engine->scratch)); intel_ring_emit(ring, MI_NOOP); } intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_ENABLE); diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h index 71513b13ca94..d6e4b6529196 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.h +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h @@ -272,6 +272,15 @@ static inline bool i915_vma_has_active_engine(const struct i915_vma *vma, return vma->active & BIT(engine); } +static inline u32 i915_ggtt_offset(const struct i915_vma *vma) +{ + GEM_BUG_ON(!i915_vma_is_ggtt(vma)); + GEM_BUG_ON(!vma->node.allocated); + GEM_BUG_ON(upper_32_bits(vma->node.start)); + GEM_BUG_ON(upper_32_bits(vma->node.start + vma->node.size - 1)); + return lower_32_bits(vma->node.start); +} + struct i915_page_dma { struct page *page; union { diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index e7dbc64ec1da..bb4079223e39 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -358,11 +358,11 @@ static void guc_init_ctx_desc(struct intel_guc *guc, /* The state page is after PPHWSP */ lrc->ring_lcra = - ce->state->node.start + LRC_STATE_PN * PAGE_SIZE; + i915_ggtt_offset(ce->state) + LRC_STATE_PN * PAGE_SIZE; lrc->context_id = (client->ctx_index << GUC_ELC_CTXID_OFFSET) | (guc_engine_id << GUC_ELC_ENGINE_OFFSET); - lrc->ring_begin = ce->ring->vma->node.start; + lrc->ring_begin = i915_ggtt_offset(ce->ring->vma); lrc->ring_end = lrc->ring_begin + ce->ring->size - 1; lrc->ring_next_free_location = lrc->ring_begin; lrc->ring_current_tail_pointer_value = 0; @@ -378,7 +378,7 @@ static void guc_init_ctx_desc(struct intel_guc *guc, * The doorbell, process descriptor, and workqueue are all parts * of the client object, which the GuC will reference via the GGTT */ - gfx_addr = client->vma->node.start; + gfx_addr = i915_ggtt_offset(client->vma); desc.db_trigger_phy = sg_dma_address(client->vma->pages->sgl) + client->doorbell_offset; desc.db_trigger_cpu = (uintptr_t)client->client_base + @@ -864,7 +864,7 @@ static void guc_create_log(struct intel_guc *guc) (GUC_LOG_ISR_PAGES << GUC_LOG_ISR_SHIFT) | (GUC_LOG_CRASH_PAGES 
<< GUC_LOG_CRASH_SHIFT); - offset = vma->node.start >> PAGE_SHIFT; /* in pages */ + offset = i915_ggtt_offset(vma) >> PAGE_SHIFT; /* in pages */ guc->log_flags = (offset << GUC_LOG_BUF_ADDR_SHIFT) | flags; } @@ -935,7 +935,8 @@ static void guc_create_ads(struct intel_guc *guc) policies = (void *)ads + sizeof(struct guc_ads); init_guc_policies(policies); - ads->scheduler_policies = vma->node.start + sizeof(struct guc_ads); + ads->scheduler_policies = + i915_ggtt_offset(vma) + sizeof(struct guc_ads); /* MMIO reg state */ reg_state = (void *)policies + sizeof(struct guc_policies); @@ -1063,7 +1064,7 @@ int intel_guc_suspend(struct drm_device *dev) /* any value greater than GUC_POWER_D0 */ data[1] = GUC_POWER_D1; /* first page is shared data with GuC */ - data[2] = ctx->engine[RCS].state->node.start; + data[2] = i915_ggtt_offset(ctx->engine[RCS].state); return host2guc_action(guc, data, ARRAY_SIZE(data)); } @@ -1088,7 +1089,7 @@ int intel_guc_resume(struct drm_device *dev) data[0] = HOST2GUC_ACTION_EXIT_S_STATE; data[1] = GUC_POWER_D0; /* first page is shared data with GuC */ - data[2] = ctx->engine[RCS].state->node.start; + data[2] = i915_ggtt_offset(ctx->engine[RCS].state); return host2guc_action(guc, data, ARRAY_SIZE(data)); } diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index e582b4267036..31eaeedfad30 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -3241,7 +3241,6 @@ u32 intel_fb_gtt_offset(struct drm_framebuffer *fb, struct drm_i915_gem_object *obj = intel_fb_obj(fb); struct i915_ggtt_view view; struct i915_vma *vma; - u64 offset; intel_fill_fb_ggtt_view(&view, fb, rotation); @@ -3250,11 +3249,7 @@ u32 intel_fb_gtt_offset(struct drm_framebuffer *fb, view.type)) return -1; - offset = vma->node.start; - - WARN_ON(upper_32_bits(offset)); - - return lower_32_bits(offset); + return i915_ggtt_offset(vma); } static void skl_detach_scaler(struct intel_crtc *intel_crtc, int id) @@ -11804,7 +11799,8 @@ static int intel_gen7_queue_flip(struct drm_device *dev, intel_ring_emit(ring, MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT); intel_ring_emit_reg(ring, DERRMR); - intel_ring_emit(ring, req->engine->scratch->node.start + 256); + intel_ring_emit(ring, + i915_ggtt_offset(req->engine->scratch) + 256); if (IS_GEN8(dev)) { intel_ring_emit(ring, 0); intel_ring_emit(ring, MI_NOOP); diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index f02d66bbec4b..5ec8a10fd18b 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -268,8 +268,8 @@ int intel_engine_create_scratch(struct intel_engine_cs *engine, int size) goto err_unref; engine->scratch = vma; - DRM_DEBUG_DRIVER("%s pipe control offset: 0x%08llx\n", - engine->name, vma->node.start); + DRM_DEBUG_DRIVER("%s pipe control offset: 0x%08x\n", + engine->name, i915_ggtt_offset(vma)); return 0; err_unref: diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c index 096e84dabace..4003e4908c09 100644 --- a/drivers/gpu/drm/i915/intel_fbdev.c +++ b/drivers/gpu/drm/i915/intel_fbdev.c @@ -248,7 +248,7 @@ static int intelfb_create(struct drm_fb_helper *helper, info->apertures->ranges[0].base = dev->mode_config.fb_base; info->apertures->ranges[0].size = ggtt->mappable_end; - info->fix.smem_start = dev->mode_config.fb_base + vma->node.start; + info->fix.smem_start = dev->mode_config.fb_base + i915_ggtt_offset(vma); info->fix.smem_len = vma->node.size; vaddr = 
i915_vma_pin_iomap(vma); @@ -275,8 +275,8 @@ static int intelfb_create(struct drm_fb_helper *helper, /* Use default scratch pixmap (info->pixmap.flags = FB_PIXMAP_SYSTEM) */ - DRM_DEBUG_KMS("allocated %dx%d fb: 0x%08llx\n", - fb->width, fb->height, vma->node.start); + DRM_DEBUG_KMS("allocated %dx%d fb: 0x%08x\n", + fb->width, fb->height, i915_ggtt_offset(vma)); ifbdev->vma = vma; mutex_unlock(&dev->struct_mutex); diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c index 7dd745dfeffb..324812d69b70 100644 --- a/drivers/gpu/drm/i915/intel_guc_loader.c +++ b/drivers/gpu/drm/i915/intel_guc_loader.c @@ -194,14 +194,14 @@ static void set_guc_init_params(struct drm_i915_private *dev_priv) } if (guc->ads_vma) { - u32 ads = (u32)guc->ads_vma->node.start >> PAGE_SHIFT; + u32 ads = i915_ggtt_offset(guc->ads_vma) >> PAGE_SHIFT; params[GUC_CTL_DEBUG] |= ads << GUC_ADS_ADDR_SHIFT; params[GUC_CTL_DEBUG] |= GUC_ADS_ENABLED; } /* If GuC submission is enabled, set up additional parameters here */ if (i915.enable_guc_submission) { - u32 pgs = dev_priv->guc.ctx_pool_vma->node.start; + u32 pgs = i915_ggtt_offset(dev_priv->guc.ctx_pool_vma); u32 ctx_in_16 = GUC_MAX_GPU_CONTEXTS / 16; pgs >>= PAGE_SHIFT; @@ -271,7 +271,7 @@ static int guc_ucode_xfer_dma(struct drm_i915_private *dev_priv, I915_WRITE(DMA_COPY_SIZE, guc_fw->header_size + guc_fw->ucode_size); /* Set the source address for the new blob */ - offset = vma->node.start + guc_fw->header_offset; + offset = i915_ggtt_offset(vma) + guc_fw->header_offset; I915_WRITE(DMA_ADDR_0_LOW, lower_32_bits(offset)); I915_WRITE(DMA_ADDR_0_HIGH, upper_32_bits(offset) & 0xFFFF); diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 2673fb4f817b..6b49df4316f4 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -315,7 +315,7 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx, desc = ctx->desc_template; /* bits 3-4 */ desc |= engine->ctx_desc_template; /* bits 0-11 */ - desc |= ce->state->node.start + LRC_PPHWSP_PN * PAGE_SIZE; + desc |= i915_ggtt_offset(ce->state) + LRC_PPHWSP_PN * PAGE_SIZE; /* bits 12-31 */ desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT; /* bits 32-52 */ @@ -792,7 +792,8 @@ static int intel_lr_context_pin(struct i915_gem_context *ctx, intel_lr_context_descriptor_update(ctx, engine); - lrc_reg_state[CTX_RING_BUFFER_START+1] = ce->ring->vma->node.start; + lrc_reg_state[CTX_RING_BUFFER_START+1] = + i915_ggtt_offset(ce->ring->vma); ce->lrc_reg_state = lrc_reg_state; ce->state->obj->dirty = true; @@ -914,7 +915,7 @@ static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT)); wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4); - wa_ctx_emit(batch, index, engine->scratch->node.start + 256); + wa_ctx_emit(batch, index, i915_ggtt_offset(engine->scratch) + 256); wa_ctx_emit(batch, index, 0); wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1)); @@ -932,7 +933,7 @@ static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT)); wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4); - wa_ctx_emit(batch, index, engine->scratch->node.start + 256); + wa_ctx_emit(batch, index, i915_ggtt_offset(engine->scratch) + 256); wa_ctx_emit(batch, index, 0); return index; @@ -993,7 +994,7 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *engine, /* 
WaClearSlmSpaceAtContextSwitch:bdw,chv */ /* Actual scratch location is at 128 bytes offset */ - scratch_addr = engine->scratch->node.start + 2 * CACHELINE_BYTES; + scratch_addr = i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES; wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 | @@ -1073,7 +1074,7 @@ static int gen9_init_indirectctx_bb(struct intel_engine_cs *engine, /* Actual scratch location is at 128 bytes offset */ if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_A0)) { u32 scratch_addr = - engine->scratch->node.start + 2 * CACHELINE_BYTES; + i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES; wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 | @@ -1482,7 +1483,8 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request, { struct intel_ring *ring = request->ring; struct intel_engine_cs *engine = request->engine; - u32 scratch_addr = engine->scratch->node.start + 2 * CACHELINE_BYTES; + u32 scratch_addr = + i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES; bool vf_flush_wa = false, dc_flush_wa = false; u32 flags = 0; int ret; @@ -1752,7 +1754,7 @@ lrc_setup_hws(struct intel_engine_cs *engine, struct i915_vma *vma) return PTR_ERR(hws); engine->status_page.page_addr = hws + hws_offset; - engine->status_page.ggtt_offset = vma->node.start + hws_offset; + engine->status_page.ggtt_offset = i915_ggtt_offset(vma) + hws_offset; engine->status_page.vma = vma; return 0; @@ -2020,7 +2022,7 @@ populate_lr_context(struct i915_gem_context *ctx, RING_INDIRECT_CTX_OFFSET(engine->mmio_base), 0); if (engine->wa_ctx.vma) { struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; - u32 ggtt_offset = wa_ctx->vma->node.start; + u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); reg_state[CTX_RCS_INDIRECT_CTX+1] = (ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) | diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c index 402e05f2f1de..72f8990a13d2 100644 --- a/drivers/gpu/drm/i915/intel_overlay.c +++ b/drivers/gpu/drm/i915/intel_overlay.c @@ -801,7 +801,7 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, swidth = params->src_w; swidthsw = calc_swidthsw(dev_priv, params->offset_Y, tmp_width); sheight = params->src_h; - iowrite32(vma->node.start + params->offset_Y, ®s->OBUF_0Y); + iowrite32(i915_ggtt_offset(vma) + params->offset_Y, ®s->OBUF_0Y); ostride = params->stride_Y; if (params->format & I915_OVERLAY_YUV_PLANAR) { @@ -815,8 +815,10 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, params->src_w/uv_hscale); swidthsw |= max_t(u32, tmp_U, tmp_V) << 16; sheight |= (params->src_h/uv_vscale) << 16; - iowrite32(vma->node.start + params->offset_U, ®s->OBUF_0U); - iowrite32(vma->node.start + params->offset_V, ®s->OBUF_0V); + iowrite32(i915_ggtt_offset(vma) + params->offset_U, + ®s->OBUF_0U); + iowrite32(i915_ggtt_offset(vma) + params->offset_V, + ®s->OBUF_0V); ostride |= params->stride_UV << 16; } @@ -1412,7 +1414,7 @@ void intel_setup_overlay(struct drm_i915_private *dev_priv) ret = PTR_ERR(vma); goto out_free_bo; } - overlay->flip_addr = vma->node.start; + overlay->flip_addr = i915_ggtt_offset(vma); ret = i915_gem_object_set_to_gtt_domain(reg_bo, true); if (ret) { diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 65ef172e8761..e3327a2ac6e1 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -176,7 
+176,7 @@ intel_emit_post_sync_nonzero_flush(struct drm_i915_gem_request *req) { struct intel_ring *ring = req->ring; u32 scratch_addr = - req->engine->scratch->node.start + 2 * CACHELINE_BYTES; + i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES; int ret; ret = intel_ring_begin(req, 6); @@ -212,7 +212,7 @@ gen6_render_ring_flush(struct drm_i915_gem_request *req, u32 mode) { struct intel_ring *ring = req->ring; u32 scratch_addr = - req->engine->scratch->node.start + 2 * CACHELINE_BYTES; + i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES; u32 flags = 0; int ret; @@ -286,7 +286,7 @@ gen7_render_ring_flush(struct drm_i915_gem_request *req, u32 mode) { struct intel_ring *ring = req->ring; u32 scratch_addr = - req->engine->scratch->node.start + 2 * CACHELINE_BYTES; + i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES; u32 flags = 0; int ret; @@ -371,7 +371,7 @@ static int gen8_render_ring_flush(struct drm_i915_gem_request *req, u32 mode) { u32 scratch_addr = - req->engine->scratch->node.start + 2 * CACHELINE_BYTES; + i915_ggtt_offset(req->engine->scratch) + 2 * CACHELINE_BYTES; u32 flags = 0; int ret; @@ -571,7 +571,7 @@ static int init_ring_common(struct intel_engine_cs *engine) * registers with the above sequence (the readback of the HEAD registers * also enforces ordering), otherwise the hw might lose the new ring * register values. */ - I915_WRITE_START(engine, ring->vma->node.start); + I915_WRITE_START(engine, i915_ggtt_offset(ring->vma)); /* WaClearRingBufHeadRegAtInit:ctg,elk */ if (I915_READ_HEAD(engine)) @@ -586,16 +586,16 @@ static int init_ring_common(struct intel_engine_cs *engine) /* If the head is still not zero, the ring is dead */ if (wait_for((I915_READ_CTL(engine) & RING_VALID) != 0 && - I915_READ_START(engine) == ring->vma->node.start && + I915_READ_START(engine) == i915_ggtt_offset(ring->vma) && (I915_READ_HEAD(engine) & HEAD_ADDR) == 0, 50)) { DRM_ERROR("%s initialization failed " - "ctl %08x (valid? %d) head %08x tail %08x start %08x [expected %08llx]\n", + "ctl %08x (valid? 
%d) head %08x tail %08x start %08x [expected %08x]\n", engine->name, I915_READ_CTL(engine), I915_READ_CTL(engine) & RING_VALID, I915_READ_HEAD(engine), I915_READ_TAIL(engine), I915_READ_START(engine), - ring->vma->node.start); + i915_ggtt_offset(ring->vma)); ret = -EIO; goto out; } @@ -1716,7 +1716,7 @@ i830_emit_bb_start(struct drm_i915_gem_request *req, unsigned int dispatch_flags) { struct intel_ring *ring = req->ring; - u32 cs_offset = req->engine->scratch->node.start; + u32 cs_offset = i915_ggtt_offset(req->engine->scratch); int ret; ret = intel_ring_begin(req, 6); @@ -1857,12 +1857,12 @@ static int init_status_page(struct intel_engine_cs *engine) goto err; engine->status_page.vma = vma; - engine->status_page.ggtt_offset = vma->node.start; + engine->status_page.ggtt_offset = i915_ggtt_offset(vma); engine->status_page.page_addr = i915_gem_object_pin_map(obj, I915_MAP_WB); - DRM_DEBUG_DRIVER("%s hws offset: 0x%08llx\n", - engine->name, vma->node.start); + DRM_DEBUG_DRIVER("%s hws offset: 0x%08x\n", + engine->name, i915_ggtt_offset(vma)); return 0; err: @@ -2542,13 +2542,13 @@ static void intel_ring_init_semaphores(struct drm_i915_private *dev_priv, } if (INTEL_GEN(dev_priv) >= 8) { - u64 offset = dev_priv->semaphore->node.start; + u32 offset = i915_ggtt_offset(dev_priv->semaphore); engine->semaphore.sync_to = gen8_ring_sync_to; engine->semaphore.signal = gen8_xcs_signal; for (i = 0; i < I915_NUM_ENGINES; i++) { - u64 ring_offset; + u32 ring_offset; if (i != engine->id) ring_offset = offset + GEN8_SEMAPHORE_OFFSET(engine->id, i); From c84455b4baccce89384e45c53198c3c948bd97b6 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:49:08 +0100 Subject: [PATCH 078/141] drm/i915: Move debug only per-request pid tracking from request to ctx Since contexts are not currently shared between userspace processes, we have an exact correspondence between context creator and guilty batch submitter. Therefore we can save some per-batch work by inspecting the context->pid upon error instead. Note that we take the context's creator's pid rather than the file's pid in order to better track fd passed over sockets. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-29-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 25 ++++++++++++++++--------- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_gem_context.c | 4 ++++ drivers/gpu/drm/i915/i915_gem_request.c | 6 ------ drivers/gpu/drm/i915/i915_gem_request.h | 3 --- drivers/gpu/drm/i915/i915_gpu_error.c | 13 ++++++++++--- 6 files changed, 32 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 019df75be5ac..1b4de447ae9f 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -460,6 +460,8 @@ static int i915_gem_object_info(struct seq_file *m, void* data) print_context_stats(m, dev_priv); list_for_each_entry_reverse(file, &dev->filelist, lhead) { struct file_stats stats; + struct drm_i915_file_private *file_priv = file->driver_priv; + struct drm_i915_gem_request *request; struct task_struct *task; memset(&stats, 0, sizeof(stats)); @@ -473,10 +475,17 @@ static int i915_gem_object_info(struct seq_file *m, void* data) * still alive (e.g. get_pid(current) => fork() => exit()). * Therefore, we need to protect this ->comm access using RCU. 
*/ + mutex_lock(&dev->struct_mutex); + request = list_first_entry_or_null(&file_priv->mm.request_list, + struct drm_i915_gem_request, + client_list); rcu_read_lock(); - task = pid_task(file->pid, PIDTYPE_PID); + task = pid_task(request && request->ctx->pid ? + request->ctx->pid : file->pid, + PIDTYPE_PID); print_file_stats(m, task ? task->comm : "", stats); rcu_read_unlock(); + mutex_unlock(&dev->struct_mutex); } mutex_unlock(&dev->filelist_mutex); @@ -658,12 +667,11 @@ static int i915_gem_request_info(struct seq_file *m, void *data) seq_printf(m, "%s requests: %d\n", engine->name, count); list_for_each_entry(req, &engine->request_list, link) { + struct pid *pid = req->ctx->pid; struct task_struct *task; rcu_read_lock(); - task = NULL; - if (req->pid) - task = pid_task(req->pid, PIDTYPE_PID); + task = pid ? pid_task(pid, PIDTYPE_PID) : NULL; seq_printf(m, " %x @ %d: %s [%d]\n", req->fence.seqno, (int) (jiffies - req->emitted_jiffies), @@ -1952,18 +1960,17 @@ static int i915_context_status(struct seq_file *m, void *unused) list_for_each_entry(ctx, &dev_priv->context_list, link) { seq_printf(m, "HW context %u ", ctx->hw_id); - if (IS_ERR(ctx->file_priv)) { - seq_puts(m, "(deleted) "); - } else if (ctx->file_priv) { - struct pid *pid = ctx->file_priv->file->pid; + if (ctx->pid) { struct task_struct *task; - task = get_pid_task(pid, PIDTYPE_PID); + task = get_pid_task(ctx->pid, PIDTYPE_PID); if (task) { seq_printf(m, "(%s [%d]) ", task->comm, task->pid); put_task_struct(task); } + } else if (IS_ERR(ctx->file_priv)) { + seq_puts(m, "(deleted) "); } else { seq_puts(m, "(kernel) "); } diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index bd58878de77b..d9f29244bafb 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -782,6 +782,7 @@ struct drm_i915_error_state { struct drm_i915_error_request { long jiffies; + pid_t pid; u32 seqno; u32 head; u32 tail; @@ -880,6 +881,7 @@ struct i915_gem_context { struct drm_i915_private *i915; struct drm_i915_file_private *file_priv; struct i915_hw_ppgtt *ppgtt; + struct pid *pid; struct i915_ctx_hang_stats hang_stats; diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 98d2956f91f4..35950ee46a1d 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -158,6 +158,7 @@ void i915_gem_context_free(struct kref *ctx_ref) i915_vma_put(ce->state); } + put_pid(ctx->pid); list_del(&ctx->link); ida_simple_remove(&ctx->i915->context_hw_ida, ctx->hw_id); @@ -311,6 +312,9 @@ __create_hw_context(struct drm_device *dev, ret = DEFAULT_CONTEXT_HANDLE; ctx->file_priv = file_priv; + if (file_priv) + ctx->pid = get_task_pid(current, PIDTYPE_PID); + ctx->user_handle = ret; /* NB: Mark all slices as needing a remap so that when the context first * loads it will restore whatever remap state already exists. 
If there diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index a0756b553a19..1a215320cefb 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -137,8 +137,6 @@ int i915_gem_request_add_to_client(struct drm_i915_gem_request *req, list_add_tail(&req->client_list, &file_priv->mm.request_list); spin_unlock(&file_priv->mm.lock); - req->pid = get_pid(task_pid(current)); - return 0; } @@ -154,9 +152,6 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request) list_del(&request->client_list); request->file_priv = NULL; spin_unlock(&file_priv->mm.lock); - - put_pid(request->pid); - request->pid = NULL; } void i915_gem_retire_noop(struct i915_gem_active *active, @@ -407,7 +402,6 @@ i915_gem_request_alloc(struct intel_engine_cs *engine, req->previous_context = NULL; req->file_priv = NULL; req->batch = NULL; - req->pid = NULL; req->elsp_submitted = 0; /* diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index a66243a6f791..6c72bd8d9423 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -134,9 +134,6 @@ struct drm_i915_gem_request { /** file_priv list entry for this request */ struct list_head client_list; - /** process identifier submitting this request */ - struct pid *pid; - /** * The ELSP only accepts two elements at a time, so we queue * context/tail pairs on a given queue (ring->execlist_queue) until the diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 8fc21a9c0abd..638664f78dd5 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -470,7 +470,8 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, dev_priv->engine[i].name, ee->num_requests); for (j = 0; j < ee->num_requests; j++) { - err_printf(m, " seqno 0x%08x, emitted %ld, head 0x%08x, tail 0x%08x\n", + err_printf(m, " pid %d, seqno 0x%08x, emitted %ld, head 0x%08x, tail 0x%08x\n", + ee->requests[j].pid, ee->requests[j].seqno, ee->requests[j].jiffies, ee->requests[j].head, @@ -1076,6 +1077,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, request = i915_gem_find_active_request(engine); if (request) { struct intel_ring *ring; + struct pid *pid; ee->vm = request->ctx->ppgtt ? &request->ctx->ppgtt->base : &ggtt->base; @@ -1097,11 +1099,12 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, i915_error_object_create(dev_priv, request->ctx->engine[i].state); - if (request->pid) { + pid = request->ctx->pid; + if (pid) { struct task_struct *task; rcu_read_lock(); - task = pid_task(request->pid, PIDTYPE_PID); + task = pid_task(pid, PIDTYPE_PID); if (task) { strcpy(ee->comm, task->comm); ee->pid = task->pid; @@ -1166,6 +1169,10 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, erq->jiffies = request->emitted_jiffies; erq->head = request->head; erq->tail = request->tail; + + rcu_read_lock(); + erq->pid = request->ctx->pid ? 
pid_nr(request->ctx->pid) : 0; + rcu_read_unlock(); } } } From 03382dfb633c1c1ac85b3d81ddd2256dcbc8f353 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:49:09 +0100 Subject: [PATCH 079/141] drm/i915: Print the batchbuffer offset next to BBADDR in error state It is useful when looking at captured error states to check the recorded BBADDR register (the address of the last batchbuffer instruction loaded) against the expected offset of the batch buffer, and so do a quick check that (a) the capture is true or (b) HEAD hasn't wandered off into the badlands. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-30-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/i915_gpu_error.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index d9f29244bafb..bb7d8130dbfd 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -775,6 +775,7 @@ struct drm_i915_error_state { struct drm_i915_error_object { int page_count; u64 gtt_offset; + u64 gtt_size; u32 *pages[0]; } *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 638664f78dd5..0f0b65214ef1 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -242,8 +242,16 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir); err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr); err_printf(m, " INSTDONE: 0x%08x\n", ee->instdone); + if (ee->batchbuffer) { + u64 start = ee->batchbuffer->gtt_offset; + u64 end = start + ee->batchbuffer->gtt_size; + + err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n", + upper_32_bits(start), lower_32_bits(start), + upper_32_bits(end), lower_32_bits(end)); + } if (INTEL_GEN(m->i915) >= 4) { - err_printf(m, " BBADDR: 0x%08x %08x\n", + err_printf(m, " BBADDR: 0x%08x_%08x\n", (u32)(ee->bbaddr>>32), (u32)ee->bbaddr); err_printf(m, " BB_STATE: 0x%08x\n", ee->bbstate); err_printf(m, " INSTPS: 0x%08x\n", ee->instps); @@ -677,7 +685,10 @@ i915_error_object_create(struct drm_i915_private *dev_priv, if (!dst) return NULL; - reloc_offset = dst->gtt_offset = vma->node.start; + dst->gtt_offset = vma->node.start; + dst->gtt_size = vma->node.size; + + reloc_offset = dst->gtt_offset; use_ggtt = (src->cache_level == I915_CACHE_NONE && (vma->flags & I915_VMA_GLOBAL_BIND) && reloc_offset + num_pages * PAGE_SIZE <= ggtt->mappable_end); From 57bc699d43390aab7a21f3f2cee1f13cd31fd0fd Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:49:10 +0100 Subject: [PATCH 080/141] drm/i915: Only record active and pending requests upon a GPU hang There is no other state pertaining to the completed requests in the hang, other than what can be gleaned through the ringbuffer, so including the expired requests in the list of outstanding requests simply adds noise.
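As an illustrative sketch of the change in capture strategy (record() stands in for the per-request bookkeeping and is purely hypothetical; the list iterators and the lookup helper are the ones used in the driver):

	/* before: walk the whole list, completed requests included */
	list_for_each_entry(request, &engine->request_list, link)
		record(request);

	/* after: start from the first request still active on the engine */
	request = i915_gem_find_active_request(engine);
	if (request)
		list_for_each_entry_from(request, &engine->request_list, link)
			record(request);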
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Reviewed-by: Matthew Auld Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-31-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gpu_error.c | 109 ++++++++++++++------------ 1 file changed, 61 insertions(+), 48 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 0f0b65214ef1..776818b86c0c 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1060,12 +1060,68 @@ static void error_record_engine_registers(struct drm_i915_error_state *error, } } +static void engine_record_requests(struct intel_engine_cs *engine, + struct drm_i915_gem_request *first, + struct drm_i915_error_engine *ee) +{ + struct drm_i915_gem_request *request; + int count; + + count = 0; + request = first; + list_for_each_entry_from(request, &engine->request_list, link) + count++; + if (!count) + return; + + ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC); + if (!ee->requests) + return; + + ee->num_requests = count; + + count = 0; + request = first; + list_for_each_entry_from(request, &engine->request_list, link) { + struct drm_i915_error_request *erq; + + if (count >= ee->num_requests) { + /* + * If the ring request list was changed in + * between the point where the error request + * list was created and dimensioned and this + * point then just exit early to avoid crashes. + * + * We don't need to communicate that the + * request list changed state during error + * state capture and that the error state is + * slightly incorrect as a consequence since we + * are typically only interested in the request + * list state at the point of error state + * capture, not in any changes happening during + * the capture. + */ + break; + } + + erq = &ee->requests[count++]; + erq->seqno = request->fence.seqno; + erq->jiffies = request->emitted_jiffies; + erq->head = request->head; + erq->tail = request->tail; + + rcu_read_lock(); + erq->pid = request->ctx->pid ? 
pid_nr(request->ctx->pid) : 0; + rcu_read_unlock(); + } + ee->num_requests = count; +} + static void i915_gem_record_rings(struct drm_i915_private *dev_priv, struct drm_i915_error_state *error) { struct i915_ggtt *ggtt = &dev_priv->ggtt; - struct drm_i915_gem_request *request; - int i, count; + int i; error->semaphore = i915_error_object_create(dev_priv, dev_priv->semaphore); @@ -1073,6 +1129,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, for (i = 0; i < I915_NUM_ENGINES; i++) { struct intel_engine_cs *engine = &dev_priv->engine[i]; struct drm_i915_error_engine *ee = &error->engine[i]; + struct drm_i915_gem_request *request; ee->pid = -1; ee->engine_id = -1; @@ -1131,6 +1188,8 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, ee->cpu_ring_tail = ring->tail; ee->ringbuffer = i915_error_object_create(dev_priv, ring->vma); + + engine_record_requests(engine, request, ee); } ee->hws_page = @@ -1139,52 +1198,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, ee->wa_ctx = i915_error_object_create(dev_priv, engine->wa_ctx.vma); - - count = 0; - list_for_each_entry(request, &engine->request_list, link) - count++; - - ee->num_requests = count; - ee->requests = - kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC); - if (!ee->requests) { - ee->num_requests = 0; - continue; - } - - count = 0; - list_for_each_entry(request, &engine->request_list, link) { - struct drm_i915_error_request *erq; - - if (count >= ee->num_requests) { - /* - * If the ring request list was changed in - * between the point where the error request - * list was created and dimensioned and this - * point then just exit early to avoid crashes. - * - * We don't need to communicate that the - * request list changed state during error - * state capture and that the error state is - * slightly incorrect as a consequence since we - * are typically only interested in the request - * list state at the point of error state - * capture, not in any changes happening during - * the capture. - */ - break; - } - - erq = &ee->requests[count++]; - erq->seqno = request->fence.seqno; - erq->jiffies = request->emitted_jiffies; - erq->head = request->head; - erq->tail = request->tail; - - rcu_read_lock(); - erq->pid = request->ctx->pid ? pid_nr(request->ctx->pid) : 0; - rcu_read_unlock(); - } } } From 21a2c58a9c122151080ecbdddc115257cd7c30d8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 10:49:11 +0100 Subject: [PATCH 081/141] drm/i915: Record the RING_MODE register for post-mortem debugging Just another useful register to inspect following a GPU hang. v2: Remove partial decoding of RING_MODE to userspace, be consistent and use GEN > 2 guards around RING_MODE everywhere. 
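For reference, a rough sketch of how the captured value might be decoded when reading an error state (MODE_IDLE and STOP_RING are the existing RING_MI_MODE bit definitions already used elsewhere in the driver; ee->mode is the field added by this patch):

	/* ee->mode snapshots RING_MI_MODE at the time of the hang */
	bool engine_idle = ee->mode & MODE_IDLE;   /* engine reported idle */
	bool ring_stopped = ee->mode & STOP_RING;  /* ring had been stopped */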
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471254551-25805-32-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/i915_gpu_error.c | 3 +++ drivers/gpu/drm/i915/intel_ringbuffer.c | 7 ++++--- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index bb7d8130dbfd..35caa9b2f36a 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -757,6 +757,7 @@ struct drm_i915_error_state { u32 tail; u32 head; u32 ctl; + u32 mode; u32 hws; u32 ipeir; u32 ipehr; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 776818b86c0c..0c3f30ce85c3 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -236,6 +236,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, err_printf(m, " HEAD: 0x%08x\n", ee->head); err_printf(m, " TAIL: 0x%08x\n", ee->tail); err_printf(m, " CTL: 0x%08x\n", ee->ctl); + err_printf(m, " MODE: 0x%08x\n", ee->mode); err_printf(m, " HWS: 0x%08x\n", ee->hws); err_printf(m, " ACTHD: 0x%08x %08x\n", (u32)(ee->acthd>>32), (u32)ee->acthd); @@ -1005,6 +1006,8 @@ static void error_record_engine_registers(struct drm_i915_error_state *error, ee->head = I915_READ_HEAD(engine); ee->tail = I915_READ_TAIL(engine); ee->ctl = I915_READ_CTL(engine); + if (INTEL_GEN(dev_priv) > 2) + ee->mode = I915_READ_MODE(engine); if (I915_NEED_GFX_HWS(dev_priv)) { i915_reg_t mmio; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index e3327a2ac6e1..fa22bd87bab0 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -498,7 +498,7 @@ static bool stop_ring(struct intel_engine_cs *engine) { struct drm_i915_private *dev_priv = engine->i915; - if (!IS_GEN2(dev_priv)) { + if (INTEL_GEN(dev_priv) > 2) { I915_WRITE_MODE(engine, _MASKED_BIT_ENABLE(STOP_RING)); if (intel_wait_for_register(dev_priv, RING_MI_MODE(engine->mmio_base), @@ -520,7 +520,7 @@ static bool stop_ring(struct intel_engine_cs *engine) I915_WRITE_HEAD(engine, 0); I915_WRITE_TAIL(engine, 0); - if (!IS_GEN2(dev_priv)) { + if (INTEL_GEN(dev_priv) > 2) { (void)I915_READ_CTL(engine); I915_WRITE_MODE(engine, _MASKED_BIT_DISABLE(STOP_RING)); } @@ -2142,7 +2142,8 @@ void intel_engine_cleanup(struct intel_engine_cs *engine) dev_priv = engine->i915; if (engine->buffer) { - WARN_ON(!IS_GEN2(dev_priv) && (I915_READ_MODE(engine) & MODE_IDLE) == 0); + WARN_ON(INTEL_GEN(dev_priv) > 2 && + (I915_READ_MODE(engine) & MODE_IDLE) == 0); intel_ring_unpin(engine->buffer); intel_ring_free(engine->buffer); From 1544c42ed9904835cce14ec917e7678c92191614 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 13:18:16 +0100 Subject: [PATCH 082/141] drm/i915: Initialise mmaped_count for i915_gem_object_info Reported-by: 0day kbuild test robot Fixes: 2bd160a131ac ("drm/i915: Reduce i915_gem_objects to only show...") Signed-off-by: Chris Wilson Link: http://patchwork.freedesktop.org/patch/msgid/1471263496-27537-1-git-send-email-chris@chris-wilson.co.uk Reviewed-by: Mika Kuoppala --- drivers/gpu/drm/i915/i915_debugfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 1b4de447ae9f..36112282f590 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ 
b/drivers/gpu/drm/i915/i915_debugfs.c @@ -403,7 +403,9 @@ static int i915_gem_object_info(struct seq_file *m, void* data) dev_priv->mm.object_count, dev_priv->mm.object_memory); - size = count = purgeable_size = purgeable_count = 0; + size = count = 0; + mapped_size = mapped_count = 0; + purgeable_size = purgeable_count = 0; list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_list) { size += obj->base.size; ++count; From 1255501d8681775d564de45742c6e82b7782b7f5 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 16 Aug 2016 09:50:40 +0100 Subject: [PATCH 083/141] drm/i915: Embrace the race in busy-ioctl Daniel Vetter proposed a new challenge to the serialisation inside the busy-ioctl that exposed a flaw that could result in us reporting the wrong engine as being busy. If the request is reallocated as we test its busyness and then reassigned to this object by another thread, we would not notice that the test itself was incorrect. We are faced with a choice of using __i915_gem_active_get_request_rcu() to first acquire a reference to the request preventing the race, or to acknowledge the race and accept the limitations upon the accuracy of the busy flags. Note that we guarantee that we never falsely report the object as idle (providing userspace itself doesn't race), and so the most important use of the busy-ioctl and its guarantees are fulfilled. Signed-off-by: Chris Wilson Cc: Daniel Vetter Cc: Joonas Lahtinen Reviewed-by: Joonas Lahtinen Reviewed-by: Mika Kuoppala Reviewed-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1471337440-16777-1-git-send-email-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 114 +++++++++++++++++++------------- include/uapi/drm/i915_drm.h | 16 ++++- 2 files changed, 82 insertions(+), 48 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 7e08c774a1aa..a8d0f70c22f9 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3807,49 +3807,68 @@ static __always_inline unsigned int __busy_set_if_active(const struct i915_gem_active *active, unsigned int (*flag)(unsigned int id)) { - /* For more discussion about the barriers and locking concerns, - * see __i915_gem_active_get_rcu(). + struct drm_i915_gem_request *request; + + request = rcu_dereference(active->request); + if (!request || i915_gem_request_completed(request)) + return 0; + + /* This is racy. See __i915_gem_active_get_rcu() for an in detail + * discussion of how to handle the race correctly, but for reporting + * the busy state we err on the side of potentially reporting the + * wrong engine as being busy (but we guarantee that the result + * is at least self-consistent). + * + * As we use SLAB_DESTROY_BY_RCU, the request may be reallocated + * whilst we are inspecting it, even under the RCU read lock as we are. + * This means that there is a small window for the engine and/or the + * seqno to have been overwritten. The seqno will always be in the + * future compared to the intended, and so we know that if that + * seqno is idle (on whatever engine) our request is idle and the + * return 0 above is correct. + * + * The issue is that if the engine is switched, it is just as likely + * to report that it is busy (but since the switch happened, we know + * the request should be idle). So there is a small chance that a busy + * result is actually the wrong engine. + * + * So why don't we care? + * + * For starters, the busy ioctl is a heuristic that is by definition + * racy. 
Even with perfect serialisation in the driver, the hardware + * state is constantly advancing - the state we report to the user + * is stale. + * + * The critical information for the busy-ioctl is whether the object + * is idle as userspace relies on that to detect whether its next + * access will stall, or if it has missed submitting commands to + * the hardware allowing the GPU to stall. We never generate a + * false-positive for idleness, thus busy-ioctl is reliable at the + * most fundamental level, and we maintain the guarantee that a + * busy object left to itself will eventually become idle (and stay + * idle!). + * + * We allow ourselves the leeway of potentially misreporting the busy + * state because that is an optimisation heuristic that is constantly + * in flux. Being quickly able to detect the busy/idle state is much + * more important than accurate logging of exactly which engines were + * busy. + * + * For accuracy in reporting the engine, we could use + * + * result = 0; + * request = __i915_gem_active_get_rcu(active); + * if (request) { + * if (!i915_gem_request_completed(request)) + * result = flag(request->engine->exec_id); + * i915_gem_request_put(request); + * } + * + * but that still remains susceptible to both hardware and userspace + * races. So we accept making the result of that race slightly worse, + * given the rarity of the race and its low impact on the result. */ - do { - struct drm_i915_gem_request *request; - unsigned int id; - - request = rcu_dereference(active->request); - if (!request || i915_gem_request_completed(request)) - return 0; - - id = request->engine->exec_id; - - /* Check that the pointer wasn't reassigned and overwritten. - * - * In __i915_gem_active_get_rcu(), we enforce ordering between - * the first rcu pointer dereference (imposing a - * read-dependency only on access through the pointer) and - * the second lockless access through the memory barrier - * following a successful atomic_inc_not_zero(). Here there - * is no such barrier, and so we must manually insert an - * explicit read barrier to ensure that the following - * access occurs after all the loads through the first - * pointer. - * - * It is worth comparing this sequence with - * raw_write_seqcount_latch() which operates very similarly. - * The challenge here is the visibility of the other CPU - * writes to the reallocated request vs the local CPU ordering. - * Before the other CPU can overwrite the request, it will - * have updated our active->request and gone through a wmb. - * During the read here, we want to make sure that the values - * we see have not been overwritten as we do so - and we do - * that by serialising the second pointer check with the writes - * on other other CPUs. - * - * The corresponding write barrier is part of - * rcu_assign_pointer(). - */ - smp_rmb(); - if (request == rcu_access_pointer(active->request)) - return flag(id); - } while (1); + return flag(READ_ONCE(request->engine->exec_id)); } static __always_inline unsigned int @@ -3897,11 +3916,12 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data, * retired and freed. We take a local copy of the pointer, * but before we add its engine into the busy set, the other * thread reallocates it and assigns it to a task on another - * engine with a fresh and incomplete seqno. - * - * So after we lookup the engine's id, we double check that - * the active request is the same and only then do we add it - * into the busy set. + * engine with a fresh and incomplete seqno. 
Guarding against + * that requires careful serialisation and reference counting, + * i.e. using __i915_gem_active_get_request_rcu(). We don't, + * instead we expect that if the result is busy, which engines + * are busy is not completely reliable - we only guarantee + * that the object was busy. */ rcu_read_lock(); diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 452629de7a57..5501fe83ed92 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -855,7 +855,16 @@ struct drm_i915_gem_busy { * having flushed any pending activity), and a non-zero return that * the object is still in-flight on the GPU. (The GPU has not yet * signaled completion for all pending requests that reference the - * object.) + * object.) An object is guaranteed to become idle eventually (so + * long as no new GPU commands are executed upon it). Due to the + * asynchronous nature of the hardware, an object reported + * as busy may become idle before the ioctl is completed. + * + * Furthermore, if the object is busy, which engine is busy is only + * provided as a guide. There are race conditions which prevent the + * report of which engines are busy from being always accurate. + * However, the converse is not true. If the object is idle, the + * result of the ioctl, that all engines are idle, is accurate. * * The returned dword is split into two fields to indicate both * the engines on which the object is being read, and the @@ -878,6 +887,11 @@ struct drm_i915_gem_busy { * execution engines, e.g. multiple media engines, which are * mapped to the same identifier in the EXECBUFFER2 ioctl and * so are not separately reported for busyness. + * + * Caveat emptor: + * Only the boolean result of this query is reliable; that is whether + * the object is idle or busy. The report of which engines are busy + * should be only used as a heuristic. 
*/ __u32 busy; }; From ca99d8781fd16edf4c98536a9c18e59a17b06b6c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 15 Aug 2016 19:06:23 +0100 Subject: [PATCH 084/141] drm/i915: Silence GCC warning for cmn_a_well Just make the logic simple enough for even GCC to understand (and foolproof against random changes): drivers/gpu/drm/i915/intel_runtime_pm.c: warning: 'cmn_a_well' may be used uninitialized in this function [-Wuninitialized]: => 871:23 Reported-by: Geert Uytterhoeven Signed-off-by: Chris Wilson Link: http://patchwork.freedesktop.org/patch/msgid/1471284383-22324-1-git-send-email-chris@chris-wilson.co.uk Reviewed-by: Imre Deak --- drivers/gpu/drm/i915/intel_runtime_pm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index d659d6f5b8d3..a1d73c2de332 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -856,7 +856,7 @@ static void bxt_dpio_cmn_power_well_enable(struct drm_i915_private *dev_priv, struct i915_power_well *power_well) { enum skl_disp_power_wells power_well_id = power_well->data; - struct i915_power_well *cmn_a_well; + struct i915_power_well *cmn_a_well = NULL; if (power_well_id == BXT_DPIO_CMN_BC) { /* @@ -869,7 +869,7 @@ static void bxt_dpio_cmn_power_well_enable(struct drm_i915_private *dev_priv, bxt_ddi_phy_init(dev_priv, bxt_power_well_to_phy(power_well)); - if (power_well_id == BXT_DPIO_CMN_BC) + if (cmn_a_well) intel_power_well_put(dev_priv, cmn_a_well); } From 5ec2cf7e34be622968e865fa99f6b9bd4494020d Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 16 Aug 2016 17:04:20 +0100 Subject: [PATCH 085/141] drm/i915: Add enum for hardware engine identifiers Put the engine hardware ids in the common header so that they are not associated only with the GuC, since they are also needed for the legacy semaphore implementation.
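As a sketch of why a stable hardware id is useful (GEN6_SEMAPHORES_MASK is introduced by the following patch; the loop shape matches the driver's existing for_each_engine() iterator):

	/* select engines by hardware id bitmask, not by driver index */
	for_each_engine(engine, dev_priv) {
		if (!(BIT(engine->hw_id) & GEN6_SEMAPHORES_MASK))
			continue;
		/* this engine has a gen6 semaphore mbox */
	}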
Signed-off-by: Tvrtko Ursulin Reviewed-by: Chris Wilson --- drivers/gpu/drm/i915/intel_engine_cs.c | 14 +++++++------- drivers/gpu/drm/i915/intel_ringbuffer.h | 10 ++++++++-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 5ec8a10fd18b..8a27bb9f6bc1 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -29,7 +29,7 @@ static const struct engine_info { const char *name; unsigned exec_id; - unsigned guc_id; + enum intel_engine_hw_id hw_id; u32 mmio_base; unsigned irq_shift; int (*init_legacy)(struct intel_engine_cs *engine); @@ -38,7 +38,7 @@ static const struct engine_info { [RCS] = { .name = "render ring", .exec_id = I915_EXEC_RENDER, - .guc_id = GUC_RENDER_ENGINE, + .hw_id = RCS_HW, .mmio_base = RENDER_RING_BASE, .irq_shift = GEN8_RCS_IRQ_SHIFT, .init_execlists = logical_render_ring_init, @@ -47,7 +47,7 @@ static const struct engine_info { [BCS] = { .name = "blitter ring", .exec_id = I915_EXEC_BLT, - .guc_id = GUC_BLITTER_ENGINE, + .hw_id = BCS_HW, .mmio_base = BLT_RING_BASE, .irq_shift = GEN8_BCS_IRQ_SHIFT, .init_execlists = logical_xcs_ring_init, @@ -56,7 +56,7 @@ static const struct engine_info { [VCS] = { .name = "bsd ring", .exec_id = I915_EXEC_BSD, - .guc_id = GUC_VIDEO_ENGINE, + .hw_id = VCS_HW, .mmio_base = GEN6_BSD_RING_BASE, .irq_shift = GEN8_VCS1_IRQ_SHIFT, .init_execlists = logical_xcs_ring_init, @@ -65,7 +65,7 @@ static const struct engine_info { [VCS2] = { .name = "bsd2 ring", .exec_id = I915_EXEC_BSD, - .guc_id = GUC_VIDEO_ENGINE2, + .hw_id = VCS2_HW, .mmio_base = GEN8_BSD2_RING_BASE, .irq_shift = GEN8_VCS2_IRQ_SHIFT, .init_execlists = logical_xcs_ring_init, @@ -74,7 +74,7 @@ static const struct engine_info { [VECS] = { .name = "video enhancement ring", .exec_id = I915_EXEC_VEBOX, - .guc_id = GUC_VIDEOENHANCE_ENGINE, + .hw_id = VECS_HW, .mmio_base = VEBOX_RING_BASE, .irq_shift = GEN8_VECS_IRQ_SHIFT, .init_execlists = logical_xcs_ring_init, @@ -93,7 +93,7 @@ intel_engine_setup(struct drm_i915_private *dev_priv, engine->i915 = dev_priv; engine->name = info->name; engine->exec_id = info->exec_id; - engine->hw_id = engine->guc_id = info->guc_id; + engine->hw_id = engine->guc_id = info->hw_id; engine->mmio_base = info->mmio_base; engine->irq_shift = info->irq_shift; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index e3777572c70e..9d723c24eeff 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -146,8 +146,14 @@ struct intel_engine_cs { #define I915_NUM_ENGINES 5 #define _VCS(n) (VCS + (n)) unsigned int exec_id; - unsigned int hw_id; - unsigned int guc_id; /* XXX same as hw_id? */ + enum intel_engine_hw_id { + RCS_HW = 0, + VCS_HW, + BCS_HW, + VECS_HW, + VCS2_HW + } hw_id; + enum intel_engine_hw_id guc_id; /* XXX same as hw_id? */ u64 fence_context; u32 mmio_base; unsigned int irq_shift; From 318f89ca205fc2df61954e3b415d93a06691817e Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 16 Aug 2016 17:04:21 +0100 Subject: [PATCH 086/141] drm/i915: Initialize legacy semaphores from engine hw id indexed array Build the legacy semaphore initialisation array using the engine hardware ids instead of driver internal ones. This makes the static array size dependent only on the number of gen6 semaphore engines. Also makes the per-engine semaphore wait and signal tables hardware id indexed saving some more space. 
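In outline, the selection test this enables can be sketched as follows (hypothetical
helper and macro names; BIT() and GENMASK() are the usual kernel macros, and the
real mask is GEN6_SEMAPHORES_MASK in the diff below):

	#define EXAMPLE_SEMAPHORE_LAST	3	/* e.g. the hw id of VECS */
	#define EXAMPLE_SEMAPHORES_MASK	GENMASK(EXAMPLE_SEMAPHORE_LAST, 0)

	static bool example_engine_has_gen6_semaphore(unsigned int hw_id)
	{
		/* only hw ids covered by the gen6 semaphore mask participate */
		return BIT(hw_id) & EXAMPLE_SEMAPHORES_MASK;
	}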
v2: Refactor I915_GEN6_NUM_ENGINES to GEN6_SEMAPHORE_LAST. (Chris Wilson) v3: More polish. (Chris Wilson) Signed-off-by: Tvrtko Ursulin Reviewed-by: Chris Wilson Link: http://patchwork.freedesktop.org/patch/msgid/1471363461-9973-1-git-send-email-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/intel_ringbuffer.c | 55 +++++++++++++------------ drivers/gpu/drm/i915/intel_ringbuffer.h | 7 +++- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index fa22bd87bab0..12703ea27259 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1337,8 +1337,7 @@ static int gen6_signal(struct drm_i915_gem_request *req) { struct intel_ring *ring = req->ring; struct drm_i915_private *dev_priv = req->i915; - struct intel_engine_cs *useless; - enum intel_engine_id id; + struct intel_engine_cs *engine; int ret, num_rings; num_rings = INTEL_INFO(dev_priv)->num_rings; @@ -1346,9 +1345,13 @@ static int gen6_signal(struct drm_i915_gem_request *req) if (ret) return ret; - for_each_engine_id(useless, dev_priv, id) { - i915_reg_t mbox_reg = req->engine->semaphore.mbox.signal[id]; + for_each_engine(engine, dev_priv) { + i915_reg_t mbox_reg; + if (!(BIT(engine->hw_id) & GEN6_SEMAPHORES_MASK)) + continue; + + mbox_reg = req->engine->semaphore.mbox.signal[engine->hw_id]; if (i915_mmio_reg_valid(mbox_reg)) { intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1)); intel_ring_emit_reg(ring, mbox_reg); @@ -1495,7 +1498,7 @@ gen6_ring_sync_to(struct drm_i915_gem_request *req, u32 dw1 = MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER; - u32 wait_mbox = signal->engine->semaphore.mbox.wait[req->engine->id]; + u32 wait_mbox = signal->engine->semaphore.mbox.wait[req->engine->hw_id]; int ret; WARN_ON(wait_mbox == MI_SEMAPHORE_SYNC_INVALID); @@ -2569,41 +2572,41 @@ static void intel_ring_init_semaphores(struct drm_i915_private *dev_priv, * initialized as INVALID. Gen8 will initialize the * sema between VCS2 and RCS later. 
*/ - for (i = 0; i < I915_NUM_ENGINES; i++) { + for (i = 0; i < GEN6_NUM_SEMAPHORES; i++) { static const struct { u32 wait_mbox; i915_reg_t mbox_reg; - } sem_data[I915_NUM_ENGINES][I915_NUM_ENGINES] = { - [RCS] = { - [VCS] = { .wait_mbox = MI_SEMAPHORE_SYNC_RV, .mbox_reg = GEN6_VRSYNC }, - [BCS] = { .wait_mbox = MI_SEMAPHORE_SYNC_RB, .mbox_reg = GEN6_BRSYNC }, - [VECS] = { .wait_mbox = MI_SEMAPHORE_SYNC_RVE, .mbox_reg = GEN6_VERSYNC }, + } sem_data[GEN6_NUM_SEMAPHORES][GEN6_NUM_SEMAPHORES] = { + [RCS_HW] = { + [VCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_RV, .mbox_reg = GEN6_VRSYNC }, + [BCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_RB, .mbox_reg = GEN6_BRSYNC }, + [VECS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_RVE, .mbox_reg = GEN6_VERSYNC }, }, - [VCS] = { - [RCS] = { .wait_mbox = MI_SEMAPHORE_SYNC_VR, .mbox_reg = GEN6_RVSYNC }, - [BCS] = { .wait_mbox = MI_SEMAPHORE_SYNC_VB, .mbox_reg = GEN6_BVSYNC }, - [VECS] = { .wait_mbox = MI_SEMAPHORE_SYNC_VVE, .mbox_reg = GEN6_VEVSYNC }, + [VCS_HW] = { + [RCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_VR, .mbox_reg = GEN6_RVSYNC }, + [BCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_VB, .mbox_reg = GEN6_BVSYNC }, + [VECS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_VVE, .mbox_reg = GEN6_VEVSYNC }, }, - [BCS] = { - [RCS] = { .wait_mbox = MI_SEMAPHORE_SYNC_BR, .mbox_reg = GEN6_RBSYNC }, - [VCS] = { .wait_mbox = MI_SEMAPHORE_SYNC_BV, .mbox_reg = GEN6_VBSYNC }, - [VECS] = { .wait_mbox = MI_SEMAPHORE_SYNC_BVE, .mbox_reg = GEN6_VEBSYNC }, + [BCS_HW] = { + [RCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_BR, .mbox_reg = GEN6_RBSYNC }, + [VCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_BV, .mbox_reg = GEN6_VBSYNC }, + [VECS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_BVE, .mbox_reg = GEN6_VEBSYNC }, }, - [VECS] = { - [RCS] = { .wait_mbox = MI_SEMAPHORE_SYNC_VER, .mbox_reg = GEN6_RVESYNC }, - [VCS] = { .wait_mbox = MI_SEMAPHORE_SYNC_VEV, .mbox_reg = GEN6_VVESYNC }, - [BCS] = { .wait_mbox = MI_SEMAPHORE_SYNC_VEB, .mbox_reg = GEN6_BVESYNC }, + [VECS_HW] = { + [RCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_VER, .mbox_reg = GEN6_RVESYNC }, + [VCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_VEV, .mbox_reg = GEN6_VVESYNC }, + [BCS_HW] = { .wait_mbox = MI_SEMAPHORE_SYNC_VEB, .mbox_reg = GEN6_BVESYNC }, }, }; u32 wait_mbox; i915_reg_t mbox_reg; - if (i == engine->id || i == VCS2) { + if (i == engine->hw_id) { wait_mbox = MI_SEMAPHORE_SYNC_INVALID; mbox_reg = GEN6_NOSYNC; } else { - wait_mbox = sem_data[engine->id][i].wait_mbox; - mbox_reg = sem_data[engine->id][i].mbox_reg; + wait_mbox = sem_data[engine->hw_id][i].wait_mbox; + mbox_reg = sem_data[engine->hw_id][i].mbox_reg; } engine->semaphore.mbox.wait[i] = wait_mbox; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 9d723c24eeff..86612d502c65 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -278,11 +278,14 @@ struct intel_engine_cs { u32 sync_seqno[I915_NUM_ENGINES-1]; union { +#define GEN6_SEMAPHORE_LAST VECS_HW +#define GEN6_NUM_SEMAPHORES (GEN6_SEMAPHORE_LAST + 1) +#define GEN6_SEMAPHORES_MASK GENMASK(GEN6_SEMAPHORE_LAST, 0) struct { /* our mbox written by others */ - u32 wait[I915_NUM_ENGINES]; + u32 wait[GEN6_NUM_SEMAPHORES]; /* mboxes this ring signals to */ - i915_reg_t signal[I915_NUM_ENGINES]; + i915_reg_t signal[GEN6_NUM_SEMAPHORES]; } mbox; u64 signal_ggtt[I915_NUM_ENGINES]; }; From 21aea5cc06667b7e34cf7bd3fc17a1fb2937baa9 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 17 Aug 2016 12:09:05 +0100 Subject: [PATCH 087/141] drm/i915: Mark 
the static key for movntdqa as static

Silence sparse who warns that the global variable is not declared
static.

Fixes: 0b1de5d58e19 ("drm/i915: Use SSE4.1 movntdqa to ...")
Signed-off-by: Chris Wilson
Cc: Akash Goel
Cc: Damien Lespiau
Cc: Mika Kuoppala
Cc: Tvrtko Ursulin
Link: http://patchwork.freedesktop.org/patch/msgid/1471432146-5196-1-git-send-email-chris@chris-wilson.co.uk
Reviewed-by: Tvrtko Ursulin
---
 drivers/gpu/drm/i915/i915_memcpy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
index 6f1df0ec8a81..49a079494b68 100644
--- a/drivers/gpu/drm/i915/i915_memcpy.c
+++ b/drivers/gpu/drm/i915/i915_memcpy.c
@@ -27,7 +27,7 @@
 
 #include "i915_drv.h"
 
-DEFINE_STATIC_KEY_FALSE(has_movntdqa);
+static DEFINE_STATIC_KEY_FALSE(has_movntdqa);
 
 #ifdef CONFIG_AS_MOVNTDQA
 static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)

From 24808e96792a860f3e83e2eb69c5190261716924 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Wed, 17 Aug 2016 12:09:06 +0100
Subject: [PATCH 088/141] drm/i915: Mark i915_hpd_poll_init_work as static
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Local function with forgotten static declaration.

Fixes: 19625e85c6ec ("drm/i915: Enable polling when we don't have hpd")
Signed-off-by: Chris Wilson
Cc: Ville Syrjälä
Cc: Daniel Vetter
Cc: Lyude
Link: http://patchwork.freedesktop.org/patch/msgid/1471432146-5196-2-git-send-email-chris@chris-wilson.co.uk
Reviewed-by: Tvrtko Ursulin
---
 drivers/gpu/drm/i915/intel_hotplug.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_hotplug.c b/drivers/gpu/drm/i915/intel_hotplug.c
index 5dc2c20f6ca1..334d47b5811a 100644
--- a/drivers/gpu/drm/i915/intel_hotplug.c
+++ b/drivers/gpu/drm/i915/intel_hotplug.c
@@ -477,7 +477,8 @@ void intel_hpd_init(struct drm_i915_private *dev_priv)
 	spin_unlock_irq(&dev_priv->irq_lock);
 }
 
-void i915_hpd_poll_init_work(struct work_struct *work) {
+static void i915_hpd_poll_init_work(struct work_struct *work)
+{
 	struct drm_i915_private *dev_priv =
 		container_of(work, struct drm_i915_private,
 			     hotplug.poll_init_work);

From dbaf788e710d1f65340c5f46a706fee5b634493c Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Wed, 17 Aug 2016 11:33:33 +0100
Subject: [PATCH 089/141] drm/i915: Remember to set vma->pages for the preallocated stolen object

In commit 247177ddd517 ("drm/i915: Always set the vma->pages"), as its
title implies, we always set vma->pages for bound objects. Even before
that, we would set vma->ggtt_view.pages for globally bound objects.
This was forgotten for the fixup inside the preallocated stolen objects, which has to recreate a global GTT binding outside of the usual VMA insertion path Fixes: 247177ddd517 ("drm/i915: Always set the vma->pages") Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/1471430013-3449-1-git-send-email-chris@chris-wilson.co.uk Reviewed-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_gem_stolen.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c index 9a9b3cfbac7f..aa050fa1e558 100644 --- a/drivers/gpu/drm/i915/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/i915_gem_stolen.c @@ -716,6 +716,7 @@ i915_gem_object_create_stolen_for_preallocated(struct drm_device *dev, goto err; } + vma->pages = obj->pages; vma->flags |= I915_VMA_GLOBAL_BIND; __i915_vma_set_map_and_fenceable(vma); list_move_tail(&vma->vm_link, &ggtt->base.inactive_list); From ceae5317f608aaabf3e03377e061e44c7345cc6a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 17 Aug 2016 13:42:42 +0100 Subject: [PATCH 090/141] drm/i915: Add missing kerneldoc for guc_client_alloc:engines Brief parameter text to describe @engines. Signed-off-by: Chris Wilson Cc: Dave Gordon Cc: Tvrtko Ursulin Link: http://patchwork.freedesktop.org/patch/msgid/1471437762-22936-1-git-send-email-chris@chris-wilson.co.uk Reviewed-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_guc_submission.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index bb4079223e39..e4369411f07d 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -744,6 +744,7 @@ static void guc_init_doorbell_hw(struct intel_guc *guc) /** * guc_client_alloc() - Allocate an i915_guc_client * @dev_priv: driver private data structure + * @engines: The set of engines to enable for this client * @priority: four levels priority _CRITICAL, _HIGH, _NORMAL and _LOW * The kernel client to replace ExecList submission is created with * NORMAL priority. Priority of a client for scheduler can be HIGH, From 1b54a880b250acc226b13cea221b90aa1b3e37dd Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Fri, 17 Jun 2016 13:42:18 -0700 Subject: [PATCH 091/141] drm/i915/gen9: Initialize intel_state->active_crtcs during WM sanitization (v2) intel_state->active_crtcs is usually only initialized when doing a modeset. During our first atomic commit after boot, we're effectively faking a modeset to sanitize the DDB/wm setup, so ensure that this field gets initialized before use. 
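Condensed, the rule added below is (names as in the diff; a sketch of the control
flow, not the full change):

	if (dev_priv->wm.distrust_bios_wm) {
		intel_state->active_pipe_changes = ~0;
		/* a real modeset computes active_crtcs itself; only seed it
		 * from the current hardware state when we are faking one */
		if (!intel_state->modeset)
			intel_state->active_crtcs = dev_priv->active_crtcs;
	}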
v2: - Don't clobber active_crtcs if our first commit really is a modeset (Maarten) - Grab connection_mutex when faking a modeset during sanitization (Maarten) Reported-by: Tvrtko Ursulin Cc: Tvrtko Ursulin Cc: Maarten Lankhorst Signed-off-by: Matt Roper Tested-by: Tvrtko Ursulin Signed-off-by: Maarten Lankhorst Link: http://patchwork.freedesktop.org/patch/msgid/1466196140-16336-2-git-send-email-matthew.d.roper@intel.com Cc: stable@vger.kernel.org #v4.7+ --- drivers/gpu/drm/i915/intel_pm.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 99014d7821c2..d74c26f1cf77 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -3920,9 +3920,24 @@ skl_compute_ddb(struct drm_atomic_state *state) * pretend that all pipes switched active status so that we'll * ensure a full DDB recompute. */ - if (dev_priv->wm.distrust_bios_wm) + if (dev_priv->wm.distrust_bios_wm) { + ret = drm_modeset_lock(&dev->mode_config.connection_mutex, + state->acquire_ctx); + if (ret) + return ret; + intel_state->active_pipe_changes = ~0; + /* + * We usually only initialize intel_state->active_crtcs if we + * we're doing a modeset; make sure this field is always + * initialized during the sanitization process that happens + * on the first commit too. + */ + if (!intel_state->modeset) + intel_state->active_crtcs = dev_priv->active_crtcs; + } + /* * If the modeset changes which CRTC's are active, we need to * recompute the DDB allocation for *all* active pipes, even From 43aa7e87507f519b0b2497b6fac1e894554eaef2 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Fri, 17 Jun 2016 13:42:20 -0700 Subject: [PATCH 092/141] drm/i915/gen9: Drop invalid WARN() during data rate calculation It's possible to have a non-zero plane mask and still wind up with a total data rate of zero. There are two cases where this can happen: * planes are active (from the KMS point of view), but are all fully clipped (positioned offscreen) * the only active plane on a CRTC is the cursor (which is handled independently and not counted into the general data rate computations These are both valid display setups (although unusual), so we need to drop the WARN(). Signed-off-by: Matt Roper Reviewed-by: Maarten Lankhorst Testcase: kms_universal_planes.cursor-only-pipe-* Signed-off-by: Maarten Lankhorst Link: http://patchwork.freedesktop.org/patch/msgid/1466196140-16336-4-git-send-email-matthew.d.roper@intel.com Cc: stable@vger.kernel.org #v4.7+ --- drivers/gpu/drm/i915/intel_pm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index d74c26f1cf77..8a6751e14ab9 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -3115,8 +3115,6 @@ skl_get_total_relative_data_rate(struct intel_crtc_state *intel_cstate) total_data_rate += intel_cstate->wm.skl.plane_y_data_rate[id]; } - WARN_ON(cstate->plane_mask && total_data_rate == 0); - return total_data_rate; } From 600f436801deae65e48404847b61c89b4944e355 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:40 +0100 Subject: [PATCH 093/141] drm/i915: Unconditionally flush any chipset buffers before execbuf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If userspace is asynchronously streaming into the batch or other execobjects, we may not flush those writes along with a change in cache domain (as there is no change). 
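Schematically, the scenario from userspace looks like this (igt-style pseudo-code,
for illustration only):

	uint32_t *batch = gem_mmap__wc(fd, handle, 0, size, PROT_WRITE);

	batch[i] = cmd;			/* streamed write, no domain change */
	gem_execbuf(fd, &execbuf);	/* so no flush is emitted today */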
Therefore those writes may end up in internal chipset buffers and not visible to the GPU upon execution. We must issue a flush command or otherwise we encounter incoherency in the batchbuffers and the GPU executing invalid commands (i.e. hanging) quite regularly. v2: Throw a paranoid wmb() into the general flush so that we remain consistent with before. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=90841 Fixes: 1816f9236303 ("drm/i915: Support creation of unbound wc user...") Signed-off-by: Chris Wilson Cc: Akash Goel Cc: Daniel Vetter Cc: Tvrtko Ursulin Tested-by: Matti Hämäläinen Cc: stable@vger.kernel.org Reviewed-by: Mika Kuoppala Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/i915_gem_execbuffer.c | 13 +++---------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 35caa9b2f36a..a08d2595cb6d 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3420,6 +3420,7 @@ int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle); /* belongs in i915_gem_gtt.h */ static inline void i915_gem_chipset_flush(struct drm_i915_private *dev_priv) { + wmb(); if (INTEL_GEN(dev_priv) < 6) intel_gtt_chipset_flush(); } diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 699315304748..bce587abc601 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1015,8 +1015,6 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req, { const unsigned int other_rings = eb_other_engines(req); struct i915_vma *vma; - uint32_t flush_domains = 0; - bool flush_chipset = false; int ret; list_for_each_entry(vma, vmas, exec_list) { @@ -1029,16 +1027,11 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req, } if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) - flush_chipset |= i915_gem_clflush_object(obj, false); - - flush_domains |= obj->base.write_domain; + i915_gem_clflush_object(obj, false); } - if (flush_chipset) - i915_gem_chipset_flush(req->engine->i915); - - if (flush_domains & I915_GEM_DOMAIN_GTT) - wmb(); + /* Unconditionally flush any chipset caches (for streaming writes). */ + i915_gem_chipset_flush(req->engine->i915); /* Unconditionally invalidate GPU caches and TLBs. */ return req->engine->emit_flush(req, EMIT_INVALIDATE); From 3497971a71d8b15a41b7bf2bf66ebf5909b2bd3f Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:41 +0100 Subject: [PATCH 094/141] agp/intel: Flush chipset writes after updating a single PTE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After we update one PTE for a page, the caller expects to be able to immediately use that through a GGTT read/write. To comply with the callers expectations we therefore need to flush the chipset buffers before returning. 
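From the caller's side the expectation being repaired is roughly (illustrative only;
gtt_mmio is a hypothetical ioremap of the aperture):

	u32 value;

	intel_gtt_insert_page(addr, pg, flags);
	/* without the chipset flush added below, this immediate read
	 * through the GGTT could see stale data */
	value = readl(gtt_mmio + pg * PAGE_SIZE);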
Reported-by: Matti Hämäläinen
Fixes: d6473f566417 ("drm/i915: Add support for mapping an object page...")
Signed-off-by: Chris Wilson
Cc: Ankitprasad Sharma
Cc: Tvrtko Ursulin
Tested-by: Matti Hämäläinen
Cc: drm-intel-fixes@lists.freedesktop.org
Reviewed-by: Mika Kuoppala
Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-2-chris@chris-wilson.co.uk
---
 drivers/char/agp/intel-gtt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/char/agp/intel-gtt.c b/drivers/char/agp/intel-gtt.c
index 44311296ec02..0f7d28a98b9a 100644
--- a/drivers/char/agp/intel-gtt.c
+++ b/drivers/char/agp/intel-gtt.c
@@ -845,6 +845,8 @@ void intel_gtt_insert_page(dma_addr_t addr,
 			   unsigned int pg,
 			   unsigned int flags)
 {
 	intel_private.driver->write_entry(addr, pg, flags);
+	if (intel_private.driver->chipset_flush)
+		intel_private.driver->chipset_flush();
 }
 EXPORT_SYMBOL(intel_gtt_insert_page);

From 4b30cb23343e98f68ef0917e20fbe121aea78dcf Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Thu, 18 Aug 2016 17:16:42 +0100
Subject: [PATCH 095/141] drm/i915: vfree() no longer ignores the low bits of the address

Since vfree() now likes to WARN when passed a non-page-aligned pointer,
we need to discard the low bits to comply with it.

Fixes: d31d7cb1460c ("drm/i915: Support for creating write combined type vmaps")
Signed-off-by: Chris Wilson
Cc: Joonas Lahtinen
Cc: Tvrtko Ursulin
Reviewed-by: Tvrtko Ursulin
Reviewed-by: Joonas Lahtinen
Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-3-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_drv.h | 5 +++++
 drivers/gpu/drm/i915/i915_gem.c | 11 +++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a08d2595cb6d..5b778ceba82e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3897,6 +3897,11 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
 void i915_memcpy_init_early(struct drm_i915_private *dev_priv);
 bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
 
+#define ptr_mask_bits(ptr) ({ \
+	unsigned long __v = (unsigned long)(ptr); \
+	(typeof(ptr))(__v & PAGE_MASK); \
+})
+
 #define ptr_unpack_bits(ptr, bits) ({ \
 	unsigned long __v = (unsigned long)(ptr); \
 	(bits) = __v & ~PAGE_MASK; \
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index a8d0f70c22f9..cffa57b14246 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2094,11 +2094,14 @@ i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
 	list_del(&obj->global_list);
 
 	if (obj->mapping) {
-		/* low bits are ignored by is_vmalloc_addr and kmap_to_page */
-		if (is_vmalloc_addr(obj->mapping))
-			vunmap(obj->mapping);
+		void *ptr;
+
+		ptr = ptr_mask_bits(obj->mapping);
+		if (is_vmalloc_addr(ptr))
+			vunmap(ptr);
 		else
-			kunmap(kmap_to_page(obj->mapping));
+			kunmap(kmap_to_page(ptr));
+
 		obj->mapping = NULL;
 	}

From b19482d7ce505776af783dd75b07e77f2bf3bd89 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Thu, 18 Aug 2016 17:16:43 +0100
Subject: [PATCH 096/141] drm/i915: Use ORIGIN_CPU for fb invalidation from pwrite

As pwrite does not use the fence for its GTT access, and may even go
through a secondary interface avoiding the main VMA, we cannot treat
the write as automatically invalidated by the hardware and so we
require ORIGIN_CPU frontbuffer invalidate/flushes.
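The tracking contract for such an untracked write then becomes, in outline
(functions as used in the diff below; the middle step is elided to a comment):

	intel_fb_obj_invalidate(obj, ORIGIN_CPU);	/* before writing */
	/* ... pwrite the data through the aperture or a temporary node ... */
	intel_fb_obj_flush(obj, false, ORIGIN_CPU);	/* after, so FBC/PSR
							 * and friends resample */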
Signed-off-by: Chris Wilson
Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-4-chris@chris-wilson.co.uk
Reviewed-by: Daniel Vetter
---
 drivers/gpu/drm/i915/i915_gem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index cffa57b14246..9310dda7fca1 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1082,7 +1082,7 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_private *i915,
 	if (ret)
 		goto out_unpin;
 
-	intel_fb_obj_invalidate(obj, ORIGIN_GTT);
+	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 	obj->dirty = true;
 
 	user_data = u64_to_user_ptr(args->data_ptr);
@@ -1149,7 +1149,7 @@ out_flush:
 	}
 	}
 
-	intel_fb_obj_flush(obj, false, ORIGIN_GTT);
+	intel_fb_obj_flush(obj, false, ORIGIN_CPU);
 out_unpin:
 	if (node.allocated) {
 		wmb();

From d243ad820295d1c499f9256f8ef04d1d36e74e34 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Thu, 18 Aug 2016 17:16:44 +0100
Subject: [PATCH 097/141] drm/i915: Mark up the GTT flush following WC writes as ORIGIN_CPU

Similarly to invalidating beforehand, if the object is mmapped via
I915_MMAP_WC we cannot track writes through the I915_GEM_DOMAIN_GTT.
At the conclusion of the write, in
i915_gem_object_flush_gtt_write_domain(), we also need to treat the
origin carefully in case it may have been untracked.

See also commit aeecc9696aa0 ("drm/i915: use ORIGIN_CPU for frontbuffer
invalidation on WC mmaps").

Signed-off-by: Chris Wilson
Cc: Daniel Vetter
Cc: Paulo Zanoni
Reviewed-by: Paulo Zanoni
Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-5-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 9310dda7fca1..5adbf8c3b81c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1450,7 +1450,7 @@ err:
 	return ret;
 }
 
-static enum fb_op_origin
+static inline enum fb_op_origin
 write_origin(struct drm_i915_gem_object *obj, unsigned domain)
 {
 	return domain == I915_GEM_DOMAIN_GTT && !obj->has_wc_mmap ?
@@ -3155,7 +3155,7 @@ i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj)
 	old_write_domain = obj->base.write_domain;
 	obj->base.write_domain = 0;
 
-	intel_fb_obj_flush(obj, false, ORIGIN_GTT);
+	intel_fb_obj_flush(obj, false, write_origin(obj, I915_GEM_DOMAIN_GTT));
 
 	trace_i915_gem_object_change_domain(obj,
 					    obj->base.read_domains,

From 18034584784168c73bc8dd57104ea48c42f38527 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Thu, 18 Aug 2016 17:16:45 +0100
Subject: [PATCH 098/141] drm/i915: Fallback to single page pwrite/pread if unable to release fence

If we cannot release the fence (for example if someone is inexplicably
trying to write into a tiled framebuffer that is currently pinned to
the display! *cough* kms_frontbuffer_tracking *cough*) fall back to
using the page-by-page pwrite/pread interface, rather than fail the
syscall entirely.

Since this is triggerable by the user (along the pwrite path) we have
to remove the WARN_ON(fence->pin_count).
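The resulting control flow, condensed from the diff below, is: keep the fast path
only if the fence can actually be dropped, otherwise convert the pinned vma into an
error so the per-page route runs.

	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE);
	if (!IS_ERR(vma)) {
		node.start = i915_ggtt_offset(vma);
		node.allocated = false;
		ret = i915_gem_object_put_fence(obj);
		if (ret) {
			i915_vma_unpin(vma);
			vma = ERR_PTR(ret);	/* take the fallback below */
		}
	}
	if (IS_ERR(vma)) {
		/* fall back: map one page at a time via a temporary node */
		ret = insert_mappable_node(dev_priv, &node, PAGE_SIZE);
	}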
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-6-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 30 ++++++++++++++++----------- drivers/gpu/drm/i915/i915_gem_fence.c | 2 +- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 5adbf8c3b81c..a609522221ed 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -754,6 +754,15 @@ i915_gem_gtt_pread(struct drm_device *dev, int ret; vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE); + if (!IS_ERR(vma)) { + node.start = i915_ggtt_offset(vma); + node.allocated = false; + ret = i915_gem_object_put_fence(obj); + if (ret) { + i915_vma_unpin(vma); + vma = ERR_PTR(ret); + } + } if (IS_ERR(vma)) { ret = insert_mappable_node(dev_priv, &node, PAGE_SIZE); if (ret) @@ -766,12 +775,6 @@ i915_gem_gtt_pread(struct drm_device *dev, } i915_gem_object_pin_pages(obj); - } else { - node.start = i915_ggtt_offset(vma); - node.allocated = false; - ret = i915_gem_object_put_fence(obj); - if (ret) - goto out_unpin; } ret = i915_gem_object_set_to_gtt_domain(obj, false); @@ -1058,6 +1061,15 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_private *i915, vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE | PIN_NONBLOCK); + if (!IS_ERR(vma)) { + node.start = i915_ggtt_offset(vma); + node.allocated = false; + ret = i915_gem_object_put_fence(obj); + if (ret) { + i915_vma_unpin(vma); + vma = ERR_PTR(ret); + } + } if (IS_ERR(vma)) { ret = insert_mappable_node(i915, &node, PAGE_SIZE); if (ret) @@ -1070,12 +1082,6 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_private *i915, } i915_gem_object_pin_pages(obj); - } else { - node.start = i915_ggtt_offset(vma); - node.allocated = false; - ret = i915_gem_object_put_fence(obj); - if (ret) - goto out_unpin; } ret = i915_gem_object_set_to_gtt_domain(obj, true); diff --git a/drivers/gpu/drm/i915/i915_gem_fence.c b/drivers/gpu/drm/i915/i915_gem_fence.c index 334c3c4e8357..b0c6c2777725 100644 --- a/drivers/gpu/drm/i915/i915_gem_fence.c +++ b/drivers/gpu/drm/i915/i915_gem_fence.c @@ -298,7 +298,7 @@ i915_gem_object_put_fence(struct drm_i915_gem_object *obj) fence = &dev_priv->fence_regs[obj->fence_reg]; - if (WARN_ON(fence->pin_count)) + if (fence->pin_count) return -EBUSY; i915_gem_object_fence_lost(obj); From 31a39207f04a37324d70ff5665fd19d3a75b39c1 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:46 +0100 Subject: [PATCH 099/141] drm/i915: Cache kmap between relocations When doing relocations, we have to obtain a mapping to the page containing the target address. This is either a kmap or iomap depending on GPU and its cache coherency. Neighbouring relocation entries are typically within the same page and so we can cache our kmapping between them and avoid those pesky TLB flushes. Note that there is some sleight-of-hand in how the slow relocate works as the reloc_entry_cache implies pagefaults disabled (as we are inside a kmap_atomic section). However, the slow relocate code is meant to be the fallback from the atomic fast path failing. Fortunately it works as we already have performed the copy_from_user for the relocation array (no more pagefaults there) and the kmap_atomic cache is enabled after we have waited upon an active buffer (so no more sleeping in atomic). Magic! 
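Stripped of the driver details, the caching pattern is simply (a hypothetical
stand-alone sketch, not the code below):

	struct kmap_cache {
		void *vaddr;
		unsigned long pageno;
	};

	static void *cache_kmap(struct kmap_cache *c,
				struct page *page, unsigned long pageno)
	{
		if (c->vaddr && c->pageno == pageno)
			return c->vaddr;	/* same page: reuse, no TLB flush */

		if (c->vaddr)
			kunmap_atomic(c->vaddr);

		c->vaddr = kmap_atomic(page);
		c->pageno = pageno;
		return c->vaddr;
	}

followed by one final kunmap_atomic() when the relocation loop finishes.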
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-7-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 165 ++++++++++++++------- 1 file changed, 111 insertions(+), 54 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index bce587abc601..4cef9e9e05c5 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -308,9 +308,55 @@ relocation_target(struct drm_i915_gem_relocation_entry *reloc, return gen8_canonical_addr((int)reloc->delta + target_offset); } +struct reloc_cache { + void *vaddr; + unsigned int page; + enum { KMAP, IOMAP } type; +}; + +static void reloc_cache_init(struct reloc_cache *cache) +{ + cache->page = -1; + cache->vaddr = NULL; +} + +static void reloc_cache_fini(struct reloc_cache *cache) +{ + if (!cache->vaddr) + return; + + switch (cache->type) { + case KMAP: + kunmap_atomic(cache->vaddr); + break; + + case IOMAP: + io_mapping_unmap_atomic(cache->vaddr); + break; + } +} + +static void *reloc_kmap(struct drm_i915_gem_object *obj, + struct reloc_cache *cache, + int page) +{ + if (cache->page == page) + return cache->vaddr; + + if (cache->vaddr) + kunmap_atomic(cache->vaddr); + + cache->page = page; + cache->vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page)); + cache->type = KMAP; + + return cache->vaddr; +} + static int relocate_entry_cpu(struct drm_i915_gem_object *obj, struct drm_i915_gem_relocation_entry *reloc, + struct reloc_cache *cache, uint64_t target_offset) { struct drm_device *dev = obj->base.dev; @@ -323,34 +369,47 @@ relocate_entry_cpu(struct drm_i915_gem_object *obj, if (ret) return ret; - vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, - reloc->offset >> PAGE_SHIFT)); + vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT); *(uint32_t *)(vaddr + page_offset) = lower_32_bits(delta); - if (INTEL_INFO(dev)->gen >= 8) { - page_offset = offset_in_page(page_offset + sizeof(uint32_t)); - - if (page_offset == 0) { - kunmap_atomic(vaddr); - vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, - (reloc->offset + sizeof(uint32_t)) >> PAGE_SHIFT)); + if (INTEL_GEN(dev) >= 8) { + page_offset += sizeof(uint32_t); + if (page_offset == PAGE_SIZE) { + vaddr = reloc_kmap(obj, cache, cache->page + 1); + page_offset = 0; } - *(uint32_t *)(vaddr + page_offset) = upper_32_bits(delta); } - kunmap_atomic(vaddr); - return 0; } +static void *reloc_iomap(struct drm_i915_private *i915, + struct reloc_cache *cache, + uint64_t offset) +{ + if (cache->page == offset >> PAGE_SHIFT) + return cache->vaddr; + + if (cache->vaddr) + io_mapping_unmap_atomic(cache->vaddr); + + cache->page = offset >> PAGE_SHIFT; + cache->vaddr = + io_mapping_map_atomic_wc(i915->ggtt.mappable, + offset & PAGE_MASK); + cache->type = IOMAP; + + return cache->vaddr; +} + static int relocate_entry_gtt(struct drm_i915_gem_object *obj, struct drm_i915_gem_relocation_entry *reloc, + struct reloc_cache *cache, uint64_t target_offset) { struct drm_i915_private *dev_priv = to_i915(obj->base.dev); - struct i915_ggtt *ggtt = &dev_priv->ggtt; struct i915_vma *vma; uint64_t delta = relocation_target(reloc, target_offset); uint64_t offset; @@ -371,28 +430,19 @@ relocate_entry_gtt(struct drm_i915_gem_object *obj, /* Map the page containing the relocation we're going to perform. 
*/ offset = vma->node.start + reloc->offset; - reloc_page = io_mapping_map_atomic_wc(ggtt->mappable, - offset & PAGE_MASK); + reloc_page = reloc_iomap(dev_priv, cache, offset); iowrite32(lower_32_bits(delta), reloc_page + offset_in_page(offset)); if (INTEL_GEN(dev_priv) >= 8) { offset += sizeof(uint32_t); - - if (offset_in_page(offset) == 0) { - io_mapping_unmap_atomic(reloc_page); - reloc_page = - io_mapping_map_atomic_wc(ggtt->mappable, - offset); - } - + if (offset_in_page(offset) == 0) + reloc_page = reloc_iomap(dev_priv, cache, offset); iowrite32(upper_32_bits(delta), reloc_page + offset_in_page(offset)); } - io_mapping_unmap_atomic(reloc_page); - unpin: - i915_vma_unpin(vma); + __i915_vma_unpin(vma); return ret; } @@ -408,6 +458,7 @@ clflush_write32(void *addr, uint32_t value) static int relocate_entry_clflush(struct drm_i915_gem_object *obj, struct drm_i915_gem_relocation_entry *reloc, + struct reloc_cache *cache, uint64_t target_offset) { struct drm_device *dev = obj->base.dev; @@ -420,24 +471,18 @@ relocate_entry_clflush(struct drm_i915_gem_object *obj, if (ret) return ret; - vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, - reloc->offset >> PAGE_SHIFT)); + vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT); clflush_write32(vaddr + page_offset, lower_32_bits(delta)); - if (INTEL_INFO(dev)->gen >= 8) { - page_offset = offset_in_page(page_offset + sizeof(uint32_t)); - - if (page_offset == 0) { - kunmap_atomic(vaddr); - vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, - (reloc->offset + sizeof(uint32_t)) >> PAGE_SHIFT)); + if (INTEL_GEN(dev) >= 8) { + page_offset += sizeof(uint32_t); + if (page_offset == PAGE_SIZE) { + vaddr = reloc_kmap(obj, cache, cache->page + 1); + page_offset = 0; } - clflush_write32(vaddr + page_offset, upper_32_bits(delta)); } - kunmap_atomic(vaddr); - return 0; } @@ -458,7 +503,8 @@ static bool object_is_idle(struct drm_i915_gem_object *obj) static int i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj, struct eb_vmas *eb, - struct drm_i915_gem_relocation_entry *reloc) + struct drm_i915_gem_relocation_entry *reloc, + struct reloc_cache *cache) { struct drm_device *dev = obj->base.dev; struct drm_gem_object *target_obj; @@ -542,11 +588,11 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj, return -EFAULT; if (use_cpu_reloc(obj)) - ret = relocate_entry_cpu(obj, reloc, target_offset); + ret = relocate_entry_cpu(obj, reloc, cache, target_offset); else if (obj->map_and_fenceable) - ret = relocate_entry_gtt(obj, reloc, target_offset); + ret = relocate_entry_gtt(obj, reloc, cache, target_offset); else if (static_cpu_has(X86_FEATURE_CLFLUSH)) - ret = relocate_entry_clflush(obj, reloc, target_offset); + ret = relocate_entry_clflush(obj, reloc, cache, target_offset); else { WARN_ONCE(1, "Impossible case in relocation handling\n"); ret = -ENODEV; @@ -569,9 +615,11 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma, struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)]; struct drm_i915_gem_relocation_entry __user *user_relocs; struct drm_i915_gem_exec_object2 *entry = vma->exec_entry; - int remain, ret; + struct reloc_cache cache; + int remain, ret = 0; user_relocs = u64_to_user_ptr(entry->relocs_ptr); + reloc_cache_init(&cache); remain = entry->relocation_count; while (remain) { @@ -581,19 +629,23 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma, count = ARRAY_SIZE(stack_reloc); remain -= count; - if (__copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0]))) - return -EFAULT; + if 
(__copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0]))) { + ret = -EFAULT; + goto out; + } do { u64 offset = r->presumed_offset; - ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r); + ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r, &cache); if (ret) - return ret; + goto out; if (r->presumed_offset != offset && - __put_user(r->presumed_offset, &user_relocs->presumed_offset)) { - return -EFAULT; + __put_user(r->presumed_offset, + &user_relocs->presumed_offset)) { + ret = -EFAULT; + goto out; } user_relocs++; @@ -601,7 +653,9 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma, } while (--count); } - return 0; +out: + reloc_cache_fini(&cache); + return ret; #undef N_RELOC } @@ -611,15 +665,18 @@ i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma, struct drm_i915_gem_relocation_entry *relocs) { const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry; - int i, ret; + struct reloc_cache cache; + int i, ret = 0; + reloc_cache_init(&cache); for (i = 0; i < entry->relocation_count; i++) { - ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i]); + ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], &cache); if (ret) - return ret; + break; } + reloc_cache_fini(&cache); - return 0; + return ret; } static int From 43394c7d0d36716edd92e482ec68bec6b82f2982 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:47 +0100 Subject: [PATCH 100/141] drm/i915: Extract i915_gem_obj_prepare_shmem_write() This is a companion to i915_gem_obj_prepare_shmem_read() that prepares the backing storage for direct writes. It first serialises with the GPU, pins the backing storage and then indicates what clfushes are required in order for the writes to be coherent. Whilst here, fix support for ancient CPUs without clflush for which we cannot do the GTT+clflush tricks. v2: Add i915_gem_obj_finish_shmem_access() for symmetry Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-8-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_cmd_parser.c | 4 +- drivers/gpu/drm/i915/i915_drv.h | 17 ++- drivers/gpu/drm/i915/i915_gem.c | 150 +++++++++++++++---------- 3 files changed, 104 insertions(+), 67 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 1db829c8b912..e586e15e172f 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -973,7 +973,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dest_obj, u32 batch_start_offset, u32 batch_len) { - int needs_clflush = 0; + unsigned int needs_clflush; void *src_base, *src; void *dst = NULL; int ret; @@ -1020,7 +1020,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dest_obj, unmap_src: vunmap(src_base); unpin_src: - i915_gem_object_unpin_pages(src_obj); + i915_gem_obj_finish_shmem_access(src_obj); return ret ? 
ERR_PTR(ret) : dst; } diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 5b778ceba82e..91861a08787c 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3098,9 +3098,6 @@ int i915_gem_object_put_pages(struct drm_i915_gem_object *obj); void i915_gem_release_all_mmaps(struct drm_i915_private *dev_priv); void i915_gem_release_mmap(struct drm_i915_gem_object *obj); -int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj, - int *needs_clflush); - int __must_check i915_gem_object_get_pages(struct drm_i915_gem_object *obj); static inline int __sg_page_count(struct scatterlist *sg) @@ -3201,6 +3198,20 @@ static inline void i915_gem_object_unpin_map(struct drm_i915_gem_object *obj) i915_gem_object_unpin_pages(obj); } +int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj, + unsigned int *needs_clflush); +int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj, + unsigned int *needs_clflush); +#define CLFLUSH_BEFORE 0x1 +#define CLFLUSH_AFTER 0x2 +#define CLFLUSH_FLAGS (CLFLUSH_BEFORE | CLFLUSH_AFTER) + +static inline void +i915_gem_obj_finish_shmem_access(struct drm_i915_gem_object *obj) +{ + i915_gem_object_unpin_pages(obj); +} + int __must_check i915_mutex_lock_interruptible(struct drm_device *dev); int i915_gem_object_sync(struct drm_i915_gem_object *obj, struct drm_i915_gem_request *to); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index a609522221ed..f27c340bb8ee 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -609,27 +609,27 @@ __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset, * flush the object from the CPU cache. */ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj, - int *needs_clflush) + unsigned int *needs_clflush) { int ret; *needs_clflush = 0; - if (WARN_ON(!i915_gem_object_has_struct_page(obj))) - return -EINVAL; + if (!i915_gem_object_has_struct_page(obj)) + return -ENODEV; ret = i915_gem_object_wait_rendering(obj, true); if (ret) return ret; - if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)) { - /* If we're not in the cpu read domain, set ourself into the gtt - * read domain and manually flush cachelines (if required). This - * optimizes for the case when the gpu will dirty the data - * anyway again before the next pread happens. */ + /* If we're not in the cpu read domain, set ourself into the gtt + * read domain and manually flush cachelines (if required). This + * optimizes for the case when the gpu will dirty the data + * anyway again before the next pread happens. 
+ */ + if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)) *needs_clflush = !cpu_cache_is_coherent(obj->base.dev, obj->cache_level); - } ret = i915_gem_object_get_pages(obj); if (ret) @@ -637,7 +637,67 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj, i915_gem_object_pin_pages(obj); - return ret; + if (*needs_clflush && !static_cpu_has(X86_FEATURE_CLFLUSH)) { + ret = i915_gem_object_set_to_cpu_domain(obj, false); + if (ret) { + i915_gem_object_unpin_pages(obj); + return ret; + } + *needs_clflush = 0; + } + + return 0; +} + +int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj, + unsigned int *needs_clflush) +{ + int ret; + + *needs_clflush = 0; + if (!i915_gem_object_has_struct_page(obj)) + return -ENODEV; + + ret = i915_gem_object_wait_rendering(obj, false); + if (ret) + return ret; + + /* If we're not in the cpu write domain, set ourself into the + * gtt write domain and manually flush cachelines (as required). + * This optimizes for the case when the gpu will use the data + * right away and we therefore have to clflush anyway. + */ + if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) + *needs_clflush |= cpu_write_needs_clflush(obj) << 1; + + /* Same trick applies to invalidate partially written cachelines read + * before writing. + */ + if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)) + *needs_clflush |= !cpu_cache_is_coherent(obj->base.dev, + obj->cache_level); + + ret = i915_gem_object_get_pages(obj); + if (ret) + return ret; + + i915_gem_object_pin_pages(obj); + + if (*needs_clflush && !static_cpu_has(X86_FEATURE_CLFLUSH)) { + ret = i915_gem_object_set_to_cpu_domain(obj, true); + if (ret) { + i915_gem_object_unpin_pages(obj); + return ret; + } + *needs_clflush = 0; + } + + if ((*needs_clflush & CLFLUSH_AFTER) == 0) + obj->cache_dirty = true; + + intel_fb_obj_invalidate(obj, ORIGIN_CPU); + obj->dirty = 1; + return 0; } /* Per-page copy function for the shmem pread fastpath. @@ -872,19 +932,14 @@ i915_gem_shmem_pread(struct drm_device *dev, int needs_clflush = 0; struct sg_page_iter sg_iter; - if (!i915_gem_object_has_struct_page(obj)) - return -ENODEV; - - user_data = u64_to_user_ptr(args->data_ptr); - remain = args->size; - - obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj); - ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush); if (ret) return ret; + obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj); + user_data = u64_to_user_ptr(args->data_ptr); offset = args->offset; + remain = args->size; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, offset >> PAGE_SHIFT) { @@ -940,7 +995,7 @@ next_page: } out: - i915_gem_object_unpin_pages(obj); + i915_gem_obj_finish_shmem_access(obj); return ret; } @@ -1248,42 +1303,17 @@ i915_gem_shmem_pwrite(struct drm_device *dev, int shmem_page_offset, page_length, ret = 0; int obj_do_bit17_swizzling, page_do_bit17_swizzling; int hit_slowpath = 0; - int needs_clflush_after = 0; - int needs_clflush_before = 0; + unsigned int needs_clflush; struct sg_page_iter sg_iter; - user_data = u64_to_user_ptr(args->data_ptr); - remain = args->size; + ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush); + if (ret) + return ret; obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj); - - ret = i915_gem_object_wait_rendering(obj, false); - if (ret) - return ret; - - if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) { - /* If we're not in the cpu write domain, set ourself into the gtt - * write domain and manually flush cachelines (if required). 
This - * optimizes for the case when the gpu will use the data - * right away and we therefore have to clflush anyway. */ - needs_clflush_after = cpu_write_needs_clflush(obj); - } - /* Same trick applies to invalidate partially written cachelines read - * before writing. */ - if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) - needs_clflush_before = - !cpu_cache_is_coherent(dev, obj->cache_level); - - ret = i915_gem_object_get_pages(obj); - if (ret) - return ret; - - intel_fb_obj_invalidate(obj, ORIGIN_CPU); - - i915_gem_object_pin_pages(obj); - + user_data = u64_to_user_ptr(args->data_ptr); offset = args->offset; - obj->dirty = 1; + remain = args->size; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, offset >> PAGE_SHIFT) { @@ -1307,7 +1337,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, /* If we don't overwrite a cacheline completely we need to be * careful to have up-to-date data by first clflushing. Don't * overcomplicate things and flush the entire patch. */ - partial_cacheline_write = needs_clflush_before && + partial_cacheline_write = needs_clflush & CLFLUSH_BEFORE && ((shmem_page_offset | page_length) & (boot_cpu_data.x86_clflush_size - 1)); @@ -1317,7 +1347,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, ret = shmem_pwrite_fast(page, shmem_page_offset, page_length, user_data, page_do_bit17_swizzling, partial_cacheline_write, - needs_clflush_after); + needs_clflush & CLFLUSH_AFTER); if (ret == 0) goto next_page; @@ -1326,7 +1356,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, ret = shmem_pwrite_slow(page, shmem_page_offset, page_length, user_data, page_do_bit17_swizzling, partial_cacheline_write, - needs_clflush_after); + needs_clflush & CLFLUSH_AFTER); mutex_lock(&dev->struct_mutex); @@ -1340,7 +1370,7 @@ next_page: } out: - i915_gem_object_unpin_pages(obj); + i915_gem_obj_finish_shmem_access(obj); if (hit_slowpath) { /* @@ -1348,17 +1378,15 @@ out: * cachelines in-line while writing and the object moved * out of the cpu write domain while we've dropped the lock. */ - if (!needs_clflush_after && + if (!(needs_clflush & CLFLUSH_AFTER) && obj->base.write_domain != I915_GEM_DOMAIN_CPU) { if (i915_gem_clflush_object(obj, obj->pin_display)) - needs_clflush_after = true; + needs_clflush |= CLFLUSH_AFTER; } } - if (needs_clflush_after) + if (needs_clflush & CLFLUSH_AFTER) i915_gem_chipset_flush(to_i915(dev)); - else - obj->cache_dirty = true; intel_fb_obj_flush(obj, false, ORIGIN_CPU); return ret; @@ -1437,10 +1465,8 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, if (ret == -EFAULT || ret == -ENOSPC) { if (obj->phys_handle) ret = i915_gem_phys_pwrite(obj, args, file); - else if (i915_gem_object_has_struct_page(obj)) - ret = i915_gem_shmem_pwrite(dev, obj, args, file); else - ret = -ENODEV; + ret = i915_gem_shmem_pwrite(dev, obj, args, file); } i915_gem_object_put(obj); From a314d5cb4ac3722b9a673656e2499f4d92ee5e6f Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:48 +0100 Subject: [PATCH 101/141] drm/i915: Before accessing an object via the cpu, flush GTT writes If we want to read the pages directly via the CPU, we have to be sure that we have to flush the writes via the GTT (as the CPU can not see the address aliasing). 
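In the prepare helpers below this reduces to a single extra ordering step
(condensed; functions as in the diff):

	ret = i915_gem_object_wait_rendering(obj, true);
	if (ret)
		return ret;

	/* drain writes done through the GTT alias before the CPU reads
	 * the backing pages directly */
	i915_gem_object_flush_gtt_write_domain(obj);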
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-9-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index f27c340bb8ee..c3f1a981b544 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -622,6 +622,8 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj, if (ret) return ret; + i915_gem_object_flush_gtt_write_domain(obj); + /* If we're not in the cpu read domain, set ourself into the gtt * read domain and manually flush cachelines (if required). This * optimizes for the case when the gpu will dirty the data @@ -662,6 +664,8 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj, if (ret) return ret; + i915_gem_object_flush_gtt_write_domain(obj); + /* If we're not in the cpu write domain, set ourself into the * gtt write domain and manually flush cachelines (as required). * This optimizes for the case when the gpu will use the data From 3b5724d702ef24ee41ca008a1fab1cf94f3d31b5 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:49 +0100 Subject: [PATCH 102/141] drm/i915: Wait for writes through the GTT to land before reading back If we quickly switch from writing through the GTT to a read of the physical page directly with the CPU (e.g. performing relocations through the GTT and then running the command parser), we can observe that the writes are not visible to the CPU. It is not a coherency problem, as extensive investigations with clflush have demonstrated, but a mere timing issue - we have to wait for the GTT to complete it's write before we start our read from the CPU. The issue can be illustrated in userspace with: gtt = gem_mmap__gtt(fd, handle, 0, OBJECT_SIZE, PROT_READ | PROT_WRITE); cpu = gem_mmap__cpu(fd, handle, 0, OBJECT_SIZE, PROT_READ | PROT_WRITE); gem_set_domain(fd, handle, I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT); for (i = 0; i < OBJECT_SIZE / 64; i++) { int x = 16*i + (i%16); gtt[x] = i; clflush(&cpu[x], sizeof(cpu[x])); assert(cpu[x] == i); } Experimenting with that shows that this behaviour is indeed limited to recent Atom-class hardware. Testcase: igt/gem_exec_flush/basic-batch-default-cmd #byt Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-10-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index c3f1a981b544..4c9ecab765f1 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3173,20 +3173,30 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj, static void i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj) { + struct drm_i915_private *dev_priv = to_i915(obj->base.dev); uint32_t old_write_domain; if (obj->base.write_domain != I915_GEM_DOMAIN_GTT) return; /* No actual flushing is required for the GTT write domain. Writes - * to it immediately go to main memory as far as we know, so there's + * to it "immediately" go to main memory as far as we know, so there's * no chipset flush. It also doesn't land in render cache. * * However, we do have to enforce the order so that all writes through * the GTT land before any writes to the device, such as updates to * the GATT itself. 
+ * + * We also have to wait a bit for the writes to land from the GTT. + * An uncached read (i.e. mmio) seems to be ideal for the round-trip + * timing. This issue has only been observed when switching quickly + * between GTT writes and CPU reads from inside the kernel on recent hw, + * and it appears to only affect discrete GTT blocks (i.e. on LLC + * system agents we cannot reproduce this behaviour). */ wmb(); + if (INTEL_GEN(dev_priv) >= 6 && !HAS_LLC(dev_priv)) + POSTING_READ(RING_ACTHD(dev_priv->engine[RCS].mmio_base)); old_write_domain = obj->base.write_domain; obj->base.write_domain = 0; From 9764951e7f517717bc7ecc3f1a9711816646ebf7 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:50 +0100 Subject: [PATCH 103/141] drm/i915: Pin the pages first in shmem prepare read/write There is an improbable, but not impossible, case that if we leave the pages unpin as we operate on the object, then somebody via the shrinker may steal the lock (which lock? right now, it is struct_mutex, THE lock) and change the cache domains after we have already inspected them. (Whilst here, avail ourselves of the opportunity to take a couple of steps to make the two functions look more similar.) Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-11-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 48 +++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 4c9ecab765f1..c5d5dfe3e0ef 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -622,6 +622,12 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj, if (ret) return ret; + ret = i915_gem_object_get_pages(obj); + if (ret) + return ret; + + i915_gem_object_pin_pages(obj); + i915_gem_object_flush_gtt_write_domain(obj); /* If we're not in the cpu read domain, set ourself into the gtt @@ -633,22 +639,20 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj, *needs_clflush = !cpu_cache_is_coherent(obj->base.dev, obj->cache_level); - ret = i915_gem_object_get_pages(obj); - if (ret) - return ret; - - i915_gem_object_pin_pages(obj); - if (*needs_clflush && !static_cpu_has(X86_FEATURE_CLFLUSH)) { ret = i915_gem_object_set_to_cpu_domain(obj, false); - if (ret) { - i915_gem_object_unpin_pages(obj); - return ret; - } + if (ret) + goto err_unpin; + *needs_clflush = 0; } + /* return with the pages pinned */ return 0; + +err_unpin: + i915_gem_object_unpin_pages(obj); + return ret; } int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj, @@ -664,6 +668,12 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj, if (ret) return ret; + ret = i915_gem_object_get_pages(obj); + if (ret) + return ret; + + i915_gem_object_pin_pages(obj); + i915_gem_object_flush_gtt_write_domain(obj); /* If we're not in the cpu write domain, set ourself into the @@ -681,18 +691,11 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj, *needs_clflush |= !cpu_cache_is_coherent(obj->base.dev, obj->cache_level); - ret = i915_gem_object_get_pages(obj); - if (ret) - return ret; - - i915_gem_object_pin_pages(obj); - if (*needs_clflush && !static_cpu_has(X86_FEATURE_CLFLUSH)) { ret = i915_gem_object_set_to_cpu_domain(obj, true); - if (ret) { - i915_gem_object_unpin_pages(obj); - return ret; - } + if (ret) + goto err_unpin; + *needs_clflush = 0; } @@ -701,7 +704,12 @@ int 
i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj, intel_fb_obj_invalidate(obj, ORIGIN_CPU); obj->dirty = 1; + /* return with the pages pinned */ return 0; + +err_unpin: + i915_gem_object_unpin_pages(obj); + return ret; } /* Per-page copy function for the shmem pread fastpath. From b0dc465f9540a8659ae06be08c6fcfc04c874777 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:51 +0100 Subject: [PATCH 104/141] drm/i915: Tidy up flush cpu/gtt write domains Since we know the write domain, we can drop the local variable and make the code look a tiny bit simpler. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-12-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index c5d5dfe3e0ef..cfec2ff4fc7c 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3182,7 +3182,6 @@ static void i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj) { struct drm_i915_private *dev_priv = to_i915(obj->base.dev); - uint32_t old_write_domain; if (obj->base.write_domain != I915_GEM_DOMAIN_GTT) return; @@ -3206,36 +3205,30 @@ i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj) if (INTEL_GEN(dev_priv) >= 6 && !HAS_LLC(dev_priv)) POSTING_READ(RING_ACTHD(dev_priv->engine[RCS].mmio_base)); - old_write_domain = obj->base.write_domain; - obj->base.write_domain = 0; - intel_fb_obj_flush(obj, false, write_origin(obj, I915_GEM_DOMAIN_GTT)); + obj->base.write_domain = 0; trace_i915_gem_object_change_domain(obj, obj->base.read_domains, - old_write_domain); + I915_GEM_DOMAIN_GTT); } /** Flushes the CPU write domain for the object if it's dirty. */ static void i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj) { - uint32_t old_write_domain; - if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) return; if (i915_gem_clflush_object(obj, obj->pin_display)) i915_gem_chipset_flush(to_i915(obj->base.dev)); - old_write_domain = obj->base.write_domain; - obj->base.write_domain = 0; - intel_fb_obj_flush(obj, false, ORIGIN_CPU); + obj->base.write_domain = 0; trace_i915_gem_object_change_domain(obj, obj->base.read_domains, - old_write_domain); + I915_GEM_DOMAIN_CPU); } /** From d50415cc6c8395602052b39a1a39290fba3d313e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:52 +0100 Subject: [PATCH 105/141] drm/i915: Refactor execbuffer relocation writing With the introduction of the reloc page cache, we are just one step away from refactoring the relocation write functions into one. Not only does it tidy the code (slightly), but it greatly simplifies the control logic much to gcc's satisfaction. v2: Add selftests to document the relationship between the clflush flags, the KMAP bit and packing into the page-aligned pointer. 
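For readers following the selftests, the packing trick itself is easy to demonstrate in isolation. The stand-alone sketch below is illustrative only — the ex_* names are invented here, not taken from the driver — and simply shows how a page-aligned pointer leaves its low bits free to carry flags such as the KMAP marker:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: EX_PAGE_MASK and EX_KMAP are invented stand-ins
 * for the kernel's PAGE_MASK arithmetic and the KMAP flag above.
 */
#define EX_PAGE_SIZE 4096ul
#define EX_PAGE_MASK (~(EX_PAGE_SIZE - 1))
#define EX_KMAP      0x4 /* lives in the low, always-zero bits */

static void *ex_unmask_page(unsigned long p)
{
	return (void *)(p & EX_PAGE_MASK);
}

static unsigned int ex_unmask_flags(unsigned long p)
{
	return p & ~EX_PAGE_MASK;
}

int main(void)
{
	unsigned long vaddr = 0x7f2a4c000000ul; /* pretend mapping, page aligned */
	unsigned long packed = vaddr | EX_KMAP; /* pointer and flags share one word */

	assert(ex_unmask_page(packed) == (void *)vaddr);
	assert(ex_unmask_flags(packed) == EX_KMAP);
	printf("page=%p flags=%#x\n", ex_unmask_page(packed), ex_unmask_flags(packed));
	return 0;
}

Because kmap_atomic() and io_mapping_map_atomic_wc() both return page-aligned addresses, the cache can carry the mapping pointer and its bookkeeping in a single unsigned long.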
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-13-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 301 +++++++++++---------- 1 file changed, 157 insertions(+), 144 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 4cef9e9e05c5..8d0df7d81d8b 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -39,6 +39,8 @@ #include "intel_drv.h" #include "intel_frontbuffer.h" +#define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */ + #define __EXEC_OBJECT_HAS_PIN (1<<31) #define __EXEC_OBJECT_HAS_FENCE (1<<30) #define __EXEC_OBJECT_NEEDS_MAP (1<<29) @@ -59,6 +61,7 @@ struct i915_execbuffer_params { }; struct eb_vmas { + struct drm_i915_private *i915; struct list_head vmas; int and; union { @@ -68,7 +71,8 @@ struct eb_vmas { }; static struct eb_vmas * -eb_create(struct drm_i915_gem_execbuffer2 *args) +eb_create(struct drm_i915_private *i915, + struct drm_i915_gem_execbuffer2 *args) { struct eb_vmas *eb = NULL; @@ -95,6 +99,7 @@ eb_create(struct drm_i915_gem_execbuffer2 *args) } else eb->and = -args->buffer_count; + eb->i915 = i915; INIT_LIST_HEAD(&eb->vmas); return eb; } @@ -278,6 +283,9 @@ static void eb_destroy(struct eb_vmas *eb) static inline int use_cpu_reloc(struct drm_i915_gem_object *obj) { + if (DBG_USE_CPU_RELOC) + return DBG_USE_CPU_RELOC > 0; + return (HAS_LLC(obj->base.dev) || obj->base.write_domain == I915_GEM_DOMAIN_CPU || obj->cache_level != I915_CACHE_NONE); @@ -302,37 +310,58 @@ static inline uint64_t gen8_noncanonical_addr(uint64_t address) } static inline uint64_t -relocation_target(struct drm_i915_gem_relocation_entry *reloc, +relocation_target(const struct drm_i915_gem_relocation_entry *reloc, uint64_t target_offset) { return gen8_canonical_addr((int)reloc->delta + target_offset); } struct reloc_cache { - void *vaddr; + struct drm_i915_private *i915; + struct drm_mm_node node; + unsigned long vaddr; unsigned int page; - enum { KMAP, IOMAP } type; + bool use_64bit_reloc; }; -static void reloc_cache_init(struct reloc_cache *cache) +static void reloc_cache_init(struct reloc_cache *cache, + struct drm_i915_private *i915) { cache->page = -1; - cache->vaddr = NULL; + cache->vaddr = 0; + cache->i915 = i915; + cache->use_64bit_reloc = INTEL_GEN(cache->i915) >= 8; } +static inline void *unmask_page(unsigned long p) +{ + return (void *)(uintptr_t)(p & PAGE_MASK); +} + +static inline unsigned int unmask_flags(unsigned long p) +{ + return p & ~PAGE_MASK; +} + +#define KMAP 0x4 /* after CLFLUSH_FLAGS */ + static void reloc_cache_fini(struct reloc_cache *cache) { + void *vaddr; + if (!cache->vaddr) return; - switch (cache->type) { - case KMAP: - kunmap_atomic(cache->vaddr); - break; + vaddr = unmask_page(cache->vaddr); + if (cache->vaddr & KMAP) { + if (cache->vaddr & CLFLUSH_AFTER) + mb(); - case IOMAP: - io_mapping_unmap_atomic(cache->vaddr); - break; + kunmap_atomic(vaddr); + i915_gem_obj_finish_shmem_access((struct drm_i915_gem_object *)cache->node.mm); + } else { + io_mapping_unmap_atomic((void __iomem *)vaddr); + i915_vma_unpin((struct i915_vma *)cache->node.mm); } } @@ -340,147 +369,142 @@ static void *reloc_kmap(struct drm_i915_gem_object *obj, struct reloc_cache *cache, int page) { - if (cache->page == page) - return cache->vaddr; + void *vaddr; - if (cache->vaddr) - kunmap_atomic(cache->vaddr); + if (cache->vaddr) { + 
kunmap_atomic(unmask_page(cache->vaddr)); + } else { + unsigned int flushes; + int ret; + ret = i915_gem_obj_prepare_shmem_write(obj, &flushes); + if (ret) + return ERR_PTR(ret); + + BUILD_BUG_ON(KMAP & CLFLUSH_FLAGS); + BUILD_BUG_ON((KMAP | CLFLUSH_FLAGS) & PAGE_MASK); + + cache->vaddr = flushes | KMAP; + cache->node.mm = (void *)obj; + if (flushes) + mb(); + } + + vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page)); + cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr; cache->page = page; - cache->vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page)); - cache->type = KMAP; - return cache->vaddr; + return vaddr; } -static int -relocate_entry_cpu(struct drm_i915_gem_object *obj, - struct drm_i915_gem_relocation_entry *reloc, - struct reloc_cache *cache, - uint64_t target_offset) -{ - struct drm_device *dev = obj->base.dev; - uint32_t page_offset = offset_in_page(reloc->offset); - uint64_t delta = relocation_target(reloc, target_offset); - char *vaddr; - int ret; - - ret = i915_gem_object_set_to_cpu_domain(obj, true); - if (ret) - return ret; - - vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT); - *(uint32_t *)(vaddr + page_offset) = lower_32_bits(delta); - - if (INTEL_GEN(dev) >= 8) { - page_offset += sizeof(uint32_t); - if (page_offset == PAGE_SIZE) { - vaddr = reloc_kmap(obj, cache, cache->page + 1); - page_offset = 0; - } - *(uint32_t *)(vaddr + page_offset) = upper_32_bits(delta); - } - - return 0; -} - -static void *reloc_iomap(struct drm_i915_private *i915, +static void *reloc_iomap(struct drm_i915_gem_object *obj, struct reloc_cache *cache, - uint64_t offset) + int page) { - if (cache->page == offset >> PAGE_SHIFT) - return cache->vaddr; + void *vaddr; - if (cache->vaddr) - io_mapping_unmap_atomic(cache->vaddr); + if (cache->vaddr) { + io_mapping_unmap_atomic(unmask_page(cache->vaddr)); + } else { + struct i915_vma *vma; + int ret; - cache->page = offset >> PAGE_SHIFT; - cache->vaddr = - io_mapping_map_atomic_wc(i915->ggtt.mappable, - offset & PAGE_MASK); - cache->type = IOMAP; + if (use_cpu_reloc(obj)) + return NULL; - return cache->vaddr; -} + ret = i915_gem_object_set_to_gtt_domain(obj, true); + if (ret) + return ERR_PTR(ret); -static int -relocate_entry_gtt(struct drm_i915_gem_object *obj, - struct drm_i915_gem_relocation_entry *reloc, - struct reloc_cache *cache, - uint64_t target_offset) -{ - struct drm_i915_private *dev_priv = to_i915(obj->base.dev); - struct i915_vma *vma; - uint64_t delta = relocation_target(reloc, target_offset); - uint64_t offset; - void __iomem *reloc_page; - int ret; + vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, + PIN_MAPPABLE | PIN_NONBLOCK); + if (IS_ERR(vma)) + return NULL; - vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE); - if (IS_ERR(vma)) - return PTR_ERR(vma); + ret = i915_gem_object_put_fence(obj); + if (ret) { + i915_vma_unpin(vma); + return ERR_PTR(ret); + } - ret = i915_gem_object_set_to_gtt_domain(obj, true); - if (ret) - goto unpin; - - ret = i915_gem_object_put_fence(obj); - if (ret) - goto unpin; - - /* Map the page containing the relocation we're going to perform. 
*/ - offset = vma->node.start + reloc->offset; - reloc_page = reloc_iomap(dev_priv, cache, offset); - iowrite32(lower_32_bits(delta), reloc_page + offset_in_page(offset)); - - if (INTEL_GEN(dev_priv) >= 8) { - offset += sizeof(uint32_t); - if (offset_in_page(offset) == 0) - reloc_page = reloc_iomap(dev_priv, cache, offset); - iowrite32(upper_32_bits(delta), - reloc_page + offset_in_page(offset)); + cache->node.start = vma->node.start; + cache->node.mm = (void *)vma; } -unpin: - __i915_vma_unpin(vma); - return ret; + vaddr = io_mapping_map_atomic_wc(cache->i915->ggtt.mappable, + cache->node.start + (page << PAGE_SHIFT)); + cache->page = page; + cache->vaddr = (unsigned long)vaddr; + + return vaddr; } -static void -clflush_write32(void *addr, uint32_t value) +static void *reloc_vaddr(struct drm_i915_gem_object *obj, + struct reloc_cache *cache, + int page) { - /* This is not a fast path, so KISS. */ - drm_clflush_virt_range(addr, sizeof(uint32_t)); - *(uint32_t *)addr = value; - drm_clflush_virt_range(addr, sizeof(uint32_t)); + void *vaddr; + + if (cache->page == page) { + vaddr = unmask_page(cache->vaddr); + } else { + vaddr = NULL; + if ((cache->vaddr & KMAP) == 0) + vaddr = reloc_iomap(obj, cache, page); + if (!vaddr) + vaddr = reloc_kmap(obj, cache, page); + } + + return vaddr; +} + +static void clflush_write32(u32 *addr, u32 value, unsigned int flushes) +{ + if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) { + if (flushes & CLFLUSH_BEFORE) { + clflushopt(addr); + mb(); + } + + *addr = value; + + /* Writes to the same cacheline are serialised by the CPU + * (including clflush). On the write path, we only require + * that it hits memory in an orderly fashion and place + * mb barriers at the start and end of the relocation phase + * to ensure ordering of clflush wrt to the system. + */ + if (flushes & CLFLUSH_AFTER) + clflushopt(addr); + } else + *addr = value; } static int -relocate_entry_clflush(struct drm_i915_gem_object *obj, - struct drm_i915_gem_relocation_entry *reloc, - struct reloc_cache *cache, - uint64_t target_offset) +relocate_entry(struct drm_i915_gem_object *obj, + const struct drm_i915_gem_relocation_entry *reloc, + struct reloc_cache *cache, + u64 target_offset) { - struct drm_device *dev = obj->base.dev; - uint32_t page_offset = offset_in_page(reloc->offset); - uint64_t delta = relocation_target(reloc, target_offset); - char *vaddr; - int ret; + u64 offset = reloc->offset; + bool wide = cache->use_64bit_reloc; + void *vaddr; - ret = i915_gem_object_set_to_gtt_domain(obj, true); - if (ret) - return ret; + target_offset = relocation_target(reloc, target_offset); +repeat: + vaddr = reloc_vaddr(obj, cache, offset >> PAGE_SHIFT); + if (IS_ERR(vaddr)) + return PTR_ERR(vaddr); - vaddr = reloc_kmap(obj, cache, reloc->offset >> PAGE_SHIFT); - clflush_write32(vaddr + page_offset, lower_32_bits(delta)); + clflush_write32(vaddr + offset_in_page(offset), + lower_32_bits(target_offset), + cache->vaddr); - if (INTEL_GEN(dev) >= 8) { - page_offset += sizeof(uint32_t); - if (page_offset == PAGE_SIZE) { - vaddr = reloc_kmap(obj, cache, cache->page + 1); - page_offset = 0; - } - clflush_write32(vaddr + page_offset, upper_32_bits(delta)); + if (wide) { + offset += sizeof(u32); + target_offset >>= 32; + wide = false; + goto repeat; } return 0; @@ -567,7 +591,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj, /* Check that the relocation address is valid... */ if (unlikely(reloc->offset > - obj->base.size - (INTEL_INFO(dev)->gen >= 8 ? 
8 : 4))) { + obj->base.size - (cache->use_64bit_reloc ? 8 : 4))) { DRM_DEBUG("Relocation beyond object bounds: " "obj %p target %d offset %d size %d.\n", obj, reloc->target_handle, @@ -587,23 +611,12 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj, if (pagefault_disabled() && !object_is_idle(obj)) return -EFAULT; - if (use_cpu_reloc(obj)) - ret = relocate_entry_cpu(obj, reloc, cache, target_offset); - else if (obj->map_and_fenceable) - ret = relocate_entry_gtt(obj, reloc, cache, target_offset); - else if (static_cpu_has(X86_FEATURE_CLFLUSH)) - ret = relocate_entry_clflush(obj, reloc, cache, target_offset); - else { - WARN_ONCE(1, "Impossible case in relocation handling\n"); - ret = -ENODEV; - } - + ret = relocate_entry(obj, reloc, cache, target_offset); if (ret) return ret; /* and update the user's relocation entry */ reloc->presumed_offset = target_offset; - return 0; } @@ -619,7 +632,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma, int remain, ret = 0; user_relocs = u64_to_user_ptr(entry->relocs_ptr); - reloc_cache_init(&cache); + reloc_cache_init(&cache, eb->i915); remain = entry->relocation_count; while (remain) { @@ -668,7 +681,7 @@ i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma, struct reloc_cache cache; int i, ret = 0; - reloc_cache_init(&cache); + reloc_cache_init(&cache, eb->i915); for (i = 0; i < entry->relocation_count; i++) { ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], &cache); if (ret) @@ -1647,7 +1660,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, memset(¶ms_master, 0x00, sizeof(params_master)); - eb = eb_create(args); + eb = eb_create(dev_priv, args); if (eb == NULL) { i915_gem_context_put(ctx); mutex_unlock(&dev->struct_mutex); From e8cb909ac3abbcac5184825638903a2b9a225725 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:53 +0100 Subject: [PATCH 106/141] drm/i915: Fallback to single page GTT mmappings for relocations If we cannot pin the entire object into the mappable region of the GTT, try to pin a single page instead. This is much more likely to succeed, and prevents us falling back to the clflush slow path. 
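The shape of the fallback is worth spelling out. The sketch below illustrates only the control flow, under invented stub names (pin_whole_object_mappable() and friends do not exist in the driver); the real code uses i915_gem_object_ggtt_pin() and a drm_mm single-page reservation, as the diff below shows:

#include <stdbool.h>
#include <stdio.h>

/* Invented stubs: stand-ins for pinning the whole object into the
 * mappable aperture versus reserving a single 4 KiB window.
 */
static bool pin_whole_object_mappable(void)
{
	return false; /* pretend the aperture is too fragmented */
}

static bool reserve_one_page_window(void)
{
	return true; /* a single-page hole is far easier to find */
}

static void map_page_through_window(int page)
{
	printf("rebinding window onto page %d\n", page);
}

int main(void)
{
	bool whole = pin_whole_object_mappable();
	int page;

	if (!whole && !reserve_one_page_window())
		return 1; /* only now would we hit the clflush slow path */

	for (page = 0; page < 3; page++) {
		if (!whole)
			map_page_through_window(page); /* insert_page() analogue */
		/* ... write the relocation into this page ... */
	}

	return 0;
}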
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-14-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 64 ++++++++++++++++++---- 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 8d0df7d81d8b..c970aabfffa3 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -331,6 +331,7 @@ static void reloc_cache_init(struct reloc_cache *cache, cache->vaddr = 0; cache->i915 = i915; cache->use_64bit_reloc = INTEL_GEN(cache->i915) >= 8; + cache->node.allocated = false; } static inline void *unmask_page(unsigned long p) @@ -360,8 +361,19 @@ static void reloc_cache_fini(struct reloc_cache *cache) kunmap_atomic(vaddr); i915_gem_obj_finish_shmem_access((struct drm_i915_gem_object *)cache->node.mm); } else { + wmb(); io_mapping_unmap_atomic((void __iomem *)vaddr); - i915_vma_unpin((struct i915_vma *)cache->node.mm); + if (cache->node.allocated) { + struct i915_ggtt *ggtt = &cache->i915->ggtt; + + ggtt->base.clear_range(&ggtt->base, + cache->node.start, + cache->node.size, + true); + drm_mm_remove_node(&cache->node); + } else { + i915_vma_unpin((struct i915_vma *)cache->node.mm); + } } } @@ -401,8 +413,19 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj, struct reloc_cache *cache, int page) { + struct i915_ggtt *ggtt = &cache->i915->ggtt; + unsigned long offset; void *vaddr; + if (cache->node.allocated) { + wmb(); + ggtt->base.insert_page(&ggtt->base, + i915_gem_object_get_dma_address(obj, page), + cache->node.start, I915_CACHE_NONE, 0); + cache->page = page; + return unmask_page(cache->vaddr); + } + if (cache->vaddr) { io_mapping_unmap_atomic(unmask_page(cache->vaddr)); } else { @@ -418,21 +441,38 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj, vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE | PIN_NONBLOCK); - if (IS_ERR(vma)) - return NULL; + if (IS_ERR(vma)) { + memset(&cache->node, 0, sizeof(cache->node)); + ret = drm_mm_insert_node_in_range_generic + (&ggtt->base.mm, &cache->node, + 4096, 0, 0, + 0, ggtt->mappable_end, + DRM_MM_SEARCH_DEFAULT, + DRM_MM_CREATE_DEFAULT); + if (ret) + return ERR_PTR(ret); + } else { + ret = i915_gem_object_put_fence(obj); + if (ret) { + i915_vma_unpin(vma); + return ERR_PTR(ret); + } - ret = i915_gem_object_put_fence(obj); - if (ret) { - i915_vma_unpin(vma); - return ERR_PTR(ret); + cache->node.start = vma->node.start; + cache->node.mm = (void *)vma; } - - cache->node.start = vma->node.start; - cache->node.mm = (void *)vma; } - vaddr = io_mapping_map_atomic_wc(cache->i915->ggtt.mappable, - cache->node.start + (page << PAGE_SHIFT)); + offset = cache->node.start; + if (cache->node.allocated) { + ggtt->base.insert_page(&ggtt->base, + i915_gem_object_get_dma_address(obj, page), + offset, I915_CACHE_NONE, 0); + } else { + offset += page << PAGE_SHIFT; + } + + vaddr = io_mapping_map_atomic_wc(cache->i915->ggtt.mappable, offset); cache->page = page; cache->vaddr = (unsigned long)vaddr; From 9e53d9be0d9e09f834d3bdfccafb8330f2597e6c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:54 +0100 Subject: [PATCH 107/141] drm/i915: Disallow direct CPU access to stolen pages for relocations As we cannot access the backing pages behind stolen objects, we should not attempt to do so for relocations. 
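Note the placement of the new test: it sits ahead of the DBG_USE_CPU_RELOC override, so even the debug knob cannot force a CPU write through a mapping that does not exist. A hedged restatement of the predicate, with invented field names, reads:

#include <stdbool.h>
#include <stdio.h>

/* Invented type: ex_object approximates the few properties the
 * real use_cpu_reloc() consults.
 */
struct ex_object {
	bool has_struct_page; /* false for stolen memory */
	bool llc_coherent;    /* stand-in for the LLC/cache-level checks */
};

static bool ex_use_cpu_reloc(const struct ex_object *obj, int dbg_force)
{
	if (!obj->has_struct_page)
		return false; /* hard correctness check comes first */

	if (dbg_force)
		return dbg_force > 0; /* debug override second */

	return obj->llc_coherent; /* performance heuristic last */
}

int main(void)
{
	struct ex_object stolen = { .has_struct_page = false, .llc_coherent = true };

	printf("stolen via cpu? %d\n", ex_use_cpu_reloc(&stolen, 1));
	return 0;
}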
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-15-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index c970aabfffa3..98d5aa5e0a34 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -283,6 +283,9 @@ static void eb_destroy(struct eb_vmas *eb) static inline int use_cpu_reloc(struct drm_i915_gem_object *obj) { + if (!i915_gem_object_has_struct_page(obj)) + return false; + if (DBG_USE_CPU_RELOC) return DBG_USE_CPU_RELOC > 0; From 05a20d098db1e3318228e7c281cd9b2d3d25f12b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:55 +0100 Subject: [PATCH 108/141] drm/i915: Move map-and-fenceable tracking to the VMA By moving map-and-fenceable tracking from the object to the VMA, we gain fine-grained tracking and the ability to track individual fences on the VMA (subsequent patch). Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-16-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 6 ---- drivers/gpu/drm/i915/i915_gem.c | 37 ++++++++++------------ drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 +-- drivers/gpu/drm/i915/i915_gem_fence.c | 7 ++-- drivers/gpu/drm/i915/i915_gem_gtt.c | 2 +- drivers/gpu/drm/i915/i915_gem_gtt.h | 10 ++++-- drivers/gpu/drm/i915/i915_gem_tiling.c | 4 +-- drivers/gpu/drm/i915/intel_display.c | 6 ++-- 8 files changed, 36 insertions(+), 40 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 91861a08787c..b3623945b555 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2192,12 +2192,6 @@ struct drm_i915_gem_object { */ unsigned int fence_dirty:1; - /** - * Is the object at the current location in the gtt mappable and - * fenceable? Used to avoid costly recalculations. - */ - unsigned int map_and_fenceable:1; - /** * Whether the current gtt mapping needs to be mappable (and isn't just * mappable by accident). 
Track pin and fault separate for a more diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index cfec2ff4fc7c..1f6312ca646c 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2899,8 +2899,7 @@ int i915_vma_unbind(struct i915_vma *vma) GEM_BUG_ON(obj->bind_count == 0); GEM_BUG_ON(!obj->pages); - if (i915_vma_is_ggtt(vma) && - vma->ggtt_view.type == I915_GGTT_VIEW_NORMAL) { + if (i915_vma_is_map_and_fenceable(vma)) { i915_gem_object_finish_gtt(obj); /* release the fence reg _after_ flushing */ @@ -2909,6 +2908,7 @@ int i915_vma_unbind(struct i915_vma *vma) return ret; __i915_vma_iounmap(vma); + vma->flags &= ~I915_VMA_CAN_FENCE; } if (likely(!vma->vm->closed)) { @@ -2920,13 +2920,10 @@ int i915_vma_unbind(struct i915_vma *vma) drm_mm_remove_node(&vma->node); list_move_tail(&vma->vm_link, &vma->vm->unbound_list); - if (i915_vma_is_ggtt(vma)) { - if (vma->ggtt_view.type == I915_GGTT_VIEW_NORMAL) { - obj->map_and_fenceable = false; - } else if (vma->pages) { - sg_free_table(vma->pages); - kfree(vma->pages); - } + if (vma->pages != obj->pages) { + GEM_BUG_ON(!vma->pages); + sg_free_table(vma->pages); + kfree(vma->pages); } vma->pages = NULL; @@ -3703,8 +3700,6 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file) static bool i915_vma_misplaced(struct i915_vma *vma, u64 size, u64 alignment, u64 flags) { - struct drm_i915_gem_object *obj = vma->obj; - if (!drm_mm_node_allocated(&vma->node)) return false; @@ -3714,7 +3709,7 @@ i915_vma_misplaced(struct i915_vma *vma, u64 size, u64 alignment, u64 flags) if (alignment && vma->node.start & (alignment - 1)) return true; - if (flags & PIN_MAPPABLE && !obj->map_and_fenceable) + if (flags & PIN_MAPPABLE && !i915_vma_is_map_and_fenceable(vma)) return true; if (flags & PIN_OFFSET_BIAS && @@ -3736,10 +3731,10 @@ void __i915_vma_set_map_and_fenceable(struct i915_vma *vma) u32 fence_size, fence_alignment; fence_size = i915_gem_get_ggtt_size(dev_priv, - obj->base.size, + vma->size, i915_gem_object_get_tiling(obj)); fence_alignment = i915_gem_get_ggtt_alignment(dev_priv, - obj->base.size, + vma->size, i915_gem_object_get_tiling(obj), true); @@ -3749,7 +3744,10 @@ void __i915_vma_set_map_and_fenceable(struct i915_vma *vma) mappable = (vma->node.start + fence_size <= dev_priv->ggtt.mappable_end); - obj->map_and_fenceable = mappable && fenceable; + if (mappable && fenceable) + vma->flags |= I915_VMA_CAN_FENCE; + else + vma->flags &= ~I915_VMA_CAN_FENCE; } int __i915_vma_do_pin(struct i915_vma *vma, @@ -3809,12 +3807,11 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, WARN(i915_vma_is_pinned(vma), "bo is already pinned in ggtt with incorrect alignment:" - " offset=%08x, req.alignment=%llx, req.map_and_fenceable=%d," - " obj->map_and_fenceable=%d\n", - i915_ggtt_offset(vma), - alignment, + " offset=%08x, req.alignment=%llx," + " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n", + i915_ggtt_offset(vma), alignment, !!(flags & PIN_MAPPABLE), - obj->map_and_fenceable); + i915_vma_is_map_and_fenceable(vma)); ret = i915_vma_unbind(vma); if (ret) return ERR_PTR(ret); diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 98d5aa5e0a34..28888d608914 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -857,7 +857,6 @@ static bool eb_vma_misplaced(struct i915_vma *vma) { struct drm_i915_gem_exec_object2 *entry = vma->exec_entry; - struct drm_i915_gem_object *obj = 
vma->obj; WARN_ON(entry->flags & __EXEC_OBJECT_NEEDS_MAP && !i915_vma_is_ggtt(vma)); @@ -878,7 +877,8 @@ eb_vma_misplaced(struct i915_vma *vma) return true; /* avoid costly ping-pong once a batch bo ended up non-mappable */ - if (entry->flags & __EXEC_OBJECT_NEEDS_MAP && !obj->map_and_fenceable) + if (entry->flags & __EXEC_OBJECT_NEEDS_MAP && + !i915_vma_is_map_and_fenceable(vma)) return !only_mappable_for_reloc(entry->flags); if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0 && diff --git a/drivers/gpu/drm/i915/i915_gem_fence.c b/drivers/gpu/drm/i915/i915_gem_fence.c index b0c6c2777725..e15365be4045 100644 --- a/drivers/gpu/drm/i915/i915_gem_fence.c +++ b/drivers/gpu/drm/i915/i915_gem_fence.c @@ -130,7 +130,9 @@ static void i915_write_fence_reg(struct drm_device *dev, int reg, !is_power_of_2(vma->node.size) || (vma->node.start & (vma->node.size - 1)), "object 0x%08llx [fenceable? %d] not 1M or pot-size (0x%08llx) aligned\n", - vma->node.start, obj->map_and_fenceable, vma->node.size); + vma->node.start, + i915_vma_is_map_and_fenceable(vma), + vma->node.size); if (tiling == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev)) tile_width = 128; @@ -389,9 +391,6 @@ i915_gem_object_get_fence(struct drm_i915_gem_object *obj) return 0; } } else if (enable) { - if (WARN_ON(!obj->map_and_fenceable)) - return -EINVAL; - reg = i915_find_fence_reg(dev); if (IS_ERR(reg)) return PTR_ERR(reg); diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 3631944ac2d9..e31f98df26f6 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -3671,7 +3671,7 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma) assert_rpm_wakelock_held(to_i915(vma->vm->dev)); lockdep_assert_held(&vma->vm->dev->struct_mutex); - if (WARN_ON(!vma->obj->map_and_fenceable)) + if (WARN_ON(!i915_vma_is_map_and_fenceable(vma))) return IO_ERR_PTR(-ENODEV); GEM_BUG_ON(!i915_vma_is_ggtt(vma)); diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h index d6e4b6529196..d7ff78b46266 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.h +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h @@ -197,8 +197,9 @@ struct i915_vma { #define I915_VMA_LOCAL_BIND BIT(7) #define I915_VMA_BIND_MASK (I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND | I915_VMA_PIN_OVERFLOW) -#define I915_VMA_GGTT BIT(8) -#define I915_VMA_CLOSED BIT(9) +#define I915_VMA_GGTT BIT(8) +#define I915_VMA_CAN_FENCE BIT(9) +#define I915_VMA_CLOSED BIT(10) unsigned int active; struct i915_gem_active last_read[I915_NUM_ENGINES]; @@ -239,6 +240,11 @@ static inline bool i915_vma_is_ggtt(const struct i915_vma *vma) return vma->flags & I915_VMA_GGTT; } +static inline bool i915_vma_is_map_and_fenceable(const struct i915_vma *vma) +{ + return vma->flags & I915_VMA_CAN_FENCE; +} + static inline bool i915_vma_is_closed(const struct i915_vma *vma) { return vma->flags & I915_VMA_CLOSED; diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c index bfefb63a55ef..af70d4460a9e 100644 --- a/drivers/gpu/drm/i915/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/i915_gem_tiling.c @@ -134,7 +134,7 @@ i915_gem_object_fence_prepare(struct drm_i915_gem_object *obj, int tiling_mode) if (!vma) return 0; - if (!obj->map_and_fenceable) + if (!i915_vma_is_map_and_fenceable(vma)) return 0; if (IS_GEN3(dev_priv)) { @@ -145,7 +145,7 @@ i915_gem_object_fence_prepare(struct drm_i915_gem_object *obj, int tiling_mode) goto bad; } - size = i915_gem_get_ggtt_size(dev_priv, obj->base.size, 
tiling_mode); + size = i915_gem_get_ggtt_size(dev_priv, vma->size, tiling_mode); if (vma->node.size < size) goto bad; diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 31eaeedfad30..04a8900f68c1 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2224,7 +2224,7 @@ intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, unsigned int rotation) * framebuffer compression. For simplicity, we always install * a fence as the cost is not that onerous. */ - if (view.type == I915_GGTT_VIEW_NORMAL) { + if (i915_vma_is_map_and_fenceable(vma)) { ret = i915_gem_object_get_fence(obj); if (ret == -EDEADLK) { /* @@ -2262,11 +2262,11 @@ void intel_unpin_fb_obj(struct drm_framebuffer *fb, unsigned int rotation) WARN_ON(!mutex_is_locked(&obj->base.dev->struct_mutex)); intel_fill_fb_ggtt_view(&view, fb, rotation); + vma = i915_gem_object_to_ggtt(obj, &view); - if (view.type == I915_GGTT_VIEW_NORMAL) + if (i915_vma_is_map_and_fenceable(vma)) i915_gem_object_unpin_fence(obj); - vma = i915_gem_object_to_ggtt(obj, &view); i915_gem_object_unpin_from_display_plane(vma); } From 9d80841ea4c9df420da3f9a61a819d09a03f2161 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:56 +0100 Subject: [PATCH 109/141] drm/i915: Allow ringbuffers to be bound anywhere Now that we have WC vmapping available, we can bind our rings anywhere in the GGTT and do not need to restrict them to the mappable region. Except for stolen objects, for which direct access is verboten and we must use the mappable aperture. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-17-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/intel_ringbuffer.c | 15 ++++++++------- drivers/gpu/drm/i915/intel_ringbuffer.h | 1 - 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 12703ea27259..21bfc69e9dc6 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1892,17 +1892,20 @@ int intel_ring_pin(struct intel_ring *ring) { /* Ring wraparound at offset 0 sometimes hangs. No idea why. */ unsigned int flags = PIN_GLOBAL | PIN_OFFSET_BIAS | 4096; + enum i915_map_type map; struct i915_vma *vma = ring->vma; void *addr; int ret; GEM_BUG_ON(ring->vaddr); - if (ring->needs_iomap) + map = HAS_LLC(ring->engine->i915) ? 
I915_MAP_WB : I915_MAP_WC; + + if (vma->obj->stolen) flags |= PIN_MAPPABLE; if (!(vma->flags & I915_VMA_GLOBAL_BIND)) { - if (flags & PIN_MAPPABLE) + if (flags & PIN_MAPPABLE || map == I915_MAP_WC) ret = i915_gem_object_set_to_gtt_domain(vma->obj, true); else ret = i915_gem_object_set_to_cpu_domain(vma->obj, true); @@ -1914,10 +1917,10 @@ int intel_ring_pin(struct intel_ring *ring) if (unlikely(ret)) return ret; - if (flags & PIN_MAPPABLE) + if (i915_vma_is_map_and_fenceable(vma)) addr = (void __force *)i915_vma_pin_iomap(vma); else - addr = i915_gem_object_pin_map(vma->obj, I915_MAP_WB); + addr = i915_gem_object_pin_map(vma->obj, map); if (IS_ERR(addr)) goto err; @@ -1934,7 +1937,7 @@ void intel_ring_unpin(struct intel_ring *ring) GEM_BUG_ON(!ring->vma); GEM_BUG_ON(!ring->vaddr); - if (ring->needs_iomap) + if (i915_vma_is_map_and_fenceable(ring->vma)) i915_vma_unpin_iomap(ring->vma); else i915_gem_object_unpin_map(ring->vma->obj); @@ -2005,8 +2008,6 @@ intel_engine_create_ring(struct intel_engine_cs *engine, int size) return ERR_CAST(vma); } ring->vma = vma; - if (!HAS_LLC(engine->i915) || vma->obj->stolen) - ring->needs_iomap = true; list_add(&ring->link, &engine->buffers); return ring; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 86612d502c65..84aea549de5d 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -96,7 +96,6 @@ struct intel_ring { int space; int size; int effective_size; - bool needs_iomap; /** We track the position of the requests in the ring buffer, and * when each is retired we increment last_retired_head as the GPU From c58b735fc762e891481e92af7124b85cb0a51fce Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:57 +0100 Subject: [PATCH 110/141] drm/i915: Allocate rings from stolen If we have stolen available, make use of it for ringbuffer allocation. Previously this was restricted to !llc platforms, as writing to stolen requires a GGTT mapping - but now that we have partial mappable support, the mappable aperture isn't quite so precious so we can use it more freely and ringbuffers are a good user for the otherwise wasted stolen. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-18-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/intel_ringbuffer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 21bfc69e9dc6..cc5bcd14b6df 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1952,10 +1952,8 @@ intel_ring_create_vma(struct drm_i915_private *dev_priv, int size) struct drm_i915_gem_object *obj; struct i915_vma *vma; - obj = ERR_PTR(-ENODEV); - if (!HAS_LLC(dev_priv)) - obj = i915_gem_object_create_stolen(&dev_priv->drm, size); - if (IS_ERR(obj)) + obj = i915_gem_object_create_stolen(&dev_priv->drm, size); + if (!obj) obj = i915_gem_object_create(&dev_priv->drm, size); if (IS_ERR(obj)) return ERR_CAST(obj); From 364c8172edb5ff0b4650bbd4c45eaab46b84b008 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:58 +0100 Subject: [PATCH 111/141] drm/i915/userptr: Make gup errors stickier Keep any error reported by the gup_worker until we are notified that the arena has changed (via the mmu-notifier). 
This ensures that two consecutive calls to i915_gem_object_get_pages() report the same error, and curtails a loop of detecting a fault and requeueing a gup_worker. Signed-off-by: Chris Wilson Reviewed-by: Mika Kuoppala Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-19-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_userptr.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 57218cca7e05..be54825ef3e8 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -542,8 +542,6 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) } } obj->userptr.work = ERR_PTR(ret); - if (ret) - __i915_gem_userptr_set_active(obj, false); } obj->userptr.workers--; @@ -628,15 +626,14 @@ i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj) * to the vma (discard or cloning) which should prevent the more * egregious cases from causing harm. */ - if (IS_ERR(obj->userptr.work)) { - /* active flag will have been dropped already by the worker */ - ret = PTR_ERR(obj->userptr.work); - obj->userptr.work = NULL; - return ret; - } - if (obj->userptr.work) + + if (obj->userptr.work) { /* active flag should still be held for the pending work */ - return -EAGAIN; + if (IS_ERR(obj->userptr.work)) + return PTR_ERR(obj->userptr.work); + else + return -EAGAIN; + } /* Let the mmu-notifier know that we have begun and need cancellation */ ret = __i915_gem_userptr_set_active(obj, true); From a1e5afbe4d5b6a0b1e3ffb32ec11dd51887ca7a3 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:16:59 +0100 Subject: [PATCH 112/141] drm/i915: Rename fence.lru_list to link Our current practice is to only name the actual list (here dev_priv->fence_list) using "list", and elements upon that list are referred to as "link". Further, the lru nature belongs to the list and not to the node, and including it in the name does not disambiguate the link from anything else. 
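The rule generalises: a toy example, using an invented ex_list_head in place of the kernel's struct list_head, shows the container naming the list while each element merely embeds a link:

#include <stdio.h>

/* Minimal stand-in for the kernel's struct list_head, just to show
 * the naming convention: the container owns "fence_list", each
 * element embeds a "link". All names here are illustrative.
 */
struct ex_list_head {
	struct ex_list_head *next, *prev;
};

static void ex_list_init(struct ex_list_head *h)
{
	h->next = h->prev = h;
}

static void ex_list_add_tail(struct ex_list_head *n, struct ex_list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

struct ex_fence {
	struct ex_list_head link; /* element side: "link" */
	int id;
};

struct ex_device {
	struct ex_list_head fence_list; /* container side: "*_list" */
};

int main(void)
{
	struct ex_device dev;
	struct ex_fence fence = { .id = 0 };

	ex_list_init(&dev.fence_list);
	ex_list_init(&fence.link);
	ex_list_add_tail(&fence.link, &dev.fence_list);

	printf("fence %d is on the device list\n", fence.id);
	return 0;
}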
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-20-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 2 +- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +- drivers/gpu/drm/i915/i915_gem_fence.c | 9 ++++----- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index b3623945b555..67ece6db60d9 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -461,7 +461,7 @@ struct intel_overlay_error_state; #define I915_MAX_NUM_FENCE_BITS 6 struct drm_i915_fence_reg { - struct list_head lru_list; + struct list_head link; struct drm_i915_gem_object *obj; int pin_count; }; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 1f6312ca646c..dd68f0c15801 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -4553,7 +4553,7 @@ i915_gem_load_init(struct drm_device *dev) for (i = 0; i < I915_NUM_ENGINES; i++) init_engine_lists(&dev_priv->engine[i]); for (i = 0; i < I915_MAX_NUM_FENCES; i++) - INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list); + INIT_LIST_HEAD(&dev_priv->fence_regs[i].link); INIT_DELAYED_WORK(&dev_priv->gt.retire_work, i915_gem_retire_work_handler); INIT_DELAYED_WORK(&dev_priv->gt.idle_work, diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 28888d608914..58cebafa8348 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1310,7 +1310,7 @@ void i915_vma_move_to_active(struct i915_vma *vma, if (flags & __EXEC_OBJECT_HAS_FENCE) { struct drm_i915_private *dev_priv = req->i915; - list_move_tail(&dev_priv->fence_regs[obj->fence_reg].lru_list, + list_move_tail(&dev_priv->fence_regs[obj->fence_reg].link, &dev_priv->mm.fence_list); } } diff --git a/drivers/gpu/drm/i915/i915_gem_fence.c b/drivers/gpu/drm/i915/i915_gem_fence.c index e15365be4045..1b32351aee42 100644 --- a/drivers/gpu/drm/i915/i915_gem_fence.c +++ b/drivers/gpu/drm/i915/i915_gem_fence.c @@ -245,11 +245,11 @@ static void i915_gem_object_update_fence(struct drm_i915_gem_object *obj, if (enable) { obj->fence_reg = reg; fence->obj = obj; - list_move_tail(&fence->lru_list, &dev_priv->mm.fence_list); + list_move_tail(&fence->link, &dev_priv->mm.fence_list); } else { obj->fence_reg = I915_FENCE_REG_NONE; fence->obj = NULL; - list_del_init(&fence->lru_list); + list_del_init(&fence->link); } obj->fence_dirty = false; } @@ -331,7 +331,7 @@ i915_find_fence_reg(struct drm_device *dev) goto deadlock; /* None available, try to steal one or wait for a user to finish */ - list_for_each_entry(reg, &dev_priv->mm.fence_list, lru_list) { + list_for_each_entry(reg, &dev_priv->mm.fence_list, link) { if (reg->pin_count) continue; @@ -386,8 +386,7 @@ i915_gem_object_get_fence(struct drm_i915_gem_object *obj) if (obj->fence_reg != I915_FENCE_REG_NONE) { reg = &dev_priv->fence_regs[obj->fence_reg]; if (!obj->fence_dirty) { - list_move_tail(®->lru_list, - &dev_priv->mm.fence_list); + list_move_tail(®->link, &dev_priv->mm.fence_list); return 0; } } else if (enable) { From 49ef5294cda256aa5496ba56bbf859d3c7a17e07 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:00 +0100 Subject: [PATCH 113/141] drm/i915: Move fence tracking from object to vma In order to handle tiled partial GTT mmappings, we need to associate the fence with an 
individual vma. v2: A couple of silly drops replaced spotted by Joonas Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-21-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 16 +- drivers/gpu/drm/i915/i915_drv.h | 82 ++-- drivers/gpu/drm/i915/i915_gem.c | 30 +- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 20 +- drivers/gpu/drm/i915/i915_gem_fence.c | 440 ++++++++------------- drivers/gpu/drm/i915/i915_gem_gtt.c | 2 + drivers/gpu/drm/i915/i915_gem_gtt.h | 8 + drivers/gpu/drm/i915/i915_gem_tiling.c | 67 ++-- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- drivers/gpu/drm/i915/intel_display.c | 57 ++- drivers/gpu/drm/i915/intel_fbc.c | 10 +- drivers/gpu/drm/i915/intel_overlay.c | 2 +- 12 files changed, 339 insertions(+), 397 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 36112282f590..d0b4c74974be 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -152,11 +152,9 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj) seq_printf(m, "%x ", i915_gem_active_get_seqno(&obj->last_read[id], &obj->base.dev->struct_mutex)); - seq_printf(m, "] %x %x%s%s%s", + seq_printf(m, "] %x %s%s%s", i915_gem_active_get_seqno(&obj->last_write, &obj->base.dev->struct_mutex), - i915_gem_active_get_seqno(&obj->last_fence, - &obj->base.dev->struct_mutex), i915_cache_level_str(to_i915(obj->base.dev), obj->cache_level), obj->dirty ? " dirty" : "", obj->madv == I915_MADV_DONTNEED ? " purgeable" : ""); @@ -169,8 +167,6 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj) seq_printf(m, " (pinned x %d)", pin_count); if (obj->pin_display) seq_printf(m, " (display)"); - if (obj->fence_reg != I915_FENCE_REG_NONE) - seq_printf(m, " (fence: %d)", obj->fence_reg); list_for_each_entry(vma, &obj->vma_list, obj_link) { if (!drm_mm_node_allocated(&vma->node)) continue; @@ -180,6 +176,10 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj) vma->node.start, vma->node.size); if (i915_vma_is_ggtt(vma)) seq_printf(m, ", type: %u", vma->ggtt_view.type); + if (vma->fence) + seq_printf(m, " , fence: %d%s", + vma->fence->id, + i915_gem_active_isset(&vma->last_fence) ? 
"*" : ""); seq_puts(m, ")"); } if (obj->stolen) @@ -938,14 +938,14 @@ static int i915_gem_fence_regs_info(struct seq_file *m, void *data) seq_printf(m, "Total fences = %d\n", dev_priv->num_fence_regs); for (i = 0; i < dev_priv->num_fence_regs; i++) { - struct drm_i915_gem_object *obj = dev_priv->fence_regs[i].obj; + struct i915_vma *vma = dev_priv->fence_regs[i].vma; seq_printf(m, "Fence %d, pin count = %d, object = ", i, dev_priv->fence_regs[i].pin_count); - if (obj == NULL) + if (!vma) seq_puts(m, "unused"); else - describe_obj(m, obj); + describe_obj(m, vma->obj); seq_putc(m, '\n'); } diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 67ece6db60d9..56d439374fe5 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -455,15 +455,21 @@ struct intel_opregion { struct intel_overlay; struct intel_overlay_error_state; -#define I915_FENCE_REG_NONE -1 -#define I915_MAX_NUM_FENCES 32 -/* 32 fences + sign bit for FENCE_REG_NONE */ -#define I915_MAX_NUM_FENCE_BITS 6 - struct drm_i915_fence_reg { struct list_head link; - struct drm_i915_gem_object *obj; + struct drm_i915_private *i915; + struct i915_vma *vma; int pin_count; + int id; + /** + * Whether the tiling parameters for the currently + * associated fence register have changed. Note that + * for the purposes of tracking tiling changes we also + * treat the unfenced register, the register slot that + * the object occupies whilst it executes a fenced + * command (such as BLT on gen2/3), as a "fence". + */ + bool dirty; }; struct sdvo_device_mapping { @@ -2171,27 +2177,11 @@ struct drm_i915_gem_object { */ unsigned int dirty:1; - /** - * Fence register bits (if any) for this object. Will be set - * as needed when mapped into the GTT. - * Protected by dev->struct_mutex. - */ - signed int fence_reg:I915_MAX_NUM_FENCE_BITS; - /** * Advice: are the backing pages purgeable? */ unsigned int madv:2; - /** - * Whether the tiling parameters for the currently associated fence - * register have changed. Note that for the purposes of tracking - * tiling changes we also treat the unfenced register, the register - * slot that the object occupies whilst it executes a fenced - * command (such as BLT on gen2/3), as a "fence". - */ - unsigned int fence_dirty:1; - /** * Whether the current gtt mapping needs to be mappable (and isn't just * mappable by accident). Track pin and fault separate for a more @@ -2240,7 +2230,6 @@ struct drm_i915_gem_object { */ struct i915_gem_active last_read[I915_NUM_ENGINES]; struct i915_gem_active last_write; - struct i915_gem_active last_fence; /** References from framebuffers, locks out tiling changes. */ unsigned long framebuffer_references; @@ -3343,11 +3332,50 @@ i915_gem_object_ggtt_offset(struct drm_i915_gem_object *o, } /* i915_gem_fence.c */ -int __must_check i915_gem_object_get_fence(struct drm_i915_gem_object *obj); -int __must_check i915_gem_object_put_fence(struct drm_i915_gem_object *obj); +int __must_check i915_vma_get_fence(struct i915_vma *vma); +int __must_check i915_vma_put_fence(struct i915_vma *vma); -bool i915_gem_object_pin_fence(struct drm_i915_gem_object *obj); -void i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj); +/** + * i915_vma_pin_fence - pin fencing state + * @vma: vma to pin fencing for + * + * This pins the fencing state (whether tiled or untiled) to make sure the + * vma (and its object) is ready to be used as a scanout target. 
Fencing + * status must be synchronize first by calling i915_vma_get_fence(): + * + * The resulting fence pin reference must be released again with + * i915_vma_unpin_fence(). + * + * Returns: + * + * True if the vma has a fence, false otherwise. + */ +static inline bool +i915_vma_pin_fence(struct i915_vma *vma) +{ + if (vma->fence) { + vma->fence->pin_count++; + return true; + } else + return false; +} + +/** + * i915_vma_unpin_fence - unpin fencing state + * @vma: vma to unpin fencing for + * + * This releases the fence pin reference acquired through + * i915_vma_pin_fence. It will handle both objects with and without an + * attached fence correctly, callers do not need to distinguish this. + */ +static inline void +i915_vma_unpin_fence(struct i915_vma *vma) +{ + if (vma->fence) { + GEM_BUG_ON(vma->fence->pin_count <= 0); + vma->fence->pin_count--; + } +} void i915_gem_restore_fences(struct drm_device *dev); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index dd68f0c15801..9276c73f1d81 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -829,7 +829,7 @@ i915_gem_gtt_pread(struct drm_device *dev, if (!IS_ERR(vma)) { node.start = i915_ggtt_offset(vma); node.allocated = false; - ret = i915_gem_object_put_fence(obj); + ret = i915_vma_put_fence(vma); if (ret) { i915_vma_unpin(vma); vma = ERR_PTR(ret); @@ -1131,7 +1131,7 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_private *i915, if (!IS_ERR(vma)) { node.start = i915_ggtt_offset(vma); node.allocated = false; - ret = i915_gem_object_put_fence(obj); + ret = i915_vma_put_fence(vma); if (ret) { i915_vma_unpin(vma); vma = ERR_PTR(ret); @@ -1751,7 +1751,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) if (ret) goto err_unpin; - ret = i915_gem_object_get_fence(obj); + ret = i915_vma_get_fence(vma); if (ret) goto err_unpin; @@ -2903,7 +2903,7 @@ int i915_vma_unbind(struct i915_vma *vma) i915_gem_object_finish_gtt(obj); /* release the fence reg _after_ flushing */ - ret = i915_gem_object_put_fence(obj); + ret = i915_vma_put_fence(vma); if (ret) return ret; @@ -3385,9 +3385,11 @@ restart: * dropped the fence as all snoopable access is * supposed to be linear. 
*/ - ret = i915_gem_object_put_fence(obj); - if (ret) - return ret; + list_for_each_entry(vma, &obj->vma_list, obj_link) { + ret = i915_vma_put_fence(vma); + if (ret) + return ret; + } } else { /* We either have incoherent backing store and * so no GTT access or the architecture is fully @@ -4065,14 +4067,12 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj, i915_gem_object_retire__read); init_request_active(&obj->last_write, i915_gem_object_retire__write); - init_request_active(&obj->last_fence, NULL); INIT_LIST_HEAD(&obj->obj_exec_link); INIT_LIST_HEAD(&obj->vma_list); INIT_LIST_HEAD(&obj->batch_pool_link); obj->ops = ops; - obj->fence_reg = I915_FENCE_REG_NONE; obj->madv = I915_MADV_WILLNEED; i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size); @@ -4502,6 +4502,7 @@ void i915_gem_load_init_fences(struct drm_i915_private *dev_priv) { struct drm_device *dev = &dev_priv->drm; + int i; if (INTEL_INFO(dev_priv)->gen >= 7 && !IS_VALLEYVIEW(dev_priv) && !IS_CHERRYVIEW(dev_priv)) @@ -4517,6 +4518,13 @@ i915_gem_load_init_fences(struct drm_i915_private *dev_priv) I915_READ(vgtif_reg(avail_rs.fence_num)); /* Initialize fence registers to zero */ + for (i = 0; i < dev_priv->num_fence_regs; i++) { + struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i]; + + fence->i915 = dev_priv; + fence->id = i; + list_add_tail(&fence->link, &dev_priv->mm.fence_list); + } i915_gem_restore_fences(dev); i915_gem_detect_bit_6_swizzle(dev); @@ -4552,8 +4560,6 @@ i915_gem_load_init(struct drm_device *dev) INIT_LIST_HEAD(&dev_priv->mm.fence_list); for (i = 0; i < I915_NUM_ENGINES; i++) init_engine_lists(&dev_priv->engine[i]); - for (i = 0; i < I915_MAX_NUM_FENCES; i++) - INIT_LIST_HEAD(&dev_priv->fence_regs[i].link); INIT_DELAYED_WORK(&dev_priv->gt.retire_work, i915_gem_retire_work_handler); INIT_DELAYED_WORK(&dev_priv->gt.idle_work, @@ -4563,8 +4569,6 @@ i915_gem_load_init(struct drm_device *dev) dev_priv->relative_constants_mode = I915_EXEC_CONSTANTS_REL_GENERAL; - INIT_LIST_HEAD(&dev_priv->mm.fence_list); - init_waitqueue_head(&dev_priv->pending_flip_queue); dev_priv->mm.interruptible = true; diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 58cebafa8348..907386630e26 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -250,7 +250,6 @@ static void i915_gem_execbuffer_unreserve_vma(struct i915_vma *vma) { struct drm_i915_gem_exec_object2 *entry; - struct drm_i915_gem_object *obj = vma->obj; if (!drm_mm_node_allocated(&vma->node)) return; @@ -258,7 +257,7 @@ i915_gem_execbuffer_unreserve_vma(struct i915_vma *vma) entry = vma->exec_entry; if (entry->flags & __EXEC_OBJECT_HAS_FENCE) - i915_gem_object_unpin_fence(obj); + i915_vma_unpin_fence(vma); if (entry->flags & __EXEC_OBJECT_HAS_PIN) __i915_vma_unpin(vma); @@ -455,7 +454,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj, if (ret) return ERR_PTR(ret); } else { - ret = i915_gem_object_put_fence(obj); + ret = i915_vma_put_fence(vma); if (ret) { i915_vma_unpin(vma); return ERR_PTR(ret); @@ -811,11 +810,11 @@ i915_gem_execbuffer_reserve_vma(struct i915_vma *vma, entry->flags |= __EXEC_OBJECT_HAS_PIN; if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) { - ret = i915_gem_object_get_fence(obj); + ret = i915_vma_get_fence(vma); if (ret) return ret; - if (i915_gem_object_pin_fence(obj)) + if (i915_vma_pin_fence(vma)) entry->flags |= __EXEC_OBJECT_HAS_FENCE; } @@ -1305,15 +1304,8 @@ void i915_vma_move_to_active(struct i915_vma *vma, 
obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS; } - if (flags & EXEC_OBJECT_NEEDS_FENCE) { - i915_gem_active_set(&obj->last_fence, req); - if (flags & __EXEC_OBJECT_HAS_FENCE) { - struct drm_i915_private *dev_priv = req->i915; - - list_move_tail(&dev_priv->fence_regs[obj->fence_reg].link, - &dev_priv->mm.fence_list); - } - } + if (flags & EXEC_OBJECT_NEEDS_FENCE) + i915_gem_active_set(&vma->last_fence, req); i915_vma_set_active(vma, idx); i915_gem_active_set(&vma->last_read[idx], req); diff --git a/drivers/gpu/drm/i915/i915_gem_fence.c b/drivers/gpu/drm/i915/i915_gem_fence.c index 1b32351aee42..dfe0a1a5e584 100644 --- a/drivers/gpu/drm/i915/i915_gem_fence.c +++ b/drivers/gpu/drm/i915/i915_gem_fence.c @@ -55,74 +55,73 @@ * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed. */ -static void i965_write_fence_reg(struct drm_device *dev, int reg, - struct drm_i915_gem_object *obj) +#define pipelined 0 + +static void i965_write_fence_reg(struct drm_i915_fence_reg *fence, + struct i915_vma *vma) { - struct drm_i915_private *dev_priv = to_i915(dev); i915_reg_t fence_reg_lo, fence_reg_hi; int fence_pitch_shift; + u64 val; - if (INTEL_INFO(dev)->gen >= 6) { - fence_reg_lo = FENCE_REG_GEN6_LO(reg); - fence_reg_hi = FENCE_REG_GEN6_HI(reg); + if (INTEL_INFO(fence->i915)->gen >= 6) { + fence_reg_lo = FENCE_REG_GEN6_LO(fence->id); + fence_reg_hi = FENCE_REG_GEN6_HI(fence->id); fence_pitch_shift = GEN6_FENCE_PITCH_SHIFT; + } else { - fence_reg_lo = FENCE_REG_965_LO(reg); - fence_reg_hi = FENCE_REG_965_HI(reg); + fence_reg_lo = FENCE_REG_965_LO(fence->id); + fence_reg_hi = FENCE_REG_965_HI(fence->id); fence_pitch_shift = I965_FENCE_PITCH_SHIFT; } - /* To w/a incoherency with non-atomic 64-bit register updates, - * we split the 64-bit update into two 32-bit writes. In order - * for a partial fence not to be evaluated between writes, we - * precede the update with write to turn off the fence register, - * and only enable the fence as the last step. - * - * For extra levels of paranoia, we make sure each step lands - * before applying the next step. - */ - I915_WRITE(fence_reg_lo, 0); - POSTING_READ(fence_reg_lo); - - if (obj) { - struct i915_vma *vma = i915_gem_object_to_ggtt(obj, NULL); - unsigned int tiling = i915_gem_object_get_tiling(obj); - unsigned int stride = i915_gem_object_get_stride(obj); - u32 size = vma->node.size; - u32 row_size = stride * (tiling == I915_TILING_Y ? 32 : 8); - u64 val; - - /* Adjust fence size to match tiled area */ - size = rounddown(size, row_size); + val = 0; + if (vma) { + unsigned int tiling = i915_gem_object_get_tiling(vma->obj); + bool is_y_tiled = tiling == I915_TILING_Y; + unsigned int stride = i915_gem_object_get_stride(vma->obj); + u32 row_size = stride * (is_y_tiled ? 32 : 8); + u32 size = rounddown((u32)vma->node.size, row_size); val = ((vma->node.start + size - 4096) & 0xfffff000) << 32; val |= vma->node.start & 0xfffff000; val |= (u64)((stride / 128) - 1) << fence_pitch_shift; - if (tiling == I915_TILING_Y) - val |= 1 << I965_FENCE_TILING_Y_SHIFT; + if (is_y_tiled) + val |= BIT(I965_FENCE_TILING_Y_SHIFT); val |= I965_FENCE_REG_VALID; + } - I915_WRITE(fence_reg_hi, val >> 32); - POSTING_READ(fence_reg_hi); + if (!pipelined) { + struct drm_i915_private *dev_priv = fence->i915; - I915_WRITE(fence_reg_lo, val); + /* To w/a incoherency with non-atomic 64-bit register updates, + * we split the 64-bit update into two 32-bit writes. 
In order + * for a partial fence not to be evaluated between writes, we + * precede the update with write to turn off the fence register, + * and only enable the fence as the last step. + * + * For extra levels of paranoia, we make sure each step lands + * before applying the next step. + */ + I915_WRITE(fence_reg_lo, 0); + POSTING_READ(fence_reg_lo); + + I915_WRITE(fence_reg_hi, upper_32_bits(val)); + I915_WRITE(fence_reg_lo, lower_32_bits(val)); POSTING_READ(fence_reg_lo); - } else { - I915_WRITE(fence_reg_hi, 0); - POSTING_READ(fence_reg_hi); } } -static void i915_write_fence_reg(struct drm_device *dev, int reg, - struct drm_i915_gem_object *obj) +static void i915_write_fence_reg(struct drm_i915_fence_reg *fence, + struct i915_vma *vma) { - struct drm_i915_private *dev_priv = to_i915(dev); u32 val; - if (obj) { - struct i915_vma *vma = i915_gem_object_to_ggtt(obj, NULL); - unsigned int tiling = i915_gem_object_get_tiling(obj); - unsigned int stride = i915_gem_object_get_stride(obj); + val = 0; + if (vma) { + unsigned int tiling = i915_gem_object_get_tiling(vma->obj); + bool is_y_tiled = tiling == I915_TILING_Y; + unsigned int stride = i915_gem_object_get_stride(vma->obj); int pitch_val; int tile_width; @@ -134,7 +133,7 @@ static void i915_write_fence_reg(struct drm_device *dev, int reg, i915_vma_is_map_and_fenceable(vma), vma->node.size); - if (tiling == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev)) + if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence->i915)) tile_width = 128; else tile_width = 512; @@ -144,28 +143,32 @@ static void i915_write_fence_reg(struct drm_device *dev, int reg, pitch_val = ffs(pitch_val) - 1; val = vma->node.start; - if (tiling == I915_TILING_Y) - val |= 1 << I830_FENCE_TILING_Y_SHIFT; + if (is_y_tiled) + val |= BIT(I830_FENCE_TILING_Y_SHIFT); val |= I915_FENCE_SIZE_BITS(vma->node.size); val |= pitch_val << I830_FENCE_PITCH_SHIFT; val |= I830_FENCE_REG_VALID; - } else - val = 0; + } - I915_WRITE(FENCE_REG(reg), val); - POSTING_READ(FENCE_REG(reg)); + if (!pipelined) { + struct drm_i915_private *dev_priv = fence->i915; + i915_reg_t reg = FENCE_REG(fence->id); + + I915_WRITE(reg, val); + POSTING_READ(reg); + } } -static void i830_write_fence_reg(struct drm_device *dev, int reg, - struct drm_i915_gem_object *obj) +static void i830_write_fence_reg(struct drm_i915_fence_reg *fence, + struct i915_vma *vma) { - struct drm_i915_private *dev_priv = to_i915(dev); u32 val; - if (obj) { - struct i915_vma *vma = i915_gem_object_to_ggtt(obj, NULL); - unsigned int tiling = i915_gem_object_get_tiling(obj); - unsigned int stride = i915_gem_object_get_stride(obj); + val = 0; + if (vma) { + unsigned int tiling = i915_gem_object_get_tiling(vma->obj); + bool is_y_tiled = tiling == I915_TILING_Y; + unsigned int stride = i915_gem_object_get_stride(vma->obj); u32 pitch_val; WARN((vma->node.start & ~I830_FENCE_START_MASK) || @@ -178,104 +181,102 @@ static void i830_write_fence_reg(struct drm_device *dev, int reg, pitch_val = ffs(pitch_val) - 1; val = vma->node.start; - if (tiling == I915_TILING_Y) - val |= 1 << I830_FENCE_TILING_Y_SHIFT; + if (is_y_tiled) + val |= BIT(I830_FENCE_TILING_Y_SHIFT); val |= I830_FENCE_SIZE_BITS(vma->node.size); val |= pitch_val << I830_FENCE_PITCH_SHIFT; val |= I830_FENCE_REG_VALID; - } else - val = 0; - - I915_WRITE(FENCE_REG(reg), val); - POSTING_READ(FENCE_REG(reg)); -} - -inline static bool i915_gem_object_needs_mb(struct drm_i915_gem_object *obj) -{ - return obj && obj->base.read_domains & I915_GEM_DOMAIN_GTT; -} - -static void i915_gem_write_fence(struct 
drm_device *dev, int reg, - struct drm_i915_gem_object *obj) -{ - struct drm_i915_private *dev_priv = to_i915(dev); - - /* Ensure that all CPU reads are completed before installing a fence - * and all writes before removing the fence. - */ - if (i915_gem_object_needs_mb(dev_priv->fence_regs[reg].obj)) - mb(); - - WARN(obj && - (!i915_gem_object_get_stride(obj) || - !i915_gem_object_get_tiling(obj)), - "bogus fence setup with stride: 0x%x, tiling mode: %i\n", - i915_gem_object_get_stride(obj), - i915_gem_object_get_tiling(obj)); - - if (IS_GEN2(dev)) - i830_write_fence_reg(dev, reg, obj); - else if (IS_GEN3(dev)) - i915_write_fence_reg(dev, reg, obj); - else if (INTEL_INFO(dev)->gen >= 4) - i965_write_fence_reg(dev, reg, obj); - - /* And similarly be paranoid that no direct access to this region - * is reordered to before the fence is installed. - */ - if (i915_gem_object_needs_mb(obj)) - mb(); -} - -static inline int fence_number(struct drm_i915_private *dev_priv, - struct drm_i915_fence_reg *fence) -{ - return fence - dev_priv->fence_regs; -} - -static void i915_gem_object_update_fence(struct drm_i915_gem_object *obj, - struct drm_i915_fence_reg *fence, - bool enable) -{ - struct drm_i915_private *dev_priv = to_i915(obj->base.dev); - int reg = fence_number(dev_priv, fence); - - i915_gem_write_fence(obj->base.dev, reg, enable ? obj : NULL); - - if (enable) { - obj->fence_reg = reg; - fence->obj = obj; - list_move_tail(&fence->link, &dev_priv->mm.fence_list); - } else { - obj->fence_reg = I915_FENCE_REG_NONE; - fence->obj = NULL; - list_del_init(&fence->link); } - obj->fence_dirty = false; + + if (!pipelined) { + struct drm_i915_private *dev_priv = fence->i915; + i915_reg_t reg = FENCE_REG(fence->id); + + I915_WRITE(reg, val); + POSTING_READ(reg); + } } -static inline void i915_gem_object_fence_lost(struct drm_i915_gem_object *obj) +static void fence_write(struct drm_i915_fence_reg *fence, + struct i915_vma *vma) { - if (i915_gem_object_is_tiled(obj)) - i915_gem_release_mmap(obj); - - /* As we do not have an associated fence register, we will force - * a tiling change if we ever need to acquire one. + /* Previous access through the fence register is marshalled by + * the mb() inside the fault handlers (i915_gem_release_mmaps) + * and explicitly managed for internal users. */ - obj->fence_dirty = false; - obj->fence_reg = I915_FENCE_REG_NONE; + + if (IS_GEN2(fence->i915)) + i830_write_fence_reg(fence, vma); + else if (IS_GEN3(fence->i915)) + i915_write_fence_reg(fence, vma); + else + i965_write_fence_reg(fence, vma); + + /* Access through the fenced region afterwards is + * ordered by the posting reads whilst writing the registers. 
+ */ + + fence->dirty = false; } -static int -i915_gem_object_wait_fence(struct drm_i915_gem_object *obj) +static int fence_update(struct drm_i915_fence_reg *fence, + struct i915_vma *vma) { - return i915_gem_active_retire(&obj->last_fence, - &obj->base.dev->struct_mutex); + int ret; + + if (vma) { + if (!i915_vma_is_map_and_fenceable(vma)) + return -EINVAL; + + if (WARN(!i915_gem_object_get_stride(vma->obj) || + !i915_gem_object_get_tiling(vma->obj), + "bogus fence setup with stride: 0x%x, tiling mode: %i\n", + i915_gem_object_get_stride(vma->obj), + i915_gem_object_get_tiling(vma->obj))) + return -EINVAL; + + ret = i915_gem_active_retire(&vma->last_fence, + &vma->obj->base.dev->struct_mutex); + if (ret) + return ret; + } + + if (fence->vma) { + ret = i915_gem_active_retire(&fence->vma->last_fence, + &fence->vma->obj->base.dev->struct_mutex); + if (ret) + return ret; + } + + if (fence->vma && fence->vma != vma) { + /* Ensure that all userspace CPU access is completed before + * stealing the fence. + */ + i915_gem_release_mmap(fence->vma->obj); + + fence->vma->fence = NULL; + fence->vma = NULL; + + list_move(&fence->link, &fence->i915->mm.fence_list); + } + + fence_write(fence, vma); + + if (vma) { + if (fence->vma != vma) { + vma->fence = fence; + fence->vma = vma; + } + + list_move_tail(&fence->link, &fence->i915->mm.fence_list); + } + + return 0; } /** - * i915_gem_object_put_fence - force-remove fence for an object - * @obj: object to map through a fence reg + * i915_vma_put_fence - force-remove fence for a VMA + * @vma: vma to map linearly (not through a fence reg) * * This function force-removes any fence from the given object, which is useful * if the kernel wants to do untiled GTT access. @@ -285,70 +286,40 @@ i915_gem_object_wait_fence(struct drm_i915_gem_object *obj) * 0 on success, negative error code on failure. 
*/ int -i915_gem_object_put_fence(struct drm_i915_gem_object *obj) +i915_vma_put_fence(struct i915_vma *vma) { - struct drm_i915_private *dev_priv = to_i915(obj->base.dev); - struct drm_i915_fence_reg *fence; - int ret; + struct drm_i915_fence_reg *fence = vma->fence; - ret = i915_gem_object_wait_fence(obj); - if (ret) - return ret; - - if (obj->fence_reg == I915_FENCE_REG_NONE) + if (!fence) return 0; - fence = &dev_priv->fence_regs[obj->fence_reg]; - if (fence->pin_count) return -EBUSY; - i915_gem_object_fence_lost(obj); - i915_gem_object_update_fence(obj, fence, false); - - return 0; + return fence_update(fence, NULL); } -static struct drm_i915_fence_reg * -i915_find_fence_reg(struct drm_device *dev) +static struct drm_i915_fence_reg *fence_find(struct drm_i915_private *dev_priv) { - struct drm_i915_private *dev_priv = to_i915(dev); - struct drm_i915_fence_reg *reg, *avail; - int i; + struct drm_i915_fence_reg *fence; - /* First try to find a free reg */ - avail = NULL; - for (i = 0; i < dev_priv->num_fence_regs; i++) { - reg = &dev_priv->fence_regs[i]; - if (!reg->obj) - return reg; - - if (!reg->pin_count) - avail = reg; - } - - if (avail == NULL) - goto deadlock; - - /* None available, try to steal one or wait for a user to finish */ - list_for_each_entry(reg, &dev_priv->mm.fence_list, link) { - if (reg->pin_count) + list_for_each_entry(fence, &dev_priv->mm.fence_list, link) { + if (fence->pin_count) continue; - return reg; + return fence; } -deadlock: /* Wait for completion of pending flips which consume fences */ - if (intel_has_pending_fb_unpin(dev)) + if (intel_has_pending_fb_unpin(&dev_priv->drm)) return ERR_PTR(-EAGAIN); return ERR_PTR(-EDEADLK); } /** - * i915_gem_object_get_fence - set up fencing for an object - * @obj: object to map through a fence reg + * i915_vma_get_fence - set up fencing for a vma + * @vma: vma to map through a fence reg * * When mapping objects through the GTT, userspace wants to be able to write * to them without having to worry about swizzling if the object is tiled. @@ -365,93 +336,27 @@ deadlock: * 0 on success, negative error code on failure. */ int -i915_gem_object_get_fence(struct drm_i915_gem_object *obj) +i915_vma_get_fence(struct i915_vma *vma) { - struct drm_device *dev = obj->base.dev; - struct drm_i915_private *dev_priv = to_i915(dev); - bool enable = i915_gem_object_is_tiled(obj); - struct drm_i915_fence_reg *reg; - int ret; - - /* Have we updated the tiling parameters upon the object and so - * will need to serialise the write to the associated fence register? - */ - if (obj->fence_dirty) { - ret = i915_gem_object_wait_fence(obj); - if (ret) - return ret; - } + struct drm_i915_fence_reg *fence; + struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL; /* Just update our place in the LRU if our fence is getting reused. 
*/ - if (obj->fence_reg != I915_FENCE_REG_NONE) { - reg = &dev_priv->fence_regs[obj->fence_reg]; - if (!obj->fence_dirty) { - list_move_tail(®->link, &dev_priv->mm.fence_list); + if (vma->fence) { + fence = vma->fence; + if (!fence->dirty) { + list_move_tail(&fence->link, + &fence->i915->mm.fence_list); return 0; } - } else if (enable) { - reg = i915_find_fence_reg(dev); - if (IS_ERR(reg)) - return PTR_ERR(reg); - - if (reg->obj) { - struct drm_i915_gem_object *old = reg->obj; - - ret = i915_gem_object_wait_fence(old); - if (ret) - return ret; - - i915_gem_object_fence_lost(old); - } + } else if (set) { + fence = fence_find(to_i915(vma->vm->dev)); + if (IS_ERR(fence)) + return PTR_ERR(fence); } else return 0; - i915_gem_object_update_fence(obj, reg, enable); - - return 0; -} - -/** - * i915_gem_object_pin_fence - pin fencing state - * @obj: object to pin fencing for - * - * This pins the fencing state (whether tiled or untiled) to make sure the - * object is ready to be used as a scanout target. Fencing status must be - * synchronize first by calling i915_gem_object_get_fence(): - * - * The resulting fence pin reference must be released again with - * i915_gem_object_unpin_fence(). - * - * Returns: - * - * True if the object has a fence, false otherwise. - */ -bool -i915_gem_object_pin_fence(struct drm_i915_gem_object *obj) -{ - if (obj->fence_reg != I915_FENCE_REG_NONE) { - to_i915(obj->base.dev)->fence_regs[obj->fence_reg].pin_count++; - return true; - } else - return false; -} - -/** - * i915_gem_object_unpin_fence - unpin fencing state - * @obj: object to unpin fencing for - * - * This releases the fence pin reference acquired through - * i915_gem_object_pin_fence. It will handle both objects with and without an - * attached fence correctly, callers do not need to distinguish this. - */ -void -i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj) -{ - if (obj->fence_reg != I915_FENCE_REG_NONE) { - struct drm_i915_private *dev_priv = to_i915(obj->base.dev); - WARN_ON(dev_priv->fence_regs[obj->fence_reg].pin_count <= 0); - dev_priv->fence_regs[obj->fence_reg].pin_count--; - } + return fence_update(fence, set); } /** @@ -473,12 +378,7 @@ void i915_gem_restore_fences(struct drm_device *dev) * Commit delayed tiling changes if we have an object still * attached to the fence, otherwise just clear the fence. 
*/ - if (reg->obj) { - i915_gem_object_update_fence(reg->obj, reg, - i915_gem_object_get_tiling(reg->obj)); - } else { - i915_gem_write_fence(dev, i, NULL); - } + fence_write(reg, reg->vma); } } diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index e31f98df26f6..a18363a0d8c5 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -3322,6 +3322,7 @@ void i915_vma_destroy(struct i915_vma *vma) GEM_BUG_ON(vma->node.allocated); GEM_BUG_ON(i915_vma_is_active(vma)); GEM_BUG_ON(!i915_vma_is_closed(vma)); + GEM_BUG_ON(vma->fence); list_del(&vma->vm_link); if (!i915_vma_is_ggtt(vma)) @@ -3357,6 +3358,7 @@ __i915_vma_create(struct drm_i915_gem_object *obj, INIT_LIST_HEAD(&vma->exec_list); for (i = 0; i < ARRAY_SIZE(vma->last_read); i++) init_request_active(&vma->last_read[i], i915_vma_retire); + init_request_active(&vma->last_fence, NULL); list_add(&vma->vm_link, &vm->unbound_list); vma->vm = vm; vma->obj = obj; diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h index d7ff78b46266..c88af2ab5538 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.h +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h @@ -38,7 +38,13 @@ #include "i915_gem_request.h" +#define I915_FENCE_REG_NONE -1 +#define I915_MAX_NUM_FENCES 32 +/* 32 fences + sign bit for FENCE_REG_NONE */ +#define I915_MAX_NUM_FENCE_BITS 6 + struct drm_i915_file_private; +struct drm_i915_fence_reg; typedef uint32_t gen6_pte_t; typedef uint64_t gen8_pte_t; @@ -174,6 +180,7 @@ struct i915_vma { struct drm_mm_node node; struct drm_i915_gem_object *obj; struct i915_address_space *vm; + struct drm_i915_fence_reg *fence; struct sg_table *pages; void __iomem *iomap; u64 size; @@ -203,6 +210,7 @@ struct i915_vma { unsigned int active; struct i915_gem_active last_read[I915_NUM_ENGINES]; + struct i915_gem_active last_fence; /** * Support different GGTT views into the same object. diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c index af70d4460a9e..a14b1e3d4c78 100644 --- a/drivers/gpu/drm/i915/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/i915_gem_tiling.c @@ -116,13 +116,39 @@ i915_tiling_ok(struct drm_device *dev, int stride, int size, int tiling_mode) return true; } +static bool i915_vma_fence_prepare(struct i915_vma *vma, int tiling_mode) +{ + struct drm_i915_private *dev_priv = to_i915(vma->vm->dev); + u32 size; + + if (!i915_vma_is_map_and_fenceable(vma)) + return true; + + if (INTEL_GEN(dev_priv) == 3) { + if (vma->node.start & ~I915_FENCE_START_MASK) + return false; + } else { + if (vma->node.start & ~I830_FENCE_START_MASK) + return false; + } + + size = i915_gem_get_ggtt_size(dev_priv, vma->size, tiling_mode); + if (vma->node.size < size) + return false; + + if (vma->node.start & (size - 1)) + return false; + + return true; +} + /* Make the current GTT allocation valid for the change in tiling. 
*/ static int i915_gem_object_fence_prepare(struct drm_i915_gem_object *obj, int tiling_mode) { struct drm_i915_private *dev_priv = to_i915(obj->base.dev); struct i915_vma *vma; - u32 size; + int ret; if (tiling_mode == I915_TILING_NONE) return 0; @@ -130,32 +156,16 @@ i915_gem_object_fence_prepare(struct drm_i915_gem_object *obj, int tiling_mode) if (INTEL_GEN(dev_priv) >= 4) return 0; - vma = i915_gem_object_to_ggtt(obj, NULL); - if (!vma) - return 0; + list_for_each_entry(vma, &obj->vma_list, obj_link) { + if (i915_vma_fence_prepare(vma, tiling_mode)) + continue; - if (!i915_vma_is_map_and_fenceable(vma)) - return 0; - - if (IS_GEN3(dev_priv)) { - if (vma->node.start & ~I915_FENCE_START_MASK) - goto bad; - } else { - if (vma->node.start & ~I830_FENCE_START_MASK) - goto bad; + ret = i915_vma_unbind(vma); + if (ret) + return ret; } - size = i915_gem_get_ggtt_size(dev_priv, vma->size, tiling_mode); - if (vma->node.size < size) - goto bad; - - if (vma->node.start & (size - 1)) - goto bad; - return 0; - -bad: - return i915_vma_unbind(vma); } /** @@ -248,6 +258,8 @@ i915_gem_set_tiling(struct drm_device *dev, void *data, err = i915_gem_object_fence_prepare(obj, args->tiling_mode); if (!err) { + struct i915_vma *vma; + if (obj->pages && obj->madv == I915_MADV_WILLNEED && dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) { @@ -257,11 +269,12 @@ i915_gem_set_tiling(struct drm_device *dev, void *data, i915_gem_object_pin_pages(obj); } - obj->fence_dirty = - !i915_gem_active_is_idle(&obj->last_fence, - &dev->struct_mutex) || - obj->fence_reg != I915_FENCE_REG_NONE; + list_for_each_entry(vma, &obj->vma_list, obj_link) { + if (!vma->fence) + continue; + vma->fence->dirty = true; + } obj->tiling_and_stride = args->stride | args->tiling_mode; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 0c3f30ce85c3..84dd5bc06db3 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -797,7 +797,7 @@ static void capture_bo(struct drm_i915_error_buffer *err, err->gtt_offset = vma->node.start; err->read_domains = obj->base.read_domains; err->write_domain = obj->base.write_domain; - err->fence_reg = obj->fence_reg; + err->fence_reg = vma->fence ? vma->fence->id : -1; err->tiling = i915_gem_object_get_tiling(obj); err->dirty = obj->dirty; err->purgeable = obj->madv != I915_MADV_WILLNEED; diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 04a8900f68c1..c81c89adaff3 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2188,7 +2188,6 @@ intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, unsigned int rotation) struct i915_ggtt_view view; struct i915_vma *vma; u32 alignment; - int ret; WARN_ON(!mutex_is_locked(&dev->struct_mutex)); @@ -2214,43 +2213,33 @@ intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, unsigned int rotation) intel_runtime_pm_get(dev_priv); vma = i915_gem_object_pin_to_display_plane(obj, alignment, &view); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - goto err_pm; - } + if (IS_ERR(vma)) + goto err; - /* Install a fence for tiled scan-out. Pre-i965 always needs a - * fence, whereas 965+ only requires a fence if using - * framebuffer compression. For simplicity, we always install - * a fence as the cost is not that onerous. - */ if (i915_vma_is_map_and_fenceable(vma)) { - ret = i915_gem_object_get_fence(obj); - if (ret == -EDEADLK) { - /* - * -EDEADLK means there are no free fences - * no pending flips. 
- * - * This is propagated to atomic, but it uses - * -EDEADLK to force a locking recovery, so - * change the returned error to -EBUSY. - */ - ret = -EBUSY; - goto err_unpin; - } else if (ret) - goto err_unpin; - - i915_gem_object_pin_fence(obj); + /* Install a fence for tiled scan-out. Pre-i965 always needs a + * fence, whereas 965+ only requires a fence if using + * framebuffer compression. For simplicity, we always, when + * possible, install a fence as the cost is not that onerous. + * + * If we fail to fence the tiled scanout, then either the + * modeset will reject the change (which is highly unlikely as + * the affected systems, all but one, do not have unmappable + * space) or we will not be able to enable full powersaving + * techniques (also likely not to apply due to various limits + * FBC and the like impose on the size of the buffer, which + * presumably we violated anyway with this unmappable buffer). + * Anyway, it is presumably better to stumble onwards with + * something and try to run the system in a "less than optimal" + * mode that matches the user configuration. + */ + if (i915_vma_get_fence(vma) == 0) + i915_vma_pin_fence(vma); } +err: intel_runtime_pm_put(dev_priv); return vma; - -err_unpin: - i915_gem_object_unpin_from_display_plane(vma); -err_pm: - intel_runtime_pm_put(dev_priv); - return ERR_PTR(ret); } void intel_unpin_fb_obj(struct drm_framebuffer *fb, unsigned int rotation) @@ -2264,9 +2253,7 @@ void intel_unpin_fb_obj(struct drm_framebuffer *fb, unsigned int rotation) intel_fill_fb_ggtt_view(&view, fb, rotation); vma = i915_gem_object_to_ggtt(obj, &view); - if (i915_vma_is_map_and_fenceable(vma)) - i915_gem_object_unpin_fence(obj); - + i915_vma_unpin_fence(vma); i915_gem_object_unpin_from_display_plane(vma); } diff --git a/drivers/gpu/drm/i915/intel_fbc.c b/drivers/gpu/drm/i915/intel_fbc.c index e122052c4081..40bf2e4c804d 100644 --- a/drivers/gpu/drm/i915/intel_fbc.c +++ b/drivers/gpu/drm/i915/intel_fbc.c @@ -709,6 +709,14 @@ static bool intel_fbc_hw_tracking_covers_screen(struct intel_crtc *crtc) return effective_w <= max_w && effective_h <= max_h; } +/* XXX replace me when we have VMA tracking for intel_plane_state */ +static int get_fence_id(struct drm_framebuffer *fb) +{ + struct i915_vma *vma = i915_gem_object_to_ggtt(intel_fb_obj(fb), NULL); + + return vma && vma->fence ? 
vma->fence->id : I915_FENCE_REG_NONE; +} + static void intel_fbc_update_state_cache(struct intel_crtc *crtc, struct intel_crtc_state *crtc_state, struct intel_plane_state *plane_state) @@ -740,7 +748,7 @@ static void intel_fbc_update_state_cache(struct intel_crtc *crtc, cache->fb.ilk_ggtt_offset = i915_gem_object_ggtt_offset(obj, NULL); cache->fb.pixel_format = fb->pixel_format; cache->fb.stride = fb->pitches[0]; - cache->fb.fence_reg = obj->fence_reg; + cache->fb.fence_reg = get_fence_id(fb); cache->fb.tiling_mode = i915_gem_object_get_tiling(obj); } diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c index 72f8990a13d2..3cf8d02064a8 100644 --- a/drivers/gpu/drm/i915/intel_overlay.c +++ b/drivers/gpu/drm/i915/intel_overlay.c @@ -760,7 +760,7 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay, if (IS_ERR(vma)) return PTR_ERR(vma); - ret = i915_gem_object_put_fence(new_bo); + ret = i915_vma_put_fence(vma); if (ret) goto out_unpin; From 03af84fe7f48937941fbb4dcd6cf66c68ae0edd4 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:01 +0100 Subject: [PATCH 114/141] drm/i915: Choose partial chunksize based on tile row size In order to support setting up fences for partial mappings of an object, we have to align those mappings with the fence. The minimum chunksize we choose is at least the size of a single tile row. v2: Make minimum chunk size a define for later use Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-22-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 9276c73f1d81..d29bbc0296fb 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1669,6 +1669,16 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data, return 0; } +static unsigned int tile_row_pages(struct drm_i915_gem_object *obj) +{ + u64 size; + + size = i915_gem_object_get_stride(obj); + size *= i915_gem_object_get_tiling(obj) == I915_TILING_Y ? 32 : 8; + + return size >> PAGE_SHIFT; +} + /** * i915_gem_fault - fault a page into the GTT * @area: CPU VMA in question @@ -1687,6 +1697,7 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data, */ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) { +#define MIN_CHUNK_PAGES ((1 << 20) >> PAGE_SHIFT) /* 1 MiB */ struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data); struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = to_i915(dev); @@ -1728,7 +1739,11 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) /* Use a partial view if the object is bigger than the aperture. */ if (obj->base.size >= ggtt->mappable_end && !i915_gem_object_is_tiled(obj)) { - static const unsigned int chunk_size = 256; // 1 MiB + unsigned int chunk_size; + + chunk_size = MIN_CHUNK_PAGES; + if (i915_gem_object_is_tiled(obj)) + chunk_size = max(chunk_size, tile_row_pages(obj)); memset(&view, 0, sizeof(view)); view.type = I915_GGTT_VIEW_PARTIAL; From a61007a83a4671da77210790997d5c8c92ed87ea Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:02 +0100 Subject: [PATCH 115/141] drm/i915: Fix partial GGTT faulting We want to always use the partial VMA as a fallback for a failure to bind the object into the GGTT. 
This extends the support for partial objects in the GGTT to cover everything, not just objects too large. v2: Call the partial view "view", not "partial". Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-23-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 61 ++++++++++++++++----------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d29bbc0296fb..39236b489de8 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1702,7 +1702,6 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = to_i915(dev); struct i915_ggtt *ggtt = &dev_priv->ggtt; - struct i915_ggtt_view view = i915_ggtt_view_normal; bool write = !!(vmf->flags & FAULT_FLAG_WRITE); struct i915_vma *vma; pgoff_t page_offset; @@ -1736,11 +1735,14 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) goto err_unlock; } - /* Use a partial view if the object is bigger than the aperture. */ - if (obj->base.size >= ggtt->mappable_end && - !i915_gem_object_is_tiled(obj)) { + /* Now pin it into the GTT as needed */ + vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, + PIN_MAPPABLE | PIN_NONBLOCK); + if (IS_ERR(vma)) { + struct i915_ggtt_view view; unsigned int chunk_size; + /* Use a partial view if it is bigger than available space */ chunk_size = MIN_CHUNK_PAGES; if (i915_gem_object_is_tiled(obj)) chunk_size = max(chunk_size, tile_row_pages(obj)); @@ -1749,14 +1751,12 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) view.type = I915_GGTT_VIEW_PARTIAL; view.params.partial.offset = rounddown(page_offset, chunk_size); view.params.partial.size = - min_t(unsigned int, - chunk_size, + min_t(unsigned int, chunk_size, (area->vm_end - area->vm_start) / PAGE_SIZE - view.params.partial.offset); - } - /* Now pin it into the GTT if needed */ - vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, PIN_MAPPABLE); + vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, PIN_MAPPABLE); + } if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto err_unlock; @@ -1774,26 +1774,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) pfn = ggtt->mappable_base + i915_ggtt_offset(vma); pfn >>= PAGE_SHIFT; - if (unlikely(view.type == I915_GGTT_VIEW_PARTIAL)) { - /* Overriding existing pages in partial view does not cause - * us any trouble as TLBs are still valid because the fault - * is due to userspace losing part of the mapping or never - * having accessed it before (at this partials' range).
- */ - unsigned long base = area->vm_start + - (view.params.partial.offset << PAGE_SHIFT); - unsigned int i; - - for (i = 0; i < view.params.partial.size; i++) { - ret = vm_insert_pfn(area, - base + i * PAGE_SIZE, - pfn + i); - if (ret) - break; - } - - obj->fault_mappable = true; - } else { + if (vma->ggtt_view.type == I915_GGTT_VIEW_NORMAL) { if (!obj->fault_mappable) { unsigned long size = min_t(unsigned long, @@ -1809,13 +1790,31 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) if (ret) break; } - - obj->fault_mappable = true; } else ret = vm_insert_pfn(area, (unsigned long)vmf->virtual_address, pfn + page_offset); + } else { + /* Overriding existing pages in partial view does not cause + * us any trouble as TLBs are still valid because the fault + * is due to userspace losing part of the mapping or never + * having accessed it before (at this partials' range). + */ + const struct i915_ggtt_view *view = &vma->ggtt_view; + unsigned long base = area->vm_start + + (view->params.partial.offset << PAGE_SHIFT); + unsigned int i; + + for (i = 0; i < view->params.partial.size; i++) { + ret = vm_insert_pfn(area, + base + i * PAGE_SIZE, + pfn + i); + if (ret) + break; + } } + + obj->fault_mappable = true; err_unpin: __i915_vma_unpin(vma); err_unlock: From aa136d9d72c2f9c4925ff86eebc86748fa069beb Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:03 +0100 Subject: [PATCH 116/141] drm/i915: Convert partial ggtt vma to full ggtt if it spans the entire object If we want to create a partial vma from a chunk that is the same size as the object, create a normal ggtt vma instead. The benefit is that it will match future requests for the normal ggtt. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-24-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 39236b489de8..e4c18c71576f 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1755,6 +1755,12 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) (area->vm_end - area->vm_start) / PAGE_SIZE - view.params.partial.offset); + /* If the partial covers the entire object, just create a + * normal VMA. + */ + if (chunk_size >= obj->base.size >> PAGE_SHIFT) + view.type = I915_GGTT_VIEW_NORMAL; + vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, PIN_MAPPABLE); } if (IS_ERR(vma)) { From 50349247ea807ad0950bbcedb1abb576e6a785db Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:04 +0100 Subject: [PATCH 117/141] drm/i915: Drop ORIGIN_GTT for untracked GTT writes If FBC is set on a framebuffer that is unmapped, all GTT faults will be from a partial mapping. Writes by the user through the partial VMA are then untracked by the FBC and so we must use the ORIGIN_CPU when flushing the I915_GEM_DOMAIN_GTT. 
v2: Keep ORIGIN_CPU for set-to-domain(.write=CPU) Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Daniel Vetter Cc: "Zanoni, Paulo R" Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-25-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 56d439374fe5..9386523464ea 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2198,6 +2198,7 @@ struct drm_i915_gem_object { unsigned int cache_dirty:1; atomic_t frontbuffer_bits; + unsigned int frontbuffer_ggtt_origin; /* write once */ /** Current tiling stride for the object, if it's tiled. */ unsigned int tiling_and_stride; @@ -2205,7 +2206,6 @@ struct drm_i915_gem_object { #define TILING_MASK (FENCE_MINIMUM_STRIDE-1) #define STRIDE_MASK (~TILING_MASK) - unsigned int has_wc_mmap; /** Count of VMA actually bound by this object */ unsigned int bind_count; unsigned int pin_display; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index e4c18c71576f..9811980d6837 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1497,8 +1497,8 @@ err: static inline enum fb_op_origin write_origin(struct drm_i915_gem_object *obj, unsigned domain) { - return domain == I915_GEM_DOMAIN_GTT && !obj->has_wc_mmap ? - ORIGIN_GTT : ORIGIN_CPU; + return (domain == I915_GEM_DOMAIN_GTT ? + obj->frontbuffer_ggtt_origin : ORIGIN_CPU); } /** @@ -1658,7 +1658,7 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data, up_write(&mm->mmap_sem); /* This may race, but that's ok, it only gets set */ - WRITE_ONCE(obj->has_wc_mmap, true); + WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU); } i915_gem_object_put_unlocked(obj); if (IS_ERR((void *)addr)) @@ -1761,6 +1761,11 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) if (chunk_size >= obj->base.size >> PAGE_SHIFT) view.type = I915_GGTT_VIEW_NORMAL; + /* Userspace is now writing through an untracked VMA, abandon + * all hope that the hardware is able to track future writes. + */ + obj->frontbuffer_ggtt_origin = ORIGIN_CPU; + vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, PIN_MAPPABLE); } if (IS_ERR(vma)) { @@ -4093,6 +4098,7 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj, obj->ops = ops; + obj->frontbuffer_ggtt_origin = ORIGIN_GTT; obj->madv = I915_MADV_WILLNEED; i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size); From 821188778b9be2050d45490c4b2b009d51f041e0 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:05 +0100 Subject: [PATCH 118/141] drm/i915: Choose not to evict faultable objects from the GGTT Oftentimes we do not want to evict mapped objects from the GGTT as these are quite expensive to tear down and frequently reused (causing an equally, if not more, expensive setup). In particular, when faulting in a new object we want to avoid evicting an active object, or else we may trigger a page-fault-of-doom as we ping-pong between evicting two objects.
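As an aside for readers following the fault-path changes across the last few patches, the partial-view arithmetic and the eviction threshold are easy to model outside the kernel. The following standalone C is an illustration only: the stride, object size and fault offset are invented, and the clamp against the CPU VMA is simplified to a clamp against the object size.

#include <stdio.h>

#define PAGE_SHIFT 12
#define MIN_CHUNK_PAGES ((1u << 20) >> PAGE_SHIFT) /* 1 MiB worth of pages */

int main(void)
{
    unsigned int stride = 16384;     /* bytes per row (assumed) */
    unsigned int tile_row_pages = (stride * 32) >> PAGE_SHIFT; /* Y tiling: 32 rows */
    unsigned int obj_pages = 2048;   /* an 8 MiB object (assumed) */
    unsigned int page_offset = 1234; /* faulting page within the object */

    /* the chunk is at least 1 MiB and covers whole tile rows */
    unsigned int chunk = MIN_CHUNK_PAGES > tile_row_pages ?
                         MIN_CHUNK_PAGES : tile_row_pages;
    unsigned int offset = page_offset / chunk * chunk; /* rounddown() */
    unsigned int size = chunk < obj_pages - offset ?
                        chunk : obj_pages - offset;

    printf("partial view: offset %u, size %u (pages)\n", offset, size);
    if (chunk >= obj_pages)
        printf("chunk covers the whole object: use a normal view\n");
    if (obj_pages > 2 * MIN_CHUNK_PAGES)
        printf("big object: first try PIN_NONBLOCK | PIN_NONFAULT\n");
    return 0;
}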
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-26-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 12 ++++++++++-- drivers/gpu/drm/i915/i915_gem_evict.c | 7 +++++-- drivers/gpu/drm/i915/i915_gem_gtt.h | 1 + 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 9811980d6837..a13290a8ba30 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1706,6 +1706,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) struct i915_vma *vma; pgoff_t page_offset; unsigned long pfn; + unsigned int flags; int ret; /* We don't use vmf->pgoff since that has the fake offset */ @@ -1735,9 +1736,16 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) goto err_unlock; } + /* If the object is smaller than a couple of partial vma, it is + * not worth only creating a single partial vma - we may as well + * clear enough space for the full object. + */ + flags = PIN_MAPPABLE; + if (obj->base.size > 2 * MIN_CHUNK_PAGES << PAGE_SHIFT) + flags |= PIN_NONBLOCK | PIN_NONFAULT; + /* Now pin it into the GTT as needed */ - vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, - PIN_MAPPABLE | PIN_NONBLOCK); + vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, flags); if (IS_ERR(vma)) { struct i915_ggtt_view view; unsigned int chunk_size; diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c index f76c06e92677..815d5fbe07ac 100644 --- a/drivers/gpu/drm/i915/i915_gem_evict.c +++ b/drivers/gpu/drm/i915/i915_gem_evict.c @@ -47,7 +47,7 @@ gpu_is_idle(struct drm_i915_private *dev_priv) } static bool -mark_free(struct i915_vma *vma, struct list_head *unwind) +mark_free(struct i915_vma *vma, unsigned int flags, struct list_head *unwind) { if (i915_vma_is_pinned(vma)) return false; @@ -55,6 +55,9 @@ mark_free(struct i915_vma *vma, struct list_head *unwind) if (WARN_ON(!list_empty(&vma->exec_list))) return false; + if (flags & PIN_NONFAULT && vma->obj->fault_mappable) + return false; + list_add(&vma->exec_list, unwind); return drm_mm_scan_add_block(&vma->node); } @@ -129,7 +132,7 @@ search_again: phase = phases; do { list_for_each_entry(vma, *phase, vm_link) - if (mark_free(vma, &eviction_list)) + if (mark_free(vma, flags, &eviction_list)) goto found; } while (*++phase); diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h index c88af2ab5538..9248a0e0b286 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.h +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h @@ -637,6 +637,7 @@ void i915_gem_gtt_finish_object(struct drm_i915_gem_object *obj); #define PIN_NONBLOCK BIT(0) #define PIN_MAPPABLE BIT(1) #define PIN_ZONE_4G BIT(2) +#define PIN_NONFAULT BIT(3) #define PIN_MBZ BIT(5) /* I915_VMA_PIN_OVERFLOW */ #define PIN_GLOBAL BIT(6) /* I915_VMA_GLOBAL_BIND */ From 2efb813d5388e18255c54afac77bd91acd586908 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:06 +0100 Subject: [PATCH 119/141] drm/i915: Fallback to using unmappable memory for scanout The existing ABI says that scanouts are pinned into the mappable region so that legacy clients (e.g. old Xorg or plymouthd) can write directly into the scanout through a GTT mapping. However if the surface does not fit into the mappable region, we are better off just trying to fit it anywhere and hoping for the best. 
(Any userspace that is capable of using ginormous scanouts is also likely not to rely on pure GTT updates.) With the partial vma fault support, we are no longer restricted to only using scanouts that we can pin (though it is still preferred for performance reasons and for powersaving features like FBC). v2: Skip fence pinning when not mappable. v3: Add a comment to explain the possible ramifications of not being able to use fences for unmappable scanouts. v4: Rebase to skip over some local patches v5: Rebase to defer until after we have unmappable GTT fault support Signed-off-by: Chris Wilson Cc: Deepak S Cc: Damien Lespiau Cc: Daniel Vetter Cc: Jani Nikula Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-27-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 14 ++++++++++---- drivers/gpu/drm/i915/intel_fbc.c | 4 ++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index a13290a8ba30..69a994bd6afd 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3581,11 +3581,17 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, /* As the user may map the buffer once pinned in the display plane * (e.g. libkms for the bootup splash), we have to ensure that we - * always use map_and_fenceable for all scanout buffers. + * always use map_and_fenceable for all scanout buffers. However, + * it may simply be too big to fit into mappable, in which case + * put it anyway and hope that userspace can cope (but always first + * try to preserve the existing ABI). */ - vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, - view->type == I915_GGTT_VIEW_NORMAL ? - PIN_MAPPABLE : 0); + vma = ERR_PTR(-ENOSPC); + if (view->type == I915_GGTT_VIEW_NORMAL) + vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, + PIN_MAPPABLE | PIN_NONBLOCK); + if (IS_ERR(vma)) + vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, 0); if (IS_ERR(vma)) goto err_unpin_display; diff --git a/drivers/gpu/drm/i915/intel_fbc.c b/drivers/gpu/drm/i915/intel_fbc.c index 40bf2e4c804d..37415f96f906 100644 --- a/drivers/gpu/drm/i915/intel_fbc.c +++ b/drivers/gpu/drm/i915/intel_fbc.c @@ -776,6 +776,10 @@ static bool intel_fbc_can_activate(struct intel_crtc *crtc) /* The use of a CPU fence is mandatory in order to detect writes * by the CPU to the scanout and trigger updates to the FBC. + * + * Note that it is possible for a tiled surface to be unmappable (and + * so have no fence associated with it) due to aperture constraints + * at the time of pinning. */ if (cache->fb.tiling_mode != I915_TILING_X || cache->fb.fence_reg == I915_FENCE_REG_NONE) { From d8923dcfa53d59886d432a3fc430e26cb92ce86a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:07 +0100 Subject: [PATCH 120/141] drm/i915: Track display alignment on VMA When using the aliasing ppgtt and pageflipping with the shrinker/eviction active, we note that we often have to rebind the backbuffer before flipping onto the scanout because it has an invalid alignment. If we store the worst-case alignment required for a VMA, we can avoid having to rebind at critical junctures.
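The bookkeeping added here is small enough to model in standalone C. This is a sketch only: the names mirror the diff, but the kernel keeps the pin_display count on the object rather than on the vma.

#include <stdio.h>

struct vma {
    unsigned long long display_alignment; /* worst case seen while pinned */
    int pin_display;
};

static void pin_to_display(struct vma *v, unsigned long long alignment)
{
    v->pin_display++;
    if (alignment > v->display_alignment)
        v->display_alignment = alignment; /* only ever grows while in use */
}

static void unpin_from_display(struct vma *v)
{
    if (--v->pin_display == 0)
        v->display_alignment = 0; /* last user gone, drop the constraint */
}

int main(void)
{
    struct vma v = { 0, 0 };

    pin_to_display(&v, 4096);
    pin_to_display(&v, 256 * 1024); /* a stricter scanout requirement */
    printf("alignment while pinned: %llu\n", v.display_alignment);

    unpin_from_display(&v);
    unpin_from_display(&v);
    printf("alignment after unpin: %llu\n", v.display_alignment);
    return 0;
}

Because i915_vma_insert() now takes the max of the requested alignment and vma->display_alignment, the strictest requirement seen so far wins and the rebind at flip time goes away.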
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-28-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 21 ++++++++------------- drivers/gpu/drm/i915/i915_gem_gtt.h | 1 + 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 69a994bd6afd..021f807d7afc 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3050,7 +3050,6 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags) struct drm_i915_private *dev_priv = to_i915(vma->vm->dev); struct drm_i915_gem_object *obj = vma->obj; u64 start, end; - u64 min_alignment; int ret; GEM_BUG_ON(vma->flags & (I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND)); @@ -3061,17 +3060,10 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags) size = i915_gem_get_ggtt_size(dev_priv, size, i915_gem_object_get_tiling(obj)); - min_alignment = - i915_gem_get_ggtt_alignment(dev_priv, size, - i915_gem_object_get_tiling(obj), - flags & PIN_MAPPABLE); - if (alignment == 0) - alignment = min_alignment; - if (alignment & (min_alignment - 1)) { - DRM_DEBUG("Invalid object alignment requested %llu, minimum %llu\n", - alignment, min_alignment); - return -EINVAL; - } + alignment = max(max(alignment, vma->display_alignment), + i915_gem_get_ggtt_alignment(dev_priv, size, + i915_gem_object_get_tiling(obj), + flags & PIN_MAPPABLE)); start = flags & PIN_OFFSET_BIAS ? flags & PIN_OFFSET_MASK : 0; @@ -3595,6 +3587,8 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, if (IS_ERR(vma)) goto err_unpin_display; + vma->display_alignment = max_t(u64, vma->display_alignment, alignment); + WARN_ON(obj->pin_display > i915_vma_pin_count(vma)); i915_gem_object_flush_cpu_write_domain(obj); @@ -3625,7 +3619,8 @@ i915_gem_object_unpin_from_display_plane(struct i915_vma *vma) if (WARN_ON(vma->obj->pin_display == 0)) return; - vma->obj->pin_display--; + if (--vma->obj->pin_display == 0) + vma->display_alignment = 0; i915_vma_unpin(vma); WARN_ON(vma->obj->pin_display > i915_vma_pin_count(vma)); diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h index 9248a0e0b286..a15cea73f729 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.h +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h @@ -184,6 +184,7 @@ struct i915_vma { struct sg_table *pages; void __iomem *iomap; u64 size; + u64 display_alignment; unsigned int flags; /** From 383d5823e938129fe10c76e0032a22734849049c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:08 +0100 Subject: [PATCH 121/141] drm/i915: Bump the inactive tracking for all VMA accessed We track the LRU access for eviction and bump the last access for the user GGTT on set-to-gtt. When we do so we need to not only bump the primary GGTT VMA but all partials as well. Similarly we want to bump the last access tracking for when unpinning an object from the scanout so that they do not get promptly evicted and hopefully remain available for reuse on the next frame. 
Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-29-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 021f807d7afc..14c624ee6dbf 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3253,6 +3253,24 @@ i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj) I915_GEM_DOMAIN_CPU); } +static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj) +{ + struct i915_vma *vma; + + list_for_each_entry(vma, &obj->vma_list, obj_link) { + if (!i915_vma_is_ggtt(vma)) + continue; + + if (i915_vma_is_active(vma)) + continue; + + if (!drm_mm_node_allocated(&vma->node)) + continue; + + list_move_tail(&vma->vm_link, &vma->vm->inactive_list); + } +} + /** * Moves a single object to the GTT read, and possibly write domain. * @obj: object to act on @@ -3265,7 +3283,6 @@ int i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write) { uint32_t old_write_domain, old_read_domains; - struct i915_vma *vma; int ret; ret = i915_gem_object_wait_rendering(obj, !write); @@ -3315,11 +3332,7 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write) old_write_domain); /* And bump the LRU for this access */ - vma = i915_gem_object_to_ggtt(obj, NULL); - if (vma && - drm_mm_node_allocated(&vma->node) && - !i915_vma_is_active(vma)) - list_move_tail(&vma->vm_link, &vma->vm->inactive_list); + i915_gem_object_bump_inactive_ggtt(obj); return 0; } @@ -3622,6 +3635,10 @@ i915_gem_object_unpin_from_display_plane(struct i915_vma *vma) if (--vma->obj->pin_display == 0) vma->display_alignment = 0; + /* Bump the LRU to try and avoid premature eviction whilst flipping */ + if (!i915_vma_is_active(vma)) + list_move_tail(&vma->vm_link, &vma->vm->inactive_list); + i915_vma_unpin(vma); WARN_ON(vma->obj->pin_display > i915_vma_pin_count(vma)); } From cd3127d684f027f7c85aec57e284dcecd033dccf Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:09 +0100 Subject: [PATCH 122/141] drm/i915: Stop discarding GTT cache-domain on unbind vma Since commit 43566dedde54 ("drm/i915: Broaden application of set-domain(GTT)") we allowed objects to be in the GTT domain, but unbound. Therefore removing the GTT cache domain when removing the GGTT vma is no longer semantically correct. An unfortunate side-effect is we lose the wondrously named i915_gem_object_finish_gtt(), not to be confused with i915_gem_gtt_finish_object()! 
Signed-off-by: Chris Wilson Cc: Akash Goel Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-30-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 14c624ee6dbf..0e1f5dde2e87 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2857,27 +2857,6 @@ i915_gem_object_sync(struct drm_i915_gem_object *obj, return 0; } -static void i915_gem_object_finish_gtt(struct drm_i915_gem_object *obj) -{ - u32 old_write_domain, old_read_domains; - - /* Force a pagefault for domain tracking on next user access */ - i915_gem_release_mmap(obj); - - if ((obj->base.read_domains & I915_GEM_DOMAIN_GTT) == 0) - return; - - old_read_domains = obj->base.read_domains; - old_write_domain = obj->base.write_domain; - - obj->base.read_domains &= ~I915_GEM_DOMAIN_GTT; - obj->base.write_domain &= ~I915_GEM_DOMAIN_GTT; - - trace_i915_gem_object_change_domain(obj, - old_read_domains, - old_write_domain); -} - static void __i915_vma_iounmap(struct i915_vma *vma) { GEM_BUG_ON(i915_vma_is_pinned(vma)); @@ -2933,13 +2912,14 @@ int i915_vma_unbind(struct i915_vma *vma) GEM_BUG_ON(!obj->pages); if (i915_vma_is_map_and_fenceable(vma)) { - i915_gem_object_finish_gtt(obj); - /* release the fence reg _after_ flushing */ ret = i915_vma_put_fence(vma); if (ret) return ret; + /* Force a pagefault for domain tracking on next user access */ + i915_gem_release_mmap(obj); + __i915_vma_iounmap(vma); vma->flags &= ~I915_VMA_CAN_FENCE; } From 7756e454077197df57c51cc2f7ae844ec6ce9fba Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:10 +0100 Subject: [PATCH 123/141] drm/i915/cmdparser: Make initialisation failure non-fatal If the developer adds a register in the wrong order, we BUG during boot. That makes development and testing very difficult. Let's be a bit more friendly and disable the command parser with a big warning if the tables are invalid. Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-31-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_cmd_parser.c | 27 +++++++++++++------------- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/intel_engine_cs.c | 6 ++++-- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index e586e15e172f..808d97646e70 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -746,17 +746,15 @@ static void fini_hash_table(struct intel_engine_cs *engine) * Optionally initializes fields related to batch buffer command parsing in the * struct intel_engine_cs based on whether the platform requires software * command parsing. 
- * - * Return: non-zero if initialization fails */ -int intel_engine_init_cmd_parser(struct intel_engine_cs *engine) +void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) { const struct drm_i915_cmd_table *cmd_tables; int cmd_table_count; int ret; if (!IS_GEN7(engine->i915)) - return 0; + return; switch (engine->id) { case RCS: @@ -811,24 +809,27 @@ int intel_engine_init_cmd_parser(struct intel_engine_cs *engine) break; default: MISSING_CASE(engine->id); - BUG(); + return; } - BUG_ON(!validate_cmds_sorted(engine, cmd_tables, cmd_table_count)); - BUG_ON(!validate_regs_sorted(engine)); - - WARN_ON(!hash_empty(engine->cmd_hash)); + if (!validate_cmds_sorted(engine, cmd_tables, cmd_table_count)) { + DRM_ERROR("%s: command descriptions are not sorted\n", + engine->name); + return; + } + if (!validate_regs_sorted(engine)) { + DRM_ERROR("%s: registers are not sorted\n", engine->name); + return; + } ret = init_hash_table(engine, cmd_tables, cmd_table_count); if (ret) { - DRM_ERROR("CMD: cmd_parser_init failed!\n"); + DRM_ERROR("%s: initialisation failed!\n", engine->name); fini_hash_table(engine); - return ret; + return; } engine->needs_cmd_parser = true; - - return 0; } /** diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 9386523464ea..299949d54dca 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3541,7 +3541,7 @@ const char *i915_cache_level_str(struct drm_i915_private *i915, int type); /* i915_cmd_parser.c */ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv); -int intel_engine_init_cmd_parser(struct intel_engine_cs *engine); +void intel_engine_init_cmd_parser(struct intel_engine_cs *engine); void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine); bool intel_engine_needs_cmd_parser(struct intel_engine_cs *engine); int intel_engine_cmd_parser(struct intel_engine_cs *engine, diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 8a27bb9f6bc1..2e96a86105c2 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -239,6 +239,8 @@ void intel_engine_setup_common(struct intel_engine_cs *engine) intel_engine_init_requests(engine); intel_engine_init_hangcheck(engine); i915_gem_batch_pool_init(engine, &engine->batch_pool); + + intel_engine_init_cmd_parser(engine); } int intel_engine_create_scratch(struct intel_engine_cs *engine, int size) @@ -301,7 +303,7 @@ int intel_engine_init_common(struct intel_engine_cs *engine) if (ret) return ret; - return intel_engine_init_cmd_parser(engine); + return 0; } /** @@ -315,7 +317,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine) { intel_engine_cleanup_scratch(engine); - intel_engine_cleanup_cmd_parser(engine); intel_engine_fini_breadcrumbs(engine); + intel_engine_cleanup_cmd_parser(engine); i915_gem_batch_pool_fini(&engine->batch_pool); } From 068715b922a6f87c454cdfa15bb8049d2076eee6 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:11 +0100 Subject: [PATCH 124/141] drm/i915/cmdparser: Add the TIMESTAMP register for the other engines Since I have been using the BCS_TIMESTAMP to measure latency of execution upon the blitter ring, allow regular userspace to also read from that register. They are already allowed RCS_TIMESTAMP!
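One detail worth spelling out: the TIMESTAMP entries use REG64_IDX, which whitelists both the low and the high dword of the register, since the counter is wider than 32 bits. Reading such a split counter coherently needs the usual hi/lo/hi dance. The sketch below is plain C against a simulated ticking counter, purely for illustration (not i915 code):

#include <stdint.h>
#include <stdio.h>

/* Simulated wide counter that keeps ticking between register reads. */
static uint64_t counter = 0x1ffffffffull;

static uint32_t read_reg32(int hi_dword)
{
    uint32_t v = hi_dword ? (uint32_t)(counter >> 32) : (uint32_t)counter;
    counter += 3; /* time passes during each read */
    return v;
}

static uint64_t read_timestamp(void)
{
    uint32_t hi, lo, tmp;

    do {
        hi = read_reg32(1);
        lo = read_reg32(0);
        tmp = read_reg32(1);
    } while (hi != tmp); /* retry if the high dword rolled over mid-read */

    return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
    printf("timestamp: 0x%llx\n", (unsigned long long)read_timestamp());
    return 0;
}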
Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-32-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_cmd_parser.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 808d97646e70..8ebc0ce44a76 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -458,6 +458,7 @@ static const struct drm_i915_reg_descriptor gen7_render_regs[] = { REG32(GEN7_GPGPU_DISPATCHDIMX), REG32(GEN7_GPGPU_DISPATCHDIMY), REG32(GEN7_GPGPU_DISPATCHDIMZ), + REG64_IDX(RING_TIMESTAMP, BSD_RING_BASE), REG64_IDX(GEN7_SO_NUM_PRIMS_WRITTEN, 0), REG64_IDX(GEN7_SO_NUM_PRIMS_WRITTEN, 1), REG64_IDX(GEN7_SO_NUM_PRIMS_WRITTEN, 2), @@ -473,6 +474,7 @@ static const struct drm_i915_reg_descriptor gen7_render_regs[] = { REG32(GEN7_L3SQCREG1), REG32(GEN7_L3CNTLREG2), REG32(GEN7_L3CNTLREG3), + REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE), }; static const struct drm_i915_reg_descriptor hsw_render_regs[] = { @@ -502,7 +504,10 @@ static const struct drm_i915_reg_descriptor hsw_render_regs[] = { }; static const struct drm_i915_reg_descriptor gen7_blt_regs[] = { + REG64_IDX(RING_TIMESTAMP, RENDER_RING_BASE), + REG64_IDX(RING_TIMESTAMP, BSD_RING_BASE), REG32(BCS_SWCTRL), + REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE), }; static const struct drm_i915_reg_descriptor ivb_master_regs[] = { From 0b5372727be37944239100ff05a63df9771c8484 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:12 +0100 Subject: [PATCH 125/141] drm/i915/cmdparser: Use cached vmappings The single largest factor in the overhead of parsing the commands is the setup of the virtual mapping to provide a continuous block for the batch buffer. If we keep those vmappings around (against the better judgement of mm/vmalloc.c, which we offset by handwaving and looking suggestively at the shrinker) we can dramatically improve the performance of the parser for small batches (such as media workloads). Furthermore, we can use the prepare shmem read/write functions to determine how best we need to clflush the range (rather than every page of the object). The impact of caching both src/dst vmaps is +80% on ivb and +140% on byt for the throughput on small batches. (Caching just the dst vmap and iterating over the src, doing a page by page copy is roughly 5% slower on both platforms. That may be an acceptable trade-off to eliminate one cached vmapping, and we may be able to reduce the per-page copying overhead further.) For *this* simple test case, the cmdparser is now within a factor of 2 of ideal performance. 
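The cache-line rounding mentioned above deserves a tiny worked example. If every write to the shadow batch is a whole cache line, the pre-write clflush of partial lines can be skipped, so copy_batch() pads the copy length up to the next line and the parser simply never reads beyond the real batch end. Standalone C, with a 64-byte line size assumed (the kernel queries boot_cpu_data.x86_clflush_size):

#include <stdio.h>

#define CLFLUSH_LINE 64u /* assumption; queried from the CPU in the kernel */

static unsigned int roundup_u(unsigned int x, unsigned int a)
{
    return (x + a - 1) / a * a; /* same arithmetic as the kernel's roundup() */
}

int main(void)
{
    unsigned int batch_len = 4000; /* arbitrary example length in bytes */

    printf("copy %u bytes as %u so every write is a full cache line\n",
           batch_len, roundup_u(batch_len, CLFLUSH_LINE));
    return 0;
}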
Signed-off-by: Chris Wilson Cc: Matthew Auld Reviewed-by: Matthew Auld Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-33-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_cmd_parser.c | 127 ++++++++------------- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 8 +- 2 files changed, 54 insertions(+), 81 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 8ebc0ce44a76..5d9ea163d1c8 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -937,98 +937,63 @@ find_reg_in_tables(const struct drm_i915_reg_table *tables, return NULL; } -static u32 *vmap_batch(struct drm_i915_gem_object *obj, - unsigned start, unsigned len) -{ - int i; - void *addr = NULL; - struct sg_page_iter sg_iter; - int first_page = start >> PAGE_SHIFT; - int last_page = (len + start + 4095) >> PAGE_SHIFT; - int npages = last_page - first_page; - struct page **pages; - - pages = drm_malloc_ab(npages, sizeof(*pages)); - if (pages == NULL) { - DRM_DEBUG_DRIVER("Failed to get space for pages\n"); - goto finish; - } - - i = 0; - for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) { - pages[i++] = sg_page_iter_page(&sg_iter); - if (i == npages) - break; - } - - addr = vmap(pages, i, 0, PAGE_KERNEL); - if (addr == NULL) { - DRM_DEBUG_DRIVER("Failed to vmap pages\n"); - goto finish; - } - -finish: - if (pages) - drm_free_large(pages); - return (u32*)addr; -} - -/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */ -static u32 *copy_batch(struct drm_i915_gem_object *dest_obj, +/* Returns a vmap'd pointer to dst_obj, which the caller must unmap */ +static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, struct drm_i915_gem_object *src_obj, u32 batch_start_offset, - u32 batch_len) + u32 batch_len, + bool *needs_clflush_after) { - unsigned int needs_clflush; - void *src_base, *src; - void *dst = NULL; + unsigned int src_needs_clflush; + unsigned int dst_needs_clflush; + void *src, *dst; int ret; - if (batch_len > dest_obj->base.size || - batch_len + batch_start_offset > src_obj->base.size) - return ERR_PTR(-E2BIG); - - if (WARN_ON(dest_obj->pages_pin_count == 0)) - return ERR_PTR(-ENODEV); - - ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush); - if (ret) { - DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n"); + ret = i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush); + if (ret) return ERR_PTR(ret); - } - src_base = vmap_batch(src_obj, batch_start_offset, batch_len); - if (!src_base) { - DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n"); - ret = -ENOMEM; + ret = i915_gem_obj_prepare_shmem_write(dst_obj, &dst_needs_clflush); + if (ret) { + dst = ERR_PTR(ret); goto unpin_src; } - ret = i915_gem_object_set_to_cpu_domain(dest_obj, true); - if (ret) { - DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n"); - goto unmap_src; + src = i915_gem_object_pin_map(src_obj, I915_MAP_WB); + if (IS_ERR(src)) { + dst = src; + goto unpin_dst; } - dst = vmap_batch(dest_obj, 0, batch_len); - if (!dst) { - DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n"); - ret = -ENOMEM; + dst = i915_gem_object_pin_map(dst_obj, I915_MAP_WB); + if (IS_ERR(dst)) goto unmap_src; - } - src = src_base + offset_in_page(batch_start_offset); - if (needs_clflush) + src += batch_start_offset; + if (src_needs_clflush) drm_clflush_virt_range(src, batch_len); + /* We can avoid clflushing partial cachelines before the write if we + * only ever write full cache-lines.
Since we know that both the + * source and destination are in multiples of PAGE_SIZE, we can simply + * round up to the next cacheline. We don't care about copying too much + * here as we only validate up to the end of the batch. + */ + if (dst_needs_clflush & CLFLUSH_BEFORE) + batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size); + memcpy(dst, src, batch_len); + /* dst_obj is returned with vmap pinned */ + *needs_clflush_after = dst_needs_clflush & CLFLUSH_AFTER; + unmap_src: - vunmap(src_base); + i915_gem_object_unpin_map(src_obj); +unpin_dst: + i915_gem_obj_finish_shmem_access(dst_obj); unpin_src: i915_gem_obj_finish_shmem_access(src_obj); - - return ret ? ERR_PTR(ret) : dst; + return dst; } /** @@ -1206,16 +1171,18 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, u32 batch_len, bool is_master) { - u32 *cmd, *batch_base, *batch_end; + u32 *cmd, *batch_end; struct drm_i915_cmd_descriptor default_desc = { 0 }; bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */ + bool needs_clflush_after = false; int ret = 0; - batch_base = copy_batch(shadow_batch_obj, batch_obj, - batch_start_offset, batch_len); - if (IS_ERR(batch_base)) { + cmd = copy_batch(shadow_batch_obj, batch_obj, + batch_start_offset, batch_len, + &needs_clflush_after); + if (IS_ERR(cmd)) { DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n"); - return PTR_ERR(batch_base); + return PTR_ERR(cmd); } /* @@ -1223,9 +1190,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, * large or larger and copy_batch() will write MI_NOPs to the extra * space. Parsing should be faster in some cases this way. */ - batch_end = batch_base + (batch_len / sizeof(*batch_end)); - - cmd = batch_base; + batch_end = cmd + (batch_len / sizeof(*batch_end)); while (cmd < batch_end) { const struct drm_i915_cmd_descriptor *desc; u32 length; @@ -1284,7 +1249,9 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, ret = -EINVAL; } - vunmap(batch_base); + if (ret == 0 && needs_clflush_after) + drm_clflush_virt_range(shadow_batch_obj->mapping, batch_len); + i915_gem_object_unpin_map(shadow_batch_obj); return ret; } diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 907386630e26..4192066ff60e 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1512,7 +1512,7 @@ execbuf_submit(struct i915_execbuffer_params *params, params->args_batch_start_offset; if (exec_len == 0) - exec_len = params->batch->size; + exec_len = params->batch->size - params->args_batch_start_offset; ret = params->engine->emit_bb_start(params->request, exec_start, exec_len, @@ -1738,6 +1738,12 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, ret = -EINVAL; goto err; } + if (args->batch_start_offset > params->batch->size || + args->batch_len > params->batch->size - args->batch_start_offset) { + DRM_DEBUG("Attempting to use out-of-bounds batch\n"); + ret = -EINVAL; + goto err; + } params->args_batch_start_offset = args->batch_start_offset; if (intel_engine_needs_cmd_parser(engine) && args->batch_len) { From ed13033f0287051577bc1678cde63a42fa419f3c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:13 +0100 Subject: [PATCH 126/141] drm/i915/cmdparser: Only cache the dst vmap For simplicity, we want to continue using a contiguous mapping of the command buffer, but we can reduce the number of vmappings we hold by switching over to a page-by-page copy from the user batch buffer to the shadow. 
The cost for saving one linear mapping is about 5% in trivial workloads - which is more or less the overhead in calling kmap_atomic(). Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-34-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_cmd_parser.c | 33 +++++++++++++++----------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 5d9ea163d1c8..d1858f80d64c 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -946,7 +946,8 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, { unsigned int src_needs_clflush; unsigned int dst_needs_clflush; - void *src, *dst; + void *dst, *ptr; + int offset, n; int ret; ret = i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush); @@ -959,19 +960,12 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, goto unpin_src; } - src = i915_gem_object_pin_map(src_obj, I915_MAP_WB); - if (IS_ERR(src)) { - dst = src; - goto unpin_dst; - } - dst = i915_gem_object_pin_map(dst_obj, I915_MAP_WB); if (IS_ERR(dst)) - goto unmap_src; + goto unpin_dst; - src += batch_start_offset; - if (src_needs_clflush) - drm_clflush_virt_range(src, batch_len); + ptr = dst; + offset = offset_in_page(batch_start_offset); /* We can avoid clflushing partial cachelines before the write if we * only every write full cache-lines. Since we know that both the @@ -982,13 +976,24 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, if (dst_needs_clflush & CLFLUSH_BEFORE) batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size); - memcpy(dst, src, batch_len); + for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) { + int len = min_t(int, batch_len, PAGE_SIZE - offset); + void *vaddr; + + vaddr = kmap_atomic(i915_gem_object_get_page(src_obj, n)); + if (src_needs_clflush) + drm_clflush_virt_range(vaddr + offset, len); + memcpy(ptr, vaddr + offset, len); + kunmap_atomic(vaddr); + + ptr += len; + batch_len -= len; + offset = 0; + } /* dst_obj is returned with vmap pinned */ *needs_clflush_after = dst_needs_clflush & CLFLUSH_AFTER; -unmap_src: - i915_gem_object_unpin_map(src_obj); unpin_dst: i915_gem_obj_finish_shmem_access(dst_obj); unpin_src: From d6a4ead7a38666b9acdb098012a178c4c4950eca Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:14 +0100 Subject: [PATCH 127/141] drm/i915/cmdparser: Improve hash function The existing code's hash function is very suboptimal (most 3D commands use the same bucket, degrading the hash to a long list). The code even acknowledges that the issue was known and the fix was simple: /* * If we attempt to generate a perfect hash, we should be able to look at bits * 31:29 of a command from a batch buffer and use the full mask for that * client. The existing INSTR_CLIENT_MASK/SHIFT defines can be used for this. */ Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-35-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_cmd_parser.c | 51 ++++++++++++++++---------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index d1858f80d64c..54aeab3f9614 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -86,24 +86,24 @@ * general bitmasking mechanism.
*/ -#define STD_MI_OPCODE_MASK 0xFF800000 -#define STD_3D_OPCODE_MASK 0xFFFF0000 -#define STD_2D_OPCODE_MASK 0xFFC00000 -#define STD_MFX_OPCODE_MASK 0xFFFF0000 +#define STD_MI_OPCODE_SHIFT (32 - 9) +#define STD_3D_OPCODE_SHIFT (32 - 16) +#define STD_2D_OPCODE_SHIFT (32 - 10) +#define STD_MFX_OPCODE_SHIFT (32 - 16) #define CMD(op, opm, f, lm, fl, ...) \ { \ .flags = (fl) | ((f) ? CMD_DESC_FIXED : 0), \ - .cmd = { (op), (opm) }, \ + .cmd = { (op), ~0u << (opm) }, \ .length = { (lm) }, \ __VA_ARGS__ \ } /* Convenience macros to compress the tables */ -#define SMI STD_MI_OPCODE_MASK -#define S3D STD_3D_OPCODE_MASK -#define S2D STD_2D_OPCODE_MASK -#define SMFX STD_MFX_OPCODE_MASK +#define SMI STD_MI_OPCODE_SHIFT +#define S3D STD_3D_OPCODE_SHIFT +#define S2D STD_2D_OPCODE_SHIFT +#define SMFX STD_MFX_OPCODE_SHIFT #define F true #define S CMD_DESC_SKIP #define R CMD_DESC_REJECT @@ -696,12 +696,26 @@ struct cmd_node { * non-opcode bits being set. But if we don't include those bits, some 3D * commands may hash to the same bucket due to not including opcode bits that * make the command unique. For now, we will risk hashing to the same bucket. - * - * If we attempt to generate a perfect hash, we should be able to look at bits - * 31:29 of a command from a batch buffer and use the full mask for that - * client. The existing INSTR_CLIENT_MASK/SHIFT defines can be used for this. */ -#define CMD_HASH_MASK STD_MI_OPCODE_MASK +static inline u32 cmd_header_key(u32 x) +{ + u32 shift; + + switch (x >> INSTR_CLIENT_SHIFT) { + default: + case INSTR_MI_CLIENT: + shift = STD_MI_OPCODE_SHIFT; + break; + case INSTR_RC_CLIENT: + shift = STD_3D_OPCODE_SHIFT; + break; + case INSTR_BC_CLIENT: + shift = STD_2D_OPCODE_SHIFT; + break; + } + + return x >> shift; +} static int init_hash_table(struct intel_engine_cs *engine, const struct drm_i915_cmd_table *cmd_tables, @@ -725,7 +739,7 @@ static int init_hash_table(struct intel_engine_cs *engine, desc_node->desc = desc; hash_add(engine->cmd_hash, &desc_node->node, - desc->cmd.value & CMD_HASH_MASK); + cmd_header_key(desc->cmd.value)); } } @@ -859,12 +873,9 @@ find_cmd_in_table(struct intel_engine_cs *engine, struct cmd_node *desc_node; hash_for_each_possible(engine->cmd_hash, desc_node, node, - cmd_header & CMD_HASH_MASK) { + cmd_header_key(cmd_header)) { const struct drm_i915_cmd_descriptor *desc = desc_node->desc; - u32 masked_cmd = desc->cmd.mask & cmd_header; - u32 masked_value = desc->cmd.value & desc->cmd.mask; - - if (masked_cmd == masked_value) + if (((cmd_header ^ desc->cmd.value) & desc->cmd.mask) == 0) return desc; } From efdfd91f9b234638705d79a2b2f0a6ac8a5f99e8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:15 +0100 Subject: [PATCH 128/141] drm/i915/cmdparser: Compare against the previous command descriptor On the blitter (and in test code), we see long sequences of repeated commands, e.g. XY_PIXEL_BLT, XY_SCANLINE_BLT, or XY_SRC_COPY. For these, we can skip the hashtable lookup by remembering the previous command descriptor and doing a straightforward compare of the command header. The corollary is that we need to do one extra comparison before looking up new commands. v2: Less magic mask (ok, it is still magic, but now you cannot see!)
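The fast path is a single XOR-and-mask compare against the cached descriptor before falling back to the hash table. As a standalone illustration (a minimal userspace sketch; the descriptor layout and the command values are simplified stand-ins, not the parser's tables):

#include <stdint.h>
#include <stdio.h>

struct cmd_desc {
	uint32_t value;	/* canonical opcode bits for this command */
	uint32_t mask;	/* header bits that identify the opcode */
};

/* Stand-in for the hash-table walk, i.e. the expensive path. */
static const struct cmd_desc *
slow_lookup(uint32_t header, struct cmd_desc *scratch)
{
	scratch->value = header & 0xff800000u;
	scratch->mask = 0xff800000u;
	return scratch;
}

static const struct cmd_desc *
find_desc(uint32_t header, const struct cmd_desc *prev,
	  struct cmd_desc *scratch)
{
	/* One extra compare up front: runs of identical commands
	 * (XY_PIXEL_BLT, XY_SCANLINE_BLT, ...) hit this and skip
	 * the hash lookup entirely.
	 */
	if (((header ^ prev->value) & prev->mask) == 0)
		return prev;
	return slow_lookup(header, scratch);
}

int main(void)
{
	struct cmd_desc scratch;
	const struct cmd_desc noop = { 0x00000000u, 0xff800000u };
	const struct cmd_desc *desc = &noop;
	const uint32_t batch[] = { 0x54000000u, 0x54000001u, 0x54000002u };
	unsigned int i;

	for (i = 0; i < sizeof(batch) / sizeof(batch[0]); i++)
		desc = find_desc(batch[i], desc, &scratch);
	printf("last descriptor: value=0x%08x\n", desc->value);
	return 0;
}

Runs of identical headers keep returning the cached pointer, so the hash table is only consulted when the opcode actually changes.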
Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-36-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_cmd_parser.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 54aeab3f9614..558077a25052 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -90,6 +90,7 @@ #define STD_3D_OPCODE_SHIFT (32 - 16) #define STD_2D_OPCODE_SHIFT (32 - 10) #define STD_MFX_OPCODE_SHIFT (32 - 16) +#define MIN_OPCODE_SHIFT 16 #define CMD(op, opm, f, lm, fl, ...) \ { \ @@ -350,6 +351,9 @@ static const struct drm_i915_cmd_descriptor hsw_blt_cmds[] = { CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ), }; +static const struct drm_i915_cmd_descriptor noop_desc = + CMD(MI_NOOP, SMI, F, 1, S); + #undef CMD #undef SMI #undef S3D @@ -893,11 +897,14 @@ find_cmd_in_table(struct intel_engine_cs *engine, static const struct drm_i915_cmd_descriptor* find_cmd(struct intel_engine_cs *engine, u32 cmd_header, + const struct drm_i915_cmd_descriptor *desc, struct drm_i915_cmd_descriptor *default_desc) { - const struct drm_i915_cmd_descriptor *desc; u32 mask; + if (((cmd_header ^ desc->cmd.value) & desc->cmd.mask) == 0) + return desc; + desc = find_cmd_in_table(engine, cmd_header); if (desc) return desc; @@ -906,10 +913,10 @@ find_cmd(struct intel_engine_cs *engine, if (!mask) return NULL; - BUG_ON(!default_desc); - default_desc->flags = CMD_DESC_SKIP; + default_desc->cmd.value = cmd_header; + default_desc->cmd.mask = ~0u << MIN_OPCODE_SHIFT; default_desc->length.mask = mask; - + default_desc->flags = CMD_DESC_SKIP; return default_desc; } @@ -1188,7 +1195,8 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, bool is_master) { u32 *cmd, *batch_end; - struct drm_i915_cmd_descriptor default_desc = { 0 }; + struct drm_i915_cmd_descriptor default_desc = noop_desc; + const struct drm_i915_cmd_descriptor *desc = &default_desc; bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */ bool needs_clflush_after = false; int ret = 0; @@ -1208,13 +1216,12 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, */ batch_end = cmd + (batch_len / sizeof(*batch_end)); while (cmd < batch_end) { - const struct drm_i915_cmd_descriptor *desc; u32 length; if (*cmd == MI_BATCH_BUFFER_END) break; - desc = find_cmd(engine, *cmd, &default_desc); + desc = find_cmd(engine, *cmd, desc, &default_desc); if (!desc) { DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n", *cmd); From ea884f09e59951af54976881337309e76b00ec20 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:16 +0100 Subject: [PATCH 129/141] drm/i915/cmdparser: Check for SKIP descriptors first If the command descriptor says to skip it, ignore checking for any other conflict.
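In check_cmd() this turns the validation into an early-out chain, with the cheapest disposition tested first; schematically (a sketch of the control flow, not the full validator):

	if (desc->flags & CMD_DESC_SKIP)
		return true;	/* explicitly trusted: no further checks */

	if (desc->flags & CMD_DESC_REJECT) {
		DRM_DEBUG_DRIVER("CMD: Rejected command: 0x%08X\n", *cmd);
		return false;
	}

	/* ... master-only, register and bitmask checks follow ... */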
Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-37-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_cmd_parser.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 558077a25052..690aaf51d715 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -1046,6 +1046,9 @@ static bool check_cmd(const struct intel_engine_cs *engine, const bool is_master, bool *oacontrol_set) { + if (desc->flags & CMD_DESC_SKIP) + return true; + if (desc->flags & CMD_DESC_REJECT) { DRM_DEBUG_DRIVER("CMD: Rejected command: 0x%08X\n", *cmd); return false; From 76ff480ec9633d689a14e15bc0e3d10a84e6853b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:17 +0100 Subject: [PATCH 130/141] drm/i915/cmdparser: Use binary search for faster register lookup A significant proportion of the cmdparsing time for some batches is the cost to find the register in the mmiotable. We ensure that those tables are in ascending order such that we could do a binary search if it was ever merited. It is. Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-38-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_cmd_parser.c | 42 ++++++++++++-------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 690aaf51d715..e128e3ab8452 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -921,36 +921,37 @@ find_cmd(struct intel_engine_cs *engine, } static const struct drm_i915_reg_descriptor * -find_reg(const struct drm_i915_reg_descriptor *table, - int count, u32 addr) +__find_reg(const struct drm_i915_reg_descriptor *table, int count, u32 addr) { - int i; - - for (i = 0; i < count; i++) { - if (i915_mmio_reg_offset(table[i].addr) == addr) - return &table[i]; + int start = 0, end = count; + while (start < end) { + int mid = start + (end - start) / 2; + int ret = addr - i915_mmio_reg_offset(table[mid].addr); + if (ret < 0) + end = mid; + else if (ret > 0) + start = mid + 1; + else + return &table[mid]; } - return NULL; } static const struct drm_i915_reg_descriptor * -find_reg_in_tables(const struct drm_i915_reg_table *tables, - int count, bool is_master, u32 addr) +find_reg(const struct intel_engine_cs *engine, bool is_master, u32 addr) { - int i; - const struct drm_i915_reg_table *table; - const struct drm_i915_reg_descriptor *reg; + const struct drm_i915_reg_table *table = engine->reg_tables; + int count = engine->reg_table_count; - for (i = 0; i < count; i++) { - table = &tables[i]; + do { if (!table->master || is_master) { - reg = find_reg(table->regs, table->num_regs, - addr); + const struct drm_i915_reg_descriptor *reg; + + reg = __find_reg(table->regs, table->num_regs, addr); if (reg != NULL) return reg; } - } + } while (table++, --count); return NULL; } @@ -1073,10 +1074,7 @@ static bool check_cmd(const struct intel_engine_cs *engine, offset += step) { const u32 reg_addr = cmd[offset] & desc->reg.mask; const struct drm_i915_reg_descriptor *reg = - find_reg_in_tables(engine->reg_tables, - engine->reg_table_count, - is_master, - reg_addr); + find_reg(engine, is_master, reg_addr); if (!reg) { DRM_DEBUG_DRIVER("CMD: Rejected register 0x%08X in command: 0x%08X (exec_id=%d)\n", From 
52a42cec4b7088599a9f51187c454d45c460167a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 18 Aug 2016 17:17:18 +0100 Subject: [PATCH 131/141] drm/i915/cmdparser: Accelerate copies from WC memory If we need to use clflush to prepare our batch for reads from memory, we can bypass the cache instead by using non-temporal copies. Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-39-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_cmd_parser.c | 62 ++++++++++++++++---------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index e128e3ab8452..3c72b3b103e7 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -965,8 +965,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, { unsigned int src_needs_clflush; unsigned int dst_needs_clflush; - void *dst, *ptr; - int offset, n; + void *dst, *src; int ret; ret = i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush); @@ -983,31 +982,48 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, if (IS_ERR(dst)) goto unpin_dst; - ptr = dst; - offset = offset_in_page(batch_start_offset); + src = ERR_PTR(-ENODEV); + if (src_needs_clflush && + i915_memcpy_from_wc((void *)(uintptr_t)batch_start_offset, 0, 0)) { + src = i915_gem_object_pin_map(src_obj, I915_MAP_WC); + if (!IS_ERR(src)) { + i915_memcpy_from_wc(dst, + src + batch_start_offset, + ALIGN(batch_len, 16)); + i915_gem_object_unpin_map(src_obj); + } + } + if (IS_ERR(src)) { + void *ptr; + int offset, n; - /* We can avoid clflushing partial cachelines before the write if we - * only every write full cache-lines. Since we know that both the - * source and destination are in multiples of PAGE_SIZE, we can simply - * round up to the next cacheline. We don't care about copying too much - * here as we only validate up to the end of the batch. - */ - if (dst_needs_clflush & CLFLUSH_BEFORE) - batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size); + offset = offset_in_page(batch_start_offset); - for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) { - int len = min_t(int, batch_len, PAGE_SIZE - offset); - void *vaddr; + /* We can avoid clflushing partial cachelines before the write + * if we only every write full cache-lines. Since we know that + * both the source and destination are in multiples of + * PAGE_SIZE, we can simply round up to the next cacheline. + * We don't care about copying too much here as we only + * validate up to the end of the batch. 
+ */ + if (dst_needs_clflush & CLFLUSH_BEFORE) + batch_len = roundup(batch_len, + boot_cpu_data.x86_clflush_size); - vaddr = kmap_atomic(i915_gem_object_get_page(src_obj, n)); - if (src_needs_clflush) - drm_clflush_virt_range(vaddr + offset, len); - memcpy(ptr, vaddr + offset, len); - kunmap_atomic(vaddr); + ptr = dst; + for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) { + int len = min_t(int, batch_len, PAGE_SIZE - offset); - ptr += len; - batch_len -= len; - offset = 0; + src = kmap_atomic(i915_gem_object_get_page(src_obj, n)); + if (src_needs_clflush) + drm_clflush_virt_range(src + offset, len); + memcpy(ptr, src + offset, len); + kunmap_atomic(src); + + ptr += len; + batch_len -= len; + offset = 0; + } } /* dst_obj is returned with vmap pinned */ From 0be81156b3fb4d4e8e2c94177e5222dc21c3ff10 Mon Sep 17 00:00:00 2001 From: Dave Gordon Date: Fri, 19 Aug 2016 15:23:42 +0100 Subject: [PATCH 132/141] drm/i915: Reattach comment, complete type specification In the recent patch bc3d674 drm/i915: Allow userspace to request no-error-capture upon ... the final version moved the flags and the associated #defines around so they were adjacent; unfortunately, they ended up between a comment and the thing (hw_id) to which the comment applies :( So this patch reshuffles the comment and subject back together. Also, as we're touching 'hw_id', let's change it from just 'unsigned' to a fully-specified 'unsigned int', because some code checking tools (including checkpatch) object to plain 'unsigned'. Fixes: bc3d674462e5 ("drm/i915: Allow userspace to request no-error-capture...") Signed-off-by: Dave Gordon Cc: Chris Wilson Link: http://patchwork.freedesktop.org/patch/msgid/1471616622-6919-1-git-send-email-david.s.gordon@intel.com Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson --- drivers/gpu/drm/i915/i915_drv.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 299949d54dca..016425c0b475 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -893,11 +893,12 @@ struct i915_gem_context { struct i915_ctx_hang_stats hang_stats; - /* Unique identifier for this context, used by the hw for tracking */ unsigned long flags; #define CONTEXT_NO_ZEROMAP BIT(0) #define CONTEXT_NO_ERROR_CAPTURE BIT(1) - unsigned hw_id; + + /* Unique identifier for this context, used by the hw for tracking */ + unsigned int hw_id; u32 user_handle; u32 ggtt_alignment; From 4fc788f5ee3dba89c3269dcfafcc36136a1ffd0e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 19 Aug 2016 16:54:23 +0100 Subject: [PATCH 133/141] drm/i915: Flush delayed fence releases after reset What I never hit in testing, but Mika immediately did, was a GPU hang with a pending fence release (where a tiled object has been changed by the user to be untiled, and the update has not yet been committed to the fence register). 
As the stride/tiling is 0, this causes a divide-by-zero error when trying to write the new fence parameters: [ 28.784518] drm/i915: Resetting chip after gpu hang [ 28.784551] divide error: 0000 [#1] PREEMPT SMP [ 28.784565] Modules linked in: nls_iso8859_1 nls_cp437 vfat fat mxm_wmi x86_pkg_temp_thermal snd_hda_codec_hdmi kvm irqbypass snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel snd_hda_codec serio_raw snd_hwdep snd_hda_core snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq snd_timer snd_seq_device snd soundcore mac_hid wmi efivarfs autofs4 raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq raid6_pq async_xor xor async_tx raid0 multipath linear psmouse e1000e ptp pps_core nvme nvme_core i915 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm video [ 28.784738] CPU: 0 PID: 1692 Comm: kworker/0:2 Not tainted 4.8.0-rc2+ #895 [ 28.784752] Hardware name: System manufacturer System Product Name/Z170M-PLUS, BIOS 1803 05/09/2016 [ 28.784786] Workqueue: events_long i915_hangcheck_elapsed [i915] [ 28.784814] task: ffff923c18f59d40 task.stack: ffff923c1b7e4000 [ 28.784827] RIP: 0010:[] [] fence_write+0x9f/0x3b0 [i915] [ 28.784854] RSP: 0018:ffff923c1b7e7b30 EFLAGS: 00010246 [ 28.784866] RAX: 00000000008ca000 RBX: ffff923c18540000 RCX: 0000000000000020 [ 28.784880] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000596d000 [ 28.784894] RBP: ffff923c1b7e7b68 R08: 0000000000000000 R09: 0000000000000000 [ 28.784908] R10: 0000000000000000 R11: 00000000008ca000 R12: ffff923c1ef9d600 [ 28.784921] R13: 0000000000100040 R14: 0000000000100044 R15: ffff923c18549908 [ 28.784935] FS: 0000000000000000(0000) GS:ffff923c36c00000(0000) knlGS:0000000000000000 [ 28.784951] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 28.784962] CR2: 00007f193373c893 CR3: 0000000419c78000 CR4: 00000000003406f0 [ 28.784976] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 28.784990] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 28.785004] Stack: [ 28.785009] 000000000596c03b ffff923c1b7e7b68 ffff923c18549938 0000000000000009 [ 28.785026] ffff923c18540000 ffff923c18549280 ffff923c18547ce8 ffff923c1b7e7b90 [ 28.785044] ffffffffc04761f9 ffff923c18540000 ffff923c18547d00 ffff923c18548ff8 [ 28.785062] Call Trace: [ 28.785078] [] i915_gem_restore_fences+0x39/0x50 [i915] [ 28.785102] [] i915_gem_reset+0x179/0x300 [i915] Reported-by: Mika Kuoppala Fixes: 49ef5294cda2 ("drm/i915: Move fence tracking from object to vma") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Joonas Lahtinen Reviewed-by: Mika Kuoppala Link: http://patchwork.freedesktop.org/patch/msgid/20160819155428.1670-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_fence.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem_fence.c b/drivers/gpu/drm/i915/i915_gem_fence.c index dfe0a1a5e584..8df1fa7234e8 100644 --- a/drivers/gpu/drm/i915/i915_gem_fence.c +++ b/drivers/gpu/drm/i915/i915_gem_fence.c @@ -373,12 +373,16 @@ void i915_gem_restore_fences(struct drm_device *dev) for (i = 0; i < dev_priv->num_fence_regs; i++) { struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i]; + struct i915_vma *vma = reg->vma; /* * Commit delayed tiling changes if we have an object still * attached to the fence, otherwise just clear the fence. 
*/ - fence_write(reg, reg->vma); + if (vma && !i915_gem_object_is_tiled(vma->obj)) + vma = NULL; + + fence_update(reg, vma); } } From 12ecf4b979ec79b72e7aff35b172cada85e128a5 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 19 Aug 2016 16:54:24 +0100 Subject: [PATCH 134/141] drm/i915/fbc: Don't set an illegal fence if unfenced If the frontbuffer doesn't have an associated fence, it will have a fence reg of -1. If we attempt to OR this register into the FBC control register we end up setting all control bits, oops! Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Daniel Vetter Cc: "Zanoni, Paulo R" Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160819155428.1670-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/intel_fbc.c | 48 +++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_fbc.c b/drivers/gpu/drm/i915/intel_fbc.c index 37415f96f906..faa67624e1ed 100644 --- a/drivers/gpu/drm/i915/intel_fbc.c +++ b/drivers/gpu/drm/i915/intel_fbc.c @@ -190,9 +190,13 @@ static void g4x_fbc_activate(struct drm_i915_private *dev_priv) dpfc_ctl |= DPFC_CTL_LIMIT_2X; else dpfc_ctl |= DPFC_CTL_LIMIT_1X; - dpfc_ctl |= DPFC_CTL_FENCE_EN | params->fb.fence_reg; - I915_WRITE(DPFC_FENCE_YOFF, params->crtc.fence_y_offset); + if (params->fb.fence_reg != I915_FENCE_REG_NONE) { + dpfc_ctl |= DPFC_CTL_FENCE_EN | params->fb.fence_reg; + I915_WRITE(DPFC_FENCE_YOFF, params->crtc.fence_y_offset); + } else { + I915_WRITE(DPFC_FENCE_YOFF, 0); + } /* enable it... */ I915_WRITE(DPFC_CONTROL, dpfc_ctl | DPFC_CTL_EN); @@ -244,21 +248,29 @@ static void ilk_fbc_activate(struct drm_i915_private *dev_priv) dpfc_ctl |= DPFC_CTL_LIMIT_1X; break; } - dpfc_ctl |= DPFC_CTL_FENCE_EN; - if (IS_GEN5(dev_priv)) - dpfc_ctl |= params->fb.fence_reg; + + if (params->fb.fence_reg != I915_FENCE_REG_NONE) { + dpfc_ctl |= DPFC_CTL_FENCE_EN; + if (IS_GEN5(dev_priv)) + dpfc_ctl |= params->fb.fence_reg; + if (IS_GEN6(dev_priv)) { + I915_WRITE(SNB_DPFC_CTL_SA, + SNB_CPU_FENCE_ENABLE | params->fb.fence_reg); + I915_WRITE(DPFC_CPU_FENCE_OFFSET, + params->crtc.fence_y_offset); + } + } else { + if (IS_GEN6(dev_priv)) { + I915_WRITE(SNB_DPFC_CTL_SA, 0); + I915_WRITE(DPFC_CPU_FENCE_OFFSET, 0); + } + } I915_WRITE(ILK_DPFC_FENCE_YOFF, params->crtc.fence_y_offset); I915_WRITE(ILK_FBC_RT_BASE, params->fb.ggtt_offset | ILK_FBC_RT_VALID); /* enable it...
*/ I915_WRITE(ILK_DPFC_CONTROL, dpfc_ctl | DPFC_CTL_EN); - if (IS_GEN6(dev_priv)) { - I915_WRITE(SNB_DPFC_CTL_SA, - SNB_CPU_FENCE_ENABLE | params->fb.fence_reg); - I915_WRITE(DPFC_CPU_FENCE_OFFSET, params->crtc.fence_y_offset); - } - intel_fbc_recompress(dev_priv); } @@ -305,7 +317,15 @@ static void gen7_fbc_activate(struct drm_i915_private *dev_priv) break; } - dpfc_ctl |= IVB_DPFC_CTL_FENCE_EN; + if (params->fb.fence_reg != I915_FENCE_REG_NONE) { + dpfc_ctl |= IVB_DPFC_CTL_FENCE_EN; + I915_WRITE(SNB_DPFC_CTL_SA, + SNB_CPU_FENCE_ENABLE | params->fb.fence_reg); + I915_WRITE(DPFC_CPU_FENCE_OFFSET, params->crtc.fence_y_offset); + } else { + I915_WRITE(SNB_DPFC_CTL_SA,0); + I915_WRITE(DPFC_CPU_FENCE_OFFSET, 0); + } if (dev_priv->fbc.false_color) dpfc_ctl |= FBC_CTL_FALSE_COLOR; @@ -324,10 +344,6 @@ static void gen7_fbc_activate(struct drm_i915_private *dev_priv) I915_WRITE(ILK_DPFC_CONTROL, dpfc_ctl | DPFC_CTL_EN); - I915_WRITE(SNB_DPFC_CTL_SA, - SNB_CPU_FENCE_ENABLE | params->fb.fence_reg); - I915_WRITE(DPFC_CPU_FENCE_OFFSET, params->crtc.fence_y_offset); - intel_fbc_recompress(dev_priv); } From 8678fdaf396c3aa3732b3d98ce2241633dbc26ba Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 19 Aug 2016 16:54:25 +0100 Subject: [PATCH 135/141] drm/i915/fbc: Allow on unfenced surfaces, for recent gen Only fbc1 is tied to using a fence. Later iterations of fbc are more flexible and allow operation on unfenced frontbuffers. Signed-off-by: Chris Wilson Cc: Daniel Vetter Cc: "Zanoni, Paulo R" Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160819155428.1670-3-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/intel_fbc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_fbc.c b/drivers/gpu/drm/i915/intel_fbc.c index faa67624e1ed..bf8b22ad9aed 100644 --- a/drivers/gpu/drm/i915/intel_fbc.c +++ b/drivers/gpu/drm/i915/intel_fbc.c @@ -799,8 +799,10 @@ static bool intel_fbc_can_activate(struct intel_crtc *crtc) */ if (cache->fb.tiling_mode != I915_TILING_X || cache->fb.fence_reg == I915_FENCE_REG_NONE) { - fbc->no_fbc_reason = "framebuffer not tiled or fenced"; - return false; + if (INTEL_GEN(dev_priv) < 5) { + fbc->no_fbc_reason = "framebuffer not tiled or fenced"; + return false; + } } if (INTEL_INFO(dev_priv)->gen <= 4 && !IS_G4X(dev_priv) && cache->plane.rotation != DRM_ROTATE_0) { From cafaf14a5d8f152ed3c984ecd48dee6e824446bc Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 19 Aug 2016 16:54:26 +0100 Subject: [PATCH 136/141] io-mapping: Always create a struct to hold metadata about the io-mapping Currently, we only allocate a structure to hold metadata if we need to allocate an ioremap for every access, such as on x86-32. However, it would be useful to store basic information about the io-mapping, such as its page protection, on all platforms. 
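The shape of the change is the familiar init/fini plus create/free split: once initialisation works on caller-provided storage, heap allocation becomes an optional layer on top and the struct can be embedded. A minimal userspace sketch of that pattern (names shortened for illustration; this is not the kernel header itself):

#include <stdlib.h>

struct io_map {
	unsigned long base;
	unsigned long size;
	/* ... page protection, iomem cookie, etc. ... */
};

/* Works on caller-provided storage, so the struct may be embedded. */
static struct io_map *
io_map_init(struct io_map *m, unsigned long base, unsigned long size)
{
	m->base = base;
	m->size = size;
	return m;
}

static void io_map_fini(struct io_map *m)
{
	/* release whatever io_map_init() acquired */
	(void)m;
}

/* The old allocating API becomes a thin wrapper over init/fini. */
static struct io_map *io_map_create(unsigned long base, unsigned long size)
{
	struct io_map *m = malloc(sizeof(*m));

	if (!m)
		return NULL;
	if (!io_map_init(m, base, size)) {
		free(m);
		return NULL;
	}
	return m;
}

static void io_map_free(struct io_map *m)
{
	io_map_fini(m);
	free(m);
}

int main(void)
{
	struct io_map embedded;	/* e.g. a member of a larger device struct */
	struct io_map *heap;

	io_map_init(&embedded, 0xfe000000ul, 0x1000ul);
	io_map_fini(&embedded);

	heap = io_map_create(0xfe000000ul, 0x1000ul);
	if (heap)
		io_map_free(heap);
	return 0;
}

The next patch in the series applies exactly this to i915, dropping the pointer and the extra allocation by embedding the struct in drm_i915_private.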
Signed-off-by: Chris Wilson Cc: linux-mm@kvack.org Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160819155428.1670-4-chris@chris-wilson.co.uk --- include/linux/io-mapping.h | 92 +++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 35 deletions(-) diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 645ad06b5d52..b4c4b5c4216d 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -31,16 +31,16 @@ * See Documentation/io-mapping.txt */ -#ifdef CONFIG_HAVE_ATOMIC_IOMAP - -#include - struct io_mapping { resource_size_t base; unsigned long size; pgprot_t prot; + void __iomem *iomem; }; +#ifdef CONFIG_HAVE_ATOMIC_IOMAP + +#include /* * For small address space machines, mapping large objects * into the kernel virtual space isn't practical. Where @@ -49,34 +49,25 @@ struct io_mapping { */ static inline struct io_mapping * -io_mapping_create_wc(resource_size_t base, unsigned long size) +io_mapping_init_wc(struct io_mapping *iomap, + resource_size_t base, + unsigned long size) { - struct io_mapping *iomap; pgprot_t prot; - iomap = kmalloc(sizeof(*iomap), GFP_KERNEL); - if (!iomap) - goto out_err; - if (iomap_create_wc(base, size, &prot)) - goto out_free; + return NULL; iomap->base = base; iomap->size = size; iomap->prot = prot; return iomap; - -out_free: - kfree(iomap); -out_err: - return NULL; } static inline void -io_mapping_free(struct io_mapping *mapping) +io_mapping_fini(struct io_mapping *mapping) { iomap_free(mapping->base, mapping->size); - kfree(mapping); } /* Atomic map/unmap */ @@ -121,21 +112,40 @@ io_mapping_unmap(void __iomem *vaddr) #else #include - -/* this struct isn't actually defined anywhere */ -struct io_mapping; +#include /* Create the io_mapping object*/ static inline struct io_mapping * -io_mapping_create_wc(resource_size_t base, unsigned long size) +io_mapping_init_wc(struct io_mapping *iomap, + resource_size_t base, + unsigned long size) { - return (struct io_mapping __force *) ioremap_wc(base, size); + iomap->base = base; + iomap->size = size; + iomap->iomem = ioremap_wc(base, size); + iomap->prot = pgprot_writecombine(PAGE_KERNEL_IO); + + return iomap; } static inline void -io_mapping_free(struct io_mapping *mapping) +io_mapping_fini(struct io_mapping *mapping) +{ + iounmap(mapping->iomem); +} + +/* Non-atomic map/unmap */ +static inline void __iomem * +io_mapping_map_wc(struct io_mapping *mapping, + unsigned long offset, + unsigned long size) +{ + return mapping->iomem + offset; +} + +static inline void +io_mapping_unmap(void __iomem *vaddr) { - iounmap((void __force __iomem *) mapping); } /* Atomic map/unmap */ @@ -145,30 +155,42 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, { preempt_disable(); pagefault_disable(); - return ((char __force __iomem *) mapping) + offset; + return io_mapping_map_wc(mapping, offset, PAGE_SIZE); } static inline void io_mapping_unmap_atomic(void __iomem *vaddr) { + io_mapping_unmap(vaddr); pagefault_enable(); preempt_enable(); } -/* Non-atomic map/unmap */ -static inline void __iomem * -io_mapping_map_wc(struct io_mapping *mapping, - unsigned long offset, - unsigned long size) +#endif /* HAVE_ATOMIC_IOMAP */ + +static inline struct io_mapping * +io_mapping_create_wc(resource_size_t base, + unsigned long size) { - return ((char __force __iomem *) mapping) + offset; + struct io_mapping *iomap; + + iomap = kmalloc(sizeof(*iomap), GFP_KERNEL); + if (!iomap) + return NULL; + + if (!io_mapping_init_wc(iomap, base, size)) { + kfree(iomap); + 
return NULL; + } + + return iomap; } static inline void -io_mapping_unmap(void __iomem *vaddr) +io_mapping_free(struct io_mapping *iomap) { + io_mapping_fini(iomap); + kfree(iomap); } -#endif /* HAVE_ATOMIC_IOMAP */ - #endif /* _LINUX_IO_MAPPING_H */ From f7bbe7883c3f119714fd09a8ceaac8075ba04dfe Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 19 Aug 2016 16:54:27 +0100 Subject: [PATCH 137/141] drm/i915: Embed the io-mapping struct inside drm_i915_private As io_mapping.h now always allocates the struct, we can avoid that allocation and extra pointer dance by embedding the struct inside drm_i915_private Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160819155428.1670-5-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 6 +++--- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +- drivers/gpu/drm/i915/i915_gem_gtt.c | 11 +++++------ drivers/gpu/drm/i915/i915_gem_gtt.h | 2 +- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- drivers/gpu/drm/i915/intel_overlay.c | 4 ++-- 6 files changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 0e1f5dde2e87..5398af7f7580 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -891,7 +891,7 @@ i915_gem_gtt_pread(struct drm_device *dev, * and write to user memory which may result into page * faults, and so we cannot perform this under struct_mutex. */ - if (slow_user_access(ggtt->mappable, page_base, + if (slow_user_access(&ggtt->mappable, page_base, page_offset, user_data, page_length, false)) { ret = -EFAULT; @@ -1187,11 +1187,11 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_private *i915, * If the object is non-shmem backed, we retry again with the * path that handles page fault. 
*/ - if (fast_user_write(ggtt->mappable, page_base, + if (fast_user_write(&ggtt->mappable, page_base, page_offset, user_data, page_length)) { hit_slow_path = true; mutex_unlock(&dev->struct_mutex); - if (slow_user_access(ggtt->mappable, + if (slow_user_access(&ggtt->mappable, page_base, page_offset, user_data, page_length, true)) { diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 4192066ff60e..601156c353cc 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -474,7 +474,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj, offset += page << PAGE_SHIFT; } - vaddr = io_mapping_map_atomic_wc(cache->i915->ggtt.mappable, offset); + vaddr = io_mapping_map_atomic_wc(&cache->i915->ggtt.mappable, offset); cache->page = page; cache->vaddr = (unsigned long)vaddr; diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index a18363a0d8c5..b90fdcee992a 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -2794,7 +2794,6 @@ void i915_ggtt_cleanup_hw(struct drm_i915_private *dev_priv) if (dev_priv->mm.aliasing_ppgtt) { struct i915_hw_ppgtt *ppgtt = dev_priv->mm.aliasing_ppgtt; - ppgtt->base.cleanup(&ppgtt->base); kfree(ppgtt); } @@ -2811,7 +2810,7 @@ void i915_ggtt_cleanup_hw(struct drm_i915_private *dev_priv) ggtt->base.cleanup(&ggtt->base); arch_phys_wc_del(ggtt->mtrr); - io_mapping_free(ggtt->mappable); + io_mapping_fini(&ggtt->mappable); } static unsigned int gen6_get_total_gtt_size(u16 snb_gmch_ctl) @@ -3209,9 +3208,9 @@ int i915_ggtt_init_hw(struct drm_i915_private *dev_priv) if (!HAS_LLC(dev_priv)) ggtt->base.mm.color_adjust = i915_gtt_color_adjust; - ggtt->mappable = - io_mapping_create_wc(ggtt->mappable_base, ggtt->mappable_end); - if (!ggtt->mappable) { + if (!io_mapping_init_wc(&dev_priv->ggtt.mappable, + dev_priv->ggtt.mappable_base, + dev_priv->ggtt.mappable_end)) { ret = -EIO; goto out_gtt_cleanup; } @@ -3681,7 +3680,7 @@ void __iomem *i915_vma_pin_iomap(struct i915_vma *vma) ptr = vma->iomap; if (ptr == NULL) { - ptr = io_mapping_map_wc(i915_vm_to_ggtt(vma->vm)->mappable, + ptr = io_mapping_map_wc(&i915_vm_to_ggtt(vma->vm)->mappable, vma->node.start, vma->node.size); if (ptr == NULL) diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h index a15cea73f729..a9aec25535ac 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.h +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h @@ -439,13 +439,13 @@ struct i915_address_space { */ struct i915_ggtt { struct i915_address_space base; + struct io_mapping mappable; /* Mapping to our CPU mappable region */ size_t stolen_size; /* Total size of stolen memory */ size_t stolen_usable_size; /* Total size minus BIOS reserved */ size_t stolen_reserved_base; size_t stolen_reserved_size; u64 mappable_end; /* End offset that we can CPU map */ - struct io_mapping *mappable; /* Mapping to our CPU mappable region */ phys_addr_t mappable_base; /* PA of our GMADR */ /** "Graphics Stolen Memory" holds the global PTEs */ diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 84dd5bc06db3..41ec7a183c73 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -729,7 +729,7 @@ i915_error_object_create(struct drm_i915_private *dev_priv, * captures what the GPU read. 
*/ - s = io_mapping_map_atomic_wc(ggtt->mappable, + s = io_mapping_map_atomic_wc(&ggtt->mappable, reloc_offset); memcpy_fromio(d, s, PAGE_SIZE); io_mapping_unmap_atomic(s); diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c index 3cf8d02064a8..a24bc8c7889f 100644 --- a/drivers/gpu/drm/i915/intel_overlay.c +++ b/drivers/gpu/drm/i915/intel_overlay.c @@ -196,7 +196,7 @@ intel_overlay_map_regs(struct intel_overlay *overlay) if (OVERLAY_NEEDS_PHYSICAL(dev_priv)) regs = (struct overlay_registers __iomem *)overlay->reg_bo->phys_handle->vaddr; else - regs = io_mapping_map_wc(dev_priv->ggtt.mappable, + regs = io_mapping_map_wc(&dev_priv->ggtt.mappable, overlay->flip_addr, PAGE_SIZE); @@ -1489,7 +1489,7 @@ intel_overlay_map_regs_atomic(struct intel_overlay *overlay) regs = (struct overlay_registers __iomem *) overlay->reg_bo->phys_handle->vaddr; else - regs = io_mapping_map_atomic_wc(dev_priv->ggtt.mappable, + regs = io_mapping_map_atomic_wc(&dev_priv->ggtt.mappable, overlay->flip_addr); return regs; From c58305af1835095ddc25ee6f548ac05915e66ac5 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 19 Aug 2016 16:54:28 +0100 Subject: [PATCH 138/141] drm/i915: Use remap_io_mapping() to prefault all PTE in a single pass Very old numbers indicate this is a 66% improvement when remapping the entire object for fence contention - due to the elimination of track_pfn_insert and its strcmp. Signed-off-by: Chris Wilson Testcase: igt/gem_fence_upload/performance Testcase: igt/gem_mmap_gtt Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20160819155428.1670-6-chris@chris-wilson.co.uk --- drivers/gpu/drm/Makefile | 2 +- drivers/gpu/drm/i915/Makefile | 3 +- drivers/gpu/drm/i915/i915_drv.h | 5 ++ drivers/gpu/drm/i915/i915_gem.c | 50 +++----------------- drivers/gpu/drm/i915/i915_mm.c | 84 +++++++++++++++++++++++++++++++++ 5 files changed, 99 insertions(+), 45 deletions(-) create mode 100644 drivers/gpu/drm/i915/i915_mm.c diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile index 0238bf8bc8c3..3ff094171ee5 100644 --- a/drivers/gpu/drm/Makefile +++ b/drivers/gpu/drm/Makefile @@ -46,7 +46,7 @@ obj-$(CONFIG_DRM_RADEON)+= radeon/ obj-$(CONFIG_DRM_AMDGPU)+= amd/amdgpu/ obj-$(CONFIG_DRM_MGA) += mga/ obj-$(CONFIG_DRM_I810) += i810/ -obj-$(CONFIG_DRM_I915) += i915/ +obj-$(CONFIG_DRM_I915) += i915/ obj-$(CONFIG_DRM_MGAG200) += mgag200/ obj-$(CONFIG_DRM_VC4) += vc4/ obj-$(CONFIG_DRM_CIRRUS_QEMU) += cirrus/ diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 3412413408c0..a7da24640e88 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -12,6 +12,7 @@ subdir-ccflags-y += \ i915-y := i915_drv.o \ i915_irq.o \ i915_memcpy.o \ + i915_mm.o \ i915_params.o \ i915_pci.o \ i915_suspend.o \ @@ -113,6 +114,6 @@ i915-y += intel_gvt.o include $(src)/gvt/Makefile endif -obj-$(CONFIG_DRM_I915) += i915.o +obj-$(CONFIG_DRM_I915) += i915.o CFLAGS_i915_trace_points.o := -I$(src) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 016425c0b475..9cd102cd931e 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3931,6 +3931,11 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req) void i915_memcpy_init_early(struct drm_i915_private *dev_priv); bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len); +/* i915_mm.c */ +int remap_io_mapping(struct vm_area_struct *vma, + unsigned long addr, unsigned 
long pfn, unsigned long size, + struct io_mapping *iomap); + #define ptr_mask_bits(ptr) ({ \ unsigned long __v = (unsigned long)(ptr); \ (typeof(ptr))(__v & PAGE_MASK); \ diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 5398af7f7580..04607d4115d6 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1705,7 +1705,6 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) bool write = !!(vmf->flags & FAULT_FLAG_WRITE); struct i915_vma *vma; pgoff_t page_offset; - unsigned long pfn; unsigned int flags; int ret; @@ -1790,48 +1789,13 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) goto err_unpin; /* Finally, remap it using the new GTT offset */ - pfn = ggtt->mappable_base + i915_ggtt_offset(vma); - pfn >>= PAGE_SHIFT; - - if (vma->ggtt_view.type == I915_GGTT_VIEW_NORMAL) { - if (!obj->fault_mappable) { - unsigned long size = - min_t(unsigned long, - area->vm_end - area->vm_start, - obj->base.size) >> PAGE_SHIFT; - unsigned long base = area->vm_start; - int i; - - for (i = 0; i < size; i++) { - ret = vm_insert_pfn(area, - base + i * PAGE_SIZE, - pfn + i); - if (ret) - break; - } - } else - ret = vm_insert_pfn(area, - (unsigned long)vmf->virtual_address, - pfn + page_offset); - } else { - /* Overriding existing pages in partial view does not cause - * us any trouble as TLBs are still valid because the fault - * is due to userspace losing part of the mapping or never - * having accessed it before (at this partials' range). - */ - const struct i915_ggtt_view *view = &vma->ggtt_view; - unsigned long base = area->vm_start + - (view->params.partial.offset << PAGE_SHIFT); - unsigned int i; - - for (i = 0; i < view->params.partial.size; i++) { - ret = vm_insert_pfn(area, - base + i * PAGE_SIZE, - pfn + i); - if (ret) - break; - } - } + ret = remap_io_mapping(area, + area->vm_start + (vma->ggtt_view.params.partial.offset << PAGE_SHIFT), + (ggtt->mappable_base + vma->node.start) >> PAGE_SHIFT, + min_t(u64, vma->size, area->vm_end - area->vm_start), + &ggtt->mappable); + if (ret) + goto err_unpin; obj->fault_mappable = true; err_unpin: diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c new file mode 100644 index 000000000000..e4935dd1fd37 --- /dev/null +++ b/drivers/gpu/drm/i915/i915_mm.c @@ -0,0 +1,84 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + */ + +#include +#include + +#include + +#include "i915_drv.h" + +struct remap_pfn { + struct mm_struct *mm; + unsigned long pfn; + pgprot_t prot; +}; + +static int remap_pfn(pte_t *pte, pgtable_t token, + unsigned long addr, void *data) +{ + struct remap_pfn *r = data; + + /* Special PTE are not associated with any struct page */ + set_pte_at(r->mm, addr, pte, pte_mkspecial(pfn_pte(r->pfn, r->prot))); + r->pfn++; + + return 0; +} + +/** + * remap_io_mapping - remap an IO mapping to userspace + * @vma: user vma to map to + * @addr: target user address to start at + * @pfn: physical address of kernel memory + * @size: size of map area + * @iomap: the source io_mapping + * + * Note: this is only safe if the mm semaphore is held when called. + */ +int remap_io_mapping(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, unsigned long size, + struct io_mapping *iomap) +{ + struct remap_pfn r; + int err; + + GEM_BUG_ON((vma->vm_flags & + (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP)) != + (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP)); + + /* We rely on prevalidation of the io-mapping to skip track_pfn(). */ + r.mm = vma->vm_mm; + r.pfn = pfn; + r.prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | + (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)); + + err = apply_to_page_range(r.mm, addr, size, remap_pfn, &r); + if (unlikely(err)) { + zap_vma_ptes(vma, addr, (r.pfn - pfn) << PAGE_SHIFT); + return err; + } + + return 0; +} From d5d0804f8f6d0c89913f6a2de5348adef8ec33e4 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 22 Aug 2016 08:35:48 +0200 Subject: [PATCH 139/141] drm/i915: Update DRIVER_DATE to 20160822 Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_drv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 9cd102cd931e..e6069057eb98 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -70,7 +70,7 @@ #define DRIVER_NAME "i915" #define DRIVER_DESC "Intel Graphics" -#define DRIVER_DATE "20160808" +#define DRIVER_DATE "20160822" #undef WARN_ON /* Many gcc seem to no see through this and fall over :( */ From bcaaa0c4310ebe1ab04274594916bbb2d82a49c8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 23 Aug 2016 16:50:24 +0100 Subject: [PATCH 140/141] io-mapping.h: s/PAGE_KERNEL_IO/PAGE_KERNEL/ PAGE_KERNEL_IO is an x86-ism. Though it is used to define the pgprot_t used for the iomapped region, it itself is just PAGE_KERNEL. On all other arches, PAGE_KERNEL_IO is undefined so in a general header we must refrain from using it. 
v2: include pgtable for pgprot_combine() Reported-by: Stephen Rothwell Fixes: cafaf14a5d8f ("io-mapping: Always create a struct to hold metadata about the io-mapping") Signed-off-by: Chris Wilson Cc: Daniel Vetter Cc: Joonas Lahtinen Cc: linux-mm@kvack.org Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20160823155024.22379-1-chris@chris-wilson.co.uk (cherry picked from commit ac96b5566926af83463ddcf4655856033c092f26) --- include/linux/io-mapping.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index b4c4b5c4216d..a87dd7fffc0a 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -112,7 +112,7 @@ io_mapping_unmap(void __iomem *vaddr) #else #include -#include +#include /* Create the io_mapping object*/ static inline struct io_mapping * @@ -123,7 +123,7 @@ io_mapping_init_wc(struct io_mapping *iomap, iomap->base = base; iomap->size = size; iomap->iomem = ioremap_wc(base, size); - iomap->prot = pgprot_writecombine(PAGE_KERNEL_IO); + iomap->prot = pgprot_writecombine(PAGE_KERNEL); return iomap; } From 351243897b15aba02ad15317724d616aeaf00c7d Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 23 Aug 2016 22:15:02 +0200 Subject: [PATCH 141/141] io-mapping: Fixup for different names of writecombine Somehow architectures can't agree on this. And for good measure make sure we have a fallback which should work everywhere (fingers crossed). v2: Make it compile properly, needs a defined() for the #elif. Fixes: ac96b5566926 ("io-mapping.h: s/PAGE_KERNEL_IO/PAGE_KERNEL/") Cc: Chris Wilson Cc: Daniel Vetter Cc: Joonas Lahtinen Cc: linux-mm@kvack.org Reviewed-by: Chris Wilson Reviewed-by: Joonas Lahtinen Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20160823202233.4681-1-daniel.vetter@ffwll.ch (cherry picked from commit 80c33624e4723c4e22d9917cd676067ebf652dc2) --- include/linux/io-mapping.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index a87dd7fffc0a..58df02bd93c9 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -123,7 +123,13 @@ io_mapping_init_wc(struct io_mapping *iomap, iomap->base = base; iomap->size = size; iomap->iomem = ioremap_wc(base, size); +#if defined(pgprot_noncached_wc) /* archs can't agree on a name ... */ + iomap->prot = pgprot_noncached_wc(PAGE_KERNEL); +#elif defined(pgprot_writecombine) iomap->prot = pgprot_writecombine(PAGE_KERNEL); +#else + iomap->prot = pgprot_noncached(PAGE_KERNEL); +#endif return iomap; }
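Taken together, the io-mapping changes in this series (patches 136-141) leave the API in a shape where a driver can embed the mapping and drive it directly. A schematic consumer, for orientation only (the device struct, probe function, and base/size values are hypothetical, not code from the series):

/* #include <linux/io-mapping.h> */

struct my_dev {
	struct io_mapping ggtt;	/* embedded, no separate allocation */
};

static int my_dev_probe(struct my_dev *dev,
			resource_size_t base, unsigned long size)
{
	void __iomem *vaddr;

	if (!io_mapping_init_wc(&dev->ggtt, base, size))
		return -EIO;

	/* Non-atomic map of one page at offset 0, then tear down. */
	vaddr = io_mapping_map_wc(&dev->ggtt, 0, PAGE_SIZE);
	if (vaddr) {
		/* ... MMIO accesses through vaddr ... */
		io_mapping_unmap(vaddr);
	}

	io_mapping_fini(&dev->ggtt);
	return 0;
}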