linux/drivers/gpu/drm/i915/i915_drv.h

3467 lines
104 KiB
C
Raw Normal View History

/* i915_drv.h -- Private header for the I915 driver -*- linux-c -*-
*/
/*
*
* Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef _I915_DRV_H_
#define _I915_DRV_H_
#include <uapi/drm/i915_drm.h>
#include <uapi/drm/drm_fourcc.h>
#include "i915_reg.h"
#include "intel_bios.h"
#include "intel_ringbuffer.h"
#include "intel_lrc.h"
#include "i915_gem_gtt.h"
#include "i915_gem_render_state.h"
#include <linux/io-mapping.h>
#include <linux/i2c.h>
#include <linux/i2c-algo-bit.h>
#include <drm/intel-gtt.h>
#include <drm/drm_legacy.h> /* for struct drm_dma_handle */
#include <drm/drm_gem.h>
#include <linux/backlight.h>
drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl By exporting the ability to map user address and inserting PTEs representing their backing pages into the GTT, we can exploit UMA in order to utilize normal application data as a texture source or even as a render target (depending upon the capabilities of the chipset). This has a number of uses, with zero-copy downloads to the GPU and efficient readback making the intermixed streaming of CPU and GPU operations fairly efficient. This ability has many widespread implications from faster rendering of client-side software rasterisers (chromium), mitigation of stalls due to read back (firefox) and to faster pipelining of texture data (such as pixel buffer objects in GL or data blobs in CL). v2: Compile with CONFIG_MMU_NOTIFIER v3: We can sleep while performing invalidate-range, which we can utilise to drop our page references prior to the kernel manipulating the vma (for either discard or cloning) and so protect normal users. v4: Only run the invalidate notifier if the range intercepts the bo. v5: Prevent userspace from attempting to GTT mmap non-page aligned buffers v6: Recheck after reacquire mutex for lost mmu. v7: Fix implicit padding of ioctl struct by rounding to next 64bit boundary. v8: Fix rebasing error after forwarding porting the back port. v9: Limit the userptr to page aligned entries. We now expect userspace to handle all the offset-in-page adjustments itself. v10: Prevent vma from being copied across fork to avoid issues with cow. v11: Drop vma behaviour changes -- locking is nigh on impossible. Use a worker to load user pages to avoid lock inversions. v12: Use get_task_mm()/mmput() for correct refcounting of mm. v13: Use a worker to release the mmu_notifier to avoid lock inversion v14: Decouple mmu_notifier from struct_mutex using a custom mmu_notifer with its own locking and tree of objects for each mm/mmu_notifier. v15: Prevent overlapping userptr objects, and invalidate all objects within the mmu_notifier range v16: Fix a typo for iterating over multiple objects in the range and rearrange error path to destroy the mmu_notifier locklessly. Also close a race between invalidate_range and the get_pages_worker. v17: Close a race between get_pages_worker/invalidate_range and fresh allocations of the same userptr range - and notice that struct_mutex was presumed to be held when during creation it wasn't. v18: Sigh. Fix the refactor of st_set_pages() to allocate enough memory for the struct sg_table and to clear it before reporting an error. v19: Always error out on read-only userptr requests as we don't have the hardware infrastructure to support them at the moment. v20: Refuse to implement read-only support until we have the required infrastructure - but reserve the bit in flags for future use. v21: use_mm() is not required for get_user_pages(). It is only meant to be used to fix up the kernel thread's current->mm for use with copy_user(). v22: Use sg_alloc_table_from_pages for that chunky feeling v23: Export a function for sanity checking dma-buf rather than encode userptr details elsewhere, and clean up comments based on suggestions by Bradley. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com> Cc: Akash Goel <akash.goel@intel.com> Cc: "Volkin, Bradley D" <bradley.d.volkin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Reviewed-by: Brad Volkin <bradley.d.volkin@intel.com> [danvet: Frob ioctl allocation to pick the next one - will cause a bit of fuss with create2 apparently, but such are the rules.] [danvet2: oops, forgot to git add after manual patch application] [danvet3: Appease sparse.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-16 21:22:37 +08:00
#include <linux/hashtable.h>
#include <linux/intel-iommu.h>
#include <linux/kref.h>
drm/i915: irq-drive the dp aux communication At least on the platforms that have a dp aux irq and also have it enabled - vlvhsw should have one, too. But I don't have a machine to test this on. Judging from docs there's no dp aux interrupt for gm45. Also, I only have an ivb cpu edp machine, so the dp aux A code for snb/ilk is untested. For dpcd probing when nothing is connected it slashes about 5ms of cpu time (cpu time is now negligible), which agrees with 3 * 5 400 usec timeouts. A previous version of this patch increases the time required to go through the dp_detect cycle (which includes reading the edid) from around 33 ms to around 40 ms. Experiments indicated that this is purely due to the irq latency - the hw doesn't allow us to queue up dp aux transactions and hence irq latency directly affects throughput. gmbus is much better, there we have a 8 byte buffer, and we get the irq once another 4 bytes can be queued up. But by using the pm_qos interface to request the lowest possible cpu wake-up latency this slowdown completely disappeared. Since all our output detection logic is single-threaded with the mode_config mutex right now anyway, I've decide not ot play fancy and to just reuse the gmbus wait queue. But this would definitely prep the way to run dp detection on different ports in parallel v2: Add a timeout for dp aux transfers when using interrupts - the hw _does_ prevent this with the hw-based 400 usec timeout, but if the irq somehow doesn't arrive we're screwed. Lesson learned while developing this ;-) v3: While at it also convert the busy-loop to wait_for_atomic, so that we don't run the risk of an infinite loop any more. v4: Ensure we have the smallest possible irq latency by using the pm_qos interface. v5: Add a comment to the code to explain why we frob pm_qos. Suggested by Chris Wilson. v6: Disable dp irq for vlv, that's easier than trying to get at docs and hw. v7: Squash in a fix for Haswell that Paulo Zanoni tracked down - the dp aux registers aren't at a fixed offset any more, but can be on the PCH while the DP port is on the cpu die. Reviewed-by: Imre Deak <imre.deak@intel.com> (v6) Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-12-01 20:53:48 +08:00
#include <linux/pm_qos.h>
/* General customization:
*/
#define DRIVER_NAME "i915"
#define DRIVER_DESC "Intel Graphics"
#define DRIVER_DATE "20150717"
#undef WARN_ON
drm/i915: Use BUILD_BUG if possible in the i915 WARN_ON Faster feedback to errors is always better. This is inspired by the addition to WARN_ONs to mask/enable helpers for registers to make sure callers have the arguments ordered correctly: Pretty much always the arguments are static. We use WARN_ON(1) a lot in default switch statements though where we should always handle all cases. So add a new macro specifically for that. The idea to use __builtin_constant_p is from Chris Wilson. v2: Use the ({}) gcc-ism to avoid the static inline, suggested by Dave. My first attempt used __cond as the temp var, which is the same used by BUILD_BUG_ON, but with inverted sense. Hilarity ensued, so sprinkle i915 into the name. Also use a temporary variable to only evaluate the condition once, suggested by Damien. v3: It's crazy but apparently 32bit gcc can't compile out the BUILD_BUG_ON in a lot of cases and just falls over. I have no idea why, but until clue grows just disable this nifty idea on 32bit builds. Reported by 0-day builder. v4: Got it all wrong, apparently its the gcc version. We need 4.9+. Now reported by Imre. v5: Chris suggested to add the case to MISSING_CASE for speedier debug. v6: Even some gcc 4.9 versions don't see through the maze, so give up for now. Keep the skeleton and MISSING_CASE stuff though. Cc: Imre Deak <imre.deak@intel.com> Cc: Damien Lespiau <damien.lespiau@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Dave Gordon <david.s.gordon@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
2014-12-08 23:40:10 +08:00
/* Many gcc seem to no see through this and fall over :( */
#if 0
#define WARN_ON(x) ({ \
bool __i915_warn_cond = (x); \
if (__builtin_constant_p(__i915_warn_cond)) \
BUILD_BUG_ON(__i915_warn_cond); \
WARN(__i915_warn_cond, "WARN_ON(" #x ")"); })
#else
#define WARN_ON(x) WARN((x), "WARN_ON(" #x ")")
#endif
#undef WARN_ON_ONCE
#define WARN_ON_ONCE(x) WARN_ONCE((x), "WARN_ON_ONCE(" #x ")")
drm/i915: Use BUILD_BUG if possible in the i915 WARN_ON Faster feedback to errors is always better. This is inspired by the addition to WARN_ONs to mask/enable helpers for registers to make sure callers have the arguments ordered correctly: Pretty much always the arguments are static. We use WARN_ON(1) a lot in default switch statements though where we should always handle all cases. So add a new macro specifically for that. The idea to use __builtin_constant_p is from Chris Wilson. v2: Use the ({}) gcc-ism to avoid the static inline, suggested by Dave. My first attempt used __cond as the temp var, which is the same used by BUILD_BUG_ON, but with inverted sense. Hilarity ensued, so sprinkle i915 into the name. Also use a temporary variable to only evaluate the condition once, suggested by Damien. v3: It's crazy but apparently 32bit gcc can't compile out the BUILD_BUG_ON in a lot of cases and just falls over. I have no idea why, but until clue grows just disable this nifty idea on 32bit builds. Reported by 0-day builder. v4: Got it all wrong, apparently its the gcc version. We need 4.9+. Now reported by Imre. v5: Chris suggested to add the case to MISSING_CASE for speedier debug. v6: Even some gcc 4.9 versions don't see through the maze, so give up for now. Keep the skeleton and MISSING_CASE stuff though. Cc: Imre Deak <imre.deak@intel.com> Cc: Damien Lespiau <damien.lespiau@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Dave Gordon <david.s.gordon@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
2014-12-08 23:40:10 +08:00
#define MISSING_CASE(x) WARN(1, "Missing switch case (%lu) in %s\n", \
(long) (x), __func__);
/* Use I915_STATE_WARN(x) and I915_STATE_WARN_ON() (rather than WARN() and
* WARN_ON()) for hw state sanity checks to check for unexpected conditions
* which may not necessarily be a user visible problem. This will either
* WARN() or DRM_ERROR() depending on the verbose_checks moduleparam, to
* enable distros and users to tailor their preferred amount of i915 abrt
* spam.
*/
#define I915_STATE_WARN(condition, format...) ({ \
int __ret_warn_on = !!(condition); \
if (unlikely(__ret_warn_on)) { \
if (i915.verbose_state_checks) \
WARN(1, format); \
else \
DRM_ERROR(format); \
} \
unlikely(__ret_warn_on); \
})
#define I915_STATE_WARN_ON(condition) ({ \
int __ret_warn_on = !!(condition); \
if (unlikely(__ret_warn_on)) { \
if (i915.verbose_state_checks) \
WARN(1, "WARN_ON(" #condition ")\n"); \
else \
DRM_ERROR("WARN_ON(" #condition ")\n"); \
} \
unlikely(__ret_warn_on); \
})
enum pipe {
INVALID_PIPE = -1,
PIPE_A = 0,
PIPE_B,
PIPE_C,
_PIPE_EDP,
I915_MAX_PIPES = _PIPE_EDP
};
#define pipe_name(p) ((p) + 'A')
drm/i915: add TRANSCODER_EDP Before Haswell we used to have the CPU pipes and the PCH transcoders. We had the same amount of pipes and transcoders, and there was a 1:1 mapping between them. After Haswell what we used to call CPU pipe was split into CPU pipe and CPU transcoder. So now we have 3 CPU pipes (A, B and C), 4 CPU transcoders (A, B, C and EDP) and 1 PCH transcoder (only used for VGA). For all the outputs except for EDP we have an 1:1 mapping on the CPU pipes and CPU transcoders, so if you're using CPU pipe A you have to use CPU transcoder A. When have an eDP output you have to use transcoder EDP and you can attach this CPU transcoder to any of the 3 CPU pipes. When using VGA you need to select a pair of matching CPU pipes/transcoders (A/A, B/B, C/C) and you also need to enable/use the PCH transcoder. For now we're just creating the cpu_transcoder definitions and setting cpu_transcoder to TRANSCODER_EDP on DDI eDP code, but none of the registers was ported to use transcoder instead of pipe. The goal is to keep the code backwards-compatible since on all cases except when using eDP we must have pipe == cpu_transcoder. V2: Comment the haswell_crtc_off chunk, suggested by Damien Lespiau and Daniel Vetter. We currently need the haswell_crtc_off chunk because TRANSCODER_EDP can be used by any CRTC, so when you stop using it you have to stop saying you're using it, otherwise you may have at some point 2 CRTCs claiming they're using TRANSCODER_EDP (a disabled CRTC and an enabled one), then the HW state readout code will get completely confused. In other words: Imagine the following case: xrandr --output eDP1 --auto --crtc 0 xrandr --output eDP1 --off xrandr --output eDP1 --auto --crtc 2 After the last command you could get a "pipe A assertion failure (expected off, current on)" because CRTC 0 still claims it's using TRANSCODER_EDP, so the HW state readout function will read it (through PIPECONF) and expect it to be off, when it's actually on because it's being used by CRTC 2. So when we make "intel_crtc->cpu_transcoder = intel_crtc->pipe" we make sure we're pointing to our own original CRTC which is certainly not used by any other CRTC. Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-10-25 01:59:34 +08:00
enum transcoder {
TRANSCODER_A = 0,
TRANSCODER_B,
TRANSCODER_C,
TRANSCODER_EDP,
I915_MAX_TRANSCODERS
drm/i915: add TRANSCODER_EDP Before Haswell we used to have the CPU pipes and the PCH transcoders. We had the same amount of pipes and transcoders, and there was a 1:1 mapping between them. After Haswell what we used to call CPU pipe was split into CPU pipe and CPU transcoder. So now we have 3 CPU pipes (A, B and C), 4 CPU transcoders (A, B, C and EDP) and 1 PCH transcoder (only used for VGA). For all the outputs except for EDP we have an 1:1 mapping on the CPU pipes and CPU transcoders, so if you're using CPU pipe A you have to use CPU transcoder A. When have an eDP output you have to use transcoder EDP and you can attach this CPU transcoder to any of the 3 CPU pipes. When using VGA you need to select a pair of matching CPU pipes/transcoders (A/A, B/B, C/C) and you also need to enable/use the PCH transcoder. For now we're just creating the cpu_transcoder definitions and setting cpu_transcoder to TRANSCODER_EDP on DDI eDP code, but none of the registers was ported to use transcoder instead of pipe. The goal is to keep the code backwards-compatible since on all cases except when using eDP we must have pipe == cpu_transcoder. V2: Comment the haswell_crtc_off chunk, suggested by Damien Lespiau and Daniel Vetter. We currently need the haswell_crtc_off chunk because TRANSCODER_EDP can be used by any CRTC, so when you stop using it you have to stop saying you're using it, otherwise you may have at some point 2 CRTCs claiming they're using TRANSCODER_EDP (a disabled CRTC and an enabled one), then the HW state readout code will get completely confused. In other words: Imagine the following case: xrandr --output eDP1 --auto --crtc 0 xrandr --output eDP1 --off xrandr --output eDP1 --auto --crtc 2 After the last command you could get a "pipe A assertion failure (expected off, current on)" because CRTC 0 still claims it's using TRANSCODER_EDP, so the HW state readout function will read it (through PIPECONF) and expect it to be off, when it's actually on because it's being used by CRTC 2. So when we make "intel_crtc->cpu_transcoder = intel_crtc->pipe" we make sure we're pointing to our own original CRTC which is certainly not used by any other CRTC. Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-10-25 01:59:34 +08:00
};
#define transcoder_name(t) ((t) + 'A')
/*
* This is the maximum (across all platforms) number of planes (primary +
* sprites) that can be active at the same time on one pipe.
*
* This value doesn't count the cursor plane.
*/
#define I915_MAX_PLANES 4
enum plane {
PLANE_A = 0,
PLANE_B,
PLANE_C,
};
#define plane_name(p) ((p) + 'A')
#define sprite_name(p, s) ((p) * INTEL_INFO(dev)->num_sprites[(p)] + (s) + 'A')
enum port {
PORT_A = 0,
PORT_B,
PORT_C,
PORT_D,
PORT_E,
I915_MAX_PORTS
};
#define port_name(p) ((p) + 'A')
#define I915_NUM_PHYS_VLV 2
enum dpio_channel {
DPIO_CH0,
DPIO_CH1
};
enum dpio_phy {
DPIO_PHY0,
DPIO_PHY1
};
enum intel_display_power_domain {
POWER_DOMAIN_PIPE_A,
POWER_DOMAIN_PIPE_B,
POWER_DOMAIN_PIPE_C,
POWER_DOMAIN_PIPE_A_PANEL_FITTER,
POWER_DOMAIN_PIPE_B_PANEL_FITTER,
POWER_DOMAIN_PIPE_C_PANEL_FITTER,
POWER_DOMAIN_TRANSCODER_A,
POWER_DOMAIN_TRANSCODER_B,
POWER_DOMAIN_TRANSCODER_C,
POWER_DOMAIN_TRANSCODER_EDP,
POWER_DOMAIN_PORT_DDI_A_2_LANES,
POWER_DOMAIN_PORT_DDI_A_4_LANES,
POWER_DOMAIN_PORT_DDI_B_2_LANES,
POWER_DOMAIN_PORT_DDI_B_4_LANES,
POWER_DOMAIN_PORT_DDI_C_2_LANES,
POWER_DOMAIN_PORT_DDI_C_4_LANES,
POWER_DOMAIN_PORT_DDI_D_2_LANES,
POWER_DOMAIN_PORT_DDI_D_4_LANES,
POWER_DOMAIN_PORT_DSI,
POWER_DOMAIN_PORT_CRT,
POWER_DOMAIN_PORT_OTHER,
POWER_DOMAIN_VGA,
POWER_DOMAIN_AUDIO,
POWER_DOMAIN_PLLS,
POWER_DOMAIN_AUX_A,
POWER_DOMAIN_AUX_B,
POWER_DOMAIN_AUX_C,
POWER_DOMAIN_AUX_D,
drm/i915: use power get/put instead of set for power on after init Currently we make sure that all power domains are enabled during driver init and turn off unneded ones only after the first modeset. Similarly during suspend we enable all power domains, which will remain on through the following resume until the first modeset. This logic is supported by intel_set_power_well() in the power domain framework. It would be nice to simplify the API, so that we only have get/put functions and make it more explicit on the higher level how this "power well on during init" logic works. This will make it also easier if in the future we want to shorten the time the power wells are on. For this add a new device private flag tracking whether we have the power wells on because of init/suspend and use only intel_display_power_get()/put(). As nothing else uses intel_set_power_well() we can remove it. This also fixes commit 6efdf354ddb186c6604d1692075421e8d2c740e9 Author: Imre Deak <imre.deak@intel.com> Date: Wed Oct 16 17:25:52 2013 +0300 drm/i915: enable only the needed power domains during modeset where removing intel_set_power_well() resulted in not releasing the reference on the power well that was taken during init and thus leaving the power well on all the time. Regression reported by Paulo. v2: - move the init_power_on flag to the power_domains struct (Daniel) v3: - add note about this being a regression fix too (Paulo) Signed-off-by: Imre Deak <imre.deak@intel.com> Reviewed-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-10-25 22:36:48 +08:00
POWER_DOMAIN_INIT,
POWER_DOMAIN_NUM,
};
#define POWER_DOMAIN_PIPE(pipe) ((pipe) + POWER_DOMAIN_PIPE_A)
#define POWER_DOMAIN_PIPE_PANEL_FITTER(pipe) \
((pipe) + POWER_DOMAIN_PIPE_A_PANEL_FITTER)
#define POWER_DOMAIN_TRANSCODER(tran) \
((tran) == TRANSCODER_EDP ? POWER_DOMAIN_TRANSCODER_EDP : \
(tran) + POWER_DOMAIN_TRANSCODER_A)
enum hpd_pin {
HPD_NONE = 0,
HPD_TV = HPD_NONE, /* TV is known to be unreliable */
HPD_CRT,
HPD_SDVO_B,
HPD_SDVO_C,
HPD_PORT_A,
HPD_PORT_B,
HPD_PORT_C,
HPD_PORT_D,
HPD_NUM_PINS
};
#define for_each_hpd_pin(__pin) \
for ((__pin) = (HPD_NONE + 1); (__pin) < HPD_NUM_PINS; (__pin)++)
struct i915_hotplug {
struct work_struct hotplug_work;
struct {
unsigned long last_jiffies;
int count;
enum {
HPD_ENABLED = 0,
HPD_DISABLED = 1,
HPD_MARK_DISABLED = 2
} state;
} stats[HPD_NUM_PINS];
u32 event_bits;
struct delayed_work reenable_work;
struct intel_digital_port *irq_port[I915_MAX_PORTS];
u32 long_port_mask;
u32 short_port_mask;
struct work_struct dig_port_work;
/*
* if we get a HPD irq from DP and a HPD irq from non-DP
* the non-DP HPD could block the workqueue on a mode config
* mutex getting, that userspace may have taken. However
* userspace is waiting on the DP workqueue to run which is
* blocked behind the non-DP one.
*/
struct workqueue_struct *dp_wq;
};
#define I915_GEM_GPU_DOMAINS \
(I915_GEM_DOMAIN_RENDER | \
I915_GEM_DOMAIN_SAMPLER | \
I915_GEM_DOMAIN_COMMAND | \
I915_GEM_DOMAIN_INSTRUCTION | \
I915_GEM_DOMAIN_VERTEX)
#define for_each_pipe(__dev_priv, __p) \
for ((__p) = 0; (__p) < INTEL_INFO(__dev_priv)->num_pipes; (__p)++)
#define for_each_plane(__dev_priv, __pipe, __p) \
for ((__p) = 0; \
(__p) < INTEL_INFO(__dev_priv)->num_sprites[(__pipe)] + 1; \
(__p)++)
#define for_each_sprite(__dev_priv, __p, __s) \
for ((__s) = 0; \
(__s) < INTEL_INFO(__dev_priv)->num_sprites[(__p)]; \
(__s)++)
#define for_each_crtc(dev, crtc) \
list_for_each_entry(crtc, &dev->mode_config.crtc_list, head)
#define for_each_intel_plane(dev, intel_plane) \
list_for_each_entry(intel_plane, \
&dev->mode_config.plane_list, \
base.head)
#define for_each_intel_plane_on_crtc(dev, intel_crtc, intel_plane) \
list_for_each_entry(intel_plane, \
&(dev)->mode_config.plane_list, \
base.head) \
if ((intel_plane)->pipe == (intel_crtc)->pipe)
#define for_each_intel_crtc(dev, intel_crtc) \
list_for_each_entry(intel_crtc, &dev->mode_config.crtc_list, base.head)
#define for_each_intel_encoder(dev, intel_encoder) \
list_for_each_entry(intel_encoder, \
&(dev)->mode_config.encoder_list, \
base.head)
#define for_each_intel_connector(dev, intel_connector) \
list_for_each_entry(intel_connector, \
&dev->mode_config.connector_list, \
base.head)
#define for_each_encoder_on_crtc(dev, __crtc, intel_encoder) \
list_for_each_entry((intel_encoder), &(dev)->mode_config.encoder_list, base.head) \
if ((intel_encoder)->base.crtc == (__crtc))
#define for_each_connector_on_encoder(dev, __encoder, intel_connector) \
list_for_each_entry((intel_connector), &(dev)->mode_config.connector_list, base.head) \
if ((intel_connector)->base.encoder == (__encoder))
#define for_each_power_domain(domain, mask) \
for ((domain) = 0; (domain) < POWER_DOMAIN_NUM; (domain)++) \
if ((1 << (domain)) & (mask))
struct drm_i915_private;
drm/i915: Prevent recursive deadlock on releasing a busy userptr During release of the GEM object we hold the struct_mutex. As the object may be holding onto the last reference for the task->mm, calling mmput() may trigger exit_mmap() which close the vma which will call drm_gem_vm_close() and attempt to reacquire the struct_mutex. In order to avoid that recursion, we have to defer the mmput() until after we drop the struct_mutex, i.e. we need to schedule a worker to do the clean up. A further issue spotted by Tvrtko was caused when we took a GTT mmapping of a userptr buffer object. In that case, we would never call mmput as the object would be cyclically referenced by the GTT mmapping and not freed upon process exit - keeping the entire process mm alive after the process task was reaped. The fix employed is to replace the mm_users/mmput() reference handling to mm_count/mmdrop() for the shared i915_mm_struct. INFO: task test_surfaces:1632 blocked for more than 120 seconds.       Tainted: GF          O 3.14.5+ #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. test_surfaces   D 0000000000000000     0  1632   1590 0x00000082  ffff88014914baa8 0000000000000046 0000000000000000 ffff88014914a010  0000000000012c40 0000000000012c40 ffff8800a0058210 ffff88014784b010  ffff88014914a010 ffff880037b1c820 ffff8800a0058210 ffff880037b1c824 Call Trace:  [<ffffffff81582499>] schedule+0x29/0x70  [<ffffffff815825fe>] schedule_preempt_disabled+0xe/0x10  [<ffffffff81583b93>] __mutex_lock_slowpath+0x183/0x220  [<ffffffff81583c53>] mutex_lock+0x23/0x40  [<ffffffffa005c2a3>] drm_gem_vm_close+0x33/0x70 [drm]  [<ffffffff8115a483>] remove_vma+0x33/0x70  [<ffffffff8115a5dc>] exit_mmap+0x11c/0x170  [<ffffffff8104d6eb>] mmput+0x6b/0x100  [<ffffffffa00f44b9>] i915_gem_userptr_release+0x89/0xc0 [i915]  [<ffffffffa00e6706>] i915_gem_free_object+0x126/0x250 [i915]  [<ffffffffa005c06a>] drm_gem_object_free+0x2a/0x40 [drm]  [<ffffffffa005cc32>] drm_gem_object_handle_unreference_unlocked+0xe2/0x120 [drm]  [<ffffffffa005ccd4>] drm_gem_object_release_handle+0x64/0x90 [drm]  [<ffffffff8127ffeb>] idr_for_each+0xab/0x100  [<ffffffffa005cc70>] ? drm_gem_object_handle_unreference_unlocked+0x120/0x120 [drm]  [<ffffffff81583c46>] ? mutex_lock+0x16/0x40  [<ffffffffa005c354>] drm_gem_release+0x24/0x40 [drm]  [<ffffffffa005b82b>] drm_release+0x3fb/0x480 [drm]  [<ffffffff8118d482>] __fput+0xb2/0x260  [<ffffffff8118d6de>] ____fput+0xe/0x10  [<ffffffff8106f27f>] task_work_run+0x8f/0xf0  [<ffffffff81052228>] do_exit+0x1a8/0x480  [<ffffffff81052551>] do_group_exit+0x51/0xc0  [<ffffffff810525d7>] SyS_exit_group+0x17/0x20  [<ffffffff8158e092>] system_call_fastpath+0x16/0x1b v2: Incorporate feedback from Tvrtko and remove the unnessary mm referencing when creating the i915_mm_struct and improve some of the function names and comments. Reported-by: Jacek Danecki <jacek.danecki@intel.com> Test-case: igt/gem_userptr_blits/process-exit* Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Tested-by: "Gong, Zhipeng" <zhipeng.gong@intel.com> Cc: Jacek Danecki <jacek.danecki@intel.com> Cc: "Ursulin, Tvrtko" <tvrtko.ursulin@intel.com> Reviewed-by: "Ursulin, Tvrtko" <tvrtko.ursulin@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: stable@vger.kernel.org # hold off until 3.17 ships for additional testing Signed-off-by: Jani Nikula <jani.nikula@intel.com>
2014-08-07 21:20:40 +08:00
struct i915_mm_struct;
drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl By exporting the ability to map user address and inserting PTEs representing their backing pages into the GTT, we can exploit UMA in order to utilize normal application data as a texture source or even as a render target (depending upon the capabilities of the chipset). This has a number of uses, with zero-copy downloads to the GPU and efficient readback making the intermixed streaming of CPU and GPU operations fairly efficient. This ability has many widespread implications from faster rendering of client-side software rasterisers (chromium), mitigation of stalls due to read back (firefox) and to faster pipelining of texture data (such as pixel buffer objects in GL or data blobs in CL). v2: Compile with CONFIG_MMU_NOTIFIER v3: We can sleep while performing invalidate-range, which we can utilise to drop our page references prior to the kernel manipulating the vma (for either discard or cloning) and so protect normal users. v4: Only run the invalidate notifier if the range intercepts the bo. v5: Prevent userspace from attempting to GTT mmap non-page aligned buffers v6: Recheck after reacquire mutex for lost mmu. v7: Fix implicit padding of ioctl struct by rounding to next 64bit boundary. v8: Fix rebasing error after forwarding porting the back port. v9: Limit the userptr to page aligned entries. We now expect userspace to handle all the offset-in-page adjustments itself. v10: Prevent vma from being copied across fork to avoid issues with cow. v11: Drop vma behaviour changes -- locking is nigh on impossible. Use a worker to load user pages to avoid lock inversions. v12: Use get_task_mm()/mmput() for correct refcounting of mm. v13: Use a worker to release the mmu_notifier to avoid lock inversion v14: Decouple mmu_notifier from struct_mutex using a custom mmu_notifer with its own locking and tree of objects for each mm/mmu_notifier. v15: Prevent overlapping userptr objects, and invalidate all objects within the mmu_notifier range v16: Fix a typo for iterating over multiple objects in the range and rearrange error path to destroy the mmu_notifier locklessly. Also close a race between invalidate_range and the get_pages_worker. v17: Close a race between get_pages_worker/invalidate_range and fresh allocations of the same userptr range - and notice that struct_mutex was presumed to be held when during creation it wasn't. v18: Sigh. Fix the refactor of st_set_pages() to allocate enough memory for the struct sg_table and to clear it before reporting an error. v19: Always error out on read-only userptr requests as we don't have the hardware infrastructure to support them at the moment. v20: Refuse to implement read-only support until we have the required infrastructure - but reserve the bit in flags for future use. v21: use_mm() is not required for get_user_pages(). It is only meant to be used to fix up the kernel thread's current->mm for use with copy_user(). v22: Use sg_alloc_table_from_pages for that chunky feeling v23: Export a function for sanity checking dma-buf rather than encode userptr details elsewhere, and clean up comments based on suggestions by Bradley. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com> Cc: Akash Goel <akash.goel@intel.com> Cc: "Volkin, Bradley D" <bradley.d.volkin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Reviewed-by: Brad Volkin <bradley.d.volkin@intel.com> [danvet: Frob ioctl allocation to pick the next one - will cause a bit of fuss with create2 apparently, but such are the rules.] [danvet2: oops, forgot to git add after manual patch application] [danvet3: Appease sparse.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-16 21:22:37 +08:00
struct i915_mmu_object;
struct drm_i915_file_private {
struct drm_i915_private *dev_priv;
struct drm_file *file;
struct {
spinlock_t lock;
struct list_head request_list;
/* 20ms is a fairly arbitrary limit (greater than the average frame time)
* chosen to prevent the CPU getting more than a frame ahead of the GPU
* (when using lax throttling for the frontbuffer). We also use it to
* offer free GPU waitboosts for severely congested workloads.
*/
#define DRM_I915_THROTTLE_JIFFIES msecs_to_jiffies(20)
} mm;
struct idr context_idr;
struct intel_rps_client {
struct list_head link;
unsigned boosts;
} rps;
struct intel_engine_cs *bsd_ring;
};
enum intel_dpll_id {
DPLL_ID_PRIVATE = -1, /* non-shared dpll in use */
/* real shared dpll ids must be >= 0 */
DPLL_ID_PCH_PLL_A = 0,
DPLL_ID_PCH_PLL_B = 1,
/* hsw/bdw */
DPLL_ID_WRPLL1 = 0,
DPLL_ID_WRPLL2 = 1,
/* skl */
DPLL_ID_SKL_DPLL1 = 0,
DPLL_ID_SKL_DPLL2 = 1,
DPLL_ID_SKL_DPLL3 = 2,
};
#define I915_NUM_PLLS 3
struct intel_dpll_hw_state {
/* i9xx, pch plls */
uint32_t dpll;
uint32_t dpll_md;
uint32_t fp0;
uint32_t fp1;
/* hsw, bdw */
uint32_t wrpll;
/* skl */
/*
* DPLL_CTRL1 has 6 bits for each each this DPLL. We store those in
* lower part of ctrl1 and they get shifted into position when writing
* the register. This allows us to easily compare the state to share
* the DPLL.
*/
uint32_t ctrl1;
/* HDMI only, 0 when used for DP */
uint32_t cfgcr1, cfgcr2;
/* bxt */
uint32_t ebb0, ebb4, pll0, pll1, pll2, pll3, pll6, pll8, pll9, pll10,
pcsdw12;
};
struct intel_shared_dpll_config {
unsigned crtc_mask; /* mask of CRTCs sharing this PLL */
struct intel_dpll_hw_state hw_state;
};
struct intel_shared_dpll {
struct intel_shared_dpll_config config;
int active; /* count of number of active CRTCs (i.e. DPMS on) */
bool on; /* is the PLL actually active? Disabled during modeset */
const char *name;
/* should match the index in the dev_priv->shared_dplls array */
enum intel_dpll_id id;
/* The mode_set hook is optional and should be used together with the
* intel_prepare_shared_dpll function. */
void (*mode_set)(struct drm_i915_private *dev_priv,
struct intel_shared_dpll *pll);
void (*enable)(struct drm_i915_private *dev_priv,
struct intel_shared_dpll *pll);
void (*disable)(struct drm_i915_private *dev_priv,
struct intel_shared_dpll *pll);
bool (*get_hw_state)(struct drm_i915_private *dev_priv,
struct intel_shared_dpll *pll,
struct intel_dpll_hw_state *hw_state);
};
#define SKL_DPLL0 0
#define SKL_DPLL1 1
#define SKL_DPLL2 2
#define SKL_DPLL3 3
/* Used by dp and fdi links */
struct intel_link_m_n {
uint32_t tu;
uint32_t gmch_m;
uint32_t gmch_n;
uint32_t link_m;
uint32_t link_n;
};
void intel_link_compute_m_n(int bpp, int nlanes,
int pixel_clock, int link_clock,
struct intel_link_m_n *m_n);
/* Interface history:
*
* 1.1: Original.
* 1.2: Add Power Management
* 1.3: Add vblank support
* 1.4: Fix cmdbuffer path, add heap destroy
* 1.5: Add vblank pipe configuration
* 1.6: - New ioctl for scheduling buffer swaps on vertical blank
* - Support vertical blank on secondary display pipe
*/
#define DRIVER_MAJOR 1
#define DRIVER_MINOR 6
#define DRIVER_PATCHLEVEL 0
#define WATCH_LISTS 0
struct opregion_header;
struct opregion_acpi;
struct opregion_swsci;
struct opregion_asle;
struct intel_opregion {
struct opregion_header __iomem *header;
struct opregion_acpi __iomem *acpi;
struct opregion_swsci __iomem *swsci;
u32 swsci_gbda_sub_functions;
u32 swsci_sbcb_sub_functions;
struct opregion_asle __iomem *asle;
void __iomem *vbt;
u32 __iomem *lid_state;
struct work_struct asle_work;
};
#define OPREGION_SIZE (8*1024)
struct intel_overlay;
struct intel_overlay_error_state;
#define I915_FENCE_REG_NONE -1
#define I915_MAX_NUM_FENCES 32
/* 32 fences + sign bit for FENCE_REG_NONE */
#define I915_MAX_NUM_FENCE_BITS 6
struct drm_i915_fence_reg {
struct list_head lru_list;
struct drm_i915_gem_object *obj;
int pin_count;
};
struct sdvo_device_mapping {
u8 initialized;
u8 dvo_port;
u8 slave_addr;
u8 dvo_wiring;
u8 i2c_pin;
u8 ddc_pin;
};
struct intel_display_error_state;
struct drm_i915_error_state {
struct kref ref;
struct timeval time;
char error_msg[128];
u32 reset_count;
u32 suspend_count;
/* Generic register state */
u32 eir;
u32 pgtbl_er;
u32 ier;
u32 gtier[4];
u32 ccid;
u32 derrmr;
u32 forcewake;
u32 error; /* gen6+ */
u32 err_int; /* gen7 */
u32 fault_data0; /* gen8, gen9 */
u32 fault_data1; /* gen8, gen9 */
u32 done_reg;
u32 gac_eco;
u32 gam_ecochk;
u32 gab_ctl;
u32 gfx_mode;
u32 extra_instdone[I915_NUM_INSTDONE_REG];
u64 fence[I915_MAX_NUM_FENCES];
struct intel_overlay_error_state *overlay;
struct intel_display_error_state *display;
struct drm_i915_error_object *semaphore_obj;
struct drm_i915_error_ring {
bool valid;
/* Software tracked state */
bool waiting;
int hangcheck_score;
enum intel_ring_hangcheck_action hangcheck_action;
int num_requests;
/* our own tracking of ring head and tail */
u32 cpu_ring_head;
u32 cpu_ring_tail;
u32 semaphore_seqno[I915_NUM_RINGS - 1];
/* Register state */
u32 start;
u32 tail;
u32 head;
u32 ctl;
u32 hws;
u32 ipeir;
u32 ipehr;
u32 instdone;
u32 bbstate;
u32 instpm;
u32 instps;
u32 seqno;
u64 bbaddr;
u64 acthd;
u32 fault_reg;
u64 faddr;
u32 rc_psmi; /* sleep state */
u32 semaphore_mboxes[I915_NUM_RINGS - 1];
struct drm_i915_error_object {
int page_count;
u32 gtt_offset;
u32 *pages[0];
} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
struct drm_i915_error_request {
long jiffies;
u32 seqno;
u32 tail;
} *requests;
struct {
u32 gfx_mode;
union {
u64 pdp[4];
u32 pp_dir_base;
};
} vm_info;
pid_t pid;
char comm[TASK_COMM_LEN];
} ring[I915_NUM_RINGS];
struct drm_i915_error_buffer {
u32 size;
u32 name;
drm/i915: Implement inter-engine read-read optimisations Currently, we only track the last request globally across all engines. This prevents us from issuing concurrent read requests on e.g. the RCS and BCS engines (or more likely the render and media engines). Without semaphores, we incur costly stalls as we synchronise between rings - greatly impacting the current performance of Broadwell versus Haswell in certain workloads (like video decode). With the introduction of reference counted requests, it is much easier to track the last request per ring, as well as the last global write request so that we can optimise inter-engine read read requests (as well as better optimise certain CPU waits). v2: Fix inverted readonly condition for nonblocking waits. v3: Handle non-continguous engine array after waits v4: Rebase, tidy, rewrite ring list debugging v5: Use obj->active as a bitfield, it looks cool v6: Micro-optimise, mostly involving moving code around v7: Fix retire-requests-upto for execlists (and multiple rq->ringbuf) v8: Rebase v9: Refactor i915_gem_object_sync() to allow the compiler to better optimise it. Benchmark: igt/gem_read_read_speed hsw:gt3e (with semaphores): Before: Time to read-read 1024k: 275.794µs After: Time to read-read 1024k: 123.260µs hsw:gt3e (w/o semaphores): Before: Time to read-read 1024k: 230.433µs After: Time to read-read 1024k: 124.593µs bdw-u (w/o semaphores): Before After Time to read-read 1x1: 26.274µs 10.350µs Time to read-read 128x128: 40.097µs 21.366µs Time to read-read 256x256: 77.087µs 42.608µs Time to read-read 512x512: 281.999µs 181.155µs Time to read-read 1024x1024: 1196.141µs 1118.223µs Time to read-read 2048x2048: 5639.072µs 5225.837µs Time to read-read 4096x4096: 22401.662µs 21137.067µs Time to read-read 8192x8192: 89617.735µs 85637.681µs Testcase: igt/gem_concurrent_blit (read-read and friends) Cc: Lionel Landwerlin <lionel.g.landwerlin@linux.intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> [v8] [danvet: s/\<rq\>/req/g] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-04-27 20:41:17 +08:00
u32 rseqno[I915_NUM_RINGS], wseqno;
u32 gtt_offset;
u32 read_domains;
u32 write_domain;
s32 fence_reg:I915_MAX_NUM_FENCE_BITS;
s32 pinned:2;
u32 tiling:2;
u32 dirty:1;
u32 purgeable:1;
drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl By exporting the ability to map user address and inserting PTEs representing their backing pages into the GTT, we can exploit UMA in order to utilize normal application data as a texture source or even as a render target (depending upon the capabilities of the chipset). This has a number of uses, with zero-copy downloads to the GPU and efficient readback making the intermixed streaming of CPU and GPU operations fairly efficient. This ability has many widespread implications from faster rendering of client-side software rasterisers (chromium), mitigation of stalls due to read back (firefox) and to faster pipelining of texture data (such as pixel buffer objects in GL or data blobs in CL). v2: Compile with CONFIG_MMU_NOTIFIER v3: We can sleep while performing invalidate-range, which we can utilise to drop our page references prior to the kernel manipulating the vma (for either discard or cloning) and so protect normal users. v4: Only run the invalidate notifier if the range intercepts the bo. v5: Prevent userspace from attempting to GTT mmap non-page aligned buffers v6: Recheck after reacquire mutex for lost mmu. v7: Fix implicit padding of ioctl struct by rounding to next 64bit boundary. v8: Fix rebasing error after forwarding porting the back port. v9: Limit the userptr to page aligned entries. We now expect userspace to handle all the offset-in-page adjustments itself. v10: Prevent vma from being copied across fork to avoid issues with cow. v11: Drop vma behaviour changes -- locking is nigh on impossible. Use a worker to load user pages to avoid lock inversions. v12: Use get_task_mm()/mmput() for correct refcounting of mm. v13: Use a worker to release the mmu_notifier to avoid lock inversion v14: Decouple mmu_notifier from struct_mutex using a custom mmu_notifer with its own locking and tree of objects for each mm/mmu_notifier. v15: Prevent overlapping userptr objects, and invalidate all objects within the mmu_notifier range v16: Fix a typo for iterating over multiple objects in the range and rearrange error path to destroy the mmu_notifier locklessly. Also close a race between invalidate_range and the get_pages_worker. v17: Close a race between get_pages_worker/invalidate_range and fresh allocations of the same userptr range - and notice that struct_mutex was presumed to be held when during creation it wasn't. v18: Sigh. Fix the refactor of st_set_pages() to allocate enough memory for the struct sg_table and to clear it before reporting an error. v19: Always error out on read-only userptr requests as we don't have the hardware infrastructure to support them at the moment. v20: Refuse to implement read-only support until we have the required infrastructure - but reserve the bit in flags for future use. v21: use_mm() is not required for get_user_pages(). It is only meant to be used to fix up the kernel thread's current->mm for use with copy_user(). v22: Use sg_alloc_table_from_pages for that chunky feeling v23: Export a function for sanity checking dma-buf rather than encode userptr details elsewhere, and clean up comments based on suggestions by Bradley. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com> Cc: Akash Goel <akash.goel@intel.com> Cc: "Volkin, Bradley D" <bradley.d.volkin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Reviewed-by: Brad Volkin <bradley.d.volkin@intel.com> [danvet: Frob ioctl allocation to pick the next one - will cause a bit of fuss with create2 apparently, but such are the rules.] [danvet2: oops, forgot to git add after manual patch application] [danvet3: Appease sparse.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-16 21:22:37 +08:00
u32 userptr:1;
s32 ring:4;
u32 cache_level:3;
} **active_bo, **pinned_bo;
u32 *active_bo_count, *pinned_bo_count;
u32 vm_count;
};
struct intel_connector;
struct intel_encoder;
struct intel_crtc_state;
struct intel_initial_plane_config;
struct intel_crtc;
drm/i915: move find_pll callback to dev_priv->display Now that the DP madness is cleared out, this is all only per-platform. So move it out from the intel clock limits structure. While at it drop the intel prefix on the static functions, call the vtable entry find_dpll (since it's for the display pll) and rip out the now unnecessary forward declarations. Note that the parameters of ->find_dpll are still unchanged, but they eventually need to be moved over to just take in a pipe configuration. But currently a lot of things are still missing from the pipe configuration (reflock, output-specific dpll limits and preferences, downclocked dotclock). So this will happen in a later step. Note that intel_g4x_limit has a peculiar case where it selects intel_limits_i9xx_sdvo as the limit. This is pretty bogus and also not used since the only output types left are DP and native TV-out which both use special pre-tuned dpll values. v2: Re-add comment for the find_pll callback (requested by Paulo) and elaborate on why the transformation is correct for g4x platforms (to clarify a review question from Paulo). Double up on that by adding a WARN as suggested by Paulo Zanoni on irc. v3: Initialize limits to NULL since gcc is now unhappy. v4: v2/3 will blow up with a NULL dereference in ->find_dpll for dp and TV-out ports, spotted by Paulo on irc. So just give up on this madness for now, and leave this to be fixed in a later patch. v5: Since the ever-so-slight change for g4x might result in some dpll parameter computation failing spuriously where before it didn't for ports with preset dpll settings (DP & TV-out) override this. For paranoia also do it in the ilk+ code. Cc: Paulo Zanoni <przanoni@gmail.com> Reviewed-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-06-04 04:40:22 +08:00
struct intel_limit;
struct dpll;
struct drm_i915_display_funcs {
int (*get_display_clock_speed)(struct drm_device *dev);
int (*get_fifo_size)(struct drm_device *dev, int plane);
drm/i915: move find_pll callback to dev_priv->display Now that the DP madness is cleared out, this is all only per-platform. So move it out from the intel clock limits structure. While at it drop the intel prefix on the static functions, call the vtable entry find_dpll (since it's for the display pll) and rip out the now unnecessary forward declarations. Note that the parameters of ->find_dpll are still unchanged, but they eventually need to be moved over to just take in a pipe configuration. But currently a lot of things are still missing from the pipe configuration (reflock, output-specific dpll limits and preferences, downclocked dotclock). So this will happen in a later step. Note that intel_g4x_limit has a peculiar case where it selects intel_limits_i9xx_sdvo as the limit. This is pretty bogus and also not used since the only output types left are DP and native TV-out which both use special pre-tuned dpll values. v2: Re-add comment for the find_pll callback (requested by Paulo) and elaborate on why the transformation is correct for g4x platforms (to clarify a review question from Paulo). Double up on that by adding a WARN as suggested by Paulo Zanoni on irc. v3: Initialize limits to NULL since gcc is now unhappy. v4: v2/3 will blow up with a NULL dereference in ->find_dpll for dp and TV-out ports, spotted by Paulo on irc. So just give up on this madness for now, and leave this to be fixed in a later patch. v5: Since the ever-so-slight change for g4x might result in some dpll parameter computation failing spuriously where before it didn't for ports with preset dpll settings (DP & TV-out) override this. For paranoia also do it in the ilk+ code. Cc: Paulo Zanoni <przanoni@gmail.com> Reviewed-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-06-04 04:40:22 +08:00
/**
* find_dpll() - Find the best values for the PLL
* @limit: limits for the PLL
* @crtc: current CRTC
* @target: target frequency in kHz
* @refclk: reference clock frequency in kHz
* @match_clock: if provided, @best_clock P divider must
* match the P divider from @match_clock
* used for LVDS downclocking
* @best_clock: best PLL values found
*
* Returns true on success, false on failure.
*/
bool (*find_dpll)(const struct intel_limit *limit,
struct intel_crtc_state *crtc_state,
drm/i915: move find_pll callback to dev_priv->display Now that the DP madness is cleared out, this is all only per-platform. So move it out from the intel clock limits structure. While at it drop the intel prefix on the static functions, call the vtable entry find_dpll (since it's for the display pll) and rip out the now unnecessary forward declarations. Note that the parameters of ->find_dpll are still unchanged, but they eventually need to be moved over to just take in a pipe configuration. But currently a lot of things are still missing from the pipe configuration (reflock, output-specific dpll limits and preferences, downclocked dotclock). So this will happen in a later step. Note that intel_g4x_limit has a peculiar case where it selects intel_limits_i9xx_sdvo as the limit. This is pretty bogus and also not used since the only output types left are DP and native TV-out which both use special pre-tuned dpll values. v2: Re-add comment for the find_pll callback (requested by Paulo) and elaborate on why the transformation is correct for g4x platforms (to clarify a review question from Paulo). Double up on that by adding a WARN as suggested by Paulo Zanoni on irc. v3: Initialize limits to NULL since gcc is now unhappy. v4: v2/3 will blow up with a NULL dereference in ->find_dpll for dp and TV-out ports, spotted by Paulo on irc. So just give up on this madness for now, and leave this to be fixed in a later patch. v5: Since the ever-so-slight change for g4x might result in some dpll parameter computation failing spuriously where before it didn't for ports with preset dpll settings (DP & TV-out) override this. For paranoia also do it in the ilk+ code. Cc: Paulo Zanoni <przanoni@gmail.com> Reviewed-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-06-04 04:40:22 +08:00
int target, int refclk,
struct dpll *match_clock,
struct dpll *best_clock);
void (*update_wm)(struct drm_crtc *crtc);
void (*update_sprite_wm)(struct drm_plane *plane,
struct drm_crtc *crtc,
uint32_t sprite_width, uint32_t sprite_height,
int pixel_size, bool enable, bool scaled);
int (*modeset_calc_cdclk)(struct drm_atomic_state *state);
void (*modeset_commit_cdclk)(struct drm_atomic_state *state);
/* Returns the active state of the crtc, and if the crtc is active,
* fills out the pipe-config with the hw state. */
bool (*get_pipe_config)(struct intel_crtc *,
struct intel_crtc_state *);
void (*get_initial_plane_config)(struct intel_crtc *,
struct intel_initial_plane_config *);
int (*crtc_compute_clock)(struct intel_crtc *crtc,
struct intel_crtc_state *crtc_state);
void (*crtc_enable)(struct drm_crtc *crtc);
void (*crtc_disable)(struct drm_crtc *crtc);
void (*audio_codec_enable)(struct drm_connector *connector,
struct intel_encoder *encoder,
struct drm_display_mode *mode);
void (*audio_codec_disable)(struct intel_encoder *encoder);
void (*fdi_link_train)(struct drm_crtc *crtc);
void (*init_clock_gating)(struct drm_device *dev);
int (*queue_flip)(struct drm_device *dev, struct drm_crtc *crtc,
struct drm_framebuffer *fb,
struct drm_i915_gem_object *obj,
struct drm_i915_gem_request *req,
uint32_t flags);
void (*update_primary_plane)(struct drm_crtc *crtc,
struct drm_framebuffer *fb,
int x, int y);
drm/i915: Fixup hpd irq register setup ordering For GMCH platforms we set up the hpd irq registers in the irq postinstall hook. But since we only enable the irq sources we actually need in PORT_HOTPLUG_EN/STATUS, taking dev_priv->hotplug_supported_mask into account, no hpd interrupt sources is enabled since commit 52d7ecedac3f96fb562cb482c139015372728638 Author: Daniel Vetter <daniel.vetter@ffwll.ch> Date: Sat Dec 1 21:03:22 2012 +0100 drm/i915: reorder setup sequence to have irqs for output setup Wrongly set-up interrupts also lead to broken hw-based load-detection on at least GM45, resulting in ghost VGA/TV-out outputs. To fix this, delay the hotplug register setup until after all outputs are set up, by moving it into a new dev_priv->display.hpd_irq_callback. We might also move the PCH_SPLIT platforms to such a setup eventually. Another funny part is that we need to delay the fbdev initial config probing until after the hpd regs are setup, for otherwise it'll detect ghost outputs. But we can only enable the hpd interrupt handling itself (and the output polling) _after_ that initial scan, due to massive locking brain-damage in the fbdev setup code. Add a big comment to explain this cute little dragon lair. v2: Encapsulate all the fbdev handling by wrapping the move call into intel_fbdev_initial_config in intel_fb.c. Requested by Chris Wilson. v3: Applied bikeshed from Jesse Barnes. v4: Imre Deak noticed that we also need to call intel_hpd_init after the drm_irqinstall calls in the gpu reset and resume paths - otherwise hotplug will be broken. Also improve the comment a bit about why hpd_init needs to be called before we set up the initial fbdev config. Bugzilla: Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=54943 Reported-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org> (v3) Reviewed-by: Imre Deak <imre.deak@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-12-11 21:05:07 +08:00
void (*hpd_irq_setup)(struct drm_device *dev);
/* clock updates for mode set */
/* cursor updates */
/* render clock increase/decrease */
/* display clock increase/decrease */
/* pll clock increase/decrease */
int (*setup_backlight)(struct intel_connector *connector, enum pipe pipe);
uint32_t (*get_backlight)(struct intel_connector *connector);
void (*set_backlight)(struct intel_connector *connector,
uint32_t level);
void (*disable_backlight)(struct intel_connector *connector);
void (*enable_backlight)(struct intel_connector *connector);
};
enum forcewake_domain_id {
FW_DOMAIN_ID_RENDER = 0,
FW_DOMAIN_ID_BLITTER,
FW_DOMAIN_ID_MEDIA,
FW_DOMAIN_ID_COUNT
};
enum forcewake_domains {
FORCEWAKE_RENDER = (1 << FW_DOMAIN_ID_RENDER),
FORCEWAKE_BLITTER = (1 << FW_DOMAIN_ID_BLITTER),
FORCEWAKE_MEDIA = (1 << FW_DOMAIN_ID_MEDIA),
FORCEWAKE_ALL = (FORCEWAKE_RENDER |
FORCEWAKE_BLITTER |
FORCEWAKE_MEDIA)
};
struct intel_uncore_funcs {
void (*force_wake_get)(struct drm_i915_private *dev_priv,
enum forcewake_domains domains);
void (*force_wake_put)(struct drm_i915_private *dev_priv,
enum forcewake_domains domains);
uint8_t (*mmio_readb)(struct drm_i915_private *dev_priv, off_t offset, bool trace);
uint16_t (*mmio_readw)(struct drm_i915_private *dev_priv, off_t offset, bool trace);
uint32_t (*mmio_readl)(struct drm_i915_private *dev_priv, off_t offset, bool trace);
uint64_t (*mmio_readq)(struct drm_i915_private *dev_priv, off_t offset, bool trace);
void (*mmio_writeb)(struct drm_i915_private *dev_priv, off_t offset,
uint8_t val, bool trace);
void (*mmio_writew)(struct drm_i915_private *dev_priv, off_t offset,
uint16_t val, bool trace);
void (*mmio_writel)(struct drm_i915_private *dev_priv, off_t offset,
uint32_t val, bool trace);
void (*mmio_writeq)(struct drm_i915_private *dev_priv, off_t offset,
uint64_t val, bool trace);
};
struct intel_uncore {
spinlock_t lock; /** lock is also taken in irq contexts. */
struct intel_uncore_funcs funcs;
unsigned fifo_count;
enum forcewake_domains fw_domains;
struct intel_uncore_forcewake_domain {
struct drm_i915_private *i915;
enum forcewake_domain_id id;
unsigned wake_count;
struct timer_list timer;
u32 reg_set;
u32 val_set;
u32 val_clear;
u32 reg_ack;
u32 reg_post;
u32 val_reset;
} fw_domain[FW_DOMAIN_ID_COUNT];
};
/* Iterate over initialised fw domains */
#define for_each_fw_domain_mask(domain__, mask__, dev_priv__, i__) \
for ((i__) = 0, (domain__) = &(dev_priv__)->uncore.fw_domain[0]; \
(i__) < FW_DOMAIN_ID_COUNT; \
(i__)++, (domain__) = &(dev_priv__)->uncore.fw_domain[i__]) \
if (((mask__) & (dev_priv__)->uncore.fw_domains) & (1 << (i__)))
#define for_each_fw_domain(domain__, dev_priv__, i__) \
for_each_fw_domain_mask(domain__, FORCEWAKE_ALL, dev_priv__, i__)
drm/i915/skl: Add DC5 Trigger Sequence Add triggers as per expectations mentioned in gen9_enable_dc5 and gen9_disable_dc5 patch. Also call POSTING_READ for every write to a register to ensure that its written immediately. v1: Remove POSTING_READ calls as they've already been added in previous patches. v2: Rebase to move all runtime pm specific changes to intel_runtime_pm.c file. Modified as per review comments from Imre: 1] Change variable name 'dc5_allowed' to 'dc5_enabled' to correspond to relevant functions. 2] Move the check dc5_enabled in skl_set_power_well() to disable DC5 into gen9_disable_DC5 which is a more appropriate place. 3] Convert checks for 'pm.dc5_enabled' and 'pm.suspended' in skl_set_power_well() to warnings. However, removing them for now as they'll be included in a future patch asserting DC-state entry/exit criteria. 4] Enable DC5, only when CSR firmware is verified to be loaded. Create new structure to track 'enabled' and 'deferred' status of DC5. 5] Ensure runtime PM reference is obtained, if CSR is not loaded, to avoid entering runtime-suspend and release it when it's loaded. 6] Protect necessary CSR-related code with locks. 7] Move CSR-loading call to runtime PM initialization, as power domains needed to be accessed during deferred DC5-enabling, are not initialized earlier. v3: Rebase to latest. Modified as per review comments from Imre: 1] Use blocking wait for CSR-loading to finish to enable DC5 for simplicity, instead of deferring enabling DC5 until CSR is loaded. 2] Obtain runtime PM reference during CSR-loading initialization itself as deferred DC5- enabling is removed and release it at the end of CSR-loading functionality. 3] Revert calling CSR-loading functionality to the beginning of i915 driver-load functionality to avoid any delay in loading. 4] Define another variable to track whether CSR-loading failed and use it to avoid enabling DC5 if it's true. 5] Define CSR-load-status accessor functions for use later. v4: 1] Disable DC5 before enabling PG2 instead of after it. 2] DC5 was being mistaken enabled even when CSR-loading timed-out. Fix that. 3] Enable DC5-related functionality using a macro. 4] Remove dc5_enabled tracking variable and its use as it's not needed now. v5: 1] Mark CSR failed to load where necessary in finish_csr_load function. 2] Use mutex-protected accessor function to check if CSR loaded instead of directly accessing the variable. 3] Prefix csr_load_status_get/set function names with intel_. v6: rebase to latest. v7: Rebase on top of nightly (Damien) v8: Squashed the patch from Imre - added csr helper pointers to simplify the code. (Imre) v9: After adding dmc ver 1.0 support rebased on top of nightly. (Animesh) v10: Added a enum for different csr states, suggested by Imre. (Animesh) v11: Based on review comments from Imre, Damien and Daniel following changes done - enum name chnaged to csr_state (singular form). - FW_UNINITIALIZED used as zeroth element in enum csr_state. - Prototype changed for helper function(set/get csr status), using enum csr_state instead of bool. v12: Based on review comment from Imre, introduced bool fw_loaded local to finish_csr_load() which helps calling once to set the csr status. The same flag used to fail RPM if find any issue during firmware loading. Issue: VIZ-2819 Signed-off-by: A.Sunil Kamath <sunil.kamath@intel.com> Signed-off-by: Suketu Shah <suketu.j.shah@intel.com> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Imre Deak <imre.deak@intel.com> Signed-off-by: Animesh Manna <animesh.manna@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-04-17 22:16:16 +08:00
enum csr_state {
FW_UNINITIALIZED = 0,
FW_LOADED,
FW_FAILED
};
drm/i915/skl: Add support to load SKL CSR firmware. Display Context Save and Restore support is needed for various SKL Display C states like DC5, DC6. This implementation is added based on first version of DMC CSR program that we received from h/w team. Here we are using request_firmware based design. Finally this firmware should end up in linux-firmware tree. For SKL platform its mandatory to ensure that we load this csr program before enabling DC states like DC5/DC6. As CSR program gets reset on various conditions, we should ensure to load it during boot and in future change to be added to load this system resume sequence too. v1: Initial relese as RFC patch v2: Design change as per Daniel, Damien and Shobit's review comments request firmware method followed. v3: Some optimization and functional changes. Pulled register defines into drivers/gpu/drm/i915/i915_reg.h Used kmemdup to allocate and duplicate firmware content. Ensured to free allocated buffer. v4: Modified as per review comments from Satheesh and Daniel Removed temporary buffer. Optimized number of writes by replacing I915_WRITE with I915_WRITE64. v5: Modified as per review comemnts from Damien. - Changed name for functions and firmware. - Introduced HAS_CSR. - Reverted back previous change and used csr_buf with u8 size. - Using cpu_to_be64 for endianness change. Modified as per review comments from Imre. - Modified registers and macro names to be a bit closer to bspec terminology and the existing register naming in the driver. - Early return for non SKL platforms in intel_load_csr_program function. - Added locking around CSR program load function as it may be called concurrently during system/runtime resume. - Releasing the fw before loading the program for consistency - Handled error path during f/w load. v6: Modified as per review comments from Imre. - Corrected out_freecsr sequence. v7: Modified as per review comments from Imre. Fail loading fw if fw->size%8!=0. v8: Rebase to latest. v9: Rebase on top of -nightly (Damien) v10: Enabled support for dmc firmware ver 1.0. According to ver 1.0 in a single binary package all the firmware's that are required for different stepping's of the product will be stored. The package contains the css header, followed by the package header and the actual dmc firmwares. Package header contains the firmware/stepping mapping table and the corresponding firmware offsets to the individual binaries, within the package. Each individual program binary contains the header and the payload sections whose size is specified in the header section. This changes are done to extract the specific firmaware from the package. (Animesh) v11: Modified as per review comemnts from Imre. - Added code comment from bpec for header structure elements. - Added __packed to avoid structure padding. - Added helper functions for stepping and substepping info. - Added code comment for CSR_MAX_FW_SIZE. - Disabled BXT firmware loading, will be enabled with dmc 1.0 support. - Changed skl_stepping_info based on bspec, earlier used from config DB. - Removed duplicate call of cpu_to_be* from intel_csr_load_program function. - Used cpu_to_be32 instead of cpu_to_be64 as firmware binary in dword aligned. - Added sanity check for header length. - Added sanity check for mmio address got from firmware binary. - kmalloc done separately for dmc header and dmc firmware. (Animesh) v12: Modified as per review comemnts from Imre. - Corrected the typo error in skl stepping info structure. - Added out-of-bound access for skl_stepping_info. - Sanity check for mmio address modified. - Sanity check added for stepping and substeppig. - Modified the intel_dmc_info structure, cache only the required header info. (Animesh) v13: clarify firmware load error message. The reason for a firmware loading failure can be obscure if the driver is built-in. Provide an explanation to the user about the likely reason for the failure and how to resolve it. (Imre) v14: Suggested by Jani. - fix s/I915/CONFIG_DRM_I915/ typo - add fw_path to the firmware object instead of using a static ptr (Jani) v15: 1) Changed the firmware name as dmc_gen9.bin, everytime for a new firmware version a symbolic link with same name will help not to build kernel again. 2) Changes done as per review comments from Imre. - Error check removed for intel_csr_ucode_init. - Moved csr-specific data structure to intel_csr.h and optimization done on structure definition. - fw->data used directly for parsing the header info & memory allocation only done separately for payload. (Animesh) v16: - No need for out_regs label in i915_driver_load(), so removed it. - Changed the firmware name as skl_dmc_ver1.bin, followed naming convention <platform>_dmc_<api-version>.bin (Animesh) Issue: VIZ-2569 Signed-off-by: A.Sunil Kamath <sunil.kamath@intel.com> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Animesh Manna <animesh.manna@intel.com> Signed-off-by: Imre Deak <imre.deak@intel.com> Reviewed-by: Imre Deak <imre.deak@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-04 20:58:44 +08:00
struct intel_csr {
const char *fw_path;
__be32 *dmc_payload;
uint32_t dmc_fw_size;
uint32_t mmio_count;
uint32_t mmioaddr[8];
uint32_t mmiodata[8];
drm/i915/skl: Add DC5 Trigger Sequence Add triggers as per expectations mentioned in gen9_enable_dc5 and gen9_disable_dc5 patch. Also call POSTING_READ for every write to a register to ensure that its written immediately. v1: Remove POSTING_READ calls as they've already been added in previous patches. v2: Rebase to move all runtime pm specific changes to intel_runtime_pm.c file. Modified as per review comments from Imre: 1] Change variable name 'dc5_allowed' to 'dc5_enabled' to correspond to relevant functions. 2] Move the check dc5_enabled in skl_set_power_well() to disable DC5 into gen9_disable_DC5 which is a more appropriate place. 3] Convert checks for 'pm.dc5_enabled' and 'pm.suspended' in skl_set_power_well() to warnings. However, removing them for now as they'll be included in a future patch asserting DC-state entry/exit criteria. 4] Enable DC5, only when CSR firmware is verified to be loaded. Create new structure to track 'enabled' and 'deferred' status of DC5. 5] Ensure runtime PM reference is obtained, if CSR is not loaded, to avoid entering runtime-suspend and release it when it's loaded. 6] Protect necessary CSR-related code with locks. 7] Move CSR-loading call to runtime PM initialization, as power domains needed to be accessed during deferred DC5-enabling, are not initialized earlier. v3: Rebase to latest. Modified as per review comments from Imre: 1] Use blocking wait for CSR-loading to finish to enable DC5 for simplicity, instead of deferring enabling DC5 until CSR is loaded. 2] Obtain runtime PM reference during CSR-loading initialization itself as deferred DC5- enabling is removed and release it at the end of CSR-loading functionality. 3] Revert calling CSR-loading functionality to the beginning of i915 driver-load functionality to avoid any delay in loading. 4] Define another variable to track whether CSR-loading failed and use it to avoid enabling DC5 if it's true. 5] Define CSR-load-status accessor functions for use later. v4: 1] Disable DC5 before enabling PG2 instead of after it. 2] DC5 was being mistaken enabled even when CSR-loading timed-out. Fix that. 3] Enable DC5-related functionality using a macro. 4] Remove dc5_enabled tracking variable and its use as it's not needed now. v5: 1] Mark CSR failed to load where necessary in finish_csr_load function. 2] Use mutex-protected accessor function to check if CSR loaded instead of directly accessing the variable. 3] Prefix csr_load_status_get/set function names with intel_. v6: rebase to latest. v7: Rebase on top of nightly (Damien) v8: Squashed the patch from Imre - added csr helper pointers to simplify the code. (Imre) v9: After adding dmc ver 1.0 support rebased on top of nightly. (Animesh) v10: Added a enum for different csr states, suggested by Imre. (Animesh) v11: Based on review comments from Imre, Damien and Daniel following changes done - enum name chnaged to csr_state (singular form). - FW_UNINITIALIZED used as zeroth element in enum csr_state. - Prototype changed for helper function(set/get csr status), using enum csr_state instead of bool. v12: Based on review comment from Imre, introduced bool fw_loaded local to finish_csr_load() which helps calling once to set the csr status. The same flag used to fail RPM if find any issue during firmware loading. Issue: VIZ-2819 Signed-off-by: A.Sunil Kamath <sunil.kamath@intel.com> Signed-off-by: Suketu Shah <suketu.j.shah@intel.com> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Imre Deak <imre.deak@intel.com> Signed-off-by: Animesh Manna <animesh.manna@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-04-17 22:16:16 +08:00
enum csr_state state;
drm/i915/skl: Add support to load SKL CSR firmware. Display Context Save and Restore support is needed for various SKL Display C states like DC5, DC6. This implementation is added based on first version of DMC CSR program that we received from h/w team. Here we are using request_firmware based design. Finally this firmware should end up in linux-firmware tree. For SKL platform its mandatory to ensure that we load this csr program before enabling DC states like DC5/DC6. As CSR program gets reset on various conditions, we should ensure to load it during boot and in future change to be added to load this system resume sequence too. v1: Initial relese as RFC patch v2: Design change as per Daniel, Damien and Shobit's review comments request firmware method followed. v3: Some optimization and functional changes. Pulled register defines into drivers/gpu/drm/i915/i915_reg.h Used kmemdup to allocate and duplicate firmware content. Ensured to free allocated buffer. v4: Modified as per review comments from Satheesh and Daniel Removed temporary buffer. Optimized number of writes by replacing I915_WRITE with I915_WRITE64. v5: Modified as per review comemnts from Damien. - Changed name for functions and firmware. - Introduced HAS_CSR. - Reverted back previous change and used csr_buf with u8 size. - Using cpu_to_be64 for endianness change. Modified as per review comments from Imre. - Modified registers and macro names to be a bit closer to bspec terminology and the existing register naming in the driver. - Early return for non SKL platforms in intel_load_csr_program function. - Added locking around CSR program load function as it may be called concurrently during system/runtime resume. - Releasing the fw before loading the program for consistency - Handled error path during f/w load. v6: Modified as per review comments from Imre. - Corrected out_freecsr sequence. v7: Modified as per review comments from Imre. Fail loading fw if fw->size%8!=0. v8: Rebase to latest. v9: Rebase on top of -nightly (Damien) v10: Enabled support for dmc firmware ver 1.0. According to ver 1.0 in a single binary package all the firmware's that are required for different stepping's of the product will be stored. The package contains the css header, followed by the package header and the actual dmc firmwares. Package header contains the firmware/stepping mapping table and the corresponding firmware offsets to the individual binaries, within the package. Each individual program binary contains the header and the payload sections whose size is specified in the header section. This changes are done to extract the specific firmaware from the package. (Animesh) v11: Modified as per review comemnts from Imre. - Added code comment from bpec for header structure elements. - Added __packed to avoid structure padding. - Added helper functions for stepping and substepping info. - Added code comment for CSR_MAX_FW_SIZE. - Disabled BXT firmware loading, will be enabled with dmc 1.0 support. - Changed skl_stepping_info based on bspec, earlier used from config DB. - Removed duplicate call of cpu_to_be* from intel_csr_load_program function. - Used cpu_to_be32 instead of cpu_to_be64 as firmware binary in dword aligned. - Added sanity check for header length. - Added sanity check for mmio address got from firmware binary. - kmalloc done separately for dmc header and dmc firmware. (Animesh) v12: Modified as per review comemnts from Imre. - Corrected the typo error in skl stepping info structure. - Added out-of-bound access for skl_stepping_info. - Sanity check for mmio address modified. - Sanity check added for stepping and substeppig. - Modified the intel_dmc_info structure, cache only the required header info. (Animesh) v13: clarify firmware load error message. The reason for a firmware loading failure can be obscure if the driver is built-in. Provide an explanation to the user about the likely reason for the failure and how to resolve it. (Imre) v14: Suggested by Jani. - fix s/I915/CONFIG_DRM_I915/ typo - add fw_path to the firmware object instead of using a static ptr (Jani) v15: 1) Changed the firmware name as dmc_gen9.bin, everytime for a new firmware version a symbolic link with same name will help not to build kernel again. 2) Changes done as per review comments from Imre. - Error check removed for intel_csr_ucode_init. - Moved csr-specific data structure to intel_csr.h and optimization done on structure definition. - fw->data used directly for parsing the header info & memory allocation only done separately for payload. (Animesh) v16: - No need for out_regs label in i915_driver_load(), so removed it. - Changed the firmware name as skl_dmc_ver1.bin, followed naming convention <platform>_dmc_<api-version>.bin (Animesh) Issue: VIZ-2569 Signed-off-by: A.Sunil Kamath <sunil.kamath@intel.com> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Animesh Manna <animesh.manna@intel.com> Signed-off-by: Imre Deak <imre.deak@intel.com> Reviewed-by: Imre Deak <imre.deak@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-04 20:58:44 +08:00
};
#define DEV_INFO_FOR_EACH_FLAG(func, sep) \
func(is_mobile) sep \
func(is_i85x) sep \
func(is_i915g) sep \
func(is_i945gm) sep \
func(is_g33) sep \
func(need_gfx_hws) sep \
func(is_g4x) sep \
func(is_pineview) sep \
func(is_broadwater) sep \
func(is_crestline) sep \
func(is_ivybridge) sep \
func(is_valleyview) sep \
func(is_haswell) sep \
func(is_skylake) sep \
func(is_preliminary) sep \
func(has_fbc) sep \
func(has_pipe_cxsr) sep \
func(has_hotplug) sep \
func(cursor_needs_physical) sep \
func(has_overlay) sep \
func(overlay_needs_physical) sep \
func(supports_tv) sep \
func(has_llc) sep \
func(has_ddi) sep \
func(has_fpga_dbg)
#define DEFINE_FLAG(name) u8 name:1
#define SEP_SEMICOLON ;
struct intel_device_info {
u32 display_mmio_offset;
u16 device_id;
u8 num_pipes:3;
u8 num_sprites[I915_MAX_PIPES];
u8 gen;
u8 ring_mask; /* Rings supported by the HW */
DEV_INFO_FOR_EACH_FLAG(DEFINE_FLAG, SEP_SEMICOLON);
/* Register offsets for the various display pipes and transcoders */
int pipe_offsets[I915_MAX_TRANSCODERS];
int trans_offsets[I915_MAX_TRANSCODERS];
int palette_offsets[I915_MAX_PIPES];
int cursor_offsets[I915_MAX_PIPES];
/* Slice/subslice/EU info */
u8 slice_total;
u8 subslice_total;
u8 subslice_per_slice;
u8 eu_total;
u8 eu_per_subslice;
/* For each slice, which subslice(s) has(have) 7 EUs (bitfield)? */
u8 subslice_7eu[3];
u8 has_slice_pg:1;
u8 has_subslice_pg:1;
u8 has_eu_pg:1;
};
#undef DEFINE_FLAG
#undef SEP_SEMICOLON
enum i915_cache_level {
I915_CACHE_NONE = 0,
I915_CACHE_LLC, /* also used for snoopable memory on non-LLC */
I915_CACHE_L3_LLC, /* gen7+, L3 sits between the domain specifc
caches, eg sampler/render caches, and the
large Last-Level-Cache. LLC is coherent with
the CPU, but L3 is only visible to the GPU. */
I915_CACHE_WT, /* hsw:gt3e WriteThrough for scanouts */
};
struct i915_ctx_hang_stats {
/* This context had batch pending when hang was declared */
unsigned batch_pending;
/* This context had batch active when hang was declared */
unsigned batch_active;
/* Time when this context was last blamed for a GPU reset */
unsigned long guilty_ts;
/* If the contexts causes a second GPU hang within this time,
* it is permanently banned from submitting any more work.
*/
unsigned long ban_period_seconds;
/* This context is banned to submit more work */
bool banned;
};
/* This must match up with the value previously used for execbuf2.rsvd1. */
drm/i915: Emphasize that ctx->id is merely a user handle This is an Execlists preparatory patch, since they make context ID become an overloaded term: - In the software, it was used to distinguish which context userspace was trying to use. - In the BSpec, the term is used to describe the 20-bits long field the hardware uses to it to discriminate the contexts that are submitted to the ELSP and inform the driver about their current status (via Context Switch Interrupts and Context Status Buffers). Initially, I tried to make the different meanings converge, but it proved impossible: - The software ctx->id is per-filp, while the hardware one needs to be globally unique. - Also, we multiplex several backing states objects per intel_context, and all of them need unique HW IDs. - I tried adding a per-filp ID and then composing the HW context ID as: ctx->id + file_priv->id + ring->id, but the fact that the hardware only uses 20-bits means we have to artificially limit the number of filps or contexts the userspace can create. The ctx->user_handle renaming bits are done with this Cocci patch (plus manual frobbing of the struct declaration): @@ struct intel_context c; @@ - (c).id + c.user_handle @@ struct intel_context *c; @@ - (c)->id + c->user_handle Also, while we are at it, s/DEFAULT_CONTEXT_ID/DEFAULT_CONTEXT_HANDLE and change the type to unsigned 32 bits. v2: s/handle/user_handle and change the type to uint32_t as suggested by Chris Wilson. Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org> (v1) Signed-off-by: Oscar Mateo <oscar.mateo@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-03 23:28:00 +08:00
#define DEFAULT_CONTEXT_HANDLE 0
#define CONTEXT_NO_ZEROMAP (1<<0)
/**
* struct intel_context - as the name implies, represents a context.
* @ref: reference count.
* @user_handle: userspace tracking identity for this context.
* @remap_slice: l3 row remapping information.
* @flags: context specific flags:
* CONTEXT_NO_ZEROMAP: do not allow mapping things to page 0.
* @file_priv: filp associated with this context (NULL for global default
* context).
* @hang_stats: information about the role of this context in possible GPU
* hangs.
* @ppgtt: virtual memory space used by this context.
* @legacy_hw_ctx: render context backing object and whether it is correctly
* initialized (legacy ring submission mechanism only).
* @link: link in the global list of contexts.
*
* Contexts are memory images used by the hardware to store copies of their
* internal state.
*/
struct intel_context {
struct kref ref;
drm/i915: Emphasize that ctx->id is merely a user handle This is an Execlists preparatory patch, since they make context ID become an overloaded term: - In the software, it was used to distinguish which context userspace was trying to use. - In the BSpec, the term is used to describe the 20-bits long field the hardware uses to it to discriminate the contexts that are submitted to the ELSP and inform the driver about their current status (via Context Switch Interrupts and Context Status Buffers). Initially, I tried to make the different meanings converge, but it proved impossible: - The software ctx->id is per-filp, while the hardware one needs to be globally unique. - Also, we multiplex several backing states objects per intel_context, and all of them need unique HW IDs. - I tried adding a per-filp ID and then composing the HW context ID as: ctx->id + file_priv->id + ring->id, but the fact that the hardware only uses 20-bits means we have to artificially limit the number of filps or contexts the userspace can create. The ctx->user_handle renaming bits are done with this Cocci patch (plus manual frobbing of the struct declaration): @@ struct intel_context c; @@ - (c).id + c.user_handle @@ struct intel_context *c; @@ - (c)->id + c->user_handle Also, while we are at it, s/DEFAULT_CONTEXT_ID/DEFAULT_CONTEXT_HANDLE and change the type to unsigned 32 bits. v2: s/handle/user_handle and change the type to uint32_t as suggested by Chris Wilson. Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org> (v1) Signed-off-by: Oscar Mateo <oscar.mateo@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-03 23:28:00 +08:00
int user_handle;
drm/i915: Do remaps for all contexts On both Ivybridge and Haswell, row remapping information is saved and restored with context. This means, we never actually properly supported the l3 remapping because our sysfs interface is asynchronous (and not tied to any context), and the known faulty HW would be reused by the next context to run. Not that due to the asynchronous nature of the sysfs entry, there is no point modifying the registers for the existing context. Instead we set a flag for all contexts to load the correct remapping information on the next run. Interested clients can use debugfs to determine whether or not the row has been remapped. One could propose at this point that we just do the remapping in the kernel. I guess since we have to maintain the sysfs interface anyway, I'm not sure how useful it is, and I do like keeping the policy in userspace; (it wasn't my original decision to make the interface the way it is, so I'm not attached). v2: Force a context switch when we have a remap on the next switch. (Ville) Don't let userspace use the interface with disabled contexts. v3: Don't force a context switch, just let it nop Improper context slice remap initialization, 1<<1 instead of 1<<i, but I rewrote it to avoid a second round of confusion. Error print moved to error path (All Ville) Added a comment on why the slice remap initialization happens. CC: Ville Syrjälä <ville.syrjala@linux.intel.com> Signed-off-by: Ben Widawsky <ben@bwidawsk.net> Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-09-19 10:03:18 +08:00
uint8_t remap_slice;
drm/i915: Store device pointer in contexts for late tracepoint usafe [ 1572.417121] BUG: unable to handle kernel NULL pointer dereference at (null) [ 1572.421010] IP: [<ffffffffa00b2514>] ftrace_raw_event_i915_context+0x5d/0x70 [i915] [ 1572.424970] PGD 1766a3067 PUD 1767a2067 PMD 0 [ 1572.428892] Oops: 0000 [#1] SMP [ 1572.432787] Modules linked in: ipv6 dm_mod iTCO_wdt iTCO_vendor_support snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel snd_hda_controller snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer snd soundcore serio_raw pcspkr lpc_ich i2c_i801 mfd_core battery ac acpi_cpufreq i915 button video drm_kms_helper drm [ 1572.441720] CPU: 2 PID: 18853 Comm: kworker/u8:0 Not tainted 4.0.0_kcloud_3f0360_20150429+ #588 [ 1572.446298] Workqueue: i915 i915_gem_retire_work_handler [i915] [ 1572.450876] task: ffff880002f428f0 ti: ffff880035724000 task.ti: ffff880035724000 [ 1572.455557] RIP: 0010:[<ffffffffa00b2514>] [<ffffffffa00b2514>] ftrace_raw_event_i915_context+0x5d/0x70 [i915] [ 1572.460423] RSP: 0018:ffff880035727ce8 EFLAGS: 00010286 [ 1572.465262] RAX: ffff880073f1643c RBX: ffff880002da9058 RCX: ffff880073e5db40 [ 1572.470179] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880035727ce8 [ 1572.475107] RBP: ffff88007bb11a00 R08: 0000000000000000 R09: 0000000000000000 [ 1572.480034] R10: 0000000000362200 R11: 0000000000000008 R12: 0000000000000000 [ 1572.484952] R13: ffff880035727d78 R14: ffff880002dc1c98 R15: ffff880002dc1dc8 [ 1572.489886] FS: 0000000000000000(0000) GS:ffff88017fd00000(0000) knlGS:0000000000000000 [ 1572.494883] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 1572.499859] CR2: 0000000000000000 CR3: 000000017572a000 CR4: 00000000001006e0 [ 1572.504842] Stack: [ 1572.509834] ffff88017b0090c0 ffff880073f16438 ffff880002da9058 ffff880073f1643c [ 1572.514904] 0000000000000246 ffff880100000000 ffff88007bb11a00 ffff880002ddeb10 [ 1572.519985] ffff8801759f79c0 ffffffffa0092ff0 0000000000000000 ffff88007bb11a00 [ 1572.525049] Call Trace: [ 1572.530093] [<ffffffffa0092ff0>] ? i915_gem_context_free+0xa8/0xc1 [i915] [ 1572.535227] [<ffffffffa009b969>] ? i915_gem_request_free+0x4e/0x50 [i915] [ 1572.540347] [<ffffffffa00b5533>] ? intel_execlists_retire_requests+0x14c/0x159 [i915] [ 1572.545500] [<ffffffffa009d9ea>] ? i915_gem_retire_requests+0x9d/0xeb [i915] [ 1572.550664] [<ffffffffa009dd8c>] ? i915_gem_retire_work_handler+0x4c/0x61 [i915] [ 1572.555825] [<ffffffff8104ca7f>] ? process_one_work+0x1b2/0x31d [ 1572.560951] [<ffffffff8104d278>] ? worker_thread+0x24d/0x339 [ 1572.566033] [<ffffffff8104d02b>] ? cancel_delayed_work_sync+0xa/0xa [ 1572.571140] [<ffffffff81050b25>] ? kthread+0xce/0xd6 [ 1572.576191] [<ffffffff81050a57>] ? kthread_create_on_node+0x162/0x162 [ 1572.581228] [<ffffffff8179b3c8>] ? ret_from_fork+0x58/0x90 [ 1572.586259] [<ffffffff81050a57>] ? kthread_create_on_node+0x162/0x162 [ 1572.591318] Code: de 48 89 e7 e8 09 4d 00 e1 48 85 c0 74 27 48 89 68 10 48 8b 55 38 48 89 e7 48 89 50 18 48 8b 55 10 48 8b 12 48 8b 12 48 8b 52 38 <8b> 12 89 50 08 e8 95 4d 00 e1 48 83 c4 30 5b 5d 41 5c c3 41 55 [ 1572.596981] RIP [<ffffffffa00b2514>] ftrace_raw_event_i915_context+0x5d/0x70 [i915] [ 1572.602464] RSP <ffff880035727ce8> [ 1572.607911] CR2: 0000000000000000 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=90112#c23 Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-05 16:17:29 +08:00
struct drm_i915_private *i915;
int flags;
struct drm_i915_file_private *file_priv;
struct i915_ctx_hang_stats hang_stats;
struct i915_hw_ppgtt *ppgtt;
/* Legacy ring buffer submission */
drm/i915: Emphasize that ctx->obj & ctx->is_initialized refer to the legacy rcs ctx We have already advanced that Logical Ring Contexts have their own kind of backing objects, but everything will be better explained in the Execlists series. For now, suffice it to say that the current backing object is only ever used with the render ring, so we're making this fact more explicit (which is a good reason on its own). As for the is_initialized flag, we only use to signify that the render state has been initialized (a.k.a. golden context, a.k.a. null context). It doesn't mean anything for the other engines, so make that distinction obvious. Done with the following Coccinelle patch (plus manual frobbing of the struct): @@ struct intel_context c; @@ - (c).obj + c.legacy_hw_ctx.rcs_state @@ struct intel_context *c; @@ - (c)->obj + c->legacy_hw_ctx.rcs_state @@ struct intel_context c; @@ - (c).is_initialized + c.legacy_hw_ctx.initialized @@ struct intel_context *c; @@ - (c)->is_initialized + c->legacy_hw_ctx.initialized This Execlists prep-work patch has been suggested by Chris Wilson and Daniel Vetter separately. Initially, it was two separate patches: drm/i915: Rename ctx->obj to ctx->rcs_state drm/i915: Make it obvious that ctx->id is merely a user handle Signed-off-by: Oscar Mateo <oscar.mateo@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> [danvet: s/id/is_initialized/ to fix the subject and resolve a conflict in i915_gem_context_reset. Also introduce a new lctx local variable to avoid overtly long lines.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-03 23:27:59 +08:00
struct {
struct drm_i915_gem_object *rcs_state;
bool initialized;
} legacy_hw_ctx;
/* Execlists */
bool rcs_initialized;
struct {
struct drm_i915_gem_object *state;
struct intel_ringbuffer *ringbuf;
int pin_count;
} engine[I915_NUM_RINGS];
struct list_head link;
};
enum fb_op_origin {
ORIGIN_GTT,
ORIGIN_CPU,
ORIGIN_CS,
ORIGIN_FLIP,
};
struct i915_fbc {
/* This is always the inner lock when overlapping with struct_mutex and
* it's the outer lock when overlapping with stolen_lock. */
struct mutex lock;
unsigned long uncompressed_size;
unsigned threshold;
unsigned int fb_id;
unsigned int possible_framebuffer_bits;
unsigned int busy_bits;
struct intel_crtc *crtc;
int y;
struct drm_mm_node compressed_fb;
struct drm_mm_node *compressed_llb;
bool false_color;
/* Tracks whether the HW is actually enabled, not whether the feature is
* possible. */
bool enabled;
struct intel_fbc_work {
struct delayed_work work;
struct intel_crtc *crtc;
struct drm_framebuffer *fb;
} *fbc_work;
enum no_fbc_reason {
FBC_OK, /* FBC is enabled */
FBC_UNSUPPORTED, /* FBC is not supported by this chipset */
FBC_NO_OUTPUT, /* no outputs enabled to compress */
FBC_STOLEN_TOO_SMALL, /* not enough space for buffers */
FBC_UNSUPPORTED_MODE, /* interlace or doublescanned mode */
FBC_MODE_TOO_LARGE, /* mode too large for compression */
FBC_BAD_PLANE, /* fbc not supported on plane */
FBC_NOT_TILED, /* buffer not tiled */
FBC_MULTIPLE_PIPES, /* more than one pipe active */
FBC_MODULE_PARAM,
FBC_CHIP_DEFAULT, /* disabled by default on this chip */
FBC_ROTATION, /* rotation is not supported */
FBC_IN_DBG_MASTER, /* kernel debugger is active */
} no_fbc_reason;
bool (*fbc_enabled)(struct drm_i915_private *dev_priv);
void (*enable_fbc)(struct intel_crtc *crtc);
void (*disable_fbc)(struct drm_i915_private *dev_priv);
};
/**
* HIGH_RR is the highest eDP panel refresh rate read from EDID
* LOW_RR is the lowest eDP panel refresh rate found from EDID
* parsing for same resolution.
*/
enum drrs_refresh_rate_type {
DRRS_HIGH_RR,
DRRS_LOW_RR,
DRRS_MAX_RR, /* RR count */
};
enum drrs_support_type {
DRRS_NOT_SUPPORTED = 0,
STATIC_DRRS_SUPPORT = 1,
SEAMLESS_DRRS_SUPPORT = 2
drm/i915: Add support for DRRS to switch RR This patch computes and stored 2nd M/N/TU for switching to different refresh rate dynamically. PIPECONF_EDP_RR_MODE_SWITCH bit helps toggle between alternate refresh rates programmed in 2nd M/N/TU registers. v2: Daniel's review comments Computing M2/N2 in compute_config and storing it in crtc_config v3: Modified reference to edp_downclock and edp_downclock_avail based on the changes made to move them from dev_private to intel_panel. v4: Modified references to is_drrs_supported based on the changes made to rename it to drrs_support. v5: Jani's review comments Removed superfluous return statements. Changed support for Gen 7 and above. Corrected indentation. Re-structured the code which finds crtc and connector from encoder. Changed some logs to be less verbose. v6: Modifying i915_drrs to include only intel connector as intel_dp can be derived from intel connector when required. v7: As per internal review comments, acquiring mutex just before accessing drrs RR. As per Chris's review comments, added documentation about the use of locking in the function. v8: Incorporated Jani's review comments. Removed reference to edp_downclock. v9: Jani's review comments. Modified comment in set_drrs. Changed index to type edp_drrs_refresh_rate_type. Check if PSR is enabled before setting registers fo DRRS. Signed-off-by: Pradeep Bhat <pradeep.bhat@intel.com> Signed-off-by: Vandana Kannan <vandana.kannan@intel.com> Cc: Jani Nikula <jani.nikula@linux.intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-04-05 14:43:28 +08:00
};
struct intel_dp;
struct i915_drrs {
struct mutex mutex;
struct delayed_work work;
struct intel_dp *dp;
unsigned busy_frontbuffer_bits;
enum drrs_refresh_rate_type refresh_rate_type;
enum drrs_support_type type;
};
struct i915_psr {
struct mutex lock;
bool sink_support;
bool source_ok;
struct intel_dp *enabled;
bool active;
struct delayed_work work;
drm/i915: Fix up PSR frontbuffer tracking I've tried to split this up, but all the changes are so tightly related that I didn't find a good way to do this without breaking bisecting. Essentially this completely changes how psr is glued into the overall driver, and there's not much you can do to soften such a paradigm change. - Use frontbuffer tracking bits stuff to separate disable and re-enable. - Don't re-check everything in the psr work. We have now accurate tracking for everything, so no need to check for sprites or tiling really. Allows us to ditch tons of locks. - That in turn allows us to properly cancel the work in the disable function - no more deadlocks. - Add a check for HSW sprites and force a flush. Apparently the hardware doesn't forward the flushing when updating the sprite base address. We can do the same trick everywhere else we have such issues, e.g. on baytrail with ... everything. - Don't re-enable psr with a delay in psr_exit. It really must be turned off forever if we detect a gtt write. At least with the current frontbuffer render tracking. Userspace can do a busy ioctl call or no-op pageflip to re-enable psr. - Drop redundant checks for crtc and crtc->active - now that they're only called from enable this is guaranteed. - Fix up the hsw port check. eDP can also happen on port D, but the issue is exactly that it doesn't work there. So an || check is wrong. - We still schedule the psr work with a delay. The frontbuffer flushing interface mandates that we upload the next full frame, so need to wait a bit. Once we have single-shot frame uploads we can do better here. v2: Don't enable psr initially, rely upon the fb flush of the initial plane setup for that. Gives us more unified code flow and makes the crtc enable sequence less a special case. v3: s/psr_exit/psr_invalidate/ for consistency v4: Fixup whitespace. v5: Correctly bail out of psr_invalidate/flush when dev_priv->psr.enabled is NULL. Spotted by Rodrigo. v6: - Only schedule work when there's work to do. Fixes WARNINGs reported by Rodrigo. - Comments Chris requested to clarify the code. v7: Fix conflict on rebase (Rodrigo) Cc: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch> (v6) Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-12 01:30:16 +08:00
unsigned busy_frontbuffer_bits;
bool psr2_support;
bool aux_frame_sync;
};
enum intel_pch {
drm/i915: add PCH_NONE to enum intel_pch And rely on the fact that it's 0 to assume that machines without a PCH will have PCH_NONE as dev_priv->pch_type. Just today I finally realized that HAS_PCH_IBX is true for machines without a PCH. IMHO this is totally counter-intuitive and I don't think it's a good idea to assume that we're going to check for HAS_PCH_IBX only after we check for HAS_PCH_SPLIT. I believe that in the future we'll have more PCH types and checks like: if (HAS_PCH_IBX(dev) || HAS_PCH_CPT(dev)) will become more and more common. There's a good chance that we may break non-PCH machines by adding these checks in code that runs on all machines. I also believe that the HAS_PCH_SPLIT check will become less common as we add more and more different PCH types. We'll probably start replacing checks like: if (HAS_PCH_SPLIT(dev)) foo(); else bar(); with: if (HAS_PCH_NEW(dev)) baz(); else if (HAS_PCH_OLD(dev) || HAS_PCH_IBX(dev)) foo(); else bar(); and this may break gen 2/3/4. As far as we have investigated, this patch will affect the behavior of intel_hdmi_dpms and intel_dp_link_down on gen 4. In both functions the code inside the HAS_PCH_IBX check is for IBX-specific workarounds, so we should be safe. If we start bisecting gen 2/3/4 bugs to this commit we should consider replacing the HAS_PCH_IBX checks with something else. V2: Improve commit message, list possible side effects and solution. Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-07-04 05:48:16 +08:00
PCH_NONE = 0, /* No PCH present */
PCH_IBX, /* Ibexpeak PCH */
PCH_CPT, /* Cougarpoint PCH */
PCH_LPT, /* Lynxpoint PCH */
PCH_SPT, /* Sunrisepoint PCH */
PCH_NOP,
};
enum intel_sbi_destination {
SBI_ICLK,
SBI_MPHY,
};
#define QUIRK_PIPEA_FORCE (1<<0)
#define QUIRK_LVDS_SSC_DISABLE (1<<1)
#define QUIRK_INVERT_BRIGHTNESS (1<<2)
#define QUIRK_BACKLIGHT_PRESENT (1<<3)
#define QUIRK_PIPEB_FORCE (1<<4)
#define QUIRK_PIN_SWIZZLED_PAGES (1<<5)
struct intel_fbdev;
struct intel_fbc_work;
struct intel_gmbus {
struct i2c_adapter adapter;
u32 force_bit;
u32 reg0;
u32 gpio_reg;
struct i2c_algo_bit_data bit_algo;
struct drm_i915_private *dev_priv;
};
struct i915_suspend_saved_registers {
u32 saveDSPARB;
u32 saveLVDS;
u32 savePP_ON_DELAYS;
u32 savePP_OFF_DELAYS;
u32 savePP_ON;
u32 savePP_OFF;
u32 savePP_CONTROL;
u32 savePP_DIVISOR;
u32 saveFBC_CONTROL;
u32 saveCACHE_MODE_0;
u32 saveMI_ARB_STATE;
u32 saveSWF0[16];
u32 saveSWF1[16];
u32 saveSWF2[3];
uint64_t saveFENCE[I915_MAX_NUM_FENCES];
u32 savePCH_PORT_HOTPLUG;
u16 saveGCDGMBUS;
};
struct vlv_s0ix_state {
/* GAM */
u32 wr_watermark;
u32 gfx_prio_ctrl;
u32 arb_mode;
u32 gfx_pend_tlb0;
u32 gfx_pend_tlb1;
u32 lra_limits[GEN7_LRA_LIMITS_REG_NUM];
u32 media_max_req_count;
u32 gfx_max_req_count;
u32 render_hwsp;
u32 ecochk;
u32 bsd_hwsp;
u32 blt_hwsp;
u32 tlb_rd_addr;
/* MBC */
u32 g3dctl;
u32 gsckgctl;
u32 mbctl;
/* GCP */
u32 ucgctl1;
u32 ucgctl3;
u32 rcgctl1;
u32 rcgctl2;
u32 rstctl;
u32 misccpctl;
/* GPM */
u32 gfxpause;
u32 rpdeuhwtc;
u32 rpdeuc;
u32 ecobus;
u32 pwrdwnupctl;
u32 rp_down_timeout;
u32 rp_deucsw;
u32 rcubmabdtmr;
u32 rcedata;
u32 spare2gh;
/* Display 1 CZ domain */
u32 gt_imr;
u32 gt_ier;
u32 pm_imr;
u32 pm_ier;
u32 gt_scratch[GEN7_GT_SCRATCH_REG_NUM];
/* GT SA CZ domain */
u32 tilectl;
u32 gt_fifoctl;
u32 gtlc_wake_ctrl;
u32 gtlc_survive;
u32 pmwgicz;
/* Display 2 CZ domain */
u32 gu_ctl0;
u32 gu_ctl1;
u32 pcbr;
u32 clock_gate_dis2;
};
struct intel_rps_ei {
u32 cz_clock;
u32 render_c0;
u32 media_c0;
};
struct intel_gen6_power_mgmt {
drm/i915: sanitize rps irq disabling When disabling the RPS interrupts there is a tricky dependency between the thread disabling the interrupts, the RPS interrupt handler and the corresponding RPS work. The RPS work can reenable the interrupts, so there is no straightforward order in the disabling thread to (1) make sure that any RPS work is flushed and to (2) disable all RPS interrupts. Currently this is solved by masking the interrupts using two separate mask registers (first level display IMR and PM IMR) and doing the disabling when all first level interrupts are disabled. This works, but the requirement to run with all first level interrupts disabled is unnecessary making the suspend / unload time ordering of RPS disabling wrt. other unitialization steps difficult and error prone. Removing this restriction allows us to disable RPS early during suspend / unload and forget about it for the rest of the sequence. By adding a more explicit method for avoiding the above race, it also becomes easier to prove its correctness. Finally currently we can hit the WARN in snb_update_pm_irq(), when a final RPS work runs with the first level interrupts already disabled. This won't lead to any problem (due to the separate interrupt masks), but with the change in this and the next patch we can get rid of the WARN, while leaving it in place for other scenarios. To address the above points, add a new RPS interrupts_enabled flag and use this during RPS disabling to avoid requeuing the RPS work and reenabling of the RPS interrupts. Since the interrupt disabling happens now in intel_suspend_gt_powersave(), we will disable RPS interrupts explicitly during suspend (and not just through the first level mask), but there is no problem doing so, it's also more consistent and allows us to unify more of the RPS disabling during suspend and unload time in the next patch. v2/v3: - rebase on patch "drm/i915: move rps irq disable one level up" in the patchset Signed-off-by: Imre Deak <imre.deak@intel.com> Reviewed-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-11-19 21:30:04 +08:00
/*
* work, interrupts_enabled and pm_iir are protected by
* dev_priv->irq_lock
*/
struct work_struct work;
drm/i915: sanitize rps irq disabling When disabling the RPS interrupts there is a tricky dependency between the thread disabling the interrupts, the RPS interrupt handler and the corresponding RPS work. The RPS work can reenable the interrupts, so there is no straightforward order in the disabling thread to (1) make sure that any RPS work is flushed and to (2) disable all RPS interrupts. Currently this is solved by masking the interrupts using two separate mask registers (first level display IMR and PM IMR) and doing the disabling when all first level interrupts are disabled. This works, but the requirement to run with all first level interrupts disabled is unnecessary making the suspend / unload time ordering of RPS disabling wrt. other unitialization steps difficult and error prone. Removing this restriction allows us to disable RPS early during suspend / unload and forget about it for the rest of the sequence. By adding a more explicit method for avoiding the above race, it also becomes easier to prove its correctness. Finally currently we can hit the WARN in snb_update_pm_irq(), when a final RPS work runs with the first level interrupts already disabled. This won't lead to any problem (due to the separate interrupt masks), but with the change in this and the next patch we can get rid of the WARN, while leaving it in place for other scenarios. To address the above points, add a new RPS interrupts_enabled flag and use this during RPS disabling to avoid requeuing the RPS work and reenabling of the RPS interrupts. Since the interrupt disabling happens now in intel_suspend_gt_powersave(), we will disable RPS interrupts explicitly during suspend (and not just through the first level mask), but there is no problem doing so, it's also more consistent and allows us to unify more of the RPS disabling during suspend and unload time in the next patch. v2/v3: - rebase on patch "drm/i915: move rps irq disable one level up" in the patchset Signed-off-by: Imre Deak <imre.deak@intel.com> Reviewed-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-11-19 21:30:04 +08:00
bool interrupts_enabled;
u32 pm_iir;
/* Frequencies are stored in potentially platform dependent multiples.
* In other words, *_freq needs to be multiplied by X to be interesting.
* Soft limits are those which are used for the dynamic reclocking done
* by the driver (raise frequencies under heavy loads, and lower for
* lighter loads). Hard limits are those imposed by the hardware.
*
* A distinction is made for overclocking, which is never enabled by
* default, and is considered to be above the hard limit if it's
* possible at all.
*/
u8 cur_freq; /* Current frequency (cached, may not == HW) */
u8 min_freq_softlimit; /* Minimum frequency permitted by the driver */
u8 max_freq_softlimit; /* Max frequency permitted by the driver */
u8 max_freq; /* Maximum frequency, RP0 if not overclocking */
u8 min_freq; /* AKA RPn. Minimum frequency */
u8 idle_freq; /* Frequency to request when we are idle */
u8 efficient_freq; /* AKA RPe. Pre-determined balanced frequency */
u8 rp1_freq; /* "less than" RP0 power/freqency */
u8 rp0_freq; /* Non-overclocked max frequency. */
u32 cz_freq;
u8 up_threshold; /* Current %busy required to uplock */
u8 down_threshold; /* Current %busy required to downclock */
drm/i915: Tweak RPS thresholds to more aggressively downclock After applying wait-boost we often find ourselves stuck at higher clocks than required. The current threshold value requires the GPU to be continuously and completely idle for 313ms before it is dropped by one bin. Conversely, we require the GPU to be busy for an average of 90% over a 84ms period before we upclock. So the current thresholds almost never downclock the GPU, and respond very slowly to sudden demands for more power. It is easy to observe that we currently lock into the wrong bin and both underperform in benchmarks and consume more power than optimal (just by repeating the task and measuring the different results). An alternative approach, as discussed in the bspec, is to use a continuous threshold for upclocking, and an average value for downclocking. This is good for quickly detecting and reacting to state changes within a frame, however it fails with the common throttling method of waiting upon the outstanding frame - at least it is difficult to choose a threshold that works well at 15,000fps and at 60fps. So continue to use average busy/idle loads to determine frequency change. v2: Use 3 power zones to keep frequencies low in steady-state mostly idle (e.g. scrolling, interactive 2D drawing), and frequencies high for demanding games. In between those end-states, we use a fast-reclocking algorithm to converge more quickly on the desired bin. v3: Bug fixes - make sure we reset adj after switching power zones. v4: Tune - drop the continuous busy thresholds as it prevents us from choosing the right frequency for glxgears style swap benchmarks. Instead the goal is to be able to find the right clocks irrespective of the wait-boost. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Kenneth Graunke <kenneth@whitecape.org> Cc: Stéphane Marchesin <stephane.marchesin@gmail.com> Cc: Owen Taylor <otaylor@redhat.com> Cc: "Meng, Mengmeng" <mengmeng.meng@intel.com> Cc: "Zhuang, Lena" <lena.zhuang@intel.com> Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-09-26 00:34:57 +08:00
int last_adj;
enum { LOW_POWER, BETWEEN, HIGH_POWER } power;
spinlock_t client_lock;
struct list_head clients;
bool client_boost;
bool enabled;
struct delayed_work delayed_resume_work;
unsigned boosts;
struct intel_rps_client semaphores, mmioflips;
/* manual wa residency calculations */
struct intel_rps_ei up_ei, down_ei;
/*
* Protects RPS/RC6 register access and PCU communication.
* Must be taken after struct_mutex if nested. Note that
* this lock may be held for long periods of time when
* talking to hw - so only take it when talking to hw!
*/
struct mutex hw_lock;
};
/* defined intel_pm.c */
extern spinlock_t mchdev_lock;
struct intel_ilk_power_mgmt {
u8 cur_delay;
u8 min_delay;
u8 max_delay;
u8 fmax;
u8 fstart;
u64 last_count1;
unsigned long last_time1;
unsigned long chipset_power;
u64 last_count2;
u64 last_time2;
unsigned long gfx_power;
u8 corr;
int c_m;
int r_t;
};
struct drm_i915_private;
struct i915_power_well;
struct i915_power_well_ops {
/*
* Synchronize the well's hw state to match the current sw state, for
* example enable/disable it based on the current refcount. Called
* during driver init and resume time, possibly after first calling
* the enable/disable handlers.
*/
void (*sync_hw)(struct drm_i915_private *dev_priv,
struct i915_power_well *power_well);
/*
* Enable the well and resources that depend on it (for example
* interrupts located on the well). Called after the 0->1 refcount
* transition.
*/
void (*enable)(struct drm_i915_private *dev_priv,
struct i915_power_well *power_well);
/*
* Disable the well and resources that depend on it. Called after
* the 1->0 refcount transition.
*/
void (*disable)(struct drm_i915_private *dev_priv,
struct i915_power_well *power_well);
/* Returns the hw enabled state. */
bool (*is_enabled)(struct drm_i915_private *dev_priv,
struct i915_power_well *power_well);
};
/* Power well structure for haswell */
struct i915_power_well {
const char *name;
bool always_on;
/* power well enable/disable usage count */
int count;
/* cached hw enabled state */
bool hw_enabled;
unsigned long domains;
unsigned long data;
const struct i915_power_well_ops *ops;
};
struct i915_power_domains {
drm/i915: use power get/put instead of set for power on after init Currently we make sure that all power domains are enabled during driver init and turn off unneded ones only after the first modeset. Similarly during suspend we enable all power domains, which will remain on through the following resume until the first modeset. This logic is supported by intel_set_power_well() in the power domain framework. It would be nice to simplify the API, so that we only have get/put functions and make it more explicit on the higher level how this "power well on during init" logic works. This will make it also easier if in the future we want to shorten the time the power wells are on. For this add a new device private flag tracking whether we have the power wells on because of init/suspend and use only intel_display_power_get()/put(). As nothing else uses intel_set_power_well() we can remove it. This also fixes commit 6efdf354ddb186c6604d1692075421e8d2c740e9 Author: Imre Deak <imre.deak@intel.com> Date: Wed Oct 16 17:25:52 2013 +0300 drm/i915: enable only the needed power domains during modeset where removing intel_set_power_well() resulted in not releasing the reference on the power well that was taken during init and thus leaving the power well on all the time. Regression reported by Paulo. v2: - move the init_power_on flag to the power_domains struct (Daniel) v3: - add note about this being a regression fix too (Paulo) Signed-off-by: Imre Deak <imre.deak@intel.com> Reviewed-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-10-25 22:36:48 +08:00
/*
* Power wells needed for initialization at driver init and suspend
* time are on. They are kept on until after the first modeset.
*/
bool init_power_on;
bool initializing;
int power_well_count;
drm/i915: use power get/put instead of set for power on after init Currently we make sure that all power domains are enabled during driver init and turn off unneded ones only after the first modeset. Similarly during suspend we enable all power domains, which will remain on through the following resume until the first modeset. This logic is supported by intel_set_power_well() in the power domain framework. It would be nice to simplify the API, so that we only have get/put functions and make it more explicit on the higher level how this "power well on during init" logic works. This will make it also easier if in the future we want to shorten the time the power wells are on. For this add a new device private flag tracking whether we have the power wells on because of init/suspend and use only intel_display_power_get()/put(). As nothing else uses intel_set_power_well() we can remove it. This also fixes commit 6efdf354ddb186c6604d1692075421e8d2c740e9 Author: Imre Deak <imre.deak@intel.com> Date: Wed Oct 16 17:25:52 2013 +0300 drm/i915: enable only the needed power domains during modeset where removing intel_set_power_well() resulted in not releasing the reference on the power well that was taken during init and thus leaving the power well on all the time. Regression reported by Paulo. v2: - move the init_power_on flag to the power_domains struct (Daniel) v3: - add note about this being a regression fix too (Paulo) Signed-off-by: Imre Deak <imre.deak@intel.com> Reviewed-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-10-25 22:36:48 +08:00
struct mutex lock;
int domain_use_count[POWER_DOMAIN_NUM];
struct i915_power_well *power_wells;
};
#define MAX_L3_SLICES 2
struct intel_l3_parity {
u32 *remap_info[MAX_L3_SLICES];
struct work_struct error_work;
int which_slice;
};
struct i915_gem_mm {
/** Memory allocator for GTT stolen memory */
struct drm_mm stolen;
/** Protects the usage of the GTT stolen memory allocator. This is
* always the inner lock when overlapping with struct_mutex. */
struct mutex stolen_lock;
/** List of all objects in gtt_space. Used to restore gtt
* mappings on resume */
struct list_head bound_list;
/**
* List of objects which are not bound to the GTT (thus
* are idle and not used by the GPU) but still have
* (presumably uncached) pages still attached.
*/
struct list_head unbound_list;
/** Usable portion of the GTT for GEM */
unsigned long stolen_base; /* limited to low memory (32-bit) */
/** PPGTT used for aliasing the PPGTT with the GTT */
struct i915_hw_ppgtt *aliasing_ppgtt;
struct notifier_block oom_notifier;
struct shrinker shrinker;
bool shrinker_no_lock_stealing;
/** LRU list of objects with fence regs on them. */
struct list_head fence_list;
/**
* We leave the user IRQ off as much as possible,
* but this means that requests will finish and never
* be retired once the system goes idle. Set a timer to
* fire periodically while the ring is running. When it
* fires, go retire requests.
*/
struct delayed_work retire_work;
drm/i915: Boost RPS frequency for CPU stalls If we encounter a situation where the CPU blocks waiting for results from the GPU, give the GPU a kick to boost its the frequency. This should work to reduce user interface stalls and to quickly promote mesa to high frequencies - but the cost is that our requested frequency stalls high (as we do not idle for long enough before rc6 to start reducing frequencies, nor are we aggressive at down clocking an underused GPU). However, this should be mitigated by rc6 itself powering off the GPU when idle, and that energy use is dependent upon the workload of the GPU in addition to its frequency (e.g. the math or sampler functions only consume power when used). Still, this is likely to adversely affect light workloads. In particular, this nearly eliminates the highly noticeable wake-up lag in animations from idle. For example, expose or workspace transitions. (However, given the situation where we fail to downclock, our requested frequency is almost always the maximum, except for Baytrail where we manually downclock upon idling. This often masks the latency of upclocking after being idle, so animations are typically smooth - at the cost of increased power consumption.) Stéphane raised the concern that this will punish good applications and reward bad applications - but due to the nature of how mesa performs its client throttling, I believe all mesa applications will be roughly equally affected. To address this concern, and to prevent applications like compositors from permanently boosting the RPS state, we ratelimit the frequency of the wait-boosts each client recieves. Unfortunately, this techinique is ineffective with Ironlake - which also has dynamic render power states and suffers just as dramatically. For Ironlake, the thermal/power headroom is shared with the CPU through Intelligent Power Sharing and the intel-ips module. This leaves us with no GPU boost frequencies available when coming out of idle, and due to hardware limitations we cannot change the arbitration between the CPU and GPU quickly enough to be effective. v2: Limit each client to receiving a single boost for each active period. Tested by QA to only marginally increase power, and to demonstrably increase throughput in games. No latency measurements yet. v3: Cater for front-buffer rendering with manual throttling. v4: Tidy up. v5: Sadly the compositor needs frequent boosts as it may never idle, but due to its picking mechanism (using ReadPixels) may require frequent waits. Those waits, along with the waits for the vrefresh swap, conspire to keep the GPU at low frequencies despite the interactive latency. To overcome this we ditch the one-boost-per-active-period and just ratelimit the number of wait-boosts each client can receive. Reported-and-tested-by: Paul Neumann <paul104x@yahoo.de> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=68716 Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Kenneth Graunke <kenneth@whitecape.org> Cc: Stéphane Marchesin <stephane.marchesin@gmail.com> Cc: Owen Taylor <otaylor@redhat.com> Cc: "Meng, Mengmeng" <mengmeng.meng@intel.com> Cc: "Zhuang, Lena" <lena.zhuang@intel.com> Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org> [danvet: No extern for function prototypes in headers.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-09-26 00:34:56 +08:00
/**
* When we detect an idle GPU, we want to turn on
* powersaving features. So once we see that there
* are no more requests outstanding and no more
* arrive within a small period of time, we fire
* off the idle_work.
*/
struct delayed_work idle_work;
/**
* Are we in a non-interruptible section of code like
* modesetting?
*/
bool interruptible;
/**
* Is the GPU currently considered idle, or busy executing userspace
* requests? Whilst idle, we attempt to power down the hardware and
* display clocks. In order to reduce the effect on performance, there
* is a slight delay before we do so.
*/
bool busy;
/* the indicator for dispatch video commands on two BSD rings */
int bsd_ring_dispatch_index;
/** Bit 6 swizzling required for X tiling */
uint32_t bit_6_swizzle_x;
/** Bit 6 swizzling required for Y tiling */
uint32_t bit_6_swizzle_y;
/* accounting, useful for userland debugging */
spinlock_t object_stat_lock;
size_t object_memory;
u32 object_count;
};
struct drm_i915_error_state_buf {
struct drm_i915_private *i915;
unsigned bytes;
unsigned size;
int err;
u8 *buf;
loff_t start;
loff_t pos;
};
struct i915_error_state_file_priv {
struct drm_device *dev;
struct drm_i915_error_state *error;
};
struct i915_gpu_error {
/* For hangcheck timer */
#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
/* Hang gpu twice in this window and your context gets banned */
#define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
struct workqueue_struct *hangcheck_wq;
struct delayed_work hangcheck_work;
/* For reset and error_state handling. */
spinlock_t lock;
/* Protected by the above dev->gpu_error.lock. */
struct drm_i915_error_state *first_error;
unsigned long missed_irq_rings;
drm/i915: clear up wedged transitions We have two important transitions of the wedged state in the current code: - 0 -> 1: This means a hang has been detected, and signals to everyone that they please get of any locks, so that the reset work item can do its job. - 1 -> 0: The reset handler has completed. Now the last transition mixes up two states: "Reset completed and successful" and "Reset failed". To distinguish these two we do some tricks with the reset completion, but I simply could not convince myself that this doesn't race under odd circumstances. Hence split this up, and add a new terminal state indicating that the hw is gone for good. Also add explicit #defines for both states, update comments. v2: Split out the reset handling bugfix for the throttle ioctl. v3: s/tmp/wedged/ sugested by Chris Wilson. Also fixup up a rebase error which prevented this patch from actually compiling. v4: To unify the wedged state with the reset counter, keep the reset-in-progress state just as a flag. The terminally-wedged state is now denoted with a big number. v5: Add a comment to the reset_counter special values explaining that WEDGED & RESET_IN_PROGRESS needs to be true for the code to be correct. v6: Fixup logic errors introduced with the wedged+reset_counter unification. Since WEDGED implies reset-in-progress (in a way we're terminally stuck in the dead-but-reset-not-completed state), we need ensure that we check for this everywhere. The specific bug was in wait_for_error, which would simply have timed out. v7: Extract an inline i915_reset_in_progress helper to make the code more readable. Also annote the reset-in-progress case with an unlikely, to help the compiler optimize the fastpath. Do the same for the terminally wedged case with i915_terminally_wedged. Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-11-16 00:17:22 +08:00
/**
* State variable controlling the reset flow and count
drm/i915: clear up wedged transitions We have two important transitions of the wedged state in the current code: - 0 -> 1: This means a hang has been detected, and signals to everyone that they please get of any locks, so that the reset work item can do its job. - 1 -> 0: The reset handler has completed. Now the last transition mixes up two states: "Reset completed and successful" and "Reset failed". To distinguish these two we do some tricks with the reset completion, but I simply could not convince myself that this doesn't race under odd circumstances. Hence split this up, and add a new terminal state indicating that the hw is gone for good. Also add explicit #defines for both states, update comments. v2: Split out the reset handling bugfix for the throttle ioctl. v3: s/tmp/wedged/ sugested by Chris Wilson. Also fixup up a rebase error which prevented this patch from actually compiling. v4: To unify the wedged state with the reset counter, keep the reset-in-progress state just as a flag. The terminally-wedged state is now denoted with a big number. v5: Add a comment to the reset_counter special values explaining that WEDGED & RESET_IN_PROGRESS needs to be true for the code to be correct. v6: Fixup logic errors introduced with the wedged+reset_counter unification. Since WEDGED implies reset-in-progress (in a way we're terminally stuck in the dead-but-reset-not-completed state), we need ensure that we check for this everywhere. The specific bug was in wait_for_error, which would simply have timed out. v7: Extract an inline i915_reset_in_progress helper to make the code more readable. Also annote the reset-in-progress case with an unlikely, to help the compiler optimize the fastpath. Do the same for the terminally wedged case with i915_terminally_wedged. Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-11-16 00:17:22 +08:00
*
* This is a counter which gets incremented when reset is triggered,
* and again when reset has been handled. So odd values (lowest bit set)
* means that reset is in progress and even values that
* (reset_counter >> 1):th reset was successfully completed.
*
* If reset is not completed succesfully, the I915_WEDGE bit is
* set meaning that hardware is terminally sour and there is no
* recovery. All waiters on the reset_queue will be woken when
* that happens.
*
* This counter is used by the wait_seqno code to notice that reset
* event happened and it needs to restart the entire ioctl (since most
* likely the seqno it waited for won't ever signal anytime soon).
drm/i915: create a race-free reset detection With the previous patch the state transition handling of the reset code itself is now (hopefully) race free and solid. But that still leaves out everyone else - with the various lock-free wait paths we have there's the possibility that the reset happens between the point where we read the seqno we should wait on and the actual wait. And if __wait_seqno then never sees the RESET_IN_PROGRESS state, we'll happily wait for a seqno which will in all likelyhood never signal. In practice this is not a big problem since the X server gets constantly interrupted, and can then submit more work (hopefully) to unblock everyone else: As soon as a new seqno write lands, all waiters will unblock. But running the i-g-t reset testcase ZZ_hangman can expose this race, especially on slower hw with fewer cpu cores. Now looking forward to ARB_robustness and friends that's not the best possible behaviour, hence this patch adds a reset_counter to be able to detect any reset, even if a given thread never observed the in-progress state. The important part is to correctly order things: - The write side needs to increment the counter after any seqno gets reset. Hence we need to do that at the end of the reset work, and again wake everyone up. We also need to place a barrier in between any possible seqno changes and the counter increment, since any unlock operations only guarantee that nothing leaks out, but not that at later load operation gets moved ahead. - On the read side we need to ensure that no reset can sneak in and invalidate the seqno. In all cases we can use the one-sided barrier that unlock operations guarantee (of the lock protecting the respective seqno/ring pair) to ensure correct ordering. Hence it is sufficient to place the atomic read before the mutex/spin_unlock and no additional barriers are required. The end-result of all this is that we need to wake up everyone twice in a reset operation: - First, before the reset starts, to get any lockholders of the locks, so that the reset can proceed. - Second, after the reset is completed, to allow waiters to properly and reliably detect the reset condition and bail out. I admit that this entire reset_counter thing smells a bit like overkill, but I think it's justified since it makes it really explicit what the bail-out condition is. And we need a reset counter anyway to implement ARB_robustness, and imo with finer-grained locking on the horizont this is the most resilient scheme I could think of. v2: Drop spurious change in the wait_for_error EXIT_COND - we only need to wait until we leave the reset-in-progress wedged state. v3: Don't play tricks with barriers in the throttle ioctl, the spin_unlock is barrier enough. I've also considered using a little helper to grab the current reset_counter, but then decided that hiding the atomic_read isn't a great idea, since having it explicitly show up in the code is a nice remainder to reviews to check the memory barriers. v4: Add a comment to explain why we need to fall through in __wait_seqno in the end variable assignments. v5: Review from Damien: - s/smb/smp/ in a comment - don't increment the reset counter after we've set it to WEDGED. Now we (again) properly wedge the gpu when the reset fails. Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-12-06 16:01:42 +08:00
*
* This is important for lock-free wait paths, where no contended lock
* naturally enforces the correct ordering between the bail-out of the
* waiter and the gpu reset work code.
drm/i915: clear up wedged transitions We have two important transitions of the wedged state in the current code: - 0 -> 1: This means a hang has been detected, and signals to everyone that they please get of any locks, so that the reset work item can do its job. - 1 -> 0: The reset handler has completed. Now the last transition mixes up two states: "Reset completed and successful" and "Reset failed". To distinguish these two we do some tricks with the reset completion, but I simply could not convince myself that this doesn't race under odd circumstances. Hence split this up, and add a new terminal state indicating that the hw is gone for good. Also add explicit #defines for both states, update comments. v2: Split out the reset handling bugfix for the throttle ioctl. v3: s/tmp/wedged/ sugested by Chris Wilson. Also fixup up a rebase error which prevented this patch from actually compiling. v4: To unify the wedged state with the reset counter, keep the reset-in-progress state just as a flag. The terminally-wedged state is now denoted with a big number. v5: Add a comment to the reset_counter special values explaining that WEDGED & RESET_IN_PROGRESS needs to be true for the code to be correct. v6: Fixup logic errors introduced with the wedged+reset_counter unification. Since WEDGED implies reset-in-progress (in a way we're terminally stuck in the dead-but-reset-not-completed state), we need ensure that we check for this everywhere. The specific bug was in wait_for_error, which would simply have timed out. v7: Extract an inline i915_reset_in_progress helper to make the code more readable. Also annote the reset-in-progress case with an unlikely, to help the compiler optimize the fastpath. Do the same for the terminally wedged case with i915_terminally_wedged. Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-11-16 00:17:22 +08:00
*/
atomic_t reset_counter;
#define I915_RESET_IN_PROGRESS_FLAG 1
#define I915_WEDGED (1 << 31)
drm/i915: clear up wedged transitions We have two important transitions of the wedged state in the current code: - 0 -> 1: This means a hang has been detected, and signals to everyone that they please get of any locks, so that the reset work item can do its job. - 1 -> 0: The reset handler has completed. Now the last transition mixes up two states: "Reset completed and successful" and "Reset failed". To distinguish these two we do some tricks with the reset completion, but I simply could not convince myself that this doesn't race under odd circumstances. Hence split this up, and add a new terminal state indicating that the hw is gone for good. Also add explicit #defines for both states, update comments. v2: Split out the reset handling bugfix for the throttle ioctl. v3: s/tmp/wedged/ sugested by Chris Wilson. Also fixup up a rebase error which prevented this patch from actually compiling. v4: To unify the wedged state with the reset counter, keep the reset-in-progress state just as a flag. The terminally-wedged state is now denoted with a big number. v5: Add a comment to the reset_counter special values explaining that WEDGED & RESET_IN_PROGRESS needs to be true for the code to be correct. v6: Fixup logic errors introduced with the wedged+reset_counter unification. Since WEDGED implies reset-in-progress (in a way we're terminally stuck in the dead-but-reset-not-completed state), we need ensure that we check for this everywhere. The specific bug was in wait_for_error, which would simply have timed out. v7: Extract an inline i915_reset_in_progress helper to make the code more readable. Also annote the reset-in-progress case with an unlikely, to help the compiler optimize the fastpath. Do the same for the terminally wedged case with i915_terminally_wedged. Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-11-16 00:17:22 +08:00
/**
* Waitqueue to signal when the reset has completed. Used by clients
* that wait for dev_priv->mm.wedged to settle.
*/
wait_queue_head_t reset_queue;
/* Userspace knobs for gpu hang simulation;
* combines both a ring mask, and extra flags
*/
u32 stop_rings;
#define I915_STOP_RING_ALLOW_BAN (1 << 31)
#define I915_STOP_RING_ALLOW_WARN (1 << 30)
/* For missed irq/seqno simulation. */
unsigned int test_irq_rings;
/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset */
bool reload_in_reset;
};
i915: ignore lid open event when resuming i915 driver needs to do modeset when 1. system resumes from sleep 2. lid is opened In PM_SUSPEND_MEM state, all the GPEs are cleared when system resumes, thus it is the i915_resume code does the modeset rather than intel_lid_notify(). But in PM_SUSPEND_FREEZE state, this will be broken because system is still responsive to the lid events. 1. When we close the lid in Freeze state, intel_lid_notify() sets modeset_on_lid. 2. When we reopen the lid, intel_lid_notify() will do a modeset, before the system is resumed. here is the error log, [92146.548074] WARNING: at drivers/gpu/drm/i915/intel_display.c:1028 intel_wait_for_pipe_off+0x184/0x190 [i915]() [92146.548076] Hardware name: VGN-Z540N [92146.548078] pipe_off wait timed out [92146.548167] Modules linked in: hid_generic usbhid hid snd_hda_codec_realtek snd_hda_intel snd_hda_codec parport_pc snd_hwdep ppdev snd_pcm_oss i915 snd_mixer_oss snd_pcm arc4 iwldvm snd_seq_dummy mac80211 snd_seq_oss snd_seq_midi fbcon tileblit font bitblit softcursor drm_kms_helper snd_rawmidi snd_seq_midi_event coretemp drm snd_seq kvm btusb bluetooth snd_timer iwlwifi pcmcia tpm_infineon i2c_algo_bit joydev snd_seq_device intel_agp cfg80211 snd intel_gtt yenta_socket pcmcia_rsrc sony_laptop agpgart microcode psmouse tpm_tis serio_raw mxm_wmi soundcore snd_page_alloc tpm acpi_cpufreq lpc_ich pcmcia_core tpm_bios mperf processor lp parport firewire_ohci firewire_core crc_itu_t sdhci_pci sdhci thermal e1000e [92146.548173] Pid: 4304, comm: kworker/0:0 Tainted: G W 3.8.0-rc3-s0i3-v3-test+ #9 [92146.548175] Call Trace: [92146.548189] [<c10378e2>] warn_slowpath_common+0x72/0xa0 [92146.548227] [<f86398b4>] ? intel_wait_for_pipe_off+0x184/0x190 [i915] [92146.548263] [<f86398b4>] ? intel_wait_for_pipe_off+0x184/0x190 [i915] [92146.548270] [<c10379b3>] warn_slowpath_fmt+0x33/0x40 [92146.548307] [<f86398b4>] intel_wait_for_pipe_off+0x184/0x190 [i915] [92146.548344] [<f86399c2>] intel_disable_pipe+0x102/0x190 [i915] [92146.548380] [<f8639ea4>] ? intel_disable_plane+0x64/0x80 [i915] [92146.548417] [<f8639f7c>] i9xx_crtc_disable+0xbc/0x150 [i915] [92146.548456] [<f863ebee>] intel_crtc_update_dpms+0x5e/0x90 [i915] [92146.548493] [<f86437cf>] intel_modeset_setup_hw_state+0x42f/0x8f0 [i915] [92146.548535] [<f8645b0b>] intel_lid_notify+0x9b/0xc0 [i915] [92146.548543] [<c15610d3>] notifier_call_chain+0x43/0x60 [92146.548550] [<c105d1e1>] __blocking_notifier_call_chain+0x41/0x80 [92146.548556] [<c105d23f>] blocking_notifier_call_chain+0x1f/0x30 [92146.548563] [<c131a684>] acpi_lid_send_state+0x78/0xa4 [92146.548569] [<c131aa9e>] acpi_button_notify+0x3b/0xf1 [92146.548577] [<c12df56a>] ? acpi_os_execute+0x17/0x19 [92146.548582] [<c12e591a>] ? acpi_ec_sync_query+0xa5/0xbc [92146.548589] [<c12e2b82>] acpi_device_notify+0x16/0x18 [92146.548595] [<c12f4904>] acpi_ev_notify_dispatch+0x38/0x4f [92146.548600] [<c12df0e8>] acpi_os_execute_deferred+0x20/0x2b [92146.548607] [<c1051208>] process_one_work+0x128/0x3f0 [92146.548613] [<c1564f73>] ? common_interrupt+0x33/0x38 [92146.548618] [<c104f8c0>] ? wake_up_worker+0x30/0x30 [92146.548624] [<c12df0c8>] ? acpi_os_wait_events_complete+0x1e/0x1e [92146.548629] [<c10524f9>] worker_thread+0x119/0x3b0 [92146.548634] [<c10523e0>] ? manage_workers+0x240/0x240 [92146.548640] [<c1056e84>] kthread+0x94/0xa0 [92146.548647] [<c1060000>] ? ftrace_raw_output_sched_stat_runtime+0x70/0xf0 [92146.548652] [<c15649b7>] ret_from_kernel_thread+0x1b/0x28 [92146.548658] [<c1056df0>] ? kthread_create_on_node+0xc0/0xc0 three different modeset flags are introduced in this patch MODESET_ON_LID_OPEN: do modeset on next lid open event MODESET_DONE: modeset already done MODESET_SUSPENDED: suspended, only do modeset when system is resumed In this way, 1. when lid is closed, MODESET_ON_LID_OPEN is set so that we'll do modeset on next lid open event. 2. when lid is opened, MODESET_DONE is set so that duplicate lid open events will be ignored. 3. when system suspends, MODESET_SUSPENDED is set. In this case, we will not do modeset on any lid events. Plus, locking mechanism is also introduced to avoid racing. Signed-off-by: Zhang Rui <rui.zhang@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-02-05 15:41:53 +08:00
enum modeset_restore {
MODESET_ON_LID_OPEN,
MODESET_DONE,
MODESET_SUSPENDED,
};
struct ddi_vbt_port_info {
/*
* This is an index in the HDMI/DVI DDI buffer translation table.
* The special value HDMI_LEVEL_SHIFT_UNKNOWN means the VBT didn't
* populate this field.
*/
#define HDMI_LEVEL_SHIFT_UNKNOWN 0xff
uint8_t hdmi_level_shift;
uint8_t supports_dvi:1;
uint8_t supports_hdmi:1;
uint8_t supports_dp:1;
};
enum psr_lines_to_wait {
PSR_0_LINES_TO_WAIT = 0,
PSR_1_LINE_TO_WAIT,
PSR_4_LINES_TO_WAIT,
PSR_8_LINES_TO_WAIT
};
struct intel_vbt_data {
struct drm_display_mode *lfp_lvds_vbt_mode; /* if any */
struct drm_display_mode *sdvo_lvds_vbt_mode; /* if any */
/* Feature bits */
unsigned int int_tv_support:1;
unsigned int lvds_dither:1;
unsigned int lvds_vbt:1;
unsigned int int_crt_support:1;
unsigned int lvds_use_ssc:1;
unsigned int display_clock_mode:1;
unsigned int fdi_rx_polarity_inverted:1;
unsigned int has_mipi:1;
int lvds_ssc_freq;
unsigned int bios_lvds_val; /* initial [PCH_]LVDS reg val in VBIOS */
enum drrs_support_type drrs_type;
/* eDP */
int edp_rate;
int edp_lanes;
int edp_preemphasis;
int edp_vswing;
bool edp_initialized;
bool edp_support;
int edp_bpp;
struct edp_power_seq edp_pps;
struct {
bool full_link;
bool require_aux_wakeup;
int idle_frames;
enum psr_lines_to_wait lines_to_wait;
int tp1_wakeup_time;
int tp2_tp3_wakeup_time;
} psr;
struct {
u16 pwm_freq_hz;
bool present;
bool active_low_pwm;
u8 min_brightness; /* min_brightness/255 of max */
} backlight;
/* MIPI DSI */
struct {
u16 port;
u16 panel_id;
struct mipi_config *config;
struct mipi_pps_data *pps;
u8 seq_version;
u32 size;
u8 *data;
u8 *sequence[MIPI_SEQ_MAX];
} dsi;
int crt_ddc_pin;
int child_dev_num;
union child_device_config *child_dev;
struct ddi_vbt_port_info ddi_port_info[I915_MAX_PORTS];
};
enum intel_ddb_partitioning {
INTEL_DDB_PART_1_2,
INTEL_DDB_PART_5_6, /* IVB+ */
};
struct intel_wm_level {
bool enable;
uint32_t pri_val;
uint32_t spr_val;
uint32_t cur_val;
uint32_t fbc_val;
};
struct ilk_wm_values {
uint32_t wm_pipe[3];
uint32_t wm_lp[3];
uint32_t wm_lp_spr[3];
uint32_t wm_linetime[3];
bool enable_fbc_wm;
enum intel_ddb_partitioning partitioning;
};
struct vlv_pipe_wm {
uint16_t primary;
uint16_t sprite[2];
uint8_t cursor;
};
drm/i915: Rewrite VLV/CHV watermark code Assuming the PND deadline mechanism works reasonably we should do memory requests as early as possible so that PND has schedule the requests more intelligently. Currently we're still calculating the watermarks as if VLV/CHV are identical to g4x, which isn't the case. The current code also seems to calculate insufficient watermarks and hence we're seeing some underruns, especially on high resolution displays. To fix it just rip out the current code and replace is with something that tries to utilize PND as efficiently as possible. We now calculate the WM watermark to trigger when the FIFO still has 256us worth of data. 256us is the maximum deadline value supoorted by PND, so issuing memory requests earlier would mean we probably couldn't utilize the full FIFO as PND would attempt to return the data at least in at least 256us. We also clamp the watermark to at least 8 cachelines as that's the magic watermark that enabling trickle feed would also impose. I'm assuming it matches some burst size. In theory we could just enable trickle feed and ignore the WM values, except trickle feed doesn't work with max fifo mode anyway, so we'd still need to calculate the SR watermarks. It seems cleaner to just disable trickle feed and calculate all watermarks the same way. Also trickle feed wouldn't account for the 256us max deadline value, thoguh that may be a moot point in non-max fifo mode sicne the FIFOs are fairly small. On VLV max fifo mode can be used with either primary or sprite planes. So the code now also checks all the planes (apart from the cursor) when calculating the SR plane watermark. We don't have to worry about the WM1 watermarks since we're using the PND deadline scheme which means the hardware ignores WM1 values. v2: Use plane->state->fb instead of plane->fb Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-03-06 03:19:49 +08:00
struct vlv_sr_wm {
uint16_t plane;
uint8_t cursor;
};
drm/i915: Rewrite VLV/CHV watermark code Assuming the PND deadline mechanism works reasonably we should do memory requests as early as possible so that PND has schedule the requests more intelligently. Currently we're still calculating the watermarks as if VLV/CHV are identical to g4x, which isn't the case. The current code also seems to calculate insufficient watermarks and hence we're seeing some underruns, especially on high resolution displays. To fix it just rip out the current code and replace is with something that tries to utilize PND as efficiently as possible. We now calculate the WM watermark to trigger when the FIFO still has 256us worth of data. 256us is the maximum deadline value supoorted by PND, so issuing memory requests earlier would mean we probably couldn't utilize the full FIFO as PND would attempt to return the data at least in at least 256us. We also clamp the watermark to at least 8 cachelines as that's the magic watermark that enabling trickle feed would also impose. I'm assuming it matches some burst size. In theory we could just enable trickle feed and ignore the WM values, except trickle feed doesn't work with max fifo mode anyway, so we'd still need to calculate the SR watermarks. It seems cleaner to just disable trickle feed and calculate all watermarks the same way. Also trickle feed wouldn't account for the 256us max deadline value, thoguh that may be a moot point in non-max fifo mode sicne the FIFOs are fairly small. On VLV max fifo mode can be used with either primary or sprite planes. So the code now also checks all the planes (apart from the cursor) when calculating the SR plane watermark. We don't have to worry about the WM1 watermarks since we're using the PND deadline scheme which means the hardware ignores WM1 values. v2: Use plane->state->fb instead of plane->fb Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-03-06 03:19:49 +08:00
struct vlv_wm_values {
struct vlv_pipe_wm pipe[3];
struct vlv_sr_wm sr;
struct {
uint8_t cursor;
uint8_t sprite[2];
uint8_t primary;
} ddl[3];
uint8_t level;
bool cxsr;
};
struct skl_ddb_entry {
uint16_t start, end; /* in number of blocks, 'end' is exclusive */
};
static inline uint16_t skl_ddb_entry_size(const struct skl_ddb_entry *entry)
{
return entry->end - entry->start;
}
static inline bool skl_ddb_entry_equal(const struct skl_ddb_entry *e1,
const struct skl_ddb_entry *e2)
{
if (e1->start == e2->start && e1->end == e2->end)
return true;
return false;
}
struct skl_ddb_allocation {
struct skl_ddb_entry pipe[I915_MAX_PIPES];
struct skl_ddb_entry plane[I915_MAX_PIPES][I915_MAX_PLANES]; /* packed/uv */
struct skl_ddb_entry y_plane[I915_MAX_PIPES][I915_MAX_PLANES]; /* y-plane */
struct skl_ddb_entry cursor[I915_MAX_PIPES];
};
struct skl_wm_values {
bool dirty[I915_MAX_PIPES];
struct skl_ddb_allocation ddb;
uint32_t wm_linetime[I915_MAX_PIPES];
uint32_t plane[I915_MAX_PIPES][I915_MAX_PLANES][8];
uint32_t cursor[I915_MAX_PIPES][8];
uint32_t plane_trans[I915_MAX_PIPES][I915_MAX_PLANES];
uint32_t cursor_trans[I915_MAX_PIPES];
};
struct skl_wm_level {
bool plane_en[I915_MAX_PLANES];
bool cursor_en;
uint16_t plane_res_b[I915_MAX_PLANES];
uint8_t plane_res_l[I915_MAX_PLANES];
uint16_t cursor_res_b;
uint8_t cursor_res_l;
};
drm/i915: allow package C8+ states on Haswell (disabled) This patch allows PC8+ states on Haswell. These states can only be reached when all the display outputs are disabled, and they allow some more power savings. The fact that the graphics device is allowing PC8+ doesn't mean that the machine will actually enter PC8+: all the other devices also need to allow PC8+. For now this option is disabled by default. You need i915.allow_pc8=1 if you want it. This patch adds a big comment inside i915_drv.h explaining how it works and how it tracks things. Read it. v2: (this is not really v2, many previous versions were already sent, but they had different names) - Use the new functions to enable/disable GTIMR and GEN6_PMIMR - Rename almost all variables and functions to names suggested by Chris - More WARNs on the IRQ handling code - Also disable PC8 when there's GPU work to do (thanks to Ben for the help on this), so apps can run caster - Enable PC8 on a delayed work function that is delayed for 5 seconds. This makes sure we only enable PC8+ if we're really idle - Make sure we're not in PC8+ when suspending v3: - WARN if IRQs are disabled on __wait_seqno - Replace some DRM_ERRORs with WARNs - Fix calls to restore GT and PM interrupts - Use intel_mark_busy instead of intel_ring_advance to disable PC8 v4: - Use the force_wake, Luke! v5: - Remove the "IIR is not zero" WARNs - Move the force_wake chunk to its own patch - Only restore what's missing from RC6, not everything Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-08-20 00:18:09 +08:00
/*
* This struct helps tracking the state needed for runtime PM, which puts the
* device in PCI D3 state. Notice that when this happens, nothing on the
* graphics device works, even register access, so we don't get interrupts nor
* anything else.
drm/i915: allow package C8+ states on Haswell (disabled) This patch allows PC8+ states on Haswell. These states can only be reached when all the display outputs are disabled, and they allow some more power savings. The fact that the graphics device is allowing PC8+ doesn't mean that the machine will actually enter PC8+: all the other devices also need to allow PC8+. For now this option is disabled by default. You need i915.allow_pc8=1 if you want it. This patch adds a big comment inside i915_drv.h explaining how it works and how it tracks things. Read it. v2: (this is not really v2, many previous versions were already sent, but they had different names) - Use the new functions to enable/disable GTIMR and GEN6_PMIMR - Rename almost all variables and functions to names suggested by Chris - More WARNs on the IRQ handling code - Also disable PC8 when there's GPU work to do (thanks to Ben for the help on this), so apps can run caster - Enable PC8 on a delayed work function that is delayed for 5 seconds. This makes sure we only enable PC8+ if we're really idle - Make sure we're not in PC8+ when suspending v3: - WARN if IRQs are disabled on __wait_seqno - Replace some DRM_ERRORs with WARNs - Fix calls to restore GT and PM interrupts - Use intel_mark_busy instead of intel_ring_advance to disable PC8 v4: - Use the force_wake, Luke! v5: - Remove the "IIR is not zero" WARNs - Move the force_wake chunk to its own patch - Only restore what's missing from RC6, not everything Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-08-20 00:18:09 +08:00
*
* Every piece of our code that needs to actually touch the hardware needs to
* either call intel_runtime_pm_get or call intel_display_power_get with the
* appropriate power domain.
drm/i915: make PC8 be part of runtime PM suspend/resume Currently, when our driver becomes idle for i915.pc8_timeout (default: 5s) we enable PC8, so we save some power, but not everything we can. Then, while PC8 is enabled, if we stay idle for more autosuspend_delay_ms (default: 10s) we'll enter runtime PM and put the graphics device in D3 state, saving even more power. The two features are separate things with increasing levels of power savings, but if we disable PC8 we'll never get into D3. While from the modularity point of view it would be nice to keep these features as separate, we have reasons to merge them: - We are not aware of anybody wanting a "PC8 without D3" environment. - If we keep both features as separate, we'll have to to test both PC8 and PC8+D3 code paths. We're already having a major pain to make QA do automated testing of just one thing, testing both paths will cost even more. - Only Haswell+ supports PC8, so if we want to add runtime PM support to, for example, IVB, we'll have to copy some code from the PC8 feature to runtime PM, so merging both features as a single thing will make it easier for enabling runtime PM on other platforms. This patch only does the very basic steps required to have PC8 and runtime PM merged on a single feature: the next patches will take care of cleaning up everything. v2: - Rebase. v3: - Rebase. - Fully remove the deprecated i915 params since Daniel doesn't consider them as part of the ABI. v4: - Rebase. - Fix typo in the commit message. v5: - Rebase, again. - Add a huge comment explaining the different forcewake usage (Chris, Daniel). - Use open-coded forcewake functions (Daniel). Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Reviewed-by: Imre Deak <imre.deak@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-03-08 07:08:05 +08:00
*
* Our driver uses the autosuspend delay feature, which means we'll only really
* suspend if we stay with zero refcount for a certain amount of time. The
* default value is currently very conservative (see intel_runtime_pm_enable), but
* it can be changed with the standard runtime PM files from sysfs.
drm/i915: allow package C8+ states on Haswell (disabled) This patch allows PC8+ states on Haswell. These states can only be reached when all the display outputs are disabled, and they allow some more power savings. The fact that the graphics device is allowing PC8+ doesn't mean that the machine will actually enter PC8+: all the other devices also need to allow PC8+. For now this option is disabled by default. You need i915.allow_pc8=1 if you want it. This patch adds a big comment inside i915_drv.h explaining how it works and how it tracks things. Read it. v2: (this is not really v2, many previous versions were already sent, but they had different names) - Use the new functions to enable/disable GTIMR and GEN6_PMIMR - Rename almost all variables and functions to names suggested by Chris - More WARNs on the IRQ handling code - Also disable PC8 when there's GPU work to do (thanks to Ben for the help on this), so apps can run caster - Enable PC8 on a delayed work function that is delayed for 5 seconds. This makes sure we only enable PC8+ if we're really idle - Make sure we're not in PC8+ when suspending v3: - WARN if IRQs are disabled on __wait_seqno - Replace some DRM_ERRORs with WARNs - Fix calls to restore GT and PM interrupts - Use intel_mark_busy instead of intel_ring_advance to disable PC8 v4: - Use the force_wake, Luke! v5: - Remove the "IIR is not zero" WARNs - Move the force_wake chunk to its own patch - Only restore what's missing from RC6, not everything Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-08-20 00:18:09 +08:00
*
* The irqs_disabled variable becomes true exactly after we disable the IRQs and
* goes back to false exactly before we reenable the IRQs. We use this variable
* to check if someone is trying to enable/disable IRQs while they're supposed
* to be disabled. This shouldn't happen and we'll print some error messages in
* case it happens.
drm/i915: allow package C8+ states on Haswell (disabled) This patch allows PC8+ states on Haswell. These states can only be reached when all the display outputs are disabled, and they allow some more power savings. The fact that the graphics device is allowing PC8+ doesn't mean that the machine will actually enter PC8+: all the other devices also need to allow PC8+. For now this option is disabled by default. You need i915.allow_pc8=1 if you want it. This patch adds a big comment inside i915_drv.h explaining how it works and how it tracks things. Read it. v2: (this is not really v2, many previous versions were already sent, but they had different names) - Use the new functions to enable/disable GTIMR and GEN6_PMIMR - Rename almost all variables and functions to names suggested by Chris - More WARNs on the IRQ handling code - Also disable PC8 when there's GPU work to do (thanks to Ben for the help on this), so apps can run caster - Enable PC8 on a delayed work function that is delayed for 5 seconds. This makes sure we only enable PC8+ if we're really idle - Make sure we're not in PC8+ when suspending v3: - WARN if IRQs are disabled on __wait_seqno - Replace some DRM_ERRORs with WARNs - Fix calls to restore GT and PM interrupts - Use intel_mark_busy instead of intel_ring_advance to disable PC8 v4: - Use the force_wake, Luke! v5: - Remove the "IIR is not zero" WARNs - Move the force_wake chunk to its own patch - Only restore what's missing from RC6, not everything Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-08-20 00:18:09 +08:00
*
* For more, read the Documentation/power/runtime_pm.txt.
drm/i915: allow package C8+ states on Haswell (disabled) This patch allows PC8+ states on Haswell. These states can only be reached when all the display outputs are disabled, and they allow some more power savings. The fact that the graphics device is allowing PC8+ doesn't mean that the machine will actually enter PC8+: all the other devices also need to allow PC8+. For now this option is disabled by default. You need i915.allow_pc8=1 if you want it. This patch adds a big comment inside i915_drv.h explaining how it works and how it tracks things. Read it. v2: (this is not really v2, many previous versions were already sent, but they had different names) - Use the new functions to enable/disable GTIMR and GEN6_PMIMR - Rename almost all variables and functions to names suggested by Chris - More WARNs on the IRQ handling code - Also disable PC8 when there's GPU work to do (thanks to Ben for the help on this), so apps can run caster - Enable PC8 on a delayed work function that is delayed for 5 seconds. This makes sure we only enable PC8+ if we're really idle - Make sure we're not in PC8+ when suspending v3: - WARN if IRQs are disabled on __wait_seqno - Replace some DRM_ERRORs with WARNs - Fix calls to restore GT and PM interrupts - Use intel_mark_busy instead of intel_ring_advance to disable PC8 v4: - Use the force_wake, Luke! v5: - Remove the "IIR is not zero" WARNs - Move the force_wake chunk to its own patch - Only restore what's missing from RC6, not everything Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-08-20 00:18:09 +08:00
*/
struct i915_runtime_pm {
bool suspended;
bool irqs_enabled;
drm/i915: allow package C8+ states on Haswell (disabled) This patch allows PC8+ states on Haswell. These states can only be reached when all the display outputs are disabled, and they allow some more power savings. The fact that the graphics device is allowing PC8+ doesn't mean that the machine will actually enter PC8+: all the other devices also need to allow PC8+. For now this option is disabled by default. You need i915.allow_pc8=1 if you want it. This patch adds a big comment inside i915_drv.h explaining how it works and how it tracks things. Read it. v2: (this is not really v2, many previous versions were already sent, but they had different names) - Use the new functions to enable/disable GTIMR and GEN6_PMIMR - Rename almost all variables and functions to names suggested by Chris - More WARNs on the IRQ handling code - Also disable PC8 when there's GPU work to do (thanks to Ben for the help on this), so apps can run caster - Enable PC8 on a delayed work function that is delayed for 5 seconds. This makes sure we only enable PC8+ if we're really idle - Make sure we're not in PC8+ when suspending v3: - WARN if IRQs are disabled on __wait_seqno - Replace some DRM_ERRORs with WARNs - Fix calls to restore GT and PM interrupts - Use intel_mark_busy instead of intel_ring_advance to disable PC8 v4: - Use the force_wake, Luke! v5: - Remove the "IIR is not zero" WARNs - Move the force_wake chunk to its own patch - Only restore what's missing from RC6, not everything Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-08-20 00:18:09 +08:00
};
enum intel_pipe_crc_source {
INTEL_PIPE_CRC_SOURCE_NONE,
INTEL_PIPE_CRC_SOURCE_PLANE1,
INTEL_PIPE_CRC_SOURCE_PLANE2,
INTEL_PIPE_CRC_SOURCE_PF,
INTEL_PIPE_CRC_SOURCE_PIPE,
/* TV/DP on pre-gen5/vlv can't use the pipe source. */
INTEL_PIPE_CRC_SOURCE_TV,
INTEL_PIPE_CRC_SOURCE_DP_B,
INTEL_PIPE_CRC_SOURCE_DP_C,
INTEL_PIPE_CRC_SOURCE_DP_D,
INTEL_PIPE_CRC_SOURCE_AUTO,
INTEL_PIPE_CRC_SOURCE_MAX,
};
struct intel_pipe_crc_entry {
uint32_t frame;
uint32_t crc[5];
};
#define INTEL_PIPE_CRC_ENTRIES_NR 128
struct intel_pipe_crc {
spinlock_t lock;
bool opened; /* exclusive access to the result file */
struct intel_pipe_crc_entry *entries;
enum intel_pipe_crc_source source;
int head, tail;
wait_queue_head_t wq;
};
drm/i915: Track frontbuffer invalidation/flushing So these are the guts of the new beast. This tracks when a frontbuffer gets invalidated (due to frontbuffer rendering) and hence should be constantly scaned out, and when it's flushed again and can be compressed/one-shot-upload. Rules for flushing are simple: The frontbuffer needs one more full upload starting from the next vblank. Which means that the flushing can _only_ be called once the frontbuffer update has been latched. But this poses a problem for pageflips: We can't just delay the flushing until the pageflip is latched, since that would pose the risk that we override frontbuffer rendering that has been scheduled in-between the pageflip ioctl and the actual latching. To handle this track asynchronous invalidations (and also pageflip) state per-ring and delay any in-between flushing until the rendering has completed. And also cancel any delayed flushing if we get a new invalidation request (whether delayed or not). Also call intel_mark_fb_busy in both cases in all cases to make sure that we keep the screen at the highest refresh rate both on flips, synchronous plane updates and for frontbuffer rendering. v2: Lots of improvements Suggestions from Chris: - Move invalidate/flush in flush_*_domain and set_to_*_domain. - Drop the flush in busy_ioctl since it's redundant. Was a leftover from an earlier concept to track flips/delayed flushes. - Don't forget about the initial modeset enable/final disable. Suggested by Chris. Track flips accurately, too. Since flips complete independently of rendering we need to track pending flips in a separate mask. Again if an invalidate happens we need to cancel the evenutal flush to avoid races. v3: Provide correct header declarations for flip functions. Currently not needed outside of intel_display.c, but part of the proper interface. v4: Add proper domain management to fbcon so that the fbcon buffer is also tracked correctly. v5: Fixup locking around the fbcon set_to_gtt_domain call. v6: More comments from Chris: - Split out fbcon changes. - Drop superflous checks for potential scanout before calling intel_fb functions - we can micro-optimize this later. - s/intel_fb_/intel_fb_obj_/ to make it clear that this deals in gem object. We already have precedence for fb_obj in the pin_and_fence functions. v7: Clarify the semantics of the flip flush handling by renaming things a bit: - Don't go through a gem object but take the relevant frontbuffer bits directly. These functions center on the plane, the actual object is irrelevant - even a flip to the same object as already active should cause a flush. - Add a new intel_frontbuffer_flip for synchronous plane updates. It currently just calls intel_frontbuffer_flush since the implemenation differs. This way we achieve a clear split between one-shot update events on one side and frontbuffer rendering with potentially a very long delay between the invalidate and flush. Chris and I also had some discussions about mark_busy and whether it is appropriate to call from flush. But mark busy is a state which should be derived from the 3 events (invalidate, flush, flip) we now have by the users, like psr does by tracking relevant information in psr.busy_frontbuffer_bits. DRRS (the only real use of mark_busy for frontbuffer) needs to have similar logic. With that the overall mark_busy in the core could be removed. v8: Only when retiring gpu buffers only flush frontbuffer bits we actually invalidated in a batch. Just for safety since before any additional usage/invalidate we should always retire current rendering. Suggested by Chris Wilson. v9: Actually use intel_frontbuffer_flip in all appropriate places. Spotted by Chris. v10: Address more comments from Chris: - Don't call _flip in set_base when the crtc is inactive, avoids redunancy in the modeset case with the initial enabling of all planes. - Add comments explaining that the initial/final plane enable/disable still has work left to do before it's fully generic. v11: Only invalidate for gtt/cpu access when writing. Spotted by Chris. v12: s/_flush/_flip/ in intel_overlay.c per Chris' comment. Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-06-19 22:01:59 +08:00
struct i915_frontbuffer_tracking {
struct mutex lock;
/*
* Tracking bits for delayed frontbuffer flushing du to gpu activity or
* scheduled flips.
*/
unsigned busy_bits;
unsigned flip_bits;
};
struct i915_wa_reg {
u32 addr;
u32 value;
/* bitmask representing WA bits */
u32 mask;
};
#define I915_MAX_WA_REGS 16
struct i915_workarounds {
struct i915_wa_reg reg[I915_MAX_WA_REGS];
u32 count;
};
drm/i915: Introduce a PV INFO page structure for Intel GVT-g. Introduce a PV INFO structure, to facilitate the Intel GVT-g technology, which is a GPU virtualization solution with mediated pass-through. This page contains the shared information between i915 driver and the host emulator. For now, this structure utilizes an area of 4K bytes on HSW GPU's unused MMIO space. Future hardware will have the reserved window architecturally defined, and layout of the page will be added in future BSpec. The i915 driver load routine detects if it is running in a VM by reading the contents of this PV INFO page. Thereafter a flag, vgpu.active is set, and intel_vgpu_active() is used by checking this flag to conclude if GPU is virtualized with Intel GVT-g. By now, intel_vgpu_active() will return true, only when the driver is running as a guest in the Intel GVT-g enhanced environment on HSW platform. v2: take Chris' comments: - call the i915_check_vgpu() in intel_uncore_init() - sanitize i915_check_vgpu() by adding BUILD_BUG_ON() and debug info take Daniel's comments: - put the definition of PV INFO into a new header - i915_vgt_if.h other changes: - access mmio regs by readq/readw in i915_check_vgpu() v3: take Daniel's comments: - move the i915/vgt interfaces into a new i915_vgpu.c - update makefile - add kerneldoc to functions which are non-static - add a DOC: section describing some of the high-level design - update drm docbook other changes: - rename i915_vgt_if.h to i915_vgpu.h v4: take Tvrtko's comments: - fix a typo in commit message - add debug message when vgt version mismatches - rename low_gmadr/high_gmadr to mappable/non-mappable in PV INFO structure Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> Signed-off-by: Jike Song <jike.song@intel.com> Signed-off-by: Eddie Dong <eddie.dong@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-02-10 19:05:47 +08:00
struct i915_virtual_gpu {
bool active;
};
drm/i915: Merged the many do_execbuf() parameters into a structure The do_execbuf() function takes quite a few parameters. The actual set of parameters is going to change with the conversion to passing requests around. Further, it is due to grow massively with the arrival of the GPU scheduler. This patch simplifies the prototype by passing a parameter structure instead. Changing the parameter set in the future is then simply a matter of adding/removing items to the structure. Note that the structure does not contain absolutely everything that is passed in. This is because the intention is to use this structure more extensively later in this patch series and more especially in the GPU scheduler that is coming soon. The latter requires hanging on to the structure as the final hardware submission can be delayed until long after the execbuf IOCTL has returned to user land. Thus it is unsafe to put anything in the structure that is local to the IOCTL call itself - such as the 'args' parameter. All entries must be copies of data or pointers to structures that are reference counted in some way and guaranteed to exist for the duration of the batch buffer's life. v2: Rebased to newer tree and updated for changes to the command parser. Specifically, a code shuffle has required saving the batch start address in the params structure. For: VIZ-5115 Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Tomas Elf <tomas.elf@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-30 00:43:27 +08:00
struct i915_execbuffer_params {
struct drm_device *dev;
struct drm_file *file;
uint32_t dispatch_flags;
uint32_t args_batch_start_offset;
uint32_t batch_obj_vm_offset;
struct intel_engine_cs *ring;
struct drm_i915_gem_object *batch_obj;
struct intel_context *ctx;
struct drm_i915_gem_request *request;
drm/i915: Merged the many do_execbuf() parameters into a structure The do_execbuf() function takes quite a few parameters. The actual set of parameters is going to change with the conversion to passing requests around. Further, it is due to grow massively with the arrival of the GPU scheduler. This patch simplifies the prototype by passing a parameter structure instead. Changing the parameter set in the future is then simply a matter of adding/removing items to the structure. Note that the structure does not contain absolutely everything that is passed in. This is because the intention is to use this structure more extensively later in this patch series and more especially in the GPU scheduler that is coming soon. The latter requires hanging on to the structure as the final hardware submission can be delayed until long after the execbuf IOCTL has returned to user land. Thus it is unsafe to put anything in the structure that is local to the IOCTL call itself - such as the 'args' parameter. All entries must be copies of data or pointers to structures that are reference counted in some way and guaranteed to exist for the duration of the batch buffer's life. v2: Rebased to newer tree and updated for changes to the command parser. Specifically, a code shuffle has required saving the batch start address in the params structure. For: VIZ-5115 Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Tomas Elf <tomas.elf@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-30 00:43:27 +08:00
};
struct drm_i915_private {
struct drm_device *dev;
struct kmem_cache *objects;
struct kmem_cache *vmas;
struct kmem_cache *requests;
const struct intel_device_info info;
int relative_constants_mode;
void __iomem *regs;
struct intel_uncore uncore;
drm/i915: Introduce a PV INFO page structure for Intel GVT-g. Introduce a PV INFO structure, to facilitate the Intel GVT-g technology, which is a GPU virtualization solution with mediated pass-through. This page contains the shared information between i915 driver and the host emulator. For now, this structure utilizes an area of 4K bytes on HSW GPU's unused MMIO space. Future hardware will have the reserved window architecturally defined, and layout of the page will be added in future BSpec. The i915 driver load routine detects if it is running in a VM by reading the contents of this PV INFO page. Thereafter a flag, vgpu.active is set, and intel_vgpu_active() is used by checking this flag to conclude if GPU is virtualized with Intel GVT-g. By now, intel_vgpu_active() will return true, only when the driver is running as a guest in the Intel GVT-g enhanced environment on HSW platform. v2: take Chris' comments: - call the i915_check_vgpu() in intel_uncore_init() - sanitize i915_check_vgpu() by adding BUILD_BUG_ON() and debug info take Daniel's comments: - put the definition of PV INFO into a new header - i915_vgt_if.h other changes: - access mmio regs by readq/readw in i915_check_vgpu() v3: take Daniel's comments: - move the i915/vgt interfaces into a new i915_vgpu.c - update makefile - add kerneldoc to functions which are non-static - add a DOC: section describing some of the high-level design - update drm docbook other changes: - rename i915_vgt_if.h to i915_vgpu.h v4: take Tvrtko's comments: - fix a typo in commit message - add debug message when vgt version mismatches - rename low_gmadr/high_gmadr to mappable/non-mappable in PV INFO structure Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> Signed-off-by: Jike Song <jike.song@intel.com> Signed-off-by: Eddie Dong <eddie.dong@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-02-10 19:05:47 +08:00
struct i915_virtual_gpu vgpu;
drm/i915/skl: Add support to load SKL CSR firmware. Display Context Save and Restore support is needed for various SKL Display C states like DC5, DC6. This implementation is added based on first version of DMC CSR program that we received from h/w team. Here we are using request_firmware based design. Finally this firmware should end up in linux-firmware tree. For SKL platform its mandatory to ensure that we load this csr program before enabling DC states like DC5/DC6. As CSR program gets reset on various conditions, we should ensure to load it during boot and in future change to be added to load this system resume sequence too. v1: Initial relese as RFC patch v2: Design change as per Daniel, Damien and Shobit's review comments request firmware method followed. v3: Some optimization and functional changes. Pulled register defines into drivers/gpu/drm/i915/i915_reg.h Used kmemdup to allocate and duplicate firmware content. Ensured to free allocated buffer. v4: Modified as per review comments from Satheesh and Daniel Removed temporary buffer. Optimized number of writes by replacing I915_WRITE with I915_WRITE64. v5: Modified as per review comemnts from Damien. - Changed name for functions and firmware. - Introduced HAS_CSR. - Reverted back previous change and used csr_buf with u8 size. - Using cpu_to_be64 for endianness change. Modified as per review comments from Imre. - Modified registers and macro names to be a bit closer to bspec terminology and the existing register naming in the driver. - Early return for non SKL platforms in intel_load_csr_program function. - Added locking around CSR program load function as it may be called concurrently during system/runtime resume. - Releasing the fw before loading the program for consistency - Handled error path during f/w load. v6: Modified as per review comments from Imre. - Corrected out_freecsr sequence. v7: Modified as per review comments from Imre. Fail loading fw if fw->size%8!=0. v8: Rebase to latest. v9: Rebase on top of -nightly (Damien) v10: Enabled support for dmc firmware ver 1.0. According to ver 1.0 in a single binary package all the firmware's that are required for different stepping's of the product will be stored. The package contains the css header, followed by the package header and the actual dmc firmwares. Package header contains the firmware/stepping mapping table and the corresponding firmware offsets to the individual binaries, within the package. Each individual program binary contains the header and the payload sections whose size is specified in the header section. This changes are done to extract the specific firmaware from the package. (Animesh) v11: Modified as per review comemnts from Imre. - Added code comment from bpec for header structure elements. - Added __packed to avoid structure padding. - Added helper functions for stepping and substepping info. - Added code comment for CSR_MAX_FW_SIZE. - Disabled BXT firmware loading, will be enabled with dmc 1.0 support. - Changed skl_stepping_info based on bspec, earlier used from config DB. - Removed duplicate call of cpu_to_be* from intel_csr_load_program function. - Used cpu_to_be32 instead of cpu_to_be64 as firmware binary in dword aligned. - Added sanity check for header length. - Added sanity check for mmio address got from firmware binary. - kmalloc done separately for dmc header and dmc firmware. (Animesh) v12: Modified as per review comemnts from Imre. - Corrected the typo error in skl stepping info structure. - Added out-of-bound access for skl_stepping_info. - Sanity check for mmio address modified. - Sanity check added for stepping and substeppig. - Modified the intel_dmc_info structure, cache only the required header info. (Animesh) v13: clarify firmware load error message. The reason for a firmware loading failure can be obscure if the driver is built-in. Provide an explanation to the user about the likely reason for the failure and how to resolve it. (Imre) v14: Suggested by Jani. - fix s/I915/CONFIG_DRM_I915/ typo - add fw_path to the firmware object instead of using a static ptr (Jani) v15: 1) Changed the firmware name as dmc_gen9.bin, everytime for a new firmware version a symbolic link with same name will help not to build kernel again. 2) Changes done as per review comments from Imre. - Error check removed for intel_csr_ucode_init. - Moved csr-specific data structure to intel_csr.h and optimization done on structure definition. - fw->data used directly for parsing the header info & memory allocation only done separately for payload. (Animesh) v16: - No need for out_regs label in i915_driver_load(), so removed it. - Changed the firmware name as skl_dmc_ver1.bin, followed naming convention <platform>_dmc_<api-version>.bin (Animesh) Issue: VIZ-2569 Signed-off-by: A.Sunil Kamath <sunil.kamath@intel.com> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Animesh Manna <animesh.manna@intel.com> Signed-off-by: Imre Deak <imre.deak@intel.com> Reviewed-by: Imre Deak <imre.deak@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-04 20:58:44 +08:00
struct intel_csr csr;
/* Display CSR-related protection */
struct mutex csr_lock;
struct intel_gmbus gmbus[GMBUS_NUM_PINS];
drm/i915: use the gmbus irq for waits We need two special things to properly wire this up: - Add another argument to gmbus_wait_hw_status to pass in the correct interrupt bit in gmbus4. - Since we can only get an irq for one of the two events we want, hand-roll the wait_event_timeout code so that we wake up every jiffie and can check for NAKs. This way we also subsume gmbus support for platforms without interrupts (or where those are not yet enabled). The important bit really is to only enable one gmbus interrupt source at the same time - with that piece of lore figured out, this seems to work flawlessly. Ben Widawsky rightfully complained the lack of measurements for the claimed benefits (especially since the first version was actually broken and fell back to bit-banging). Previously reading the 256 byte hdmi EDID takes about 72 ms here. With this patch it's down to 33 ms. Given that transfering the 256 bytes over i2c at wire speed takes 20.5ms alone, the reduction in additional overhead is rather nice. v2: Chris Wilson wondered whether GMBUS4 might contain some set bits when booting up an hence result in some spurious interrupts. Since we clear GMBUS4 after every wait and we do gmbus transfer really early in the setup sequence to detect displays the window is small, but still be paranoid and clear it properly. v3: Clarify the comment that gmbus irq generation can only support one kind of event, why it bothers us and how we work around that limit. Cc: Daniel Kurtz <djkurtz@chromium.org> Reviewed-by: Imre Deak <imre.deak@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-12-01 20:53:45 +08:00
/** gmbus_mutex protects against concurrent usage of the single hw gmbus
* controller on different i2c buses. */
struct mutex gmbus_mutex;
/**
* Base address of the gmbus and gpio block.
*/
uint32_t gpio_mmio_base;
/* MMIO base address for MIPI regs */
uint32_t mipi_mmio_base;
drm/i915: use the gmbus irq for waits We need two special things to properly wire this up: - Add another argument to gmbus_wait_hw_status to pass in the correct interrupt bit in gmbus4. - Since we can only get an irq for one of the two events we want, hand-roll the wait_event_timeout code so that we wake up every jiffie and can check for NAKs. This way we also subsume gmbus support for platforms without interrupts (or where those are not yet enabled). The important bit really is to only enable one gmbus interrupt source at the same time - with that piece of lore figured out, this seems to work flawlessly. Ben Widawsky rightfully complained the lack of measurements for the claimed benefits (especially since the first version was actually broken and fell back to bit-banging). Previously reading the 256 byte hdmi EDID takes about 72 ms here. With this patch it's down to 33 ms. Given that transfering the 256 bytes over i2c at wire speed takes 20.5ms alone, the reduction in additional overhead is rather nice. v2: Chris Wilson wondered whether GMBUS4 might contain some set bits when booting up an hence result in some spurious interrupts. Since we clear GMBUS4 after every wait and we do gmbus transfer really early in the setup sequence to detect displays the window is small, but still be paranoid and clear it properly. v3: Clarify the comment that gmbus irq generation can only support one kind of event, why it bothers us and how we work around that limit. Cc: Daniel Kurtz <djkurtz@chromium.org> Reviewed-by: Imre Deak <imre.deak@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-12-01 20:53:45 +08:00
wait_queue_head_t gmbus_wait_queue;
struct pci_dev *bridge_dev;
struct intel_engine_cs ring[I915_NUM_RINGS];
struct drm_i915_gem_object *semaphore_obj;
uint32_t last_seqno, next_seqno;
struct drm_dma_handle *status_page_dmah;
struct resource mch_res;
/* protects the irq masks */
spinlock_t irq_lock;
drm/i915: Replaced Blitter ring based flips with MMIO flips This patch enables the framework for using MMIO based flip calls, in contrast with the CS based flip calls which are being used currently. MMIO based flip calls can be enabled on architectures where Render and Blitter engines reside in different power wells. The decision to use MMIO flips can be made based on workloads to give 100% residency for Media power well. v2: The MMIO flips now use the interrupt driven mechanism for issuing the flips when target seqno is reached. (Incorporating Ville's idea) v3: Rebasing on latest code. Code restructuring after incorporating Damien's comments v4: Addressing Ville's review comments -general cleanup -updating only base addr instead of calling update_primary_plane -extending patch for gen5+ platforms v5: Addressed Ville's review comments -Making mmio flip vs cs flip selection based on module parameter -Adding check for DRIVER_MODESET feature in notify_ring before calling notify mmio flip. -Other changes mostly in function arguments v6: -Having a seperate function to check condition for using mmio flips (Ville) -propogating error code from i915_gem_check_olr (Ville) v7: -Adding __must_check with i915_gem_check_olr (Chris) -Renaming mmio_flip_data to mmio_flip (Chris) -Rebasing on latest nightly v8: -Rebasing on latest code -squash 3rd patch in series(mmio setbase vs page flip race) with this patch -Added new tiling mode update in intel_do_mmio_flip (Chris) v9: -check for obj->last_write_seqno being 0 instead of obj->ring being NULL in intel_postpone_flip, as this is a more restrictive condition (Chris) v10: -Applied Chris's suggestions for squashing patches 2,3 into this patch. These patches make the selection of CS vs MMIO flip at the page flip time, and make the module parameter for using mmio flips as tristate, the states being 'force CS flips', 'force mmio flips', 'driver discretion'. Changed the logic for driver discretion (Chris) v11: Minor code cleanup(better readability, fixing whitespace errors, using lockdep to check mutex locked status in postpone_flip, removal of __must_check in function definition) (Chris) Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Sourab Gupta <sourab.gupta@intel.com> Signed-off-by: Akash Goel <akash.goel@intel.com> Tested-by: Chris Wilson <chris@chris-wilson.co.uk> # snb, ivb [danvet: Fix up parameter alignement checkpatch spotted.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-06-02 19:17:17 +08:00
/* protects the mmio flip data */
spinlock_t mmio_flip_lock;
bool display_irqs_enabled;
drm/i915: irq-drive the dp aux communication At least on the platforms that have a dp aux irq and also have it enabled - vlvhsw should have one, too. But I don't have a machine to test this on. Judging from docs there's no dp aux interrupt for gm45. Also, I only have an ivb cpu edp machine, so the dp aux A code for snb/ilk is untested. For dpcd probing when nothing is connected it slashes about 5ms of cpu time (cpu time is now negligible), which agrees with 3 * 5 400 usec timeouts. A previous version of this patch increases the time required to go through the dp_detect cycle (which includes reading the edid) from around 33 ms to around 40 ms. Experiments indicated that this is purely due to the irq latency - the hw doesn't allow us to queue up dp aux transactions and hence irq latency directly affects throughput. gmbus is much better, there we have a 8 byte buffer, and we get the irq once another 4 bytes can be queued up. But by using the pm_qos interface to request the lowest possible cpu wake-up latency this slowdown completely disappeared. Since all our output detection logic is single-threaded with the mode_config mutex right now anyway, I've decide not ot play fancy and to just reuse the gmbus wait queue. But this would definitely prep the way to run dp detection on different ports in parallel v2: Add a timeout for dp aux transfers when using interrupts - the hw _does_ prevent this with the hw-based 400 usec timeout, but if the irq somehow doesn't arrive we're screwed. Lesson learned while developing this ;-) v3: While at it also convert the busy-loop to wait_for_atomic, so that we don't run the risk of an infinite loop any more. v4: Ensure we have the smallest possible irq latency by using the pm_qos interface. v5: Add a comment to the code to explain why we frob pm_qos. Suggested by Chris Wilson. v6: Disable dp irq for vlv, that's easier than trying to get at docs and hw. v7: Squash in a fix for Haswell that Paulo Zanoni tracked down - the dp aux registers aren't at a fixed offset any more, but can be on the PCH while the DP port is on the cpu die. Reviewed-by: Imre Deak <imre.deak@intel.com> (v6) Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-12-01 20:53:48 +08:00
/* To control wakeup latency, e.g. for irq-driven dp aux transfers. */
struct pm_qos_request pm_qos;
/* Sideband mailbox protection */
struct mutex sb_lock;
/** Cached value of IMR to avoid reads in updating the bitfield */
drm/i915/bdw: Implement interrupt changes The interrupt handling implementation remains the same as previous generations with the 4 types of registers, status, identity, mask, and enable. However the layout of where the bits go have changed entirely. To address these changes, all of the interrupt vfuncs needed special gen8 code. The way it works is there is a top level status register now which informs the interrupt service routine which unit caused the interrupt, and therefore which interrupt registers to read to process the interrupt. For display the division is quite logical, a set of interrupt registers for each pipe, and in addition to those, a set each for "misc" and port. For GT the things get a bit hairy, as seen by the code. Each of the GT units has it's own bits defined. They all look *very similar* and resides in 16 bits of a GT register. As an example, RCS and BCS share register 0. To compact the code a bit, at a slight expense to complexity, this is exactly how the code works as well. 2 structures are added to the ring buffer so that our ring buffer interrupt handling code knows which ring shares the interrupt registers, and a shift value (ie. the top or bottom 16 bits of the register). The above allows us to kept the interrupt register caching scheme, the per interrupt enables, and the code to mask and unmask interrupts relatively clean (again at the cost of some more complexity). Most of the GT units mentioned above are command streamers, and so the symmetry should work quite well for even the yet to be implemented rings which Broadwell adds. v2: Fixes up a couple of bugs, and is more verbose about errors in the Broadwell interrupt handler. v3: fix DE_MISC IER offset v4: Simplify interrupts: I totally misread the docs the first time I implemented interrupts, and so this should greatly simplify the mess. Unlike GEN6, we never touch the regular mask registers in irq_get/put. v5: Rebased on to of recent pch hotplug setup changes. v6: Fixup on top of moving num_pipes to intel_info. v7: Rebased on top of Egbert Eich's hpd irq handling rework. Also wired up ibx_hpd_irq_setup for gen8. v8: Rebase on top of Jani's asle handling rework. v9: Rebase on top of Ben's VECS enabling for Haswell, where he unfortunately went OCD on the gt irq #defines. Not that they're still not yet fully consistent: - Used the GT_RENDER_ #defines + bdw shifts. - Dropped the shift from the L3_PARITY stuff, seemed clearer. - s/irq_refcount/irq_refcount.gt/ v10: Squash in VECS enabling patches and the gen8_gt_irq_handler refactoring from Zhao Yakui <yakui.zhao@intel.com> v11: Rebase on top of the interrupt cleanups in upstream. v12: Rebase on top of Ben's DPF changes in upstream. v13: Drop bdw from the HAS_L3_DPF feature flag for now, it's unclear what exactly needs to be done. Requested by Ben. v14: Fix the patch. - Drop the mask of reserved bits and assorted logic, it doesn't match the spec. - Do the posting read inconditionally instead of commenting it out. - Add a GEN8_MASTER_IRQ_CONTROL definition and use it. - Fix up the GEN8_PIPE interrupt defines and give the GEN8_ prefixes - we actually will need to use them. - Enclose macros in do {} while (0) (checkpatch). - Clear DE_MISC interrupt bits only after having processed them. - Fix whitespace fail (checkpatch). - Fix overtly long lines where appropriate (checkpatch). - Don't use typedef'ed private_t (maintainer-scripts). - Align the function parameter list correctly. Signed-off-by: Ben Widawsky <ben@bwidawsk.net> (v4) Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch> bikeshed
2013-11-03 12:07:09 +08:00
union {
u32 irq_mask;
u32 de_irq_mask[I915_MAX_PIPES];
};
u32 gt_irq_mask;
u32 pm_irq_mask;
u32 pm_rps_events;
u32 pipestat_irq_mask[I915_MAX_PIPES];
struct i915_hotplug hotplug;
struct i915_fbc fbc;
drm/i915: Add support for DRRS to switch RR This patch computes and stored 2nd M/N/TU for switching to different refresh rate dynamically. PIPECONF_EDP_RR_MODE_SWITCH bit helps toggle between alternate refresh rates programmed in 2nd M/N/TU registers. v2: Daniel's review comments Computing M2/N2 in compute_config and storing it in crtc_config v3: Modified reference to edp_downclock and edp_downclock_avail based on the changes made to move them from dev_private to intel_panel. v4: Modified references to is_drrs_supported based on the changes made to rename it to drrs_support. v5: Jani's review comments Removed superfluous return statements. Changed support for Gen 7 and above. Corrected indentation. Re-structured the code which finds crtc and connector from encoder. Changed some logs to be less verbose. v6: Modifying i915_drrs to include only intel connector as intel_dp can be derived from intel connector when required. v7: As per internal review comments, acquiring mutex just before accessing drrs RR. As per Chris's review comments, added documentation about the use of locking in the function. v8: Incorporated Jani's review comments. Removed reference to edp_downclock. v9: Jani's review comments. Modified comment in set_drrs. Changed index to type edp_drrs_refresh_rate_type. Check if PSR is enabled before setting registers fo DRRS. Signed-off-by: Pradeep Bhat <pradeep.bhat@intel.com> Signed-off-by: Vandana Kannan <vandana.kannan@intel.com> Cc: Jani Nikula <jani.nikula@linux.intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-04-05 14:43:28 +08:00
struct i915_drrs drrs;
struct intel_opregion opregion;
struct intel_vbt_data vbt;
bool preserve_bios_swizzle;
/* overlay */
struct intel_overlay *overlay;
/* backlight registers and fields in struct intel_panel */
struct mutex backlight_lock;
/* LVDS info */
bool no_aux_handshake;
/* protects panel power sequencer state */
struct mutex pps_mutex;
struct drm_i915_fence_reg fence_regs[I915_MAX_NUM_FENCES]; /* assume 965 */
int fence_reg_start; /* 4 if userland hasn't ioctl'd us yet */
int num_fence_regs; /* 8 on pre-965, 16 otherwise */
unsigned int fsb_freq, mem_freq, is_ddr3;
drm/i915/skl: Deinit/init the display at suspend/resume We need to re-init the display hardware when going out of suspend. This includes: - Hooking the PCH to the reset logic - Restoring CDCDLK - Enabling the DDB power Among those, only the CDCDLK one is a bit tricky. There's some complexity in that: - DPLL0 (which is the source for CDCLK) has two VCOs, each with a set of supported frequencies. As eDP also uses DPLL0 for its link rate, once DPLL0 is on, we restrict the possible eDP link rates the chosen VCO. - CDCLK also limits the bandwidth available to push pixels. So, as a first step, this commit restore what the BIOS set, until I can do more testing. In case that's of interest for the reviewer, I've unit tested the function that derives the decimal frequency field: #include <stdio.h> #include <stdint.h> #include <assert.h> #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) static const struct dpll_freq { unsigned int freq; unsigned int decimal; } freqs[] = { { .freq = 308570, .decimal = 0b01001100111}, { .freq = 337500, .decimal = 0b01010100001}, { .freq = 432000, .decimal = 0b01101011110}, { .freq = 450000, .decimal = 0b01110000010}, { .freq = 540000, .decimal = 0b10000110110}, { .freq = 617140, .decimal = 0b10011010000}, { .freq = 675000, .decimal = 0b10101000100}, }; static void intbits(unsigned int v) { int i; for(i = 10; i >= 0; i--) putchar('0' + ((v >> i) & 1)); } static unsigned int freq_decimal(unsigned int freq /* in kHz */) { return (freq - 1000) / 500; } static void test_freq(const struct dpll_freq *entry) { unsigned int decimal = freq_decimal(entry->freq); printf("freq: %d, expected: ", entry->freq); intbits(entry->decimal); printf(", got: "); intbits(decimal); putchar('\n'); assert(decimal == entry->decimal); } int main(int argc, char **argv) { int i; for (i = 0; i < ARRAY_SIZE(freqs); i++) test_freq(&freqs[i]); return 0; } v2: - Rebase on top of -nightly - Use (freq - 1000) / 500 for the decimal frequency (Ville) - Fix setting the enable bit of HSW_NDE_RSTWRN_OPT (Ville) - Rename skl_display_{resume,suspend} to skl_{init,uninit}_cdclk to be consistent with the BXT code (Ville) - Store boot CDCLK in ddi_pll_init (Ville) - Merge dev_priv's skl_boot_cdclk into cdclk_freq - Use LCPLL_PLL_LOCK instead of (1 << 30) (Ville) - Replace various '0' by SKL_DPLL0 to be a bit more explicit that we're programming DPLL0 - Busy poll the PCU before doing the frequency change. It takes about 3/4 cycles, each separated by 10us, to get the ACK from the CPU (Ville) v3: - Restore dev_priv->skl_boot_cdclk, leaving unification with dev_priv->cdclk_freq for a later patch (Daniel, Ville) Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-21 23:37:48 +08:00
unsigned int skl_boot_cdclk;
unsigned int cdclk_freq, max_cdclk_freq;
unsigned int hpll_freq;
drm/i915: fix hpd work vs. flush_work in the pageflip code deadlock Historically we've run our own driver hotplug handling in our own work-queue, which then launched the drm core hotplug handling in the system workqueue. This is important since we flush our own driver workqueue in the pageflip code while hodling modeset locks, and only the drm hotplug code grabbed these locks. But with commit 69787f7da6b2adc4054357a661aaa1701a9ca76f Author: Daniel Vetter <daniel.vetter@ffwll.ch> Date: Tue Oct 23 18:23:34 2012 +0000 drm: run the hpd irq event code directly this was changed and now we could deadlock in our flip handler if there's a hotplug work blocking the progress of the crucial unpin works. So this broke the careful deadlock avoidance implemented in commit b4a98e57fc27854b5938fc8b08b68e5e68b91e1f Author: Chris Wilson <chris@chris-wilson.co.uk> Date: Thu Nov 1 09:26:26 2012 +0000 drm/i915: Flush outstanding unpin tasks before pageflipping Since the rule thus far has been that work items on our own workqueue may never grab modeset locks simply restore that rule again. v2: Add a comment to the declaration of dev_priv->wq to warn readers about the tricky implications of using it. Suggested by Chris Wilson. Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Stuart Abercrombie <sabercrombie@chromium.org> Reported-by: Stuart Abercrombie <sabercrombie@chromium.org> References: http://permalink.gmane.org/gmane.comp.freedesktop.xorg.drivers.intel/26239 Cc: stable@vger.kernel.org Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> [danvet: Squash in a comment at the place where we schedule the work. Requested after-the-fact by Chris on irc since the hpd work isn't the only place we botch this.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-09-02 22:22:25 +08:00
/**
* wq - Driver workqueue for GEM.
*
* NOTE: Work items scheduled here are not allowed to grab any modeset
* locks, for otherwise the flushing done in the pageflip code will
* result in deadlocks.
*/
struct workqueue_struct *wq;
/* Display functions */
struct drm_i915_display_funcs display;
/* PCH chipset type */
enum intel_pch pch_type;
unsigned short pch_id;
unsigned long quirks;
i915: ignore lid open event when resuming i915 driver needs to do modeset when 1. system resumes from sleep 2. lid is opened In PM_SUSPEND_MEM state, all the GPEs are cleared when system resumes, thus it is the i915_resume code does the modeset rather than intel_lid_notify(). But in PM_SUSPEND_FREEZE state, this will be broken because system is still responsive to the lid events. 1. When we close the lid in Freeze state, intel_lid_notify() sets modeset_on_lid. 2. When we reopen the lid, intel_lid_notify() will do a modeset, before the system is resumed. here is the error log, [92146.548074] WARNING: at drivers/gpu/drm/i915/intel_display.c:1028 intel_wait_for_pipe_off+0x184/0x190 [i915]() [92146.548076] Hardware name: VGN-Z540N [92146.548078] pipe_off wait timed out [92146.548167] Modules linked in: hid_generic usbhid hid snd_hda_codec_realtek snd_hda_intel snd_hda_codec parport_pc snd_hwdep ppdev snd_pcm_oss i915 snd_mixer_oss snd_pcm arc4 iwldvm snd_seq_dummy mac80211 snd_seq_oss snd_seq_midi fbcon tileblit font bitblit softcursor drm_kms_helper snd_rawmidi snd_seq_midi_event coretemp drm snd_seq kvm btusb bluetooth snd_timer iwlwifi pcmcia tpm_infineon i2c_algo_bit joydev snd_seq_device intel_agp cfg80211 snd intel_gtt yenta_socket pcmcia_rsrc sony_laptop agpgart microcode psmouse tpm_tis serio_raw mxm_wmi soundcore snd_page_alloc tpm acpi_cpufreq lpc_ich pcmcia_core tpm_bios mperf processor lp parport firewire_ohci firewire_core crc_itu_t sdhci_pci sdhci thermal e1000e [92146.548173] Pid: 4304, comm: kworker/0:0 Tainted: G W 3.8.0-rc3-s0i3-v3-test+ #9 [92146.548175] Call Trace: [92146.548189] [<c10378e2>] warn_slowpath_common+0x72/0xa0 [92146.548227] [<f86398b4>] ? intel_wait_for_pipe_off+0x184/0x190 [i915] [92146.548263] [<f86398b4>] ? intel_wait_for_pipe_off+0x184/0x190 [i915] [92146.548270] [<c10379b3>] warn_slowpath_fmt+0x33/0x40 [92146.548307] [<f86398b4>] intel_wait_for_pipe_off+0x184/0x190 [i915] [92146.548344] [<f86399c2>] intel_disable_pipe+0x102/0x190 [i915] [92146.548380] [<f8639ea4>] ? intel_disable_plane+0x64/0x80 [i915] [92146.548417] [<f8639f7c>] i9xx_crtc_disable+0xbc/0x150 [i915] [92146.548456] [<f863ebee>] intel_crtc_update_dpms+0x5e/0x90 [i915] [92146.548493] [<f86437cf>] intel_modeset_setup_hw_state+0x42f/0x8f0 [i915] [92146.548535] [<f8645b0b>] intel_lid_notify+0x9b/0xc0 [i915] [92146.548543] [<c15610d3>] notifier_call_chain+0x43/0x60 [92146.548550] [<c105d1e1>] __blocking_notifier_call_chain+0x41/0x80 [92146.548556] [<c105d23f>] blocking_notifier_call_chain+0x1f/0x30 [92146.548563] [<c131a684>] acpi_lid_send_state+0x78/0xa4 [92146.548569] [<c131aa9e>] acpi_button_notify+0x3b/0xf1 [92146.548577] [<c12df56a>] ? acpi_os_execute+0x17/0x19 [92146.548582] [<c12e591a>] ? acpi_ec_sync_query+0xa5/0xbc [92146.548589] [<c12e2b82>] acpi_device_notify+0x16/0x18 [92146.548595] [<c12f4904>] acpi_ev_notify_dispatch+0x38/0x4f [92146.548600] [<c12df0e8>] acpi_os_execute_deferred+0x20/0x2b [92146.548607] [<c1051208>] process_one_work+0x128/0x3f0 [92146.548613] [<c1564f73>] ? common_interrupt+0x33/0x38 [92146.548618] [<c104f8c0>] ? wake_up_worker+0x30/0x30 [92146.548624] [<c12df0c8>] ? acpi_os_wait_events_complete+0x1e/0x1e [92146.548629] [<c10524f9>] worker_thread+0x119/0x3b0 [92146.548634] [<c10523e0>] ? manage_workers+0x240/0x240 [92146.548640] [<c1056e84>] kthread+0x94/0xa0 [92146.548647] [<c1060000>] ? ftrace_raw_output_sched_stat_runtime+0x70/0xf0 [92146.548652] [<c15649b7>] ret_from_kernel_thread+0x1b/0x28 [92146.548658] [<c1056df0>] ? kthread_create_on_node+0xc0/0xc0 three different modeset flags are introduced in this patch MODESET_ON_LID_OPEN: do modeset on next lid open event MODESET_DONE: modeset already done MODESET_SUSPENDED: suspended, only do modeset when system is resumed In this way, 1. when lid is closed, MODESET_ON_LID_OPEN is set so that we'll do modeset on next lid open event. 2. when lid is opened, MODESET_DONE is set so that duplicate lid open events will be ignored. 3. when system suspends, MODESET_SUSPENDED is set. In this case, we will not do modeset on any lid events. Plus, locking mechanism is also introduced to avoid racing. Signed-off-by: Zhang Rui <rui.zhang@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-02-05 15:41:53 +08:00
enum modeset_restore modeset_restore;
struct mutex modeset_restore_lock;
struct list_head vm_list; /* Global list of all address spaces */
struct i915_gtt gtt; /* VM representing the global address space */
struct i915_gem_mm mm;
drm/i915: Prevent recursive deadlock on releasing a busy userptr During release of the GEM object we hold the struct_mutex. As the object may be holding onto the last reference for the task->mm, calling mmput() may trigger exit_mmap() which close the vma which will call drm_gem_vm_close() and attempt to reacquire the struct_mutex. In order to avoid that recursion, we have to defer the mmput() until after we drop the struct_mutex, i.e. we need to schedule a worker to do the clean up. A further issue spotted by Tvrtko was caused when we took a GTT mmapping of a userptr buffer object. In that case, we would never call mmput as the object would be cyclically referenced by the GTT mmapping and not freed upon process exit - keeping the entire process mm alive after the process task was reaped. The fix employed is to replace the mm_users/mmput() reference handling to mm_count/mmdrop() for the shared i915_mm_struct. INFO: task test_surfaces:1632 blocked for more than 120 seconds.       Tainted: GF          O 3.14.5+ #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. test_surfaces   D 0000000000000000     0  1632   1590 0x00000082  ffff88014914baa8 0000000000000046 0000000000000000 ffff88014914a010  0000000000012c40 0000000000012c40 ffff8800a0058210 ffff88014784b010  ffff88014914a010 ffff880037b1c820 ffff8800a0058210 ffff880037b1c824 Call Trace:  [<ffffffff81582499>] schedule+0x29/0x70  [<ffffffff815825fe>] schedule_preempt_disabled+0xe/0x10  [<ffffffff81583b93>] __mutex_lock_slowpath+0x183/0x220  [<ffffffff81583c53>] mutex_lock+0x23/0x40  [<ffffffffa005c2a3>] drm_gem_vm_close+0x33/0x70 [drm]  [<ffffffff8115a483>] remove_vma+0x33/0x70  [<ffffffff8115a5dc>] exit_mmap+0x11c/0x170  [<ffffffff8104d6eb>] mmput+0x6b/0x100  [<ffffffffa00f44b9>] i915_gem_userptr_release+0x89/0xc0 [i915]  [<ffffffffa00e6706>] i915_gem_free_object+0x126/0x250 [i915]  [<ffffffffa005c06a>] drm_gem_object_free+0x2a/0x40 [drm]  [<ffffffffa005cc32>] drm_gem_object_handle_unreference_unlocked+0xe2/0x120 [drm]  [<ffffffffa005ccd4>] drm_gem_object_release_handle+0x64/0x90 [drm]  [<ffffffff8127ffeb>] idr_for_each+0xab/0x100  [<ffffffffa005cc70>] ? drm_gem_object_handle_unreference_unlocked+0x120/0x120 [drm]  [<ffffffff81583c46>] ? mutex_lock+0x16/0x40  [<ffffffffa005c354>] drm_gem_release+0x24/0x40 [drm]  [<ffffffffa005b82b>] drm_release+0x3fb/0x480 [drm]  [<ffffffff8118d482>] __fput+0xb2/0x260  [<ffffffff8118d6de>] ____fput+0xe/0x10  [<ffffffff8106f27f>] task_work_run+0x8f/0xf0  [<ffffffff81052228>] do_exit+0x1a8/0x480  [<ffffffff81052551>] do_group_exit+0x51/0xc0  [<ffffffff810525d7>] SyS_exit_group+0x17/0x20  [<ffffffff8158e092>] system_call_fastpath+0x16/0x1b v2: Incorporate feedback from Tvrtko and remove the unnessary mm referencing when creating the i915_mm_struct and improve some of the function names and comments. Reported-by: Jacek Danecki <jacek.danecki@intel.com> Test-case: igt/gem_userptr_blits/process-exit* Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Tested-by: "Gong, Zhipeng" <zhipeng.gong@intel.com> Cc: Jacek Danecki <jacek.danecki@intel.com> Cc: "Ursulin, Tvrtko" <tvrtko.ursulin@intel.com> Reviewed-by: "Ursulin, Tvrtko" <tvrtko.ursulin@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: stable@vger.kernel.org # hold off until 3.17 ships for additional testing Signed-off-by: Jani Nikula <jani.nikula@intel.com>
2014-08-07 21:20:40 +08:00
DECLARE_HASHTABLE(mm_structs, 7);
struct mutex mm_lock;
/* Kernel Modesetting */
struct sdvo_device_mapping sdvo_mappings[2];
struct drm_crtc *plane_to_crtc_mapping[I915_MAX_PIPES];
struct drm_crtc *pipe_to_crtc_mapping[I915_MAX_PIPES];
wait_queue_head_t pending_flip_queue;
#ifdef CONFIG_DEBUG_FS
struct intel_pipe_crc pipe_crc[I915_MAX_PIPES];
#endif
int num_shared_dpll;
struct intel_shared_dpll shared_dplls[I915_NUM_PLLS];
int dpio_phy_iosf_port[I915_NUM_PHYS_VLV];
struct i915_workarounds workarounds;
/* Reclocking support */
bool render_reclock_avail;
drm/i915: Track frontbuffer invalidation/flushing So these are the guts of the new beast. This tracks when a frontbuffer gets invalidated (due to frontbuffer rendering) and hence should be constantly scaned out, and when it's flushed again and can be compressed/one-shot-upload. Rules for flushing are simple: The frontbuffer needs one more full upload starting from the next vblank. Which means that the flushing can _only_ be called once the frontbuffer update has been latched. But this poses a problem for pageflips: We can't just delay the flushing until the pageflip is latched, since that would pose the risk that we override frontbuffer rendering that has been scheduled in-between the pageflip ioctl and the actual latching. To handle this track asynchronous invalidations (and also pageflip) state per-ring and delay any in-between flushing until the rendering has completed. And also cancel any delayed flushing if we get a new invalidation request (whether delayed or not). Also call intel_mark_fb_busy in both cases in all cases to make sure that we keep the screen at the highest refresh rate both on flips, synchronous plane updates and for frontbuffer rendering. v2: Lots of improvements Suggestions from Chris: - Move invalidate/flush in flush_*_domain and set_to_*_domain. - Drop the flush in busy_ioctl since it's redundant. Was a leftover from an earlier concept to track flips/delayed flushes. - Don't forget about the initial modeset enable/final disable. Suggested by Chris. Track flips accurately, too. Since flips complete independently of rendering we need to track pending flips in a separate mask. Again if an invalidate happens we need to cancel the evenutal flush to avoid races. v3: Provide correct header declarations for flip functions. Currently not needed outside of intel_display.c, but part of the proper interface. v4: Add proper domain management to fbcon so that the fbcon buffer is also tracked correctly. v5: Fixup locking around the fbcon set_to_gtt_domain call. v6: More comments from Chris: - Split out fbcon changes. - Drop superflous checks for potential scanout before calling intel_fb functions - we can micro-optimize this later. - s/intel_fb_/intel_fb_obj_/ to make it clear that this deals in gem object. We already have precedence for fb_obj in the pin_and_fence functions. v7: Clarify the semantics of the flip flush handling by renaming things a bit: - Don't go through a gem object but take the relevant frontbuffer bits directly. These functions center on the plane, the actual object is irrelevant - even a flip to the same object as already active should cause a flush. - Add a new intel_frontbuffer_flip for synchronous plane updates. It currently just calls intel_frontbuffer_flush since the implemenation differs. This way we achieve a clear split between one-shot update events on one side and frontbuffer rendering with potentially a very long delay between the invalidate and flush. Chris and I also had some discussions about mark_busy and whether it is appropriate to call from flush. But mark busy is a state which should be derived from the 3 events (invalidate, flush, flip) we now have by the users, like psr does by tracking relevant information in psr.busy_frontbuffer_bits. DRRS (the only real use of mark_busy for frontbuffer) needs to have similar logic. With that the overall mark_busy in the core could be removed. v8: Only when retiring gpu buffers only flush frontbuffer bits we actually invalidated in a batch. Just for safety since before any additional usage/invalidate we should always retire current rendering. Suggested by Chris Wilson. v9: Actually use intel_frontbuffer_flip in all appropriate places. Spotted by Chris. v10: Address more comments from Chris: - Don't call _flip in set_base when the crtc is inactive, avoids redunancy in the modeset case with the initial enabling of all planes. - Add comments explaining that the initial/final plane enable/disable still has work left to do before it's fully generic. v11: Only invalidate for gtt/cpu access when writing. Spotted by Chris. v12: s/_flush/_flip/ in intel_overlay.c per Chris' comment. Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-06-19 22:01:59 +08:00
struct i915_frontbuffer_tracking fb_tracking;
u16 orig_clock;
bool mchbar_need_disable;
struct intel_l3_parity l3_parity;
/* Cannot be determined by PCIID. You must always read a register. */
size_t ellc_size;
/* gen6+ rps state */
struct intel_gen6_power_mgmt rps;
/* ilk-only ips/rps state. Everything in here is protected by the global
* mchdev_lock in intel_pm.c */
struct intel_ilk_power_mgmt ips;
struct i915_power_domains power_domains;
struct i915_psr psr;
struct i915_gpu_error gpu_error;
struct drm_i915_gem_object *vlv_pctx;
#ifdef CONFIG_DRM_I915_FBDEV
/* list of fbdev register on this device */
struct intel_fbdev *fbdev;
struct work_struct fbdev_suspend_work;
#endif
struct drm_property *broadcast_rgb_property;
struct drm_property *force_audio_property;
/* hda/i915 audio component */
bool audio_component_registered;
drm/i915: preliminary context support Very basic code for context setup/destruction in the driver. Adds the file i915_gem_context.c This file implements HW context support. On gen5+ a HW context consists of an opaque GPU object which is referenced at times of context saves and restores. With RC6 enabled, the context is also referenced as the GPU enters and exists from RC6 (GPU has it's own internal power context, except on gen5). Though something like a context does exist for the media ring, the code only supports contexts for the render ring. In software, there is a distinction between contexts created by the user, and the default HW context. The default HW context is used by GPU clients that do not request setup of their own hardware context. The default context's state is never restored to help prevent programming errors. This would happen if a client ran and piggy-backed off another clients GPU state. The default context only exists to give the GPU some offset to load as the current to invoke a save of the context we actually care about. In fact, the code could likely be constructed, albeit in a more complicated fashion, to never use the default context, though that limits the driver's ability to swap out, and/or destroy other contexts. All other contexts are created as a request by the GPU client. These contexts store GPU state, and thus allow GPU clients to not re-emit state (and potentially query certain state) at any time. The kernel driver makes certain that the appropriate commands are inserted. There are 4 entry points into the contexts, init, fini, open, close. The names are self-explanatory except that init can be called during reset, and also during pm thaw/resume. As we expect our context to be preserved across these events, we do not reinitialize in this case. As Adam Jackson pointed out, The cutoff of 1MB where a HW context is considered too big is arbitrary. The reason for this is even though context sizes are increasing with every generation, they have yet to eclipse even 32k. If we somehow read back way more than that, it probably means BIOS has done something strange, or we're running on a platform that wasn't designed for this. v2: rename load/unload to init/fini (daniel) remove ILK support for get_size() (indirectly daniel) add HAS_HW_CONTEXTS macro to clarify supported platforms (daniel) added comments (Ben) Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
2012-06-05 05:42:42 +08:00
uint32_t hw_context_size;
struct list_head context_list;
u32 fdi_rx_config;
drm/i915: Work around DISPLAY_PHY_CONTROL register corruption on CHV Sometimes (exactly when is a bit unclear) DISPLAY_PHY_CONTROL appears to get corrupted. The values I've managed to read from it seem to have some pattern but vary quite a lot. The corruption doesn't seem to just happen when the register is accessed, but can also happen spontaneosly during modeset. When this happens during a modeset things go south and the display doesn't light up. I've managed to hit the problemn when toggling HDMI on port D on and off. When things get corrupted the display doesn't light up, but as soon as I manually write the correct value to the register the display comes up. First I was suspicious that we ourselves accidentally overwrite it with garbage, but didn't catch anything with the reg_rw tracepoint. Also I sprinkled check all over the modeset path to see exactly when the corruption happens, and eg. the read back value was fine just before intel_dp_set_m(), and corrupted immediately after it. I also made my check function repair the register value whenever it was wrong, and with this approach the corruption repeated several times during the modeset operation, always seeming to trigger in the same exact calls to the check function, while other calls to the function never caught anything. So far I've not seen this problem occurring when carefully avoiding all read accesses to DISPLAY_PHY_CONTROL. Not sure if that's just pure luck or an actual workaround, but we can hope it works. So let's avoid reading the register and instead track the desired value of the register in dev_priv. v2: Read out the power well state to determine initial register value v3: Use DPIO_CHx names instead of raw numbers Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Reviewed-by: Deepak S <deepak.s@linux.intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-04-10 23:21:28 +08:00
u32 chv_phy_control;
u32 suspend_count;
struct i915_suspend_saved_registers regfile;
struct vlv_s0ix_state vlv_s0ix_state;
struct {
/*
* Raw watermark latency values:
* in 0.1us units for WM0,
* in 0.5us units for WM1+.
*/
/* primary */
uint16_t pri_latency[5];
/* sprite */
uint16_t spr_latency[5];
/* cursor */
uint16_t cur_latency[5];
/*
* Raw watermark memory latency values
* for SKL for all 8 levels
* in 1us units.
*/
uint16_t skl_latency[8];
drm/i915/skl: SKL Watermark Computation This patch implements the watermark algorithm and its necessary functions. Two function pointers skl_update_wm and skl_update_sprite_wm are provided. The skl_update_wm will update the watermarks for the crtc provided as an argument and then checks for change in DDB allocation for other active pipes and recomputes the watermarks for those Pipes and planes as well. Finally it does the register programming for all dirty pipes. The trigger of the Watermark double buffer registers will have to be once the plane configurations are done by the caller. v2: fixed the divide-by-0 error in the results computation func. Also reworked the PLANE_WM register values computation func to make it more compact. Incorporated all other review comments from Damien. v3: Changed the skl_compute_plane_wm function to now return success or failure. Also the result blocks and lines are computed here instead of in skl_compute_wm_results function. v4: Adjust skl_ddb_alloc_changed() to the new planes/cursor split (Damien) v5: Reworked the affected functions to implement new plane/cursor split. v6: Rework the logic that triggers the DDB allocation and WM computation of skl_update_other_pipe_wm() to not depend on non-computed DDB values. Always give a valid cursor_width (at boot it's 0) to keep the invariant that we consider the cursor plane always enabled. Otherwise we end up dividing by 0 in skl_compute_plane_wm() (Damien Lespiau) v7: Spell out allocation skl_ddb_ functions should have the ddb as first argument Make the skl_ddb_alloc_changed() parameters const (Damien) v8: Rebase on top of the crtc->primary changes v9: Split the staging results structure to not exceed the 1Kb stack allocation in skl_update_wm() v10: Make skl_pipe_pixel_rate() take a pointer to the pipe config Add a comment about overflow considerations for skl_wm_method1() Various additions of const Various use of sizeof(variable) instead of sizeof(type) Various move of variable definitons to a narrower scope Zero initialize some stack allocated structures to make sure we don't have garbage in case we don't write all the values (Ville) v11: Remove non-necessary default number of blocks/lines when the plane is disabled (Ville) Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Signed-off-by: Pradeep Bhat <pradeep.bhat@intel.com> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-11-05 01:06:42 +08:00
/*
* The skl_wm_values structure is a bit too big for stack
* allocation, so we keep the staging struct where we store
* intermediate results here instead.
*/
struct skl_wm_values skl_results;
/* current hardware state */
drm/i915/skl: SKL Watermark Computation This patch implements the watermark algorithm and its necessary functions. Two function pointers skl_update_wm and skl_update_sprite_wm are provided. The skl_update_wm will update the watermarks for the crtc provided as an argument and then checks for change in DDB allocation for other active pipes and recomputes the watermarks for those Pipes and planes as well. Finally it does the register programming for all dirty pipes. The trigger of the Watermark double buffer registers will have to be once the plane configurations are done by the caller. v2: fixed the divide-by-0 error in the results computation func. Also reworked the PLANE_WM register values computation func to make it more compact. Incorporated all other review comments from Damien. v3: Changed the skl_compute_plane_wm function to now return success or failure. Also the result blocks and lines are computed here instead of in skl_compute_wm_results function. v4: Adjust skl_ddb_alloc_changed() to the new planes/cursor split (Damien) v5: Reworked the affected functions to implement new plane/cursor split. v6: Rework the logic that triggers the DDB allocation and WM computation of skl_update_other_pipe_wm() to not depend on non-computed DDB values. Always give a valid cursor_width (at boot it's 0) to keep the invariant that we consider the cursor plane always enabled. Otherwise we end up dividing by 0 in skl_compute_plane_wm() (Damien Lespiau) v7: Spell out allocation skl_ddb_ functions should have the ddb as first argument Make the skl_ddb_alloc_changed() parameters const (Damien) v8: Rebase on top of the crtc->primary changes v9: Split the staging results structure to not exceed the 1Kb stack allocation in skl_update_wm() v10: Make skl_pipe_pixel_rate() take a pointer to the pipe config Add a comment about overflow considerations for skl_wm_method1() Various additions of const Various use of sizeof(variable) instead of sizeof(type) Various move of variable definitons to a narrower scope Zero initialize some stack allocated structures to make sure we don't have garbage in case we don't write all the values (Ville) v11: Remove non-necessary default number of blocks/lines when the plane is disabled (Ville) Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Signed-off-by: Pradeep Bhat <pradeep.bhat@intel.com> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-11-05 01:06:42 +08:00
union {
struct ilk_wm_values hw;
struct skl_wm_values skl_hw;
struct vlv_wm_values vlv;
drm/i915/skl: SKL Watermark Computation This patch implements the watermark algorithm and its necessary functions. Two function pointers skl_update_wm and skl_update_sprite_wm are provided. The skl_update_wm will update the watermarks for the crtc provided as an argument and then checks for change in DDB allocation for other active pipes and recomputes the watermarks for those Pipes and planes as well. Finally it does the register programming for all dirty pipes. The trigger of the Watermark double buffer registers will have to be once the plane configurations are done by the caller. v2: fixed the divide-by-0 error in the results computation func. Also reworked the PLANE_WM register values computation func to make it more compact. Incorporated all other review comments from Damien. v3: Changed the skl_compute_plane_wm function to now return success or failure. Also the result blocks and lines are computed here instead of in skl_compute_wm_results function. v4: Adjust skl_ddb_alloc_changed() to the new planes/cursor split (Damien) v5: Reworked the affected functions to implement new plane/cursor split. v6: Rework the logic that triggers the DDB allocation and WM computation of skl_update_other_pipe_wm() to not depend on non-computed DDB values. Always give a valid cursor_width (at boot it's 0) to keep the invariant that we consider the cursor plane always enabled. Otherwise we end up dividing by 0 in skl_compute_plane_wm() (Damien Lespiau) v7: Spell out allocation skl_ddb_ functions should have the ddb as first argument Make the skl_ddb_alloc_changed() parameters const (Damien) v8: Rebase on top of the crtc->primary changes v9: Split the staging results structure to not exceed the 1Kb stack allocation in skl_update_wm() v10: Make skl_pipe_pixel_rate() take a pointer to the pipe config Add a comment about overflow considerations for skl_wm_method1() Various additions of const Various use of sizeof(variable) instead of sizeof(type) Various move of variable definitons to a narrower scope Zero initialize some stack allocated structures to make sure we don't have garbage in case we don't write all the values (Ville) v11: Remove non-necessary default number of blocks/lines when the plane is disabled (Ville) Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Signed-off-by: Pradeep Bhat <pradeep.bhat@intel.com> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-11-05 01:06:42 +08:00
};
} wm;
struct i915_runtime_pm pm;
/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
struct {
drm/i915: Merged the many do_execbuf() parameters into a structure The do_execbuf() function takes quite a few parameters. The actual set of parameters is going to change with the conversion to passing requests around. Further, it is due to grow massively with the arrival of the GPU scheduler. This patch simplifies the prototype by passing a parameter structure instead. Changing the parameter set in the future is then simply a matter of adding/removing items to the structure. Note that the structure does not contain absolutely everything that is passed in. This is because the intention is to use this structure more extensively later in this patch series and more especially in the GPU scheduler that is coming soon. The latter requires hanging on to the structure as the final hardware submission can be delayed until long after the execbuf IOCTL has returned to user land. Thus it is unsafe to put anything in the structure that is local to the IOCTL call itself - such as the 'args' parameter. All entries must be copies of data or pointers to structures that are reference counted in some way and guaranteed to exist for the duration of the batch buffer's life. v2: Rebased to newer tree and updated for changes to the command parser. Specifically, a code shuffle has required saving the batch start address in the params structure. For: VIZ-5115 Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Tomas Elf <tomas.elf@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-30 00:43:27 +08:00
int (*execbuf_submit)(struct i915_execbuffer_params *params,
struct drm_i915_gem_execbuffer2 *args,
drm/i915: Merged the many do_execbuf() parameters into a structure The do_execbuf() function takes quite a few parameters. The actual set of parameters is going to change with the conversion to passing requests around. Further, it is due to grow massively with the arrival of the GPU scheduler. This patch simplifies the prototype by passing a parameter structure instead. Changing the parameter set in the future is then simply a matter of adding/removing items to the structure. Note that the structure does not contain absolutely everything that is passed in. This is because the intention is to use this structure more extensively later in this patch series and more especially in the GPU scheduler that is coming soon. The latter requires hanging on to the structure as the final hardware submission can be delayed until long after the execbuf IOCTL has returned to user land. Thus it is unsafe to put anything in the structure that is local to the IOCTL call itself - such as the 'args' parameter. All entries must be copies of data or pointers to structures that are reference counted in some way and guaranteed to exist for the duration of the batch buffer's life. v2: Rebased to newer tree and updated for changes to the command parser. Specifically, a code shuffle has required saving the batch start address in the params structure. For: VIZ-5115 Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Tomas Elf <tomas.elf@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-30 00:43:27 +08:00
struct list_head *vmas);
int (*init_rings)(struct drm_device *dev);
void (*cleanup_ring)(struct intel_engine_cs *ring);
void (*stop_ring)(struct intel_engine_cs *ring);
} gt;
bool edp_low_vswing;
/*
* NOTE: This is the dri1/ums dungeon, don't add stuff here. Your patch
* will be rejected. Instead look for a better place.
*/
};
static inline struct drm_i915_private *to_i915(const struct drm_device *dev)
{
return dev->dev_private;
}
static inline struct drm_i915_private *dev_to_i915(struct device *dev)
{
return to_i915(dev_get_drvdata(dev));
}
/* Iterate over initialised rings */
#define for_each_ring(ring__, dev_priv__, i__) \
for ((i__) = 0; (i__) < I915_NUM_RINGS; (i__)++) \
if (((ring__) = &(dev_priv__)->ring[(i__)]), intel_ring_initialized((ring__)))
enum hdmi_force_audio {
HDMI_AUDIO_OFF_DVI = -2, /* no aux data for HDMI-DVI converter */
HDMI_AUDIO_OFF, /* force turn off HDMI audio */
HDMI_AUDIO_AUTO, /* trust EDID */
HDMI_AUDIO_ON, /* force turn on HDMI audio */
};
#define I915_GTT_OFFSET_NONE ((u32)-1)
struct drm_i915_gem_object_ops {
/* Interface between the GEM object and its backing storage.
* get_pages() is called once prior to the use of the associated set
* of pages before to binding them into the GTT, and put_pages() is
* called after we no longer need them. As we expect there to be
* associated cost with migrating pages between the backing storage
* and making them available for the GPU (e.g. clflush), we may hold
* onto the pages after they are no longer referenced by the GPU
* in case they may be used again shortly (for example migrating the
* pages to a different memory domain within the GTT). put_pages()
* will therefore most likely be called when the object itself is
* being released or under memory pressure (where we attempt to
* reap pages for the shrinker).
*/
int (*get_pages)(struct drm_i915_gem_object *);
void (*put_pages)(struct drm_i915_gem_object *);
drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl By exporting the ability to map user address and inserting PTEs representing their backing pages into the GTT, we can exploit UMA in order to utilize normal application data as a texture source or even as a render target (depending upon the capabilities of the chipset). This has a number of uses, with zero-copy downloads to the GPU and efficient readback making the intermixed streaming of CPU and GPU operations fairly efficient. This ability has many widespread implications from faster rendering of client-side software rasterisers (chromium), mitigation of stalls due to read back (firefox) and to faster pipelining of texture data (such as pixel buffer objects in GL or data blobs in CL). v2: Compile with CONFIG_MMU_NOTIFIER v3: We can sleep while performing invalidate-range, which we can utilise to drop our page references prior to the kernel manipulating the vma (for either discard or cloning) and so protect normal users. v4: Only run the invalidate notifier if the range intercepts the bo. v5: Prevent userspace from attempting to GTT mmap non-page aligned buffers v6: Recheck after reacquire mutex for lost mmu. v7: Fix implicit padding of ioctl struct by rounding to next 64bit boundary. v8: Fix rebasing error after forwarding porting the back port. v9: Limit the userptr to page aligned entries. We now expect userspace to handle all the offset-in-page adjustments itself. v10: Prevent vma from being copied across fork to avoid issues with cow. v11: Drop vma behaviour changes -- locking is nigh on impossible. Use a worker to load user pages to avoid lock inversions. v12: Use get_task_mm()/mmput() for correct refcounting of mm. v13: Use a worker to release the mmu_notifier to avoid lock inversion v14: Decouple mmu_notifier from struct_mutex using a custom mmu_notifer with its own locking and tree of objects for each mm/mmu_notifier. v15: Prevent overlapping userptr objects, and invalidate all objects within the mmu_notifier range v16: Fix a typo for iterating over multiple objects in the range and rearrange error path to destroy the mmu_notifier locklessly. Also close a race between invalidate_range and the get_pages_worker. v17: Close a race between get_pages_worker/invalidate_range and fresh allocations of the same userptr range - and notice that struct_mutex was presumed to be held when during creation it wasn't. v18: Sigh. Fix the refactor of st_set_pages() to allocate enough memory for the struct sg_table and to clear it before reporting an error. v19: Always error out on read-only userptr requests as we don't have the hardware infrastructure to support them at the moment. v20: Refuse to implement read-only support until we have the required infrastructure - but reserve the bit in flags for future use. v21: use_mm() is not required for get_user_pages(). It is only meant to be used to fix up the kernel thread's current->mm for use with copy_user(). v22: Use sg_alloc_table_from_pages for that chunky feeling v23: Export a function for sanity checking dma-buf rather than encode userptr details elsewhere, and clean up comments based on suggestions by Bradley. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com> Cc: Akash Goel <akash.goel@intel.com> Cc: "Volkin, Bradley D" <bradley.d.volkin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Reviewed-by: Brad Volkin <bradley.d.volkin@intel.com> [danvet: Frob ioctl allocation to pick the next one - will cause a bit of fuss with create2 apparently, but such are the rules.] [danvet2: oops, forgot to git add after manual patch application] [danvet3: Appease sparse.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-16 21:22:37 +08:00
int (*dmabuf_export)(struct drm_i915_gem_object *);
void (*release)(struct drm_i915_gem_object *);
};
drm/i915: Introduce accurate frontbuffer tracking So from just a quick look we seem to have enough information to accurately figure out whether a given gem bo is used as a frontbuffer and where exactly: We have obj->pin_count as a first check with no false negatives and only negligible false positives. And then we can just walk the modeset objects and figure out where exactly a buffer is used as scanout. Except that we can't due to locking order: If we already hold dev->struct_mutex we can't acquire any modeset locks, so could potential chase freed pointers and other evil stuff. So we need something else. For that introduce a new set of bits obj->frontbuffer_bits to track where a buffer object is used. That we can then chase without grabbing any modeset locks. Of course the consumers of this (DRRS, PSR, FBC, ...) still need to be able to do their magic both when called from modeset and from gem code. But that can be easily achieved by adding locks for these specific subsystems which always nest within either kms or gem locking. This patch just adds the relevant update code to all places. Note that if we ever support multi-planar scanout targets then we need one frontbuffer tracking bit per attachment point that we expose to userspace. v2: - Fix more oopsen. Oops. - WARN if we leak obj->frontbuffer_bits when freeing a gem buffer. Fix the bugs this brought to light. - s/update_frontbuffer_bits/update_fb_bits/. More consistent with the fb tracking functions (fb for gem object, frontbuffer for raw bits). And the function name was way too long. v3: Size obj->frontbuffer_bits correctly so that all pipes fit in. v4: Don't update fb bits in set_base on failure. Noticed by Chris. v5: s/i915_gem_update_fb_bits/i915_gem_track_fb/ Also remove a few local enum pipe variables which are now no longer needed to make the function arguments no drop over the 80 char limit. Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-06-19 05:28:09 +08:00
/*
* Frontbuffer tracking bits. Set in obj->frontbuffer_bits while a gem bo is
* considered to be the frontbuffer for the given plane interface-vise. This
* doesn't mean that the hw necessarily already scans it out, but that any
* rendering (by the cpu or gpu) will land in the frontbuffer eventually.
*
* We have one bit per pipe and per scanout plane type.
*/
#define INTEL_FRONTBUFFER_BITS_PER_PIPE 4
#define INTEL_FRONTBUFFER_BITS \
(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES)
#define INTEL_FRONTBUFFER_PRIMARY(pipe) \
(1 << (INTEL_FRONTBUFFER_BITS_PER_PIPE * (pipe)))
#define INTEL_FRONTBUFFER_CURSOR(pipe) \
(1 << (1 +(INTEL_FRONTBUFFER_BITS_PER_PIPE * (pipe))))
#define INTEL_FRONTBUFFER_SPRITE(pipe) \
(1 << (2 +(INTEL_FRONTBUFFER_BITS_PER_PIPE * (pipe))))
#define INTEL_FRONTBUFFER_OVERLAY(pipe) \
(1 << (3 +(INTEL_FRONTBUFFER_BITS_PER_PIPE * (pipe))))
#define INTEL_FRONTBUFFER_ALL_MASK(pipe) \
(0xf << (INTEL_FRONTBUFFER_BITS_PER_PIPE * (pipe)))
drm/i915: Introduce accurate frontbuffer tracking So from just a quick look we seem to have enough information to accurately figure out whether a given gem bo is used as a frontbuffer and where exactly: We have obj->pin_count as a first check with no false negatives and only negligible false positives. And then we can just walk the modeset objects and figure out where exactly a buffer is used as scanout. Except that we can't due to locking order: If we already hold dev->struct_mutex we can't acquire any modeset locks, so could potential chase freed pointers and other evil stuff. So we need something else. For that introduce a new set of bits obj->frontbuffer_bits to track where a buffer object is used. That we can then chase without grabbing any modeset locks. Of course the consumers of this (DRRS, PSR, FBC, ...) still need to be able to do their magic both when called from modeset and from gem code. But that can be easily achieved by adding locks for these specific subsystems which always nest within either kms or gem locking. This patch just adds the relevant update code to all places. Note that if we ever support multi-planar scanout targets then we need one frontbuffer tracking bit per attachment point that we expose to userspace. v2: - Fix more oopsen. Oops. - WARN if we leak obj->frontbuffer_bits when freeing a gem buffer. Fix the bugs this brought to light. - s/update_frontbuffer_bits/update_fb_bits/. More consistent with the fb tracking functions (fb for gem object, frontbuffer for raw bits). And the function name was way too long. v3: Size obj->frontbuffer_bits correctly so that all pipes fit in. v4: Don't update fb bits in set_base on failure. Noticed by Chris. v5: s/i915_gem_update_fb_bits/i915_gem_track_fb/ Also remove a few local enum pipe variables which are now no longer needed to make the function arguments no drop over the 80 char limit. Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-06-19 05:28:09 +08:00
struct drm_i915_gem_object {
struct drm_gem_object base;
const struct drm_i915_gem_object_ops *ops;
/** List of VMAs backed by this object */
struct list_head vma_list;
/** Stolen memory for this object, instead of being backed by shmem. */
struct drm_mm_node *stolen;
struct list_head global_list;
drm/i915: Implement inter-engine read-read optimisations Currently, we only track the last request globally across all engines. This prevents us from issuing concurrent read requests on e.g. the RCS and BCS engines (or more likely the render and media engines). Without semaphores, we incur costly stalls as we synchronise between rings - greatly impacting the current performance of Broadwell versus Haswell in certain workloads (like video decode). With the introduction of reference counted requests, it is much easier to track the last request per ring, as well as the last global write request so that we can optimise inter-engine read read requests (as well as better optimise certain CPU waits). v2: Fix inverted readonly condition for nonblocking waits. v3: Handle non-continguous engine array after waits v4: Rebase, tidy, rewrite ring list debugging v5: Use obj->active as a bitfield, it looks cool v6: Micro-optimise, mostly involving moving code around v7: Fix retire-requests-upto for execlists (and multiple rq->ringbuf) v8: Rebase v9: Refactor i915_gem_object_sync() to allow the compiler to better optimise it. Benchmark: igt/gem_read_read_speed hsw:gt3e (with semaphores): Before: Time to read-read 1024k: 275.794µs After: Time to read-read 1024k: 123.260µs hsw:gt3e (w/o semaphores): Before: Time to read-read 1024k: 230.433µs After: Time to read-read 1024k: 124.593µs bdw-u (w/o semaphores): Before After Time to read-read 1x1: 26.274µs 10.350µs Time to read-read 128x128: 40.097µs 21.366µs Time to read-read 256x256: 77.087µs 42.608µs Time to read-read 512x512: 281.999µs 181.155µs Time to read-read 1024x1024: 1196.141µs 1118.223µs Time to read-read 2048x2048: 5639.072µs 5225.837µs Time to read-read 4096x4096: 22401.662µs 21137.067µs Time to read-read 8192x8192: 89617.735µs 85637.681µs Testcase: igt/gem_concurrent_blit (read-read and friends) Cc: Lionel Landwerlin <lionel.g.landwerlin@linux.intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> [v8] [danvet: s/\<rq\>/req/g] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-04-27 20:41:17 +08:00
struct list_head ring_list[I915_NUM_RINGS];
/** Used in execbuf to temporarily hold a ref */
struct list_head obj_exec_link;
struct list_head batch_pool_link;
drm/i915: Implement a framework for batch buffer pools This adds a small module for managing a pool of batch buffers. The only current use case is for the command parser, as described in the kerneldoc in the patch. The code is simple, but separating it out makes it easier to change the underlying algorithms and to extend to future use cases should they arise. The interface is simple: init to create an empty pool, fini to clean it up, get to obtain a new buffer. Note that all buffers are expected to be inactive before cleaning up the pool. Locking is currently based on the caller holding the struct_mutex. We already do that in the places where we will use the batch pool for the command parser. v2: - s/BUG_ON/WARN_ON/ for locking assertions - Remove the cap on pool size - Switch from alloc/free to init/fini v3: - Idiomatic looping structure in _fini - Correct handling of purged objects - Don't return a buffer that's too much larger than needed v4: - Rebased to latest -nightly v5: - Remove _put() function and clean up comments to match v6: - Move purged check inside the loop (danvet, from v4 1/7 feedback) v7: - Use single list instead of two. (Chris W) - s/active_list/cache_list - Squashed in debug patches (Chris W) drm/i915: Add a batch pool debugfs file It provides some useful information about the buffers in the global command parser batch pool. v2: rebase on global pool instead of per-ring pools v3: rebase drm/i915: Add batch pool details to i915_gem_objects debugfs To better account for the potentially large memory consumption of the batch pool. v8: - Keep cache in LRU order (danvet, from v6 1/5 feedback) Issue: VIZ-4719 Signed-off-by: Brad Volkin <bradley.d.volkin@intel.com> Reviewed-By: Jon Bloomfield <jon.bloomfield@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-12-12 04:13:08 +08:00
/**
* This is set if the object is on the active lists (has pending
* rendering and so a non-zero seqno), and is not set if it i s on
* inactive (ready to be unbound) list.
*/
drm/i915: Implement inter-engine read-read optimisations Currently, we only track the last request globally across all engines. This prevents us from issuing concurrent read requests on e.g. the RCS and BCS engines (or more likely the render and media engines). Without semaphores, we incur costly stalls as we synchronise between rings - greatly impacting the current performance of Broadwell versus Haswell in certain workloads (like video decode). With the introduction of reference counted requests, it is much easier to track the last request per ring, as well as the last global write request so that we can optimise inter-engine read read requests (as well as better optimise certain CPU waits). v2: Fix inverted readonly condition for nonblocking waits. v3: Handle non-continguous engine array after waits v4: Rebase, tidy, rewrite ring list debugging v5: Use obj->active as a bitfield, it looks cool v6: Micro-optimise, mostly involving moving code around v7: Fix retire-requests-upto for execlists (and multiple rq->ringbuf) v8: Rebase v9: Refactor i915_gem_object_sync() to allow the compiler to better optimise it. Benchmark: igt/gem_read_read_speed hsw:gt3e (with semaphores): Before: Time to read-read 1024k: 275.794µs After: Time to read-read 1024k: 123.260µs hsw:gt3e (w/o semaphores): Before: Time to read-read 1024k: 230.433µs After: Time to read-read 1024k: 124.593µs bdw-u (w/o semaphores): Before After Time to read-read 1x1: 26.274µs 10.350µs Time to read-read 128x128: 40.097µs 21.366µs Time to read-read 256x256: 77.087µs 42.608µs Time to read-read 512x512: 281.999µs 181.155µs Time to read-read 1024x1024: 1196.141µs 1118.223µs Time to read-read 2048x2048: 5639.072µs 5225.837µs Time to read-read 4096x4096: 22401.662µs 21137.067µs Time to read-read 8192x8192: 89617.735µs 85637.681µs Testcase: igt/gem_concurrent_blit (read-read and friends) Cc: Lionel Landwerlin <lionel.g.landwerlin@linux.intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> [v8] [danvet: s/\<rq\>/req/g] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-04-27 20:41:17 +08:00
unsigned int active:I915_NUM_RINGS;
/**
* This is set if the object has been written to since last bound
* to the GTT
*/
unsigned int dirty:1;
/**
* Fence register bits (if any) for this object. Will be set
* as needed when mapped into the GTT.
* Protected by dev->struct_mutex.
*/
signed int fence_reg:I915_MAX_NUM_FENCE_BITS;
/**
* Advice: are the backing pages purgeable?
*/
unsigned int madv:2;
/**
* Current tiling mode for the object.
*/
unsigned int tiling_mode:2;
/**
* Whether the tiling parameters for the currently associated fence
* register have changed. Note that for the purposes of tracking
* tiling changes we also treat the unfenced register, the register
* slot that the object occupies whilst it executes a fenced
* command (such as BLT on gen2/3), as a "fence".
*/
unsigned int fence_dirty:1;
/**
* Is the object at the current location in the gtt mappable and
* fenceable? Used to avoid costly recalculations.
*/
unsigned int map_and_fenceable:1;
/**
* Whether the current gtt mapping needs to be mappable (and isn't just
* mappable by accident). Track pin and fault separate for a more
* accurate mappable working set.
*/
unsigned int fault_mappable:1;
/*
* Is the object to be mapped as read-only to the GPU
* Only honoured if hardware has relevant pte bit
*/
unsigned long gt_ro:1;
unsigned int cache_level:3;
unsigned int cache_dirty:1;
drm/i915: Introduce accurate frontbuffer tracking So from just a quick look we seem to have enough information to accurately figure out whether a given gem bo is used as a frontbuffer and where exactly: We have obj->pin_count as a first check with no false negatives and only negligible false positives. And then we can just walk the modeset objects and figure out where exactly a buffer is used as scanout. Except that we can't due to locking order: If we already hold dev->struct_mutex we can't acquire any modeset locks, so could potential chase freed pointers and other evil stuff. So we need something else. For that introduce a new set of bits obj->frontbuffer_bits to track where a buffer object is used. That we can then chase without grabbing any modeset locks. Of course the consumers of this (DRRS, PSR, FBC, ...) still need to be able to do their magic both when called from modeset and from gem code. But that can be easily achieved by adding locks for these specific subsystems which always nest within either kms or gem locking. This patch just adds the relevant update code to all places. Note that if we ever support multi-planar scanout targets then we need one frontbuffer tracking bit per attachment point that we expose to userspace. v2: - Fix more oopsen. Oops. - WARN if we leak obj->frontbuffer_bits when freeing a gem buffer. Fix the bugs this brought to light. - s/update_frontbuffer_bits/update_fb_bits/. More consistent with the fb tracking functions (fb for gem object, frontbuffer for raw bits). And the function name was way too long. v3: Size obj->frontbuffer_bits correctly so that all pipes fit in. v4: Don't update fb bits in set_base on failure. Noticed by Chris. v5: s/i915_gem_update_fb_bits/i915_gem_track_fb/ Also remove a few local enum pipe variables which are now no longer needed to make the function arguments no drop over the 80 char limit. Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-06-19 05:28:09 +08:00
unsigned int frontbuffer_bits:INTEL_FRONTBUFFER_BITS;
unsigned int pin_display;
struct sg_table *pages;
int pages_pin_count;
struct get_page {
struct scatterlist *sg;
int last;
} get_page;
i915: add dmabuf/prime buffer sharing support. This adds handle->fd and fd->handle support to i915, this is to allow for offloading of rendering in one direction and outputs in the other. v2 from Daniel Vetter: - fixup conflicts with the prepare/finish gtt prep work. - implement ppgtt binding support. Note that we have squat i-g-t testcoverage for any of the lifetime and access rules dma_buf/prime support brings along. And there are quite a few intricate situations here. Also note that the integration with the existing code is a bit hackish, especially around get_gtt_pages and put_gtt_pages. It imo would be easier with the prep code from Chris Wilson's unbound series, but that is for 3.6. Also note that I didn't bother to put the new prepare/finish gtt hooks to good use by moving the dma_buf_map/unmap_attachment calls in there (like we've originally planned for). Last but not least this patch is only compile-tested, but I've changed very little compared to Dave Airlie's version. So there's a decent chance v2 on drm-next works as well as v1 on 3.4-rc. v3: Right when I've hit sent I've noticed that I've screwed up one obj->sg_list (for dmar support) and obj->sg_table (for prime support) disdinction. We should be able to merge these 2 paths, but that's material for another patch. v4: fix the error reporting bugs pointed out by ickle. v5: fix another error, and stop non-gtt mmaps on shared objects stop pread/pwrite on imported objects, add fake kmap Signed-off-by: Dave Airlie <airlied@redhat.com> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-05-10 21:25:09 +08:00
/* prime dma-buf support */
void *dma_buf_vmapping;
int vmapping_count;
drm/i915: Implement inter-engine read-read optimisations Currently, we only track the last request globally across all engines. This prevents us from issuing concurrent read requests on e.g. the RCS and BCS engines (or more likely the render and media engines). Without semaphores, we incur costly stalls as we synchronise between rings - greatly impacting the current performance of Broadwell versus Haswell in certain workloads (like video decode). With the introduction of reference counted requests, it is much easier to track the last request per ring, as well as the last global write request so that we can optimise inter-engine read read requests (as well as better optimise certain CPU waits). v2: Fix inverted readonly condition for nonblocking waits. v3: Handle non-continguous engine array after waits v4: Rebase, tidy, rewrite ring list debugging v5: Use obj->active as a bitfield, it looks cool v6: Micro-optimise, mostly involving moving code around v7: Fix retire-requests-upto for execlists (and multiple rq->ringbuf) v8: Rebase v9: Refactor i915_gem_object_sync() to allow the compiler to better optimise it. Benchmark: igt/gem_read_read_speed hsw:gt3e (with semaphores): Before: Time to read-read 1024k: 275.794µs After: Time to read-read 1024k: 123.260µs hsw:gt3e (w/o semaphores): Before: Time to read-read 1024k: 230.433µs After: Time to read-read 1024k: 124.593µs bdw-u (w/o semaphores): Before After Time to read-read 1x1: 26.274µs 10.350µs Time to read-read 128x128: 40.097µs 21.366µs Time to read-read 256x256: 77.087µs 42.608µs Time to read-read 512x512: 281.999µs 181.155µs Time to read-read 1024x1024: 1196.141µs 1118.223µs Time to read-read 2048x2048: 5639.072µs 5225.837µs Time to read-read 4096x4096: 22401.662µs 21137.067µs Time to read-read 8192x8192: 89617.735µs 85637.681µs Testcase: igt/gem_concurrent_blit (read-read and friends) Cc: Lionel Landwerlin <lionel.g.landwerlin@linux.intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> [v8] [danvet: s/\<rq\>/req/g] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-04-27 20:41:17 +08:00
/** Breadcrumb of last rendering to the buffer.
* There can only be one writer, but we allow for multiple readers.
* If there is a writer that necessarily implies that all other
* read requests are complete - but we may only be lazily clearing
* the read requests. A read request is naturally the most recent
* request on a ring, so we may have two different write and read
* requests on one ring where the write request is older than the
* read request. This allows for the CPU to read from an active
* buffer by only waiting for the write to complete.
* */
struct drm_i915_gem_request *last_read_req[I915_NUM_RINGS];
struct drm_i915_gem_request *last_write_req;
/** Breadcrumb of last fenced GPU access to the buffer. */
struct drm_i915_gem_request *last_fenced_req;
/** Current tiling stride for the object, if it's tiled. */
uint32_t stride;
/** References from framebuffers, locks out tiling changes. */
unsigned long framebuffer_references;
/** Record of address bit 17 of each page at last unbind. */
unsigned long *bit_17;
drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl By exporting the ability to map user address and inserting PTEs representing their backing pages into the GTT, we can exploit UMA in order to utilize normal application data as a texture source or even as a render target (depending upon the capabilities of the chipset). This has a number of uses, with zero-copy downloads to the GPU and efficient readback making the intermixed streaming of CPU and GPU operations fairly efficient. This ability has many widespread implications from faster rendering of client-side software rasterisers (chromium), mitigation of stalls due to read back (firefox) and to faster pipelining of texture data (such as pixel buffer objects in GL or data blobs in CL). v2: Compile with CONFIG_MMU_NOTIFIER v3: We can sleep while performing invalidate-range, which we can utilise to drop our page references prior to the kernel manipulating the vma (for either discard or cloning) and so protect normal users. v4: Only run the invalidate notifier if the range intercepts the bo. v5: Prevent userspace from attempting to GTT mmap non-page aligned buffers v6: Recheck after reacquire mutex for lost mmu. v7: Fix implicit padding of ioctl struct by rounding to next 64bit boundary. v8: Fix rebasing error after forwarding porting the back port. v9: Limit the userptr to page aligned entries. We now expect userspace to handle all the offset-in-page adjustments itself. v10: Prevent vma from being copied across fork to avoid issues with cow. v11: Drop vma behaviour changes -- locking is nigh on impossible. Use a worker to load user pages to avoid lock inversions. v12: Use get_task_mm()/mmput() for correct refcounting of mm. v13: Use a worker to release the mmu_notifier to avoid lock inversion v14: Decouple mmu_notifier from struct_mutex using a custom mmu_notifer with its own locking and tree of objects for each mm/mmu_notifier. v15: Prevent overlapping userptr objects, and invalidate all objects within the mmu_notifier range v16: Fix a typo for iterating over multiple objects in the range and rearrange error path to destroy the mmu_notifier locklessly. Also close a race between invalidate_range and the get_pages_worker. v17: Close a race between get_pages_worker/invalidate_range and fresh allocations of the same userptr range - and notice that struct_mutex was presumed to be held when during creation it wasn't. v18: Sigh. Fix the refactor of st_set_pages() to allocate enough memory for the struct sg_table and to clear it before reporting an error. v19: Always error out on read-only userptr requests as we don't have the hardware infrastructure to support them at the moment. v20: Refuse to implement read-only support until we have the required infrastructure - but reserve the bit in flags for future use. v21: use_mm() is not required for get_user_pages(). It is only meant to be used to fix up the kernel thread's current->mm for use with copy_user(). v22: Use sg_alloc_table_from_pages for that chunky feeling v23: Export a function for sanity checking dma-buf rather than encode userptr details elsewhere, and clean up comments based on suggestions by Bradley. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com> Cc: Akash Goel <akash.goel@intel.com> Cc: "Volkin, Bradley D" <bradley.d.volkin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Reviewed-by: Brad Volkin <bradley.d.volkin@intel.com> [danvet: Frob ioctl allocation to pick the next one - will cause a bit of fuss with create2 apparently, but such are the rules.] [danvet2: oops, forgot to git add after manual patch application] [danvet3: Appease sparse.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-16 21:22:37 +08:00
union {
/** for phy allocated objects */
struct drm_dma_handle *phys_handle;
drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl By exporting the ability to map user address and inserting PTEs representing their backing pages into the GTT, we can exploit UMA in order to utilize normal application data as a texture source or even as a render target (depending upon the capabilities of the chipset). This has a number of uses, with zero-copy downloads to the GPU and efficient readback making the intermixed streaming of CPU and GPU operations fairly efficient. This ability has many widespread implications from faster rendering of client-side software rasterisers (chromium), mitigation of stalls due to read back (firefox) and to faster pipelining of texture data (such as pixel buffer objects in GL or data blobs in CL). v2: Compile with CONFIG_MMU_NOTIFIER v3: We can sleep while performing invalidate-range, which we can utilise to drop our page references prior to the kernel manipulating the vma (for either discard or cloning) and so protect normal users. v4: Only run the invalidate notifier if the range intercepts the bo. v5: Prevent userspace from attempting to GTT mmap non-page aligned buffers v6: Recheck after reacquire mutex for lost mmu. v7: Fix implicit padding of ioctl struct by rounding to next 64bit boundary. v8: Fix rebasing error after forwarding porting the back port. v9: Limit the userptr to page aligned entries. We now expect userspace to handle all the offset-in-page adjustments itself. v10: Prevent vma from being copied across fork to avoid issues with cow. v11: Drop vma behaviour changes -- locking is nigh on impossible. Use a worker to load user pages to avoid lock inversions. v12: Use get_task_mm()/mmput() for correct refcounting of mm. v13: Use a worker to release the mmu_notifier to avoid lock inversion v14: Decouple mmu_notifier from struct_mutex using a custom mmu_notifer with its own locking and tree of objects for each mm/mmu_notifier. v15: Prevent overlapping userptr objects, and invalidate all objects within the mmu_notifier range v16: Fix a typo for iterating over multiple objects in the range and rearrange error path to destroy the mmu_notifier locklessly. Also close a race between invalidate_range and the get_pages_worker. v17: Close a race between get_pages_worker/invalidate_range and fresh allocations of the same userptr range - and notice that struct_mutex was presumed to be held when during creation it wasn't. v18: Sigh. Fix the refactor of st_set_pages() to allocate enough memory for the struct sg_table and to clear it before reporting an error. v19: Always error out on read-only userptr requests as we don't have the hardware infrastructure to support them at the moment. v20: Refuse to implement read-only support until we have the required infrastructure - but reserve the bit in flags for future use. v21: use_mm() is not required for get_user_pages(). It is only meant to be used to fix up the kernel thread's current->mm for use with copy_user(). v22: Use sg_alloc_table_from_pages for that chunky feeling v23: Export a function for sanity checking dma-buf rather than encode userptr details elsewhere, and clean up comments based on suggestions by Bradley. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com> Cc: Akash Goel <akash.goel@intel.com> Cc: "Volkin, Bradley D" <bradley.d.volkin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Reviewed-by: Brad Volkin <bradley.d.volkin@intel.com> [danvet: Frob ioctl allocation to pick the next one - will cause a bit of fuss with create2 apparently, but such are the rules.] [danvet2: oops, forgot to git add after manual patch application] [danvet3: Appease sparse.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-16 21:22:37 +08:00
struct i915_gem_userptr {
uintptr_t ptr;
unsigned read_only :1;
unsigned workers :4;
#define I915_GEM_USERPTR_MAX_WORKERS 15
drm/i915: Prevent recursive deadlock on releasing a busy userptr During release of the GEM object we hold the struct_mutex. As the object may be holding onto the last reference for the task->mm, calling mmput() may trigger exit_mmap() which close the vma which will call drm_gem_vm_close() and attempt to reacquire the struct_mutex. In order to avoid that recursion, we have to defer the mmput() until after we drop the struct_mutex, i.e. we need to schedule a worker to do the clean up. A further issue spotted by Tvrtko was caused when we took a GTT mmapping of a userptr buffer object. In that case, we would never call mmput as the object would be cyclically referenced by the GTT mmapping and not freed upon process exit - keeping the entire process mm alive after the process task was reaped. The fix employed is to replace the mm_users/mmput() reference handling to mm_count/mmdrop() for the shared i915_mm_struct. INFO: task test_surfaces:1632 blocked for more than 120 seconds.       Tainted: GF          O 3.14.5+ #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. test_surfaces   D 0000000000000000     0  1632   1590 0x00000082  ffff88014914baa8 0000000000000046 0000000000000000 ffff88014914a010  0000000000012c40 0000000000012c40 ffff8800a0058210 ffff88014784b010  ffff88014914a010 ffff880037b1c820 ffff8800a0058210 ffff880037b1c824 Call Trace:  [<ffffffff81582499>] schedule+0x29/0x70  [<ffffffff815825fe>] schedule_preempt_disabled+0xe/0x10  [<ffffffff81583b93>] __mutex_lock_slowpath+0x183/0x220  [<ffffffff81583c53>] mutex_lock+0x23/0x40  [<ffffffffa005c2a3>] drm_gem_vm_close+0x33/0x70 [drm]  [<ffffffff8115a483>] remove_vma+0x33/0x70  [<ffffffff8115a5dc>] exit_mmap+0x11c/0x170  [<ffffffff8104d6eb>] mmput+0x6b/0x100  [<ffffffffa00f44b9>] i915_gem_userptr_release+0x89/0xc0 [i915]  [<ffffffffa00e6706>] i915_gem_free_object+0x126/0x250 [i915]  [<ffffffffa005c06a>] drm_gem_object_free+0x2a/0x40 [drm]  [<ffffffffa005cc32>] drm_gem_object_handle_unreference_unlocked+0xe2/0x120 [drm]  [<ffffffffa005ccd4>] drm_gem_object_release_handle+0x64/0x90 [drm]  [<ffffffff8127ffeb>] idr_for_each+0xab/0x100  [<ffffffffa005cc70>] ? drm_gem_object_handle_unreference_unlocked+0x120/0x120 [drm]  [<ffffffff81583c46>] ? mutex_lock+0x16/0x40  [<ffffffffa005c354>] drm_gem_release+0x24/0x40 [drm]  [<ffffffffa005b82b>] drm_release+0x3fb/0x480 [drm]  [<ffffffff8118d482>] __fput+0xb2/0x260  [<ffffffff8118d6de>] ____fput+0xe/0x10  [<ffffffff8106f27f>] task_work_run+0x8f/0xf0  [<ffffffff81052228>] do_exit+0x1a8/0x480  [<ffffffff81052551>] do_group_exit+0x51/0xc0  [<ffffffff810525d7>] SyS_exit_group+0x17/0x20  [<ffffffff8158e092>] system_call_fastpath+0x16/0x1b v2: Incorporate feedback from Tvrtko and remove the unnessary mm referencing when creating the i915_mm_struct and improve some of the function names and comments. Reported-by: Jacek Danecki <jacek.danecki@intel.com> Test-case: igt/gem_userptr_blits/process-exit* Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Tested-by: "Gong, Zhipeng" <zhipeng.gong@intel.com> Cc: Jacek Danecki <jacek.danecki@intel.com> Cc: "Ursulin, Tvrtko" <tvrtko.ursulin@intel.com> Reviewed-by: "Ursulin, Tvrtko" <tvrtko.ursulin@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: stable@vger.kernel.org # hold off until 3.17 ships for additional testing Signed-off-by: Jani Nikula <jani.nikula@intel.com>
2014-08-07 21:20:40 +08:00
struct i915_mm_struct *mm;
struct i915_mmu_object *mmu_object;
drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl By exporting the ability to map user address and inserting PTEs representing their backing pages into the GTT, we can exploit UMA in order to utilize normal application data as a texture source or even as a render target (depending upon the capabilities of the chipset). This has a number of uses, with zero-copy downloads to the GPU and efficient readback making the intermixed streaming of CPU and GPU operations fairly efficient. This ability has many widespread implications from faster rendering of client-side software rasterisers (chromium), mitigation of stalls due to read back (firefox) and to faster pipelining of texture data (such as pixel buffer objects in GL or data blobs in CL). v2: Compile with CONFIG_MMU_NOTIFIER v3: We can sleep while performing invalidate-range, which we can utilise to drop our page references prior to the kernel manipulating the vma (for either discard or cloning) and so protect normal users. v4: Only run the invalidate notifier if the range intercepts the bo. v5: Prevent userspace from attempting to GTT mmap non-page aligned buffers v6: Recheck after reacquire mutex for lost mmu. v7: Fix implicit padding of ioctl struct by rounding to next 64bit boundary. v8: Fix rebasing error after forwarding porting the back port. v9: Limit the userptr to page aligned entries. We now expect userspace to handle all the offset-in-page adjustments itself. v10: Prevent vma from being copied across fork to avoid issues with cow. v11: Drop vma behaviour changes -- locking is nigh on impossible. Use a worker to load user pages to avoid lock inversions. v12: Use get_task_mm()/mmput() for correct refcounting of mm. v13: Use a worker to release the mmu_notifier to avoid lock inversion v14: Decouple mmu_notifier from struct_mutex using a custom mmu_notifer with its own locking and tree of objects for each mm/mmu_notifier. v15: Prevent overlapping userptr objects, and invalidate all objects within the mmu_notifier range v16: Fix a typo for iterating over multiple objects in the range and rearrange error path to destroy the mmu_notifier locklessly. Also close a race between invalidate_range and the get_pages_worker. v17: Close a race between get_pages_worker/invalidate_range and fresh allocations of the same userptr range - and notice that struct_mutex was presumed to be held when during creation it wasn't. v18: Sigh. Fix the refactor of st_set_pages() to allocate enough memory for the struct sg_table and to clear it before reporting an error. v19: Always error out on read-only userptr requests as we don't have the hardware infrastructure to support them at the moment. v20: Refuse to implement read-only support until we have the required infrastructure - but reserve the bit in flags for future use. v21: use_mm() is not required for get_user_pages(). It is only meant to be used to fix up the kernel thread's current->mm for use with copy_user(). v22: Use sg_alloc_table_from_pages for that chunky feeling v23: Export a function for sanity checking dma-buf rather than encode userptr details elsewhere, and clean up comments based on suggestions by Bradley. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com> Cc: Akash Goel <akash.goel@intel.com> Cc: "Volkin, Bradley D" <bradley.d.volkin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Reviewed-by: Brad Volkin <bradley.d.volkin@intel.com> [danvet: Frob ioctl allocation to pick the next one - will cause a bit of fuss with create2 apparently, but such are the rules.] [danvet2: oops, forgot to git add after manual patch application] [danvet3: Appease sparse.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-16 21:22:37 +08:00
struct work_struct *work;
} userptr;
};
};
#define to_intel_bo(x) container_of(x, struct drm_i915_gem_object, base)
drm/i915: Introduce accurate frontbuffer tracking So from just a quick look we seem to have enough information to accurately figure out whether a given gem bo is used as a frontbuffer and where exactly: We have obj->pin_count as a first check with no false negatives and only negligible false positives. And then we can just walk the modeset objects and figure out where exactly a buffer is used as scanout. Except that we can't due to locking order: If we already hold dev->struct_mutex we can't acquire any modeset locks, so could potential chase freed pointers and other evil stuff. So we need something else. For that introduce a new set of bits obj->frontbuffer_bits to track where a buffer object is used. That we can then chase without grabbing any modeset locks. Of course the consumers of this (DRRS, PSR, FBC, ...) still need to be able to do their magic both when called from modeset and from gem code. But that can be easily achieved by adding locks for these specific subsystems which always nest within either kms or gem locking. This patch just adds the relevant update code to all places. Note that if we ever support multi-planar scanout targets then we need one frontbuffer tracking bit per attachment point that we expose to userspace. v2: - Fix more oopsen. Oops. - WARN if we leak obj->frontbuffer_bits when freeing a gem buffer. Fix the bugs this brought to light. - s/update_frontbuffer_bits/update_fb_bits/. More consistent with the fb tracking functions (fb for gem object, frontbuffer for raw bits). And the function name was way too long. v3: Size obj->frontbuffer_bits correctly so that all pipes fit in. v4: Don't update fb bits in set_base on failure. Noticed by Chris. v5: s/i915_gem_update_fb_bits/i915_gem_track_fb/ Also remove a few local enum pipe variables which are now no longer needed to make the function arguments no drop over the 80 char limit. Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-06-19 05:28:09 +08:00
void i915_gem_track_fb(struct drm_i915_gem_object *old,
struct drm_i915_gem_object *new,
unsigned frontbuffer_bits);
/**
* Request queue structure.
*
* The request queue allows us to note sequence numbers that have been emitted
* and may be associated with active buffers to be retired.
*
* By keeping this list, we can avoid having to do questionable sequence
* number comparisons on buffer last_read|write_seqno. It also allows an
* emission time to be associated with the request for tracking how far ahead
* of the GPU the submission is.
*
* The requests are reference counted, so upon creation they should have an
* initial reference taken using kref_init
*/
struct drm_i915_gem_request {
struct kref ref;
/** On Which ring this request was generated */
struct drm_i915_private *i915;
struct intel_engine_cs *ring;
/** GEM sequence number associated with this request. */
uint32_t seqno;
/** Position in the ringbuffer of the start of the request */
u32 head;
/**
* Position in the ringbuffer of the start of the postfix.
* This is required to calculate the maximum available ringbuffer
* space without overwriting the postfix.
*/
u32 postfix;
/** Position in the ringbuffer of the end of the whole request */
drm/i915: Record the tail at each request and use it to estimate the head By recording the location of every request in the ringbuffer, we know that in order to retire the request the GPU must have finished reading it and so the GPU head is now beyond the tail of the request. We can therefore provide a conservative estimate of where the GPU is reading from in order to avoid having to read back the ring buffer registers when polling for space upon starting a new write into the ringbuffer. A secondary effect is that this allows us to convert intel_ring_buffer_wait() to use i915_wait_request() and so consolidate upon the single function to handle the complicated task of waiting upon the GPU. A necessary precaution is that we need to make that wait uninterruptible to match the existing conditions as all the callers of intel_ring_begin() have not been audited to handle ERESTARTSYS correctly. By using a conservative estimate for the head, and always processing all outstanding requests first, we prevent a race condition between using the estimate and direct reads of I915_RING_HEAD which could result in the value of the head going backwards, and the tail overflowing once again. We are also careful to mark any request that we skip over in order to free space in ring as consumed which provides a self-consistency check. Given sufficient abuse, such as a set of unthrottled GPU bound cairo-traces, avoiding the use of I915_RING_HEAD gives a 10-20% boost on Sandy Bridge (i5-2520m): firefox-paintball 18927ms -> 15646ms: 1.21x speedup firefox-fishtank 12563ms -> 11278ms: 1.11x speedup which is a mild consolation for the performance those traces achieved from exploiting the buggy autoreported head. v2: Add a few more comments and make request->tail a conservative estimate as suggested by Daniel Vetter. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> [danvet: resolve conflicts with retirement defering and the lack of the autoreport head removal (that will go in through -fixes).] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-02-15 19:25:36 +08:00
u32 tail;
/**
* Context and ring buffer related to this request
* Contexts are refcounted, so when this request is associated with a
* context, we must increment the context's refcount, to guarantee that
* it persists while any request is linked to it. Requests themselves
* are also refcounted, so the request will only be freed when the last
* reference to it is dismissed, and the code in
* i915_gem_request_free() will then decrement the refcount on the
* context.
*/
struct intel_context *ctx;
struct intel_ringbuffer *ringbuf;
/** Batch buffer related to this request if any (used for
error state dump only) */
struct drm_i915_gem_object *batch_obj;
/** Time at which this request was emitted, in jiffies. */
unsigned long emitted_jiffies;
/** global list entry for this request */
struct list_head list;
struct drm_i915_file_private *file_priv;
/** file_priv list entry for this request */
struct list_head client_list;
/** process identifier submitting this request */
struct pid *pid;
/**
* The ELSP only accepts two elements at a time, so we queue
* context/tail pairs on a given queue (ring->execlist_queue) until the
* hardware is available. The queue serves a double purpose: we also use
* it to keep track of the up to 2 contexts currently in the hardware
* (usually one in execution and the other queued up by the GPU): We
* only remove elements from the head of the queue when the hardware
* informs us that an element has been completed.
*
* All accesses to the queue are mediated by a spinlock
* (ring->execlist_lock).
*/
/** Execlist link in the submission queue.*/
struct list_head execlist_link;
/** Execlists no. of times this request has been sent to the ELSP */
int elsp_submitted;
};
int i915_gem_request_alloc(struct intel_engine_cs *ring,
struct intel_context *ctx,
struct drm_i915_gem_request **req_out);
drm/i915: Reserve ring buffer space for i915_add_request() commands It is a bad idea for i915_add_request() to fail. The work will already have been send to the ring and will be processed, but there will not be any tracking or management of that work. The only way the add request call can fail is if it can't write its epilogue commands to the ring (cache flushing, seqno updates, interrupt signalling). The reasons for that are mostly down to running out of ring buffer space and the problems associated with trying to get some more. This patch prevents that situation from happening in the first place. When a request is created, it marks sufficient space as reserved for the epilogue commands. Thus guaranteeing that by the time the epilogue is written, there will be plenty of space for it. Note that a ring_begin() call is required to actually reserve the space (and do any potential waiting). However, that is not currently done at request creation time. This is because the ring_begin() code can allocate a request. Hence calling begin() from the request allocation code would lead to infinite recursion! Later patches in this series remove the need for begin() to do the allocate. At that point, it becomes safe for the allocate to call begin() and really reserve the space. Until then, there is a potential for insufficient space to be available at the point of calling i915_add_request(). However, that would only be in the case where the request was created and immediately submitted without ever calling ring_begin() and adding any work to that request. Which should never happen. And even if it does, and if that request happens to fall down the tiny window of opportunity for failing due to being out of ring space then does it really matter because the request wasn't doing anything in the first place? v2: Updated the 'reserved space too small' warning to include the offending sizes. Added a 'cancel' operation to clean up when a request is abandoned. Added re-initialisation of tracking state after a buffer wrap to keep the sanity checks accurate. v3: Incremented the reserved size to accommodate Ironlake (after finally managing to run on an ILK system). Also fixed missing wrap code in LRC mode. v4: Added extra comment and removed duplicate WARN (feedback from Tomas). For: VIZ-5115 CC: Tomas Elf <tomas.elf@intel.com> Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-06-18 20:10:09 +08:00
void i915_gem_request_cancel(struct drm_i915_gem_request *req);
void i915_gem_request_free(struct kref *req_ref);
drm/i915: Move the request/file and request/pid association to creation time In _i915_add_request(), the request is associated with a userland client. Specifically it is linked to the 'file' structure and the current user process is recorded. One problem here is that the current user process is not necessarily the same as when the request was submitted to the driver. This is especially true when the GPU scheduler arrives and decouples driver submission from hardware submission. Note also that it is only in the case where the add request comes from an execbuff call that there is a client to associate. Any other add request call is kernel only so does not need to do it. This patch moves the client association into a separate function. This is then called from the execbuffer code path itself at a sensible time. It also removes the now redundant 'file' pointer from the add request parameter list. An extra cleanup of the client association is also added to the request clean up code for the eventuality where the request is killed after association but before being submitted (e.g. due to out of memory error somewhere). Once the submission has happened, the request is on the request list and the regular request list removal will clear the association. Note that this still needs to happen at this point in time because the request might be kept floating around much longer (due to someone holding a reference count) and the client should not be worrying about this request after it has been retired. For: VIZ-5115 Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Tomas Elf <tomas.elf@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-30 00:44:12 +08:00
int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
struct drm_file *file);
static inline uint32_t
i915_gem_request_get_seqno(struct drm_i915_gem_request *req)
{
return req ? req->seqno : 0;
}
static inline struct intel_engine_cs *
i915_gem_request_get_ring(struct drm_i915_gem_request *req)
{
return req ? req->ring : NULL;
}
static inline struct drm_i915_gem_request *
i915_gem_request_reference(struct drm_i915_gem_request *req)
{
if (req)
kref_get(&req->ref);
return req;
}
static inline void
i915_gem_request_unreference(struct drm_i915_gem_request *req)
{
WARN_ON(!mutex_is_locked(&req->ring->dev->struct_mutex));
kref_put(&req->ref, i915_gem_request_free);
}
static inline void
i915_gem_request_unreference__unlocked(struct drm_i915_gem_request *req)
{
struct drm_device *dev;
if (!req)
return;
dev = req->ring->dev;
if (kref_put_mutex(&req->ref, i915_gem_request_free, &dev->struct_mutex))
mutex_unlock(&dev->struct_mutex);
}
static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
struct drm_i915_gem_request *src)
{
if (src)
i915_gem_request_reference(src);
if (*pdst)
i915_gem_request_unreference(*pdst);
*pdst = src;
}
/*
* XXX: i915_gem_request_completed should be here but currently needs the
* definition of i915_seqno_passed() which is below. It will be moved in
* a later patch when the call to i915_seqno_passed() is obsoleted...
*/
/*
* A command that requires special handling by the command parser.
*/
struct drm_i915_cmd_descriptor {
/*
* Flags describing how the command parser processes the command.
*
* CMD_DESC_FIXED: The command has a fixed length if this is set,
* a length mask if not set
* CMD_DESC_SKIP: The command is allowed but does not follow the
* standard length encoding for the opcode range in
* which it falls
* CMD_DESC_REJECT: The command is never allowed
* CMD_DESC_REGISTER: The command should be checked against the
* register whitelist for the appropriate ring
* CMD_DESC_MASTER: The command is allowed if the submitting process
* is the DRM master
*/
u32 flags;
#define CMD_DESC_FIXED (1<<0)
#define CMD_DESC_SKIP (1<<1)
#define CMD_DESC_REJECT (1<<2)
#define CMD_DESC_REGISTER (1<<3)
#define CMD_DESC_BITMASK (1<<4)
#define CMD_DESC_MASTER (1<<5)
/*
* The command's unique identification bits and the bitmask to get them.
* This isn't strictly the opcode field as defined in the spec and may
* also include type, subtype, and/or subop fields.
*/
struct {
u32 value;
u32 mask;
} cmd;
/*
* The command's length. The command is either fixed length (i.e. does
* not include a length field) or has a length field mask. The flag
* CMD_DESC_FIXED indicates a fixed length. Otherwise, the command has
* a length mask. All command entries in a command table must include
* length information.
*/
union {
u32 fixed;
u32 mask;
} length;
/*
* Describes where to find a register address in the command to check
* against the ring's register whitelist. Only valid if flags has the
* CMD_DESC_REGISTER bit set.
*
* A non-zero step value implies that the command may access multiple
* registers in sequence (e.g. LRI), in that case step gives the
* distance in dwords between individual offset fields.
*/
struct {
u32 offset;
u32 mask;
u32 step;
} reg;
#define MAX_CMD_DESC_BITMASKS 3
/*
* Describes command checks where a particular dword is masked and
* compared against an expected value. If the command does not match
* the expected value, the parser rejects it. Only valid if flags has
* the CMD_DESC_BITMASK bit set. Only entries where mask is non-zero
* are valid.
*
* If the check specifies a non-zero condition_mask then the parser
* only performs the check when the bits specified by condition_mask
* are non-zero.
*/
struct {
u32 offset;
u32 mask;
u32 expected;
u32 condition_offset;
u32 condition_mask;
} bits[MAX_CMD_DESC_BITMASKS];
};
/*
* A table of commands requiring special handling by the command parser.
*
* Each ring has an array of tables. Each table consists of an array of command
* descriptors, which must be sorted with command opcodes in ascending order.
*/
struct drm_i915_cmd_table {
const struct drm_i915_cmd_descriptor *table;
int count;
};
/* Note that the (struct drm_i915_private *) cast is just to shut up gcc. */
#define __I915__(p) ({ \
struct drm_i915_private *__p; \
if (__builtin_types_compatible_p(typeof(*p), struct drm_i915_private)) \
__p = (struct drm_i915_private *)p; \
else if (__builtin_types_compatible_p(typeof(*p), struct drm_device)) \
__p = to_i915((struct drm_device *)p); \
else \
BUILD_BUG(); \
__p; \
})
#define INTEL_INFO(p) (&__I915__(p)->info)
#define INTEL_DEVID(p) (INTEL_INFO(p)->device_id)
#define INTEL_REVID(p) (__I915__(p)->dev->pdev->revision)
#define IS_I830(dev) (INTEL_DEVID(dev) == 0x3577)
#define IS_845G(dev) (INTEL_DEVID(dev) == 0x2562)
#define IS_I85X(dev) (INTEL_INFO(dev)->is_i85x)
#define IS_I865G(dev) (INTEL_DEVID(dev) == 0x2572)
#define IS_I915G(dev) (INTEL_INFO(dev)->is_i915g)
#define IS_I915GM(dev) (INTEL_DEVID(dev) == 0x2592)
#define IS_I945G(dev) (INTEL_DEVID(dev) == 0x2772)
#define IS_I945GM(dev) (INTEL_INFO(dev)->is_i945gm)
#define IS_BROADWATER(dev) (INTEL_INFO(dev)->is_broadwater)
#define IS_CRESTLINE(dev) (INTEL_INFO(dev)->is_crestline)
#define IS_GM45(dev) (INTEL_DEVID(dev) == 0x2A42)
#define IS_G4X(dev) (INTEL_INFO(dev)->is_g4x)
#define IS_PINEVIEW_G(dev) (INTEL_DEVID(dev) == 0xa001)
#define IS_PINEVIEW_M(dev) (INTEL_DEVID(dev) == 0xa011)
#define IS_PINEVIEW(dev) (INTEL_INFO(dev)->is_pineview)
#define IS_G33(dev) (INTEL_INFO(dev)->is_g33)
#define IS_IRONLAKE_M(dev) (INTEL_DEVID(dev) == 0x0046)
#define IS_IVYBRIDGE(dev) (INTEL_INFO(dev)->is_ivybridge)
#define IS_IVB_GT1(dev) (INTEL_DEVID(dev) == 0x0156 || \
INTEL_DEVID(dev) == 0x0152 || \
INTEL_DEVID(dev) == 0x015a)
#define IS_VALLEYVIEW(dev) (INTEL_INFO(dev)->is_valleyview)
#define IS_CHERRYVIEW(dev) (INTEL_INFO(dev)->is_valleyview && IS_GEN8(dev))
#define IS_HASWELL(dev) (INTEL_INFO(dev)->is_haswell)
#define IS_BROADWELL(dev) (!INTEL_INFO(dev)->is_valleyview && IS_GEN8(dev))
#define IS_SKYLAKE(dev) (INTEL_INFO(dev)->is_skylake)
#define IS_BROXTON(dev) (!INTEL_INFO(dev)->is_skylake && IS_GEN9(dev))
#define IS_MOBILE(dev) (INTEL_INFO(dev)->is_mobile)
#define IS_HSW_EARLY_SDV(dev) (IS_HASWELL(dev) && \
(INTEL_DEVID(dev) & 0xFF00) == 0x0C00)
#define IS_BDW_ULT(dev) (IS_BROADWELL(dev) && \
((INTEL_DEVID(dev) & 0xf) == 0x6 || \
(INTEL_DEVID(dev) & 0xf) == 0xb || \
(INTEL_DEVID(dev) & 0xf) == 0xe))
/* ULX machines are also considered ULT. */
#define IS_BDW_ULX(dev) (IS_BROADWELL(dev) && \
(INTEL_DEVID(dev) & 0xf) == 0xe)
#define IS_BDW_GT3(dev) (IS_BROADWELL(dev) && \
(INTEL_DEVID(dev) & 0x00F0) == 0x0020)
#define IS_HSW_ULT(dev) (IS_HASWELL(dev) && \
(INTEL_DEVID(dev) & 0xFF00) == 0x0A00)
#define IS_HSW_GT3(dev) (IS_HASWELL(dev) && \
(INTEL_DEVID(dev) & 0x00F0) == 0x0020)
/* ULX machines are also considered ULT. */
#define IS_HSW_ULX(dev) (INTEL_DEVID(dev) == 0x0A0E || \
INTEL_DEVID(dev) == 0x0A1E)
#define IS_SKL_ULT(dev) (INTEL_DEVID(dev) == 0x1906 || \
INTEL_DEVID(dev) == 0x1913 || \
INTEL_DEVID(dev) == 0x1916 || \
INTEL_DEVID(dev) == 0x1921 || \
INTEL_DEVID(dev) == 0x1926)
#define IS_SKL_ULX(dev) (INTEL_DEVID(dev) == 0x190E || \
INTEL_DEVID(dev) == 0x1915 || \
INTEL_DEVID(dev) == 0x191E)
#define IS_PRELIMINARY_HW(intel_info) ((intel_info)->is_preliminary)
#define SKL_REVID_A0 (0x0)
#define SKL_REVID_B0 (0x1)
#define SKL_REVID_C0 (0x2)
#define SKL_REVID_D0 (0x3)
#define SKL_REVID_E0 (0x4)
#define SKL_REVID_F0 (0x5)
#define BXT_REVID_A0 (0x0)
#define BXT_REVID_B0 (0x3)
#define BXT_REVID_C0 (0x6)
/*
* The genX designation typically refers to the render engine, so render
* capability related checks should use IS_GEN, while display and other checks
* have their own (e.g. HAS_PCH_SPLIT for ILK+ display, IS_foo for particular
* chips, etc.).
*/
#define IS_GEN2(dev) (INTEL_INFO(dev)->gen == 2)
#define IS_GEN3(dev) (INTEL_INFO(dev)->gen == 3)
#define IS_GEN4(dev) (INTEL_INFO(dev)->gen == 4)
#define IS_GEN5(dev) (INTEL_INFO(dev)->gen == 5)
#define IS_GEN6(dev) (INTEL_INFO(dev)->gen == 6)
#define IS_GEN7(dev) (INTEL_INFO(dev)->gen == 7)
#define IS_GEN8(dev) (INTEL_INFO(dev)->gen == 8)
#define IS_GEN9(dev) (INTEL_INFO(dev)->gen == 9)
#define RENDER_RING (1<<RCS)
#define BSD_RING (1<<VCS)
#define BLT_RING (1<<BCS)
#define VEBOX_RING (1<<VECS)
#define BSD2_RING (1<<VCS2)
#define HAS_BSD(dev) (INTEL_INFO(dev)->ring_mask & BSD_RING)
#define HAS_BSD2(dev) (INTEL_INFO(dev)->ring_mask & BSD2_RING)
#define HAS_BLT(dev) (INTEL_INFO(dev)->ring_mask & BLT_RING)
#define HAS_VEBOX(dev) (INTEL_INFO(dev)->ring_mask & VEBOX_RING)
#define HAS_LLC(dev) (INTEL_INFO(dev)->has_llc)
#define HAS_WT(dev) ((IS_HASWELL(dev) || IS_BROADWELL(dev)) && \
__I915__(dev)->ellc_size)
#define I915_NEED_GFX_HWS(dev) (INTEL_INFO(dev)->need_gfx_hws)
drm/i915: preliminary context support Very basic code for context setup/destruction in the driver. Adds the file i915_gem_context.c This file implements HW context support. On gen5+ a HW context consists of an opaque GPU object which is referenced at times of context saves and restores. With RC6 enabled, the context is also referenced as the GPU enters and exists from RC6 (GPU has it's own internal power context, except on gen5). Though something like a context does exist for the media ring, the code only supports contexts for the render ring. In software, there is a distinction between contexts created by the user, and the default HW context. The default HW context is used by GPU clients that do not request setup of their own hardware context. The default context's state is never restored to help prevent programming errors. This would happen if a client ran and piggy-backed off another clients GPU state. The default context only exists to give the GPU some offset to load as the current to invoke a save of the context we actually care about. In fact, the code could likely be constructed, albeit in a more complicated fashion, to never use the default context, though that limits the driver's ability to swap out, and/or destroy other contexts. All other contexts are created as a request by the GPU client. These contexts store GPU state, and thus allow GPU clients to not re-emit state (and potentially query certain state) at any time. The kernel driver makes certain that the appropriate commands are inserted. There are 4 entry points into the contexts, init, fini, open, close. The names are self-explanatory except that init can be called during reset, and also during pm thaw/resume. As we expect our context to be preserved across these events, we do not reinitialize in this case. As Adam Jackson pointed out, The cutoff of 1MB where a HW context is considered too big is arbitrary. The reason for this is even though context sizes are increasing with every generation, they have yet to eclipse even 32k. If we somehow read back way more than that, it probably means BIOS has done something strange, or we're running on a platform that wasn't designed for this. v2: rename load/unload to init/fini (daniel) remove ILK support for get_size() (indirectly daniel) add HAS_HW_CONTEXTS macro to clarify supported platforms (daniel) added comments (Ben) Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
2012-06-05 05:42:42 +08:00
#define HAS_HW_CONTEXTS(dev) (INTEL_INFO(dev)->gen >= 6)
#define HAS_LOGICAL_RING_CONTEXTS(dev) (INTEL_INFO(dev)->gen >= 8)
#define USES_PPGTT(dev) (i915.enable_ppgtt)
#define USES_FULL_PPGTT(dev) (i915.enable_ppgtt == 2)
#define HAS_OVERLAY(dev) (INTEL_INFO(dev)->has_overlay)
#define OVERLAY_NEEDS_PHYSICAL(dev) (INTEL_INFO(dev)->overlay_needs_physical)
/* Early gen2 have a totally busted CS tlb and require pinned batches. */
#define HAS_BROKEN_CS_TLB(dev) (IS_I830(dev) || IS_845G(dev))
/*
* dp aux and gmbus irq on gen4 seems to be able to generate legacy interrupts
* even when in MSI mode. This results in spurious interrupt warnings if the
* legacy irq no. is shared with another device. The kernel then disables that
* interrupt source and so prevents the other device from working properly.
*/
#define HAS_AUX_IRQ(dev) (INTEL_INFO(dev)->gen >= 5)
#define HAS_GMBUS_IRQ(dev) (INTEL_INFO(dev)->gen >= 5)
/* With the 945 and later, Y tiling got adjusted so that it was 32 128-byte
* rows, which changed the alignment requirements and fence programming.
*/
#define HAS_128_BYTE_Y_TILING(dev) (!IS_GEN2(dev) && !(IS_I915G(dev) || \
IS_I915GM(dev)))
#define SUPPORTS_TV(dev) (INTEL_INFO(dev)->supports_tv)
#define I915_HAS_HOTPLUG(dev) (INTEL_INFO(dev)->has_hotplug)
#define HAS_FW_BLC(dev) (INTEL_INFO(dev)->gen > 2)
#define HAS_PIPE_CXSR(dev) (INTEL_INFO(dev)->has_pipe_cxsr)
#define HAS_FBC(dev) (INTEL_INFO(dev)->has_fbc)
#define HAS_IPS(dev) (IS_HSW_ULT(dev) || IS_BROADWELL(dev))
#define HAS_DP_MST(dev) (IS_HASWELL(dev) || IS_BROADWELL(dev) || \
INTEL_INFO(dev)->gen >= 9)
#define HAS_DDI(dev) (INTEL_INFO(dev)->has_ddi)
#define HAS_FPGA_DBG_UNCLAIMED(dev) (INTEL_INFO(dev)->has_fpga_dbg)
#define HAS_PSR(dev) (IS_HASWELL(dev) || IS_BROADWELL(dev) || \
IS_VALLEYVIEW(dev) || IS_CHERRYVIEW(dev) || \
IS_SKYLAKE(dev))
#define HAS_RUNTIME_PM(dev) (IS_GEN6(dev) || IS_HASWELL(dev) || \
IS_BROADWELL(dev) || IS_VALLEYVIEW(dev) || \
IS_SKYLAKE(dev))
#define HAS_RC6(dev) (INTEL_INFO(dev)->gen >= 6)
#define HAS_RC6p(dev) (INTEL_INFO(dev)->gen == 6 || IS_IVYBRIDGE(dev))
drm/i915/skl: Add support to load SKL CSR firmware. Display Context Save and Restore support is needed for various SKL Display C states like DC5, DC6. This implementation is added based on first version of DMC CSR program that we received from h/w team. Here we are using request_firmware based design. Finally this firmware should end up in linux-firmware tree. For SKL platform its mandatory to ensure that we load this csr program before enabling DC states like DC5/DC6. As CSR program gets reset on various conditions, we should ensure to load it during boot and in future change to be added to load this system resume sequence too. v1: Initial relese as RFC patch v2: Design change as per Daniel, Damien and Shobit's review comments request firmware method followed. v3: Some optimization and functional changes. Pulled register defines into drivers/gpu/drm/i915/i915_reg.h Used kmemdup to allocate and duplicate firmware content. Ensured to free allocated buffer. v4: Modified as per review comments from Satheesh and Daniel Removed temporary buffer. Optimized number of writes by replacing I915_WRITE with I915_WRITE64. v5: Modified as per review comemnts from Damien. - Changed name for functions and firmware. - Introduced HAS_CSR. - Reverted back previous change and used csr_buf with u8 size. - Using cpu_to_be64 for endianness change. Modified as per review comments from Imre. - Modified registers and macro names to be a bit closer to bspec terminology and the existing register naming in the driver. - Early return for non SKL platforms in intel_load_csr_program function. - Added locking around CSR program load function as it may be called concurrently during system/runtime resume. - Releasing the fw before loading the program for consistency - Handled error path during f/w load. v6: Modified as per review comments from Imre. - Corrected out_freecsr sequence. v7: Modified as per review comments from Imre. Fail loading fw if fw->size%8!=0. v8: Rebase to latest. v9: Rebase on top of -nightly (Damien) v10: Enabled support for dmc firmware ver 1.0. According to ver 1.0 in a single binary package all the firmware's that are required for different stepping's of the product will be stored. The package contains the css header, followed by the package header and the actual dmc firmwares. Package header contains the firmware/stepping mapping table and the corresponding firmware offsets to the individual binaries, within the package. Each individual program binary contains the header and the payload sections whose size is specified in the header section. This changes are done to extract the specific firmaware from the package. (Animesh) v11: Modified as per review comemnts from Imre. - Added code comment from bpec for header structure elements. - Added __packed to avoid structure padding. - Added helper functions for stepping and substepping info. - Added code comment for CSR_MAX_FW_SIZE. - Disabled BXT firmware loading, will be enabled with dmc 1.0 support. - Changed skl_stepping_info based on bspec, earlier used from config DB. - Removed duplicate call of cpu_to_be* from intel_csr_load_program function. - Used cpu_to_be32 instead of cpu_to_be64 as firmware binary in dword aligned. - Added sanity check for header length. - Added sanity check for mmio address got from firmware binary. - kmalloc done separately for dmc header and dmc firmware. (Animesh) v12: Modified as per review comemnts from Imre. - Corrected the typo error in skl stepping info structure. - Added out-of-bound access for skl_stepping_info. - Sanity check for mmio address modified. - Sanity check added for stepping and substeppig. - Modified the intel_dmc_info structure, cache only the required header info. (Animesh) v13: clarify firmware load error message. The reason for a firmware loading failure can be obscure if the driver is built-in. Provide an explanation to the user about the likely reason for the failure and how to resolve it. (Imre) v14: Suggested by Jani. - fix s/I915/CONFIG_DRM_I915/ typo - add fw_path to the firmware object instead of using a static ptr (Jani) v15: 1) Changed the firmware name as dmc_gen9.bin, everytime for a new firmware version a symbolic link with same name will help not to build kernel again. 2) Changes done as per review comments from Imre. - Error check removed for intel_csr_ucode_init. - Moved csr-specific data structure to intel_csr.h and optimization done on structure definition. - fw->data used directly for parsing the header info & memory allocation only done separately for payload. (Animesh) v16: - No need for out_regs label in i915_driver_load(), so removed it. - Changed the firmware name as skl_dmc_ver1.bin, followed naming convention <platform>_dmc_<api-version>.bin (Animesh) Issue: VIZ-2569 Signed-off-by: A.Sunil Kamath <sunil.kamath@intel.com> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Animesh Manna <animesh.manna@intel.com> Signed-off-by: Imre Deak <imre.deak@intel.com> Reviewed-by: Imre Deak <imre.deak@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-04 20:58:44 +08:00
#define HAS_CSR(dev) (IS_SKYLAKE(dev))
#define HAS_RESOURCE_STREAMER(dev) (IS_HASWELL(dev) || \
INTEL_INFO(dev)->gen >= 8)
#define HAS_CORE_RING_FREQ(dev) (INTEL_INFO(dev)->gen >= 6 && \
!IS_VALLEYVIEW(dev) && !IS_BROXTON(dev))
#define INTEL_PCH_DEVICE_ID_MASK 0xff00
#define INTEL_PCH_IBX_DEVICE_ID_TYPE 0x3b00
#define INTEL_PCH_CPT_DEVICE_ID_TYPE 0x1c00
#define INTEL_PCH_PPT_DEVICE_ID_TYPE 0x1e00
#define INTEL_PCH_LPT_DEVICE_ID_TYPE 0x8c00
#define INTEL_PCH_LPT_LP_DEVICE_ID_TYPE 0x9c00
#define INTEL_PCH_SPT_DEVICE_ID_TYPE 0xA100
#define INTEL_PCH_SPT_LP_DEVICE_ID_TYPE 0x9D00
#define INTEL_PCH_TYPE(dev) (__I915__(dev)->pch_type)
#define HAS_PCH_SPT(dev) (INTEL_PCH_TYPE(dev) == PCH_SPT)
#define HAS_PCH_LPT(dev) (INTEL_PCH_TYPE(dev) == PCH_LPT)
#define HAS_PCH_CPT(dev) (INTEL_PCH_TYPE(dev) == PCH_CPT)
#define HAS_PCH_IBX(dev) (INTEL_PCH_TYPE(dev) == PCH_IBX)
#define HAS_PCH_NOP(dev) (INTEL_PCH_TYPE(dev) == PCH_NOP)
#define HAS_PCH_SPLIT(dev) (INTEL_PCH_TYPE(dev) != PCH_NONE)
#define HAS_GMCH_DISPLAY(dev) (INTEL_INFO(dev)->gen < 5 || IS_VALLEYVIEW(dev))
/* DPF == dynamic parity feature */
#define HAS_L3_DPF(dev) (IS_IVYBRIDGE(dev) || IS_HASWELL(dev))
#define NUM_L3_SLICES(dev) (IS_HSW_GT3(dev) ? 2 : HAS_L3_DPF(dev))
#define GT_FREQUENCY_MULTIPLIER 50
#define GEN9_FREQ_SCALER 3
#include "i915_trace.h"
extern const struct drm_ioctl_desc i915_ioctls[];
extern int i915_max_ioctl;
extern int i915_suspend_legacy(struct drm_device *dev, pm_message_t state);
extern int i915_resume_legacy(struct drm_device *dev);
/* i915_params.c */
struct i915_params {
int modeset;
int panel_ignore_lid;
int semaphores;
int lvds_channel_mode;
int panel_use_ssc;
int vbt_sdvo_panel_type;
int enable_rc6;
int enable_fbc;
int enable_ppgtt;
int enable_execlists;
int enable_psr;
unsigned int preliminary_hw_support;
int disable_power_well;
int enable_ips;
int invert_brightness;
int enable_cmd_parser;
/* leave bools at the end to not create holes */
bool enable_hangcheck;
bool fastboot;
bool prefault_disable;
bool load_detect_test;
bool reset;
bool disable_display;
bool disable_vtd_wa;
bool enable_guc_submission;
int guc_log_level;
drm/i915: Replaced Blitter ring based flips with MMIO flips This patch enables the framework for using MMIO based flip calls, in contrast with the CS based flip calls which are being used currently. MMIO based flip calls can be enabled on architectures where Render and Blitter engines reside in different power wells. The decision to use MMIO flips can be made based on workloads to give 100% residency for Media power well. v2: The MMIO flips now use the interrupt driven mechanism for issuing the flips when target seqno is reached. (Incorporating Ville's idea) v3: Rebasing on latest code. Code restructuring after incorporating Damien's comments v4: Addressing Ville's review comments -general cleanup -updating only base addr instead of calling update_primary_plane -extending patch for gen5+ platforms v5: Addressed Ville's review comments -Making mmio flip vs cs flip selection based on module parameter -Adding check for DRIVER_MODESET feature in notify_ring before calling notify mmio flip. -Other changes mostly in function arguments v6: -Having a seperate function to check condition for using mmio flips (Ville) -propogating error code from i915_gem_check_olr (Ville) v7: -Adding __must_check with i915_gem_check_olr (Chris) -Renaming mmio_flip_data to mmio_flip (Chris) -Rebasing on latest nightly v8: -Rebasing on latest code -squash 3rd patch in series(mmio setbase vs page flip race) with this patch -Added new tiling mode update in intel_do_mmio_flip (Chris) v9: -check for obj->last_write_seqno being 0 instead of obj->ring being NULL in intel_postpone_flip, as this is a more restrictive condition (Chris) v10: -Applied Chris's suggestions for squashing patches 2,3 into this patch. These patches make the selection of CS vs MMIO flip at the page flip time, and make the module parameter for using mmio flips as tristate, the states being 'force CS flips', 'force mmio flips', 'driver discretion'. Changed the logic for driver discretion (Chris) v11: Minor code cleanup(better readability, fixing whitespace errors, using lockdep to check mutex locked status in postpone_flip, removal of __must_check in function definition) (Chris) Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Sourab Gupta <sourab.gupta@intel.com> Signed-off-by: Akash Goel <akash.goel@intel.com> Tested-by: Chris Wilson <chris@chris-wilson.co.uk> # snb, ivb [danvet: Fix up parameter alignement checkpatch spotted.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-06-02 19:17:17 +08:00
int use_mmio_flip;
int mmio_debug;
bool verbose_state_checks;
int edp_vswing;
};
extern struct i915_params i915 __read_mostly;
/* i915_dma.c */
extern int i915_driver_load(struct drm_device *, unsigned long flags);
extern int i915_driver_unload(struct drm_device *);
extern int i915_driver_open(struct drm_device *dev, struct drm_file *file);
extern void i915_driver_lastclose(struct drm_device * dev);
extern void i915_driver_preclose(struct drm_device *dev,
struct drm_file *file);
extern void i915_driver_postclose(struct drm_device *dev,
struct drm_file *file);
extern int i915_driver_device_is_agp(struct drm_device * dev);
#ifdef CONFIG_COMPAT
extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg);
#endif
extern int intel_gpu_reset(struct drm_device *dev);
extern bool intel_has_gpu_reset(struct drm_device *dev);
extern int i915_reset(struct drm_device *dev);
extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
extern unsigned long i915_gfx_val(struct drm_i915_private *dev_priv);
extern void i915_update_gfx_val(struct drm_i915_private *dev_priv);
int vlv_force_gfx_clock(struct drm_i915_private *dev_priv, bool on);
drm/i915/skl: Add support to load SKL CSR firmware. Display Context Save and Restore support is needed for various SKL Display C states like DC5, DC6. This implementation is added based on first version of DMC CSR program that we received from h/w team. Here we are using request_firmware based design. Finally this firmware should end up in linux-firmware tree. For SKL platform its mandatory to ensure that we load this csr program before enabling DC states like DC5/DC6. As CSR program gets reset on various conditions, we should ensure to load it during boot and in future change to be added to load this system resume sequence too. v1: Initial relese as RFC patch v2: Design change as per Daniel, Damien and Shobit's review comments request firmware method followed. v3: Some optimization and functional changes. Pulled register defines into drivers/gpu/drm/i915/i915_reg.h Used kmemdup to allocate and duplicate firmware content. Ensured to free allocated buffer. v4: Modified as per review comments from Satheesh and Daniel Removed temporary buffer. Optimized number of writes by replacing I915_WRITE with I915_WRITE64. v5: Modified as per review comemnts from Damien. - Changed name for functions and firmware. - Introduced HAS_CSR. - Reverted back previous change and used csr_buf with u8 size. - Using cpu_to_be64 for endianness change. Modified as per review comments from Imre. - Modified registers and macro names to be a bit closer to bspec terminology and the existing register naming in the driver. - Early return for non SKL platforms in intel_load_csr_program function. - Added locking around CSR program load function as it may be called concurrently during system/runtime resume. - Releasing the fw before loading the program for consistency - Handled error path during f/w load. v6: Modified as per review comments from Imre. - Corrected out_freecsr sequence. v7: Modified as per review comments from Imre. Fail loading fw if fw->size%8!=0. v8: Rebase to latest. v9: Rebase on top of -nightly (Damien) v10: Enabled support for dmc firmware ver 1.0. According to ver 1.0 in a single binary package all the firmware's that are required for different stepping's of the product will be stored. The package contains the css header, followed by the package header and the actual dmc firmwares. Package header contains the firmware/stepping mapping table and the corresponding firmware offsets to the individual binaries, within the package. Each individual program binary contains the header and the payload sections whose size is specified in the header section. This changes are done to extract the specific firmaware from the package. (Animesh) v11: Modified as per review comemnts from Imre. - Added code comment from bpec for header structure elements. - Added __packed to avoid structure padding. - Added helper functions for stepping and substepping info. - Added code comment for CSR_MAX_FW_SIZE. - Disabled BXT firmware loading, will be enabled with dmc 1.0 support. - Changed skl_stepping_info based on bspec, earlier used from config DB. - Removed duplicate call of cpu_to_be* from intel_csr_load_program function. - Used cpu_to_be32 instead of cpu_to_be64 as firmware binary in dword aligned. - Added sanity check for header length. - Added sanity check for mmio address got from firmware binary. - kmalloc done separately for dmc header and dmc firmware. (Animesh) v12: Modified as per review comemnts from Imre. - Corrected the typo error in skl stepping info structure. - Added out-of-bound access for skl_stepping_info. - Sanity check for mmio address modified. - Sanity check added for stepping and substeppig. - Modified the intel_dmc_info structure, cache only the required header info. (Animesh) v13: clarify firmware load error message. The reason for a firmware loading failure can be obscure if the driver is built-in. Provide an explanation to the user about the likely reason for the failure and how to resolve it. (Imre) v14: Suggested by Jani. - fix s/I915/CONFIG_DRM_I915/ typo - add fw_path to the firmware object instead of using a static ptr (Jani) v15: 1) Changed the firmware name as dmc_gen9.bin, everytime for a new firmware version a symbolic link with same name will help not to build kernel again. 2) Changes done as per review comments from Imre. - Error check removed for intel_csr_ucode_init. - Moved csr-specific data structure to intel_csr.h and optimization done on structure definition. - fw->data used directly for parsing the header info & memory allocation only done separately for payload. (Animesh) v16: - No need for out_regs label in i915_driver_load(), so removed it. - Changed the firmware name as skl_dmc_ver1.bin, followed naming convention <platform>_dmc_<api-version>.bin (Animesh) Issue: VIZ-2569 Signed-off-by: A.Sunil Kamath <sunil.kamath@intel.com> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Animesh Manna <animesh.manna@intel.com> Signed-off-by: Imre Deak <imre.deak@intel.com> Reviewed-by: Imre Deak <imre.deak@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-04 20:58:44 +08:00
void i915_firmware_load_error_print(const char *fw_path, int err);
/* intel_hotplug.c */
void intel_hpd_irq_handler(struct drm_device *dev, u32 pin_mask, u32 long_mask);
void intel_hpd_init(struct drm_i915_private *dev_priv);
void intel_hpd_init_work(struct drm_i915_private *dev_priv);
void intel_hpd_cancel_work(struct drm_i915_private *dev_priv);
bool intel_hpd_pin_to_port(enum hpd_pin pin, enum port *port);
/* i915_irq.c */
void i915_queue_hangcheck(struct drm_device *dev);
__printf(3, 4)
void i915_handle_error(struct drm_device *dev, bool wedged,
const char *fmt, ...);
extern void intel_irq_init(struct drm_i915_private *dev_priv);
int intel_irq_install(struct drm_i915_private *dev_priv);
void intel_irq_uninstall(struct drm_i915_private *dev_priv);
extern void intel_uncore_sanitize(struct drm_device *dev);
extern void intel_uncore_early_sanitize(struct drm_device *dev,
bool restore_forcewake);
extern void intel_uncore_init(struct drm_device *dev);
extern void intel_uncore_check_errors(struct drm_device *dev);
extern void intel_uncore_fini(struct drm_device *dev);
extern void intel_uncore_forcewake_reset(struct drm_device *dev, bool restore);
const char *intel_uncore_forcewake_domain_to_str(const enum forcewake_domain_id id);
void intel_uncore_forcewake_get(struct drm_i915_private *dev_priv,
enum forcewake_domains domains);
void intel_uncore_forcewake_put(struct drm_i915_private *dev_priv,
enum forcewake_domains domains);
/* Like above but the caller must manage the uncore.lock itself.
* Must be used with I915_READ_FW and friends.
*/
void intel_uncore_forcewake_get__locked(struct drm_i915_private *dev_priv,
enum forcewake_domains domains);
void intel_uncore_forcewake_put__locked(struct drm_i915_private *dev_priv,
enum forcewake_domains domains);
void assert_forcewakes_inactive(struct drm_i915_private *dev_priv);
drm/i915: Introduce a PV INFO page structure for Intel GVT-g. Introduce a PV INFO structure, to facilitate the Intel GVT-g technology, which is a GPU virtualization solution with mediated pass-through. This page contains the shared information between i915 driver and the host emulator. For now, this structure utilizes an area of 4K bytes on HSW GPU's unused MMIO space. Future hardware will have the reserved window architecturally defined, and layout of the page will be added in future BSpec. The i915 driver load routine detects if it is running in a VM by reading the contents of this PV INFO page. Thereafter a flag, vgpu.active is set, and intel_vgpu_active() is used by checking this flag to conclude if GPU is virtualized with Intel GVT-g. By now, intel_vgpu_active() will return true, only when the driver is running as a guest in the Intel GVT-g enhanced environment on HSW platform. v2: take Chris' comments: - call the i915_check_vgpu() in intel_uncore_init() - sanitize i915_check_vgpu() by adding BUILD_BUG_ON() and debug info take Daniel's comments: - put the definition of PV INFO into a new header - i915_vgt_if.h other changes: - access mmio regs by readq/readw in i915_check_vgpu() v3: take Daniel's comments: - move the i915/vgt interfaces into a new i915_vgpu.c - update makefile - add kerneldoc to functions which are non-static - add a DOC: section describing some of the high-level design - update drm docbook other changes: - rename i915_vgt_if.h to i915_vgpu.h v4: take Tvrtko's comments: - fix a typo in commit message - add debug message when vgt version mismatches - rename low_gmadr/high_gmadr to mappable/non-mappable in PV INFO structure Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com> Signed-off-by: Jike Song <jike.song@intel.com> Signed-off-by: Eddie Dong <eddie.dong@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-02-10 19:05:47 +08:00
static inline bool intel_vgpu_active(struct drm_device *dev)
{
return to_i915(dev)->vgpu.active;
}
void
i915_enable_pipestat(struct drm_i915_private *dev_priv, enum pipe pipe,
u32 status_mask);
void
i915_disable_pipestat(struct drm_i915_private *dev_priv, enum pipe pipe,
u32 status_mask);
void valleyview_enable_display_irqs(struct drm_i915_private *dev_priv);
void valleyview_disable_display_irqs(struct drm_i915_private *dev_priv);
void
ironlake_enable_display_irq(struct drm_i915_private *dev_priv, u32 mask);
void
ironlake_disable_display_irq(struct drm_i915_private *dev_priv, u32 mask);
void ibx_display_interrupt_update(struct drm_i915_private *dev_priv,
uint32_t interrupt_mask,
uint32_t enabled_irq_mask);
#define ibx_enable_display_interrupt(dev_priv, bits) \
ibx_display_interrupt_update((dev_priv), (bits), (bits))
#define ibx_disable_display_interrupt(dev_priv, bits) \
ibx_display_interrupt_update((dev_priv), (bits), 0)
/* i915_gem.c */
int i915_gem_create_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_pread_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
void i915_gem_execbuffer_move_to_active(struct list_head *vmas,
struct drm_i915_gem_request *req);
void i915_gem_execbuffer_retire_commands(struct i915_execbuffer_params *params);
drm/i915: Merged the many do_execbuf() parameters into a structure The do_execbuf() function takes quite a few parameters. The actual set of parameters is going to change with the conversion to passing requests around. Further, it is due to grow massively with the arrival of the GPU scheduler. This patch simplifies the prototype by passing a parameter structure instead. Changing the parameter set in the future is then simply a matter of adding/removing items to the structure. Note that the structure does not contain absolutely everything that is passed in. This is because the intention is to use this structure more extensively later in this patch series and more especially in the GPU scheduler that is coming soon. The latter requires hanging on to the structure as the final hardware submission can be delayed until long after the execbuf IOCTL has returned to user land. Thus it is unsafe to put anything in the structure that is local to the IOCTL call itself - such as the 'args' parameter. All entries must be copies of data or pointers to structures that are reference counted in some way and guaranteed to exist for the duration of the batch buffer's life. v2: Rebased to newer tree and updated for changes to the command parser. Specifically, a code shuffle has required saving the batch start address in the params structure. For: VIZ-5115 Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Tomas Elf <tomas.elf@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-30 00:43:27 +08:00
int i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
struct drm_i915_gem_execbuffer2 *args,
drm/i915: Merged the many do_execbuf() parameters into a structure The do_execbuf() function takes quite a few parameters. The actual set of parameters is going to change with the conversion to passing requests around. Further, it is due to grow massively with the arrival of the GPU scheduler. This patch simplifies the prototype by passing a parameter structure instead. Changing the parameter set in the future is then simply a matter of adding/removing items to the structure. Note that the structure does not contain absolutely everything that is passed in. This is because the intention is to use this structure more extensively later in this patch series and more especially in the GPU scheduler that is coming soon. The latter requires hanging on to the structure as the final hardware submission can be delayed until long after the execbuf IOCTL has returned to user land. Thus it is unsafe to put anything in the structure that is local to the IOCTL call itself - such as the 'args' parameter. All entries must be copies of data or pointers to structures that are reference counted in some way and guaranteed to exist for the duration of the batch buffer's life. v2: Rebased to newer tree and updated for changes to the command parser. Specifically, a code shuffle has required saving the batch start address in the params structure. For: VIZ-5115 Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Tomas Elf <tomas.elf@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-30 00:43:27 +08:00
struct list_head *vmas);
int i915_gem_execbuffer(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_execbuffer2(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_busy_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
struct drm_file *file);
int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
struct drm_file *file);
int i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_set_tiling(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_get_tiling(struct drm_device *dev, void *data,
struct drm_file *file_priv);
drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl By exporting the ability to map user address and inserting PTEs representing their backing pages into the GTT, we can exploit UMA in order to utilize normal application data as a texture source or even as a render target (depending upon the capabilities of the chipset). This has a number of uses, with zero-copy downloads to the GPU and efficient readback making the intermixed streaming of CPU and GPU operations fairly efficient. This ability has many widespread implications from faster rendering of client-side software rasterisers (chromium), mitigation of stalls due to read back (firefox) and to faster pipelining of texture data (such as pixel buffer objects in GL or data blobs in CL). v2: Compile with CONFIG_MMU_NOTIFIER v3: We can sleep while performing invalidate-range, which we can utilise to drop our page references prior to the kernel manipulating the vma (for either discard or cloning) and so protect normal users. v4: Only run the invalidate notifier if the range intercepts the bo. v5: Prevent userspace from attempting to GTT mmap non-page aligned buffers v6: Recheck after reacquire mutex for lost mmu. v7: Fix implicit padding of ioctl struct by rounding to next 64bit boundary. v8: Fix rebasing error after forwarding porting the back port. v9: Limit the userptr to page aligned entries. We now expect userspace to handle all the offset-in-page adjustments itself. v10: Prevent vma from being copied across fork to avoid issues with cow. v11: Drop vma behaviour changes -- locking is nigh on impossible. Use a worker to load user pages to avoid lock inversions. v12: Use get_task_mm()/mmput() for correct refcounting of mm. v13: Use a worker to release the mmu_notifier to avoid lock inversion v14: Decouple mmu_notifier from struct_mutex using a custom mmu_notifer with its own locking and tree of objects for each mm/mmu_notifier. v15: Prevent overlapping userptr objects, and invalidate all objects within the mmu_notifier range v16: Fix a typo for iterating over multiple objects in the range and rearrange error path to destroy the mmu_notifier locklessly. Also close a race between invalidate_range and the get_pages_worker. v17: Close a race between get_pages_worker/invalidate_range and fresh allocations of the same userptr range - and notice that struct_mutex was presumed to be held when during creation it wasn't. v18: Sigh. Fix the refactor of st_set_pages() to allocate enough memory for the struct sg_table and to clear it before reporting an error. v19: Always error out on read-only userptr requests as we don't have the hardware infrastructure to support them at the moment. v20: Refuse to implement read-only support until we have the required infrastructure - but reserve the bit in flags for future use. v21: use_mm() is not required for get_user_pages(). It is only meant to be used to fix up the kernel thread's current->mm for use with copy_user(). v22: Use sg_alloc_table_from_pages for that chunky feeling v23: Export a function for sanity checking dma-buf rather than encode userptr details elsewhere, and clean up comments based on suggestions by Bradley. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com> Cc: Akash Goel <akash.goel@intel.com> Cc: "Volkin, Bradley D" <bradley.d.volkin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> Reviewed-by: Brad Volkin <bradley.d.volkin@intel.com> [danvet: Frob ioctl allocation to pick the next one - will cause a bit of fuss with create2 apparently, but such are the rules.] [danvet2: oops, forgot to git add after manual patch application] [danvet3: Appease sparse.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-16 21:22:37 +08:00
int i915_gem_init_userptr(struct drm_device *dev);
int i915_gem_userptr_ioctl(struct drm_device *dev, void *data,
struct drm_file *file);
int i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
drm/i915: wait render timeout ioctl This helps implement GL_ARB_sync but stops short of allowing full blown sync objects. Finally we can use the new timed seqno waiting function to allow userspace to wait on a buffer object with a timeout. This implements that interface. The IOCTL will take as input a buffer object handle, and a timeout in nanoseconds (flags is currently optional but will likely be used for permutations of flush operations). Users may specify 0 nanoseconds to instantly check. The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any non-zero timeout parameter the wait ioctl will wait for the given number of nanoseconds on an object becoming unbusy. Since the wait itself does so holding struct_mutex the object may become re-busied before this completes. A similar but shorter race condition exists in the busy ioctl. v2: ETIME/ERESTARTSYS instead of changing to EBUSY, and EGAIN (Chris) Flush the object from the gpu write domain (Chris + Daniel) Fix leaked refcount in good case (Chris) Naturally align ioctl struct (Chris) v3: Drop lock after getting seqno to avoid ugly dance (Chris) v4: check for 0 timeout after olr check to allow polling (Chris) v5: Updated the comment. (Chris) v6: Return -ETIME instead of -EBUSY when timeout_ns is 0 (Daniel) Fix the commit message comment to be less ugly (Ben) Add a warning to check the return timespec (Ben) v7: Use DRM_AUTH for the ioctl. (Eugeni) Signed-off-by: Ben Widawsky <ben@bwidawsk.net> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-05-25 06:03:10 +08:00
int i915_gem_wait_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
void i915_gem_load(struct drm_device *dev);
void *i915_gem_object_alloc(struct drm_device *dev);
void i915_gem_object_free(struct drm_i915_gem_object *obj);
void i915_gem_object_init(struct drm_i915_gem_object *obj,
const struct drm_i915_gem_object_ops *ops);
struct drm_i915_gem_object *i915_gem_alloc_object(struct drm_device *dev,
size_t size);
struct drm_i915_gem_object *i915_gem_object_create_from_data(
struct drm_device *dev, const void *data, size_t size);
void i915_init_vm(struct drm_i915_private *dev_priv,
struct i915_address_space *vm);
void i915_gem_free_object(struct drm_gem_object *obj);
void i915_gem_vma_destroy(struct i915_vma *vma);
/* Flags used by pin/bind&friends. */
#define PIN_MAPPABLE (1<<0)
#define PIN_NONBLOCK (1<<1)
#define PIN_GLOBAL (1<<2)
#define PIN_OFFSET_BIAS (1<<3)
#define PIN_USER (1<<4)
#define PIN_UPDATE (1<<5)
drm/i915: Prevent negative relocation deltas from wrapping This is pure evil. Userspace, I'm looking at you SNA, repacks batch buffers on the fly after generation as they are being passed to the kernel for execution. These batches also contain self-referenced relocations as a single buffer encompasses the state commands, kernels, vertices and sampler. During generation the buffers are placed at known offsets within the full batch, and then the relocation deltas (as passed to the kernel) are tweaked as the batch is repacked into a smaller buffer. This means that userspace is passing negative relocations deltas, which subsequently wrap to large values if the batch is at a low address. The GPU hangs when it then tries to use the large value as a base for its address offsets, rather than wrapping back to the real value (as one would hope). As the GPU uses positive offsets from the base, we can treat the relocation address as the minimum address read by the GPU. For the upper bound, we trust that userspace will not read beyond the end of the buffer. So, how do we fix negative relocations from wrapping? We can either check that every relocation looks valid when we write it, and then position each object such that we prevent the offset wraparound, or we just special-case the self-referential behaviour of SNA and force all batches to be above 256k. Daniel prefers the latter approach. This fixes a GPU hang when it tries to use an address (relocation + offset) greater than the GTT size. The issue would occur quite easily with full-ppgtt as each fd gets its own VM space, so low offsets would often be handed out. However, with the rearrangement of the low GTT due to capturing the BIOS framebuffer, it is already affecting kernels 3.15 onwards. I think only IVB+ is susceptible to this bug, but the workaround should only kick in rarely, so it seems sensible to always apply it. v3: Use a bias for batch buffers to prevent small negative delta relocations from wrapping. v4 from Daniel: - s/BIAS/BATCH_OFFSET_BIAS/ - Extract eb_vma_misplaced/i915_vma_misplaced since the conditions were growing rather cumbersome. - Add a comment to eb_get_batch explaining why we do this. - Apply the batch offset bias everywhere but mention that we've only observed it on gen7 gpus. - Drop PIN_OFFSET_FIX for now, that slipped in from a feature patch. v5: Add static to eb_get_batch, spotted by 0-day tester. Testcase: igt/gem_bad_reloc Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78533 Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v3) Cc: stable@vger.kernel.org Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-23 14:48:08 +08:00
#define PIN_OFFSET_MASK (~4095)
int __must_check
i915_gem_object_pin(struct drm_i915_gem_object *obj,
struct i915_address_space *vm,
uint32_t alignment,
uint64_t flags);
int __must_check
i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
const struct i915_ggtt_view *view,
uint32_t alignment,
uint64_t flags);
drm/i915: Infrastructure for supporting different GGTT views per object Things like reliable GGTT mappings and mirrored 2d-on-3d display will need to map objects into the same address space multiple times. Added a GGTT view concept and linked it with the VMA to distinguish between multiple instances per address space. New objects and GEM functions which do not take this new view as a parameter assume the default of zero (I915_GGTT_VIEW_NORMAL) which preserves the previous behaviour. This now means that objects can have multiple VMA entries so the code which assumed there will only be one also had to be modified. Alternative GGTT views are supposed to borrow DMA addresses from obj->pages which is DMA mapped on first VMA instantiation and unmapped on the last one going away. v2: * Removed per view special casing in i915_gem_ggtt_prepare / finish_object in favour of creating and destroying DMA mappings on first VMA instantiation and last VMA destruction. (Daniel Vetter) * Simplified i915_vma_unbind which does not need to count the GGTT views. (Daniel Vetter) * Also moved obj->map_and_fenceable reset under the same check. * Checkpatch cleanups. v3: * Only retire objects once the last VMA is unbound. v4: * Keep scatter-gather table for alternative views persistent for the lifetime of the VMA. * Propagate binding errors to callers and handle appropriately. v5: * Explicitly look for normal GGTT view in i915_gem_obj_bound to align usage in i915_gem_object_ggtt_unpin. (Michel Thierry) * Change to single if statement in i915_gem_obj_to_ggtt. (Michel Thierry) * Removed stray semi-colon in i915_gem_object_set_cache_level. For: VIZ-4544 Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Reviewed-by: Michel Thierry <michel.thierry@intel.com> [danvet: Drop hunk from i915_gem_shrink since it's just prettification but upsets a __must_check warning.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-12-11 01:27:58 +08:00
int i915_vma_bind(struct i915_vma *vma, enum i915_cache_level cache_level,
u32 flags);
drm/i915: plumb VM into bind/unbind code As alluded to in several patches, and it will be reiterated later... A VMA is an abstraction for a GEM BO bound into an address space. Therefore it stands to reason, that the existing bind, and unbind are the ones which will be the most impacted. This patch implements this, and updates all callers which weren't already updated in the series (because it was too messy). This patch represents the bulk of an earlier, larger patch. I've pulled out a bunch of things by the request of Daniel. The history is preserved for posterity with the email convention of ">" One big change from the original patch aside from a bunch of cropping is I've created an i915_vma_unbind() function. That is because we always have the VMA anyway, and doing an extra lookup is useful. There is a caveat, we retain an i915_gem_object_ggtt_unbind, for the global cases which might not talk in VMAs. > drm/i915: plumb VM into object operations > > This patch was formerly known as: > "drm/i915: Create VMAs (part 3) - plumbing" > > This patch adds a VM argument, bind/unbind, and the object > offset/size/color getters/setters. It preserves the old ggtt helper > functions because things still need, and will continue to need them. > > Some code will still need to be ported over after this. > > v2: Fix purge to pick an object and unbind all vmas > This was doable because of the global bound list change. > > v3: With the commit to actually pin/unpin pages in place, there is no > longer a need to check if unbind succeeded before calling put_pages(). > Make put_pages only BUG() after checking pin count. > > v4: Rebased on top of the new hangcheck work by Mika > plumbed eb_destroy also > Many checkpatch related fixes > > v5: Very large rebase > > v6: > Change BUG_ON to WARN_ON (Daniel) > Rename vm to ggtt in preallocate stolen, since it is always ggtt when > dealing with stolen memory. (Daniel) > list_for_each will short-circuit already (Daniel) > remove superflous space (Daniel) > Use per object list of vmas (Daniel) > Make obj_bound_any() use obj_bound for each vm (Ben) > s/bind_to_gtt/bind_to_vm/ (Ben) > > Fixed up the inactive shrinker. As Daniel noticed the code could > potentially count the same object multiple times. While it's not > possible in the current case, since 1 object can only ever be bound into > 1 address space thus far - we may as well try to get something more > future proof in place now. With a prep patch before this to switch over > to using the bound list + inactive check, we're now able to carry that > forward for every address space an object is bound into. Signed-off-by: Ben Widawsky <ben@bwidawsk.net> [danvet: Rebase on top of the loss of "drm/i915: Cleanup more of VMA in destroy".] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-08-01 08:00:10 +08:00
int __must_check i915_vma_unbind(struct i915_vma *vma);
int i915_gem_object_put_pages(struct drm_i915_gem_object *obj);
void i915_gem_release_all_mmaps(struct drm_i915_private *dev_priv);
void i915_gem_release_mmap(struct drm_i915_gem_object *obj);
int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
int *needs_clflush);
int __must_check i915_gem_object_get_pages(struct drm_i915_gem_object *obj);
static inline int __sg_page_count(struct scatterlist *sg)
{
return sg->length >> PAGE_SHIFT;
}
static inline struct page *
i915_gem_object_get_page(struct drm_i915_gem_object *obj, int n)
{
if (WARN_ON(n >= obj->base.size >> PAGE_SHIFT))
return NULL;
if (n < obj->get_page.last) {
obj->get_page.sg = obj->pages->sgl;
obj->get_page.last = 0;
}
while (obj->get_page.last + __sg_page_count(obj->get_page.sg) <= n) {
obj->get_page.last += __sg_page_count(obj->get_page.sg++);
if (unlikely(sg_is_chain(obj->get_page.sg)))
obj->get_page.sg = sg_chain_ptr(obj->get_page.sg);
}
return nth_page(sg_page(obj->get_page.sg), n - obj->get_page.last);
}
static inline void i915_gem_object_pin_pages(struct drm_i915_gem_object *obj)
{
BUG_ON(obj->pages == NULL);
obj->pages_pin_count++;
}
static inline void i915_gem_object_unpin_pages(struct drm_i915_gem_object *obj)
{
BUG_ON(obj->pages_pin_count == 0);
obj->pages_pin_count--;
}
int __must_check i915_mutex_lock_interruptible(struct drm_device *dev);
int i915_gem_object_sync(struct drm_i915_gem_object *obj,
drm/i915: Update i915_gem_object_sync() to take a request structure The plan is to pass requests around as the basic submission tracking structure rather than rings and contexts. This patch updates the i915_gem_object_sync() code path. v2: Much more complex patch to share a single request between the sync and the page flip. The _sync() function now supports lazy allocation of the request structure. That is, if one is passed in then that will be used. If one is not, then a request will be allocated and passed back out. Note that the _sync() code does not necessarily require a request. Thus one will only be created until certain situations. The reason the lazy allocation must be done within the _sync() code itself is because the decision to need one or not is not really something that code above can second guess (except in the case where one is definitely not required because no ring is passed in). The call chains above _sync() now support passing a request through which most callers passing in NULL and assuming that no request will be required (because they also pass in NULL for the ring and therefore can't be generating any ring code). The exeception is intel_crtc_page_flip() which now supports having a request returned from _sync(). If one is, then that request is shared by the page flip (if the page flip is of a type to need a request). If _sync() does not generate a request but the page flip does need one, then the page flip path will create its own request. v3: Updated comment description to be clearer about 'to_req' parameter (Tomas Elf review request). Rebased onto newer tree that significantly changed the synchronisation code. v4: Updated comments from review feedback (Tomas Elf) For: VIZ-5115 Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Tomas Elf <tomas.elf@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-06-18 20:14:56 +08:00
struct intel_engine_cs *to,
struct drm_i915_gem_request **to_req);
void i915_vma_move_to_active(struct i915_vma *vma,
struct drm_i915_gem_request *req);
int i915_gem_dumb_create(struct drm_file *file_priv,
struct drm_device *dev,
struct drm_mode_create_dumb *args);
int i915_gem_mmap_gtt(struct drm_file *file_priv, struct drm_device *dev,
uint32_t handle, uint64_t *offset);
/**
* Returns true if seq1 is later than seq2.
*/
static inline bool
i915_seqno_passed(uint32_t seq1, uint32_t seq2)
{
return (int32_t)(seq1 - seq2) >= 0;
}
static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
bool lazy_coherency)
{
u32 seqno;
BUG_ON(req == NULL);
seqno = req->ring->get_seqno(req->ring, lazy_coherency);
return i915_seqno_passed(seqno, req->seqno);
}
int __must_check i915_gem_get_seqno(struct drm_device *dev, u32 *seqno);
int __must_check i915_gem_set_seqno(struct drm_device *dev, u32 seqno);
drm/i915: Rely on accurate request tracking for finding hung batches In the past, it was possible to have multiple batches per request due to a stray signal or ENOMEM. As a result we had to scan each active object (filtered by those having the COMMAND domain) for the one that contained the ACTHD pointer. This was then made more complicated by the introduction of ppgtt, whereby ACTHD then pointed into the address space of the context and so also needed to be taken into account. This is a fairly robust approach (though the implementation is a little fragile and depends upon the per-generation setup, registers and parameters). However, due to the requirements for hangstats, we needed a robust method for associating batches with a particular request and having that we can rely upon it for finding the associated batch object for error capture. If the batch buffer tracking is not robust enough, that should become apparent quite quickly through an erroneous error capture. That should also help to make sure that the runtime reporting to userspace is robust. It also means that we then report the oldest incomplete batch on each ring, which can be useful for determining the state of userspace at the time of a hang. v2: Use i915_gem_find_active_request (Mika) v3: remove check for ring->get_seqno, split long lines (Ben) v4: check that context is available (Chris) checkpatch warnings fixed Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v1) Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> (v3) Cc: Ben Widawsky <benjamin.widawsky@intel.com> Reviewed-by: Ben Widawsky <ben@bwidawsk.net> (v3) Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-02-25 23:11:23 +08:00
struct drm_i915_gem_request *
i915_gem_find_active_request(struct intel_engine_cs *ring);
drm/i915: Rely on accurate request tracking for finding hung batches In the past, it was possible to have multiple batches per request due to a stray signal or ENOMEM. As a result we had to scan each active object (filtered by those having the COMMAND domain) for the one that contained the ACTHD pointer. This was then made more complicated by the introduction of ppgtt, whereby ACTHD then pointed into the address space of the context and so also needed to be taken into account. This is a fairly robust approach (though the implementation is a little fragile and depends upon the per-generation setup, registers and parameters). However, due to the requirements for hangstats, we needed a robust method for associating batches with a particular request and having that we can rely upon it for finding the associated batch object for error capture. If the batch buffer tracking is not robust enough, that should become apparent quite quickly through an erroneous error capture. That should also help to make sure that the runtime reporting to userspace is robust. It also means that we then report the oldest incomplete batch on each ring, which can be useful for determining the state of userspace at the time of a hang. v2: Use i915_gem_find_active_request (Mika) v3: remove check for ring->get_seqno, split long lines (Ben) v4: check that context is available (Chris) checkpatch warnings fixed Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v1) Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> (v3) Cc: Ben Widawsky <benjamin.widawsky@intel.com> Reviewed-by: Ben Widawsky <ben@bwidawsk.net> (v3) Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-02-25 23:11:23 +08:00
drm/i915: Boost RPS frequency for CPU stalls If we encounter a situation where the CPU blocks waiting for results from the GPU, give the GPU a kick to boost its the frequency. This should work to reduce user interface stalls and to quickly promote mesa to high frequencies - but the cost is that our requested frequency stalls high (as we do not idle for long enough before rc6 to start reducing frequencies, nor are we aggressive at down clocking an underused GPU). However, this should be mitigated by rc6 itself powering off the GPU when idle, and that energy use is dependent upon the workload of the GPU in addition to its frequency (e.g. the math or sampler functions only consume power when used). Still, this is likely to adversely affect light workloads. In particular, this nearly eliminates the highly noticeable wake-up lag in animations from idle. For example, expose or workspace transitions. (However, given the situation where we fail to downclock, our requested frequency is almost always the maximum, except for Baytrail where we manually downclock upon idling. This often masks the latency of upclocking after being idle, so animations are typically smooth - at the cost of increased power consumption.) Stéphane raised the concern that this will punish good applications and reward bad applications - but due to the nature of how mesa performs its client throttling, I believe all mesa applications will be roughly equally affected. To address this concern, and to prevent applications like compositors from permanently boosting the RPS state, we ratelimit the frequency of the wait-boosts each client recieves. Unfortunately, this techinique is ineffective with Ironlake - which also has dynamic render power states and suffers just as dramatically. For Ironlake, the thermal/power headroom is shared with the CPU through Intelligent Power Sharing and the intel-ips module. This leaves us with no GPU boost frequencies available when coming out of idle, and due to hardware limitations we cannot change the arbitration between the CPU and GPU quickly enough to be effective. v2: Limit each client to receiving a single boost for each active period. Tested by QA to only marginally increase power, and to demonstrably increase throughput in games. No latency measurements yet. v3: Cater for front-buffer rendering with manual throttling. v4: Tidy up. v5: Sadly the compositor needs frequent boosts as it may never idle, but due to its picking mechanism (using ReadPixels) may require frequent waits. Those waits, along with the waits for the vrefresh swap, conspire to keep the GPU at low frequencies despite the interactive latency. To overcome this we ditch the one-boost-per-active-period and just ratelimit the number of wait-boosts each client can receive. Reported-and-tested-by: Paul Neumann <paul104x@yahoo.de> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=68716 Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Kenneth Graunke <kenneth@whitecape.org> Cc: Stéphane Marchesin <stephane.marchesin@gmail.com> Cc: Owen Taylor <otaylor@redhat.com> Cc: "Meng, Mengmeng" <mengmeng.meng@intel.com> Cc: "Zhuang, Lena" <lena.zhuang@intel.com> Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org> [danvet: No extern for function prototypes in headers.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-09-26 00:34:56 +08:00
bool i915_gem_retire_requests(struct drm_device *dev);
void i915_gem_retire_requests_ring(struct intel_engine_cs *ring);
int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
drm/i915: non-interruptible sleeps can't handle -EAGAIN So don't return -EAGAIN, even in the case of a gpu hang. Remap it to -EIO instead. Note that this isn't really an issue with interruptability, but more that we have quite a few codepaths (mostly around kms stuff) that simply can't handle any errors and hence not even -EAGAIN. Instead of adding proper failure paths so that we could restart these ioctls we've opted for the cheap way out of sleeping non-interruptibly. Which works everywhere but when the gpu dies, which this patch fixes. So essentially interruptible == false means 'wait for the gpu or die trying'.' This patch is a bit ugly because intel_ring_begin is all non-interruptible and hence only returns -EIO. But as the comment in there says, auditing all the callsites would be a pain. To avoid duplicating code, reuse i915_gem_check_wedge in __wait_seqno and intel_wait_ring_buffer. Also use the opportunity to clarify the different cases in i915_gem_check_wedge a bit with comments. v2: Don't access dev_priv->mm.interruptible from check_wedge - we might not hold dev->struct_mutex, making this racy. Instead pass interruptible in as a parameter. I've noticed this because I've hit a BUG_ON(!mutex_is_locked) at the top of check_wedge. This has been added in commit b4aca0106c466b5a0329318203f65bac2d91b682 Author: Ben Widawsky <ben@bwidawsk.net> Date: Wed Apr 25 20:50:12 2012 -0700 drm/i915: extract some common olr+wedge code although that commit is missing any justification for this. I guess it's just copy&paste, because the same commit add the same BUG_ON check to check_olr, where it indeed makes sense. But in check_wedge everything we access is protected by other means, so this is superflous. And because it now gets in the way (we add a new caller in __wait_seqno, which can be called without dev->struct_mutext) let's just remove it. v3: Group all the i915_gem_check_wedge refactoring into this patch, so that this patch here is all about not returning -EAGAIN to callsites that can't handle syscall restarting. v4: Add clarification what interuptible == fales means in our code, requested by Ben Widawsky. v5: Fix EAGAIN mispell noticed by Chris Wilson. Reviewed-by: Ben Widawsky <ben@bwidawsk.net> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Tested-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-07-05 04:54:13 +08:00
bool interruptible);
drm/i915: Replaced Blitter ring based flips with MMIO flips This patch enables the framework for using MMIO based flip calls, in contrast with the CS based flip calls which are being used currently. MMIO based flip calls can be enabled on architectures where Render and Blitter engines reside in different power wells. The decision to use MMIO flips can be made based on workloads to give 100% residency for Media power well. v2: The MMIO flips now use the interrupt driven mechanism for issuing the flips when target seqno is reached. (Incorporating Ville's idea) v3: Rebasing on latest code. Code restructuring after incorporating Damien's comments v4: Addressing Ville's review comments -general cleanup -updating only base addr instead of calling update_primary_plane -extending patch for gen5+ platforms v5: Addressed Ville's review comments -Making mmio flip vs cs flip selection based on module parameter -Adding check for DRIVER_MODESET feature in notify_ring before calling notify mmio flip. -Other changes mostly in function arguments v6: -Having a seperate function to check condition for using mmio flips (Ville) -propogating error code from i915_gem_check_olr (Ville) v7: -Adding __must_check with i915_gem_check_olr (Chris) -Renaming mmio_flip_data to mmio_flip (Chris) -Rebasing on latest nightly v8: -Rebasing on latest code -squash 3rd patch in series(mmio setbase vs page flip race) with this patch -Added new tiling mode update in intel_do_mmio_flip (Chris) v9: -check for obj->last_write_seqno being 0 instead of obj->ring being NULL in intel_postpone_flip, as this is a more restrictive condition (Chris) v10: -Applied Chris's suggestions for squashing patches 2,3 into this patch. These patches make the selection of CS vs MMIO flip at the page flip time, and make the module parameter for using mmio flips as tristate, the states being 'force CS flips', 'force mmio flips', 'driver discretion'. Changed the logic for driver discretion (Chris) v11: Minor code cleanup(better readability, fixing whitespace errors, using lockdep to check mutex locked status in postpone_flip, removal of __must_check in function definition) (Chris) Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Sourab Gupta <sourab.gupta@intel.com> Signed-off-by: Akash Goel <akash.goel@intel.com> Tested-by: Chris Wilson <chris@chris-wilson.co.uk> # snb, ivb [danvet: Fix up parameter alignement checkpatch spotted.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-06-02 19:17:17 +08:00
drm/i915: clear up wedged transitions We have two important transitions of the wedged state in the current code: - 0 -> 1: This means a hang has been detected, and signals to everyone that they please get of any locks, so that the reset work item can do its job. - 1 -> 0: The reset handler has completed. Now the last transition mixes up two states: "Reset completed and successful" and "Reset failed". To distinguish these two we do some tricks with the reset completion, but I simply could not convince myself that this doesn't race under odd circumstances. Hence split this up, and add a new terminal state indicating that the hw is gone for good. Also add explicit #defines for both states, update comments. v2: Split out the reset handling bugfix for the throttle ioctl. v3: s/tmp/wedged/ sugested by Chris Wilson. Also fixup up a rebase error which prevented this patch from actually compiling. v4: To unify the wedged state with the reset counter, keep the reset-in-progress state just as a flag. The terminally-wedged state is now denoted with a big number. v5: Add a comment to the reset_counter special values explaining that WEDGED & RESET_IN_PROGRESS needs to be true for the code to be correct. v6: Fixup logic errors introduced with the wedged+reset_counter unification. Since WEDGED implies reset-in-progress (in a way we're terminally stuck in the dead-but-reset-not-completed state), we need ensure that we check for this everywhere. The specific bug was in wait_for_error, which would simply have timed out. v7: Extract an inline i915_reset_in_progress helper to make the code more readable. Also annote the reset-in-progress case with an unlikely, to help the compiler optimize the fastpath. Do the same for the terminally wedged case with i915_terminally_wedged. Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-11-16 00:17:22 +08:00
static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
{
return unlikely(atomic_read(&error->reset_counter)
& (I915_RESET_IN_PROGRESS_FLAG | I915_WEDGED));
drm/i915: clear up wedged transitions We have two important transitions of the wedged state in the current code: - 0 -> 1: This means a hang has been detected, and signals to everyone that they please get of any locks, so that the reset work item can do its job. - 1 -> 0: The reset handler has completed. Now the last transition mixes up two states: "Reset completed and successful" and "Reset failed". To distinguish these two we do some tricks with the reset completion, but I simply could not convince myself that this doesn't race under odd circumstances. Hence split this up, and add a new terminal state indicating that the hw is gone for good. Also add explicit #defines for both states, update comments. v2: Split out the reset handling bugfix for the throttle ioctl. v3: s/tmp/wedged/ sugested by Chris Wilson. Also fixup up a rebase error which prevented this patch from actually compiling. v4: To unify the wedged state with the reset counter, keep the reset-in-progress state just as a flag. The terminally-wedged state is now denoted with a big number. v5: Add a comment to the reset_counter special values explaining that WEDGED & RESET_IN_PROGRESS needs to be true for the code to be correct. v6: Fixup logic errors introduced with the wedged+reset_counter unification. Since WEDGED implies reset-in-progress (in a way we're terminally stuck in the dead-but-reset-not-completed state), we need ensure that we check for this everywhere. The specific bug was in wait_for_error, which would simply have timed out. v7: Extract an inline i915_reset_in_progress helper to make the code more readable. Also annote the reset-in-progress case with an unlikely, to help the compiler optimize the fastpath. Do the same for the terminally wedged case with i915_terminally_wedged. Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-11-16 00:17:22 +08:00
}
static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
{
return atomic_read(&error->reset_counter) & I915_WEDGED;
}
static inline u32 i915_reset_count(struct i915_gpu_error *error)
{
return ((atomic_read(&error->reset_counter) & ~I915_WEDGED) + 1) / 2;
drm/i915: clear up wedged transitions We have two important transitions of the wedged state in the current code: - 0 -> 1: This means a hang has been detected, and signals to everyone that they please get of any locks, so that the reset work item can do its job. - 1 -> 0: The reset handler has completed. Now the last transition mixes up two states: "Reset completed and successful" and "Reset failed". To distinguish these two we do some tricks with the reset completion, but I simply could not convince myself that this doesn't race under odd circumstances. Hence split this up, and add a new terminal state indicating that the hw is gone for good. Also add explicit #defines for both states, update comments. v2: Split out the reset handling bugfix for the throttle ioctl. v3: s/tmp/wedged/ sugested by Chris Wilson. Also fixup up a rebase error which prevented this patch from actually compiling. v4: To unify the wedged state with the reset counter, keep the reset-in-progress state just as a flag. The terminally-wedged state is now denoted with a big number. v5: Add a comment to the reset_counter special values explaining that WEDGED & RESET_IN_PROGRESS needs to be true for the code to be correct. v6: Fixup logic errors introduced with the wedged+reset_counter unification. Since WEDGED implies reset-in-progress (in a way we're terminally stuck in the dead-but-reset-not-completed state), we need ensure that we check for this everywhere. The specific bug was in wait_for_error, which would simply have timed out. v7: Extract an inline i915_reset_in_progress helper to make the code more readable. Also annote the reset-in-progress case with an unlikely, to help the compiler optimize the fastpath. Do the same for the terminally wedged case with i915_terminally_wedged. Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-11-16 00:17:22 +08:00
}
drm/i915: Record the tail at each request and use it to estimate the head By recording the location of every request in the ringbuffer, we know that in order to retire the request the GPU must have finished reading it and so the GPU head is now beyond the tail of the request. We can therefore provide a conservative estimate of where the GPU is reading from in order to avoid having to read back the ring buffer registers when polling for space upon starting a new write into the ringbuffer. A secondary effect is that this allows us to convert intel_ring_buffer_wait() to use i915_wait_request() and so consolidate upon the single function to handle the complicated task of waiting upon the GPU. A necessary precaution is that we need to make that wait uninterruptible to match the existing conditions as all the callers of intel_ring_begin() have not been audited to handle ERESTARTSYS correctly. By using a conservative estimate for the head, and always processing all outstanding requests first, we prevent a race condition between using the estimate and direct reads of I915_RING_HEAD which could result in the value of the head going backwards, and the tail overflowing once again. We are also careful to mark any request that we skip over in order to free space in ring as consumed which provides a self-consistency check. Given sufficient abuse, such as a set of unthrottled GPU bound cairo-traces, avoiding the use of I915_RING_HEAD gives a 10-20% boost on Sandy Bridge (i5-2520m): firefox-paintball 18927ms -> 15646ms: 1.21x speedup firefox-fishtank 12563ms -> 11278ms: 1.11x speedup which is a mild consolation for the performance those traces achieved from exploiting the buggy autoreported head. v2: Add a few more comments and make request->tail a conservative estimate as suggested by Daniel Vetter. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> [danvet: resolve conflicts with retirement defering and the lack of the autoreport head removal (that will go in through -fixes).] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-02-15 19:25:36 +08:00
static inline bool i915_stop_ring_allow_ban(struct drm_i915_private *dev_priv)
{
return dev_priv->gpu_error.stop_rings == 0 ||
dev_priv->gpu_error.stop_rings & I915_STOP_RING_ALLOW_BAN;
}
static inline bool i915_stop_ring_allow_warn(struct drm_i915_private *dev_priv)
{
return dev_priv->gpu_error.stop_rings == 0 ||
dev_priv->gpu_error.stop_rings & I915_STOP_RING_ALLOW_WARN;
}
void i915_gem_reset(struct drm_device *dev);
bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
int __must_check i915_gem_init(struct drm_device *dev);
int i915_gem_init_rings(struct drm_device *dev);
int __must_check i915_gem_init_hw(struct drm_device *dev);
int i915_gem_l3_remap(struct drm_i915_gem_request *req, int slice);
void i915_gem_init_swizzling(struct drm_device *dev);
void i915_gem_cleanup_ringbuffer(struct drm_device *dev);
int __must_check i915_gpu_idle(struct drm_device *dev);
int __must_check i915_gem_suspend(struct drm_device *dev);
void __i915_add_request(struct drm_i915_gem_request *req,
struct drm_i915_gem_object *batch_obj,
bool flush_caches);
#define i915_add_request(req) \
drm/i915: Move the request/file and request/pid association to creation time In _i915_add_request(), the request is associated with a userland client. Specifically it is linked to the 'file' structure and the current user process is recorded. One problem here is that the current user process is not necessarily the same as when the request was submitted to the driver. This is especially true when the GPU scheduler arrives and decouples driver submission from hardware submission. Note also that it is only in the case where the add request comes from an execbuff call that there is a client to associate. Any other add request call is kernel only so does not need to do it. This patch moves the client association into a separate function. This is then called from the execbuffer code path itself at a sensible time. It also removes the now redundant 'file' pointer from the add request parameter list. An extra cleanup of the client association is also added to the request clean up code for the eventuality where the request is killed after association but before being submitted (e.g. due to out of memory error somewhere). Once the submission has happened, the request is on the request list and the regular request list removal will clear the association. Note that this still needs to happen at this point in time because the request might be kept floating around much longer (due to someone holding a reference count) and the client should not be worrying about this request after it has been retired. For: VIZ-5115 Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Tomas Elf <tomas.elf@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-30 00:44:12 +08:00
__i915_add_request(req, NULL, true)
#define i915_add_request_no_flush(req) \
drm/i915: Move the request/file and request/pid association to creation time In _i915_add_request(), the request is associated with a userland client. Specifically it is linked to the 'file' structure and the current user process is recorded. One problem here is that the current user process is not necessarily the same as when the request was submitted to the driver. This is especially true when the GPU scheduler arrives and decouples driver submission from hardware submission. Note also that it is only in the case where the add request comes from an execbuff call that there is a client to associate. Any other add request call is kernel only so does not need to do it. This patch moves the client association into a separate function. This is then called from the execbuffer code path itself at a sensible time. It also removes the now redundant 'file' pointer from the add request parameter list. An extra cleanup of the client association is also added to the request clean up code for the eventuality where the request is killed after association but before being submitted (e.g. due to out of memory error somewhere). Once the submission has happened, the request is on the request list and the regular request list removal will clear the association. Note that this still needs to happen at this point in time because the request might be kept floating around much longer (due to someone holding a reference count) and the client should not be worrying about this request after it has been retired. For: VIZ-5115 Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Tomas Elf <tomas.elf@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-30 00:44:12 +08:00
__i915_add_request(req, NULL, false)
int __i915_wait_request(struct drm_i915_gem_request *req,
unsigned reset_counter,
bool interruptible,
s64 *timeout,
struct intel_rps_client *rps);
int __must_check i915_wait_request(struct drm_i915_gem_request *req);
int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
int __must_check
i915_gem_object_wait_rendering(struct drm_i915_gem_object *obj,
bool readonly);
int __must_check
i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
bool write);
int __must_check
i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write);
int __must_check
i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
u32 alignment,
struct intel_engine_cs *pipelined,
drm/i915: Update i915_gem_object_sync() to take a request structure The plan is to pass requests around as the basic submission tracking structure rather than rings and contexts. This patch updates the i915_gem_object_sync() code path. v2: Much more complex patch to share a single request between the sync and the page flip. The _sync() function now supports lazy allocation of the request structure. That is, if one is passed in then that will be used. If one is not, then a request will be allocated and passed back out. Note that the _sync() code does not necessarily require a request. Thus one will only be created until certain situations. The reason the lazy allocation must be done within the _sync() code itself is because the decision to need one or not is not really something that code above can second guess (except in the case where one is definitely not required because no ring is passed in). The call chains above _sync() now support passing a request through which most callers passing in NULL and assuming that no request will be required (because they also pass in NULL for the ring and therefore can't be generating any ring code). The exeception is intel_crtc_page_flip() which now supports having a request returned from _sync(). If one is, then that request is shared by the page flip (if the page flip is of a type to need a request). If _sync() does not generate a request but the page flip does need one, then the page flip path will create its own request. v3: Updated comment description to be clearer about 'to_req' parameter (Tomas Elf review request). Rebased onto newer tree that significantly changed the synchronisation code. v4: Updated comments from review feedback (Tomas Elf) For: VIZ-5115 Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Tomas Elf <tomas.elf@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-06-18 20:14:56 +08:00
struct drm_i915_gem_request **pipelined_request,
const struct i915_ggtt_view *view);
void i915_gem_object_unpin_from_display_plane(struct drm_i915_gem_object *obj,
const struct i915_ggtt_view *view);
int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj,
int align);
drm/i915: Boost RPS frequency for CPU stalls If we encounter a situation where the CPU blocks waiting for results from the GPU, give the GPU a kick to boost its the frequency. This should work to reduce user interface stalls and to quickly promote mesa to high frequencies - but the cost is that our requested frequency stalls high (as we do not idle for long enough before rc6 to start reducing frequencies, nor are we aggressive at down clocking an underused GPU). However, this should be mitigated by rc6 itself powering off the GPU when idle, and that energy use is dependent upon the workload of the GPU in addition to its frequency (e.g. the math or sampler functions only consume power when used). Still, this is likely to adversely affect light workloads. In particular, this nearly eliminates the highly noticeable wake-up lag in animations from idle. For example, expose or workspace transitions. (However, given the situation where we fail to downclock, our requested frequency is almost always the maximum, except for Baytrail where we manually downclock upon idling. This often masks the latency of upclocking after being idle, so animations are typically smooth - at the cost of increased power consumption.) Stéphane raised the concern that this will punish good applications and reward bad applications - but due to the nature of how mesa performs its client throttling, I believe all mesa applications will be roughly equally affected. To address this concern, and to prevent applications like compositors from permanently boosting the RPS state, we ratelimit the frequency of the wait-boosts each client recieves. Unfortunately, this techinique is ineffective with Ironlake - which also has dynamic render power states and suffers just as dramatically. For Ironlake, the thermal/power headroom is shared with the CPU through Intelligent Power Sharing and the intel-ips module. This leaves us with no GPU boost frequencies available when coming out of idle, and due to hardware limitations we cannot change the arbitration between the CPU and GPU quickly enough to be effective. v2: Limit each client to receiving a single boost for each active period. Tested by QA to only marginally increase power, and to demonstrably increase throughput in games. No latency measurements yet. v3: Cater for front-buffer rendering with manual throttling. v4: Tidy up. v5: Sadly the compositor needs frequent boosts as it may never idle, but due to its picking mechanism (using ReadPixels) may require frequent waits. Those waits, along with the waits for the vrefresh swap, conspire to keep the GPU at low frequencies despite the interactive latency. To overcome this we ditch the one-boost-per-active-period and just ratelimit the number of wait-boosts each client can receive. Reported-and-tested-by: Paul Neumann <paul104x@yahoo.de> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=68716 Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Kenneth Graunke <kenneth@whitecape.org> Cc: Stéphane Marchesin <stephane.marchesin@gmail.com> Cc: Owen Taylor <otaylor@redhat.com> Cc: "Meng, Mengmeng" <mengmeng.meng@intel.com> Cc: "Zhuang, Lena" <lena.zhuang@intel.com> Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org> [danvet: No extern for function prototypes in headers.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-09-26 00:34:56 +08:00
int i915_gem_open(struct drm_device *dev, struct drm_file *file);
void i915_gem_release(struct drm_device *dev, struct drm_file *file);
uint32_t
i915_gem_get_gtt_size(struct drm_device *dev, uint32_t size, int tiling_mode);
uint32_t
i915_gem_get_gtt_alignment(struct drm_device *dev, uint32_t size,
int tiling_mode, bool fenced);
int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
enum i915_cache_level cache_level);
i915: add dmabuf/prime buffer sharing support. This adds handle->fd and fd->handle support to i915, this is to allow for offloading of rendering in one direction and outputs in the other. v2 from Daniel Vetter: - fixup conflicts with the prepare/finish gtt prep work. - implement ppgtt binding support. Note that we have squat i-g-t testcoverage for any of the lifetime and access rules dma_buf/prime support brings along. And there are quite a few intricate situations here. Also note that the integration with the existing code is a bit hackish, especially around get_gtt_pages and put_gtt_pages. It imo would be easier with the prep code from Chris Wilson's unbound series, but that is for 3.6. Also note that I didn't bother to put the new prepare/finish gtt hooks to good use by moving the dma_buf_map/unmap_attachment calls in there (like we've originally planned for). Last but not least this patch is only compile-tested, but I've changed very little compared to Dave Airlie's version. So there's a decent chance v2 on drm-next works as well as v1 on 3.4-rc. v3: Right when I've hit sent I've noticed that I've screwed up one obj->sg_list (for dmar support) and obj->sg_table (for prime support) disdinction. We should be able to merge these 2 paths, but that's material for another patch. v4: fix the error reporting bugs pointed out by ickle. v5: fix another error, and stop non-gtt mmaps on shared objects stop pread/pwrite on imported objects, add fake kmap Signed-off-by: Dave Airlie <airlied@redhat.com> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-05-10 21:25:09 +08:00
struct drm_gem_object *i915_gem_prime_import(struct drm_device *dev,
struct dma_buf *dma_buf);
struct dma_buf *i915_gem_prime_export(struct drm_device *dev,
struct drm_gem_object *gem_obj, int flags);
unsigned long
i915_gem_obj_ggtt_offset_view(struct drm_i915_gem_object *o,
const struct i915_ggtt_view *view);
unsigned long
i915_gem_obj_offset(struct drm_i915_gem_object *o,
struct i915_address_space *vm);
static inline unsigned long
i915_gem_obj_ggtt_offset(struct drm_i915_gem_object *o)
drm/i915: Infrastructure for supporting different GGTT views per object Things like reliable GGTT mappings and mirrored 2d-on-3d display will need to map objects into the same address space multiple times. Added a GGTT view concept and linked it with the VMA to distinguish between multiple instances per address space. New objects and GEM functions which do not take this new view as a parameter assume the default of zero (I915_GGTT_VIEW_NORMAL) which preserves the previous behaviour. This now means that objects can have multiple VMA entries so the code which assumed there will only be one also had to be modified. Alternative GGTT views are supposed to borrow DMA addresses from obj->pages which is DMA mapped on first VMA instantiation and unmapped on the last one going away. v2: * Removed per view special casing in i915_gem_ggtt_prepare / finish_object in favour of creating and destroying DMA mappings on first VMA instantiation and last VMA destruction. (Daniel Vetter) * Simplified i915_vma_unbind which does not need to count the GGTT views. (Daniel Vetter) * Also moved obj->map_and_fenceable reset under the same check. * Checkpatch cleanups. v3: * Only retire objects once the last VMA is unbound. v4: * Keep scatter-gather table for alternative views persistent for the lifetime of the VMA. * Propagate binding errors to callers and handle appropriately. v5: * Explicitly look for normal GGTT view in i915_gem_obj_bound to align usage in i915_gem_object_ggtt_unpin. (Michel Thierry) * Change to single if statement in i915_gem_obj_to_ggtt. (Michel Thierry) * Removed stray semi-colon in i915_gem_object_set_cache_level. For: VIZ-4544 Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Reviewed-by: Michel Thierry <michel.thierry@intel.com> [danvet: Drop hunk from i915_gem_shrink since it's just prettification but upsets a __must_check warning.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-12-11 01:27:58 +08:00
{
return i915_gem_obj_ggtt_offset_view(o, &i915_ggtt_view_normal);
drm/i915: Infrastructure for supporting different GGTT views per object Things like reliable GGTT mappings and mirrored 2d-on-3d display will need to map objects into the same address space multiple times. Added a GGTT view concept and linked it with the VMA to distinguish between multiple instances per address space. New objects and GEM functions which do not take this new view as a parameter assume the default of zero (I915_GGTT_VIEW_NORMAL) which preserves the previous behaviour. This now means that objects can have multiple VMA entries so the code which assumed there will only be one also had to be modified. Alternative GGTT views are supposed to borrow DMA addresses from obj->pages which is DMA mapped on first VMA instantiation and unmapped on the last one going away. v2: * Removed per view special casing in i915_gem_ggtt_prepare / finish_object in favour of creating and destroying DMA mappings on first VMA instantiation and last VMA destruction. (Daniel Vetter) * Simplified i915_vma_unbind which does not need to count the GGTT views. (Daniel Vetter) * Also moved obj->map_and_fenceable reset under the same check. * Checkpatch cleanups. v3: * Only retire objects once the last VMA is unbound. v4: * Keep scatter-gather table for alternative views persistent for the lifetime of the VMA. * Propagate binding errors to callers and handle appropriately. v5: * Explicitly look for normal GGTT view in i915_gem_obj_bound to align usage in i915_gem_object_ggtt_unpin. (Michel Thierry) * Change to single if statement in i915_gem_obj_to_ggtt. (Michel Thierry) * Removed stray semi-colon in i915_gem_object_set_cache_level. For: VIZ-4544 Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Reviewed-by: Michel Thierry <michel.thierry@intel.com> [danvet: Drop hunk from i915_gem_shrink since it's just prettification but upsets a __must_check warning.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-12-11 01:27:58 +08:00
}
bool i915_gem_obj_bound_any(struct drm_i915_gem_object *o);
bool i915_gem_obj_ggtt_bound_view(struct drm_i915_gem_object *o,
const struct i915_ggtt_view *view);
bool i915_gem_obj_bound(struct drm_i915_gem_object *o,
struct i915_address_space *vm);
drm/i915: Infrastructure for supporting different GGTT views per object Things like reliable GGTT mappings and mirrored 2d-on-3d display will need to map objects into the same address space multiple times. Added a GGTT view concept and linked it with the VMA to distinguish between multiple instances per address space. New objects and GEM functions which do not take this new view as a parameter assume the default of zero (I915_GGTT_VIEW_NORMAL) which preserves the previous behaviour. This now means that objects can have multiple VMA entries so the code which assumed there will only be one also had to be modified. Alternative GGTT views are supposed to borrow DMA addresses from obj->pages which is DMA mapped on first VMA instantiation and unmapped on the last one going away. v2: * Removed per view special casing in i915_gem_ggtt_prepare / finish_object in favour of creating and destroying DMA mappings on first VMA instantiation and last VMA destruction. (Daniel Vetter) * Simplified i915_vma_unbind which does not need to count the GGTT views. (Daniel Vetter) * Also moved obj->map_and_fenceable reset under the same check. * Checkpatch cleanups. v3: * Only retire objects once the last VMA is unbound. v4: * Keep scatter-gather table for alternative views persistent for the lifetime of the VMA. * Propagate binding errors to callers and handle appropriately. v5: * Explicitly look for normal GGTT view in i915_gem_obj_bound to align usage in i915_gem_object_ggtt_unpin. (Michel Thierry) * Change to single if statement in i915_gem_obj_to_ggtt. (Michel Thierry) * Removed stray semi-colon in i915_gem_object_set_cache_level. For: VIZ-4544 Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Reviewed-by: Michel Thierry <michel.thierry@intel.com> [danvet: Drop hunk from i915_gem_shrink since it's just prettification but upsets a __must_check warning.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-12-11 01:27:58 +08:00
unsigned long i915_gem_obj_size(struct drm_i915_gem_object *o,
struct i915_address_space *vm);
drm/i915: Infrastructure for supporting different GGTT views per object Things like reliable GGTT mappings and mirrored 2d-on-3d display will need to map objects into the same address space multiple times. Added a GGTT view concept and linked it with the VMA to distinguish between multiple instances per address space. New objects and GEM functions which do not take this new view as a parameter assume the default of zero (I915_GGTT_VIEW_NORMAL) which preserves the previous behaviour. This now means that objects can have multiple VMA entries so the code which assumed there will only be one also had to be modified. Alternative GGTT views are supposed to borrow DMA addresses from obj->pages which is DMA mapped on first VMA instantiation and unmapped on the last one going away. v2: * Removed per view special casing in i915_gem_ggtt_prepare / finish_object in favour of creating and destroying DMA mappings on first VMA instantiation and last VMA destruction. (Daniel Vetter) * Simplified i915_vma_unbind which does not need to count the GGTT views. (Daniel Vetter) * Also moved obj->map_and_fenceable reset under the same check. * Checkpatch cleanups. v3: * Only retire objects once the last VMA is unbound. v4: * Keep scatter-gather table for alternative views persistent for the lifetime of the VMA. * Propagate binding errors to callers and handle appropriately. v5: * Explicitly look for normal GGTT view in i915_gem_obj_bound to align usage in i915_gem_object_ggtt_unpin. (Michel Thierry) * Change to single if statement in i915_gem_obj_to_ggtt. (Michel Thierry) * Removed stray semi-colon in i915_gem_object_set_cache_level. For: VIZ-4544 Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Reviewed-by: Michel Thierry <michel.thierry@intel.com> [danvet: Drop hunk from i915_gem_shrink since it's just prettification but upsets a __must_check warning.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-12-11 01:27:58 +08:00
struct i915_vma *
i915_gem_obj_to_vma(struct drm_i915_gem_object *obj,
struct i915_address_space *vm);
struct i915_vma *
i915_gem_obj_to_ggtt_view(struct drm_i915_gem_object *obj,
const struct i915_ggtt_view *view);
drm/i915: Infrastructure for supporting different GGTT views per object Things like reliable GGTT mappings and mirrored 2d-on-3d display will need to map objects into the same address space multiple times. Added a GGTT view concept and linked it with the VMA to distinguish between multiple instances per address space. New objects and GEM functions which do not take this new view as a parameter assume the default of zero (I915_GGTT_VIEW_NORMAL) which preserves the previous behaviour. This now means that objects can have multiple VMA entries so the code which assumed there will only be one also had to be modified. Alternative GGTT views are supposed to borrow DMA addresses from obj->pages which is DMA mapped on first VMA instantiation and unmapped on the last one going away. v2: * Removed per view special casing in i915_gem_ggtt_prepare / finish_object in favour of creating and destroying DMA mappings on first VMA instantiation and last VMA destruction. (Daniel Vetter) * Simplified i915_vma_unbind which does not need to count the GGTT views. (Daniel Vetter) * Also moved obj->map_and_fenceable reset under the same check. * Checkpatch cleanups. v3: * Only retire objects once the last VMA is unbound. v4: * Keep scatter-gather table for alternative views persistent for the lifetime of the VMA. * Propagate binding errors to callers and handle appropriately. v5: * Explicitly look for normal GGTT view in i915_gem_obj_bound to align usage in i915_gem_object_ggtt_unpin. (Michel Thierry) * Change to single if statement in i915_gem_obj_to_ggtt. (Michel Thierry) * Removed stray semi-colon in i915_gem_object_set_cache_level. For: VIZ-4544 Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Reviewed-by: Michel Thierry <michel.thierry@intel.com> [danvet: Drop hunk from i915_gem_shrink since it's just prettification but upsets a __must_check warning.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-12-11 01:27:58 +08:00
struct i915_vma *
i915_gem_obj_lookup_or_create_vma(struct drm_i915_gem_object *obj,
struct i915_address_space *vm);
struct i915_vma *
i915_gem_obj_lookup_or_create_ggtt_vma(struct drm_i915_gem_object *obj,
const struct i915_ggtt_view *view);
static inline struct i915_vma *
i915_gem_obj_to_ggtt(struct drm_i915_gem_object *obj)
{
return i915_gem_obj_to_ggtt_view(obj, &i915_ggtt_view_normal);
}
bool i915_gem_obj_is_pinned(struct drm_i915_gem_object *obj);
/* Some GGTT VM helpers */
#define i915_obj_to_ggtt(obj) \
(&((struct drm_i915_private *)(obj)->base.dev->dev_private)->gtt.base)
static inline bool i915_is_ggtt(struct i915_address_space *vm)
{
struct i915_address_space *ggtt =
&((struct drm_i915_private *)(vm)->dev->dev_private)->gtt.base;
return vm == ggtt;
}
static inline struct i915_hw_ppgtt *
i915_vm_to_ppgtt(struct i915_address_space *vm)
{
WARN_ON(i915_is_ggtt(vm));
return container_of(vm, struct i915_hw_ppgtt, base);
}
static inline bool i915_gem_obj_ggtt_bound(struct drm_i915_gem_object *obj)
{
return i915_gem_obj_ggtt_bound_view(obj, &i915_ggtt_view_normal);
}
static inline unsigned long
i915_gem_obj_ggtt_size(struct drm_i915_gem_object *obj)
{
return i915_gem_obj_size(obj, i915_obj_to_ggtt(obj));
}
static inline int __must_check
i915_gem_obj_ggtt_pin(struct drm_i915_gem_object *obj,
uint32_t alignment,
unsigned flags)
{
return i915_gem_object_pin(obj, i915_obj_to_ggtt(obj),
alignment, flags | PIN_GLOBAL);
}
static inline int
i915_gem_object_ggtt_unbind(struct drm_i915_gem_object *obj)
{
return i915_vma_unbind(i915_gem_obj_to_ggtt(obj));
}
void i915_gem_object_ggtt_unpin_view(struct drm_i915_gem_object *obj,
const struct i915_ggtt_view *view);
static inline void
i915_gem_object_ggtt_unpin(struct drm_i915_gem_object *obj)
{
i915_gem_object_ggtt_unpin_view(obj, &i915_ggtt_view_normal);
}
/* i915_gem_fence.c */
int __must_check i915_gem_object_get_fence(struct drm_i915_gem_object *obj);
int __must_check i915_gem_object_put_fence(struct drm_i915_gem_object *obj);
bool i915_gem_object_pin_fence(struct drm_i915_gem_object *obj);
void i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj);
void i915_gem_restore_fences(struct drm_device *dev);
drm/i915: preliminary context support Very basic code for context setup/destruction in the driver. Adds the file i915_gem_context.c This file implements HW context support. On gen5+ a HW context consists of an opaque GPU object which is referenced at times of context saves and restores. With RC6 enabled, the context is also referenced as the GPU enters and exists from RC6 (GPU has it's own internal power context, except on gen5). Though something like a context does exist for the media ring, the code only supports contexts for the render ring. In software, there is a distinction between contexts created by the user, and the default HW context. The default HW context is used by GPU clients that do not request setup of their own hardware context. The default context's state is never restored to help prevent programming errors. This would happen if a client ran and piggy-backed off another clients GPU state. The default context only exists to give the GPU some offset to load as the current to invoke a save of the context we actually care about. In fact, the code could likely be constructed, albeit in a more complicated fashion, to never use the default context, though that limits the driver's ability to swap out, and/or destroy other contexts. All other contexts are created as a request by the GPU client. These contexts store GPU state, and thus allow GPU clients to not re-emit state (and potentially query certain state) at any time. The kernel driver makes certain that the appropriate commands are inserted. There are 4 entry points into the contexts, init, fini, open, close. The names are self-explanatory except that init can be called during reset, and also during pm thaw/resume. As we expect our context to be preserved across these events, we do not reinitialize in this case. As Adam Jackson pointed out, The cutoff of 1MB where a HW context is considered too big is arbitrary. The reason for this is even though context sizes are increasing with every generation, they have yet to eclipse even 32k. If we somehow read back way more than that, it probably means BIOS has done something strange, or we're running on a platform that wasn't designed for this. v2: rename load/unload to init/fini (daniel) remove ILK support for get_size() (indirectly daniel) add HAS_HW_CONTEXTS macro to clarify supported platforms (daniel) added comments (Ben) Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
2012-06-05 05:42:42 +08:00
/* i915_gem_context.c */
int __must_check i915_gem_context_init(struct drm_device *dev);
drm/i915: preliminary context support Very basic code for context setup/destruction in the driver. Adds the file i915_gem_context.c This file implements HW context support. On gen5+ a HW context consists of an opaque GPU object which is referenced at times of context saves and restores. With RC6 enabled, the context is also referenced as the GPU enters and exists from RC6 (GPU has it's own internal power context, except on gen5). Though something like a context does exist for the media ring, the code only supports contexts for the render ring. In software, there is a distinction between contexts created by the user, and the default HW context. The default HW context is used by GPU clients that do not request setup of their own hardware context. The default context's state is never restored to help prevent programming errors. This would happen if a client ran and piggy-backed off another clients GPU state. The default context only exists to give the GPU some offset to load as the current to invoke a save of the context we actually care about. In fact, the code could likely be constructed, albeit in a more complicated fashion, to never use the default context, though that limits the driver's ability to swap out, and/or destroy other contexts. All other contexts are created as a request by the GPU client. These contexts store GPU state, and thus allow GPU clients to not re-emit state (and potentially query certain state) at any time. The kernel driver makes certain that the appropriate commands are inserted. There are 4 entry points into the contexts, init, fini, open, close. The names are self-explanatory except that init can be called during reset, and also during pm thaw/resume. As we expect our context to be preserved across these events, we do not reinitialize in this case. As Adam Jackson pointed out, The cutoff of 1MB where a HW context is considered too big is arbitrary. The reason for this is even though context sizes are increasing with every generation, they have yet to eclipse even 32k. If we somehow read back way more than that, it probably means BIOS has done something strange, or we're running on a platform that wasn't designed for this. v2: rename load/unload to init/fini (daniel) remove ILK support for get_size() (indirectly daniel) add HAS_HW_CONTEXTS macro to clarify supported platforms (daniel) added comments (Ben) Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
2012-06-05 05:42:42 +08:00
void i915_gem_context_fini(struct drm_device *dev);
void i915_gem_context_reset(struct drm_device *dev);
int i915_gem_context_open(struct drm_device *dev, struct drm_file *file);
int i915_gem_context_enable(struct drm_i915_gem_request *req);
drm/i915: preliminary context support Very basic code for context setup/destruction in the driver. Adds the file i915_gem_context.c This file implements HW context support. On gen5+ a HW context consists of an opaque GPU object which is referenced at times of context saves and restores. With RC6 enabled, the context is also referenced as the GPU enters and exists from RC6 (GPU has it's own internal power context, except on gen5). Though something like a context does exist for the media ring, the code only supports contexts for the render ring. In software, there is a distinction between contexts created by the user, and the default HW context. The default HW context is used by GPU clients that do not request setup of their own hardware context. The default context's state is never restored to help prevent programming errors. This would happen if a client ran and piggy-backed off another clients GPU state. The default context only exists to give the GPU some offset to load as the current to invoke a save of the context we actually care about. In fact, the code could likely be constructed, albeit in a more complicated fashion, to never use the default context, though that limits the driver's ability to swap out, and/or destroy other contexts. All other contexts are created as a request by the GPU client. These contexts store GPU state, and thus allow GPU clients to not re-emit state (and potentially query certain state) at any time. The kernel driver makes certain that the appropriate commands are inserted. There are 4 entry points into the contexts, init, fini, open, close. The names are self-explanatory except that init can be called during reset, and also during pm thaw/resume. As we expect our context to be preserved across these events, we do not reinitialize in this case. As Adam Jackson pointed out, The cutoff of 1MB where a HW context is considered too big is arbitrary. The reason for this is even though context sizes are increasing with every generation, they have yet to eclipse even 32k. If we somehow read back way more than that, it probably means BIOS has done something strange, or we're running on a platform that wasn't designed for this. v2: rename load/unload to init/fini (daniel) remove ILK support for get_size() (indirectly daniel) add HAS_HW_CONTEXTS macro to clarify supported platforms (daniel) added comments (Ben) Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
2012-06-05 05:42:42 +08:00
void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
int i915_switch_context(struct drm_i915_gem_request *req);
struct intel_context *
drm/i915: Get context early in execbuf We need to have the address space when reserving space for the objects. Since the address space and context are tied together, and reserve occurs before context switch (for good reason), we must lookup our context earlier in the process. This leaves some room for optimizations where we no longer need to use ctx_id in certain places. This will be addressed in a subsequent patch. Important tricky bit: Because slow relocations during execbuffer drop struct_mutex Perhaps it would be best to acquire the reference when we get the context, but I'll save that for another day (note I have written the patch before, and I found the changes required to be uglier than this). Note that since we currently access everything via context id, and not the data structure this is fine, though not desirable. The next change attempts to get the context only once via the context ID idr lookup, and as such, the following can happen: CTX-A is created, refcount = 1 CTX-A execbuf, mutex dropped close IOCTL called on CTX-A, refcount = 0 CTX-A resumes in execbuf. v2: Rebased on top of commit b6359918b885da7c7b58c050674278dbd06020ab Author: Mika Kuoppala <mika.kuoppala@linux.intel.com> Date: Wed Oct 30 15:44:16 2013 +0200 drm/i915: add i915_get_reset_stats_ioctl v3: Rebased on top of commit 25b3dfc87bff80317d67ddd2cd4cfb91e6fe7d79 Author: Mika Westerberg <mika.westerberg@linux.intel.com> Date: Tue Nov 12 11:57:30 2013 +0200 Author: Mika Kuoppala <mika.kuoppala@linux.intel.com> Date: Tue Nov 26 16:14:33 2013 +0200 drm/i915: check context reset stats before relocations Signed-off-by: Ben Widawsky <ben@bwidawsk.net> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2013-12-07 06:11:21 +08:00
i915_gem_context_get(struct drm_i915_file_private *file_priv, u32 id);
void i915_gem_context_free(struct kref *ctx_ref);
drm/i915/bdw: A bit more advanced LR context alloc/free Now that we have the ability to allocate our own context backing objects and we have multiplexed one of them per engine inside the context structs, we can finally allocate and free them correctly. Regarding the context size, reading the register to calculate the sizes can work, I think, however the docs are very clear about the actual context sizes on GEN8, so just hardcode that and use it. v2: Rebased on top of the Full PPGTT series. It is important to notice that at this point we have one global default context per engine, all of them using the aliasing PPGTT (as opposed to the single global default context we have with legacy HW contexts). v3: - Go back to one single global default context, this time with multiple backing objects inside. - Use different context sizes for non-render engines, as suggested by Damien (still hardcoded, since the information about the context size registers in the BSpec is, well, *lacking*). - Render ctx size is 20 (or 19) pages, but not 21 (caught by Damien). - Move default context backing object creation to intel_init_ring (so that we don't waste memory in rings that might not get initialized). v4: - Reuse the HW legacy context init/fini. - Create a separate free function. - Rename the functions with an intel_ preffix. v5: Several rebases to account for the changes in the previous patches. Signed-off-by: Ben Widawsky <ben@bwidawsk.net> (v1) Signed-off-by: Oscar Mateo <oscar.mateo@intel.com> Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-25 00:04:14 +08:00
struct drm_i915_gem_object *
i915_gem_alloc_context_obj(struct drm_device *dev, size_t size);
static inline void i915_gem_context_reference(struct intel_context *ctx)
{
kref_get(&ctx->ref);
}
static inline void i915_gem_context_unreference(struct intel_context *ctx)
{
kref_put(&ctx->ref, i915_gem_context_free);
}
static inline bool i915_gem_context_is_default(const struct intel_context *c)
{
drm/i915: Emphasize that ctx->id is merely a user handle This is an Execlists preparatory patch, since they make context ID become an overloaded term: - In the software, it was used to distinguish which context userspace was trying to use. - In the BSpec, the term is used to describe the 20-bits long field the hardware uses to it to discriminate the contexts that are submitted to the ELSP and inform the driver about their current status (via Context Switch Interrupts and Context Status Buffers). Initially, I tried to make the different meanings converge, but it proved impossible: - The software ctx->id is per-filp, while the hardware one needs to be globally unique. - Also, we multiplex several backing states objects per intel_context, and all of them need unique HW IDs. - I tried adding a per-filp ID and then composing the HW context ID as: ctx->id + file_priv->id + ring->id, but the fact that the hardware only uses 20-bits means we have to artificially limit the number of filps or contexts the userspace can create. The ctx->user_handle renaming bits are done with this Cocci patch (plus manual frobbing of the struct declaration): @@ struct intel_context c; @@ - (c).id + c.user_handle @@ struct intel_context *c; @@ - (c)->id + c->user_handle Also, while we are at it, s/DEFAULT_CONTEXT_ID/DEFAULT_CONTEXT_HANDLE and change the type to unsigned 32 bits. v2: s/handle/user_handle and change the type to uint32_t as suggested by Chris Wilson. Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org> (v1) Signed-off-by: Oscar Mateo <oscar.mateo@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-03 23:28:00 +08:00
return c->user_handle == DEFAULT_CONTEXT_HANDLE;
}
int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
struct drm_file *file);
int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
struct drm_file *file);
int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
i915: add dmabuf/prime buffer sharing support. This adds handle->fd and fd->handle support to i915, this is to allow for offloading of rendering in one direction and outputs in the other. v2 from Daniel Vetter: - fixup conflicts with the prepare/finish gtt prep work. - implement ppgtt binding support. Note that we have squat i-g-t testcoverage for any of the lifetime and access rules dma_buf/prime support brings along. And there are quite a few intricate situations here. Also note that the integration with the existing code is a bit hackish, especially around get_gtt_pages and put_gtt_pages. It imo would be easier with the prep code from Chris Wilson's unbound series, but that is for 3.6. Also note that I didn't bother to put the new prepare/finish gtt hooks to good use by moving the dma_buf_map/unmap_attachment calls in there (like we've originally planned for). Last but not least this patch is only compile-tested, but I've changed very little compared to Dave Airlie's version. So there's a decent chance v2 on drm-next works as well as v1 on 3.4-rc. v3: Right when I've hit sent I've noticed that I've screwed up one obj->sg_list (for dmar support) and obj->sg_table (for prime support) disdinction. We should be able to merge these 2 paths, but that's material for another patch. v4: fix the error reporting bugs pointed out by ickle. v5: fix another error, and stop non-gtt mmaps on shared objects stop pread/pwrite on imported objects, add fake kmap Signed-off-by: Dave Airlie <airlied@redhat.com> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-05-10 21:25:09 +08:00
/* i915_gem_evict.c */
int __must_check i915_gem_evict_something(struct drm_device *dev,
struct i915_address_space *vm,
int min_size,
unsigned alignment,
unsigned cache_level,
drm/i915: Prevent negative relocation deltas from wrapping This is pure evil. Userspace, I'm looking at you SNA, repacks batch buffers on the fly after generation as they are being passed to the kernel for execution. These batches also contain self-referenced relocations as a single buffer encompasses the state commands, kernels, vertices and sampler. During generation the buffers are placed at known offsets within the full batch, and then the relocation deltas (as passed to the kernel) are tweaked as the batch is repacked into a smaller buffer. This means that userspace is passing negative relocations deltas, which subsequently wrap to large values if the batch is at a low address. The GPU hangs when it then tries to use the large value as a base for its address offsets, rather than wrapping back to the real value (as one would hope). As the GPU uses positive offsets from the base, we can treat the relocation address as the minimum address read by the GPU. For the upper bound, we trust that userspace will not read beyond the end of the buffer. So, how do we fix negative relocations from wrapping? We can either check that every relocation looks valid when we write it, and then position each object such that we prevent the offset wraparound, or we just special-case the self-referential behaviour of SNA and force all batches to be above 256k. Daniel prefers the latter approach. This fixes a GPU hang when it tries to use an address (relocation + offset) greater than the GTT size. The issue would occur quite easily with full-ppgtt as each fd gets its own VM space, so low offsets would often be handed out. However, with the rearrangement of the low GTT due to capturing the BIOS framebuffer, it is already affecting kernels 3.15 onwards. I think only IVB+ is susceptible to this bug, but the workaround should only kick in rarely, so it seems sensible to always apply it. v3: Use a bias for batch buffers to prevent small negative delta relocations from wrapping. v4 from Daniel: - s/BIAS/BATCH_OFFSET_BIAS/ - Extract eb_vma_misplaced/i915_vma_misplaced since the conditions were growing rather cumbersome. - Add a comment to eb_get_batch explaining why we do this. - Apply the batch offset bias everywhere but mention that we've only observed it on gen7 gpus. - Drop PIN_OFFSET_FIX for now, that slipped in from a feature patch. v5: Add static to eb_get_batch, spotted by 0-day tester. Testcase: igt/gem_bad_reloc Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78533 Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v3) Cc: stable@vger.kernel.org Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-23 14:48:08 +08:00
unsigned long start,
unsigned long end,
unsigned flags);
int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle);
int i915_gem_evict_everything(struct drm_device *dev);
/* belongs in i915_gem_gtt.h */
static inline void i915_gem_chipset_flush(struct drm_device *dev)
drm/i915: Stop using AGP layer for GEN6+ As a quick hack we make the old intel_gtt structure mutable so we can fool a bunch of the existing code which depends on elements in that data structure. We can/should try to remove this in a subsequent patch. This should preserve the old gtt init behavior which upon writing these patches seems incorrect. The next patch will fix these things. The one exception is VLV which doesn't have the preserved flush control write behavior. Since we want to do that for all GEN6+ stuff, we'll handle that in a later patch. Mainstream VLV support doesn't actually exist yet anyway. v2: Update the comment to remove the "voodoo" Check that the last pte written matches what we readback v3: actually kill cache_level_to_agp_type since most of the flags will disappear in an upcoming patch v4: v3 was actually not what we wanted (Daniel) Make the ggtt bind assertions better and stricter (Chris) Fix some uncaught errors at gtt init (Chris) Some other random stuff that Chris wanted v5: check for i==0 in gen6_ggtt_bind_object to shut up gcc (Ben) Signed-off-by: Ben Widawsky <ben@bwidawsk.net> Reviewed-by [v4]: Chris Wilson <chris@chris-wilson.co.uk> [danvet: Make the cache_level -> agp_flags conversion for pre-gen6 a tad more robust by mapping everything != CACHE_NONE to the cached agp flag - we have a 1:1 uncached mapping, but different modes of cacheable (at least on later generations). Suggested by Chris Wilson.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-11-05 01:21:27 +08:00
{
if (INTEL_INFO(dev)->gen < 6)
intel_gtt_chipset_flush();
}
/* i915_gem_stolen.c */
int i915_gem_stolen_insert_node(struct drm_i915_private *dev_priv,
struct drm_mm_node *node, u64 size,
unsigned alignment);
void i915_gem_stolen_remove_node(struct drm_i915_private *dev_priv,
struct drm_mm_node *node);
int i915_gem_init_stolen(struct drm_device *dev);
void i915_gem_cleanup_stolen(struct drm_device *dev);
struct drm_i915_gem_object *
i915_gem_object_create_stolen(struct drm_device *dev, u32 size);
struct drm_i915_gem_object *
i915_gem_object_create_stolen_for_preallocated(struct drm_device *dev,
u32 stolen_offset,
u32 gtt_offset,
u32 size);
/* i915_gem_shrinker.c */
unsigned long i915_gem_shrink(struct drm_i915_private *dev_priv,
long target,
unsigned flags);
#define I915_SHRINK_PURGEABLE 0x1
#define I915_SHRINK_UNBOUND 0x2
#define I915_SHRINK_BOUND 0x4
unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv);
void i915_gem_shrinker_init(struct drm_i915_private *dev_priv);
/* i915_gem_tiling.c */
static inline bool i915_gem_object_needs_bit17_swizzle(struct drm_i915_gem_object *obj)
{
struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
return dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_9_10_17 &&
obj->tiling_mode != I915_TILING_NONE;
}
void i915_gem_detect_bit_6_swizzle(struct drm_device *dev);
void i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj);
void i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj);
/* i915_gem_debug.c */
#if WATCH_LISTS
int i915_verify_lists(struct drm_device *dev);
#else
#define i915_verify_lists(dev) 0
#endif
/* i915_debugfs.c */
int i915_debugfs_init(struct drm_minor *minor);
void i915_debugfs_cleanup(struct drm_minor *minor);
#ifdef CONFIG_DEBUG_FS
int i915_debugfs_connector_add(struct drm_connector *connector);
void intel_display_crc_init(struct drm_device *dev);
#else
static inline int i915_debugfs_connector_add(struct drm_connector *connector)
{ return 0; }
static inline void intel_display_crc_init(struct drm_device *dev) {}
#endif
/* i915_gpu_error.c */
__printf(2, 3)
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
const struct i915_error_state_file_priv *error);
int i915_error_state_buf_init(struct drm_i915_error_state_buf *eb,
struct drm_i915_private *i915,
size_t count, loff_t pos);
static inline void i915_error_state_buf_release(
struct drm_i915_error_state_buf *eb)
{
kfree(eb->buf);
}
void i915_capture_error_state(struct drm_device *dev, bool wedge,
const char *error_msg);
void i915_error_state_get(struct drm_device *dev,
struct i915_error_state_file_priv *error_priv);
void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
void i915_destroy_error_state(struct drm_device *dev);
void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone);
const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
/* i915_cmd_parser.c */
int i915_cmd_parser_get_version(void);
int i915_cmd_parser_init_ring(struct intel_engine_cs *ring);
void i915_cmd_parser_fini_ring(struct intel_engine_cs *ring);
bool i915_needs_cmd_parser(struct intel_engine_cs *ring);
int i915_parse_cmds(struct intel_engine_cs *ring,
struct drm_i915_gem_object *batch_obj,
drm/i915: Use batch pools with the command parser This patch sets up all of the tracking and copying necessary to use batch pools with the command parser and dispatches the copied (shadow) batch to the hardware. After this patch, the parser is in 'enabling' mode. Note that performance takes a hit from the copy in some cases and will likely need some work. At a rough pass, the memcpy appears to be the bottleneck. Without having done a deeper analysis, two ideas that come to mind are: 1) Copy sections of the batch at a time, as they are reached by parsing. Might improve cache locality. 2) Copy only up to the userspace-supplied batch length and memset the rest of the buffer. Reduces the number of reads. v2: - Remove setting the capacity of the pool - One global pool instead of per-ring pools - Replace batch_obj with shadow_batch_obj and hook into eb->vmas - Memset any space in the shadow batch beyond what gets copied - Rebased on execlist prep refactoring v3: - Rebase on chained batch handling - Squash in setting the secure dispatch flag - Add a note about the interaction w/secure dispatch pinning - Check for request->batch_obj == NULL in i915_gem_free_request v4: - Fix read domains for shadow_batch_obj - Remove the set_to_gtt_domain call from i915_parse_cmds - ggtt_pin/unpin in the parser block to simplify error handling - Check USES_FULL_PPGTT before setting DISPATCH_SECURE flag - Remove i915_gem_batch_pool_put calls v5: - Move 'pending_read_domains |= I915_GEM_DOMAIN_COMMAND' after the parser (danvet, from v4 0/7 feedback) Issue: VIZ-4719 Signed-off-by: Brad Volkin <bradley.d.volkin@intel.com> Reviewed-By: Jon Bloomfield <jon.bloomfield@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-12-12 04:13:09 +08:00
struct drm_i915_gem_object *shadow_batch_obj,
u32 batch_start_offset,
u32 batch_len,
bool is_master);
/* i915_suspend.c */
extern int i915_save_state(struct drm_device *dev);
extern int i915_restore_state(struct drm_device *dev);
drm/i915: rc6 in sysfs Merge rc6 information into the power group for our device. Until now the i915 driver has not had any sysfs entries (aside from the connector stuff enabled by drm core). Since it seems like we're likely to have more in the future I created a new file for sysfs stubs, as well as the rc6 sysfs functions which don't really belong elsewhere (perhaps i915_suspend, but most of the stuff is in intel_display,c). displays rc6 modes enabled (as a hex mask): cat /sys/class/drm/card0/power/rc6_enable displays #ms GPU has been in rc6 since boot: cat /sys/class/drm/card0/power/rc6_residency_ms displays #ms GPU has been in deep rc6 since boot: cat /sys/class/drm/card0/power/rc6p_residency_ms displays #ms GPU has been in deepest rc6 since boot: cat /sys/class/drm/card0/power/rc6pp_residency_ms Important note: I've seen on SNB that even when RC6 is *not* enabled the rc6 register seems to have a random value in it. I can only guess at the reason reason for this. Those writing tools that utilize this value need to be careful and probably want to scrutinize the value very carefully. v2: use common rc6 residency units to milliseconds for the other RC6 types v3: don't create sysfs files for GEN <= 5 add a rc6_enable to show a mask of enabled rc6 types use unmerge instead of remove for sysfs group squash intel_enable_rc6() extraction into this patch v4: rename sysfs files (Chris) CC: Chris Wilson <chris@chris-wilson.co.uk> CC: Daniel Vetter <daniel.vetter@ffwll.ch>f CC: Arjan van de Ven <arjan@linux.intel.com> Signed-off-by: Ben Widawsky <benjamin.widawsky@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> [danvet: squash in the 64bit division fix by Chris Wilson.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-04-11 12:17:01 +08:00
/* i915_sysfs.c */
void i915_setup_sysfs(struct drm_device *dev_priv);
void i915_teardown_sysfs(struct drm_device *dev_priv);
/* intel_i2c.c */
extern int intel_setup_gmbus(struct drm_device *dev);
extern void intel_teardown_gmbus(struct drm_device *dev);
extern bool intel_gmbus_is_valid_pin(struct drm_i915_private *dev_priv,
unsigned int pin);
extern struct i2c_adapter *
intel_gmbus_get_adapter(struct drm_i915_private *dev_priv, unsigned int pin);
extern void intel_gmbus_set_speed(struct i2c_adapter *adapter, int speed);
extern void intel_gmbus_force_bit(struct i2c_adapter *adapter, bool force_bit);
static inline bool intel_gmbus_is_forced_bit(struct i2c_adapter *adapter)
{
return container_of(adapter, struct intel_gmbus, adapter)->force_bit;
}
extern void intel_i2c_reset(struct drm_device *dev);
/* intel_opregion.c */
#ifdef CONFIG_ACPI
extern int intel_opregion_setup(struct drm_device *dev);
extern void intel_opregion_init(struct drm_device *dev);
extern void intel_opregion_fini(struct drm_device *dev);
extern void intel_opregion_asle_intr(struct drm_device *dev);
extern int intel_opregion_notify_encoder(struct intel_encoder *intel_encoder,
bool enable);
extern int intel_opregion_notify_adapter(struct drm_device *dev,
pci_power_t state);
#else
static inline int intel_opregion_setup(struct drm_device *dev) { return 0; }
static inline void intel_opregion_init(struct drm_device *dev) { return; }
static inline void intel_opregion_fini(struct drm_device *dev) { return; }
static inline void intel_opregion_asle_intr(struct drm_device *dev) { return; }
static inline int
intel_opregion_notify_encoder(struct intel_encoder *intel_encoder, bool enable)
{
return 0;
}
static inline int
intel_opregion_notify_adapter(struct drm_device *dev, pci_power_t state)
{
return 0;
}
#endif
/* intel_acpi.c */
#ifdef CONFIG_ACPI
extern void intel_register_dsm_handler(void);
extern void intel_unregister_dsm_handler(void);
#else
static inline void intel_register_dsm_handler(void) { return; }
static inline void intel_unregister_dsm_handler(void) { return; }
#endif /* CONFIG_ACPI */
/* modesetting */
extern void intel_modeset_init_hw(struct drm_device *dev);
extern void intel_modeset_init(struct drm_device *dev);
extern void intel_modeset_gem_init(struct drm_device *dev);
extern void intel_modeset_cleanup(struct drm_device *dev);
extern void intel_connector_unregister(struct intel_connector *);
extern int intel_modeset_vga_set_state(struct drm_device *dev, bool state);
extern void intel_display_resume(struct drm_device *dev);
extern void i915_redisable_vga(struct drm_device *dev);
extern void i915_redisable_vga_power_on(struct drm_device *dev);
extern bool ironlake_set_drps(struct drm_device *dev, u8 val);
extern void intel_init_pch_refclk(struct drm_device *dev);
extern void intel_set_rps(struct drm_device *dev, u8 val);
extern void intel_set_memory_cxsr(struct drm_i915_private *dev_priv,
bool enable);
extern void intel_detect_pch(struct drm_device *dev);
extern int intel_trans_dp_port_sel(struct drm_crtc *crtc);
drm/i915: rc6 in sysfs Merge rc6 information into the power group for our device. Until now the i915 driver has not had any sysfs entries (aside from the connector stuff enabled by drm core). Since it seems like we're likely to have more in the future I created a new file for sysfs stubs, as well as the rc6 sysfs functions which don't really belong elsewhere (perhaps i915_suspend, but most of the stuff is in intel_display,c). displays rc6 modes enabled (as a hex mask): cat /sys/class/drm/card0/power/rc6_enable displays #ms GPU has been in rc6 since boot: cat /sys/class/drm/card0/power/rc6_residency_ms displays #ms GPU has been in deep rc6 since boot: cat /sys/class/drm/card0/power/rc6p_residency_ms displays #ms GPU has been in deepest rc6 since boot: cat /sys/class/drm/card0/power/rc6pp_residency_ms Important note: I've seen on SNB that even when RC6 is *not* enabled the rc6 register seems to have a random value in it. I can only guess at the reason reason for this. Those writing tools that utilize this value need to be careful and probably want to scrutinize the value very carefully. v2: use common rc6 residency units to milliseconds for the other RC6 types v3: don't create sysfs files for GEN <= 5 add a rc6_enable to show a mask of enabled rc6 types use unmerge instead of remove for sysfs group squash intel_enable_rc6() extraction into this patch v4: rename sysfs files (Chris) CC: Chris Wilson <chris@chris-wilson.co.uk> CC: Daniel Vetter <daniel.vetter@ffwll.ch>f CC: Arjan van de Ven <arjan@linux.intel.com> Signed-off-by: Ben Widawsky <benjamin.widawsky@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> [danvet: squash in the 64bit division fix by Chris Wilson.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2012-04-11 12:17:01 +08:00
extern int intel_enable_rc6(const struct drm_device *dev);
extern bool i915_semaphore_is_enabled(struct drm_device *dev);
int i915_reg_read_ioctl(struct drm_device *dev, void *data,
struct drm_file *file);
int i915_get_reset_stats_ioctl(struct drm_device *dev, void *data,
struct drm_file *file);
/* overlay */
extern struct intel_overlay_error_state *intel_overlay_capture_error_state(struct drm_device *dev);
extern void intel_overlay_print_error_state(struct drm_i915_error_state_buf *e,
struct intel_overlay_error_state *error);
extern struct intel_display_error_state *intel_display_capture_error_state(struct drm_device *dev);
extern void intel_display_print_error_state(struct drm_i915_error_state_buf *e,
struct drm_device *dev,
struct intel_display_error_state *error);
int sandybridge_pcode_read(struct drm_i915_private *dev_priv, u32 mbox, u32 *val);
int sandybridge_pcode_write(struct drm_i915_private *dev_priv, u32 mbox, u32 val);
/* intel_sideband.c */
u32 vlv_punit_read(struct drm_i915_private *dev_priv, u32 addr);
void vlv_punit_write(struct drm_i915_private *dev_priv, u32 addr, u32 val);
u32 vlv_nc_read(struct drm_i915_private *dev_priv, u8 addr);
u32 vlv_gpio_nc_read(struct drm_i915_private *dev_priv, u32 reg);
void vlv_gpio_nc_write(struct drm_i915_private *dev_priv, u32 reg, u32 val);
u32 vlv_cck_read(struct drm_i915_private *dev_priv, u32 reg);
void vlv_cck_write(struct drm_i915_private *dev_priv, u32 reg, u32 val);
u32 vlv_ccu_read(struct drm_i915_private *dev_priv, u32 reg);
void vlv_ccu_write(struct drm_i915_private *dev_priv, u32 reg, u32 val);
u32 vlv_bunit_read(struct drm_i915_private *dev_priv, u32 reg);
void vlv_bunit_write(struct drm_i915_private *dev_priv, u32 reg, u32 val);
u32 vlv_gps_core_read(struct drm_i915_private *dev_priv, u32 reg);
void vlv_gps_core_write(struct drm_i915_private *dev_priv, u32 reg, u32 val);
u32 vlv_dpio_read(struct drm_i915_private *dev_priv, enum pipe pipe, int reg);
void vlv_dpio_write(struct drm_i915_private *dev_priv, enum pipe pipe, int reg, u32 val);
u32 intel_sbi_read(struct drm_i915_private *dev_priv, u16 reg,
enum intel_sbi_destination destination);
void intel_sbi_write(struct drm_i915_private *dev_priv, u16 reg, u32 value,
enum intel_sbi_destination destination);
u32 vlv_flisdsi_read(struct drm_i915_private *dev_priv, u32 reg);
void vlv_flisdsi_write(struct drm_i915_private *dev_priv, u32 reg, u32 val);
int intel_gpu_freq(struct drm_i915_private *dev_priv, int val);
int intel_freq_opcode(struct drm_i915_private *dev_priv, int val);
#define I915_READ8(reg) dev_priv->uncore.funcs.mmio_readb(dev_priv, (reg), true)
#define I915_WRITE8(reg, val) dev_priv->uncore.funcs.mmio_writeb(dev_priv, (reg), (val), true)
#define I915_READ16(reg) dev_priv->uncore.funcs.mmio_readw(dev_priv, (reg), true)
#define I915_WRITE16(reg, val) dev_priv->uncore.funcs.mmio_writew(dev_priv, (reg), (val), true)
#define I915_READ16_NOTRACE(reg) dev_priv->uncore.funcs.mmio_readw(dev_priv, (reg), false)
#define I915_WRITE16_NOTRACE(reg, val) dev_priv->uncore.funcs.mmio_writew(dev_priv, (reg), (val), false)
#define I915_READ(reg) dev_priv->uncore.funcs.mmio_readl(dev_priv, (reg), true)
#define I915_WRITE(reg, val) dev_priv->uncore.funcs.mmio_writel(dev_priv, (reg), (val), true)
#define I915_READ_NOTRACE(reg) dev_priv->uncore.funcs.mmio_readl(dev_priv, (reg), false)
#define I915_WRITE_NOTRACE(reg, val) dev_priv->uncore.funcs.mmio_writel(dev_priv, (reg), (val), false)
/* Be very careful with read/write 64-bit values. On 32-bit machines, they
* will be implemented using 2 32-bit writes in an arbitrary order with
* an arbitrary delay between them. This can cause the hardware to
* act upon the intermediate value, possibly leading to corruption and
* machine death. You have been warned.
*/
#define I915_WRITE64(reg, val) dev_priv->uncore.funcs.mmio_writeq(dev_priv, (reg), (val), true)
#define I915_READ64(reg) dev_priv->uncore.funcs.mmio_readq(dev_priv, (reg), true)
#define I915_READ64_2x32(lower_reg, upper_reg) ({ \
u32 upper = I915_READ(upper_reg); \
u32 lower = I915_READ(lower_reg); \
u32 tmp = I915_READ(upper_reg); \
if (upper != tmp) { \
upper = tmp; \
lower = I915_READ(lower_reg); \
WARN_ON(I915_READ(upper_reg) != upper); \
} \
(u64)upper << 32 | lower; })
#define POSTING_READ(reg) (void)I915_READ_NOTRACE(reg)
#define POSTING_READ16(reg) (void)I915_READ16_NOTRACE(reg)
/* These are untraced mmio-accessors that are only valid to be used inside
* criticial sections inside IRQ handlers where forcewake is explicitly
* controlled.
* Think twice, and think again, before using these.
* Note: Should only be used between intel_uncore_forcewake_irqlock() and
* intel_uncore_forcewake_irqunlock().
*/
#define I915_READ_FW(reg__) readl(dev_priv->regs + (reg__))
#define I915_WRITE_FW(reg__, val__) writel(val__, dev_priv->regs + (reg__))
#define POSTING_READ_FW(reg__) (void)I915_READ_FW(reg__)
/* "Broadcast RGB" property */
#define INTEL_BROADCAST_RGB_AUTO 0
#define INTEL_BROADCAST_RGB_FULL 1
#define INTEL_BROADCAST_RGB_LIMITED 2
static inline uint32_t i915_vgacntrl_reg(struct drm_device *dev)
{
if (IS_VALLEYVIEW(dev))
return VLV_VGACNTRL;
else if (INTEL_INFO(dev)->gen >= 5)
return CPU_VGACNTRL;
else
return VGACNTRL;
}
static inline void __user *to_user_ptr(u64 address)
{
return (void __user *)(uintptr_t)address;
}
static inline unsigned long msecs_to_jiffies_timeout(const unsigned int m)
{
unsigned long j = msecs_to_jiffies(m);
return min_t(unsigned long, MAX_JIFFY_OFFSET, j + 1);
}
static inline unsigned long nsecs_to_jiffies_timeout(const u64 n)
{
return min_t(u64, MAX_JIFFY_OFFSET, nsecs_to_jiffies64(n) + 1);
}
static inline unsigned long
timespec_to_jiffies_timeout(const struct timespec *value)
{
unsigned long j = timespec_to_jiffies(value);
return min_t(unsigned long, MAX_JIFFY_OFFSET, j + 1);
}
/*
* If you need to wait X milliseconds between events A and B, but event B
* doesn't happen exactly after event A, you record the timestamp (jiffies) of
* when event A happened, then just before event B you call this function and
* pass the timestamp as the first argument, and X as the second argument.
*/
static inline void
wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
{
unsigned long target_jiffies, tmp_jiffies, remaining_jiffies;
/*
* Don't re-read the value of "jiffies" every time since it may change
* behind our back and break the math.
*/
tmp_jiffies = jiffies;
target_jiffies = timestamp_jiffies +
msecs_to_jiffies_timeout(to_wait_ms);
if (time_after(target_jiffies, tmp_jiffies)) {
remaining_jiffies = target_jiffies - tmp_jiffies;
while (remaining_jiffies)
remaining_jiffies =
schedule_timeout_uninterruptible(remaining_jiffies);
}
}
static inline void i915_trace_irq_get(struct intel_engine_cs *ring,
struct drm_i915_gem_request *req)
{
if (ring->trace_irq_req == NULL && ring->irq_get(ring))
i915_gem_request_assign(&ring->trace_irq_req, req);
}
#endif