iris: enable generated indirect draws

This mirrors the ring buffer mode we have in Anv.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26797>
This commit is contained in:
Lionel Landwerlin 2023-12-20 12:15:43 +02:00 committed by Marge Bot
parent d754ed5330
commit 5438b19104
15 changed files with 1143 additions and 7 deletions

View File

@ -299,7 +299,7 @@ if ['x86_64'].contains(host_machine.cpu_family()) and \
get_option('intel-clc') != 'system'
# Require intel-clc with Anv & Iris (for internal shaders)
with_intel_clc = get_option('intel-clc') == 'enabled' or \
with_intel_vk
with_intel_vk or with_gallium_iris
else
with_intel_clc = false
endif

View File

@ -13,6 +13,7 @@ DRI_CONF_SECTION_PERFORMANCE
DRI_CONF_ADAPTIVE_SYNC(true)
DRI_CONF_OPT_E(bo_reuse, 1, 0, 1, "Buffer object reuse",)
DRI_CONF_OPT_B(intel_tbimr, true, "Enable TBIMR tiled rendering")
DRI_CONF_OPT_I(generated_indirect_threshold, 100, 0, INT32_MAX, "Generated indirect draw threshold")
DRI_CONF_SECTION_END
DRI_CONF_SECTION_QUALITY

View File

@ -243,6 +243,12 @@ iris_batch_bytes_used(struct iris_batch *batch)
return batch->map_next - batch->map;
}
/* Return the GPU virtual address of the current write position
 * (map_next) within the batch buffer.
 */
static inline uint64_t
iris_batch_current_address_u64(struct iris_batch *batch)
{
   const uint64_t bytes_written = batch->map_next - batch->map;
   return batch->bo->address + bytes_written;
}
/**
* Ensure the current command buffer has \param size bytes of space
* remaining. If not, this creates a secondary batch buffer and emits

View File

@ -119,6 +119,23 @@ iris_binder_reserve(struct iris_context *ice,
return binder_insert(binder, size);
}
/**
 * Reserve and record binder space for the indirect draw generation
 * shader (FS stage only): a single 32-bit binding table entry.
 */
void
iris_binder_reserve_gen(struct iris_context *ice)
{
   struct iris_binder *binder = &ice->state.binder;
   const unsigned bt_size = sizeof(uint32_t);

   binder->bt_offset[MESA_SHADER_FRAGMENT] = iris_binder_reserve(ice, bt_size);

   iris_record_state_size(ice->state.sizes,
                          binder->bo->address +
                          binder->bt_offset[MESA_SHADER_FRAGMENT],
                          bt_size);
}
/**
* Reserve and record binder space for 3D pipeline shader stages.
*

View File

@ -59,6 +59,7 @@ void iris_init_binder(struct iris_context *ice);
void iris_destroy_binder(struct iris_binder *binder);
uint32_t iris_binder_reserve(struct iris_context *ice, unsigned size);
void iris_binder_reserve_3d(struct iris_context *ice);
void iris_binder_reserve_gen(struct iris_context *ice);
void iris_binder_reserve_compute(struct iris_context *ice);
#endif

View File

@ -714,6 +714,28 @@ struct iris_context {
* drawid and is_indexed_draw. They will go in their own vertex element.
*/
struct iris_state_ref derived_draw_params;
struct {
/**
* Generation fragment shader
*/
struct iris_compiled_shader *shader;
/**
* Ring buffer where to generate indirect draw commands
*/
struct iris_bo *ring_bo;
/**
* Allocated iris_gen_indirect_params
*/
struct iris_state_ref params;
/**
* Vertices used to dispatch the generated fragment shaders
*/
struct iris_state_ref vertices;
} generation;
} draw;
struct {
@ -930,6 +952,60 @@ struct iris_context {
} state;
};
/**
 * Push constant data handed over to the indirect draw generation shader.
 *
 * The layout must match what the generation shader library
 * (libiris_write_draw) expects; see iris_call_generation_shader().
 */
struct iris_gen_indirect_params {
   /**
    * Address of iris_context:draw:generation:ring_bo (where the generated
    * draw commands are written)
    */
   uint64_t generated_cmds_addr;
   /**
    * Address of the indirect data to draw with
    */
   uint64_t indirect_data_addr;
   /**
    * Address inside iris_context:draw:generation:ring_bo where the draw ids
    * are written
    */
   uint64_t draw_id_addr;
   /**
    * Address of the indirect count (can be null, in which case max_draw_count
    * is used)
    */
   uint64_t draw_count_addr;
   /**
    * Address to jump to in order to generate more draws
    */
   uint64_t gen_addr;
   /**
    * Address to jump to in order to end the generated draws
    */
   uint64_t end_addr;
   /**
    * Stride between the indirect draw data entries
    */
   uint32_t indirect_data_stride;
   /**
    * Base index of the current generated draws in the ring buffer (increments
    * by ring_count)
    */
   uint32_t draw_base;
   /**
    * Maximum number of generated draws if draw_count_addr is null
    */
   uint32_t max_draw_count;
   /**
    * Packed field:
    *   bits 0-7:   ANV_GENERATED_FLAG_*
    *   bits 8-15:  vertex buffer mocs
    *   bits 16-23: stride between generated commands
    */
   uint32_t flags;
   /**
    * Number of items to generate in the ring buffer
    */
   uint32_t ring_count;
};
#define perf_debug(dbg, ...) do { \
if (INTEL_DEBUG(DEBUG_PERF)) \
dbg_printf(__VA_ARGS__); \
@ -1134,6 +1210,9 @@ bool iris_blorp_upload_shader(struct blorp_batch *blorp_batch, uint32_t stage,
uint32_t *kernel_out,
void *prog_data_out);
void iris_ensure_indirect_generation_shader(struct iris_batch *batch);
/* iris_resolve.c */
void iris_predraw_resolve_inputs(struct iris_context *ice,

View File

@ -196,6 +196,14 @@ iris_simple_draw_vbo(struct iris_context *ice,
batch->screen->vtbl.upload_render_state(ice, batch, draw, drawid_offset, indirect, sc);
}
/* Whether this indirect draw should go through the generation-shader
 * path: only when indirect info is present and the draw count reaches
 * the driconf "generated_indirect_threshold" value.
 */
static inline bool
iris_use_draw_indirect_generation(const struct iris_screen *screen,
                                  const struct pipe_draw_indirect_info *dindirect)
{
   if (dindirect == NULL)
      return false;

   return dindirect->draw_count >= screen->driconf.generated_indirect_threshold;
}
static void
iris_indirect_draw_vbo(struct iris_context *ice,
const struct pipe_draw_info *dinfo,
@ -204,6 +212,7 @@ iris_indirect_draw_vbo(struct iris_context *ice,
const struct pipe_draw_start_count_bias *draw)
{
struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
struct iris_screen *screen = batch->screen;
struct pipe_draw_info info = *dinfo;
struct pipe_draw_indirect_info indirect = *dindirect;
const bool use_predicate =
@ -217,7 +226,14 @@ iris_indirect_draw_vbo(struct iris_context *ice,
iris_update_draw_parameters(ice, &info, drawid_offset, &indirect, draw);
batch->screen->vtbl.upload_indirect_render_state(ice, &info, &indirect, draw);
screen->vtbl.upload_indirect_render_state(ice, &info, &indirect, draw);
} else if (iris_use_draw_indirect_generation(screen, &indirect)) {
iris_batch_maybe_flush(batch, 1500);
iris_update_draw_parameters(ice, &info, drawid_offset, &indirect, draw);
screen->vtbl.upload_indirect_shader_render_state(
ice, &info, &indirect, draw);
} else {
iris_emit_buffer_barrier_for(batch, iris_resource_bo(indirect.buffer),
IRIS_DOMAIN_VF_READ);
@ -231,7 +247,7 @@ iris_indirect_draw_vbo(struct iris_context *ice,
if (use_predicate) {
/* Upload MI_PREDICATE_RESULT to GPR15.*/
batch->screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT);
screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT);
}
for (int i = 0; i < indirect.draw_count; i++) {
@ -245,7 +261,7 @@ iris_indirect_draw_vbo(struct iris_context *ice,
if (use_predicate) {
/* Restore MI_PREDICATE_RESULT. */
batch->screen->vtbl.load_register_reg64(batch, MI_PREDICATE_RESULT, CS_GPR(15));
screen->vtbl.load_register_reg64(batch, MI_PREDICATE_RESULT, CS_GPR(15));
}
}
@ -307,7 +323,19 @@ iris_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info,
iris_predraw_flush_buffers(ice, batch, stage);
}
iris_binder_reserve_3d(ice);
/* If we're going to use the generation shader, we need to allocate a
* binding table entry for it on <= Gfx9 because that platform does not
* have a null-rendertarget bit in the send message to the render cache,
* the EOT message might pollute later writes to the actual RT of the
* draws.
*
* The generation will call iris_binder_reserve_3d() after the generation
* draw call.
*/
if (iris_use_draw_indirect_generation(screen, indirect) && devinfo->ver <= 9)
iris_binder_reserve_gen(ice);
else
iris_binder_reserve_3d(ice);
batch->screen->vtbl.update_binder_address(batch, &ice->state.binder);

View File

@ -162,3 +162,10 @@ rw_bo(struct iris_bo *bo, uint64_t offset, enum iris_domain access)
return (struct iris_address) { .bo = bo, .offset = offset,
.access = access };
}
UNUSED static struct iris_address
iris_address_add(struct iris_address addr, uint64_t offset)
{
addr.offset += offset;
return addr;
}

View File

@ -74,3 +74,12 @@ void genX(math_add32_gpr0)(struct iris_context *ice,
void genX(math_div32_gpr0)(struct iris_context *ice,
struct iris_batch *batch,
uint32_t D);
/* iris_indirect_gen.c */
void genX(init_screen_gen_state)(struct iris_screen *screen);
struct iris_gen_indirect_params *
genX(emit_indirect_generate)(struct iris_batch *batch,
const struct pipe_draw_info *draw,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count_bias *sc,
struct iris_address *out_params_addr);

View File

@ -0,0 +1,650 @@
/* Copyright © 2023 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include <stdio.h>
#include <errno.h>
#ifdef HAVE_VALGRIND
#include <valgrind.h>
#include <memcheck.h>
#define VG(x) x
#else
#define VG(x)
#endif
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "util/u_upload_mgr.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_serialize.h"
#include "intel/compiler/brw_compiler.h"
#include "intel/common/intel_aux_map.h"
#include "intel/common/intel_l3_config.h"
#include "intel/common/intel_sample_positions.h"
#include "intel/ds/intel_tracepoints.h"
#include "iris_batch.h"
#include "iris_context.h"
#include "iris_defines.h"
#include "iris_pipe.h"
#include "iris_resource.h"
#include "iris_utrace.h"
#include "iris_genx_macros.h"
#include "intel/common/intel_genX_state.h"
#include "drm-uapi/i915_drm.h"
#include "libintel_shaders.h"
#if GFX_VERx10 == 80
# include "intel_gfx8_shaders_code.h"
#elif GFX_VERx10 == 90
# include "intel_gfx9_shaders_code.h"
#elif GFX_VERx10 == 110
# include "intel_gfx11_shaders_code.h"
#elif GFX_VERx10 == 120
# include "intel_gfx12_shaders_code.h"
#elif GFX_VERx10 == 125
# include "intel_gfx125_shaders_code.h"
#elif GFX_VERx10 == 200
# include "intel_gfx20_shaders_code.h"
#else
# error "Unsupported generation"
#endif
#define load_param(b, bit_size, struct_name, field_name) \
nir_load_uniform(b, 1, bit_size, nir_imm_int(b, 0), \
.base = offsetof(struct_name, field_name), \
.range = bit_size / 8)
/* Build NIR computing a linear fragment index from gl_FragCoord,
 * assuming an 8192-pixel-wide layout: index = y * 8192 + x.
 */
static nir_def *
load_fragment_index(nir_builder *b)
{
   nir_def *frag_coord = nir_load_frag_coord(b);
   nir_def *coord = nir_f2i32(b, nir_trim_vector(b, frag_coord, 2));
   nir_def *x = nir_channel(b, coord, 0);
   nir_def *y = nir_channel(b, coord, 1);

   return nir_iadd(b, nir_imul_imm(b, y, 8192), x);
}
static nir_shader *
load_shader_lib(struct iris_screen *screen, void *mem_ctx)
{
const nir_shader_compiler_options *nir_options =
screen->compiler->nir_options[MESA_SHADER_KERNEL];
struct blob_reader blob;
blob_reader_init(&blob, (void *)genX(intel_shaders_nir),
sizeof(genX(intel_shaders_nir)));
return nir_deserialize(mem_ctx, nir_options, &blob);
}
/* Emit the call to the shader-library draw-generation function, loading
 * each field of struct iris_gen_indirect_params as a push-constant
 * uniform (the argument order must match the library entry point).
 *
 * Returns the push constant size consumed, used as the shader's uniform
 * size by the caller.
 */
static unsigned
iris_call_generation_shader(struct iris_screen *screen, nir_builder *b)
{
   genX(libiris_write_draw)(
      b,
      load_param(b, 64, struct iris_gen_indirect_params, generated_cmds_addr),
      load_param(b, 64, struct iris_gen_indirect_params, indirect_data_addr),
      load_param(b, 64, struct iris_gen_indirect_params, draw_id_addr),
      load_param(b, 32, struct iris_gen_indirect_params, indirect_data_stride),
      load_param(b, 64, struct iris_gen_indirect_params, draw_count_addr),
      load_param(b, 32, struct iris_gen_indirect_params, draw_base),
      load_param(b, 32, struct iris_gen_indirect_params, max_draw_count),
      load_param(b, 32, struct iris_gen_indirect_params, flags),
      load_param(b, 32, struct iris_gen_indirect_params, ring_count),
      load_param(b, 64, struct iris_gen_indirect_params, gen_addr),
      load_param(b, 64, struct iris_gen_indirect_params, end_addr),
      load_fragment_index(b));
   return sizeof(struct iris_gen_indirect_params);
}
/* Install the genX-specific generation-shader hooks into the screen
 * vtable.
 */
void
genX(init_screen_gen_state)(struct iris_screen *screen)
{
   screen->vtbl.load_shader_lib = load_shader_lib;
   screen->vtbl.call_generation_shader = iris_call_generation_shader;
}
/**
* Stream out temporary/short-lived state.
*
* This allocates space, pins the BO, and includes the BO address in the
* returned offset (which works because all state lives in 32-bit memory
* zones).
*/
/* Allocate @size bytes of temporary state through @uploader, storing the
 * resulting resource/offset in @ref and pinning the backing BO into
 * @batch.  Returns the CPU mapping of the allocation.
 */
static void *
upload_state(struct iris_batch *batch,
             struct u_upload_mgr *uploader,
             struct iris_state_ref *ref,
             unsigned size,
             unsigned alignment)
{
   void *map = NULL;

   u_upload_alloc(uploader, 0, size, alignment, &ref->offset, &ref->res, &map);
   iris_use_pinned_bo(batch, iris_resource_bo(ref->res), false, IRIS_DOMAIN_NONE);

   return map;
}
/* Allocate temporary state through @uploader, pin its BO into @batch,
 * record its size for debug, and return the CPU mapping.  On return,
 * *out_offset includes the BO's offset from the state base address so it
 * can be used directly as a state pointer.
 */
static uint32_t *
stream_state(struct iris_batch *batch,
             struct u_upload_mgr *uploader,
             struct pipe_resource **out_res,
             unsigned size,
             unsigned alignment,
             uint32_t *out_offset)
{
   void *map = NULL;

   u_upload_alloc(uploader, 0, size, alignment, out_offset, out_res, &map);

   struct iris_bo *state_bo = iris_resource_bo(*out_res);
   iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);

   /* Record against the raw BO address before rebasing the offset below. */
   iris_record_state_size(batch->state_sizes,
                          state_bo->address + *out_offset, size);

   *out_offset += iris_bo_offset_from_base_address(state_bo);

   return map;
}
/**
 * Emit the full 3D pipeline state and the RECTLIST draw that runs the
 * generation fragment shader (one fragment per generated draw, up to
 * @ring_count), reading push constants from @params_addr/@params_size.
 *
 * This smashes most of the tracked 3D state, so the relevant dirty bits
 * are flagged at the end for the application draws that follow.
 */
static void
emit_indirect_generate_draw(struct iris_batch *batch,
                            struct iris_address params_addr,
                            unsigned params_size,
                            unsigned ring_count)
{
   struct iris_screen *screen = batch->screen;
   struct iris_context *ice = batch->ice;
   struct isl_device *isl_dev = &screen->isl_dev;
   const struct intel_device_info *devinfo = screen->devinfo;

   /* State emission */

   /* Two vertex elements: VE0 pulls XYZ from VB0 (the rect vertices),
    * VE1 references VB1.
    */
   uint32_t ves_dws[1 + 2 * GENX(VERTEX_ELEMENT_STATE_length)];
   iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), ves_dws, ve) {
      ve.DWordLength = 1 + GENX(VERTEX_ELEMENT_STATE_length) * 2 -
                       GENX(3DSTATE_VERTEX_ELEMENTS_length_bias);
   }
   iris_pack_state(GENX(VERTEX_ELEMENT_STATE), &ves_dws[1], ve) {
      ve.VertexBufferIndex = 1;
      ve.Valid = true;
      ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
      ve.SourceElementOffset = 0;
      ve.Component0Control = VFCOMP_STORE_SRC;
      ve.Component1Control = VFCOMP_STORE_0;
      ve.Component2Control = VFCOMP_STORE_0;
      ve.Component3Control = VFCOMP_STORE_0;
   }
   iris_pack_state(GENX(VERTEX_ELEMENT_STATE), &ves_dws[3], ve) {
      ve.VertexBufferIndex = 0;
      ve.Valid = true;
      ve.SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT;
      ve.SourceElementOffset = 0;
      ve.Component0Control = VFCOMP_STORE_SRC;
      ve.Component1Control = VFCOMP_STORE_SRC;
      ve.Component2Control = VFCOMP_STORE_SRC;
      ve.Component3Control = VFCOMP_STORE_1_FP;
   }
   iris_batch_emit(batch, ves_dws, sizeof(ves_dws));

   iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf);
   iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.InstanceIDEnable = true;
      sgvs.InstanceIDComponentNumber = COMP_1;
      sgvs.InstanceIDElementOffset = 0;
   }
#if GFX_VER >= 11
   iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
#endif
   iris_emit_cmd(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
      vfi.InstancingEnable = false;
      vfi.VertexElementIndex = 0;
   }
   iris_emit_cmd(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
      vfi.InstancingEnable = false;
      vfi.VertexElementIndex = 1;
   }

   iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
      topo.PrimitiveTopologyType = _3DPRIM_RECTLIST;
   }

   /* Minimal URB allocation: VS-only pipeline, all stages get size 1. */
   ice->shaders.urb.cfg.size[MESA_SHADER_VERTEX] = 1;
   ice->shaders.urb.cfg.size[MESA_SHADER_TESS_CTRL] = 1;
   ice->shaders.urb.cfg.size[MESA_SHADER_TESS_EVAL] = 1;
   ice->shaders.urb.cfg.size[MESA_SHADER_GEOMETRY] = 1;
   genX(emit_urb_config)(batch,
                         false /* has_tess_eval */,
                         false /* has_geometry */);

   iris_emit_cmd(batch, GENX(3DSTATE_PS_BLEND), ps_blend) {
      ps_blend.HasWriteableRT = true;
   }

   iris_emit_cmd(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wm);

#if GFX_VER >= 12
   iris_emit_cmd(batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
      db.DepthBoundsTestEnable = false;
      db.DepthBoundsTestMinValue = 0.0;
      db.DepthBoundsTestMaxValue = 1.0;
   }
#endif

   iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms);
   iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
      sm.SampleMask = 0x1;
   }

   /* Disable every programmable stage other than the fragment shader. */
   iris_emit_cmd(batch, GENX(3DSTATE_VS), vs);
   iris_emit_cmd(batch, GENX(3DSTATE_HS), hs);
   iris_emit_cmd(batch, GENX(3DSTATE_TE), te);
   iris_emit_cmd(batch, GENX(3DSTATE_DS), DS);
   iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), so);
   iris_emit_cmd(batch, GENX(3DSTATE_GS), gs);

   iris_emit_cmd(batch, GENX(3DSTATE_CLIP), clip) {
      clip.PerspectiveDivideDisable = true;
   }

   iris_emit_cmd(batch, GENX(3DSTATE_SF), sf) {
#if GFX_VER >= 12
      sf.DerefBlockSize = ice->state.urb_deref_block_size;
#endif
   }

   iris_emit_cmd(batch, GENX(3DSTATE_RASTER), raster) {
      raster.CullMode = CULLMODE_NONE;
   }

   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->draw.generation.shader->prog_data;

   iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.VertexURBEntryReadOffset = 1;
      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
      sbe.VertexURBEntryReadLength = MAX2((wm_prog_data->num_varying_inputs + 1) / 2, 1);
      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;
#if GFX_VER >= 9
      for (unsigned i = 0; i < 32; i++)
         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
#endif
   }

   iris_emit_cmd(batch, GENX(3DSTATE_WM), wm) {
      if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill)
         wm.ForceThreadDispatchEnable = ForceON;
   }

   iris_emit_cmd(batch, GENX(3DSTATE_PS), ps) {
      intel_set_ps_dispatch_state(&ps, devinfo, wm_prog_data,
                                  1 /* rasterization_samples */,
                                  0 /* msaa_flags */);

      ps.VectorMaskEnable = wm_prog_data->uses_vmask;

      /* Gfx9 needs a (null-RT) binding table entry; see the <= Gfx9 block
       * below.
       */
      ps.BindingTableEntryCount = GFX_VER == 9 ? 1 : 0;
#if GFX_VER < 20
      ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 ||
                              wm_prog_data->base.ubo_ranges[0].length;
#endif

      ps.DispatchGRFStartRegisterForConstantSetupData0 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
      ps.DispatchGRFStartRegisterForConstantSetupData1 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
#if GFX_VER < 20
      ps.DispatchGRFStartRegisterForConstantSetupData2 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
#endif

      ps.KernelStartPointer0 = KSP(ice->draw.generation.shader) +
         brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
      ps.KernelStartPointer1 = KSP(ice->draw.generation.shader) +
         brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
#if GFX_VER < 20
      ps.KernelStartPointer2 = KSP(ice->draw.generation.shader) +
         brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
#endif

      ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - 1;
   }

   iris_emit_cmd(batch, GENX(3DSTATE_PS_EXTRA), psx) {
      psx.PixelShaderValid = true;
#if GFX_VER < 20
      psx.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
#endif
      psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
      psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
#if GFX_VER >= 9
#if GFX_VER >= 20
      assert(!wm_prog_data->pulls_bary);
#else
      psx.PixelShaderPullsBary = wm_prog_data->pulls_bary;
#endif
      psx.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
#endif
      psx.PixelShaderHasUAV = GFX_VER == 8;
   }

   /* Stream out a trivial [0, 1] depth viewport. */
   iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
      uint32_t cc_vp_address;
      uint32_t *cc_vp_map =
         stream_state(batch, ice->state.dynamic_uploader,
                      &ice->state.last_res.cc_vp,
                      4 * GENX(CC_VIEWPORT_length), 32, &cc_vp_address);

      iris_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
         ccv.MinimumDepth = 0.0f;
         ccv.MaximumDepth = 1.0f;
      }
      cc.CCViewportPointer = cc_vp_address;
   }

#if GFX_VER >= 12
   /* Disable Primitive Replication. */
   iris_emit_cmd(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
#endif

#if GFX_VERx10 == 125
   /* DG2: Wa_22011440098
    * MTL: Wa_18022330953
    *
    * In 3D mode, after programming push constant alloc command immediately
    * program push constant command(ZERO length) without any commit between
    * them.
    *
    * Note that Wa_16011448509 isn't needed here as all address bits are zero.
    */
   iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), c) {
      /* Update empty push constants for all stages (bitmask = 11111b) */
      c.ShaderUpdateEnable = 0x1f;
      c.MOCS = iris_mocs(NULL, isl_dev, 0);
   }
#endif

   /* A rectangle covering ring_count fragments, laid out 8192 pixels wide
    * (matching load_fragment_index()).
    */
   float x0 = 0.0f, x1 = MIN2(ring_count, 8192);
   float y0 = 0.0f, y1 = DIV_ROUND_UP(ring_count, 8192);
   float z = 0.0f;

   float *vertices =
      upload_state(batch, ice->state.dynamic_uploader,
                   &ice->draw.generation.vertices,
                   ALIGN(9 * sizeof(float), 8), 8);

   vertices[0] = x1; vertices[1] = y1; vertices[2] = z; /* v0 */
   vertices[3] = x0; vertices[4] = y1; vertices[5] = z; /* v1 */
   vertices[6] = x0; vertices[7] = y0; vertices[8] = z; /* v2 */

   uint32_t vbs_dws[1 + GENX(VERTEX_BUFFER_STATE_length)];
   iris_pack_command(GENX(3DSTATE_VERTEX_BUFFERS), vbs_dws, vbs) {
      vbs.DWordLength = ARRAY_SIZE(vbs_dws) -
                        GENX(3DSTATE_VERTEX_BUFFERS_length_bias);
   }
   _iris_pack_state(batch, GENX(VERTEX_BUFFER_STATE), &vbs_dws[1], vb) {
      vb.VertexBufferIndex = 0;
      vb.AddressModifyEnable = true;
      vb.BufferStartingAddress = ro_bo(iris_resource_bo(ice->draw.generation.vertices.res),
                                       ice->draw.generation.vertices.offset);
      vb.BufferPitch = 3 * sizeof(float);
      vb.BufferSize = 9 * sizeof(float);
      vb.MOCS = iris_mocs(NULL, isl_dev, ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
#if GFX_VER >= 12
      vb.L3BypassDisable = true;
#endif
   }
   iris_batch_emit(batch, vbs_dws, sizeof(vbs_dws));

   /* Bind the push constant data (params_addr) to the fragment stage. */
#if GFX_VERx10 > 120
   uint32_t const_dws[GENX(3DSTATE_CONSTANT_ALL_length) +
                      GENX(3DSTATE_CONSTANT_ALL_DATA_length)];

   iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), const_dws, all) {
      all.DWordLength = ARRAY_SIZE(const_dws) -
                        GENX(3DSTATE_CONSTANT_ALL_length_bias);
      all.ShaderUpdateEnable = 1 << MESA_SHADER_FRAGMENT;
      all.MOCS = isl_mocs(isl_dev, 0, false);
      all.PointerBufferMask = 0x1;
   }
   _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA),
                    &const_dws[GENX(3DSTATE_CONSTANT_ALL_length)], data) {
      data.PointerToConstantBuffer = params_addr;
      data.ConstantBufferReadLength = DIV_ROUND_UP(params_size, 32);
   }
   iris_batch_emit(batch, const_dws, sizeof(const_dws));
#else
   /* The Skylake PRM contains the following restriction:
    *
    *    "The driver must ensure The following case does not occur without a
    *    flush to the 3D engine: 3DSTATE_CONSTANT_* with buffer 3 read length
    *    equal to zero committed followed by a 3DSTATE_CONSTANT_* with buffer
    *    0 read length not equal to zero committed."
    *
    * To avoid this, we program the highest slot.
    */
   iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_PS), c) {
#if GFX_VER > 8
      c.MOCS = iris_mocs(NULL, isl_dev, ISL_SURF_USAGE_CONSTANT_BUFFER_BIT);
#endif
      c.ConstantBody.ReadLength[3] = DIV_ROUND_UP(params_size, 32);
      c.ConstantBody.Buffer[3] = params_addr;
   }
#endif

#if GFX_VER <= 9
   /* Gfx9 requires 3DSTATE_BINDING_TABLE_POINTERS_XS to be re-emitted in
    * order to commit constants. TODO: Investigate "Disable Gather at Set
    * Shader" to go back to legacy mode...
    *
    * The null writes of the generation shader also appear to disturb the next
    * RT writes, so we choose to reemit the binding table to a null RT on Gfx8
    * too.
    */
   struct iris_binder *binder = &ice->state.binder;
   iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), ptr) {
      ptr.PointertoPSBindingTable =
         binder->bt_offset[MESA_SHADER_FRAGMENT] >> IRIS_BT_OFFSET_SHIFT;
   }
   uint32_t *bt_map = binder->map + binder->bt_offset[MESA_SHADER_FRAGMENT];
   uint32_t surf_base_offset = binder->bo->address;
   bt_map[0] = ice->state.null_fb.offset - surf_base_offset;
#endif

   genX(maybe_emit_breakpoint)(batch, true);

   /* The actual generation dispatch: one RECTLIST covering ring_count
    * fragments.
    */
   iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = SEQUENTIAL;
      prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
      prim.VertexCountPerInstance = 3;
      prim.InstanceCount = 1;
   }

   /* We've smashed all state compared to what the normal 3D pipeline
    * rendering tracks for GL.
    */
   uint64_t skip_bits = (IRIS_DIRTY_POLYGON_STIPPLE |
                         IRIS_DIRTY_SO_BUFFERS |
                         IRIS_DIRTY_SO_DECL_LIST |
                         IRIS_DIRTY_LINE_STIPPLE |
                         IRIS_ALL_DIRTY_FOR_COMPUTE |
                         IRIS_DIRTY_SCISSOR_RECT |
                         IRIS_DIRTY_VF);
   /* Wa_14016820455
    * On Gfx 12.5 platforms, the SF_CL_VIEWPORT pointer can be invalidated
    * likely by a read cache invalidation when clipping is disabled, so we
    * don't skip its dirty bit here, in order to reprogram it.
    */
   if (GFX_VERx10 != 125)
      skip_bits |= IRIS_DIRTY_SF_CL_VIEWPORT;

   uint64_t skip_stage_bits = (IRIS_ALL_STAGE_DIRTY_FOR_COMPUTE |
                               IRIS_STAGE_DIRTY_UNCOMPILED_VS |
                               IRIS_STAGE_DIRTY_UNCOMPILED_TCS |
                               IRIS_STAGE_DIRTY_UNCOMPILED_TES |
                               IRIS_STAGE_DIRTY_UNCOMPILED_GS |
                               IRIS_STAGE_DIRTY_UNCOMPILED_FS |
                               IRIS_STAGE_DIRTY_SAMPLER_STATES_VS |
                               IRIS_STAGE_DIRTY_SAMPLER_STATES_TCS |
                               IRIS_STAGE_DIRTY_SAMPLER_STATES_TES |
                               IRIS_STAGE_DIRTY_SAMPLER_STATES_GS);

   if (!ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
      /* Generation disabled tessellation, but it was already off anyway */
      skip_stage_bits |= IRIS_STAGE_DIRTY_TCS |
                         IRIS_STAGE_DIRTY_TES |
                         IRIS_STAGE_DIRTY_CONSTANTS_TCS |
                         IRIS_STAGE_DIRTY_CONSTANTS_TES |
                         IRIS_STAGE_DIRTY_BINDINGS_TCS |
                         IRIS_STAGE_DIRTY_BINDINGS_TES;
   }

   if (!ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
      /* Generation disabled geometry shaders, but it was already off
       * anyway
       */
      skip_stage_bits |= IRIS_STAGE_DIRTY_GS |
                         IRIS_STAGE_DIRTY_CONSTANTS_GS |
                         IRIS_STAGE_DIRTY_BINDINGS_GS;
   }

   ice->state.dirty |= ~skip_bits;
   ice->state.stage_dirty |= ~skip_stage_bits;

   /* Invalidate the URB configuration so the next draw reprograms it. */
   for (int i = 0; i < ARRAY_SIZE(ice->shaders.urb.cfg.size); i++)
      ice->shaders.urb.cfg.size[i] = 0;

#if GFX_VER <= 9
   /* Now reupdate the binding tables with the new offsets for the actual
    * application shaders.
    */
   iris_binder_reserve_3d(ice);
   screen->vtbl.update_binder_address(batch, binder);
#endif
}
#define RING_SIZE (128 * 1024)
static void
ensure_ring_bo(struct iris_context *ice, struct iris_screen *screen)
{
struct iris_bufmgr *bufmgr = screen->bufmgr;
if (ice->draw.generation.ring_bo != NULL)
return;
ice->draw.generation.ring_bo =
iris_bo_alloc(bufmgr, "gen ring",
RING_SIZE, 8, IRIS_MEMZONE_OTHER,
BO_ALLOC_NO_SUBALLOC);
iris_get_backing_bo(ice->draw.generation.ring_bo)->real.kflags |= EXEC_OBJECT_CAPTURE;
}
/**
 * Set up and dispatch the indirect draw generation shader for this draw.
 *
 * Ensures the generation shader and ring BO exist, uploads a
 * struct iris_gen_indirect_params push constant block, pins every BO the
 * generated commands will reference, and emits the generation draw.
 *
 * Returns the CPU mapping of the uploaded params and writes their GPU
 * address to @out_params_addr.
 */
struct iris_gen_indirect_params *
genX(emit_indirect_generate)(struct iris_batch *batch,
                             const struct pipe_draw_info *draw,
                             const struct pipe_draw_indirect_info *indirect,
                             const struct pipe_draw_start_count_bias *sc,
                             struct iris_address *out_params_addr)
{
   struct iris_screen *screen = batch->screen;
   struct iris_context *ice = batch->ice;

   iris_ensure_indirect_generation_shader(batch);
   ensure_ring_bo(ice, screen);

   /* Default indirect record size: 5 dwords for indexed draws, 4 for
    * non-indexed (used when the app passes stride == 0).
    */
   const size_t struct_stride = draw->index_size > 0 ?
      sizeof(uint32_t) * 5 :
      sizeof(uint32_t) * 4;

   /* Worst-case size in bytes of the commands generated for one draw. */
   unsigned cmd_stride = 0;
   if (ice->state.vs_uses_draw_params ||
       ice->state.vs_uses_derived_draw_params) {
      cmd_stride += 4; /* 3DSTATE_VERTEX_BUFFERS */

      if (ice->state.vs_uses_draw_params)
         cmd_stride += 4 * GENX(VERTEX_BUFFER_STATE_length);

      if (ice->state.vs_uses_derived_draw_params)
         cmd_stride += 4 * GENX(VERTEX_BUFFER_STATE_length);
   }
   cmd_stride += 4 * GENX(3DPRIMITIVE_length);

   /* Dwords reserved at the end of the ring for the jump commands. */
   const unsigned setup_dws =
#if GFX_VER >= 12
      GENX(MI_ARB_CHECK_length) +
#endif
      GENX(MI_BATCH_BUFFER_START_length);
   /* How many draws fit in one pass through the ring buffer. */
   const unsigned ring_count =
      (RING_SIZE - 4 * setup_dws) /
      (cmd_stride + 4 * 2 /* draw_id, is_indexed_draw */);

   uint32_t params_size = align(sizeof(struct iris_gen_indirect_params), 32);
   struct iris_gen_indirect_params *params =
      upload_state(batch, ice->ctx.const_uploader,
                   &ice->draw.generation.params,
                   params_size, 64);
   *out_params_addr =
      ro_bo(iris_resource_bo(ice->draw.generation.params.res),
            ice->draw.generation.params.offset);

   /* Pin everything the generated commands will read. */
   iris_use_pinned_bo(batch,
                      iris_resource_bo(indirect->buffer),
                      false, IRIS_DOMAIN_NONE);
   if (indirect->indirect_draw_count) {
      iris_use_pinned_bo(batch,
                         iris_resource_bo(indirect->indirect_draw_count),
                         false, IRIS_DOMAIN_NONE);
   }
   iris_use_pinned_bo(batch, ice->draw.generation.ring_bo,
                      false, IRIS_DOMAIN_NONE);

   *params = (struct iris_gen_indirect_params) {
      .generated_cmds_addr = ice->draw.generation.ring_bo->address,
      .ring_count = ring_count,
      /* Draw ids live right after the generated commands in the ring. */
      .draw_id_addr = ice->draw.generation.ring_bo->address +
                      ring_count * cmd_stride +
                      4 * GENX(MI_BATCH_BUFFER_START_length),
      .draw_count_addr = indirect->indirect_draw_count ?
         (iris_resource_bo(indirect->indirect_draw_count)->address +
          indirect->indirect_draw_count_offset) : 0,
      .indirect_data_addr = iris_resource_bo(indirect->buffer)->address +
                            indirect->offset,
      .indirect_data_stride = indirect->stride == 0 ?
                              struct_stride : indirect->stride,
      .max_draw_count = indirect->draw_count,
      /* See the flags field layout in struct iris_gen_indirect_params. */
      .flags = (draw->index_size > 0 ? ANV_GENERATED_FLAG_INDEXED : 0) |
               (ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT ?
                ANV_GENERATED_FLAG_PREDICATED : 0) |
               (ice->state.vs_uses_draw_params ?
                ANV_GENERATED_FLAG_BASE : 0) |
               (ice->state.vs_uses_derived_draw_params ?
                ANV_GENERATED_FLAG_DRAWID : 0) |
               (iris_mocs(NULL, &screen->isl_dev,
                          ISL_SURF_USAGE_VERTEX_BUFFER_BIT) << 8) |
               ((cmd_stride / 4) << 16) |
               util_bitcount64(ice->state.bound_vertex_buffers) << 24,
   };

   genX(maybe_emit_breakpoint)(batch, true);

   emit_indirect_generate_draw(batch, *out_params_addr, params_size,
                               MIN2(ring_count, indirect->draw_count));

   genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
   genX(maybe_emit_breakpoint)(batch, false);

   return params;
}

View File

@ -39,6 +39,7 @@
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
#include "intel/compiler/brw_compiler.h"
#include "intel/compiler/brw_nir.h"
#include "iris_context.h"
#include "iris_resource.h"
@ -290,3 +291,140 @@ iris_destroy_program_cache(struct iris_context *ice)
ralloc_free(ice->shaders.cache);
}
/* Link the deserialized libintel shader library into @nir: resolve the
 * library function calls, inline them, drop the now-unused library
 * entrypoints, then lower variables and memory accesses to explicit I/O
 * so the resulting shader is self-contained.
 */
static void
link_libintel_shaders(nir_shader *nir, const nir_shader *libintel)
{
   nir_link_shader_functions(nir, libintel);
   NIR_PASS_V(nir, nir_inline_functions);
   NIR_PASS_V(nir, nir_remove_non_entrypoints);
   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp,
              glsl_get_cl_type_size_align);
   NIR_PASS_V(nir, nir_opt_deref);
   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
   NIR_PASS_V(nir, nir_lower_explicit_io,
              nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
              nir_var_mem_global,
              nir_address_format_62bit_generic);
}
/**
 * Make sure ice->draw.generation.shader holds the compiled indirect draw
 * generation fragment shader, pinned into @batch.
 *
 * Fast paths: the context already has it, or it is found in the shader
 * cache.  Otherwise the shader is built by calling the genX
 * call_generation_shader hook into a fresh NIR builder, linking in the
 * intel shader library, running the NIR pipeline, and compiling with
 * brw_compile_fs.
 */
void
iris_ensure_indirect_generation_shader(struct iris_batch *batch)
{
   struct iris_context *ice = batch->ice;
   if (ice->draw.generation.shader)
      return;

   struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen;
   /* Fixed-name cache key: there is a single generation shader variant. */
   const struct {
      char name[40];
   } key = {
      .name = "iris-generation-shader",
   };
   ice->draw.generation.shader =
      iris_find_cached_shader(ice, IRIS_CACHE_BLORP, sizeof(key), &key);
   if (ice->draw.generation.shader != NULL)
      return;

   struct brw_compiler *compiler = screen->compiler;
   const nir_shader_compiler_options *nir_options =
      compiler->nir_options[MESA_SHADER_FRAGMENT];
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
                                                  nir_options,
                                                  "iris-indirect-generate");

   /* Emits the call into the shader library and reports how many bytes of
    * push constants it consumes.
    */
   uint32_t uniform_size =
      screen->vtbl.call_generation_shader(screen, &b);

   nir_shader *nir = b.shader;

   void *mem_ctx = ralloc_context(NULL);

   link_libintel_shaders(nir, screen->vtbl.load_shader_lib(screen, mem_ctx));

   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
   NIR_PASS_V(nir, nir_opt_cse);
   NIR_PASS_V(nir, nir_opt_gcm, true);
   NIR_PASS_V(nir, nir_opt_peephole_select, 1, false, false);

   NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);

   NIR_PASS_V(nir, nir_split_var_copies);
   NIR_PASS_V(nir, nir_split_per_member_structs);

   struct brw_nir_compiler_opts opts = {};
   brw_preprocess_nir(compiler, nir, &opts);

   NIR_PASS_V(nir, nir_propagate_invariant, false);

   NIR_PASS_V(nir, nir_lower_input_attachments,
              &(nir_input_attachment_options) {
                 .use_fragcoord_sysval = true,
                 .use_layer_id_sysval = true,
              });

   /* Reset sizes before gathering information */
   nir->global_mem_size = 0;
   nir->scratch_size = 0;
   nir->info.shared_size = 0;
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   NIR_PASS_V(nir, nir_copy_prop);
   NIR_PASS_V(nir, nir_opt_constant_folding);
   NIR_PASS_V(nir, nir_opt_dce);

   /* Do vectorizing here. For some reason when trying to do it in the back
    * this just isn't working.
    */
   nir_load_store_vectorize_options options = {
      .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global,
      .callback = brw_nir_should_vectorize_mem,
      .robust_modes = (nir_variable_mode)0,
   };
   NIR_PASS_V(nir, nir_opt_load_store_vectorize, &options);

   nir->num_uniforms = uniform_size;

   union brw_any_prog_key prog_key;
   memset(&prog_key, 0, sizeof(prog_key));

   struct brw_wm_prog_data *prog_data = ralloc_size(NULL, sizeof(*prog_data));
   memset(prog_data, 0, sizeof(*prog_data));

   prog_data->base.nr_params = nir->num_uniforms / 4;

   brw_nir_analyze_ubo_ranges(compiler, nir, prog_data->base.ubo_ranges);

   struct brw_compile_stats stats[3];
   struct brw_compile_fs_params params = {
      .base = {
         .nir = nir,
         .log_data = &ice->dbg,
         .debug_flag = DEBUG_WM,
         .stats = stats,
         .mem_ctx = mem_ctx,
      },
      .key = &prog_key.wm,
      .prog_data = prog_data,
   };
   const unsigned *program = brw_compile_fs(compiler, &params);

   struct iris_binding_table bt;
   memset(&bt, 0, sizeof(bt));

   struct iris_compiled_shader *shader =
      iris_create_shader_variant(screen, ice->shaders.cache,
                                 IRIS_CACHE_BLORP,
                                 sizeof(key), &key);

   iris_finalize_program(shader, &prog_data->base, NULL, NULL, 0, 0, 0, &bt);

   /* Store the result in the cache under the fixed key for reuse. */
   iris_upload_shader(screen, NULL, shader, ice->shaders.cache,
                      ice->shaders.uploader_driver,
                      IRIS_CACHE_BLORP, sizeof(key), &key, program);
   ralloc_free(mem_ctx);

   struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
   iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);

   ice->draw.generation.shader = shader;
}

View File

@ -891,6 +891,8 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
driQueryOptionb(config->options, "intel_enable_wa_14018912822");
screen->driconf.enable_tbimr =
driQueryOptionb(config->options, "intel_tbimr");
screen->driconf.generated_indirect_threshold =
driQueryOptioni(config->options, "generated_indirect_threshold");
screen->precompile = debug_get_bool_option("shader_precompile", true);
@ -941,6 +943,7 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
iris_init_screen_program_functions(pscreen);
genX_call(screen->devinfo, init_screen_state, screen);
genX_call(screen->devinfo, init_screen_gen_state, screen);
glsl_type_singleton_init_or_ref();

View File

@ -45,6 +45,9 @@ struct iris_fs_prog_key;
struct iris_cs_prog_key;
enum iris_program_cache_id;
typedef struct nir_builder nir_builder;
typedef struct nir_shader nir_shader;
struct u_trace;
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
@ -74,6 +77,10 @@ struct iris_vtable {
const struct pipe_draw_info *draw,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count_bias *sc);
void (*upload_indirect_shader_render_state)(struct iris_context *ice,
const struct pipe_draw_info *draw,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count_bias *sc);
void (*update_binder_address)(struct iris_batch *batch,
struct iris_binder *binder);
void (*upload_compute_state)(struct iris_context *ice,
@ -151,6 +158,9 @@ struct iris_vtable {
struct iris_cs_prog_key *key);
void (*lost_genx_state)(struct iris_context *ice, struct iris_batch *batch);
void (*disable_rhwo_optimization)(struct iris_batch *batch, bool disable);
nir_shader *(*load_shader_lib)(struct iris_screen *screen, void *mem_ctx);
unsigned (*call_generation_shader)(struct iris_screen *screen, nir_builder *b);
};
struct iris_address {
@ -195,6 +205,7 @@ struct iris_screen {
float lower_depth_range_rate;
bool intel_enable_wa_14018912822;
bool enable_tbimr;
unsigned generated_indirect_threshold;
} driconf;
/** Does the kernel support various features (KERNEL_HAS_* bitfield)? */

View File

@ -8536,6 +8536,189 @@ iris_upload_indirect_render_state(struct iris_context *ice,
#endif /* GFX_VERx10 >= 125 */
}
/**
 * Upload render state for an indirect draw whose 3DPRIMITIVE commands are
 * produced by a generation shader into a ring buffer
 * (ice->draw.generation.ring_bo), rather than being executed through the
 * HW indirect-draw registers.
 *
 * Rough command layout emitted here:
 *
 *   gen_addr:  run the generation shader (genX(emit_indirect_generate)),
 *              then MI_BATCH_BUFFER_START into the ring buffer
 *   inc_addr:  bump params->draw_base by params->ring_count and jump
 *              back to gen_addr to generate the next chunk of draws
 *              (presumably reached from the ring when it fills up —
 *              the generated commands are not visible here)
 *   end_addr:  exit point, back to normal batch processing
 *
 * gen_addr/end_addr are patched into *params at the end because their
 * values are only known once the surrounding commands have been emitted.
 */
static void
iris_upload_indirect_shader_render_state(struct iris_context *ice,
                                         const struct pipe_draw_info *draw,
                                         const struct pipe_draw_indirect_info *indirect,
                                         const struct pipe_draw_start_count_bias *sc)
{
   assert(indirect);

   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   UNUSED struct iris_screen *screen = batch->screen;
   UNUSED const struct intel_device_info *devinfo = screen->devinfo;

   if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
      flush_vbos(ice, batch);

   iris_batch_sync_region_start(batch);

   /* Always pin the binder.  If we're emitting new binding table pointers,
    * we need it.  If not, we're probably inheriting old tables via the
    * context, and need it anyway.  Since true zero-bindings cases are
    * practically non-existent, just pin it and avoid last_res tracking.
    */
   iris_use_pinned_bo(batch, ice->state.binder.bo, false,
                      IRIS_DOMAIN_NONE);

   if (!batch->contains_draw) {
      if (GFX_VER == 12) {
         /* Re-emit constants when starting a new batch buffer in order to
          * work around push constant corruption on context switch.
          *
          * XXX - Provide hardware spec quotation when available.
          */
         ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS  |
                                    IRIS_STAGE_DIRTY_CONSTANTS_TCS |
                                    IRIS_STAGE_DIRTY_CONSTANTS_TES |
                                    IRIS_STAGE_DIRTY_CONSTANTS_GS  |
                                    IRIS_STAGE_DIRTY_CONSTANTS_FS);
      }
      batch->contains_draw = true;
   }

   if (!batch->contains_draw_with_next_seqno) {
      iris_restore_render_saved_bos(ice, batch, draw);
      batch->contains_draw_with_next_seqno = true;
   }

   if (draw->index_size > 0)
      iris_emit_index_buffer(ice, batch, draw, sc);

   /* Make sure we have enough space to keep all the commands in the single
    * BO (because of the jumps): the gen_addr/inc_addr/end_addr targets
    * recorded below are absolute addresses, so a batch BO switch in the
    * middle would break them.  Verified by the command_bo assert below.
    */
   iris_require_command_space(batch, 2000);

#ifndef NDEBUG
   struct iris_bo *command_bo = batch->bo;
#endif

   /* Jump point to generate more draws if we run out of space in the ring
    * buffer.
    */
   uint64_t gen_addr = iris_batch_current_address_u64(batch);

   iris_handle_always_flush_cache(batch);

#if GFX_VER == 9
   iris_emit_pipe_control_flush(batch, "before generation",
                                PIPE_CONTROL_VF_CACHE_INVALIDATE);
#endif

   /* Dispatch the generation shader; it writes 3DPRIMITIVEs into the ring
    * buffer and returns its parameter structure (both CPU map and GPU
    * address).
    */
   struct iris_address params_addr;
   struct iris_gen_indirect_params *params =
      genX(emit_indirect_generate)(batch, draw, indirect, sc,
                                   &params_addr);

   /* Make the generated commands visible to the command streamer before
    * jumping into the ring buffer.
    */
   iris_emit_pipe_control_flush(batch, "after generation flush",
                                ((ice->state.vs_uses_draw_params ||
                                  ice->state.vs_uses_derived_draw_params) ?
                                 PIPE_CONTROL_VF_CACHE_INVALIDATE : 0) |
                                PIPE_CONTROL_STALL_AT_SCOREBOARD |
                                PIPE_CONTROL_DATA_CACHE_FLUSH |
                                PIPE_CONTROL_CS_STALL);

   trace_intel_begin_draw(&batch->trace);

   /* Always pin the binder.  If we're emitting new binding table pointers,
    * we need it.  If not, we're probably inheriting old tables via the
    * context, and need it anyway.  Since true zero-bindings cases are
    * practically non-existent, just pin it and avoid last_res tracking.
    */
   iris_use_pinned_bo(batch, ice->state.binder.bo, false,
                      IRIS_DOMAIN_NONE);

   /* Wa_1306463417 - Send HS state for every primitive on gfx11.
    * Wa_16011107343 (same for gfx12)
    * We implement this by setting TCS dirty on each draw.
    */
   if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
       ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
      ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
   }

   iris_upload_dirty_render_state(ice, batch, draw, true);

   iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);

   genX(maybe_emit_breakpoint)(batch, true);

#if GFX_VER >= 12
   /* Prevent the command streamer from pre-parsing past the jump into the
    * ring buffer (the generated commands may not be coherent yet).
    */
   iris_emit_cmd(batch, GENX(MI_ARB_CHECK), arb) {
      arb.PreParserDisableMask = true;
      arb.PreParserDisable = true;
   }
#endif

   /* Jump into the ring buffer to execute the generated draws. */
   iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
      bbs.AddressSpaceIndicator = ASI_PPGTT;
      bbs.BatchBufferStartAddress = (struct iris_address) {
         .bo = ice->draw.generation.ring_bo,
      };
   }

   /* Run the ring buffer one more time with the next set of commands */
   uint64_t inc_addr = iris_batch_current_address_u64(batch);
   {
      iris_emit_pipe_control_flush(batch,
                                   "post generated draws wait",
                                   PIPE_CONTROL_STALL_AT_SCOREBOARD |
                                   PIPE_CONTROL_CS_STALL);

      /* Advance the draw window: draw_base += ring_count, so the next
       * generation pass emits the following chunk of draws.
       */
      struct mi_builder b;
      mi_builder_init(&b, batch->screen->devinfo, batch);

      struct iris_address draw_base_addr = iris_address_add(
         params_addr,
         offsetof(struct iris_gen_indirect_params, draw_base));

      const uint32_t mocs =
         iris_mocs(draw_base_addr.bo, &screen->isl_dev, 0);
      mi_builder_set_mocs(&b, mocs);

      mi_store(&b, mi_mem32(draw_base_addr),
                   mi_iadd(&b, mi_mem32(draw_base_addr),
                               mi_imm(params->ring_count)));

      iris_emit_pipe_control_flush(batch,
                                   "post generation base increment",
                                   PIPE_CONTROL_CS_STALL |
                                   PIPE_CONTROL_CONST_CACHE_INVALIDATE);

      /* Loop back to the generation point for the next chunk. */
      iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
         bbs.AddressSpaceIndicator = ASI_PPGTT;
         bbs.BatchBufferStartAddress = (struct iris_address) {
            .offset = gen_addr,
         };
      }
   }

   /* Exit of the ring buffer */
   uint64_t end_addr = iris_batch_current_address_u64(batch);

#ifndef NDEBUG
   /* All the absolute jump targets above assume no batch BO switch. */
   assert(command_bo == batch->bo);
#endif

   genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
   genX(maybe_emit_breakpoint)(batch, false);

   iris_emit_pipe_control_flush(batch,
                                "post generated draws wait",
                                PIPE_CONTROL_STALL_AT_SCOREBOARD |
                                PIPE_CONTROL_CS_STALL);

   /* Patch the jump addresses into the parameter structure now that they
    * are known.
    */
   params->gen_addr = inc_addr;
   params->end_addr = end_addr;

   iris_batch_sync_region_end(batch);

   uint32_t count = (sc) ? sc->count : 0;
   count *= draw->instance_count ? draw->instance_count : 1;
   trace_intel_end_draw(&batch->trace, count);
}
static void
iris_load_indirect_location(struct iris_context *ice,
struct iris_batch *batch,
@ -8916,6 +9099,8 @@ iris_destroy_state(struct iris_context *ice)
pipe_resource_reference(&ice->draw.draw_params.res, NULL);
pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
pipe_resource_reference(&ice->draw.generation.params.res, NULL);
pipe_resource_reference(&ice->draw.generation.vertices.res, NULL);
/* Loop over all VBOs, including ones for draw parameters */
for (unsigned i = 0; i < ARRAY_SIZE(genx->vertex_buffers); i++) {
@ -9974,6 +10159,7 @@ genX(init_screen_state)(struct iris_screen *screen)
screen->vtbl.init_copy_context = iris_init_copy_context;
screen->vtbl.upload_render_state = iris_upload_render_state;
screen->vtbl.upload_indirect_render_state = iris_upload_indirect_render_state;
screen->vtbl.upload_indirect_shader_render_state = iris_upload_indirect_shader_render_state;
screen->vtbl.update_binder_address = iris_update_binder_address;
screen->vtbl.upload_compute_state = iris_upload_compute_state;
screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control;

View File

@ -74,7 +74,7 @@ iris_per_hw_ver_libs = []
foreach v : ['80', '90', '110', '120', '125', '200']
iris_per_hw_ver_libs += static_library(
'iris_per_hw_ver@0@'.format(v),
['iris_blorp.c', 'iris_query.c', 'iris_state.c', gen_xml_pack],
['iris_blorp.c', 'iris_query.c', 'iris_state.c', 'iris_indirect_gen.c', gen_xml_pack],
include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_intel],
c_args : [
no_override_init_args, sse2_args,
@ -82,7 +82,7 @@ foreach v : ['80', '90', '110', '120', '125', '200']
],
gnu_symbol_visibility : 'hidden',
dependencies : [dep_libdrm, dep_valgrind, idep_genxml, idep_nir_headers,
idep_intel_driver_ds_headers, ],
idep_intel_driver_ds_headers, idep_intel_shaders, ],
)
endforeach