intel/fs: fix total_scratch computation

We only have a single prog_data::total_scratch for all shader variants (SIMD 8, 16, 32). Therefore we should always max the total_scratch on top of existing variant. We probably haven't run into that issue before because we compile by increasing SIMD size and higher SIMD size is more likely to spill. But for bindless shaders with return shaders, if the last return part doesn't spill, we completely ignore the previous parts' scratch computation. Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Cc: mesa-stable Reviewed-by: Tapani Pälli <tapani.palli@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15193>
2024-12-12 19:54:33 +08:00 · 2022-02-28 15:13:07 +02:00 · 2022-02-28 15:13:07 +02:00 · 96c8880900
commit 96c8880900
parent 5b43075888
6 changed files with 16 additions and 2 deletions
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@ -8864,7 +8864,12 @@ fs_visitor::allocate_registers(bool allow_spilling)
   if (last_scratch > 0) {
      ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;

-      prog_data->total_scratch = brw_get_scratch_size(last_scratch);
+      /* Take the max of any previously compiled variant of the shader. In the
+       * case of bindless shaders with return parts, this will also take the
+       * max of all parts.
+       */
+      prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch),
+                                      prog_data->total_scratch);

      if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) {
         if (devinfo->platform == INTEL_PLATFORM_HSW) {
@ -9701,6 +9706,7 @@ brw_compile_fs(const struct brw_compiler *compiler,

   prog_data->base.stage = MESA_SHADER_FRAGMENT;
   prog_data->base.ray_queries = nir->info.ray_queries;
+   prog_data->base.total_scratch = 0;

   const struct intel_device_info *devinfo = compiler->devinfo;
   const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 32 : 16;
@ -10074,6 +10080,7 @@ brw_compile_cs(const struct brw_compiler *compiler,
   prog_data->base.stage = MESA_SHADER_COMPUTE;
   prog_data->base.total_shared = nir->info.shared_size;
   prog_data->base.ray_queries = nir->info.ray_queries;
+   prog_data->base.total_scratch = 0;

   if (!nir->info.workgroup_size_variable) {
      prog_data->local_size[0] = nir->info.workgroup_size[0];
@ -10333,6 +10340,8 @@ brw_compile_bs(const struct brw_compiler *compiler,

   prog_data->base.stage = shader->info.stage;
   prog_data->base.ray_queries = shader->info.ray_queries;
+   prog_data->base.total_scratch = 0;
+
   prog_data->max_stack_size = 0;

   fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base,
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@ -167,6 +167,7 @@ brw_compile_task(const struct brw_compiler *compiler,

   prog_data->base.base.stage = MESA_SHADER_TASK;
   prog_data->base.base.total_shared = nir->info.shared_size;
+   prog_data->base.base.total_scratch = 0;

   prog_data->base.local_size[0] = nir->info.workgroup_size[0];
   prog_data->base.local_size[1] = nir->info.workgroup_size[1];
@ -531,6 +532,7 @@ brw_compile_mesh(const struct brw_compiler *compiler,

   prog_data->base.base.stage = MESA_SHADER_MESH;
   prog_data->base.base.total_shared = nir->info.shared_size;
+   prog_data->base.base.total_scratch = 0;

   prog_data->base.local_size[0] = nir->info.workgroup_size[0];
   prog_data->base.local_size[1] = nir->info.workgroup_size[1];
--- a/src/intel/compiler/brw_shader.h
+++ b/src/intel/compiler/brw_shader.h
@ -122,7 +122,7 @@ extern const char *const conditional_modifier[16];
 extern const char *const pred_ctrl_align16[16];

 /* Per-thread scratch space is a power-of-two multiple of 1KB. */
-static inline int
+static inline unsigned
 brw_get_scratch_size(int size)
 {
   return MAX2(1024, util_next_power_of_two(size));
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@ -2543,6 +2543,7 @@ brw_compile_vs(const struct brw_compiler *compiler,

   prog_data->base.base.stage = MESA_SHADER_VERTEX;
   prog_data->base.base.ray_queries = nir->info.ray_queries;
+   prog_data->base.base.total_scratch = 0;

   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
   brw_nir_apply_key(nir, compiler, &key->base, 8, is_scalar);
--- a/src/intel/compiler/brw_vec4_gs_visitor.cpp
+++ b/src/intel/compiler/brw_vec4_gs_visitor.cpp
@ -597,6 +597,7 @@ brw_compile_gs(const struct brw_compiler *compiler,

   prog_data->base.base.stage = MESA_SHADER_GEOMETRY;
   prog_data->base.base.ray_queries = nir->info.ray_queries;
+   prog_data->base.base.total_scratch = 0;

   /* The GLSL linker will have already matched up GS inputs and the outputs
    * of prior stages.  The driver does extend VS outputs in some cases, but
--- a/src/intel/compiler/brw_vec4_tcs.cpp
+++ b/src/intel/compiler/brw_vec4_tcs.cpp
@ -367,6 +367,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,

   vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL;
   prog_data->base.base.ray_queries = nir->info.ray_queries;
+   prog_data->base.base.total_scratch = 0;

   nir->info.outputs_written = key->outputs_written;
   nir->info.patch_outputs_written = key->patch_outputs_written;