From 96c8880900e3351dfb88c43b0e3e324539436bc0 Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Date: Mon, 28 Feb 2022 15:13:07 +0200
Subject: [PATCH] intel/fs: fix total_scratch computation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We only have a single prog_data::total_scratch for all shader variants
(SIMD 8, 16, 32). Therefore we should always max the total_scratch on
top of existing variant.

We probably haven't run into that issue before because we compile by
increasing SIMD size and higher SIMD size is more likely to spill. But
for bindless shaders with return shaders, if the last return part
doesn't spill, we completely ignore the previous parts' scratch
computation.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: mesa-stable
Reviewed-by: Tapani Pälli <tapani.palli@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15193>
---
 src/intel/compiler/brw_fs.cpp              | 11 ++++++++++-
 src/intel/compiler/brw_mesh.cpp            |  2 ++
 src/intel/compiler/brw_shader.h            |  2 +-
 src/intel/compiler/brw_vec4.cpp            |  1 +
 src/intel/compiler/brw_vec4_gs_visitor.cpp |  1 +
 src/intel/compiler/brw_vec4_tcs.cpp        |  1 +
 6 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 666f2826ba6..0cbc6b6016c 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -8864,7 +8864,12 @@ fs_visitor::allocate_registers(bool allow_spilling)
    if (last_scratch > 0) {
       ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
 
-      prog_data->total_scratch = brw_get_scratch_size(last_scratch);
+      /* Take the max of any previously compiled variant of the shader. In the
+       * case of bindless shaders with return parts, this will also take the
+       * max of all parts.
+       */
+      prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch),
+                                      prog_data->total_scratch);
 
       if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) {
          if (devinfo->platform == INTEL_PLATFORM_HSW) {
@@ -9701,6 +9706,7 @@ brw_compile_fs(const struct brw_compiler *compiler,
 
    prog_data->base.stage = MESA_SHADER_FRAGMENT;
    prog_data->base.ray_queries = nir->info.ray_queries;
+   prog_data->base.total_scratch = 0;
 
    const struct intel_device_info *devinfo = compiler->devinfo;
    const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 32 : 16;
@@ -10074,6 +10080,7 @@ brw_compile_cs(const struct brw_compiler *compiler,
    prog_data->base.stage = MESA_SHADER_COMPUTE;
    prog_data->base.total_shared = nir->info.shared_size;
    prog_data->base.ray_queries = nir->info.ray_queries;
+   prog_data->base.total_scratch = 0;
 
    if (!nir->info.workgroup_size_variable) {
       prog_data->local_size[0] = nir->info.workgroup_size[0];
@@ -10333,6 +10340,8 @@ brw_compile_bs(const struct brw_compiler *compiler,
 
    prog_data->base.stage = shader->info.stage;
    prog_data->base.ray_queries = shader->info.ray_queries;
+   prog_data->base.total_scratch = 0;
+
    prog_data->max_stack_size = 0;
 
    fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base,
diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp
index 521f89a2f77..f39b2a706ed 100644
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@@ -167,6 +167,7 @@ brw_compile_task(const struct brw_compiler *compiler,
 
    prog_data->base.base.stage = MESA_SHADER_TASK;
    prog_data->base.base.total_shared = nir->info.shared_size;
+   prog_data->base.base.total_scratch = 0;
 
    prog_data->base.local_size[0] = nir->info.workgroup_size[0];
    prog_data->base.local_size[1] = nir->info.workgroup_size[1];
@@ -531,6 +532,7 @@ brw_compile_mesh(const struct brw_compiler *compiler,
 
    prog_data->base.base.stage = MESA_SHADER_MESH;
    prog_data->base.base.total_shared = nir->info.shared_size;
+   prog_data->base.base.total_scratch = 0;
 
    prog_data->base.local_size[0] = nir->info.workgroup_size[0];
    prog_data->base.local_size[1] = nir->info.workgroup_size[1];
diff --git a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h
index 6afe9c72836..dc201a66a3e 100644
--- a/src/intel/compiler/brw_shader.h
+++ b/src/intel/compiler/brw_shader.h
@@ -122,7 +122,7 @@ extern const char *const conditional_modifier[16];
 extern const char *const pred_ctrl_align16[16];
 
 /* Per-thread scratch space is a power-of-two multiple of 1KB. */
-static inline int
+static inline unsigned
 brw_get_scratch_size(int size)
 {
    return MAX2(1024, util_next_power_of_two(size));
diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
index 6f4f18abac4..7ee58fecad4 100644
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -2543,6 +2543,7 @@ brw_compile_vs(const struct brw_compiler *compiler,
 
    prog_data->base.base.stage = MESA_SHADER_VERTEX;
    prog_data->base.base.ray_queries = nir->info.ray_queries;
+   prog_data->base.base.total_scratch = 0;
 
    const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
    brw_nir_apply_key(nir, compiler, &key->base, 8, is_scalar);
diff --git a/src/intel/compiler/brw_vec4_gs_visitor.cpp b/src/intel/compiler/brw_vec4_gs_visitor.cpp
index 2d3c651dc51..65c5694cc85 100644
--- a/src/intel/compiler/brw_vec4_gs_visitor.cpp
+++ b/src/intel/compiler/brw_vec4_gs_visitor.cpp
@@ -597,6 +597,7 @@ brw_compile_gs(const struct brw_compiler *compiler,
 
    prog_data->base.base.stage = MESA_SHADER_GEOMETRY;
    prog_data->base.base.ray_queries = nir->info.ray_queries;
+   prog_data->base.base.total_scratch = 0;
 
    /* The GLSL linker will have already matched up GS inputs and the outputs
     * of prior stages.  The driver does extend VS outputs in some cases, but
diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp
index 0ae713fef28..c4c2ec113dc 100644
--- a/src/intel/compiler/brw_vec4_tcs.cpp
+++ b/src/intel/compiler/brw_vec4_tcs.cpp
@@ -367,6 +367,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
 
    vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL;
    prog_data->base.base.ray_queries = nir->info.ray_queries;
+   prog_data->base.base.total_scratch = 0;
 
    nir->info.outputs_written = key->outputs_written;
    nir->info.patch_outputs_written = key->patch_outputs_written;