From 96c8880900e3351dfb88c43b0e3e324539436bc0 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Mon, 28 Feb 2022 15:13:07 +0200 Subject: [PATCH] intel/fs: fix total_scratch computation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We only have a single prog_data::total_scratch for all shader variants (SIMD 8, 16, 32). Therefore we should always max the total_scratch on top of existing variant. We probably haven't run into that issue before because we compile by increasing SIMD size and higher SIMD size is more likely to spill. But for bindless shaders with return shaders, if the last return part doesn't spill, we completely ignore the previous parts' scratch computation. Signed-off-by: Lionel Landwerlin Cc: mesa-stable Reviewed-by: Tapani Pälli Part-of: --- src/intel/compiler/brw_fs.cpp | 11 ++++++++++- src/intel/compiler/brw_mesh.cpp | 2 ++ src/intel/compiler/brw_shader.h | 2 +- src/intel/compiler/brw_vec4.cpp | 1 + src/intel/compiler/brw_vec4_gs_visitor.cpp | 1 + src/intel/compiler/brw_vec4_tcs.cpp | 1 + 6 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 666f2826ba6..0cbc6b6016c 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -8864,7 +8864,12 @@ fs_visitor::allocate_registers(bool allow_spilling) if (last_scratch > 0) { ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024; - prog_data->total_scratch = brw_get_scratch_size(last_scratch); + /* Take the max of any previously compiled variant of the shader. In the + * case of bindless shaders with return parts, this will also take the + * max of all parts. + */ + prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch), + prog_data->total_scratch); if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) { if (devinfo->platform == INTEL_PLATFORM_HSW) { @@ -9701,6 +9706,7 @@ brw_compile_fs(const struct brw_compiler *compiler, prog_data->base.stage = MESA_SHADER_FRAGMENT; prog_data->base.ray_queries = nir->info.ray_queries; + prog_data->base.total_scratch = 0; const struct intel_device_info *devinfo = compiler->devinfo; const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 32 : 16; @@ -10074,6 +10080,7 @@ brw_compile_cs(const struct brw_compiler *compiler, prog_data->base.stage = MESA_SHADER_COMPUTE; prog_data->base.total_shared = nir->info.shared_size; prog_data->base.ray_queries = nir->info.ray_queries; + prog_data->base.total_scratch = 0; if (!nir->info.workgroup_size_variable) { prog_data->local_size[0] = nir->info.workgroup_size[0]; @@ -10333,6 +10340,8 @@ brw_compile_bs(const struct brw_compiler *compiler, prog_data->base.stage = shader->info.stage; prog_data->base.ray_queries = shader->info.ray_queries; + prog_data->base.total_scratch = 0; + prog_data->max_stack_size = 0; fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base, diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp index 521f89a2f77..f39b2a706ed 100644 --- a/src/intel/compiler/brw_mesh.cpp +++ b/src/intel/compiler/brw_mesh.cpp @@ -167,6 +167,7 @@ brw_compile_task(const struct brw_compiler *compiler, prog_data->base.base.stage = MESA_SHADER_TASK; prog_data->base.base.total_shared = nir->info.shared_size; + prog_data->base.base.total_scratch = 0; prog_data->base.local_size[0] = nir->info.workgroup_size[0]; prog_data->base.local_size[1] = nir->info.workgroup_size[1]; @@ -531,6 +532,7 @@ brw_compile_mesh(const struct brw_compiler *compiler, prog_data->base.base.stage = MESA_SHADER_MESH; prog_data->base.base.total_shared = nir->info.shared_size; + prog_data->base.base.total_scratch = 0; prog_data->base.local_size[0] = nir->info.workgroup_size[0]; prog_data->base.local_size[1] = nir->info.workgroup_size[1]; diff --git a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h index 6afe9c72836..dc201a66a3e 100644 --- a/src/intel/compiler/brw_shader.h +++ b/src/intel/compiler/brw_shader.h @@ -122,7 +122,7 @@ extern const char *const conditional_modifier[16]; extern const char *const pred_ctrl_align16[16]; /* Per-thread scratch space is a power-of-two multiple of 1KB. */ -static inline int +static inline unsigned brw_get_scratch_size(int size) { return MAX2(1024, util_next_power_of_two(size)); diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp index 6f4f18abac4..7ee58fecad4 100644 --- a/src/intel/compiler/brw_vec4.cpp +++ b/src/intel/compiler/brw_vec4.cpp @@ -2543,6 +2543,7 @@ brw_compile_vs(const struct brw_compiler *compiler, prog_data->base.base.stage = MESA_SHADER_VERTEX; prog_data->base.base.ray_queries = nir->info.ray_queries; + prog_data->base.base.total_scratch = 0; const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX]; brw_nir_apply_key(nir, compiler, &key->base, 8, is_scalar); diff --git a/src/intel/compiler/brw_vec4_gs_visitor.cpp b/src/intel/compiler/brw_vec4_gs_visitor.cpp index 2d3c651dc51..65c5694cc85 100644 --- a/src/intel/compiler/brw_vec4_gs_visitor.cpp +++ b/src/intel/compiler/brw_vec4_gs_visitor.cpp @@ -597,6 +597,7 @@ brw_compile_gs(const struct brw_compiler *compiler, prog_data->base.base.stage = MESA_SHADER_GEOMETRY; prog_data->base.base.ray_queries = nir->info.ray_queries; + prog_data->base.base.total_scratch = 0; /* The GLSL linker will have already matched up GS inputs and the outputs * of prior stages. The driver does extend VS outputs in some cases, but diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp index 0ae713fef28..c4c2ec113dc 100644 --- a/src/intel/compiler/brw_vec4_tcs.cpp +++ b/src/intel/compiler/brw_vec4_tcs.cpp @@ -367,6 +367,7 @@ brw_compile_tcs(const struct brw_compiler *compiler, vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL; prog_data->base.base.ray_queries = nir->info.ray_queries; + prog_data->base.base.total_scratch = 0; nir->info.outputs_written = key->outputs_written; nir->info.patch_outputs_written = key->patch_outputs_written;