From ecb709c85351c8c127a1cd3577c74d6b2b0b706a Mon Sep 17 00:00:00 2001
From: Felix DeGrood
Date: Tue, 28 Mar 2023 17:04:32 +0000
Subject: [PATCH] anv: only emit CFE_STATE when scratch space increases

On Gen12.5+, we only need to emit CFE_STATE when the required scratch
space increases, not on every pipeline bind. Only grow the scratch
space, never shrink it. The cached CFE state must also be invalidated
after executing secondary command buffers, since those may have
emitted their own CFE_STATE.

Reviewed-by: Lionel Landwerlin
Part-of:
---
 src/intel/vulkan/anv_genX.h                   |  3 +
 src/intel/vulkan/anv_private.h                |  3 +
 .../vulkan/genX_acceleration_structure.c      | 11 +++
 src/intel/vulkan/genX_cmd_buffer.c            | 73 +++++++++++++------
 src/intel/vulkan/genX_pipeline.c              | 11 ---
 5 files changed, 68 insertions(+), 33 deletions(-)

diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index ebd24659ea5..cdb05427881 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -132,6 +132,9 @@ void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buff
 struct anv_state
 genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
 
+void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
+                                       uint32_t total_scratch);
+
 void
 genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
                      const struct intel_l3_config *l3_config,
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index fc46d87e53b..844976074b3 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2593,6 +2593,9 @@ struct anv_cmd_compute_state {
    struct anv_state push_data;
 
    struct anv_address num_workgroups;
+
+   uint32_t scratch_size;
+   bool cfe_state_valid;
 };
 
 struct anv_cmd_ray_tracing_state {
diff --git a/src/intel/vulkan/genX_acceleration_structure.c b/src/intel/vulkan/genX_acceleration_structure.c
index caf8b1ed6c4..4c675e985c2 100644
--- a/src/intel/vulkan/genX_acceleration_structure.c
+++ b/src/intel/vulkan/genX_acceleration_structure.c
@@ -817,6 +817,17 @@ cmd_build_acceleration_structures(
                         &data, sizeof(data));
    }
 
+   if (anv_cmd_buffer_is_render_queue(cmd_buffer))
+      genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+   /* Due to the nature of GRL and its heavy use of jumps/predication, we
+    * cannot tell exactly in what order the CFE_STATE we insert are going to
+    * be executed. So always use the largest possible size.
+    */
+   genX(cmd_buffer_ensure_cfe_state)(
+      cmd_buffer,
+      cmd_buffer->device->physical->max_grl_scratch_size);
+
    /* Round 1 : init_globals kernel */
    genX(grl_misc_batched_init_globals)(
       cmd_buffer,
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 46ea84c57e3..f05099b0940 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -3955,6 +3955,7 @@ genX(CmdExecuteCommands)(
    primary->state.current_l3_config = NULL;
    primary->state.current_hash_scale = 0;
    primary->state.gfx.push_constant_stages = 0;
+   primary->state.compute.cfe_state_valid = false;
    vk_dynamic_graphics_state_dirty_all(&primary->vk.dynamic_graphics_state);
 
    /* Each of the secondary command buffers will use its own state base
@@ -5492,11 +5493,56 @@ genX(CmdDrawMeshTasksIndirectCountEXT)(
 
 #endif /* GFX_VERx10 >= 125 */
 
+void
+genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
+                                  uint32_t total_scratch)
+{
+#if GFX_VERx10 >= 125
+   assert(cmd_buffer->state.current_pipeline == GPGPU);
+
+   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
+
+   if (comp_state->cfe_state_valid &&
+       total_scratch <= comp_state->scratch_size)
+      return;
+
+   const struct intel_device_info *devinfo = cmd_buffer->device->info;
+   anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
+      const uint32_t subslices = MAX2(devinfo->subslice_total, 1);
+      cfe.MaximumNumberofThreads =
+         devinfo->max_cs_threads * subslices - 1;
+
+      uint32_t scratch_surf = 0xffffffff;
+      if (total_scratch > 0) {
+         struct anv_bo *scratch_bo =
+            anv_scratch_pool_alloc(cmd_buffer->device,
+                                   &cmd_buffer->device->scratch_pool,
+                                   MESA_SHADER_COMPUTE,
+                                   total_scratch);
+         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+                               cmd_buffer->batch.alloc,
+                               scratch_bo);
+         scratch_surf =
+            anv_scratch_pool_get_surf(cmd_buffer->device,
+                                      &cmd_buffer->device->scratch_pool,
+                                      total_scratch);
+         cfe.ScratchSpaceBuffer = scratch_surf >> 4;
+      }
+   }
+
+   comp_state->scratch_size = total_scratch;
+   comp_state->cfe_state_valid = true;
+#else
+   unreachable("Invalid call");
+#endif
+}
+
 static void
 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
    struct anv_compute_pipeline *pipeline = comp_state->pipeline;
+   const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;
 
    assert(pipeline->cs);
 
@@ -5528,6 +5574,11 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
 
       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
 
+#if GFX_VERx10 >= 125
+      const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+      genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
+#endif
+
       /* The workgroup size of the pipeline affects our push constant layout
        * so flag push constants as dirty if we change the pipeline.
       */
@@ -5939,28 +5990,6 @@ genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
    struct brw_cs_dispatch_info dispatch =
       brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
 
-   anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
-      const uint32_t subslices = MAX2(devinfo->subslice_total, 1);
-      cfe.MaximumNumberofThreads =
-         devinfo->max_cs_threads * subslices - 1;
-
-      if (cs_prog_data->base.total_scratch > 0) {
-         struct anv_bo *scratch_bo =
-            anv_scratch_pool_alloc(cmd_buffer->device,
-                                   &cmd_buffer->device->scratch_pool,
-                                   MESA_SHADER_COMPUTE,
-                                   cs_prog_data->base.total_scratch);
-         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
-                               cmd_buffer->batch.alloc,
-                               scratch_bo);
-         uint32_t scratch_surf =
-            anv_scratch_pool_get_surf(cmd_buffer->device,
-                                      &cmd_buffer->device->scratch_pool,
-                                      cs_prog_data->base.total_scratch);
-         cfe.ScratchSpaceBuffer = scratch_surf >> 4;
-      }
-   }
-
    anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
       cw.PredicateEnable = false;
       cw.SIMDSize = dispatch.simd_size / 16;
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index 68c5eb9dbaa..cefdf66f082 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -1879,19 +1879,8 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
 void
 genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
 {
-   struct anv_device *device = pipeline->base.device;
    const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
 
    anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
-
-   const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs;
-   const struct intel_device_info *devinfo = device->info;
-
-   anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) {
-      cfe.MaximumNumberofThreads =
-         devinfo->max_cs_threads * devinfo->subslice_total;
-      cfe.ScratchSpaceBuffer =
-         get_scratch_surf(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
-   }
 }
 
 #else /* #if GFX_VERx10 >= 125 */
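
Note for reviewers: the change is an instance of a "grow-only size cache
plus validity flag" pattern. Below is a minimal standalone C sketch of
that pattern, useful for poking at the invalidation behavior in
isolation. All names in it (cfe_cache, ensure_cfe, emit_cfe_state) are
hypothetical, not anv API; the real emit path is
genX(cmd_buffer_ensure_cfe_state) above.

  /* Grow-only state cache sketch. Hypothetical names, not anv API. */
  #include <inttypes.h>
  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  struct cfe_cache {
     uint32_t scratch_size; /* size used by the last CFE_STATE emitted */
     bool valid;            /* false when the HW state is unknown, e.g.
                             * after vkCmdExecuteCommands */
  };

  static void emit_cfe_state(uint32_t scratch)
  {
     /* Stand-in for building the actual CFE_STATE packet. */
     printf("emit CFE_STATE, scratch=%" PRIu32 "\n", scratch);
  }

  static void ensure_cfe(struct cfe_cache *c, uint32_t scratch)
  {
     /* Re-emit only when the cached state is stale or too small. */
     if (c->valid && scratch <= c->scratch_size)
        return;
     emit_cfe_state(scratch);
     c->scratch_size = scratch;
     c->valid = true;
  }

  int main(void)
  {
     struct cfe_cache c = { 0, false };
     ensure_cfe(&c, 4096);  /* emits */
     ensure_cfe(&c, 1024);  /* skipped: cached size already covers it */
     c.valid = false;       /* secondary command buffers executed */
     ensure_cfe(&c, 1024);  /* emits again */
     return 0;
  }

As in the patch, a failed validity check re-emits at the requested size,
so the tracked size can drop across an invalidation; it only grows
monotonically between invalidations.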