radv: fix NGG streamout with VS and GPL on GFX11

With GPL it's not possible to know the primitive topology when
compiling the pre-rasterization stages. For NGG, we use the maximum
number of vertices per prim and rely on the hardware to ignore the
extra bits for points/lines.

However, this can't work for NGG streamout because the number of
vertices per prim is used to compute a streamout offset. The only
way to solve this is to pass the number of vertices per prim through
a new user SGPR.

This fixes a bunch of streamout tests with Zink/RADV on GFX11.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21833>
This commit is contained in:
Samuel Pitoiset 2023-02-15 15:12:42 +01:00 committed by Marge Bot
parent 491887c9f2
commit d750ad19fd
8 changed files with 49 additions and 3 deletions

View File

@ -2174,7 +2174,12 @@ radv_emit_provoking_vertex_mode(struct radv_cmd_buffer *cmd_buffer)
static void
radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
{
const struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
const struct radv_userdata_info *loc =
&pipeline->last_vgt_api_stage_locs[AC_UD_NUM_VERTS_PER_PRIM];
const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
const unsigned stage = pipeline->last_vgt_api_stage;
uint32_t base_reg;
assert(!cmd_buffer->state.mesh_shading);
@ -2185,6 +2190,13 @@ radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE,
d->vk.ia.primitive_topology);
}
if (loc->sgpr_idx == -1)
return;
base_reg = pipeline->base.user_data_0[stage];
radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
si_conv_prim_to_gs_out(d->vk.ia.primitive_topology) + 1);
}
static void
@ -6342,6 +6354,11 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZATION_SAMPLES;
}
/* Re-emit the primitive topology because the SGPR idx can be different. */
if (graphics_pipeline->has_num_verts_per_prim) {
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
}
radv_bind_dynamic_state(cmd_buffer, &graphics_pipeline->dynamic_state);
radv_bind_vs_input_state(cmd_buffer, graphics_pipeline);

View File

@ -406,7 +406,12 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
unsigned num_vertices;
if (stage == MESA_SHADER_VERTEX) {
num_vertices = radv_get_num_vertices_per_prim(s->pl_key);
/* For dynamic primitive topology with streamout. */
if (s->info->vs.dynamic_num_verts_per_prim) {
replacement = ac_nir_load_arg(b, &s->args->ac, s->args->num_verts_per_prim);
} else {
replacement = nir_imm_int(b, radv_get_num_vertices_per_prim(s->pl_key));
}
} else if (stage == MESA_SHADER_TESS_EVAL) {
if (s->info->tes.point_mode) {
num_vertices = 1;
@ -415,6 +420,7 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
} else {
num_vertices = 3;
}
replacement = nir_imm_int(b, num_vertices);
} else {
assert(stage == MESA_SHADER_GEOMETRY);
switch (s->info->gs.output_prim) {
@ -431,8 +437,8 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
unreachable("invalid GS output primitive");
break;
}
replacement = nir_imm_int(b, num_vertices);
}
replacement = nir_imm_int(b, num_vertices);
break;
}
case nir_intrinsic_load_ordered_id_amd:

View File

@ -4980,6 +4980,8 @@ radv_graphics_pipeline_init(struct radv_graphics_pipeline *pipeline, struct radv
pipeline->has_streamout = pipeline->last_vgt_api_stage_locs[AC_UD_STREAMOUT_BUFFERS].sgpr_idx != -1;
pipeline->has_dynamic_samples = ps->info.user_sgprs_locs.shader_data[AC_UD_PS_NUM_SAMPLES].sgpr_idx != -1;
pipeline->has_sample_positions = ps->info.ps.needs_sample_positions;
pipeline->has_num_verts_per_prim =
pipeline->last_vgt_api_stage_locs[AC_UD_NUM_VERTS_PER_PRIM].sgpr_idx != -1;
pipeline->base.push_constant_size = pipeline_layout.push_constant_size;
pipeline->base.dynamic_offset_count = pipeline_layout.dynamic_offset_count;

View File

@ -2202,6 +2202,7 @@ struct radv_graphics_pipeline {
bool has_streamout;
bool has_dynamic_samples;
bool has_sample_positions;
bool has_num_verts_per_prim;
uint8_t vtx_emit_num;

View File

@ -159,7 +159,8 @@ enum radv_ud_index {
AC_UD_NGG_VIEWPORT = 9,
AC_UD_FORCE_VRS_RATES = 10,
AC_UD_TASK_RING_ENTRY = 11,
AC_UD_SHADER_START = 12,
AC_UD_NUM_VERTS_PER_PRIM = 12,
AC_UD_SHADER_START = 13,
AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
AC_UD_VS_BASE_VERTEX_START_INSTANCE,
AC_UD_VS_PROLOG_INPUTS,
@ -277,6 +278,7 @@ struct radv_shader_info {
uint32_t input_slot_usage_mask;
bool has_prolog;
bool dynamic_inputs;
bool dynamic_num_verts_per_prim;
} vs;
struct {
uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1];

View File

@ -88,6 +88,8 @@ count_vs_user_sgprs(const struct radv_shader_info *info)
count++;
if (info->vs.needs_base_instance)
count++;
if (info->vs.dynamic_num_verts_per_prim)
count++;
return count;
}
@ -828,6 +830,9 @@ radv_declare_shader_args(const struct radv_device *device, const struct radv_pip
if (previous_stage == MESA_SHADER_TESS_EVAL && key->dynamic_patch_control_points)
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_num_patches);
if (previous_stage == MESA_SHADER_VERTEX && info->vs.dynamic_num_verts_per_prim)
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->num_verts_per_prim);
/* Legacy GS force vrs is handled by GS copy shader. */
if (info->force_vrs_per_vertex && info->is_ngg) {
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.force_vrs_rates);
@ -974,6 +979,9 @@ radv_declare_shader_args(const struct radv_device *device, const struct radv_pip
if (args->tes_num_patches.used)
set_loc_shader(args, AC_UD_TES_NUM_PATCHES, &user_sgpr_idx, 1);
if (args->num_verts_per_prim.used)
set_loc_shader(args, AC_UD_NUM_VERTS_PER_PRIM, &user_sgpr_idx, 1);
if (args->ac.force_vrs_rates.used)
set_loc_shader(args, AC_UD_FORCE_VRS_RATES, &user_sgpr_idx, 1);

View File

@ -74,6 +74,9 @@ struct radv_shader_args {
/* TES */
struct ac_arg tes_num_patches;
/* NGG VS streamout */
struct ac_arg num_verts_per_prim;
struct radv_userdata_locations user_sgprs_locs;
unsigned num_user_sgprs;

View File

@ -421,6 +421,13 @@ gather_shader_info_vs(struct radv_device *device, const nir_shader *nir,
nir_foreach_shader_in_variable(var, nir)
gather_info_input_decl_vs(nir, var->data.location - VERT_ATTRIB_GENERIC0, var->type,
pipeline_key, info);
/* When the topology is unknown (with GPL), the number of vertices per primitive needs to be
* passed through a user SGPR for NGG streamout with VS. Otherwise, the XFB offset is incorrectly
* computed because using the maximum number of vertices can't work.
*/
info->vs.dynamic_num_verts_per_prim =
pipeline_key->vs.topology == V_008958_DI_PT_NONE && info->is_ngg && nir->xfb_info;
}
static void