radeonsi: emulate VGT_ESGS_RING_ITEMSIZE in the shader on gfx9-11

The hardware uses the register to premultiply GS vertex indices
in input VGPRs.

This changes the behavior as follows:
- VGT_ESGS_RING_ITEMSIZE is always 1 on gfx9-11, set in the preamble.
- The real item size (esgs_vertex_stride / 4) is passed to the shader via
  current_gs_state (vs_state_bits); monolithic shaders get it as a constant.
- The shader does the multiplication.

The reason is that VGT_ESGS_RING_ITEMSIZE will be removed in the future.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21403>
This commit is contained in:
Marek Olšák 2023-02-20 01:07:58 -05:00 committed by Marge Bot
parent fb819fdb13
commit ddded6fbb5
7 changed files with 25 additions and 17 deletions

View File

@ -246,7 +246,10 @@ static bool lower_abi_instr(nir_builder *b, nir_instr *instr, struct lower_abi_s
unreachable("no nir_load_lshs_vertex_stride_amd");
break;
case nir_intrinsic_load_esgs_vertex_stride_amd:
replacement = nir_imm_int(b, 1);
assert(sel->screen->info.gfx_level >= GFX9);
replacement = shader->is_monolithic ?
nir_imm_int(b, key->ge.part.gs.es->info.esgs_vertex_stride / 4) :
GET_FIELD_NIR(GS_STATE_ESGS_VERTEX_STRIDE);
break;
case nir_intrinsic_load_tcs_num_patches_amd: {
nir_ssa_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 6);

View File

@ -266,6 +266,8 @@ enum
* in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
*/
/* bit gap */
#define GS_STATE_ESGS_VERTEX_STRIDE__SHIFT 10
#define GS_STATE_ESGS_VERTEX_STRIDE__MASK 0xff /* max 32 * 4 + 1 */
/* Small prim filter precision = num_samples / quant_mode, which can only be equal to 1/2^n
* where n is between 4 and 12. Knowing that, we only need to store 4 bits of the FP32 exponent.
* Set it like this: value = (fui(num_samples / quant_mode) >> 23) & 0xf;
@ -917,7 +919,7 @@ struct si_shader {
unsigned vgt_primitiveid_en;
unsigned vgt_gs_onchip_cntl;
unsigned vgt_gs_instance_cnt;
unsigned vgt_esgs_ring_itemsize;
unsigned esgs_vertex_stride;
unsigned spi_vs_out_config;
unsigned spi_shader_idx_format;
unsigned spi_shader_pos_format;

View File

@ -791,8 +791,8 @@ void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir,
*/
if (sscreen->info.gfx_level >= GFX9)
info->esgs_vertex_stride += 4;
assert(((info->esgs_vertex_stride / 4) & C_028AAC_ITEMSIZE) == 0);
else
assert(((info->esgs_vertex_stride / 4) & C_028AAC_ITEMSIZE) == 0);
info->tcs_vgpr_only_inputs = ~info->base.tess.tcs_cross_invocation_inputs_read &
~info->base.inputs_read_indirectly &

View File

@ -65,8 +65,8 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
ret = si_insert_input_ptr(ctx, ret, ctx->args->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS);
ret = si_insert_input_ptr(ctx, ret, ctx->args->bindless_samplers_and_images,
8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
ret = si_insert_input_ptr(ctx, ret, ctx->args->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
if (ctx->screen->use_ngg) {
ret = si_insert_input_ptr(ctx, ret, ctx->args->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
ret = si_insert_input_ptr(ctx, ret, ctx->args->small_prim_cull_info, 8 + GFX9_SGPR_SMALL_PRIM_CULL_INFO);
if (ctx->screen->info.gfx_level >= GFX11)
ret = si_insert_input_ptr(ctx, ret, ctx->args->gs_attr_address, 8 + GFX9_SGPR_ATTRIBUTE_RING_ADDR);

View File

@ -5804,6 +5804,7 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));
si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, 1);
si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0);
if (sctx->gfx_level < GFX11) {

View File

@ -1214,10 +1214,11 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size)
if (HAS_GS) {
radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, vs_state);
/* NGG always uses the state bits. Legacy GS uses the state bits only for the emulation
* of GS pipeline statistics on gfx10.x.
/* GS always uses the state bits for emulating VGT_ESGS_RING_ITEMSIZE on gfx9 and later
* (via nir_load_esgs_vertex_stride_amd) and for emulating GS pipeline statistics
* on gfx10.x. NGG GS also has lots of states in there.
*/
if (NGG || (GFX_VERSION >= GFX10 && GFX_VERSION <= GFX10_3))
if (GFX_VERSION >= GFX9)
radeon_set_sh_reg(gs_base + SI_SGPR_VS_STATE_BITS * 4, gs_state);
/* The GS copy shader (for legacy GS) always uses the state bits. */

View File

@ -934,6 +934,11 @@ static void si_emit_shader_gs(struct si_context *sctx)
{
struct si_shader *shader = sctx->queued.named.gs;
if (sctx->gfx_level >= GFX9) {
SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE,
shader->key.ge.part.gs.es->info.esgs_vertex_stride / 4);
}
radeon_begin(&sctx->gfx_cs);
/* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
@ -971,10 +976,6 @@ static void si_emit_shader_gs(struct si_context *sctx)
radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
shader->gs.vgt_gs_max_prims_per_subgroup);
/* R_028AAC_VGT_ESGS_RING_ITEMSIZE */
radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
shader->gs.vgt_esgs_ring_itemsize);
if (shader->key.ge.part.gs.es->stage == MESA_SHADER_TESS_EVAL)
radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
@ -1175,6 +1176,9 @@ bool gfx10_is_ngg_passthrough(struct si_shader *shader)
/* Common tail code for NGG primitive shaders. */
static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader)
{
SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE,
shader->ngg.esgs_vertex_stride);
radeon_begin(&sctx->gfx_cs);
radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
@ -1189,9 +1193,6 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader
}
radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT,
shader->ngg.vgt_gs_instance_cnt);
radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
shader->ngg.vgt_esgs_ring_itemsize);
radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG,
shader->ngg.spi_vs_out_config);
radeon_opt_set_context_reg2(
@ -1441,10 +1442,10 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
gs_sel->info.writes_primid);
if (gs_stage == MESA_SHADER_GEOMETRY) {
shader->ngg.vgt_esgs_ring_itemsize = es_sel->info.esgs_vertex_stride / 4;
shader->ngg.esgs_vertex_stride = es_sel->info.esgs_vertex_stride / 4;
shader->ngg.vgt_gs_max_vert_out = gs_sel->info.base.gs.vertices_out;
} else {
shader->ngg.vgt_esgs_ring_itemsize = 1;
shader->ngg.esgs_vertex_stride = 1;
}
if (es_stage == MESA_SHADER_TESS_EVAL)