aco: don't consider sa_sdst=0 before SALU write to fix VALUMaskWriteHazard

LLVM does but that's probably a bug.

fossil-db (navi31):
Totals from 311 (0.39% of 79395) affected shaders:
Instrs: 380453 -> 381075 (+0.16%)
CodeSize: 1961012 -> 1964744 (+0.19%)
Latency: 4799095 -> 4800313 (+0.03%)
InvThroughput: 958358 -> 958904 (+0.06%)
VALU: 242322 -> 242633 (+0.13%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Backport-to: 24.1
Backport-to: 24.2
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30818>
(cherry picked from commit b1ba7d1b99)
This commit is contained in:
Rhys Perry 2024-08-22 16:03:31 +01:00 committed by Eric Engestrom
parent 6beff74e00
commit a42e144c32
4 changed files with 36 additions and 9 deletions

View File

@ -674,7 +674,7 @@
"description": "aco: don't consider sa_sdst=0 before SALU write to fix VALUMaskWriteHazard",
"nominated": true,
"nomination_type": 4,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": null,
"notes": null

View File

@ -375,5 +375,5 @@ Triggered by:
SALU writing then SALU or VALU reading a SGPR that was previously used as a lane mask for a VALU.
Mitigated by:
A VALU instruction reading a non-exec SGPR before the SALU write, or a sa_sdst=0 wait:
`s_waitcnt_depctr 0xfffe`
A VALU instruction reading a non-exec SGPR before the SALU write, or a sa_sdst=0 wait after the
SALU write: `s_waitcnt_depctr 0xfffe`

View File

@ -1579,6 +1579,7 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
Builder bld(state.program, &new_instructions);
unsigned waitcnt_depctr = 0xffff;
bool valu_read_sgpr = false;
/* LdsDirectVALUHazard/VALUPartialForwardingHazard/VALUTransUseHazard */
bool has_vdst0_since_valu = true;
@ -1598,12 +1599,15 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
}
/* VALUMaskWriteHazard */
if (state.program->wave_size == 64 &&
(ctx.sgpr_read_by_valu_as_lanemask.any() ||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any())) {
waitcnt_depctr &= 0xfffe;
ctx.sgpr_read_by_valu_as_lanemask.reset();
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
if (state.program->wave_size == 64) {
if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any()) {
waitcnt_depctr &= 0xfffe;
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
}
if (ctx.sgpr_read_by_valu_as_lanemask.any()) {
valu_read_sgpr = true;
ctx.sgpr_read_by_valu_as_lanemask.reset();
}
}
/* LdsDirectVMEMHazard */
@ -1617,6 +1621,16 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
if (waitcnt_depctr != 0xffff)
bld.sopp(aco_opcode::s_waitcnt_depctr, waitcnt_depctr);
if (valu_read_sgpr) {
/* This has to be after the s_waitcnt_depctr so that the instruction is not involved in any
* other hazards. */
bld.vop3(aco_opcode::v_xor3_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
Operand(PhysReg(0), s1), Operand(PhysReg(0), s1));
/* workaround possible LdsDirectVALUHazard/VALUPartialForwardingHazard */
bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
}
}
template <typename Ctx>

View File

@ -1282,9 +1282,22 @@ BEGIN_TEST(insert_nops.setpc_gfx11)
/* VALUMaskWriteHazard */
//! p_unit_test 4
//! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
//! s1: %0:vcc_hi = s_mov_b32 0
//! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
Operand::zero(), Operand(vcc, s2));
bld.sop1(aco_opcode::s_mov_b32, Definition(vcc_hi, s1), Operand::c32(0));
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
//! p_unit_test 8
//! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
//! s_waitcnt_depctr va_vdst(0)
//! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
//! s_waitcnt_depctr va_vdst(0)
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
Operand::zero(), Operand(vcc, s2));
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));