From 807651561e75314769bb925c16d40f99576dc155 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Wed, 7 Aug 2024 15:41:42 +0100 Subject: [PATCH] aco: split insert_wait_states into two MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No fossil-db changes. Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Acked-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_insert_delay_alu.cpp | 392 ++++++++++++++++++++++ src/amd/compiler/aco_insert_waitcnt.cpp | 322 ++---------------- src/amd/compiler/aco_interface.cpp | 13 +- src/amd/compiler/aco_ir.h | 4 +- src/amd/compiler/meson.build | 1 + src/amd/compiler/tests/helpers.cpp | 2 +- 6 files changed, 430 insertions(+), 304 deletions(-) create mode 100644 src/amd/compiler/aco_insert_delay_alu.cpp diff --git a/src/amd/compiler/aco_insert_delay_alu.cpp b/src/amd/compiler/aco_insert_delay_alu.cpp new file mode 100644 index 00000000000..baf1a5c5f58 --- /dev/null +++ b/src/amd/compiler/aco_insert_delay_alu.cpp @@ -0,0 +1,392 @@ +/* + * Copyright © 2018 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +#include "aco_builder.h" +#include "aco_ir.h" + +#include +#include +#include + +namespace aco { + +namespace { + +/* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different + * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal + * that we should switch to a different wave and contains info on dependencies as to + * when we can switch back. + * + * This seems to apply only for ALU->ALU dependencies as other instructions have better + * integration with the frontend. + * + * Note that if we do not emit s_delay_alu things will still be correct, but the wave + * will stall in the ALU (and the ALU will be doing nothing else). We'll use this as + * I'm pretty sure our cycle info is wrong at times (necessarily so, e.g. 
wave64 VALU + * instructions can take a different number of cycles based on the exec mask) + */ +struct alu_delay_info { + /* These are the values directly above the max representable value, i.e. the wait + * would turn into a no-op when we try to wait for something further back than + * this. + */ + static constexpr int8_t valu_nop = 5; + static constexpr int8_t trans_nop = 4; + + /* How many VALU instructions ago this value was written */ + int8_t valu_instrs = valu_nop; + /* Cycles until the writing VALU instruction is finished */ + int8_t valu_cycles = 0; + + /* How many Transcendent instructions ago this value was written */ + int8_t trans_instrs = trans_nop; + /* Cycles until the writing Transcendent instruction is finished */ + int8_t trans_cycles = 0; + + /* Cycles until the writing SALU instruction is finished*/ + int8_t salu_cycles = 0; + + bool combine(const alu_delay_info& other) + { + bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs || + other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles || + other.trans_cycles > trans_cycles; + valu_instrs = std::min(valu_instrs, other.valu_instrs); + trans_instrs = std::min(trans_instrs, other.trans_instrs); + salu_cycles = std::max(salu_cycles, other.salu_cycles); + valu_cycles = std::max(valu_cycles, other.valu_cycles); + trans_cycles = std::max(trans_cycles, other.trans_cycles); + return changed; + } + + /* Needs to be called after any change to keep the data consistent. 
*/ + bool fixup() + { + if (valu_instrs >= valu_nop || valu_cycles <= 0) { + valu_instrs = valu_nop; + valu_cycles = 0; + } + + if (trans_instrs >= trans_nop || trans_cycles <= 0) { + trans_instrs = trans_nop; + trans_cycles = 0; + } + + salu_cycles = std::max(salu_cycles, 0); + + return empty(); + } + + /* Returns true if a wait would be a no-op */ + bool empty() const + { + return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0; + } + + UNUSED void print(FILE* output) const + { + if (valu_instrs != valu_nop) + fprintf(output, "valu_instrs: %u\n", valu_instrs); + if (valu_cycles) + fprintf(output, "valu_cycles: %u\n", valu_cycles); + if (trans_instrs != trans_nop) + fprintf(output, "trans_instrs: %u\n", trans_instrs); + if (trans_cycles) + fprintf(output, "trans_cycles: %u\n", trans_cycles); + if (salu_cycles) + fprintf(output, "salu_cycles: %u\n", salu_cycles); + } +}; + +struct delay_ctx { + Program* program; + std::map gpr_map; + + delay_ctx() {} + delay_ctx(Program* program_) : program(program_) {} + + bool join(const delay_ctx* other) + { + bool changed = false; + for (const auto& entry : other->gpr_map) { + using iterator = std::map::iterator; + const std::pair insert_pair = gpr_map.insert(entry); + if (insert_pair.second) + changed = true; + else + changed |= insert_pair.first->second.combine(entry.second); + } + + return changed; + } + + UNUSED void print(FILE* output) const + { + for (const auto& entry : gpr_map) { + fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 
'v' : 's', + entry.first.reg() & 0xff); + entry.second.print(output); + fprintf(output, "}\n"); + } + } +}; + +void +check_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr) +{ + for (const Operand op : instr->operands) { + if (op.isConstant() || op.isUndefined()) + continue; + + /* check consecutively read gprs */ + for (unsigned j = 0; j < op.size(); j++) { + std::map::iterator it = + ctx.gpr_map.find(PhysReg{op.physReg() + j}); + if (it != ctx.gpr_map.end()) + delay.combine(it->second); + } + } +} + +bool +parse_delay_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr) +{ + if (instr->opcode != aco_opcode::s_delay_alu) + return false; + + unsigned imm[2] = {instr->salu().imm & 0xf, (instr->salu().imm >> 7) & 0xf}; + for (unsigned i = 0; i < 2; ++i) { + alu_delay_wait wait = (alu_delay_wait)imm[i]; + if (wait >= alu_delay_wait::VALU_DEP_1 && wait <= alu_delay_wait::VALU_DEP_4) + delay.valu_instrs = imm[i] - (uint32_t)alu_delay_wait::VALU_DEP_1 + 1; + else if (wait >= alu_delay_wait::TRANS32_DEP_1 && wait <= alu_delay_wait::TRANS32_DEP_3) + delay.trans_instrs = imm[i] - (uint32_t)alu_delay_wait::TRANS32_DEP_1 + 1; + else if (wait >= alu_delay_wait::SALU_CYCLE_1) + delay.salu_cycles = imm[i] - (uint32_t)alu_delay_wait::SALU_CYCLE_1 + 1; + } + + delay.valu_cycles = instr->pass_flags & 0xffff; + delay.trans_cycles = instr->pass_flags >> 16; + + return true; +} + +void +update_alu(delay_ctx& ctx, bool is_valu, bool is_trans, int cycles) +{ + std::map::iterator it = ctx.gpr_map.begin(); + while (it != ctx.gpr_map.end()) { + alu_delay_info& entry = it->second; + entry.valu_instrs += is_valu ? 1 : 0; + entry.trans_instrs += is_trans ? 1 : 0; + entry.salu_cycles -= cycles; + entry.valu_cycles -= cycles; + entry.trans_cycles -= cycles; + it = it->second.fixup() ? 
ctx.gpr_map.erase(it) : std::next(it); + } +} + +void +kill_alu(alu_delay_info& delay, Instruction* instr, delay_ctx& ctx) +{ + if (instr->isVALU() || instr->isSALU()) + check_alu(ctx, delay, instr); + + if (!delay.empty()) { + update_alu(ctx, false, false, MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles)); + + /* remove all gprs with higher counter from map */ + std::map::iterator it = ctx.gpr_map.begin(); + while (it != ctx.gpr_map.end()) { + if (delay.valu_instrs <= it->second.valu_instrs) + it->second.valu_instrs = alu_delay_info::valu_nop; + if (delay.trans_instrs <= it->second.trans_instrs) + it->second.trans_instrs = alu_delay_info::trans_nop; + it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it); + } + } +} + +void +gen_alu(Instruction* instr, delay_ctx& ctx) +{ + if (instr->isEXP() || instr->isDS() || instr->isMIMG() || instr->isFlatLike() || + instr->isMUBUF() || instr->isMTBUF()) { + ctx.gpr_map.clear(); + return; + } + + Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr); + bool is_valu = instr->isVALU(); + bool is_trans = instr->isTrans(); + + if (is_trans || is_valu || instr->isSALU()) { + alu_delay_info delay; + if (is_trans) { + delay.trans_instrs = 0; + delay.trans_cycles = cycle_info.latency; + } else if (is_valu) { + delay.valu_instrs = 0; + delay.valu_cycles = cycle_info.latency; + } else if (instr->isSALU()) { + delay.salu_cycles = cycle_info.latency; + } + + for (const Definition& def : instr->definitions) { + for (unsigned i = 0; i < def.size(); i++) { + auto it = ctx.gpr_map.emplace(PhysReg{def.physReg().reg() + i}, delay); + if (!it.second) + it.first->second.combine(delay); + } + } + } + + update_alu(ctx, is_valu && instr_info.classes[(int)instr->opcode] != instr_class::wmma, is_trans, + cycle_info.issue_cycles); +} + +void +emit_delay_alu(delay_ctx& ctx, std::vector>& instructions, + alu_delay_info& delay) +{ + uint32_t imm = 0; + if (delay.trans_instrs != delay.trans_nop) { + imm |= 
(uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1; + } + + if (delay.valu_instrs != delay.valu_nop) { + imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0); + } + + /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just + * drop the SALU one. Here we use that this doesn't really affect correctness so occasionally + * getting this wrong isn't an issue. */ + if (delay.salu_cycles && imm <= 0xf) { + unsigned cycles = std::min(3, delay.salu_cycles); + imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0); + } + + Instruction* inst = create_instruction(aco_opcode::s_delay_alu, Format::SOPP, 0, 0); + inst->salu().imm = imm; + inst->pass_flags = (delay.valu_cycles | (delay.trans_cycles << 16)); + instructions.emplace_back(inst); + delay = alu_delay_info(); +} + +void +handle_block(Program* program, Block& block, delay_ctx& ctx) +{ + std::vector> new_instructions; + alu_delay_info queued_delay; + + for (size_t i = 0; i < block.instructions.size(); i++) { + aco_ptr& instr = block.instructions[i]; + bool is_delay_alu = parse_delay_alu(ctx, queued_delay, instr.get()); + + kill_alu(queued_delay, instr.get(), ctx); + gen_alu(instr.get(), ctx); + + if (!is_delay_alu) { + if (!queued_delay.empty()) + emit_delay_alu(ctx, new_instructions, queued_delay); + new_instructions.emplace_back(std::move(instr)); + } + } + + if (!queued_delay.empty()) + emit_delay_alu(ctx, new_instructions, queued_delay); + block.instructions.swap(new_instructions); +} + +} /* end namespace */ + +void +insert_delay_alu(Program* program) +{ + /* per BB ctx */ + std::vector done(program->blocks.size()); + std::vector in_ctx(program->blocks.size(), delay_ctx(program)); + std::vector out_ctx(program->blocks.size(), delay_ctx(program)); + + std::stack> loop_header_indices; + unsigned loop_progress = 0; + + for (unsigned i = 0; i < program->blocks.size();) { + Block& current = program->blocks[i++]; + 
+ if (current.kind & block_kind_discard_early_exit) { + /* Because the jump to the discard early exit block may happen anywhere in a block, it's + * not possible to join it with its predecessors this way. + */ + continue; + } + + delay_ctx ctx = in_ctx[current.index]; + + if (current.kind & block_kind_loop_header) { + loop_header_indices.push(current.index); + } else if (current.kind & block_kind_loop_exit) { + bool repeat = false; + if (loop_progress == loop_header_indices.size()) { + i = loop_header_indices.top(); + repeat = true; + } + loop_header_indices.pop(); + loop_progress = std::min(loop_progress, loop_header_indices.size()); + if (repeat) + continue; + } + + bool changed = false; + for (unsigned b : current.linear_preds) + changed |= ctx.join(&out_ctx[b]); + + if (done[current.index] && !changed) { + in_ctx[current.index] = std::move(ctx); + continue; + } else { + in_ctx[current.index] = ctx; + } + + loop_progress = std::max(loop_progress, current.loop_nest_depth); + done[current.index] = true; + + handle_block(program, current, ctx); + + out_ctx[current.index] = std::move(ctx); + } +} + +void +combine_delay_alu(Program* program) +{ + /* Combine s_delay_alu using the skip field. 
*/ + for (Block& block : program->blocks) { + int i = 0; + int prev_delay_alu = -1; + for (aco_ptr& instr : block.instructions) { + if (instr->opcode != aco_opcode::s_delay_alu) { + block.instructions[i++] = std::move(instr); + continue; + } + + uint16_t imm = instr->salu().imm; + int skip = i - prev_delay_alu - 1; + if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) { + if (imm >> 7 == 0) + prev_delay_alu = i; + block.instructions[i++] = std::move(instr); + continue; + } + + block.instructions[prev_delay_alu]->salu().imm |= (skip << 4) | (imm << 7); + prev_delay_alu = -1; + } + block.instructions.resize(i); + } +} + +} // namespace aco diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index ae079dcd755..e6263d6f191 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -55,10 +55,7 @@ enum wait_event : uint32_t { event_ldsdir = 1 << 12, event_vmem_sample = 1 << 13, /* GFX12+ */ event_vmem_bvh = 1 << 14, /* GFX12+ */ - event_valu = 1 << 15, - event_trans = 1 << 16, - event_salu = 1 << 17, - num_events = 18, + num_events = 15, }; enum counter_type : uint8_t { @@ -69,107 +66,20 @@ enum counter_type : uint8_t { counter_sample = 1 << wait_type_sample, counter_bvh = 1 << wait_type_bvh, counter_km = 1 << wait_type_km, - counter_alu = 1 << wait_type_num, - num_counters = wait_type_num + 1, - wait_counters = BITFIELD_MASK(wait_type_num), -}; - -/* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different - * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal - * that we should switch to a different wave and contains info on dependencies as to - * when we can switch back. - * - * This seems to apply only for ALU->ALU dependencies as other instructions have better - * integration with the frontend. 
- * - * Note that if we do not emit s_delay_alu things will still be correct, but the wave - * will stall in the ALU (and the ALU will be doing nothing else). We'll use this as - * I'm pretty sure our cycle info is wrong at times (necessarily so, e.g. wave64 VALU - * instructions can take a different number of cycles based on the exec mask) - */ -struct alu_delay_info { - /* These are the values directly above the max representable value, i.e. the wait - * would turn into a no-op when we try to wait for something further back than - * this. - */ - static constexpr int8_t valu_nop = 5; - static constexpr int8_t trans_nop = 4; - - /* How many VALU instructions ago this value was written */ - int8_t valu_instrs = valu_nop; - /* Cycles until the writing VALU instruction is finished */ - int8_t valu_cycles = 0; - - /* How many Transcedent instructions ago this value was written */ - int8_t trans_instrs = trans_nop; - /* Cycles until the writing Transcendent instruction is finished */ - int8_t trans_cycles = 0; - - /* Cycles until the writing SALU instruction is finished*/ - int8_t salu_cycles = 0; - - bool combine(const alu_delay_info& other) - { - bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs || - other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles || - other.trans_cycles > trans_cycles; - valu_instrs = std::min(valu_instrs, other.valu_instrs); - trans_instrs = std::min(trans_instrs, other.trans_instrs); - salu_cycles = std::max(salu_cycles, other.salu_cycles); - valu_cycles = std::max(valu_cycles, other.valu_cycles); - trans_cycles = std::max(trans_cycles, other.trans_cycles); - return changed; - } - - /* Needs to be called after any change to keep the data consistent. 
*/ - void fixup() - { - if (valu_instrs >= valu_nop || valu_cycles <= 0) { - valu_instrs = valu_nop; - valu_cycles = 0; - } - - if (trans_instrs >= trans_nop || trans_cycles <= 0) { - trans_instrs = trans_nop; - trans_cycles = 0; - } - - salu_cycles = std::max(salu_cycles, 0); - } - - /* Returns true if a wait would be a no-op */ - bool empty() const - { - return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0; - } - - UNUSED void print(FILE* output) const - { - if (valu_instrs != valu_nop) - fprintf(output, "valu_instrs: %u\n", valu_instrs); - if (valu_cycles) - fprintf(output, "valu_cycles: %u\n", valu_cycles); - if (trans_instrs != trans_nop) - fprintf(output, "trans_instrs: %u\n", trans_instrs); - if (trans_cycles) - fprintf(output, "trans_cycles: %u\n", trans_cycles); - if (salu_cycles) - fprintf(output, "salu_cycles: %u\n", salu_cycles); - } + num_counters = wait_type_num, }; struct wait_entry { wait_imm imm; - alu_delay_info delay; uint32_t events; /* use wait_event notion */ uint8_t counters; /* use counter_type notion */ bool wait_on_read : 1; bool logical : 1; uint8_t vmem_types : 4; /* use vmem_type notion. for counter_vm. 
*/ - wait_entry(wait_event event_, wait_imm imm_, alu_delay_info delay_, uint8_t counters_, - bool logical_, bool wait_on_read_) - : imm(imm_), delay(delay_), events(event_), counters(counters_), wait_on_read(wait_on_read_), + wait_entry(wait_event event_, wait_imm imm_, uint8_t counters_, bool logical_, + bool wait_on_read_) + : imm(imm_), events(event_), counters(counters_), wait_on_read(wait_on_read_), logical(logical_), vmem_types(0) {} @@ -181,20 +91,12 @@ struct wait_entry { events |= other.events; counters |= other.counters; changed |= imm.combine(other.imm); - changed |= delay.combine(other.delay); wait_on_read |= other.wait_on_read; vmem_types |= other.vmem_types; logical &= other.logical; return changed; } - void remove_alu_counter() - { - counters &= ~counter_alu; - delay = alu_delay_info(); - events &= ~(event_valu | event_trans | event_salu); - } - void remove_wait(wait_type type, uint32_t type_events) { counters &= ~(1 << type); @@ -212,7 +114,6 @@ struct wait_entry { { fprintf(output, "logical: %u\n", logical); imm.print(output); - delay.print(output); if (events) fprintf(output, "events: %u\n", events); if (counters) @@ -253,9 +154,6 @@ struct target_info { u_foreach_bit (j, events[i]) counters[j] |= (1 << i); } - counters[ffs(event_valu) - 1] |= counter_alu; - counters[ffs(event_trans) - 1] |= counter_alu; - counters[ffs(event_salu) - 1] |= counter_alu; unordered_events = event_smem | (gfx_level < GFX10 ? 
event_flat : 0); } @@ -355,7 +253,7 @@ get_vmem_event(wait_ctx& ctx, Instruction* instr, uint8_t type) } void -check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* instr) +check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr) { for (const Operand op : instr->operands) { if (op.isConstant() || op.isUndefined()) @@ -363,14 +261,9 @@ check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* i /* check consecutively read gprs */ for (unsigned j = 0; j < op.size(); j++) { - PhysReg reg{op.physReg() + j}; - std::map::iterator it = ctx.gpr_map.find(reg); - if (it == ctx.gpr_map.end() || !it->second.wait_on_read) - continue; - - wait.combine(it->second.imm); - if (instr->isVALU() || instr->isSALU()) - delay.combine(it->second.delay); + std::map::iterator it = ctx.gpr_map.find(PhysReg{op.physReg() + j}); + if (it != ctx.gpr_map.end() && it->second.wait_on_read) + wait.combine(it->second.imm); } } @@ -405,29 +298,6 @@ check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* i } } -bool -parse_delay_alu(wait_ctx& ctx, alu_delay_info& delay, Instruction* instr) -{ - if (instr->opcode != aco_opcode::s_delay_alu) - return false; - - unsigned imm[2] = {instr->salu().imm & 0xf, (instr->salu().imm >> 7) & 0xf}; - for (unsigned i = 0; i < 2; ++i) { - alu_delay_wait wait = (alu_delay_wait)imm[i]; - if (wait >= alu_delay_wait::VALU_DEP_1 && wait <= alu_delay_wait::VALU_DEP_4) - delay.valu_instrs = imm[i] - (uint32_t)alu_delay_wait::VALU_DEP_1 + 1; - else if (wait >= alu_delay_wait::TRANS32_DEP_1 && wait <= alu_delay_wait::TRANS32_DEP_3) - delay.trans_instrs = imm[i] - (uint32_t)alu_delay_wait::TRANS32_DEP_1 + 1; - else if (wait >= alu_delay_wait::SALU_CYCLE_1) - delay.salu_cycles = imm[i] - (uint32_t)alu_delay_wait::SALU_CYCLE_1 + 1; - } - - delay.valu_cycles = instr->pass_flags & 0xffff; - delay.trans_cycles = instr->pass_flags >> 16; - - return true; -} - void perform_barrier(wait_ctx& ctx, wait_imm& imm, 
memory_sync_info sync, unsigned semantics) { @@ -464,36 +334,7 @@ force_waitcnt(wait_ctx& ctx, wait_imm& imm) } void -update_alu(wait_ctx& ctx, bool is_valu, bool is_trans, bool clear, int cycles) -{ - std::map::iterator it = ctx.gpr_map.begin(); - while (it != ctx.gpr_map.end()) { - wait_entry& entry = it->second; - - if (clear) { - entry.remove_alu_counter(); - } else { - entry.delay.valu_instrs += is_valu ? 1 : 0; - entry.delay.trans_instrs += is_trans ? 1 : 0; - entry.delay.salu_cycles -= cycles; - entry.delay.valu_cycles -= cycles; - entry.delay.trans_cycles -= cycles; - - entry.delay.fixup(); - if (it->second.delay.empty()) - entry.remove_alu_counter(); - } - - if (!entry.counters) - it = ctx.gpr_map.erase(it); - else - it++; - } -} - -void -kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx, - memory_sync_info sync_info) +kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) { if (instr->opcode == aco_opcode::s_setpc_b64 || (debug_flags & DEBUG_FORCE_WAITCNT)) { /* Force emitting waitcnt states right after the instruction if there is @@ -522,7 +363,7 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx, imm[i] = 0; } - check_instr(ctx, imm, delay, instr); + check_instr(ctx, imm, instr); /* It's required to wait for scalar stores before "writing back" data. * It shouldn't cost anything anyways since we're about to do s_endpgm. 
@@ -555,7 +396,7 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx, else perform_barrier(ctx, imm, sync_info, semantic_release); - if (!imm.empty() || !delay.empty()) { + if (!imm.empty()) { if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter) imm.vm = 0; if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter) @@ -579,11 +420,6 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx, bar_ev &= ~event_flat; } - if (ctx.program->gfx_level >= GFX11) { - update_alu(ctx, false, false, false, - MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles)); - } - /* remove all gprs with higher counter from map */ std::map::iterator it = ctx.gpr_map.begin(); while (it != ctx.gpr_map.end()) { @@ -591,13 +427,6 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx, if (imm[i] != wait_imm::unset_counter && imm[i] <= it->second.imm[i]) it->second.remove_wait((wait_type)i, ctx.info->events[i]); } - if (delay.valu_instrs <= it->second.delay.valu_instrs) - it->second.delay.valu_instrs = alu_delay_info::valu_nop; - if (delay.trans_instrs <= it->second.delay.trans_instrs) - it->second.delay.trans_instrs = alu_delay_info::trans_nop; - it->second.delay.fixup(); - if (it->second.delay.empty()) - it->second.remove_alu_counter(); if (!it->second.counters) it = ctx.gpr_map.erase(it); else @@ -685,26 +514,14 @@ update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read, - uint8_t vmem_types = 0, unsigned cycles = 0, bool force_linear = false) + uint8_t vmem_types = 0, bool force_linear = false) { uint16_t counters = ctx.info->get_counters_for_event(event); wait_imm imm; - u_foreach_bit (i, counters & wait_counters) + u_foreach_bit (i, counters) imm[i] = 0; - alu_delay_info delay; - if (event == event_valu) { - delay.valu_instrs = 0; - delay.valu_cycles = cycles; - } 
else if (event == event_trans) { - delay.trans_instrs = 0; - delay.trans_cycles = cycles; - } else if (event == event_salu) { - delay.salu_cycles = cycles; - } - - wait_entry new_entry(event, imm, delay, counters, !rc.is_linear() && !force_linear, - wait_on_read); + wait_entry new_entry(event, imm, counters, !rc.is_linear() && !force_linear, wait_on_read); if (counters & counter_vm) new_entry.vmem_types |= vmem_types; @@ -719,49 +536,19 @@ void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, uint8_t vmem_types = 0) { if (!op.isConstant() && !op.isUndefined()) - insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, vmem_types, 0); + insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, vmem_types); } void -insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0, - unsigned cycles = 0) +insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0) { /* We can't safely write to unwritten destination VGPR lanes with DS/VMEM on GFX11 without * waiting for the load to finish. - * Also, follow linear control flow for ALU because it's unlikely that the hardware does per-lane - * dependency checks. 
*/ uint32_t ds_vmem_events = event_lds | event_gds | event_vmem | event_flat; - uint32_t alu_events = event_trans | event_valu | event_salu; - bool force_linear = ctx.gfx_level >= GFX11 && (event & (ds_vmem_events | alu_events)); + bool force_linear = ctx.gfx_level >= GFX11 && (event & ds_vmem_events); - insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, cycles, - force_linear); -} - -void -gen_alu(Instruction* instr, wait_ctx& ctx) -{ - Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr); - bool is_valu = instr->isVALU(); - bool is_trans = instr->isTrans(); - bool clear = instr->isEXP() || instr->isDS() || instr->isMIMG() || instr->isFlatLike() || - instr->isMUBUF() || instr->isMTBUF(); - - wait_event event = (wait_event)0; - if (is_trans) - event = event_trans; - else if (is_valu) - event = event_valu; - else if (instr->isSALU()) - event = event_salu; - - if (event != (wait_event)0) { - for (const Definition& def : instr->definitions) - insert_wait_entry(ctx, def, event, 0, cycle_info.latency); - } - update_alu(ctx, is_valu && instr_info.classes[(int)instr->opcode] != instr_class::wmma, is_trans, - clear, cycle_info.issue_cycles); + insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, force_linear); } void @@ -918,34 +705,6 @@ emit_waitcnt(wait_ctx& ctx, std::vector>& instructions, wai imm = wait_imm(); } -void -emit_delay_alu(wait_ctx& ctx, std::vector>& instructions, - alu_delay_info& delay) -{ - uint32_t imm = 0; - if (delay.trans_instrs != delay.trans_nop) { - imm |= (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1; - } - - if (delay.valu_instrs != delay.valu_nop) { - imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0); - } - - /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just - * drop the SALU one. 
Here we use that this doesn't really affect correctness so occasionally - * getting this wrong isn't an issue. */ - if (delay.salu_cycles && imm <= 0xf) { - unsigned cycles = std::min(3, delay.salu_cycles); - imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0); - } - - Instruction* inst = create_instruction(aco_opcode::s_delay_alu, Format::SOPP, 0, 0); - inst->salu().imm = imm; - inst->pass_flags = (delay.valu_cycles | (delay.trans_cycles << 16)); - instructions.emplace_back(inst); - delay = alu_delay_info(); -} - bool check_clause_raw(std::bitset<512>& regs_written, Instruction* instr) { @@ -972,17 +731,15 @@ handle_block(Program* program, Block& block, wait_ctx& ctx) std::vector> new_instructions; wait_imm queued_imm; - alu_delay_info queued_delay; size_t clause_end = 0; for (size_t i = 0; i < block.instructions.size(); i++) { aco_ptr& instr = block.instructions[i]; bool is_wait = queued_imm.unpack(ctx.gfx_level, instr.get()); - bool is_delay_alu = parse_delay_alu(ctx, queued_delay, instr.get()); memory_sync_info sync_info = get_sync_info(instr.get()); - kill(queued_imm, queued_delay, instr.get(), ctx, sync_info); + kill(queued_imm, instr.get(), ctx, sync_info); /* At the start of a possible clause, also emit waitcnts for each instruction to avoid * splitting the clause. 
@@ -1002,15 +759,13 @@ handle_block(Program* program, Block& block, wait_ctx& ctx) if (!check_clause_raw(*regs_written, next)) break; - kill(queued_imm, queued_delay, next, ctx, get_sync_info(next)); + kill(queued_imm, next, ctx, get_sync_info(next)); } } - if (program->gfx_level >= GFX11) - gen_alu(instr.get(), ctx); gen(instr.get(), ctx); - if (instr->format != Format::PSEUDO_BARRIER && !is_wait && !is_delay_alu) { + if (instr->format != Format::PSEUDO_BARRIER && !is_wait) { if (instr->isVINTERP_INREG() && queued_imm.exp != wait_imm::unset_counter) { instr->vinterp_inreg().wait_exp = MIN2(instr->vinterp_inreg().wait_exp, queued_imm.exp); queued_imm.exp = wait_imm::unset_counter; @@ -1018,8 +773,6 @@ handle_block(Program* program, Block& block, wait_ctx& ctx) if (!queued_imm.empty()) emit_waitcnt(ctx, new_instructions, queued_imm); - if (!queued_delay.empty()) - emit_delay_alu(ctx, new_instructions, queued_delay); bool is_ordered_count_acquire = instr->opcode == aco_opcode::ds_ordered_count && @@ -1041,8 +794,6 @@ handle_block(Program* program, Block& block, wait_ctx& ctx) if (!queued_imm.empty()) emit_waitcnt(ctx, new_instructions, queued_imm); - if (!queued_delay.empty()) - emit_delay_alu(ctx, new_instructions, queued_delay); block.instructions.swap(new_instructions); } @@ -1050,7 +801,7 @@ handle_block(Program* program, Block& block, wait_ctx& ctx) } /* end namespace */ void -insert_wait_states(Program* program) +insert_waitcnt(Program* program) { target_info info(program->gfx_level); @@ -1119,33 +870,6 @@ insert_wait_states(Program* program) out_ctx[current.index] = std::move(ctx); } - - /* Combine s_delay_alu using the skip field. 
*/ - if (program->gfx_level >= GFX11) { - for (Block& block : program->blocks) { - int i = 0; - int prev_delay_alu = -1; - for (aco_ptr& instr : block.instructions) { - if (instr->opcode != aco_opcode::s_delay_alu) { - block.instructions[i++] = std::move(instr); - continue; - } - - uint16_t imm = instr->salu().imm; - int skip = i - prev_delay_alu - 1; - if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) { - if (imm >> 7 == 0) - prev_delay_alu = i; - block.instructions[i++] = std::move(instr); - continue; - } - - block.instructions[prev_delay_alu]->salu().imm |= (skip << 4) | (imm << 7); - prev_delay_alu = -1; - } - block.instructions.resize(i); - } - } } } // namespace aco diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp index 79048e3e361..22b538208bb 100644 --- a/src/amd/compiler/aco_interface.cpp +++ b/src/amd/compiler/aco_interface.cpp @@ -184,8 +184,11 @@ aco_postprocess_shader(const struct aco_compiler_options* options, if (!options->optimisations_disabled && !(debug_flags & DEBUG_NO_SCHED_ILP)) schedule_ilp(program.get()); - /* Insert Waitcnt */ - insert_wait_states(program.get()); + insert_waitcnt(program.get()); + if (program->gfx_level >= GFX11) { + insert_delay_alu(program.get()); + combine_delay_alu(program.get()); + } insert_NOPs(program.get()); if (program->gfx_level >= GFX10) @@ -315,7 +318,11 @@ aco_compile_rt_prolog(const struct aco_compiler_options* options, select_rt_prolog(program.get(), &config, options, info, in_args, out_args); validate(program.get()); - insert_wait_states(program.get()); + insert_waitcnt(program.get()); + if (program->gfx_level >= GFX11) { + insert_delay_alu(program.get()); + combine_delay_alu(program.get()); + } insert_NOPs(program.get()); if (program->gfx_level >= GFX10) form_hard_clauses(program.get()); diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index eedd38e14ad..5100ebe33c7 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2184,7 +2184,9 
@@ void schedule_program(Program* program); void schedule_ilp(Program* program); void schedule_vopd(Program* program); void spill(Program* program); -void insert_wait_states(Program* program); +void insert_waitcnt(Program* program); +void insert_delay_alu(Program* program); +void combine_delay_alu(Program* program); bool dealloc_vgprs(Program* program); void insert_NOPs(Program* program); void form_hard_clauses(Program* program); diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build index 857fa8f63cd..ae2d6a41b79 100644 --- a/src/amd/compiler/meson.build +++ b/src/amd/compiler/meson.build @@ -42,6 +42,7 @@ libaco_files = files( 'aco_ir.h', 'aco_assembler.cpp', 'aco_form_hard_clauses.cpp', + 'aco_insert_delay_alu.cpp', 'aco_insert_exec_mask.cpp', 'aco_insert_NOPs.cpp', 'aco_insert_waitcnt.cpp', diff --git a/src/amd/compiler/tests/helpers.cpp b/src/amd/compiler/tests/helpers.cpp index 291fc596cea..866c1368804 100644 --- a/src/amd/compiler/tests/helpers.cpp +++ b/src/amd/compiler/tests/helpers.cpp @@ -312,7 +312,7 @@ void finish_waitcnt_test() { finish_program(program.get()); - aco::insert_wait_states(program.get()); + aco::insert_waitcnt(program.get()); aco_print_program(program.get(), output); }