From 807651561e75314769bb925c16d40f99576dc155 Mon Sep 17 00:00:00 2001
From: Rhys Perry <pendingchaos02@gmail.com>
Date: Wed, 7 Aug 2024 15:41:42 +0100
Subject: [PATCH] aco: split insert_wait_states into two
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

No fossil-db changes.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Acked-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23337>
---
 src/amd/compiler/aco_insert_delay_alu.cpp | 392 ++++++++++++++++++++++
 src/amd/compiler/aco_insert_waitcnt.cpp   | 322 ++----------------
 src/amd/compiler/aco_interface.cpp        |  13 +-
 src/amd/compiler/aco_ir.h                 |   4 +-
 src/amd/compiler/meson.build              |   1 +
 src/amd/compiler/tests/helpers.cpp        |   2 +-
 6 files changed, 430 insertions(+), 304 deletions(-)
 create mode 100644 src/amd/compiler/aco_insert_delay_alu.cpp
diff --git a/src/amd/compiler/aco_insert_delay_alu.cpp b/src/amd/compiler/aco_insert_delay_alu.cpp
new file mode 100644
index 00000000000..baf1a5c5f58
--- /dev/null
+++ b/src/amd/compiler/aco_insert_delay_alu.cpp
@@ -0,0 +1,392 @@
+/*
+ * Copyright © 2018 Valve Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "aco_builder.h"
+#include "aco_ir.h"
+
+#include <map>
+#include <stack>
+#include <vector>
+
+namespace aco {
+
+namespace {
+
+/* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
+ * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
+ * that we should switch to a different wave and contains info on dependencies as to
+ * when we can switch back.
+ *
+ * This seems to apply only for ALU->ALU dependencies as other instructions have better
+ * integration with the frontend.
+ *
+ * Note that if we do not emit s_delay_alu things will still be correct, but the wave
+ * will stall in the ALU (and the ALU will be doing nothing else). We'll use this as
+ * I'm pretty sure our cycle info is wrong at times (necessarily so, e.g. wave64 VALU
+ * instructions can take a different number of cycles based on the exec mask)
+ */
+struct alu_delay_info {
+   /* These are the values directly above the max representable value, i.e. the wait
+    * would turn into a no-op when we try to wait for something further back than
+    * this.
+    */
+   static constexpr int8_t valu_nop = 5;
+   static constexpr int8_t trans_nop = 4;
+
+   /* How many VALU instructions ago this value was written */
+   int8_t valu_instrs = valu_nop;
+   /* Cycles until the writing VALU instruction is finished */
+   int8_t valu_cycles = 0;
+
+   /* How many Transcendent instructions ago this value was written */
+   int8_t trans_instrs = trans_nop;
+   /* Cycles until the writing Transcendent instruction is finished */
+   int8_t trans_cycles = 0;
+
+   /* Cycles until the writing SALU instruction is finished */
+   int8_t salu_cycles = 0;
+
+   bool combine(const alu_delay_info& other)
+   {
+      bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs ||
+                     other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles ||
+                     other.trans_cycles > trans_cycles;
+      valu_instrs = std::min(valu_instrs, other.valu_instrs);
+      trans_instrs = std::min(trans_instrs, other.trans_instrs);
+      salu_cycles = std::max(salu_cycles, other.salu_cycles);
+      valu_cycles = std::max(valu_cycles, other.valu_cycles);
+      trans_cycles = std::max(trans_cycles, other.trans_cycles);
+      return changed;
+   }
+
+   /* Needs to be called after any change to keep the data consistent. Returns empty(). */
+   bool fixup()
+   {
+      if (valu_instrs >= valu_nop || valu_cycles <= 0) {
+         valu_instrs = valu_nop;
+         valu_cycles = 0;
+      }
+
+      if (trans_instrs >= trans_nop || trans_cycles <= 0) {
+         trans_instrs = trans_nop;
+         trans_cycles = 0;
+      }
+
+      salu_cycles = std::max<int8_t>(salu_cycles, 0);
+
+      return empty();
+   }
+
+   /* Returns true if a wait would be a no-op */
+   bool empty() const
+   {
+      return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0;
+   }
+
+   UNUSED void print(FILE* output) const
+   {
+      if (valu_instrs != valu_nop)
+         fprintf(output, "valu_instrs: %d\n", valu_instrs);
+      if (valu_cycles)
+         fprintf(output, "valu_cycles: %d\n", valu_cycles);
+      if (trans_instrs != trans_nop)
+         fprintf(output, "trans_instrs: %d\n", trans_instrs);
+      if (trans_cycles)
+         fprintf(output, "trans_cycles: %d\n", trans_cycles);
+      if (salu_cycles)
+         fprintf(output, "salu_cycles: %d\n", salu_cycles);
+   }
+};
+
+struct delay_ctx {
+   Program* program = nullptr;
+   std::map<PhysReg, alu_delay_info> gpr_map; /* pending ALU delays per physical register */
+
+   delay_ctx() {}
+   delay_ctx(Program* program_) : program(program_) {}
+
+   bool join(const delay_ctx* other)
+   {
+      bool changed = false;
+      for (const auto& entry : other->gpr_map) {
+         using iterator = std::map<PhysReg, alu_delay_info>::iterator;
+         const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
+         if (insert_pair.second)
+            changed = true;
+         else
+            changed |= insert_pair.first->second.combine(entry.second);
+      }
+
+      return changed;
+   }
+
+   UNUSED void print(FILE* output) const
+   {
+      for (const auto& entry : gpr_map) {
+         fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 'v' : 's',
+                 entry.first.reg() & 0xff);
+         entry.second.print(output);
+         fprintf(output, "}\n");
+      }
+   }
+};
+
+void
+check_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr)
+{
+   for (const Operand op : instr->operands) {
+      if (op.isConstant() || op.isUndefined())
+         continue;
+
+      /* an operand may span several registers; collect the delay of each one it reads */
+      for (unsigned j = 0; j < op.size(); j++) {
+         std::map<PhysReg, alu_delay_info>::iterator it =
+            ctx.gpr_map.find(PhysReg{op.physReg() + j});
+         if (it != ctx.gpr_map.end())
+            delay.combine(it->second);
+      }
+   }
+}
+
+bool
+parse_delay_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr)
+{
+   if (instr->opcode != aco_opcode::s_delay_alu)
+      return false;
+
+   unsigned imm[2] = {instr->salu().imm & 0xf, (instr->salu().imm >> 7) & 0xf};
+   for (unsigned i = 0; i < 2; ++i) {
+      alu_delay_wait wait = (alu_delay_wait)imm[i];
+      if (wait >= alu_delay_wait::VALU_DEP_1 && wait <= alu_delay_wait::VALU_DEP_4)
+         delay.valu_instrs = imm[i] - (uint32_t)alu_delay_wait::VALU_DEP_1 + 1;
+      else if (wait >= alu_delay_wait::TRANS32_DEP_1 && wait <= alu_delay_wait::TRANS32_DEP_3)
+         delay.trans_instrs = imm[i] - (uint32_t)alu_delay_wait::TRANS32_DEP_1 + 1;
+      else if (wait >= alu_delay_wait::SALU_CYCLE_1)
+         delay.salu_cycles = imm[i] - (uint32_t)alu_delay_wait::SALU_CYCLE_1 + 1;
+   }
+
+   delay.valu_cycles = instr->pass_flags & 0xffff;
+   delay.trans_cycles = instr->pass_flags >> 16;
+
+   return true;
+}
+
+void
+update_alu(delay_ctx& ctx, bool is_valu, bool is_trans, int cycles)
+{
+   std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
+   while (it != ctx.gpr_map.end()) {
+      alu_delay_info& entry = it->second;
+      entry.valu_instrs += is_valu ? 1 : 0;
+      entry.trans_instrs += is_trans ? 1 : 0;
+      entry.salu_cycles -= cycles;
+      entry.valu_cycles -= cycles;
+      entry.trans_cycles -= cycles;
+      it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
+   }
+}
+
+void
+kill_alu(alu_delay_info& delay, Instruction* instr, delay_ctx& ctx)
+{
+   if (instr->isVALU() || instr->isSALU())
+      check_alu(ctx, delay, instr);
+
+   if (!delay.empty()) {
+      update_alu(ctx, false, false, MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles));
+
+      /* a wait this deep also covers every older write: reset those and drop empty entries */
+      std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
+      while (it != ctx.gpr_map.end()) {
+         if (delay.valu_instrs <= it->second.valu_instrs)
+            it->second.valu_instrs = alu_delay_info::valu_nop;
+         if (delay.trans_instrs <= it->second.trans_instrs)
+            it->second.trans_instrs = alu_delay_info::trans_nop;
+         it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
+      }
+   }
+}
+
+void
+gen_alu(Instruction* instr, delay_ctx& ctx)
+{
+   if (instr->isEXP() || instr->isDS() || instr->isMIMG() || instr->isFlatLike() ||
+       instr->isMUBUF() || instr->isMTBUF()) {
+      ctx.gpr_map.clear();
+      return;
+   }
+
+   Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
+   bool is_valu = instr->isVALU();
+   bool is_trans = instr->isTrans();
+
+   if (is_trans || is_valu || instr->isSALU()) {
+      alu_delay_info delay;
+      if (is_trans) {
+         delay.trans_instrs = 0;
+         delay.trans_cycles = cycle_info.latency;
+      } else if (is_valu) {
+         delay.valu_instrs = 0;
+         delay.valu_cycles = cycle_info.latency;
+      } else if (instr->isSALU()) {
+         delay.salu_cycles = cycle_info.latency;
+      }
+
+      for (const Definition& def : instr->definitions) {
+         for (unsigned i = 0; i < def.size(); i++) {
+            auto it = ctx.gpr_map.emplace(PhysReg{def.physReg().reg() + i}, delay);
+            if (!it.second)
+               it.first->second.combine(delay);
+         }
+      }
+   }
+
+   update_alu(ctx, is_valu && instr_info.classes[(int)instr->opcode] != instr_class::wmma, is_trans,
+              cycle_info.issue_cycles);
+}
+
+void
+emit_delay_alu(delay_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
+               alu_delay_info& delay)
+{
+   uint32_t imm = 0;
+   if (delay.trans_instrs != delay.trans_nop) {
+      imm |= (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1;
+   }
+
+   if (delay.valu_instrs != delay.valu_nop) {
+      imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0);
+   }
+
+   /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just
+    * drop the SALU one. Here we use that this doesn't really affect correctness so occasionally
+    * getting this wrong isn't an issue. */
+   if (delay.salu_cycles && imm <= 0xf) {
+      unsigned cycles = std::min<uint8_t>(3, delay.salu_cycles);
+      imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0);
+   }
+
+   Instruction* inst = create_instruction(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
+   inst->salu().imm = imm;
+   inst->pass_flags = (delay.valu_cycles | (delay.trans_cycles << 16));
+   instructions.emplace_back(inst);
+   delay = alu_delay_info();
+}
+
+void
+handle_block(Program* program, Block& block, delay_ctx& ctx)
+{
+   std::vector<aco_ptr<Instruction>> new_instructions;
+   alu_delay_info queued_delay;
+
+   for (size_t i = 0; i < block.instructions.size(); i++) {
+      aco_ptr<Instruction>& instr = block.instructions[i];
+      bool is_delay_alu = parse_delay_alu(ctx, queued_delay, instr.get());
+
+      kill_alu(queued_delay, instr.get(), ctx);
+      gen_alu(instr.get(), ctx);
+
+      if (!is_delay_alu) {
+         if (!queued_delay.empty())
+            emit_delay_alu(ctx, new_instructions, queued_delay);
+         new_instructions.emplace_back(std::move(instr));
+      }
+   }
+
+   if (!queued_delay.empty())
+      emit_delay_alu(ctx, new_instructions, queued_delay);
+   block.instructions.swap(new_instructions);
+}
+
+} /* end namespace */
+
+void
+insert_delay_alu(Program* program)
+{
+   /* per BB ctx */
+   std::vector<bool> done(program->blocks.size());
+   std::vector<delay_ctx> in_ctx(program->blocks.size(), delay_ctx(program));
+   std::vector<delay_ctx> out_ctx(program->blocks.size(), delay_ctx(program));
+
+   std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
+   unsigned loop_progress = 0;
+
+   for (unsigned i = 0; i < program->blocks.size();) {
+      Block& current = program->blocks[i++];
+
+      if (current.kind & block_kind_discard_early_exit) {
+         /* Because the jump to the discard early exit block may happen anywhere in a block, it's
+          * not possible to join it with its predecessors this way.
+          */
+         continue;
+      }
+
+      delay_ctx ctx = in_ctx[current.index];
+
+      if (current.kind & block_kind_loop_header) {
+         loop_header_indices.push(current.index);
+      } else if (current.kind & block_kind_loop_exit) {
+         bool repeat = false;
+         if (loop_progress == loop_header_indices.size()) {
+            i = loop_header_indices.top();
+            repeat = true;
+         }
+         loop_header_indices.pop();
+         loop_progress = std::min<unsigned>(loop_progress, loop_header_indices.size());
+         if (repeat)
+            continue;
+      }
+
+      bool changed = false;
+      for (unsigned b : current.linear_preds)
+         changed |= ctx.join(&out_ctx[b]);
+
+      if (done[current.index] && !changed) {
+         in_ctx[current.index] = std::move(ctx);
+         continue;
+      } else {
+         in_ctx[current.index] = ctx;
+      }
+
+      loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth);
+      done[current.index] = true;
+
+      handle_block(program, current, ctx);
+
+      out_ctx[current.index] = std::move(ctx);
+   }
+}
+
+void
+combine_delay_alu(Program* program)
+{
+   /* Combine s_delay_alu using the skip field. */
+   for (Block& block : program->blocks) {
+      int i = 0;
+      int prev_delay_alu = -1;
+      for (aco_ptr<Instruction>& instr : block.instructions) {
+         if (instr->opcode != aco_opcode::s_delay_alu) {
+            block.instructions[i++] = std::move(instr);
+            continue;
+         }
+
+         uint16_t imm = instr->salu().imm;
+         int skip = i - prev_delay_alu - 1;
+         if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) {
+            if (imm >> 7 == 0)
+               prev_delay_alu = i;
+            block.instructions[i++] = std::move(instr);
+            continue;
+         }
+
+         block.instructions[prev_delay_alu]->salu().imm |= (skip << 4) | (imm << 7);
+         prev_delay_alu = -1;
+      }
+      block.instructions.resize(i);
+   }
+}
+
+} // namespace aco
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index ae079dcd755..e6263d6f191 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -55,10 +55,7 @@ enum wait_event : uint32_t {
    event_ldsdir = 1 << 12,
    event_vmem_sample = 1 << 13, /* GFX12+ */
    event_vmem_bvh = 1 << 14,    /* GFX12+ */
-   event_valu = 1 << 15,
-   event_trans = 1 << 16,
-   event_salu = 1 << 17,
-   num_events = 18,
+   num_events = 15,
 };
 
 enum counter_type : uint8_t {
@@ -69,107 +66,20 @@ enum counter_type : uint8_t {
    counter_sample = 1 << wait_type_sample,
    counter_bvh = 1 << wait_type_bvh,
    counter_km = 1 << wait_type_km,
-   counter_alu = 1 << wait_type_num,
-   num_counters = wait_type_num + 1,
-   wait_counters = BITFIELD_MASK(wait_type_num),
-};
-
-/* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
- * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
- * that we should switch to a different wave and contains info on dependencies as to
- * when we can switch back.
- *
- * This seems to apply only for ALU->ALU dependencies as other instructions have better
- * integration with the frontend.
- *
- * Note that if we do not emit s_delay_alu things will still be correct, but the wave
- * will stall in the ALU (and the ALU will be doing nothing else). We'll use this as
- * I'm pretty sure our cycle info is wrong at times (necessarily so, e.g. wave64 VALU
- * instructions can take a different number of cycles based on the exec mask)
- */
-struct alu_delay_info {
-   /* These are the values directly above the max representable value, i.e. the wait
-    * would turn into a no-op when we try to wait for something further back than
-    * this.
-    */
-   static constexpr int8_t valu_nop = 5;
-   static constexpr int8_t trans_nop = 4;
-
-   /* How many VALU instructions ago this value was written */
-   int8_t valu_instrs = valu_nop;
-   /* Cycles until the writing VALU instruction is finished */
-   int8_t valu_cycles = 0;
-
-   /* How many Transcedent instructions ago this value was written */
-   int8_t trans_instrs = trans_nop;
-   /* Cycles until the writing Transcendent instruction is finished */
-   int8_t trans_cycles = 0;
-
-   /* Cycles until the writing SALU instruction is finished*/
-   int8_t salu_cycles = 0;
-
-   bool combine(const alu_delay_info& other)
-   {
-      bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs ||
-                     other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles ||
-                     other.trans_cycles > trans_cycles;
-      valu_instrs = std::min(valu_instrs, other.valu_instrs);
-      trans_instrs = std::min(trans_instrs, other.trans_instrs);
-      salu_cycles = std::max(salu_cycles, other.salu_cycles);
-      valu_cycles = std::max(valu_cycles, other.valu_cycles);
-      trans_cycles = std::max(trans_cycles, other.trans_cycles);
-      return changed;
-   }
-
-   /* Needs to be called after any change to keep the data consistent. */
-   void fixup()
-   {
-      if (valu_instrs >= valu_nop || valu_cycles <= 0) {
-         valu_instrs = valu_nop;
-         valu_cycles = 0;
-      }
-
-      if (trans_instrs >= trans_nop || trans_cycles <= 0) {
-         trans_instrs = trans_nop;
-         trans_cycles = 0;
-      }
-
-      salu_cycles = std::max<int8_t>(salu_cycles, 0);
-   }
-
-   /* Returns true if a wait would be a no-op */
-   bool empty() const
-   {
-      return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0;
-   }
-
-   UNUSED void print(FILE* output) const
-   {
-      if (valu_instrs != valu_nop)
-         fprintf(output, "valu_instrs: %u\n", valu_instrs);
-      if (valu_cycles)
-         fprintf(output, "valu_cycles: %u\n", valu_cycles);
-      if (trans_instrs != trans_nop)
-         fprintf(output, "trans_instrs: %u\n", trans_instrs);
-      if (trans_cycles)
-         fprintf(output, "trans_cycles: %u\n", trans_cycles);
-      if (salu_cycles)
-         fprintf(output, "salu_cycles: %u\n", salu_cycles);
-   }
+   num_counters = wait_type_num,
 };
 
 struct wait_entry {
    wait_imm imm;
-   alu_delay_info delay;
    uint32_t events;  /* use wait_event notion */
    uint8_t counters; /* use counter_type notion */
    bool wait_on_read : 1;
    bool logical : 1;
    uint8_t vmem_types : 4; /* use vmem_type notion. for counter_vm. */
 
-   wait_entry(wait_event event_, wait_imm imm_, alu_delay_info delay_, uint8_t counters_,
-              bool logical_, bool wait_on_read_)
-       : imm(imm_), delay(delay_), events(event_), counters(counters_), wait_on_read(wait_on_read_),
+   wait_entry(wait_event event_, wait_imm imm_, uint8_t counters_, bool logical_,
+              bool wait_on_read_)
+       : imm(imm_), events(event_), counters(counters_), wait_on_read(wait_on_read_),
          logical(logical_), vmem_types(0)
    {}
 
@@ -181,20 +91,12 @@ struct wait_entry {
       events |= other.events;
       counters |= other.counters;
       changed |= imm.combine(other.imm);
-      changed |= delay.combine(other.delay);
       wait_on_read |= other.wait_on_read;
       vmem_types |= other.vmem_types;
       logical &= other.logical;
       return changed;
    }
 
-   void remove_alu_counter()
-   {
-      counters &= ~counter_alu;
-      delay = alu_delay_info();
-      events &= ~(event_valu | event_trans | event_salu);
-   }
-
    void remove_wait(wait_type type, uint32_t type_events)
    {
       counters &= ~(1 << type);
@@ -212,7 +114,6 @@ struct wait_entry {
    {
       fprintf(output, "logical: %u\n", logical);
       imm.print(output);
-      delay.print(output);
       if (events)
          fprintf(output, "events: %u\n", events);
       if (counters)
@@ -253,9 +154,6 @@ struct target_info {
          u_foreach_bit (j, events[i])
             counters[j] |= (1 << i);
       }
-      counters[ffs(event_valu) - 1] |= counter_alu;
-      counters[ffs(event_trans) - 1] |= counter_alu;
-      counters[ffs(event_salu) - 1] |= counter_alu;
 
       unordered_events = event_smem | (gfx_level < GFX10 ? event_flat : 0);
    }
@@ -355,7 +253,7 @@ get_vmem_event(wait_ctx& ctx, Instruction* instr, uint8_t type)
 }
 
 void
-check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* instr)
+check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
 {
    for (const Operand op : instr->operands) {
       if (op.isConstant() || op.isUndefined())
@@ -363,14 +261,9 @@ check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* i
 
       /* check consecutively read gprs */
       for (unsigned j = 0; j < op.size(); j++) {
-         PhysReg reg{op.physReg() + j};
-         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
-         if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
-            continue;
-
-         wait.combine(it->second.imm);
-         if (instr->isVALU() || instr->isSALU())
-            delay.combine(it->second.delay);
+         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(PhysReg{op.physReg() + j});
+         if (it != ctx.gpr_map.end() && it->second.wait_on_read)
+            wait.combine(it->second.imm);
       }
    }
 
@@ -405,29 +298,6 @@ check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* i
    }
 }
 
-bool
-parse_delay_alu(wait_ctx& ctx, alu_delay_info& delay, Instruction* instr)
-{
-   if (instr->opcode != aco_opcode::s_delay_alu)
-      return false;
-
-   unsigned imm[2] = {instr->salu().imm & 0xf, (instr->salu().imm >> 7) & 0xf};
-   for (unsigned i = 0; i < 2; ++i) {
-      alu_delay_wait wait = (alu_delay_wait)imm[i];
-      if (wait >= alu_delay_wait::VALU_DEP_1 && wait <= alu_delay_wait::VALU_DEP_4)
-         delay.valu_instrs = imm[i] - (uint32_t)alu_delay_wait::VALU_DEP_1 + 1;
-      else if (wait >= alu_delay_wait::TRANS32_DEP_1 && wait <= alu_delay_wait::TRANS32_DEP_3)
-         delay.trans_instrs = imm[i] - (uint32_t)alu_delay_wait::TRANS32_DEP_1 + 1;
-      else if (wait >= alu_delay_wait::SALU_CYCLE_1)
-         delay.salu_cycles = imm[i] - (uint32_t)alu_delay_wait::SALU_CYCLE_1 + 1;
-   }
-
-   delay.valu_cycles = instr->pass_flags & 0xffff;
-   delay.trans_cycles = instr->pass_flags >> 16;
-
-   return true;
-}
-
 void
 perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, unsigned semantics)
 {
@@ -464,36 +334,7 @@ force_waitcnt(wait_ctx& ctx, wait_imm& imm)
 }
 
 void
-update_alu(wait_ctx& ctx, bool is_valu, bool is_trans, bool clear, int cycles)
-{
-   std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
-   while (it != ctx.gpr_map.end()) {
-      wait_entry& entry = it->second;
-
-      if (clear) {
-         entry.remove_alu_counter();
-      } else {
-         entry.delay.valu_instrs += is_valu ? 1 : 0;
-         entry.delay.trans_instrs += is_trans ? 1 : 0;
-         entry.delay.salu_cycles -= cycles;
-         entry.delay.valu_cycles -= cycles;
-         entry.delay.trans_cycles -= cycles;
-
-         entry.delay.fixup();
-         if (it->second.delay.empty())
-            entry.remove_alu_counter();
-      }
-
-      if (!entry.counters)
-         it = ctx.gpr_map.erase(it);
-      else
-         it++;
-   }
-}
-
-void
-kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx,
-     memory_sync_info sync_info)
+kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
 {
    if (instr->opcode == aco_opcode::s_setpc_b64 || (debug_flags & DEBUG_FORCE_WAITCNT)) {
       /* Force emitting waitcnt states right after the instruction if there is
@@ -522,7 +363,7 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx,
          imm[i] = 0;
    }
 
-   check_instr(ctx, imm, delay, instr);
+   check_instr(ctx, imm, instr);
 
    /* It's required to wait for scalar stores before "writing back" data.
     * It shouldn't cost anything anyways since we're about to do s_endpgm.
@@ -555,7 +396,7 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx,
    else
       perform_barrier(ctx, imm, sync_info, semantic_release);
 
-   if (!imm.empty() || !delay.empty()) {
+   if (!imm.empty()) {
       if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
          imm.vm = 0;
       if (ctx.pending_flat_lgkm && imm.lgkm != wait_imm::unset_counter)
@@ -579,11 +420,6 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx,
             bar_ev &= ~event_flat;
       }
 
-      if (ctx.program->gfx_level >= GFX11) {
-         update_alu(ctx, false, false, false,
-                    MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles));
-      }
-
       /* remove all gprs with higher counter from map */
       std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
       while (it != ctx.gpr_map.end()) {
@@ -591,13 +427,6 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx,
             if (imm[i] != wait_imm::unset_counter && imm[i] <= it->second.imm[i])
                it->second.remove_wait((wait_type)i, ctx.info->events[i]);
          }
-         if (delay.valu_instrs <= it->second.delay.valu_instrs)
-            it->second.delay.valu_instrs = alu_delay_info::valu_nop;
-         if (delay.trans_instrs <= it->second.delay.trans_instrs)
-            it->second.delay.trans_instrs = alu_delay_info::trans_nop;
-         it->second.delay.fixup();
-         if (it->second.delay.empty())
-            it->second.remove_alu_counter();
          if (!it->second.counters)
             it = ctx.gpr_map.erase(it);
          else
@@ -685,26 +514,14 @@ update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync
 
 void
 insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
-                  uint8_t vmem_types = 0, unsigned cycles = 0, bool force_linear = false)
+                  uint8_t vmem_types = 0, bool force_linear = false)
 {
    uint16_t counters = ctx.info->get_counters_for_event(event);
    wait_imm imm;
-   u_foreach_bit (i, counters & wait_counters)
+   u_foreach_bit (i, counters)
       imm[i] = 0;
 
-   alu_delay_info delay;
-   if (event == event_valu) {
-      delay.valu_instrs = 0;
-      delay.valu_cycles = cycles;
-   } else if (event == event_trans) {
-      delay.trans_instrs = 0;
-      delay.trans_cycles = cycles;
-   } else if (event == event_salu) {
-      delay.salu_cycles = cycles;
-   }
-
-   wait_entry new_entry(event, imm, delay, counters, !rc.is_linear() && !force_linear,
-                        wait_on_read);
+   wait_entry new_entry(event, imm, counters, !rc.is_linear() && !force_linear, wait_on_read);
    if (counters & counter_vm)
       new_entry.vmem_types |= vmem_types;
 
@@ -719,49 +536,19 @@ void
 insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, uint8_t vmem_types = 0)
 {
    if (!op.isConstant() && !op.isUndefined())
-      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, vmem_types, 0);
+      insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, vmem_types);
 }
 
 void
-insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0,
-                  unsigned cycles = 0)
+insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0)
 {
    /* We can't safely write to unwritten destination VGPR lanes with DS/VMEM on GFX11 without
     * waiting for the load to finish.
-    * Also, follow linear control flow for ALU because it's unlikely that the hardware does per-lane
-    * dependency checks.
     */
    uint32_t ds_vmem_events = event_lds | event_gds | event_vmem | event_flat;
-   uint32_t alu_events = event_trans | event_valu | event_salu;
-   bool force_linear = ctx.gfx_level >= GFX11 && (event & (ds_vmem_events | alu_events));
+   bool force_linear = ctx.gfx_level >= GFX11 && (event & ds_vmem_events);
 
-   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, cycles,
-                     force_linear);
-}
-
-void
-gen_alu(Instruction* instr, wait_ctx& ctx)
-{
-   Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
-   bool is_valu = instr->isVALU();
-   bool is_trans = instr->isTrans();
-   bool clear = instr->isEXP() || instr->isDS() || instr->isMIMG() || instr->isFlatLike() ||
-                instr->isMUBUF() || instr->isMTBUF();
-
-   wait_event event = (wait_event)0;
-   if (is_trans)
-      event = event_trans;
-   else if (is_valu)
-      event = event_valu;
-   else if (instr->isSALU())
-      event = event_salu;
-
-   if (event != (wait_event)0) {
-      for (const Definition& def : instr->definitions)
-         insert_wait_entry(ctx, def, event, 0, cycle_info.latency);
-   }
-   update_alu(ctx, is_valu && instr_info.classes[(int)instr->opcode] != instr_class::wmma, is_trans,
-              clear, cycle_info.issue_cycles);
+   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, force_linear);
 }
 
 void
@@ -918,34 +705,6 @@ emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wai
    imm = wait_imm();
 }
 
-void
-emit_delay_alu(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
-               alu_delay_info& delay)
-{
-   uint32_t imm = 0;
-   if (delay.trans_instrs != delay.trans_nop) {
-      imm |= (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1;
-   }
-
-   if (delay.valu_instrs != delay.valu_nop) {
-      imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0);
-   }
-
-   /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just
-    * drop the SALU one. Here we use that this doesn't really affect correctness so occasionally
-    * getting this wrong isn't an issue. */
-   if (delay.salu_cycles && imm <= 0xf) {
-      unsigned cycles = std::min<uint8_t>(3, delay.salu_cycles);
-      imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0);
-   }
-
-   Instruction* inst = create_instruction(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
-   inst->salu().imm = imm;
-   inst->pass_flags = (delay.valu_cycles | (delay.trans_cycles << 16));
-   instructions.emplace_back(inst);
-   delay = alu_delay_info();
-}
-
 bool
 check_clause_raw(std::bitset<512>& regs_written, Instruction* instr)
 {
@@ -972,17 +731,15 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
    std::vector<aco_ptr<Instruction>> new_instructions;
 
    wait_imm queued_imm;
-   alu_delay_info queued_delay;
 
    size_t clause_end = 0;
    for (size_t i = 0; i < block.instructions.size(); i++) {
       aco_ptr<Instruction>& instr = block.instructions[i];
 
       bool is_wait = queued_imm.unpack(ctx.gfx_level, instr.get());
-      bool is_delay_alu = parse_delay_alu(ctx, queued_delay, instr.get());
 
       memory_sync_info sync_info = get_sync_info(instr.get());
-      kill(queued_imm, queued_delay, instr.get(), ctx, sync_info);
+      kill(queued_imm, instr.get(), ctx, sync_info);
 
       /* At the start of a possible clause, also emit waitcnts for each instruction to avoid
        * splitting the clause.
@@ -1002,15 +759,13 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
             if (!check_clause_raw(*regs_written, next))
                break;
 
-            kill(queued_imm, queued_delay, next, ctx, get_sync_info(next));
+            kill(queued_imm, next, ctx, get_sync_info(next));
          }
       }
 
-      if (program->gfx_level >= GFX11)
-         gen_alu(instr.get(), ctx);
       gen(instr.get(), ctx);
 
-      if (instr->format != Format::PSEUDO_BARRIER && !is_wait && !is_delay_alu) {
+      if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
          if (instr->isVINTERP_INREG() && queued_imm.exp != wait_imm::unset_counter) {
             instr->vinterp_inreg().wait_exp = MIN2(instr->vinterp_inreg().wait_exp, queued_imm.exp);
             queued_imm.exp = wait_imm::unset_counter;
@@ -1018,8 +773,6 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
 
          if (!queued_imm.empty())
             emit_waitcnt(ctx, new_instructions, queued_imm);
-         if (!queued_delay.empty())
-            emit_delay_alu(ctx, new_instructions, queued_delay);
 
          bool is_ordered_count_acquire =
             instr->opcode == aco_opcode::ds_ordered_count &&
@@ -1041,8 +794,6 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
 
    if (!queued_imm.empty())
       emit_waitcnt(ctx, new_instructions, queued_imm);
-   if (!queued_delay.empty())
-      emit_delay_alu(ctx, new_instructions, queued_delay);
 
    block.instructions.swap(new_instructions);
 }
@@ -1050,7 +801,7 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
 } /* end namespace */
 
 void
-insert_wait_states(Program* program)
+insert_waitcnt(Program* program)
 {
    target_info info(program->gfx_level);
 
@@ -1119,33 +870,6 @@ insert_wait_states(Program* program)
 
       out_ctx[current.index] = std::move(ctx);
    }
-
-   /* Combine s_delay_alu using the skip field. */
-   if (program->gfx_level >= GFX11) {
-      for (Block& block : program->blocks) {
-         int i = 0;
-         int prev_delay_alu = -1;
-         for (aco_ptr<Instruction>& instr : block.instructions) {
-            if (instr->opcode != aco_opcode::s_delay_alu) {
-               block.instructions[i++] = std::move(instr);
-               continue;
-            }
-
-            uint16_t imm = instr->salu().imm;
-            int skip = i - prev_delay_alu - 1;
-            if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) {
-               if (imm >> 7 == 0)
-                  prev_delay_alu = i;
-               block.instructions[i++] = std::move(instr);
-               continue;
-            }
-
-            block.instructions[prev_delay_alu]->salu().imm |= (skip << 4) | (imm << 7);
-            prev_delay_alu = -1;
-         }
-         block.instructions.resize(i);
-      }
-   }
 }
 
 } // namespace aco
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index 79048e3e361..22b538208bb 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -184,8 +184,11 @@ aco_postprocess_shader(const struct aco_compiler_options* options,
    if (!options->optimisations_disabled && !(debug_flags & DEBUG_NO_SCHED_ILP))
       schedule_ilp(program.get());
 
-   /* Insert Waitcnt */
-   insert_wait_states(program.get());
+   insert_waitcnt(program.get());
+   if (program->gfx_level >= GFX11) {
+      insert_delay_alu(program.get());
+      combine_delay_alu(program.get());
+   }
    insert_NOPs(program.get());
 
    if (program->gfx_level >= GFX10)
@@ -315,7 +318,11 @@ aco_compile_rt_prolog(const struct aco_compiler_options* options,
 
    select_rt_prolog(program.get(), &config, options, info, in_args, out_args);
    validate(program.get());
-   insert_wait_states(program.get());
+   insert_waitcnt(program.get());
+   if (program->gfx_level >= GFX11) {
+      insert_delay_alu(program.get());
+      combine_delay_alu(program.get());
+   }
    insert_NOPs(program.get());
    if (program->gfx_level >= GFX10)
       form_hard_clauses(program.get());
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index eedd38e14ad..5100ebe33c7 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2184,7 +2184,9 @@ void schedule_program(Program* program);
 void schedule_ilp(Program* program);
 void schedule_vopd(Program* program);
 void spill(Program* program);
-void insert_wait_states(Program* program);
+void insert_waitcnt(Program* program);
+void insert_delay_alu(Program* program);
+void combine_delay_alu(Program* program);
 bool dealloc_vgprs(Program* program);
 void insert_NOPs(Program* program);
 void form_hard_clauses(Program* program);
diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build
index 857fa8f63cd..ae2d6a41b79 100644
--- a/src/amd/compiler/meson.build
+++ b/src/amd/compiler/meson.build
@@ -42,6 +42,7 @@ libaco_files = files(
   'aco_ir.h',
   'aco_assembler.cpp',
   'aco_form_hard_clauses.cpp',
+  'aco_insert_delay_alu.cpp',
   'aco_insert_exec_mask.cpp',
   'aco_insert_NOPs.cpp',
   'aco_insert_waitcnt.cpp',
diff --git a/src/amd/compiler/tests/helpers.cpp b/src/amd/compiler/tests/helpers.cpp
index 291fc596cea..866c1368804 100644
--- a/src/amd/compiler/tests/helpers.cpp
+++ b/src/amd/compiler/tests/helpers.cpp
@@ -312,7 +312,7 @@ void
 finish_waitcnt_test()
 {
    finish_program(program.get());
-   aco::insert_wait_states(program.get());
+   aco::insert_waitcnt(program.get());
    aco_print_program(program.get(), output);
 }