mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2024-11-27 12:14:10 +08:00
ir3/ra: Add specialized shared register RA/spilling
There are two problems with shared register allocation at the moment: 1. We weren't modelling physical edges correctly, and once we do, the current hack in RA for handling them won't work correctly. This means live-range splitting doesn't work. I've tried various strategies but none of them seems to fix this. 2. Spilling of shared registers to non-shared registers isn't implemented. Spilling of shared regs is significantly simpler than spilling non-shared regs, because (1) spilling and unspilling is significantly cheaper, just a single mov, and (2) we can swap "stack slots" (actually non-shared regs) so all the complexity of parallel copy handling isn't necessary. This means that it's much easier to integrate RA and spilling, while still using the tree-scan framework, so that we can spill instead of splitting live ranges. The other issue, of phi nodes with physical edges, we can handle by spilling those phis earlier. For this to work, we need to accurately insert physical edges based on divergence analysis or else every phi node would involve physical edges, which later commits will accomplish. This commit adds a shared register allocation pass which is a severely-cut-down version of RA and spilling. Everything to do with live range splitting is cut from RA, everything to do with parallel copy handling is cut from spilling, and we simply always spill as soon as we encounter a case where it's necessary. This could be improved, especially the spilling strategy, but for now it keeps the pass simple and cuts down on code duplication. Unfortunately there's still some shared boilerplate with regular RA which seems unavoidable however. The new RA requires us to redo liveness information, which is significantly expensive, so we keep the ability of the old RA to handle shared registers and only use the new RA when it may be required: either something potentially requiring live-range splitting, or a too-high shared register limit. 
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22072>
This commit is contained in:
parent
f977501a7c
commit
fa22b0901a
@ -335,7 +335,14 @@ typedef enum ir3_instruction_flags {
|
||||
* before register assignment is done:
|
||||
*/
|
||||
IR3_INSTR_MARK = BIT(15),
|
||||
IR3_INSTR_UNUSED = BIT(16),
|
||||
|
||||
/* Used by shared register allocation when creating spill/reload instructions
|
||||
* to inform validation that this is created by RA. This also may be set on
|
||||
* an instruction where a spill has been folded into it.
|
||||
*/
|
||||
IR3_INSTR_SHARED_SPILL = IR3_INSTR_MARK,
|
||||
|
||||
IR3_INSTR_UNUSED = BIT(17),
|
||||
} ir3_instruction_flags;
|
||||
|
||||
struct ir3_instruction {
|
||||
|
@ -377,6 +377,8 @@ static void
|
||||
aggressive_coalesce_split(struct ir3_liveness *live,
|
||||
struct ir3_instruction *split)
|
||||
{
|
||||
if (!(split->dsts[0]->flags & IR3_REG_SSA))
|
||||
return;
|
||||
try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
|
||||
split->split.off * reg_elem_size(split->dsts[0]));
|
||||
}
|
||||
@ -409,6 +411,10 @@ create_parallel_copy(struct ir3_block *block)
|
||||
if (phi->opc != OPC_META_PHI)
|
||||
break;
|
||||
|
||||
/* Avoid phis we've already colored */
|
||||
if (!(phi->dsts[0]->flags & IR3_REG_SSA))
|
||||
continue;
|
||||
|
||||
/* Avoid undef */
|
||||
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
|
||||
!phi->srcs[pred_idx]->def)
|
||||
@ -430,6 +436,8 @@ create_parallel_copy(struct ir3_block *block)
|
||||
foreach_instr (phi, &succ->instr_list) {
|
||||
if (phi->opc != OPC_META_PHI)
|
||||
break;
|
||||
if (!(phi->dsts[0]->flags & IR3_REG_SSA))
|
||||
continue;
|
||||
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
|
||||
!phi->srcs[pred_idx]->def)
|
||||
continue;
|
||||
@ -456,6 +464,8 @@ create_parallel_copy(struct ir3_block *block)
|
||||
foreach_instr (phi, &succ->instr_list) {
|
||||
if (phi->opc != OPC_META_PHI)
|
||||
break;
|
||||
if (!(phi->dsts[0]->flags & IR3_REG_SSA))
|
||||
continue;
|
||||
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
|
||||
!phi->srcs[pred_idx]->def)
|
||||
continue;
|
||||
|
@ -193,6 +193,8 @@ void
|
||||
ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
|
||||
struct ir3_reg_interval *interval)
|
||||
{
|
||||
assert(interval->inserted);
|
||||
|
||||
if (interval->parent) {
|
||||
rb_tree_remove(&interval->parent->children, &interval->node);
|
||||
} else {
|
||||
@ -684,6 +686,8 @@ ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
|
||||
struct ra_interval *interval)
|
||||
{
|
||||
assert(!interval->interval.parent);
|
||||
/* shared live splitting is not allowed! */
|
||||
assert(!(interval->interval.reg->flags & IR3_REG_SHARED));
|
||||
|
||||
/* Check if we've already moved this reg before */
|
||||
unsigned pcopy_index;
|
||||
@ -1665,6 +1669,9 @@ handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
|
||||
struct ir3_register *dst = instr->dsts[0];
|
||||
struct ir3_register *src = instr->srcs[0];
|
||||
|
||||
if (!(dst->flags & IR3_REG_SSA))
|
||||
return;
|
||||
|
||||
if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
|
||||
handle_normal_instr(ctx, instr);
|
||||
return;
|
||||
@ -1683,6 +1690,9 @@ handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
|
||||
static void
|
||||
handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
if (!(instr->dsts[0]->flags & IR3_REG_SSA))
|
||||
return;
|
||||
|
||||
struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
|
||||
unsigned dst_offset = instr->dsts[0]->merge_set_offset;
|
||||
|
||||
@ -1798,7 +1808,8 @@ handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
|
||||
static void
|
||||
handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
if (instr->dsts[0]->num == INVALID_REG)
|
||||
if (instr->dsts[0]->num == INVALID_REG ||
|
||||
!(instr->dsts[0]->flags & IR3_REG_SSA))
|
||||
return;
|
||||
|
||||
struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
|
||||
@ -1829,6 +1840,9 @@ handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
|
||||
static void
|
||||
assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
if (!(instr->dsts[0]->flags & IR3_REG_SSA))
|
||||
return;
|
||||
|
||||
struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
|
||||
struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
|
||||
|
||||
@ -1973,6 +1987,9 @@ handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
|
||||
static void
|
||||
handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
|
||||
{
|
||||
if (!(def->flags & IR3_REG_SSA))
|
||||
return;
|
||||
|
||||
struct ra_file *file = ra_get_file(ctx, def);
|
||||
struct ra_interval *interval = &ctx->intervals[def->name];
|
||||
|
||||
@ -1999,6 +2016,9 @@ handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
|
||||
static void
|
||||
assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
|
||||
{
|
||||
if (!(phi->dsts[0]->flags & IR3_REG_SSA))
|
||||
return;
|
||||
|
||||
struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
|
||||
struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
|
||||
assert(!interval->interval.parent);
|
||||
@ -2085,15 +2105,8 @@ insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
|
||||
{
|
||||
physreg_t physreg = ra_interval_get_physreg(interval);
|
||||
|
||||
bool shared = interval->interval.reg->flags & IR3_REG_SHARED;
|
||||
struct ir3_block **predecessors =
|
||||
shared ? ctx->block->physical_predecessors : ctx->block->predecessors;
|
||||
unsigned predecessors_count = shared
|
||||
? ctx->block->physical_predecessors_count
|
||||
: ctx->block->predecessors_count;
|
||||
|
||||
for (unsigned i = 0; i < predecessors_count; i++) {
|
||||
struct ir3_block *pred = predecessors[i];
|
||||
for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
|
||||
struct ir3_block *pred = ctx->block->predecessors[i];
|
||||
struct ra_block_state *pred_state = &ctx->blocks[pred->index];
|
||||
|
||||
if (!pred_state->visited)
|
||||
@ -2101,28 +2114,8 @@ insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
|
||||
|
||||
physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
|
||||
if (pred_reg != physreg) {
|
||||
assert(!(interval->interval.reg->flags & IR3_REG_SHARED));
|
||||
insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
|
||||
|
||||
/* This is a bit tricky, but when visiting the destination of a
|
||||
* physical-only edge, we have two predecessors (the if and the
|
||||
* header block) and both have multiple successors. We pick the
|
||||
* register for all live-ins from the normal edge, which should
|
||||
* guarantee that there's no need for shuffling things around in
|
||||
* the normal predecessor as long as there are no phi nodes, but
|
||||
* we still may need to insert fixup code in the physical
|
||||
* predecessor (i.e. the last block of the if) and that has
|
||||
* another successor (the block after the if) so we need to update
|
||||
* the renames state for when we process the other successor. This
|
||||
* crucially depends on the other successor getting processed
|
||||
* after this.
|
||||
*
|
||||
* For normal (non-physical) edges we disallow critical edges so
|
||||
* that hacks like this aren't necessary.
|
||||
*/
|
||||
if (!pred_state->renames)
|
||||
pred_state->renames = _mesa_pointer_hash_table_create(ctx);
|
||||
_mesa_hash_table_insert(pred_state->renames, interval->interval.reg,
|
||||
(void *)(uintptr_t)physreg);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2561,6 +2554,18 @@ ir3_ra(struct ir3_shader_variant *v)
|
||||
|
||||
ir3_merge_regs(live, v->ir);
|
||||
|
||||
bool has_shared_vectors = false;
|
||||
foreach_block (block, &v->ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
ra_foreach_dst (dst, instr) {
|
||||
if ((dst->flags & IR3_REG_SHARED) && reg_elems(dst) > 1) {
|
||||
has_shared_vectors = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ir3_pressure max_pressure;
|
||||
ir3_calc_pressure(v, live, &max_pressure);
|
||||
d("max pressure:");
|
||||
@ -2590,10 +2595,17 @@ ir3_ra(struct ir3_shader_variant *v)
|
||||
if (ir3_shader_debug & IR3_DBG_SPILLALL)
|
||||
calc_min_limit_pressure(v, live, &limit_pressure);
|
||||
|
||||
if (max_pressure.shared > limit_pressure.shared) {
|
||||
/* TODO shared reg -> normal reg spilling */
|
||||
d("shared max pressure exceeded!");
|
||||
goto fail;
|
||||
if (max_pressure.shared > limit_pressure.shared || has_shared_vectors) {
|
||||
ir3_ra_shared(v, live);
|
||||
|
||||
/* Recalculate liveness and register pressure now that additional values
|
||||
* have been added.
|
||||
*/
|
||||
ralloc_free(live);
|
||||
live = ir3_calc_liveness(ctx, v->ir);
|
||||
ir3_calc_pressure(v, live, &max_pressure);
|
||||
|
||||
ir3_debug_print(v->ir, "AFTER: shared register allocation");
|
||||
}
|
||||
|
||||
bool spilled = false;
|
||||
@ -2629,7 +2641,7 @@ ir3_ra(struct ir3_shader_variant *v)
|
||||
foreach_block (block, &v->ir->block_list)
|
||||
handle_block(ctx, block);
|
||||
|
||||
ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);
|
||||
ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count, false);
|
||||
|
||||
/* Strip array-ness and SSA-ness at the end, because various helpers still
|
||||
* need to work even on definitions that have already been assigned. For
|
||||
|
@ -168,8 +168,10 @@ bool ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v,
|
||||
|
||||
bool ir3_lower_spill(struct ir3 *ir);
|
||||
|
||||
void ir3_ra_shared(struct ir3_shader_variant *v, struct ir3_liveness *live);
|
||||
|
||||
void ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size,
|
||||
unsigned half_size, unsigned block_count);
|
||||
unsigned half_size, unsigned block_count, bool shared_ra);
|
||||
|
||||
void ir3_lower_copies(struct ir3_shader_variant *v);
|
||||
|
||||
|
@ -92,13 +92,25 @@ struct reaching_state {
|
||||
struct ra_val_ctx {
|
||||
struct ir3_instruction *current_instr;
|
||||
|
||||
/* The current state of the dataflow analysis for the instruction we're
|
||||
* processing.
|
||||
*/
|
||||
struct reaching_state reaching;
|
||||
|
||||
/* The state at the end of each basic block. */
|
||||
struct reaching_state *block_reaching;
|
||||
unsigned block_count;
|
||||
|
||||
/* When validating shared RA, we have to take spill/reload instructions into
|
||||
* account. This saves an array of reg_state for the source of each spill
|
||||
* instruction, to be restored at the corresponding reload(s).
|
||||
*/
|
||||
struct hash_table *spill_reaching;
|
||||
|
||||
unsigned full_size, half_size;
|
||||
|
||||
bool merged_regs;
|
||||
bool shared_ra;
|
||||
|
||||
bool failed;
|
||||
};
|
||||
@ -130,6 +142,28 @@ get_file_size(struct ra_val_ctx *ctx, struct ir3_register *reg)
|
||||
return ctx->half_size;
|
||||
}
|
||||
|
||||
static struct reg_state *
|
||||
get_spill_state(struct ra_val_ctx *ctx, struct ir3_register *dst)
|
||||
{
|
||||
struct hash_entry *entry = _mesa_hash_table_search(ctx->spill_reaching, dst);
|
||||
if (entry)
|
||||
return entry->data;
|
||||
else
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct reg_state *
|
||||
get_or_create_spill_state(struct ra_val_ctx *ctx, struct ir3_register *dst)
|
||||
{
|
||||
struct reg_state *state = get_spill_state(ctx, dst);
|
||||
if (state)
|
||||
return state;
|
||||
|
||||
state = rzalloc_array(ctx, struct reg_state, reg_size(dst));
|
||||
_mesa_hash_table_insert(ctx->spill_reaching, dst, state);
|
||||
return state;
|
||||
}
|
||||
|
||||
/* Validate simple things, like the registers being in-bounds. This way we
|
||||
* don't have to worry about out-of-bounds accesses later.
|
||||
*/
|
||||
@ -139,6 +173,8 @@ validate_simple(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
ctx->current_instr = instr;
|
||||
ra_foreach_dst (dst, instr) {
|
||||
if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED))
|
||||
continue;
|
||||
unsigned dst_max = ra_reg_get_physreg(dst) + reg_size(dst);
|
||||
validate_assert(ctx, dst_max <= get_file_size(ctx, dst));
|
||||
if (dst->tied)
|
||||
@ -146,6 +182,8 @@ validate_simple(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
|
||||
}
|
||||
|
||||
ra_foreach_src (src, instr) {
|
||||
if (ctx->shared_ra && !(src->flags & IR3_REG_SHARED))
|
||||
continue;
|
||||
unsigned src_max = ra_reg_get_physreg(src) + reg_size(src);
|
||||
validate_assert(ctx, src_max <= get_file_size(ctx, src));
|
||||
}
|
||||
@ -219,6 +257,24 @@ static void
|
||||
propagate_normal_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
ra_foreach_dst (dst, instr) {
|
||||
/* Process destinations from scalar ALU instructions that were demoted to
|
||||
* normal ALU instructions. For these we must treat the instruction as a
|
||||
* spill of itself and set the propagate state to itself. See
|
||||
* try_demote_instructions().
|
||||
*/
|
||||
if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) {
|
||||
if (instr->flags & IR3_INSTR_SHARED_SPILL) {
|
||||
struct reg_state *state = get_or_create_spill_state(ctx, dst);
|
||||
for (unsigned i = 0; i < reg_size(dst); i++) {
|
||||
state[i] = (struct reg_state){
|
||||
.def = dst,
|
||||
.offset = i,
|
||||
};
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
physreg_t physreg = ra_reg_get_physreg(dst);
|
||||
for (unsigned i = 0; i < reg_size(dst); i++) {
|
||||
@ -239,6 +295,16 @@ propagate_split(struct ra_val_ctx *ctx, struct ir3_instruction *split)
|
||||
physreg_t src_physreg = ra_reg_get_physreg(src);
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
|
||||
if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) {
|
||||
struct reg_state *src_state = get_spill_state(ctx, src->def);
|
||||
if (src_state) {
|
||||
struct reg_state *dst_state = get_or_create_spill_state(ctx, dst);
|
||||
memcpy(dst_state, &src_state[split->split.off * reg_elem_size(src)],
|
||||
reg_size(dst) * sizeof(struct reg_state));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned offset = split->split.off * reg_elem_size(src);
|
||||
for (unsigned i = 0; i < reg_elem_size(src); i++) {
|
||||
file->regs[dst_physreg + i] = file->regs[src_physreg + offset + i];
|
||||
@ -249,30 +315,50 @@ static void
|
||||
propagate_collect(struct ra_val_ctx *ctx, struct ir3_instruction *collect)
|
||||
{
|
||||
struct ir3_register *dst = collect->dsts[0];
|
||||
physreg_t dst_physreg = ra_reg_get_physreg(dst);
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
|
||||
unsigned size = reg_size(dst);
|
||||
struct reg_state srcs[size];
|
||||
|
||||
for (unsigned i = 0; i < collect->srcs_count; i++) {
|
||||
struct ir3_register *src = collect->srcs[i];
|
||||
unsigned dst_offset = i * reg_elem_size(dst);
|
||||
for (unsigned j = 0; j < reg_elem_size(dst); j++) {
|
||||
if (!ra_reg_is_src(src)) {
|
||||
srcs[dst_offset + j] = (struct reg_state){
|
||||
.def = dst,
|
||||
.offset = dst_offset + j,
|
||||
};
|
||||
} else {
|
||||
physreg_t src_physreg = ra_reg_get_physreg(src);
|
||||
srcs[dst_offset + j] = file->regs[src_physreg + j];
|
||||
if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) {
|
||||
struct reg_state *dst_state = NULL;
|
||||
|
||||
for (unsigned i = 0; i < collect->srcs_count; i++) {
|
||||
struct ir3_register *src = collect->srcs[i];
|
||||
unsigned dst_offset = i * reg_elem_size(dst);
|
||||
|
||||
if (ra_reg_is_src(src)) {
|
||||
struct reg_state *src_state = get_spill_state(ctx, src->def);
|
||||
if (src_state) {
|
||||
if (!dst_state)
|
||||
dst_state = get_or_create_spill_state(ctx, dst);
|
||||
memcpy(&dst_state[dst_offset], src_state,
|
||||
reg_size(src) * sizeof(struct reg_state));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
physreg_t dst_physreg = ra_reg_get_physreg(dst);
|
||||
struct reg_state srcs[size];
|
||||
|
||||
for (unsigned i = 0; i < size; i++)
|
||||
file->regs[dst_physreg + i] = srcs[i];
|
||||
for (unsigned i = 0; i < collect->srcs_count; i++) {
|
||||
struct ir3_register *src = collect->srcs[i];
|
||||
unsigned dst_offset = i * reg_elem_size(dst);
|
||||
|
||||
for (unsigned j = 0; j < reg_elem_size(dst); j++) {
|
||||
if (!ra_reg_is_src(src)) {
|
||||
srcs[dst_offset + j] = (struct reg_state){
|
||||
.def = dst,
|
||||
.offset = dst_offset + j,
|
||||
};
|
||||
} else {
|
||||
physreg_t src_physreg = ra_reg_get_physreg(src);
|
||||
srcs[dst_offset + j] = file->regs[src_physreg + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < size; i++)
|
||||
file->regs[dst_physreg + i] = srcs[i];
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
@ -291,15 +377,25 @@ propagate_parallelcopy(struct ra_val_ctx *ctx, struct ir3_instruction *pcopy)
|
||||
struct ir3_register *src = pcopy->srcs[i];
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
|
||||
for (unsigned j = 0; j < reg_size(dst); j++) {
|
||||
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) {
|
||||
srcs[offset + j] = (struct reg_state){
|
||||
.def = dst,
|
||||
.offset = j,
|
||||
};
|
||||
} else {
|
||||
physreg_t src_physreg = ra_reg_get_physreg(src);
|
||||
srcs[offset + j] = file->regs[src_physreg + j];
|
||||
if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) {
|
||||
if (ra_reg_is_src(src)) {
|
||||
struct reg_state *src_state = get_spill_state(ctx, src->def);
|
||||
if (src_state) {
|
||||
struct reg_state *dst_state = get_or_create_spill_state(ctx, dst);
|
||||
memcpy(dst_state, src_state, reg_size(dst) * sizeof(struct reg_state));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (unsigned j = 0; j < reg_size(dst); j++) {
|
||||
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) {
|
||||
srcs[offset + j] = (struct reg_state){
|
||||
.def = dst,
|
||||
.offset = j,
|
||||
};
|
||||
} else {
|
||||
physreg_t src_physreg = ra_reg_get_physreg(src);
|
||||
srcs[offset + j] = file->regs[src_physreg + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -310,6 +406,12 @@ propagate_parallelcopy(struct ra_val_ctx *ctx, struct ir3_instruction *pcopy)
|
||||
offset = 0;
|
||||
for (unsigned i = 0; i < pcopy->dsts_count; i++) {
|
||||
struct ir3_register *dst = pcopy->dsts[i];
|
||||
|
||||
if (ctx->shared_ra && !(dst->flags & IR3_REG_SHARED)) {
|
||||
offset += reg_size(dst);
|
||||
continue;
|
||||
}
|
||||
|
||||
physreg_t dst_physreg = ra_reg_get_physreg(dst);
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
|
||||
@ -321,6 +423,23 @@ propagate_parallelcopy(struct ra_val_ctx *ctx, struct ir3_instruction *pcopy)
|
||||
assert(offset == size);
|
||||
}
|
||||
|
||||
static void
|
||||
propagate_spill(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
if (instr->srcs[0]->flags & IR3_REG_SHARED) { /* spill */
|
||||
struct reg_state *state = get_or_create_spill_state(ctx, instr->dsts[0]);
|
||||
physreg_t src_physreg = ra_reg_get_physreg(instr->srcs[0]);
|
||||
memcpy(state, &ctx->reaching.shared.regs[src_physreg],
|
||||
reg_size(instr->srcs[0]) * sizeof(struct reg_state));
|
||||
} else { /* reload */
|
||||
struct reg_state *state = get_spill_state(ctx, instr->srcs[0]->def);
|
||||
assert(state);
|
||||
physreg_t dst_physreg = ra_reg_get_physreg(instr->dsts[0]);
|
||||
memcpy(&ctx->reaching.shared.regs[dst_physreg], state,
|
||||
reg_size(instr->dsts[0]) * sizeof(struct reg_state));
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
propagate_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
@ -330,6 +449,13 @@ propagate_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
|
||||
propagate_collect(ctx, instr);
|
||||
else if (instr->opc == OPC_META_PARALLEL_COPY)
|
||||
propagate_parallelcopy(ctx, instr);
|
||||
else if (ctx->shared_ra && instr->opc == OPC_MOV &&
|
||||
/* Moves from immed/const with IR3_INSTR_SHARED_SPILL were demoted
|
||||
* from scalar ALU, see try_demote_instruction().
|
||||
*/
|
||||
!(instr->srcs[0]->flags & (IR3_REG_IMMED | IR3_REG_CONST)) &&
|
||||
(instr->flags & IR3_INSTR_SHARED_SPILL))
|
||||
propagate_spill(ctx, instr);
|
||||
else
|
||||
propagate_normal_instr(ctx, instr);
|
||||
}
|
||||
@ -439,6 +565,8 @@ static void
|
||||
check_reaching_src(struct ra_val_ctx *ctx, struct ir3_instruction *instr,
|
||||
struct ir3_register *src)
|
||||
{
|
||||
if (ctx->shared_ra && !(src->flags & IR3_REG_SHARED))
|
||||
return;
|
||||
struct file_state *file = ra_val_get_file(ctx, src);
|
||||
physreg_t physreg = ra_reg_get_physreg(src);
|
||||
for (unsigned i = 0; i < reg_size(src); i++) {
|
||||
@ -541,7 +669,7 @@ check_reaching_defs(struct ra_val_ctx *ctx, struct ir3 *ir)
|
||||
|
||||
void
|
||||
ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size,
|
||||
unsigned half_size, unsigned block_count)
|
||||
unsigned half_size, unsigned block_count, bool shared_ra)
|
||||
{
|
||||
#ifdef NDEBUG
|
||||
#define VALIDATE 0
|
||||
@ -557,6 +685,9 @@ ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size,
|
||||
ctx->full_size = full_size;
|
||||
ctx->half_size = half_size;
|
||||
ctx->block_count = block_count;
|
||||
ctx->shared_ra = shared_ra;
|
||||
if (ctx->shared_ra)
|
||||
ctx->spill_reaching = _mesa_pointer_hash_table_create(ctx);
|
||||
|
||||
foreach_block (block, &v->ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
|
1415
src/freedreno/ir3/ir3_shared_ra.c
Normal file
1415
src/freedreno/ir3/ir3_shared_ra.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -1193,20 +1193,23 @@ is_last_pcopy_src(struct ir3_instruction *pcopy, unsigned src_n)
|
||||
static void
|
||||
handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
|
||||
{
|
||||
foreach_dst (dst, pcopy) {
|
||||
ra_foreach_dst (dst, pcopy) {
|
||||
struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
|
||||
ra_spill_interval_init(dst_interval, dst);
|
||||
}
|
||||
|
||||
foreach_src_n (src, i, pcopy) {
|
||||
d("processing src %u", i);
|
||||
struct ir3_register *dst = pcopy->dsts[i];
|
||||
if (!(dst->flags & IR3_REG_SSA))
|
||||
continue;
|
||||
|
||||
d("processing src %u", i);
|
||||
|
||||
/* Skip the intermediate copy for cases where the source is merged with
|
||||
* the destination. Crucially this means that we also don't reload/spill
|
||||
* it if it's been spilled, because it shares the same spill slot.
|
||||
*/
|
||||
if (src->def && src->def->merge_set &&
|
||||
if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
|
||||
src->def->merge_set == dst->merge_set &&
|
||||
src->def->merge_set_offset == dst->merge_set_offset) {
|
||||
struct ra_spill_interval *src_interval = ctx->intervals[src->def->name];
|
||||
@ -1221,7 +1224,7 @@ handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
|
||||
dst_interval->cant_spill = false;
|
||||
dst_interval->dst = src_interval->dst;
|
||||
}
|
||||
} else if (src->def) {
|
||||
} else if (src->flags & IR3_REG_SSA) {
|
||||
struct ra_spill_interval *temp_interval =
|
||||
create_temp_interval(ctx, dst);
|
||||
struct ir3_register *temp = temp_interval->interval.reg;
|
||||
@ -1251,15 +1254,17 @@ handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
|
||||
|
||||
foreach_src_n (src, i, pcopy) {
|
||||
struct ir3_register *dst = pcopy->dsts[i];
|
||||
if (!(dst->flags & IR3_REG_SSA))
|
||||
continue;
|
||||
|
||||
if (src->def && src->def->merge_set &&
|
||||
if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
|
||||
src->def->merge_set == dst->merge_set &&
|
||||
src->def->merge_set_offset == dst->merge_set_offset)
|
||||
continue;
|
||||
|
||||
struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
|
||||
|
||||
if (!src->def) {
|
||||
if (!(src->flags & IR3_REG_SSA)) {
|
||||
dst_interval->cant_spill = true;
|
||||
ra_spill_ctx_insert(ctx, dst_interval);
|
||||
limit(ctx, pcopy);
|
||||
@ -1292,6 +1297,9 @@ handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
|
||||
static void
|
||||
handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
if (!(instr->dsts[0]->flags & IR3_REG_SSA))
|
||||
return;
|
||||
|
||||
init_dst(ctx, instr->dsts[0]);
|
||||
insert_dst(ctx, instr->dsts[0]);
|
||||
finish_dst(ctx, instr->dsts[0]);
|
||||
@ -1300,6 +1308,9 @@ handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
|
||||
static void
|
||||
remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
if (!(instr->dsts[0]->flags & IR3_REG_SSA))
|
||||
return;
|
||||
|
||||
if (instr->opc == OPC_META_TEX_PREFETCH) {
|
||||
ra_foreach_src (src, instr)
|
||||
remove_src(ctx, instr, src);
|
||||
@ -1623,6 +1634,9 @@ static void
|
||||
rewrite_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *phi,
|
||||
struct ir3_block *block)
|
||||
{
|
||||
if (!(phi->dsts[0]->flags & IR3_REG_SSA))
|
||||
return;
|
||||
|
||||
if (!ctx->intervals[phi->dsts[0]->name]->interval.inserted) {
|
||||
phi->flags |= IR3_INSTR_UNUSED;
|
||||
return;
|
||||
@ -1977,8 +1991,25 @@ cleanup_dead(struct ir3 *ir)
|
||||
{
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
if (instr->flags & IR3_INSTR_UNUSED)
|
||||
list_delinit(&instr->node);
|
||||
if (instr->flags & IR3_INSTR_UNUSED) {
|
||||
if (instr->opc == OPC_META_PARALLEL_COPY) {
|
||||
/* There may be non-SSA shared copies, we need to preserve these.
|
||||
*/
|
||||
for (unsigned i = 0; i < instr->dsts_count;) {
|
||||
if (instr->dsts[i]->flags & IR3_REG_SSA) {
|
||||
instr->dsts[i] = instr->dsts[--instr->dsts_count];
|
||||
instr->srcs[i] = instr->srcs[--instr->srcs_count];
|
||||
} else {
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
if (instr->dsts_count == 0)
|
||||
list_delinit(&instr->node);
|
||||
} else {
|
||||
list_delinit(&instr->node);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -84,6 +84,9 @@ validate_src(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr,
|
||||
validate_assert(ctx, src->wrmask == reg->wrmask);
|
||||
validate_assert(ctx, reg_class_flags(src) == reg_class_flags(reg));
|
||||
|
||||
if (src->flags & IR3_REG_CONST)
|
||||
validate_assert(ctx, !(src->flags & IR3_REG_SHARED));
|
||||
|
||||
if (reg->tied) {
|
||||
validate_assert(ctx, reg->tied->tied == reg);
|
||||
bool found = false;
|
||||
|
@ -112,6 +112,7 @@ libfreedreno_ir3_files = files(
|
||||
'ir3_sched.c',
|
||||
'ir3_shader.c',
|
||||
'ir3_shader.h',
|
||||
'ir3_shared_ra.c',
|
||||
'ir3_spill.c',
|
||||
'ir3_validate.c',
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user