intel/fs: Fix copy propagation dataflow analysis in presence of force_writemask_all ACP overwrites.

This fixes the behavior of copy propagation in cases where either the
source or destination of an ACP is overwritten elsewhere in the
program by a force_writemask_all instruction, which could cause the
overwrite to be executed for an inactive channel under non-uniform
control flow, causing the current per-channel dataflow propagation to
give incorrect results.  This has been reported in cases like:

> while (true) {
>    x = imageSize(img);
>    if (non_uniform_condition()) {
>       y = x;
>       break;
>    }
> }
> use(y);

Currently the copy propagation pass would propagate copy 'y = x' into
'use(y)', which is invalid since in the example above imageSize() is
implemented as a force_writemask_all SEND message, whose result is
broadcast to all channels, so when a given channel executes 'y = x'
and breaks out of the loop, another divergent channel can execute a
subsequent iteration of the loop overwriting 'x' with a different
value, hence replacing 'y' with 'x' at 'use(y)' changes the behavior
of the program.

This patch extends the global dataflow analysis algorithm to determine
whether there is any control flow path from a given copy to an
overwrite of its source or destination which has force_writemask_all
behavior inconsistent with the copy, and in such case prevents copy
propagation for that ACP entry at any point of the program which can
be reached from the overwrite, even if the copy is statically
re-executed along all such control flow paths (as in the example
above), since the execution of the overwrite for a given channel i may
corrupt other channels j!=i inactive for the subsequently re-executed
copy.

Note that a simpler solution has been attempted which fully shuts down
copy propagation if such a force_writemask_all ACP overwrite is
present /anywhere/ in the program regardless of its location in the
control flow graph, however that led to large shader-db regressions in
some programs from shader-db (like a CS from Car Chase which would
emit 53% more instructions).  With this solution the only handful of
shaders that suffer instruction count regressions seem to be getting
misoptimized right now (e.g. some compute shaders from Deus Ex
Mankind).  This solution doesn't seem to affect the run-time of
shader-db significantly, it's less than 1% higher with the fix
applied.

Reported-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Acked-by: Matt Turner <mattst88@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21351>
This commit is contained in:
Francisco Jerez 2023-02-10 19:33:50 -08:00 committed by Kenneth Graunke
parent 1c1be23497
commit d4015bcb38

View File

@ -90,6 +90,25 @@ struct block_data {
* have a fully uninitialized destination at the end of this block.
*/
BITSET_WORD *undef;
/**
* Which entries in the fs_copy_prop_dataflow acp table can the
* start of this block be reached from. Note that this is a weaker
* condition than livein.
*/
BITSET_WORD *reachin;
/**
* Which entries in the fs_copy_prop_dataflow acp table are
* overwritten by an instruction with channel masks inconsistent
* with the copy instruction (e.g. due to force_writemask_all).
* Such an overwrite can cause the copy entry to become invalid
* even if the copy instruction is subsequently re-executed for any
* given channel i, since the execution of the overwrite for
* channel i may corrupt other channels j!=i inactive for the
* subsequent copy.
*/
BITSET_WORD *exec_mismatch;
};
class fs_copy_prop_dataflow
@ -141,6 +160,8 @@ fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
bd[block->num].copy = rzalloc_array(bd, BITSET_WORD, bitset_words);
bd[block->num].kill = rzalloc_array(bd, BITSET_WORD, bitset_words);
bd[block->num].undef = rzalloc_array(bd, BITSET_WORD, bitset_words);
bd[block->num].reachin = rzalloc_array(bd, BITSET_WORD, bitset_words);
bd[block->num].exec_mismatch = rzalloc_array(bd, BITSET_WORD, bitset_words);
for (int i = 0; i < ACP_HASH_SIZE; i++) {
foreach_in_list(acp_entry, entry, &out_acp[block->num][i]) {
@ -208,8 +229,11 @@ fs_copy_prop_dataflow::setup_initial_values()
unsigned idx = reg_space(inst->dst) & (acp_table_size - 1);
foreach_in_list(acp_entry, entry, &acp_table[idx]) {
if (regions_overlap(inst->dst, inst->size_written,
entry->dst, entry->size_written))
entry->dst, entry->size_written)) {
BITSET_SET(bd[block->num].kill, entry->global_idx);
if (inst->force_writemask_all && !entry->force_writemask_all)
BITSET_SET(bd[block->num].exec_mismatch, entry->global_idx);
}
}
}
}
@ -235,8 +259,11 @@ fs_copy_prop_dataflow::setup_initial_values()
unsigned idx = reg_space(inst->dst) & (acp_table_size - 1);
foreach_in_list(acp_entry, entry, &acp_table[idx]) {
if (regions_overlap(inst->dst, inst->size_written,
entry->src, entry->size_read))
entry->src, entry->size_read)) {
BITSET_SET(bd[block->num].kill, entry->global_idx);
if (inst->force_writemask_all && !entry->force_writemask_all)
BITSET_SET(bd[block->num].exec_mismatch, entry->global_idx);
}
}
}
}
@ -293,6 +320,7 @@ fs_copy_prop_dataflow::run()
for (int i = 0; i < bitset_words; i++) {
const BITSET_WORD old_liveout = bd[block->num].liveout[i];
const BITSET_WORD old_reachin = bd[block->num].reachin[i];
BITSET_WORD livein_from_any_block = 0;
/* Update livein for this block. If a copy is live out of all
@ -310,6 +338,13 @@ fs_copy_prop_dataflow::run()
bd[block->num].livein[i] &= (bd[parent->num].liveout[i] |
bd[parent->num].undef[i]);
livein_from_any_block |= bd[parent->num].liveout[i];
/* Update reachin for this block. If the end of any
* parent block is reachable from the copy, the start
* of this block is reachable from it as well.
*/
bd[block->num].reachin[i] |= (bd[parent->num].reachin[i] |
bd[parent->num].copy[i]);
}
/* Limit to the set of ACP entries that can possibly be available
@ -325,7 +360,44 @@ fs_copy_prop_dataflow::run()
bd[block->num].copy[i] | (bd[block->num].livein[i] &
~bd[block->num].kill[i]);
if (old_liveout != bd[block->num].liveout[i])
if (old_liveout != bd[block->num].liveout[i] ||
old_reachin != bd[block->num].reachin[i])
progress = true;
}
}
} while (progress);
/* Perform a second fixed-point pass in order to propagate the
* exec_mismatch bitsets. Note that this requires an accurate
* value of the reachin bitsets as input, which isn't available
* until the end of the first propagation pass, so this loop cannot
* be folded into the previous one.
*/
do {
progress = false;
foreach_block (block, cfg) {
for (int i = 0; i < bitset_words; i++) {
const BITSET_WORD old_exec_mismatch = bd[block->num].exec_mismatch[i];
/* Update exec_mismatch for this block. If the end of a
* parent block is reachable by an overwrite with
* inconsistent execution masking, the start of this block
* is reachable by such an overwrite as well.
*/
foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
bblock_t *parent = parent_link->block;
bd[block->num].exec_mismatch[i] |= (bd[parent->num].exec_mismatch[i] &
bd[parent->num].reachin[i]);
}
/* Only consider overwrites with inconsistent execution
* masking if they are reachable from the copy, since
* overwrites unreachable from a copy are harmless to that
* copy.
*/
bd[block->num].exec_mismatch[i] &= bd[block->num].reachin[i];
if (old_exec_mismatch != bd[block->num].exec_mismatch[i])
progress = true;
}
}
@ -1182,7 +1254,8 @@ fs_visitor::opt_copy_propagation()
exec_list in_acp[ACP_HASH_SIZE];
for (int i = 0; i < dataflow.num_acp; i++) {
if (BITSET_TEST(dataflow.bd[block->num].livein, i)) {
if (BITSET_TEST(dataflow.bd[block->num].livein, i) &&
!BITSET_TEST(dataflow.bd[block->num].exec_mismatch, i)) {
struct acp_entry *entry = dataflow.acp[i];
in_acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
}