mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2024-11-30 21:54:16 +08:00
intel/fs: Fix copy propagation dataflow analysis in presence of force_writemask_all ACP overwrites.
This fixes the behavior of copy propagation in cases where either the source or destination of an ACP is overwritten elsewhere in the program by a force_writemask_all instruction, which could cause the overwrite to be executed for an inactive channel under non-uniform control flow, causing the current per-channel dataflow propagation to give incorrect results. This has been reported in cases like: > while (true) { > x = imageSize(img); > if (non_uniform_condition()) { > y = x; > break; > } > } > use(y); Currently the copy propagation pass would propagate copy 'y = x' into 'use(y)', which is invalid since in the example above imageSize() is implemented as a force_writemask_all SEND message, whose result is broadcast to all channels, so when a given channel executes 'y = x' and breaks out of the loop, another divergent channel can execute a subsequent iteration of the loop overwriting 'x' with a different value, hence replacing 'y' with 'x' at 'use(y)' changes the behavior of the program. This patch extends the global dataflow analysis algorithm to determine whether there is any control flow path from a given copy to an overwrite of its source or destination which has force_writemask_all behavior inconsistent with the copy, and in such case prevents copy propagation for that ACP entry at any point of the program which can be reached from the overwrite, even if the copy is statically re-executed along all such control flow paths (as in the example above), since the execution of the overwrite for a given channel i may corrupt other channels j!=i inactive for the subsequently re-executed copy. Note that a simpler solution has been attempted which fully shuts down copy propagation if such a force_writemask_all ACP overwrite is present /anywhere/ in the program regardless of its location in the control flow graph, however that led to large shader-db regressions in some programs from shader-db (like a CS from Car Chase which would emit 53% more instructions). With this solution the only handful of shaders that suffer instruction count regressions seem to be getting misoptimized right now (e.g. some compute shaders from Deus Ex Mankind). This solution doesn't seem to affect the run-time of shader-db significantly, it's less than 1% higher with the fix applied. Reported-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Acked-by: Matt Turner <mattst88@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21351>
This commit is contained in:
parent
1c1be23497
commit
d4015bcb38
@ -90,6 +90,25 @@ struct block_data {
|
||||
* have a fully uninitialized destination at the end of this block.
|
||||
*/
|
||||
BITSET_WORD *undef;
|
||||
|
||||
/**
|
||||
* Which entries in the fs_copy_prop_dataflow acp table can the
|
||||
* start of this block be reached from. Note that this is a weaker
|
||||
* condition than livein.
|
||||
*/
|
||||
BITSET_WORD *reachin;
|
||||
|
||||
/**
|
||||
* Which entries in the fs_copy_prop_dataflow acp table are
|
||||
* overwritten by an instruction with channel masks inconsistent
|
||||
* with the copy instruction (e.g. due to force_writemask_all).
|
||||
* Such an overwrite can cause the copy entry to become invalid
|
||||
* even if the copy instruction is subsequently re-executed for any
|
||||
* given channel i, since the execution of the overwrite for
|
||||
* channel i may corrupt other channels j!=i inactive for the
|
||||
* subsequent copy.
|
||||
*/
|
||||
BITSET_WORD *exec_mismatch;
|
||||
};
|
||||
|
||||
class fs_copy_prop_dataflow
|
||||
@ -141,6 +160,8 @@ fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
|
||||
bd[block->num].copy = rzalloc_array(bd, BITSET_WORD, bitset_words);
|
||||
bd[block->num].kill = rzalloc_array(bd, BITSET_WORD, bitset_words);
|
||||
bd[block->num].undef = rzalloc_array(bd, BITSET_WORD, bitset_words);
|
||||
bd[block->num].reachin = rzalloc_array(bd, BITSET_WORD, bitset_words);
|
||||
bd[block->num].exec_mismatch = rzalloc_array(bd, BITSET_WORD, bitset_words);
|
||||
|
||||
for (int i = 0; i < ACP_HASH_SIZE; i++) {
|
||||
foreach_in_list(acp_entry, entry, &out_acp[block->num][i]) {
|
||||
@ -208,8 +229,11 @@ fs_copy_prop_dataflow::setup_initial_values()
|
||||
unsigned idx = reg_space(inst->dst) & (acp_table_size - 1);
|
||||
foreach_in_list(acp_entry, entry, &acp_table[idx]) {
|
||||
if (regions_overlap(inst->dst, inst->size_written,
|
||||
entry->dst, entry->size_written))
|
||||
entry->dst, entry->size_written)) {
|
||||
BITSET_SET(bd[block->num].kill, entry->global_idx);
|
||||
if (inst->force_writemask_all && !entry->force_writemask_all)
|
||||
BITSET_SET(bd[block->num].exec_mismatch, entry->global_idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -235,8 +259,11 @@ fs_copy_prop_dataflow::setup_initial_values()
|
||||
unsigned idx = reg_space(inst->dst) & (acp_table_size - 1);
|
||||
foreach_in_list(acp_entry, entry, &acp_table[idx]) {
|
||||
if (regions_overlap(inst->dst, inst->size_written,
|
||||
entry->src, entry->size_read))
|
||||
entry->src, entry->size_read)) {
|
||||
BITSET_SET(bd[block->num].kill, entry->global_idx);
|
||||
if (inst->force_writemask_all && !entry->force_writemask_all)
|
||||
BITSET_SET(bd[block->num].exec_mismatch, entry->global_idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -293,6 +320,7 @@ fs_copy_prop_dataflow::run()
|
||||
|
||||
for (int i = 0; i < bitset_words; i++) {
|
||||
const BITSET_WORD old_liveout = bd[block->num].liveout[i];
|
||||
const BITSET_WORD old_reachin = bd[block->num].reachin[i];
|
||||
BITSET_WORD livein_from_any_block = 0;
|
||||
|
||||
/* Update livein for this block. If a copy is live out of all
|
||||
@ -310,6 +338,13 @@ fs_copy_prop_dataflow::run()
|
||||
bd[block->num].livein[i] &= (bd[parent->num].liveout[i] |
|
||||
bd[parent->num].undef[i]);
|
||||
livein_from_any_block |= bd[parent->num].liveout[i];
|
||||
|
||||
/* Update reachin for this block. If the end of any
|
||||
* parent block is reachable from the copy, the start
|
||||
* of this block is reachable from it as well.
|
||||
*/
|
||||
bd[block->num].reachin[i] |= (bd[parent->num].reachin[i] |
|
||||
bd[parent->num].copy[i]);
|
||||
}
|
||||
|
||||
/* Limit to the set of ACP entries that can possibly be available
|
||||
@ -325,7 +360,44 @@ fs_copy_prop_dataflow::run()
|
||||
bd[block->num].copy[i] | (bd[block->num].livein[i] &
|
||||
~bd[block->num].kill[i]);
|
||||
|
||||
if (old_liveout != bd[block->num].liveout[i])
|
||||
if (old_liveout != bd[block->num].liveout[i] ||
|
||||
old_reachin != bd[block->num].reachin[i])
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
} while (progress);
|
||||
|
||||
/* Perform a second fixed-point pass in order to propagate the
|
||||
* exec_mismatch bitsets. Note that this requires an accurate
|
||||
* value of the reachin bitsets as input, which isn't available
|
||||
* until the end of the first propagation pass, so this loop cannot
|
||||
* be folded into the previous one.
|
||||
*/
|
||||
do {
|
||||
progress = false;
|
||||
|
||||
foreach_block (block, cfg) {
|
||||
for (int i = 0; i < bitset_words; i++) {
|
||||
const BITSET_WORD old_exec_mismatch = bd[block->num].exec_mismatch[i];
|
||||
|
||||
/* Update exec_mismatch for this block. If the end of a
|
||||
* parent block is reachable by an overwrite with
|
||||
* inconsistent execution masking, the start of this block
|
||||
* is reachable by such an overwrite as well.
|
||||
*/
|
||||
foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
|
||||
bblock_t *parent = parent_link->block;
|
||||
bd[block->num].exec_mismatch[i] |= (bd[parent->num].exec_mismatch[i] &
|
||||
bd[parent->num].reachin[i]);
|
||||
}
|
||||
|
||||
/* Only consider overwrites with inconsistent execution
|
||||
* masking if they are reachable from the copy, since
|
||||
* overwrites unreachable from a copy are harmless to that
|
||||
* copy.
|
||||
*/
|
||||
bd[block->num].exec_mismatch[i] &= bd[block->num].reachin[i];
|
||||
if (old_exec_mismatch != bd[block->num].exec_mismatch[i])
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
@ -1182,7 +1254,8 @@ fs_visitor::opt_copy_propagation()
|
||||
exec_list in_acp[ACP_HASH_SIZE];
|
||||
|
||||
for (int i = 0; i < dataflow.num_acp; i++) {
|
||||
if (BITSET_TEST(dataflow.bd[block->num].livein, i)) {
|
||||
if (BITSET_TEST(dataflow.bd[block->num].livein, i) &&
|
||||
!BITSET_TEST(dataflow.bd[block->num].exec_mismatch, i)) {
|
||||
struct acp_entry *entry = dataflow.acp[i];
|
||||
in_acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user