nir,radeonsi: move ffma fusing to late optimizations for better codegen

The freedreno trace changes were suggested by Rob Clark.

ALU performance is higher because ffma is used more often, but register
usage is also higher, because ternary opcodes (such as ffma) usually need
at least 3 live registers.

54793 shaders in 33659 tests
Totals:
SGPRS: 2639746 -> 2642938 (0.12 %)
VGPRS: 1534120 -> 1536392 (0.15 %)
Spilled SGPRs: 3541 -> 3618 (2.17 %)
Spilled VGPRs: 33 -> 44 (33.33 %)
Scratch size: 292 -> 312 (6.85 %) dwords per thread
Code Size: 55639836 -> 55620116 (-0.04 %) bytes
Max Waves: 964785 -> 963977 (-0.08 %)

Totals from affected shaders:
SGPRS: 1105800 -> 1108992 (0.29 %)
VGPRS: 635292 -> 637564 (0.36 %)
Spilled SGPRs: 3193 -> 3270 (2.41 %)
Spilled VGPRs: 33 -> 44 (33.33 %)
Scratch size: 36 -> 56 (55.56 %) dwords per thread
Code Size: 31568708 -> 31548988 (-0.06 %) bytes
Max Waves: 319991 -> 319183 (-0.25 %)

Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6596>
This commit is contained in:
Marek Olšák 2020-09-04 05:55:25 -04:00 committed by Marge Bot
parent a3512ddfdf
commit 57bf4c2028
4 changed files with 30 additions and 22 deletions

View File

@ -11,12 +11,12 @@ traces:
- path: gputest/furmark.trace
expectations:
- device: freedreno-a630
checksum: de674022e53fc9e0a9eb217f8bf0fe03
checksum: af6e1faf11407a7e7c416f2c532de029
# Note: Requires GL3.3
- path: gputest/gimark.trace
expectations:
- device: freedreno-a630
checksum: 2cae8e2104356e2b3017cbd953cf7b4a
checksum: 47419914b87422b267e20b6981a7eb43
- path: gputest/pixmark-julia-fp32.trace
expectations:
- device: freedreno-a630
@ -37,16 +37,16 @@ traces:
expectations:
# Looks fine, but totally different shape from the rendering on i965.
- device: freedreno-a630
checksum: 86d678c70b8adf27095ace1a6bbfe2d2
checksum: 9ee5a036510be0f506705eacc1516bf3
- path: gputest/plot3d.trace
expectations:
- device: freedreno-a630
checksum: 67a9eb692e694b11107860bbcd47d493
checksum: 42aba3ab943dae2fe952cae1ff91c354
# Note: Requires GL4 for tess.
- path: gputest/tessmark.trace
expectations:
- device: freedreno-a630
checksum: 985e231b58b7dc4b6da34ff32f8ebb82
checksum: 8688b3904b6b2bc591d8b669ecae4d53
- path: gputest/triangle.trace
expectations:
- device: freedreno-a630
@ -149,7 +149,7 @@ traces:
- path: glmark2/effect2d-kernel=1,1,1,1,1;1,1,1,1,1;1,1,1,1,1;.rdc
expectations:
- device: freedreno-a630
checksum: 2346a6597f4d1f20b493e8d6a8f7e592
checksum: 2964d37446db126a5fe462b1ba4542cd
- path: glmark2/function-fragment-complexity=low:fragment-steps=5.rdc
expectations:
# Incorrect rendering, a bunch of the area is uniform gray when it should
@ -215,7 +215,7 @@ traces:
- path: glmark2/shading-shading=gouraud.rdc
expectations:
- device: freedreno-a630
checksum: fcc26fca31375b216382e69bc5f113fb
checksum: bd9058f041bd2d59c039cccdb7d50bf7
- path: glmark2/shading-shading=phong.rdc
# Some speckling on the main specular highlight that may just be
# mediump artifacts
@ -226,11 +226,6 @@ traces:
expectations:
- device: freedreno-a630
checksum: d8b5931669733240797f1acf5d98db25
# Very yellow terrain compared to i965, may just be mediump artifacts.
- path: glmark2/terrain.rdc
expectations:
- device: freedreno-a630
checksum: 114f7dfe97768d9c565a29f656c8f9cf
- path: glmark2/texture-texture-filter=linear.rdc
expectations:
- device: freedreno-a630

View File

@ -33,11 +33,11 @@ traces:
- path: gputest/furmark.trace
expectations:
- device: gl-radeonsi-stoney
checksum: 1c569668d608c644f353caa177d577c6
checksum: d71c0d8e6c46c8f29d1aa8d0ed7d3c87
- path: gputest/pixmark-piano.trace
expectations:
- device: gl-radeonsi-stoney
checksum: a0e1d6358f76666603b08eab383af080
checksum: 777d48e82cabceef6d9489189f91d266
- path: gputest/triangle.trace
expectations:
- device: gl-radeonsi-stoney
@ -153,7 +153,7 @@ traces:
- path: glmark2/shadow.rdc
expectations:
- device: gl-radeonsi-stoney
checksum: 4bf5ca9ce641de1031eb8125d80a3005
checksum: 03dfbf026a0f0ab643e5a6ef19623e81
- path: glmark2/terrain.rdc
expectations:
- device: gl-radeonsi-stoney
@ -173,7 +173,7 @@ traces:
- path: godot/Material Testers.x86_64_2020.04.08_13.38_frame799.rdc
expectations:
- device: gl-radeonsi-stoney
checksum: 5164e238381e7d77a64e3de771cc005f
checksum: 990abd360dc380c95ee2645f8b402d47
- path: gputest/gimark.trace
expectations:
- device: gl-radeonsi-stoney
@ -189,15 +189,15 @@ traces:
- path: gputest/pixmark-piano.trace
expectations:
- device: gl-radeonsi-stoney
checksum: a0e1d6358f76666603b08eab383af080
checksum: 777d48e82cabceef6d9489189f91d266
- path: gputest/pixmark-volplosion.trace
expectations:
- device: gl-radeonsi-stoney
checksum: 2fba173643c014bcfa4b31eb55a514b9
checksum: 708f92a8ac8aef23a4a544cc5ec755d6
- path: gputest/plot3d.trace
expectations:
- device: gl-radeonsi-stoney
checksum: fd367551aa74e2903e0590a893da01a6
checksum: f9e6c1cb70add69cf2a4724800d48b25
- path: gputest/tessmark.trace
expectations:
- device: gl-radeonsi-stoney
@ -229,7 +229,7 @@ traces:
- path: supertuxkart/supertuxkart-antediluvian-abyss.rdc
expectations:
- device: gl-radeonsi-stoney
checksum: 17f4039392a65ad23133cb2cac82dba4
checksum: a2c4c127873f93b7db4ef48ea9fb7689
- path: supertuxkart/supertuxkart-menu.rdc
expectations:
- device: gl-radeonsi-stoney
@ -237,4 +237,4 @@ traces:
- path: supertuxkart/supertuxkart-ravenbridge-mansion.rdc
expectations:
- device: gl-radeonsi-stoney
checksum: 46f08af5c49d711b41d4082f8a5cf6d6
checksum: c8f9eae92c67c7d53db4d69a703e3914

View File

@ -194,7 +194,8 @@ optimizations.extend([
(('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
(('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
(('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
(('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
# When fuse_ffma is set, always lower inexact ffma here (even without lower_ffma), because it will be fused back by late optimizations (nir_opt_algebraic_late).
(('~ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma'),
(('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
@ -2027,6 +2028,7 @@ late_optimizations = [
(('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
(('ineg', a), ('isub', 0, a), 'options->lower_negate'),
(('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),
(('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
# These are duplicated from the main optimizations table. The late
# patterns that rearrange expressions like x - .5 < 0 to x < .5 can create

View File

@ -698,6 +698,17 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
if (changed)
si_nir_opts(nir, false);
/* Run late optimizations to fuse ffma. */
bool more_late_algebraic = true;
while (more_late_algebraic) {
more_late_algebraic = false;
NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
NIR_PASS_V(nir, nir_opt_constant_folding);
NIR_PASS_V(nir, nir_copy_prop);
NIR_PASS_V(nir, nir_opt_dce);
NIR_PASS_V(nir, nir_opt_cse);
}
NIR_PASS_V(nir, nir_lower_bool_to_int32);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);