diff --git a/gcc/testsuite/gcc.dg/vect/pr68445.c b/gcc/testsuite/gcc.dg/vect/pr68445.c index 15bffdc7e05..71d61b93bf6 100644 --- a/gcc/testsuite/gcc.dg/vect/pr68445.c +++ b/gcc/testsuite/gcc.dg/vect/pr68445.c @@ -16,4 +16,4 @@ void IMB_double_fast_x (int *destf, int *dest, int y, int *p1f) } } -/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail vect_variable_length } } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail { vect_variable_length && { ! vect_strided8 } } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-1.c b/gcc/testsuite/gcc.dg/vect/slp-1.c index d4a13f12df6..e1a45e1f1a7 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-1.c +++ b/gcc/testsuite/gcc.dg/vect/slp-1.c @@ -122,5 +122,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect" } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target {! vect_strided5 } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_strided5 } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-11b.c b/gcc/testsuite/gcc.dg/vect/slp-11b.c index df64c8db350..0208f03dafb 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-11b.c +++ b/gcc/testsuite/gcc.dg/vect/slp-11b.c @@ -45,4 +45,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_strided4 || vect_perm } && vect_int_mult } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && vect_int_mult } xfail vect_load_lanes } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && vect_int_mult } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-11c.c b/gcc/testsuite/gcc.dg/vect/slp-11c.c index 2e70fca39ba..25d7f2ce383 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-11c.c +++ 
b/gcc/testsuite/gcc.dg/vect/slp-11c.c @@ -45,5 +45,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_uintfloat_cvt && vect_strided2 } && vect_int_mult } } } } */ /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { { vect_uintfloat_cvt && vect_strided2 } && vect_int_mult } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { vect_load_lanes } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target { vect_load_lanes } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-2.c b/gcc/testsuite/gcc.dg/vect/slp-2.c index d0de3577eb6..08d2116c3be 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-2.c +++ b/gcc/testsuite/gcc.dg/vect/slp-2.c @@ -144,5 +144,5 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect" } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_variable_length } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-23.c b/gcc/testsuite/gcc.dg/vect/slp-23.c index 8836acf0330..d32ee5ba73b 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-23.c +++ b/gcc/testsuite/gcc.dg/vect/slp-23.c @@ -114,5 +114,5 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_perm } } } } */ /* SLP fails for the second loop with variable-length SVE because the load size is greater than the minimum vector size. 
*/ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm xfail { { aarch64_sve || riscv_v } && vect_variable_length } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm xfail { aarch64_sve && vect_variable_length } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-33.c b/gcc/testsuite/gcc.dg/vect/slp-33.c index c382093c232..9c6c1e4cbec 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-33.c +++ b/gcc/testsuite/gcc.dg/vect/slp-33.c @@ -108,7 +108,7 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" {target {vect_uintfloat_cvt && vect_int_mult} } } } */ /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" {target {{! { vect_uintfloat_cvt}} && vect_int_mult} } } } */ /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target {{! { vect_uintfloat_cvt}} && {! {vect_int_mult}}} } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target {vect_uintfloat_cvt && vect_int_mult} xfail { vect_variable_length && vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target {vect_uintfloat_cvt && vect_int_mult} } } } */ /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target {{! { vect_uintfloat_cvt}} && vect_int_mult} } } } */ /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target {{! { vect_uintfloat_cvt}} && {! 
{vect_int_mult}}} } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-42.c b/gcc/testsuite/gcc.dg/vect/slp-42.c index 6b78246c2df..53eca6b6648 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-42.c +++ b/gcc/testsuite/gcc.dg/vect/slp-42.c @@ -15,5 +15,5 @@ void foo (int n) } } -/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail vect_variable_length } } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail { vect_variable_length && { ! vect_strided8 } } } } } */ /* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-46.c b/gcc/testsuite/gcc.dg/vect/slp-46.c index bf445473657..b44a673f7de 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-46.c +++ b/gcc/testsuite/gcc.dg/vect/slp-46.c @@ -98,4 +98,4 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_load_lanes } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail { vect_load_lanes && vect_variable_length } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-53.c b/gcc/testsuite/gcc.dg/vect/slp-53.c index d8cd5f85b3c..50b3e9d3cee 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-53.c +++ b/gcc/testsuite/gcc.dg/vect/slp-53.c @@ -12,4 +12,5 @@ void foo (int * __restrict x, int *y) } } -/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } xfail vect_load_lanes } } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } } } } */ +/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target { vect_load_lanes } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-54.c b/gcc/testsuite/gcc.dg/vect/slp-54.c index ab66b349d1f..57268ab50b7 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-54.c +++ b/gcc/testsuite/gcc.dg/vect/slp-54.c @@ -15,4 +15,4 @@ void foo (int * __restrict x, int *y) } } -/* { dg-final { scan-tree-dump "vectorizing stmts 
using SLP" "vect" { target { vect_int && vect_int_mult } xfail riscv*-*-* } } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-55.c b/gcc/testsuite/gcc.dg/vect/slp-55.c new file mode 100644 index 00000000000..0bf65ef6dc4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-55.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target vect_int_mult } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +void foo (int * __restrict a, int *b, int *c) +{ + for (int i = 0; i < 1024; ++i) + { + a[2*i] = b[i] + 7; + a[2*i+1] = c[i] * 3; + } +} + +int bar (int *b) +{ + int res = 0; + for (int i = 0; i < 1024; ++i) + { + res += b[2*i] + 7; + res += b[2*i+1] * 3; + } + return res; +} + +void baz (int * __restrict a, int *b) +{ + for (int i = 0; i < 1024; ++i) + { + a[2*i] = b[2*i] + 7; + a[2*i+1] = b[2*i+1] * 3; + } +} + +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */ +/* { dg-final { scan-tree-dump-times "LOAD_LANES" 2 "optimized" { target vect_load_lanes } } } */ +/* { dg-final { scan-tree-dump-times "STORE_LANES" 2 "optimized" { target vect_load_lanes } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-56.c b/gcc/testsuite/gcc.dg/vect/slp-56.c new file mode 100644 index 00000000000..0b985eae55e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-56.c @@ -0,0 +1,51 @@ +#include "tree-vect.h" + +/* This is a load-lane / masked-store-lane test that more reliably + triggers SLP than SVE's mask_struct_store_*.c */ + +void __attribute__ ((noipa)) +test4 (int *__restrict dest, int *__restrict src, + int *__restrict cond, int bias, int n) +{ + for (int i = 0; i < n; ++i) + { + int value0 = src[i * 4] + bias; + int value1 = src[i * 4 + 1] * bias; + int value2 = src[i * 4 + 2] + bias; + int value3 = src[i * 4 + 3] * bias; + if (cond[i]) + { + dest[i * 4] = value0; 
+ dest[i * 4 + 1] = value1; + dest[i * 4 + 2] = value2; + dest[i * 4 + 3] = value3; + } + } +} + +int dest[16*4]; +int src[16*4]; +int cond[16]; +const int dest_chk[16*4] = {0, 0, 0, 0, 9, 25, 11, 35, 0, 0, 0, 0, 17, 65, 19, + 75, 0, 0, 0, 0, 25, 105, 27, 115, 0, 0, 0, 0, 33, 145, 35, 155, 0, 0, 0, + 0, 41, 185, 43, 195, 0, 0, 0, 0, 49, 225, 51, 235, 0, 0, 0, 0, 57, 265, 59, + 275, 0, 0, 0, 0, 65, 305, 67, 315}; + +int main() +{ + check_vect (); +#pragma GCC novector + for (int i = 0; i < 16; ++i) + cond[i] = i & 1; +#pragma GCC novector + for (int i = 0; i < 16 * 4; ++i) + src[i] = i; + test4 (dest, src, cond, 5, 16); +#pragma GCC novector + for (int i = 0; i < 16 * 4; ++i) + if (dest[i] != dest_chk[i]) + abort (); + return 0; +} + +/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target { vect_variable_length && vect_load_lanes } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-cond-1.c b/gcc/testsuite/gcc.dg/vect/slp-cond-1.c index c76ea5d17ef..16ab0cc7605 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-cond-1.c +++ b/gcc/testsuite/gcc.dg/vect/slp-cond-1.c @@ -125,5 +125,4 @@ main () return 0; } -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target { ! 
vect_load_lanes } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target { vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-multitypes-11-big-array.c b/gcc/testsuite/gcc.dg/vect/slp-multitypes-11-big-array.c index 2792b932734..07f871c8972 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-multitypes-11-big-array.c +++ b/gcc/testsuite/gcc.dg/vect/slp-multitypes-11-big-array.c @@ -56,5 +56,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_unpack } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_unpack xfail { vect_variable_length && vect_load_lanes } } } } */ - +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_unpack } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-multitypes-11.c b/gcc/testsuite/gcc.dg/vect/slp-multitypes-11.c index 5c75dc12b69..0f7b479ce59 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-multitypes-11.c +++ b/gcc/testsuite/gcc.dg/vect/slp-multitypes-11.c @@ -51,5 +51,5 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_unpack } } } */ /* The epilogues are vectorized using partial vectors. */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_unpack && { { ! vect_partial_vectors_usage_1 } || s390_vx } } xfail { vect_variable_length && vect_load_lanes } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { { vect_unpack && vect_partial_vectors_usage_1 } && { ! s390_vx } } xfail { vect_variable_length && vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_unpack && { { ! 
vect_partial_vectors_usage_1 } || s390_vx } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { { vect_unpack && vect_partial_vectors_usage_1 } && { ! s390_vx } } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-1.c b/gcc/testsuite/gcc.dg/vect/slp-perm-1.c index dbb107f95fe..93b59075bce 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-1.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-1.c @@ -81,9 +81,8 @@ int main (int argc, const char* argv[]) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int && {! vect_load_lanes } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */ -/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int || vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump "can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */ /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */ /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-10.c b/gcc/testsuite/gcc.dg/vect/slp-perm-10.c index 03de4c61b50..2cce30c2444 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-10.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-10.c @@ -53,4 +53,4 @@ int main () /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ /* SLP fails for variable-length SVE because the load size is greater than the minimum vector size. 
*/ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm xfail { { aarch64_sve || riscv_v } && vect_variable_length } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm xfail { aarch64_sve && vect_variable_length } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-2.c b/gcc/testsuite/gcc.dg/vect/slp-perm-2.c index 41fd159adce..6ac29e73122 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-2.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-2.c @@ -55,8 +55,6 @@ int main (int argc, const char* argv[]) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && {! vect_load_lanes } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */ -/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm && vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm || vect_load_lanes } } } } */ /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */ /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-3.c b/gcc/testsuite/gcc.dg/vect/slp-perm-3.c index 9ea35ba5afc..d1953054892 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-3.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-3.c @@ -68,9 +68,7 @@ int main (int argc, const char* argv[]) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && {! 
vect_load_lanes } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */ -/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm && vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm || vect_load_lanes } } } } */ /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */ /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-4.c b/gcc/testsuite/gcc.dg/vect/slp-perm-4.c index f4bda39c837..107968f1f7c 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-4.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-4.c @@ -115,4 +115,4 @@ int main (int argc, const char* argv[]) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ /* { dg-final { scan-tree-dump-times "gaps requires scalar epilogue loop" 0 "vect" } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! { vect_load_lanes && vect_strided5 } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-5.c b/gcc/testsuite/gcc.dg/vect/slp-perm-5.c index 7128cf47155..0dedd4a9b86 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-5.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-5.c @@ -105,9 +105,6 @@ int main (int argc, const char* argv[]) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_int && { ! 
vect_load_lanes } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */ -/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_int || vect_load_lanes } } } } */ /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */ /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */ - diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c index 5cc6261d69a..000848c587c 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c @@ -106,5 +106,5 @@ int main (int argc, const char* argv[]) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm3_int } } } */ /* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } xfail *-*-* } } } */ -/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes xfail vect_perm3_int } } } */ -/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes xfail vect_perm3_int } } } */ +/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */ +/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-7.c b/gcc/testsuite/gcc.dg/vect/slp-perm-7.c index df13c37bc75..f15736ef729 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-7.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-7.c @@ -97,8 +97,6 @@ int main (int argc, const char* argv[]) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ -/* { 
dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int && { ! vect_load_lanes } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */ -/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int || vect_load_lanes } } } } */ /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */ /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-8.c b/gcc/testsuite/gcc.dg/vect/slp-perm-8.c index 029be5485b6..7610524f0bf 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-8.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-8.c @@ -61,10 +61,8 @@ int main (int argc, const char* argv[]) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_perm_byte } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_byte && { { ! vect_load_lanes } && { { ! vect_partial_vectors_usage_1 } || s390_vx } } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_byte && { { ! vect_partial_vectors_usage_1 } || s390_vx } } } } } */ /* The epilogues are vectorized using partial vectors. */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_byte && { { ! vect_load_lanes } && { vect_partial_vectors_usage_1 && { ! 
s390_vx } } } } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */ -/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_byte && vect_load_lanes } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_byte && { vect_partial_vectors_usage_1 && { ! s390_vx } } } } } } */ /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */ /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c index 89400fb4565..c9468d81a9d 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c @@ -58,7 +58,5 @@ int main (int argc, const char* argv[]) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_perm_short || vect32 } || vect_load_lanes } } } } */ /* We don't try permutes with a group size of 3 for variable-length vectors. */ -/* { dg-final { scan-tree-dump "permutation requires at least three vectors" "vect" { target { vect_perm_short && { ! vect_perm3_short } } xfail vect_variable_length } } } */ -/* { dg-final { scan-tree-dump-not "permutation requires at least three vectors" "vect" { target vect_perm3_short } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { { ! { vect_perm3_short || vect32 } } || vect_load_lanes } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { { vect_perm3_short || vect32 } && { ! vect_load_lanes } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! 
{ vect_perm3_short || { vect32 || vect_load_lanes } } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_short || { vect32 || vect_load_lanes } } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c index ac562dc475c..0d850720d63 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c +++ b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c @@ -40,5 +40,4 @@ main (void) return 0; } -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_load_lanes } xfail { ! vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { ! vect_hw_misalign } } } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 1fb7bbd4d25..242d5e2d916 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -2958,82 +2958,6 @@ start_over: "unsupported SLP instances\n"); goto again; } - - /* Check whether any load in ALL SLP instances is possibly permuted. */ - slp_tree load_node, slp_root; - unsigned i, x; - slp_instance instance; - bool can_use_lanes = true; - FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance) - { - slp_root = SLP_INSTANCE_TREE (instance); - int group_size = SLP_TREE_LANES (slp_root); - tree vectype = SLP_TREE_VECTYPE (slp_root); - bool loads_permuted = false; - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) - { - if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ()) - continue; - unsigned j; - stmt_vec_info load_info; - FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) - if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j) - { - loads_permuted = true; - break; - } - } - - /* If the loads and stores can be handled with load/store-lane - instructions record it and move on to the next instance. 
*/ - if (loads_permuted - && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store - && vect_store_lanes_supported (vectype, group_size, false) - != IFN_LAST) - { - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node) - if (STMT_VINFO_GROUPED_ACCESS - (SLP_TREE_REPRESENTATIVE (load_node))) - { - stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT - (SLP_TREE_REPRESENTATIVE (load_node)); - /* Use SLP for strided accesses (or if we can't - load-lanes). */ - if (STMT_VINFO_STRIDED_P (stmt_vinfo) - || vect_load_lanes_supported - (STMT_VINFO_VECTYPE (stmt_vinfo), - DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST) - break; - } - - can_use_lanes - = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length (); - - if (can_use_lanes && dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "SLP instance %p can use load/store-lanes\n", - (void *) instance); - } - else - { - can_use_lanes = false; - break; - } - } - - /* If all SLP instances can use load/store-lanes abort SLP and try again - with SLP disabled. */ - if (can_use_lanes) - { - ok = opt_result::failure_at (vect_location, - "Built SLP cancelled: can use " - "load/store-lanes\n"); - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Built SLP cancelled: all SLP instances support " - "load/store-lanes\n"); - goto again; - } } /* Dissolve SLP-only groups. 
*/ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 2304cdac583..5a65a99d61e 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -121,6 +121,7 @@ _slp_tree::_slp_tree () SLP_TREE_SIMD_CLONE_INFO (this) = vNULL; SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def; SLP_TREE_CODE (this) = ERROR_MARK; + this->ldst_lanes = false; SLP_TREE_VECTYPE (this) = NULL_TREE; SLP_TREE_REPRESENTATIVE (this) = NULL; SLP_TREE_REF_COUNT (this) = 1; @@ -3483,7 +3484,8 @@ static bool vect_analyze_slp_instance (vec_info *vinfo, scalar_stmts_to_slp_tree_map_t *bst_map, stmt_vec_info stmt_info, slp_instance_kind kind, - unsigned max_tree_size, unsigned *limit); + unsigned max_tree_size, unsigned *limit, + bool force_single_lane = false); /* Build an interleaving scheme for the store sources RHS_NODES from SCALAR_STMTS. */ @@ -3678,7 +3680,8 @@ vect_build_slp_instance (vec_info *vinfo, unsigned max_tree_size, unsigned *limit, scalar_stmts_to_slp_tree_map_t *bst_map, /* ??? We need stmt_info for group splitting. */ - stmt_vec_info stmt_info_) + stmt_vec_info stmt_info_, + bool force_single_lane = false) { /* If there's no budget left bail out early. */ if (*limit == 0) @@ -3707,9 +3710,17 @@ vect_build_slp_instance (vec_info *vinfo, poly_uint64 max_nunits = 1; unsigned tree_size = 0; unsigned i; - slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, - &max_nunits, matches, limit, - &tree_size, bst_map); + + slp_tree node = NULL; + if (force_single_lane) + { + matches[0] = true; + matches[1] = false; + } + else + node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, + &max_nunits, matches, limit, + &tree_size, bst_map); if (node != NULL) { /* Calculate the unrolling factor based on the smallest type. */ @@ -3905,10 +3916,33 @@ vect_build_slp_instance (vec_info *vinfo, /* For loop vectorization split the RHS into arbitrary pieces of size >= 1. 
*/ else if (is_a (vinfo) - && (i > 0 && i < group_size) - && !vect_slp_prefer_store_lanes_p (vinfo, - stmt_info, group_size, i)) + && (group_size != 1 && i < group_size)) { + /* There are targets that cannot do even/odd interleaving schemes + so they absolutely need to use load/store-lanes. For now + force single-lane SLP for them - they would be happy with + uniform power-of-two lanes (but depending on element size), + but even if we can use 'i' as indicator we would need to + backtrack when later lanes fail to discover with the same + granularity. We cannot turn any of strided or scatter store + into store-lanes. */ + /* ??? If this is not in sync with what get_load_store_type + later decides the SLP representation is not good for other + store vectorization methods. */ + bool want_store_lanes + = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info) + && ! STMT_VINFO_STRIDED_P (stmt_info) + && compare_step_with_zero (vinfo, stmt_info) > 0 + && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, + group_size, 1)); + if (want_store_lanes || force_single_lane) + i = 1; + + /* A fatal discovery fail doesn't always mean single-lane SLP + isn't a possibility, so try. */ + if (i == 0) + i = 1; + if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "Splitting SLP group at stmt %u\n", i); @@ -3942,7 +3976,10 @@ vect_build_slp_instance (vec_info *vinfo, (max_nunits, end - start)); rhs_nodes.safe_push (node); start = end; - end = group_size; + if (want_store_lanes || force_single_lane) + end = start + 1; + else + end = group_size; } else { @@ -3976,7 +4013,31 @@ vect_build_slp_instance (vec_info *vinfo, } /* Now we assume we can build the root SLP node from all stores. */ - node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts); + if (want_store_lanes) + { + /* For store-lanes feed the store node with all RHS nodes + in order. 
*/ + node = vect_create_new_slp_node (scalar_stmts, + SLP_TREE_CHILDREN + (rhs_nodes[0]).length ()); + SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]); + node->ldst_lanes = true; + SLP_TREE_CHILDREN (node) + .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length () + + rhs_nodes.length () - 1); + /* First store value and possibly mask. */ + SLP_TREE_CHILDREN (node) + .splice (SLP_TREE_CHILDREN (rhs_nodes[0])); + /* Rest of the store values. All mask nodes are the same, + this should be guaranteed by dataref group discovery. */ + for (unsigned j = 1; j < rhs_nodes.length (); ++j) + SLP_TREE_CHILDREN (node) + .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]); + for (slp_tree child : SLP_TREE_CHILDREN (node)) + child->refcnt++; + } + else + node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts); while (!rhs_nodes.is_empty ()) vect_free_slp_tree (rhs_nodes.pop ()); @@ -4043,7 +4104,8 @@ vect_analyze_slp_instance (vec_info *vinfo, scalar_stmts_to_slp_tree_map_t *bst_map, stmt_vec_info stmt_info, slp_instance_kind kind, - unsigned max_tree_size, unsigned *limit) + unsigned max_tree_size, unsigned *limit, + bool force_single_lane) { vec scalar_stmts; @@ -4088,7 +4150,7 @@ vect_analyze_slp_instance (vec_info *vinfo, roots, remain, max_tree_size, limit, bst_map, kind == slp_inst_kind_store - ? stmt_info : NULL); + ? stmt_info : NULL, force_single_lane); /* ??? If this is slp_inst_kind_store and the above succeeded here's where we should do store group splitting. */ @@ -4184,12 +4246,50 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo, lower. */ stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]); + unsigned group_lanes = DR_GROUP_SIZE (first); + + /* Verify if all load permutations can be implemented with a suitably + large element load-lanes operation. 
*/ + unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]); + if (STMT_VINFO_STRIDED_P (first) + || compare_step_with_zero (loop_vinfo, first) <= 0 + || exact_log2 (ld_lanes_lanes) == -1 + /* ??? For now only support the single-lane case as there is + missing support on the store-lane side and code generation + isn't up to the task yet. */ + || ld_lanes_lanes != 1 + || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]), + group_lanes / ld_lanes_lanes, + false) == IFN_LAST) + ld_lanes_lanes = 0; + else + /* Verify the loads access the same number of lanes aligned to + ld_lanes_lanes. */ + for (slp_tree load : loads) + { + if (SLP_TREE_LANES (load) != ld_lanes_lanes) + { + ld_lanes_lanes = 0; + break; + } + unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0]; + if (first % ld_lanes_lanes != 0) + { + ld_lanes_lanes = 0; + break; + } + for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i) + if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i) + { + ld_lanes_lanes = 0; + break; + } + } /* Only a power-of-two number of lanes matches interleaving with N levels. ??? An even number of lanes could be reduced to 1<= (group_lanes + 1) / 2) + if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2 + && ld_lanes_lanes == 0) continue; /* First build (and possibly re-use) a load node for the @@ -4239,10 +4340,20 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo, final_perm.quick_push (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i])); + if (ld_lanes_lanes != 0) + { + /* ??? If this is not in sync with what get_load_store_type + later decides the SLP representation is not good for other + store vectorization methods. 
*/ + l0->ldst_lanes = true; + load->ldst_lanes = true; + } + while (1) { unsigned group_lanes = SLP_TREE_LANES (l0); - if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2) + if (ld_lanes_lanes != 0 + || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2) break; /* Try to lower by reducing the group to half its size using an @@ -4570,6 +4681,94 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) } } + /* Check whether we should force some SLP instances to use load/store-lanes + and do so by forcing SLP re-discovery with single lanes. We used + to cancel SLP when this applied to all instances in a loop but now + we decide this per SLP instance. It's important to do this only + after SLP pattern recognition. */ + if (is_a (vinfo)) + FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance) + if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store + && !SLP_INSTANCE_TREE (instance)->ldst_lanes) + { + slp_tree slp_root = SLP_INSTANCE_TREE (instance); + int group_size = SLP_TREE_LANES (slp_root); + tree vectype = SLP_TREE_VECTYPE (slp_root); + + auto_vec loads; + hash_set visited; + vect_gather_slp_loads (loads, slp_root, visited); + + /* Check whether any load in the SLP instance is possibly + permuted. */ + bool loads_permuted = false; + slp_tree load_node; + unsigned j; + FOR_EACH_VEC_ELT (loads, j, load_node) + { + if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ()) + continue; + unsigned k; + stmt_vec_info load_info; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info) + if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k) + { + loads_permuted = true; + break; + } + } + + /* If the loads and stores can use load/store-lanes force re-discovery + with single lanes. 
*/ + if (loads_permuted + && !slp_root->ldst_lanes + && vect_store_lanes_supported (vectype, group_size, false) + != IFN_LAST) + { + bool can_use_lanes = true; + FOR_EACH_VEC_ELT (loads, j, load_node) + if (STMT_VINFO_GROUPED_ACCESS + (SLP_TREE_REPRESENTATIVE (load_node))) + { + stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT + (SLP_TREE_REPRESENTATIVE (load_node)); + /* Use SLP for strided accesses (or if we can't + load-lanes). */ + if (STMT_VINFO_STRIDED_P (stmt_vinfo) + || compare_step_with_zero (vinfo, stmt_vinfo) <= 0 + || vect_load_lanes_supported + (STMT_VINFO_VECTYPE (stmt_vinfo), + DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST) + { + can_use_lanes = false; + break; + } + } + + if (can_use_lanes) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "SLP instance %p can use load/store-lanes," + " re-discovering with single-lanes\n", + (void *) instance); + + stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root); + + vect_free_slp_instance (instance); + limit = max_tree_size; + bool res = vect_analyze_slp_instance (vinfo, bst_map, + stmt_info, + slp_inst_kind_store, + max_tree_size, &limit, + true); + gcc_assert (res); + auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop (); + LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst; + } + } + } + /* When we end up with load permutations that we cannot possibly handle, like those requiring three vector inputs, lower them using interleaving like schemes. */ @@ -9877,6 +10076,28 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, gcc_assert (perm.length () == SLP_TREE_LANES (node)); + /* Load-lanes permute. This permute only acts as a forwarder to + select the correct vector def of the load-lanes load which + has the permuted vectors in its vector defs like + { v0, w0, r0, v1, w1, r1 ... } for a ld3. */ + if (node->ldst_lanes) + { + gcc_assert (children.length () == 1); + if (!gsi) + /* This is a trivial op always supported. 
*/ + return 1; + slp_tree child = children[0]; + unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second + / SLP_TREE_LANES (node)); + unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node); + for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i) + { + tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx]; + node->push_vec_def (def); + } + return 1; + } + /* REPEATING_P is true if every output vector is guaranteed to use the same permute vector. We can handle that case for both variable-length and constant-length vectors, but we only handle other cases for diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 72a29c0584b..d2282c0dc4f 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -1509,7 +1509,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype, unsigned int nvectors; if (slp_node) - nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + /* ??? Incorrect for multi-lane lanes. */ + nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) / group_size; else nvectors = vect_get_num_copies (loop_vinfo, vectype); @@ -1795,7 +1796,7 @@ vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info, elements with a known constant step. Return -1 if that step is negative, 0 if it is zero, and 1 if it is greater than zero. */ -static int +int compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info) { dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); @@ -2070,6 +2071,14 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, is irrelevant for them. */ *alignment_support_scheme = dr_unaligned_supported; } + /* Try using LOAD/STORE_LANES. */ + else if (slp_node->ldst_lanes + && (*lanes_ifn + = (vls_type == VLS_LOAD + ? 
vect_load_lanes_supported (vectype, group_size, masked_p) + : vect_store_lanes_supported (vectype, group_size, + masked_p))) != IFN_LAST) + *memory_access_type = VMAT_LOAD_STORE_LANES; else *memory_access_type = VMAT_CONTIGUOUS; @@ -8201,6 +8210,16 @@ vectorizable_store (vec_info *vinfo, &lanes_ifn)) return false; + if (slp_node + && slp_node->ldst_lanes + && memory_access_type != VMAT_LOAD_STORE_LANES) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "discovered store-lane but cannot use it.\n"); + return false; + } + if (mask) { if (memory_access_type == VMAT_CONTIGUOUS) @@ -8717,7 +8736,7 @@ vectorizable_store (vec_info *vinfo, else { if (memory_access_type == VMAT_LOAD_STORE_LANES) - aggr_type = build_array_type_nelts (elem_type, vec_num * nunits); + aggr_type = build_array_type_nelts (elem_type, group_size * nunits); else aggr_type = vectype; bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type, @@ -8774,11 +8793,24 @@ vectorizable_store (vec_info *vinfo, if (memory_access_type == VMAT_LOAD_STORE_LANES) { - gcc_assert (!slp && grouped_store); + if (costing_p && slp_node) + /* Update all incoming store operand nodes, the general handling + above only handles the mask and the first store operand node. */ + for (slp_tree child : SLP_TREE_CHILDREN (slp_node)) + if (child != mask_node + && !vect_maybe_update_slp_op_vectype (child, vectype)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "incompatible vector types for invariants\n"); + return false; + } unsigned inside_cost = 0, prologue_cost = 0; /* For costing some adjacent vector stores, we'd like to cost with the total number of them once instead of cost each one by one. 
*/ unsigned int n_adjacent_stores = 0; + if (slp) + ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) / group_size; for (j = 0; j < ncopies; j++) { gimple *new_stmt; @@ -8796,7 +8828,7 @@ vectorizable_store (vec_info *vinfo, op = vect_get_store_rhs (next_stmt_info); if (costing_p) update_prologue_cost (&prologue_cost, op); - else + else if (!slp) { vect_get_vec_defs_for_operand (vinfo, next_stmt_info, ncopies, op, @@ -8811,15 +8843,15 @@ vectorizable_store (vec_info *vinfo, { if (mask) { - vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, - mask, &vec_masks, - mask_vectype); + if (slp_node) + vect_get_slp_defs (mask_node, &vec_masks); + else + vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, + mask, &vec_masks, + mask_vectype); vec_mask = vec_masks[0]; } - /* We should have catched mismatched types earlier. */ - gcc_assert ( - useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd))); dataref_ptr = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type, NULL, offset, &dummy, @@ -8831,10 +8863,16 @@ vectorizable_store (vec_info *vinfo, gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)); /* DR_CHAIN is then used as an input to vect_permute_store_chain(). */ - for (i = 0; i < group_size; i++) + if (!slp) { - vec_oprnd = (*gvec_oprnds[i])[j]; - dr_chain[i] = vec_oprnd; + /* We should have caught mismatched types earlier. */ + gcc_assert ( + useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd))); + for (i = 0; i < group_size; i++) + { + vec_oprnd = (*gvec_oprnds[i])[j]; + dr_chain[i] = vec_oprnd; + } } if (mask) vec_mask = vec_masks[j]; @@ -8844,12 +8882,12 @@ vectorizable_store (vec_info *vinfo, if (costing_p) { - n_adjacent_stores += vec_num; + n_adjacent_stores += group_size; continue; } /* Get an array into which we can store the individual vectors. */ - tree vec_array = create_vector_array (vectype, vec_num); + tree vec_array = create_vector_array (vectype, group_size); /* Invalidate the current contents of VEC_ARRAY. 
This should become an RTL clobber too, which prevents the vector registers @@ -8857,9 +8895,19 @@ vectorizable_store (vec_info *vinfo, vect_clobber_variable (vinfo, stmt_info, gsi, vec_array); /* Store the individual vectors into the array. */ - for (i = 0; i < vec_num; i++) + for (i = 0; i < group_size; i++) { - vec_oprnd = dr_chain[i]; + if (slp) + { + slp_tree child; + if (i == 0 || !mask_node) + child = SLP_TREE_CHILDREN (slp_node)[i]; + else + child = SLP_TREE_CHILDREN (slp_node)[i + 1]; + vec_oprnd = SLP_TREE_VEC_DEFS (child)[j]; + } + else + vec_oprnd = dr_chain[i]; write_vector_array (vinfo, stmt_info, gsi, vec_oprnd, vec_array, i); } @@ -8929,9 +8977,10 @@ vectorizable_store (vec_info *vinfo, /* Record that VEC_ARRAY is now dead. */ vect_clobber_variable (vinfo, stmt_info, gsi, vec_array); - if (j == 0) + if (j == 0 && !slp) *vec_stmt = new_stmt; - STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); + if (!slp) + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); } if (costing_p) @@ -10035,6 +10084,16 @@ vectorizable_load (vec_info *vinfo, &lanes_ifn)) return false; + if (slp_node + && slp_node->ldst_lanes + && memory_access_type != VMAT_LOAD_STORE_LANES) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "discovered load-lane but cannot use it.\n"); + return false; + } + if (mask) { if (memory_access_type == VMAT_CONTIGUOUS) @@ -10753,7 +10812,7 @@ vectorizable_load (vec_info *vinfo, else { if (memory_access_type == VMAT_LOAD_STORE_LANES) - aggr_type = build_array_type_nelts (elem_type, vec_num * nunits); + aggr_type = build_array_type_nelts (elem_type, group_size * nunits); else aggr_type = vectype; bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type, @@ -10777,12 +10836,13 @@ vectorizable_load (vec_info *vinfo, { gcc_assert (alignment_support_scheme == dr_aligned || alignment_support_scheme == dr_unaligned_supported); - gcc_assert (grouped_load && !slp); unsigned int inside_cost = 0, 
prologue_cost = 0; /* For costing some adjacent vector loads, we'd like to cost with the total number of them once instead of cost each one by one. */ unsigned int n_adjacent_loads = 0; + if (slp_node) + ncopies = slp_node->vec_stmts_size / group_size; for (j = 0; j < ncopies; j++) { if (costing_p) @@ -10833,7 +10893,7 @@ vectorizable_load (vec_info *vinfo, if (mask) vec_mask = vec_masks[j]; - tree vec_array = create_vector_array (vectype, vec_num); + tree vec_array = create_vector_array (vectype, group_size); tree final_mask = NULL_TREE; tree final_len = NULL_TREE; @@ -10896,24 +10956,31 @@ vectorizable_load (vec_info *vinfo, gimple_call_set_nothrow (call, true); vect_finish_stmt_generation (vinfo, stmt_info, call, gsi); - dr_chain.create (vec_num); + if (!slp) + dr_chain.create (group_size); /* Extract each vector into an SSA_NAME. */ - for (i = 0; i < vec_num; i++) + for (unsigned i = 0; i < group_size; i++) { new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest, vec_array, i); - dr_chain.quick_push (new_temp); + if (slp) + slp_node->push_vec_def (new_temp); + else + dr_chain.quick_push (new_temp); } - /* Record the mapping between SSA_NAMEs and statements. */ - vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain); + if (!slp) + /* Record the mapping between SSA_NAMEs and statements. */ + vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain); /* Record that VEC_ARRAY is now dead. */ vect_clobber_variable (vinfo, stmt_info, gsi, vec_array); - dr_chain.release (); + if (!slp) + dr_chain.release (); - *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; + if (!slp_node) + *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; } if (costing_p) diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index df6c8ada2f7..699ae9e33ba 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -222,6 +222,9 @@ struct _slp_tree { unsigned int lanes; /* The operation of this node. 
*/ enum tree_code code; + /* Whether uses of this load or feeders of this store are suitable + for load/store-lanes. */ + bool ldst_lanes; int vertex; @@ -2313,6 +2316,7 @@ extern bool supportable_indirect_convert_operation (code_helper, tree, tree, vec > *, tree = NULL_TREE); +extern int compare_step_with_zero (vec_info *, stmt_vec_info); extern unsigned record_stmt_cost (stmt_vector_for_cost *, int, enum vect_cost_for_stmt, stmt_vec_info,