i386: Introduce V2QImode minmax, abs and uavgv2qi3_ceil [PR103861]

Add V2QImode minmax, abs and uavgv2qi3_ceil operations with SSE registers.

2022-01-05  Uroš Bizjak  <ubizjak@gmail.com>

gcc/ChangeLog:

	PR target/103861
	* config/i386/mmx.md (VI_16_32): New mode iterator.
	(VI1_16_32): Ditto.
	(mmxvecsize): Handle V2QI mode.
	(<smaxmin:code><mode>3): Rename from <smaxmin:code>v4qi3.
	Use VI1_16_32 mode iterator.
	(<umaxmin:code><mode>3): Rename from <umaxmin:code>v4qi3.
	Use VI1_16_32 mode iterator.
	(abs<mode>2): Use VI_16_32 mode iterator.
	(uavgv2qi3_ceil): New insn pattern.

gcc/testsuite/ChangeLog:

	PR target/103861
	* gcc.target/i386/pr103861-3.c: New test.
	* g++.dg/vect/slp-pr98855.cc (dg-final): Check that
	no vectorization using SLP was performed.
This commit is contained in:
Uros Bizjak 2022-01-05 23:16:34 +01:00
parent e3ef832a9e
commit c166632bd2
3 changed files with 109 additions and 17 deletions

View File

@ -63,6 +63,12 @@
;; 4-byte integer vector modes
(define_mode_iterator VI_32 [V4QI V2HI])
;; 4-byte and 2-byte integer vector modes
;; (the VI_32 modes plus V2QI; iterated over by the abs<mode>2 insn)
(define_mode_iterator VI_16_32 [V4QI V2QI V2HI])
;; 4-byte and 2-byte QImode vector modes
;; (iterated over by the byte-element smaxmin and umaxmin <code><mode>3 insns)
(define_mode_iterator VI1_16_32 [V4QI V2QI])
;; V2S* modes
(define_mode_iterator V2FI [V2SF V2SI])
@ -71,7 +77,8 @@
;; Mapping from integer vector mode to mnemonic suffix
(define_mode_attr mmxvecsize
[(V8QI "b") (V4QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
[(V8QI "b") (V4QI "b") (V2QI "b")
(V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
(define_mode_attr mmxdoublemode
[(V8QI "V8HI") (V4HI "V4SI")])
@ -2140,11 +2147,11 @@
(match_operand:V4HI 2 "register_operand")))]
"TARGET_MMX_WITH_SSE")
(define_insn "<code>v4qi3"
[(set (match_operand:V4QI 0 "register_operand" "=Yr,*x,Yv")
(smaxmin:V4QI
(match_operand:V4QI 1 "register_operand" "%0,0,Yv")
(match_operand:V4QI 2 "register_operand" "Yr,*x,Yv")))]
(define_insn "<code><mode>3"
[(set (match_operand:VI1_16_32 0 "register_operand" "=Yr,*x,Yv")
(smaxmin:VI1_16_32
(match_operand:VI1_16_32 1 "register_operand" "%0,0,Yv")
(match_operand:VI1_16_32 2 "register_operand" "Yr,*x,Yv")))]
"TARGET_SSE4_1"
"@
p<maxmin_int>b\t{%2, %0|%0, %2}
@ -2218,11 +2225,11 @@
(match_operand:V8QI 2 "register_operand")))]
"TARGET_MMX_WITH_SSE")
(define_insn "<code>v4qi3"
[(set (match_operand:V4QI 0 "register_operand" "=x,Yw")
(umaxmin:V4QI
(match_operand:V4QI 1 "register_operand" "%0,Yw")
(match_operand:V4QI 2 "register_operand" "x,Yw")))]
(define_insn "<code><mode>3"
[(set (match_operand:VI1_16_32 0 "register_operand" "=x,Yw")
(umaxmin:VI1_16_32
(match_operand:VI1_16_32 1 "register_operand" "%0,Yw")
(match_operand:VI1_16_32 2 "register_operand" "x,Yw")))]
"TARGET_SSE2"
"@
p<maxmin_int>b\t{%2, %0|%0, %2}
@ -2269,9 +2276,9 @@
"TARGET_SSSE3 && TARGET_MMX_WITH_SSE")
(define_insn "abs<mode>2"
[(set (match_operand:VI_32 0 "register_operand" "=Yv")
(abs:VI_32
(match_operand:VI_32 1 "register_operand" "Yv")))]
[(set (match_operand:VI_16_32 0 "register_operand" "=Yv")
(abs:VI_16_32
(match_operand:VI_16_32 1 "register_operand" "Yv")))]
"TARGET_SSSE3"
"%vpabs<mmxvecsize>\t{%1, %0|%0, %1}"
[(set_attr "type" "sselog1")
@ -4351,6 +4358,26 @@
(set_attr "type" "sseiadd")
(set_attr "mode" "TI")])
;; Rounding-up unsigned average of two V2QI vectors.
;; The RTL zero-extends both inputs to V2HI, adds them, adds the constant
;; vector {1, 1}, shifts right logically by one and truncates the result
;; back to V2QI, i.e. (op1 + op2 + 1) >> 1 per element evaluated in the
;; wider mode.  Alternative 0 (noavx) emits the two-operand pavgb with
;; operand 1 tied to the destination; alternative 1 (avx) emits the
;; three-operand vpavgb.  Enabled for TARGET_SSE2.
;; The leading "%" in operand 1's constraint marks operands 1 and 2 as
;; commutative.
;; NOTE(review): "mode" is TI, presumably because the instruction works on
;; the whole SSE register -- confirm.
(define_insn "uavgv2qi3_ceil"
[(set (match_operand:V2QI 0 "register_operand" "=x,Yw")
(truncate:V2QI
(lshiftrt:V2HI
(plus:V2HI
(plus:V2HI
(zero_extend:V2HI
(match_operand:V2QI 1 "register_operand" "%0,Yw"))
(zero_extend:V2HI
(match_operand:V2QI 2 "register_operand" "x,Yw")))
(const_vector:V2HI [(const_int 1) (const_int 1)]))
(const_int 1))))]
"TARGET_SSE2"
"@
pavgb\t{%2, %0|%0, %2}
vpavgb\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "isa" "noavx,avx")
(set_attr "type" "sseiadd")
(set_attr "mode" "TI")])
(define_insn "uavgv2hi3_ceil"
[(set (match_operand:V2HI 0 "register_operand" "=x,Yw")
(truncate:V2HI

View File

@ -81,6 +81,5 @@ void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, uint32_t *EK)
}
}
// This used to work on { target x86_64-*-* i?86-*-* } but a fix in SLP
// discovery makes us trip over the threshold again.
// { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 2 "slp1" { xfail *-*-* } } }
// { dg-final { scan-tree-dump "not vectorized: vectorization is not profitable" "slp1" } }
// { dg-final { scan-tree-dump-not "vectorizing stmts using SLP" "slp1" } }

View File

@ -0,0 +1,66 @@
/* PR target/103861 */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -msse4" } */
/* Two-element byte arrays used as operands and results by the test
   functions in this file: r/a/b are plain char, ur/ua/ub are unsigned
   char.  */
char r[2], a[2], b[2];
unsigned char ur[2], ua[2], ub[2];
/* Element-wise maximum of a[] and b[] stored into r[], over exactly two
   char elements.  The assembler scan that follows this function expects
   pmaxsb.
   NOTE(review): getting the signed instruction relies on plain char
   being signed for the tested target -- confirm if options change.  */
void maxs (void)
{
int i;
for (i = 0; i < 2; i++)
r[i] = a[i] > b[i] ? a[i] : b[i];
}
/* { dg-final { scan-assembler "pmaxsb" } } */
/* Element-wise maximum of ua[] and ub[] stored into ur[], over exactly two
   unsigned char elements.  The assembler scan that follows this function
   expects pmaxub.  */
void maxu (void)
{
int i;
for (i = 0; i < 2; i++)
ur[i] = ua[i] > ub[i] ? ua[i] : ub[i];
}
/* { dg-final { scan-assembler "pmaxub" } } */
/* Element-wise minimum of a[] and b[] stored into r[], over exactly two
   char elements.  The assembler scan that follows this function expects
   pminsb.
   NOTE(review): getting the signed instruction relies on plain char
   being signed for the tested target -- confirm if options change.  */
void mins (void)
{
int i;
for (i = 0; i < 2; i++)
r[i] = a[i] < b[i] ? a[i] : b[i];
}
/* { dg-final { scan-assembler "pminsb" } } */
/* Element-wise minimum of ua[] and ub[] stored into ur[], over exactly two
   unsigned char elements.  The assembler scan that follows this function
   expects pminub.  */
void minu (void)
{
int i;
for (i = 0; i < 2; i++)
ur[i] = ua[i] < ub[i] ? ua[i] : ub[i];
}
/* { dg-final { scan-assembler "pminub" } } */
/* Element-wise absolute value of a[] stored into r[], over exactly two
   char elements.  The negation is performed after integer promotion and
   the result is converted back to char on assignment.  The assembler scan
   that follows this function expects pabsb.  */
void _abs (void)
{
int i;
for (i = 0; i < 2; i++)
r[i] = a[i] < 0 ? -a[i] : a[i];
}
/* { dg-final { scan-assembler "pabsb" } } */
/* Element-wise rounding-up average of ua[] and ub[] stored into ur[], over
   exactly two unsigned char elements.  The sum and the +1 are evaluated
   after integer promotion, so the intermediate value does not wrap at
   8 bits.  The assembler scan for pavgb that follows this function is
   marked as an expected failure on all targets.  */
void avgu (void)
{
int i;
for (i = 0; i < 2; i++)
ur[i] = (ua[i] + ub[i] + 1) >> 1;
}
/* { dg-final { scan-assembler "pavgb" { xfail *-*-* } } } */