AArch64: Fix strict-align cpymem/setmem [PR103100]

The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
Clean up the condition for when to use MOPS.
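
As an illustration (hedged; this is not the testcase from the PR), the kind of
code affected goes through the cpymemdi/setmemdi expanders changed below.  The
pointers only guarantee byte alignment, so under -mstrict-align the inline
expansion must not emit accesses wider than that:

/* Hypothetical reproducer in the spirit of PR103100, for illustration only.  */
void
copy24 (char *dst, const char *src)
{
  __builtin_memcpy (dst, src, 24);   /* Constant size, alignment of 1.  */
}

void
zero24 (char *dst)
{
  __builtin_memset (dst, 0, 24);     /* Constant size, alignment of 1.  */
}

With this patch such calls are routed to the MOPS expansion when TARGET_MOPS is
available, and otherwise fall back to a library call, instead of being inlined
with loads/stores wider than the guaranteed alignment.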

gcc/ChangeLog:
	PR target/103100
	* config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
	(setmemdi): Likewise.
	* config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
	strict-align.  Clean up condition for using MOPS.
	(aarch64_expand_setmem): Likewise.
Author: Wilco Dijkstra
Date:   2023-10-25 16:28:04 +01:00
Commit: 318f5232cf (parent 951a3e3749)
2 changed files with 27 additions and 35 deletions

--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc

@@ -23105,27 +23105,23 @@ aarch64_expand_cpymem (rtx *operands)
   int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode;

-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
-    return aarch64_expand_cpymem_mops (operands);
-
-  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
-
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
-
-  bool size_p = optimize_function_for_size_p (cfun);
-
-  /* Large constant-sized cpymem should go through MOPS when possible.
-     It should be a win even for size optimization in the general case.
-     For speed optimization the choice between MOPS and the SIMD sequence
-     depends on the size of the copy, rather than number of instructions,
-     alignment etc.  */
-  if (size > max_copy_size)
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
+    return aarch64_expand_cpymem_mops (operands);
+
+  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
+
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
+
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
     return aarch64_expand_cpymem_mops (operands);

   int copy_bits = 256;
@@ -23289,12 +23285,13 @@ aarch64_expand_setmem (rtx *operands)
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;

-  /* If we don't have SIMD registers or the size is variable use the MOPS
-     inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+      || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);

   bool size_p = optimize_function_for_size_p (cfun);
@@ -23302,10 +23299,13 @@ aarch64_expand_setmem (rtx *operands)
   /* Default the maximum to 256-bytes when considering only libcall vs
      SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
+  unsigned mops_threshold = aarch64_mops_memset_size_threshold;

-  len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-    return false;
+  len = UINTVAL (operands[1]);
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
+    return aarch64_expand_setmem_mops (operands);

   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));

   /* The MOPS sequence takes:
@@ -23318,12 +23318,6 @@ aarch64_expand_setmem (rtx *operands)
      the arguments + 1 for the call.  */
   unsigned libcall_cost = 4;

-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
-     when available.  */
-  if (TARGET_MOPS
-      && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-    return aarch64_expand_setmem_mops (operands);
-
   /* Attempt a sequence with a vector broadcast followed by stores.
      Count the number of operations involved to see if it's worth it
      against the alternatives.  A simple counter simd_ops on the
@@ -23365,10 +23359,8 @@ aarch64_expand_setmem (rtx *operands)
       simd_ops++;
       n -= mode_bits;

-      /* Do certain trailing copies as overlapping if it's going to be
-         cheaper.  i.e. less instructions to do so.  For instance doing a 15
-         byte copy it's more efficient to do two overlapping 8 byte copies than
-         8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
+      /* Emit trailing writes using overlapping unaligned accesses
+         (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
       if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
         {
           next_mode = smallest_mode_for_size (n, MODE_INT);
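
To summarize the new control flow in aarch64.cc above, here is an editorial
sketch (not GCC code): the 256-byte inline cap, the align < 16 test and the
MOPS threshold parameter come from the hunks above, everything else is a
hypothetical standalone helper for constant-size expansions.

#include <stdbool.h>

/* Hypothetical condensation of the new decision, for illustration only:
   true means skip the inline SIMD/LDP sequence and let the MOPS expansion
   (or, without MOPS, a library call) handle the copy or memset.  */
static bool
use_mops_or_libcall (unsigned long size, unsigned align, bool strict_align,
                     bool have_mops, unsigned long mops_threshold)
{
  if (strict_align && align < 16)
    return true;           /* Unaligned 16-byte accesses are not allowed.  */
  if (size > 256)
    return true;           /* Above the inline limit.  */
  if (have_mops && size > mops_threshold)
    return true;           /* MOPS is preferred for larger sizes.  */
  return false;            /* Emit the inline sequence.  */
}

The variable-size case (non-constant length operand) goes to the MOPS
expansion unconditionally, as in the first hunk above.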

--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md

@@ -1630,7 +1630,7 @@
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "!STRICT_ALIGNMENT || TARGET_MOPS"
+   ""
 {
   if (aarch64_expand_cpymem (operands))
     DONE;
@@ -1727,7 +1727,7 @@
         (match_operand:QI 2 "nonmemory_operand")) ;; Value
    (use (match_operand:DI 1 "general_operand")) ;; Length
    (match_operand 3 "immediate_operand")] ;; Align
-  "TARGET_SIMD || TARGET_MOPS"
+  ""
 {
   if (aarch64_expand_setmem (operands))
     DONE;