aarch64: Add masked-load else operands.

This adds zero-valued else operands to masked loads and their intrinsics.
I needed to adjust more than initially expected because we rely on
combine for several instructions, and a change in a "base" pattern
needs to propagate to all of them.
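
For illustration, the semantics being encoded are roughly the following
(a minimal, hedged sketch in plain C++, not GCC code; all names are made
up).  A masked load now carries an explicit else value for its inactive
lanes, and on AArch64 we always pass zero because ld1 with /z predication
already zeroes inactive elements:

    #include <cstddef>

    /* Illustrative only: active lanes read from memory, inactive lanes
       take the else value, which this patch always supplies as zero.  */
    void
    masked_load_with_else (const int *base, const bool *pred,
                           const int *els, int *result, size_t lanes)
    {
      for (size_t i = 0; i < lanes; ++i)
        result[i] = pred[i] ? base[i] : els[i];
    }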

gcc/ChangeLog:

	* config/aarch64/aarch64-sve-builtins-base.cc: Add else
	handling.
	* config/aarch64/aarch64-sve-builtins.cc (function_expander::use_contiguous_load_insn):
	Ditto.
	* config/aarch64/aarch64-sve-builtins.h: Add else operand to
	contiguous load.
	* config/aarch64/aarch64-sve.md
	(@aarch64_load<SVE_PRED_LOAD:pred_load>_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>):
	Split and add else operand.
	(@aarch64_load_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>):
	Ditto.
	(*aarch64_load_<ANY_EXTEND:optab>_mov<SVE_HSDI:mode><SVE_PARTIAL_I:mode>):
	Ditto.
	* config/aarch64/aarch64-sve2.md: Ditto.
	* config/aarch64/iterators.md: Remove unused iterators.
	* config/aarch64/predicates.md (aarch64_maskload_else_operand):
	Add zero else operand.
Robin Dapp 2024-08-08 10:30:58 +02:00
parent 634ae740f5
commit a166a6ccdc
7 changed files with 77 additions and 24 deletions

gcc/config/aarch64/aarch64-sve-builtins-base.cc

@@ -1524,11 +1524,12 @@ public:
gimple_seq stmts = NULL;
tree pred = f.convert_pred (stmts, vectype, 0);
tree base = f.fold_contiguous_base (stmts, vectype);
tree els = build_zero_cst (vectype);
gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT);
tree cookie = f.load_store_cookie (TREE_TYPE (vectype));
gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD, 3,
base, cookie, pred);
gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD, 4,
base, cookie, pred, els);
gimple_call_set_lhs (new_call, f.lhs);
return new_call;
}
@@ -1542,7 +1543,7 @@ public:
e.vector_mode (0), e.gp_mode (0));
else
icode = code_for_aarch64 (UNSPEC_LD1_COUNT, e.tuple_mode (0));
return e.use_contiguous_load_insn (icode);
return e.use_contiguous_load_insn (icode, true);
}
};
@@ -1555,10 +1556,10 @@ public:
rtx
expand (function_expander &e) const override
{
insn_code icode = code_for_aarch64_load (UNSPEC_LD1_SVE, extend_rtx_code (),
insn_code icode = code_for_aarch64_load (extend_rtx_code (),
e.vector_mode (0),
e.memory_vector_mode ());
return e.use_contiguous_load_insn (icode);
return e.use_contiguous_load_insn (icode, true);
}
};
@@ -1577,6 +1578,8 @@ public:
e.prepare_gather_address_operands (1);
/* Put the predicate last, as required by mask_gather_load_optab. */
e.rotate_inputs_left (0, 5);
/* Add the else operand. */
e.args.quick_push (CONST0_RTX (e.vector_mode (0)));
machine_mode mem_mode = e.memory_vector_mode ();
machine_mode int_mode = aarch64_sve_int_mode (mem_mode);
insn_code icode = convert_optab_handler (mask_gather_load_optab,
@@ -1600,6 +1603,8 @@ public:
e.rotate_inputs_left (0, 5);
/* Add a constant predicate for the extension rtx. */
e.args.quick_push (CONSTM1_RTX (VNx16BImode));
/* Add the else operand. */
e.args.quick_push (CONST0_RTX (e.vector_mode (1)));
insn_code icode = code_for_aarch64_gather_load (extend_rtx_code (),
e.vector_mode (0),
e.memory_vector_mode ());
@@ -1742,6 +1747,7 @@ public:
/* Get the predicate and base pointer. */
gimple_seq stmts = NULL;
tree pred = f.convert_pred (stmts, vectype, 0);
tree els = build_zero_cst (vectype);
tree base = f.fold_contiguous_base (stmts, vectype);
gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT);
@@ -1760,8 +1766,8 @@ public:
/* Emit the load itself. */
tree cookie = f.load_store_cookie (TREE_TYPE (vectype));
gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
base, cookie, pred);
gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 4,
base, cookie, pred, els);
gimple_call_set_lhs (new_call, lhs_array);
gsi_insert_after (f.gsi, new_call, GSI_SAME_STMT);
@@ -1774,7 +1780,7 @@ public:
machine_mode tuple_mode = e.result_mode ();
insn_code icode = convert_optab_handler (vec_mask_load_lanes_optab,
tuple_mode, e.vector_mode (0));
return e.use_contiguous_load_insn (icode);
return e.use_contiguous_load_insn (icode, true);
}
};
@@ -1845,7 +1851,7 @@ public:
? code_for_aarch64_ldnt1 (e.vector_mode (0))
: code_for_aarch64 (UNSPEC_LDNT1_COUNT,
e.tuple_mode (0)));
return e.use_contiguous_load_insn (icode);
return e.use_contiguous_load_insn (icode, true);
}
};

gcc/config/aarch64/aarch64-sve-builtins.cc

@@ -4284,9 +4284,12 @@ function_expander::use_vcond_mask_insn (insn_code icode,
/* Implement the call using instruction ICODE, which loads memory operand 1
into register operand 0 under the control of predicate operand 2.
Extending loads have a further predicate (operand 3) that nominally
controls the extension. */
controls the extension.
HAS_ELSE is true if the pattern has an additional operand that specifies
the values of inactive lanes. This exists to match the general maskload
interface and is always zero for AArch64. */
rtx
function_expander::use_contiguous_load_insn (insn_code icode)
function_expander::use_contiguous_load_insn (insn_code icode, bool has_else)
{
machine_mode mem_mode = memory_vector_mode ();
@@ -4295,6 +4298,11 @@ function_expander::use_contiguous_load_insn (insn_code icode)
add_input_operand (icode, args[0]);
if (GET_MODE_UNIT_BITSIZE (mem_mode) < type_suffix (0).element_bits)
add_input_operand (icode, CONSTM1_RTX (VNx16BImode));
/* If we have an else operand, add it. */
if (has_else)
add_input_operand (icode, CONST0_RTX (mem_mode));
return generate_insn (icode);
}

gcc/config/aarch64/aarch64-sve-builtins.h

@@ -696,7 +696,7 @@ public:
rtx use_pred_x_insn (insn_code);
rtx use_cond_insn (insn_code, unsigned int = DEFAULT_MERGE_ARGNO);
rtx use_vcond_mask_insn (insn_code, unsigned int = DEFAULT_MERGE_ARGNO);
rtx use_contiguous_load_insn (insn_code);
rtx use_contiguous_load_insn (insn_code, bool = false);
rtx use_contiguous_prefetch_insn (insn_code);
rtx use_contiguous_store_insn (insn_code);

gcc/config/aarch64/aarch64-sve.md

@@ -1291,7 +1291,8 @@
[(set (match_operand:SVE_ALL 0 "register_operand" "=w")
(unspec:SVE_ALL
[(match_operand:<VPRED> 2 "register_operand" "Upl")
(match_operand:SVE_ALL 1 "memory_operand" "m")]
(match_operand:SVE_ALL 1 "memory_operand" "m")
(match_operand:SVE_ALL 3 "aarch64_maskload_else_operand")]
UNSPEC_LD1_SVE))]
"TARGET_SVE"
"ld1<Vesize>\t%0.<Vctype>, %2/z, %1"
@@ -1302,11 +1303,13 @@
[(set (match_operand:SVE_STRUCT 0 "register_operand")
(unspec:SVE_STRUCT
[(match_dup 2)
(match_operand:SVE_STRUCT 1 "memory_operand")]
(match_operand:SVE_STRUCT 1 "memory_operand")
(match_dup 3)]
UNSPEC_LDN))]
"TARGET_SVE"
{
operands[2] = aarch64_ptrue_reg (<VPRED>mode);
operands[3] = CONST0_RTX (<MODE>mode);
}
)
@@ -1315,7 +1318,8 @@
[(set (match_operand:SVE_STRUCT 0 "register_operand" "=w")
(unspec:SVE_STRUCT
[(match_operand:<VPRED> 2 "register_operand" "Upl")
(match_operand:SVE_STRUCT 1 "memory_operand" "m")]
(match_operand:SVE_STRUCT 1 "memory_operand" "m")
(match_operand 3 "aarch64_maskload_else_operand")]
UNSPEC_LDN))]
"TARGET_SVE"
"ld<vector_count><Vesize>\t%0, %2/z, %1"
@@ -1334,7 +1338,28 @@
;; -------------------------------------------------------------------------
;; Predicated load and extend, with 8 elements per 128-bit block.
(define_insn_and_rewrite "@aarch64_load<SVE_PRED_LOAD:pred_load>_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>"
(define_insn_and_rewrite "@aarch64_load_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>"
[(set (match_operand:SVE_HSDI 0 "register_operand" "=w")
(unspec:SVE_HSDI
[(match_operand:<SVE_HSDI:VPRED> 3 "general_operand" "UplDnm")
(ANY_EXTEND:SVE_HSDI
(unspec:SVE_PARTIAL_I
[(match_operand:<SVE_PARTIAL_I:VPRED> 2 "register_operand" "Upl")
(match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")
(match_operand:SVE_PARTIAL_I 4 "aarch64_maskload_else_operand")]
UNSPEC_LD1_SVE))]
UNSPEC_PRED_X))]
"TARGET_SVE && (~<SVE_HSDI:narrower_mask> & <SVE_PARTIAL_I:self_mask>) == 0"
"ld1<ANY_EXTEND:s><SVE_PARTIAL_I:Vesize>\t%0.<SVE_HSDI:Vctype>, %2/z, %1"
"&& !CONSTANT_P (operands[3])"
{
operands[3] = CONSTM1_RTX (<SVE_HSDI:VPRED>mode);
}
)
;; Same as above without the maskload_else_operand to still allow combine to
;; match a sign-extended pred_mov pattern.
(define_insn_and_rewrite "*aarch64_load_<ANY_EXTEND:optab>_mov<SVE_HSDI:mode><SVE_PARTIAL_I:mode>"
[(set (match_operand:SVE_HSDI 0 "register_operand" "=w")
(unspec:SVE_HSDI
[(match_operand:<SVE_HSDI:VPRED> 3 "general_operand" "UplDnm")
@@ -1342,8 +1367,8 @@
(unspec:SVE_PARTIAL_I
[(match_operand:<SVE_PARTIAL_I:VPRED> 2 "register_operand" "Upl")
(match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")]
SVE_PRED_LOAD))]
UNSPEC_PRED_X))]
UNSPEC_PRED_X))]
UNSPEC_PRED_X))]
"TARGET_SVE && (~<SVE_HSDI:narrower_mask> & <SVE_PARTIAL_I:self_mask>) == 0"
"ld1<ANY_EXTEND:s><SVE_PARTIAL_I:Vesize>\t%0.<SVE_HSDI:Vctype>, %2/z, %1"
"&& !CONSTANT_P (operands[3])"
@@ -1433,7 +1458,8 @@
[(set (match_operand:SVE_FULL 0 "register_operand" "=w")
(unspec:SVE_FULL
[(match_operand:<VPRED> 2 "register_operand" "Upl")
(match_operand:SVE_FULL 1 "memory_operand" "m")]
(match_operand:SVE_FULL 1 "memory_operand" "m")
(match_operand:SVE_FULL 3 "aarch64_maskload_else_operand")]
UNSPEC_LDNT1_SVE))]
"TARGET_SVE"
"ldnt1<Vesize>\t%0.<Vetype>, %2/z, %1"
@@ -1456,11 +1482,13 @@
(match_operand:<V_INT_CONTAINER> 2 "register_operand")
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
(match_dup 6)
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE && TARGET_NON_STREAMING"
{
operands[5] = aarch64_ptrue_reg (<VPRED>mode);
operands[6] = CONST0_RTX (<MODE>mode);
}
)
@@ -1474,6 +1502,7 @@
(match_operand:VNx4SI 2 "register_operand")
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
(match_operand:SVE_4 6 "aarch64_maskload_else_operand")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE && TARGET_NON_STREAMING"
@@ -1503,6 +1532,7 @@
(match_operand:VNx2DI 2 "register_operand")
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
(match_operand:SVE_2 6 "aarch64_maskload_else_operand")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE && TARGET_NON_STREAMING"
@@ -1531,6 +1561,7 @@
UNSPEC_PRED_X)
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
(match_operand:SVE_2 7 "aarch64_maskload_else_operand")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE && TARGET_NON_STREAMING"
@@ -1561,6 +1592,7 @@
UNSPEC_PRED_X)
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
(match_operand:SVE_2 7 "aarch64_maskload_else_operand")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE && TARGET_NON_STREAMING"
@@ -1588,6 +1620,7 @@
(match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate"))
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
(match_operand:SVE_2 7 "aarch64_maskload_else_operand")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE && TARGET_NON_STREAMING"
@@ -1624,6 +1657,7 @@
(match_operand:VNx4SI 2 "register_operand")
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_4BHI:Vesize>")
(match_operand:SVE_4BHI 7 "aarch64_maskload_else_operand")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
UNSPEC_PRED_X))]
@@ -1663,6 +1697,7 @@
(match_operand:VNx2DI 2 "register_operand")
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
(match_operand:SVE_2BHSI 7 "aarch64_maskload_else_operand")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
UNSPEC_PRED_X))]
@@ -1701,6 +1736,7 @@
UNSPEC_PRED_X)
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
(match_operand:SVE_2BHSI 8 "aarch64_maskload_else_operand")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
UNSPEC_PRED_X))]
@@ -1738,6 +1774,7 @@
UNSPEC_PRED_X)
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
(match_operand:SVE_2BHSI 8 "aarch64_maskload_else_operand")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
UNSPEC_PRED_X))]
@@ -1772,6 +1809,7 @@
(match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate"))
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
(match_operand:SVE_2BHSI 8 "aarch64_maskload_else_operand")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
UNSPEC_PRED_X))]

gcc/config/aarch64/aarch64-sve2.md

@@ -264,7 +264,8 @@
[(set (match_operand:SVE_FULLx24 0 "aligned_register_operand" "=Uw<vector_count>")
(unspec:SVE_FULLx24
[(match_operand:VNx16BI 2 "register_operand" "Uph")
(match_operand:SVE_FULLx24 1 "memory_operand" "m")]
(match_operand:SVE_FULLx24 1 "memory_operand" "m")
(match_operand:SVE_FULLx24 3 "aarch64_maskload_else_operand")]
LD1_COUNT))]
"TARGET_SVE2p1_OR_SME2"
"<optab><Vesize>\t%0, %K2/z, %1"

gcc/config/aarch64/iterators.md

@@ -3331,10 +3331,6 @@
(define_int_iterator SVE_LDFF1_LDNF1 [UNSPEC_LDFF1 UNSPEC_LDNF1])
(define_int_iterator SVE_PRED_LOAD [UNSPEC_PRED_X UNSPEC_LD1_SVE])
(define_int_attr pred_load [(UNSPEC_PRED_X "_x") (UNSPEC_LD1_SVE "")])
(define_int_iterator LD1_COUNT [UNSPEC_LD1_COUNT UNSPEC_LDNT1_COUNT])
(define_int_iterator ST1_COUNT [UNSPEC_ST1_COUNT UNSPEC_STNT1_COUNT])

gcc/config/aarch64/predicates.md

@@ -1067,3 +1067,7 @@
(and (match_code "const_int")
(match_test "IN_RANGE (INTVAL (op), -4096, 4080)
&& !(INTVAL (op) & 0xf)")))
(define_predicate "aarch64_maskload_else_operand"
(and (match_code "const_int,const_vector")
(match_test "op == CONST0_RTX (GET_MODE (op))")))
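
For reference, a runnable model of what the new predicate enforces (a
hedged sketch in plain C++, not GCC internals): the else operand must be
a constant whose value is all zeros, mirroring the check
op == CONST0_RTX (GET_MODE (op)), which is why the expanders above always
push a zero constant of the relevant mode.

    #include <vector>

    /* Illustrative model only: an else operand is acceptable iff every
       element of the constant is zero.  */
    static bool
    is_all_zero_else_operand (const std::vector<long> &op)
    {
      for (long v : op)
        if (v != 0)
          return false;
      return true;
    }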