target/arm: Convert Neon 3-reg-diff prewidening ops to decodetree

Convert the "pre-widening" insns VADDL, VSUBL, VADDW and VSUBW in the Neon 3-registers-different-lengths group to decodetree. These insns work by widening one or both inputs to double their size, performing an add or subtract at the doubled size and then storing the double-size result. As usual, rather than copying the loop of the original decoder (which needs awkward code to avoid problems when source and destination registers overlap) we just unroll the two passes. Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
2024-12-02 00:03:35 +08:00 · 2020-06-16 10:32:25 +01:00 · 2020-06-16 10:32:25 +01:00 · b28be09570
commit b28be09570
parent 9593a3988c
3 changed files with 151 additions and 12 deletions
--- a/target/arm/neon-dp.decode
+++ b/target/arm/neon-dp.decode
@ -397,3 +397,46 @@ VCVT_FU_2sh      1111 001 1 1 . ...... .... 1111 0 . . 1 .... @2reg_vcvt
 # So we have a single decode line and check the cmode/op in the
 # trans function.
 Vimm_1r          1111 001 . 1 . 000 ... .... cmode:4 0 . op:1 1 .... @1reg_imm
+
+######################################################################
+# Within the "two registers, or three registers of different lengths"
+# grouping ([23,4]=0b10), bits [21:20] are either part of the opcode
+# decode: 0b11 for VEXT, two-reg-misc, VTBL, and duplicate-scalar;
+# or they are a size field for the three-reg-different-lengths and
+# two-reg-and-scalar insn groups (where size cannot be 0b11). This
+# is slightly awkward for decodetree: we handle it with this
+# non-exclusive group which contains within it two exclusive groups:
+# one for the size=0b11 patterns, and one for the size-not-0b11
+# patterns. This allows us to check that none of the insns within
+# each subgroup accidentally overlap each other. Note that all the
+# trans functions for the size-not-0b11 patterns must check and
+# return false for size==3.
+######################################################################
+{
+  # 0b11 subgroup will go here
+
+  # Subgroup for size != 0b11
+  [
+    ##################################################################
+    # 3-reg-different-length grouping:
+    # 1111 001 U 1 D sz!=11 Vn:4 Vd:4 opc:4 N 0 M 0 Vm:4
+    ##################################################################
+
+    &3diff vm vn vd size
+
+    @3diff       .... ... . . . size:2 .... .... .... . . . . .... \
+                 &3diff vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+    VADDL_S_3d   1111 001 0 1 . .. .... .... 0000 . 0 . 0 .... @3diff
+    VADDL_U_3d   1111 001 1 1 . .. .... .... 0000 . 0 . 0 .... @3diff
+
+    VADDW_S_3d   1111 001 0 1 . .. .... .... 0001 . 0 . 0 .... @3diff
+    VADDW_U_3d   1111 001 1 1 . .. .... .... 0001 . 0 . 0 .... @3diff
+
+    VSUBL_S_3d   1111 001 0 1 . .. .... .... 0010 . 0 . 0 .... @3diff
+    VSUBL_U_3d   1111 001 1 1 . .. .... .... 0010 . 0 . 0 .... @3diff
+
+    VSUBW_S_3d   1111 001 0 1 . .. .... .... 0011 . 0 . 0 .... @3diff
+    VSUBW_U_3d   1111 001 1 1 . .. .... .... 0011 . 0 . 0 .... @3diff
+  ]
+}
--- a/target/arm/translate-neon.inc.c
+++ b/target/arm/translate-neon.inc.c
@ -1828,3 +1828,107 @@ static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
    }
    return do_1reg_imm(s, a, fn);
 }
+
+static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
+                           NeonGenWidenFn *widenfn,
+                           NeonGenTwo64OpFn *opfn,
+                           bool src1_wide)
+{
+    /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */
+    TCGv_i64 rn0_64, rn1_64, rm_64;
+    TCGv_i32 rm;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+        return false;
+    }
+
+    /* UNDEF accesses to D16-D31 if they don't exist. */
+    if (!dc_isar_feature(aa32_simd_r32, s) &&
+        ((a->vd | a->vn | a->vm) & 0x10)) {
+        return false;
+    }
+
+    if (!widenfn || !opfn) {
+        /* size == 3 case, which is an entirely different insn group */
+        return false;
+    }
+
+    if ((a->vd & 1) || (src1_wide && (a->vn & 1))) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    rn0_64 = tcg_temp_new_i64();
+    rn1_64 = tcg_temp_new_i64();
+    rm_64 = tcg_temp_new_i64();
+
+    if (src1_wide) {
+        neon_load_reg64(rn0_64, a->vn);
+    } else {
+        TCGv_i32 tmp = neon_load_reg(a->vn, 0);
+        widenfn(rn0_64, tmp);
+        tcg_temp_free_i32(tmp);
+    }
+    rm = neon_load_reg(a->vm, 0);
+
+    widenfn(rm_64, rm);
+    tcg_temp_free_i32(rm);
+    opfn(rn0_64, rn0_64, rm_64);
+
+    /*
+     * Load second pass inputs before storing the first pass result, to
+     * avoid incorrect results if a narrow input overlaps with the result.
+     */
+    if (src1_wide) {
+        neon_load_reg64(rn1_64, a->vn + 1);
+    } else {
+        TCGv_i32 tmp = neon_load_reg(a->vn, 1);
+        widenfn(rn1_64, tmp);
+        tcg_temp_free_i32(tmp);
+    }
+    rm = neon_load_reg(a->vm, 1);
+
+    neon_store_reg64(rn0_64, a->vd);
+
+    widenfn(rm_64, rm);
+    tcg_temp_free_i32(rm);
+    opfn(rn1_64, rn1_64, rm_64);
+    neon_store_reg64(rn1_64, a->vd + 1);
+
+    tcg_temp_free_i64(rn0_64);
+    tcg_temp_free_i64(rn1_64);
+    tcg_temp_free_i64(rm_64);
+
+    return true;
+}
+
+#define DO_PREWIDEN(INSN, S, EXT, OP, SRC1WIDE)                         \
+    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
+    {                                                                   \
+        static NeonGenWidenFn * const widenfn[] = {                     \
+            gen_helper_neon_widen_##S##8,                               \
+            gen_helper_neon_widen_##S##16,                              \
+            tcg_gen_##EXT##_i32_i64,                                    \
+            NULL,                                                       \
+        };                                                              \
+        static NeonGenTwo64OpFn * const addfn[] = {                     \
+            gen_helper_neon_##OP##l_u16,                                \
+            gen_helper_neon_##OP##l_u32,                                \
+            tcg_gen_##OP##_i64,                                         \
+            NULL,                                                       \
+        };                                                              \
+        return do_prewiden_3d(s, a, widenfn[a->size],                   \
+                              addfn[a->size], SRC1WIDE);                \
+    }
+
+DO_PREWIDEN(VADDL_S, s, ext, add, false)
+DO_PREWIDEN(VADDL_U, u, extu, add, false)
+DO_PREWIDEN(VSUBL_S, s, ext, sub, false)
+DO_PREWIDEN(VSUBL_U, u, extu, sub, false)
+DO_PREWIDEN(VADDW_S, s, ext, add, true)
+DO_PREWIDEN(VADDW_U, u, extu, add, true)
+DO_PREWIDEN(VSUBW_S, s, ext, sub, true)
+DO_PREWIDEN(VSUBW_U, u, extu, sub, true)
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@ -5241,7 +5241,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                /* Three registers of different lengths.  */
                int src1_wide;
                int src2_wide;
-                int prewiden;
                /* undefreq: bit 0 : UNDEF if size == 0
                 *           bit 1 : UNDEF if size == 1
                 *           bit 2 : UNDEF if size == 2
@ -5251,10 +5250,10 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                int undefreq;
                /* prewiden, src1_wide, src2_wide, undefreq */
                static const int neon_3reg_wide[16][4] = {
-                    {1, 0, 0, 0}, /* VADDL */
-                    {1, 1, 0, 0}, /* VADDW */
-                    {1, 0, 0, 0}, /* VSUBL */
-                    {1, 1, 0, 0}, /* VSUBW */
+                    {0, 0, 0, 7}, /* VADDL: handled by decodetree */
+                    {0, 0, 0, 7}, /* VADDW: handled by decodetree */
+                    {0, 0, 0, 7}, /* VSUBL: handled by decodetree */
+                    {0, 0, 0, 7}, /* VSUBW: handled by decodetree */
                    {0, 1, 1, 0}, /* VADDHN */
                    {0, 0, 0, 0}, /* VABAL */
                    {0, 1, 1, 0}, /* VSUBHN */
@ -5269,7 +5268,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                    {0, 0, 0, 7}, /* Reserved: always UNDEF */
                };

-                prewiden = neon_3reg_wide[op][0];
                src1_wide = neon_3reg_wide[op][1];
                src2_wide = neon_3reg_wide[op][2];
                undefreq = neon_3reg_wide[op][3];
@ -5322,9 +5320,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                        } else {
                            tmp = neon_load_reg(rn, pass);
                        }
-                        if (prewiden) {
-                            gen_neon_widen(cpu_V0, tmp, size, u);
-                        }
                    }
                    if (src2_wide) {
                        neon_load_reg64(cpu_V1, rm + pass);
@ -5335,9 +5330,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                        } else {
                            tmp2 = neon_load_reg(rm, pass);
                        }
-                        if (prewiden) {
-                            gen_neon_widen(cpu_V1, tmp2, size, u);
-                        }
                    }
                    switch (op) {
                    case 0: case 1: case 4: /* VADDL, VADDW, VADDHN, VRADDHN */