diff --git a/target/arm/neon-dp.decode b/target/arm/neon-dp.decode index bd1b0e13f7..144a527ee6 100644 --- a/target/arm/neon-dp.decode +++ b/target/arm/neon-dp.decode @@ -397,3 +397,46 @@ VCVT_FU_2sh 1111 001 1 1 . ...... .... 1111 0 . . 1 .... @2reg_vcvt # So we have a single decode line and check the cmode/op in the # trans function. Vimm_1r 1111 001 . 1 . 000 ... .... cmode:4 0 . op:1 1 .... @1reg_imm + +###################################################################### +# Within the "two registers, or three registers of different lengths" +# grouping ([23,4]=0b10), bits [21:20] are either part of the opcode +# decode: 0b11 for VEXT, two-reg-misc, VTBL, and duplicate-scalar; +# or they are a size field for the three-reg-different-lengths and +# two-reg-and-scalar insn groups (where size cannot be 0b11). This +# is slightly awkward for decodetree: we handle it with this +# non-exclusive group which contains within it two exclusive groups: +# one for the size=0b11 patterns, and one for the size-not-0b11 +# patterns. This allows us to check that none of the insns within +# each subgroup accidentally overlap each other. Note that all the +# trans functions for the size-not-0b11 patterns must check and +# return false for size==3. +###################################################################### +{ + # 0b11 subgroup will go here + + # Subgroup for size != 0b11 + [ + ################################################################## + # 3-reg-different-length grouping: + # 1111 001 U 1 D sz!=11 Vn:4 Vd:4 opc:4 N 0 M 0 Vm:4 + ################################################################## + + &3diff vm vn vd size + + @3diff .... ... . . . size:2 .... .... .... . . . . .... \ + &3diff vm=%vm_dp vn=%vn_dp vd=%vd_dp + + VADDL_S_3d 1111 001 0 1 . .. .... .... 0000 . 0 . 0 .... @3diff + VADDL_U_3d 1111 001 1 1 . .. .... .... 0000 . 0 . 0 .... @3diff + + VADDW_S_3d 1111 001 0 1 . .. .... .... 0001 . 0 . 0 .... @3diff + VADDW_U_3d 1111 001 1 1 . .. .... .... 
0001 . 0 . 0 .... @3diff + + VSUBL_S_3d 1111 001 0 1 . .. .... .... 0010 . 0 . 0 .... @3diff + VSUBL_U_3d 1111 001 1 1 . .. .... .... 0010 . 0 . 0 .... @3diff + + VSUBW_S_3d 1111 001 0 1 . .. .... .... 0011 . 0 . 0 .... @3diff + VSUBW_U_3d 1111 001 1 1 . .. .... .... 0011 . 0 . 0 .... @3diff + ] +} diff --git a/target/arm/translate-neon.inc.c b/target/arm/translate-neon.inc.c index 299a61f067..9b9d411107 100644 --- a/target/arm/translate-neon.inc.c +++ b/target/arm/translate-neon.inc.c @@ -1828,3 +1828,107 @@ static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a) } return do_1reg_imm(s, a, fn); } + +static bool do_prewiden_3d(DisasContext *s, arg_3diff *a, + NeonGenWidenFn *widenfn, + NeonGenTwo64OpFn *opfn, + bool src1_wide) +{ + /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */ + TCGv_i64 rn0_64, rn1_64, rm_64; + TCGv_i32 rm; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!widenfn || !opfn) { + /* size == 3 case, which is an entirely different insn group */ + return false; + } + + if ((a->vd & 1) || (src1_wide && (a->vn & 1))) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rn0_64 = tcg_temp_new_i64(); + rn1_64 = tcg_temp_new_i64(); + rm_64 = tcg_temp_new_i64(); + + if (src1_wide) { + neon_load_reg64(rn0_64, a->vn); + } else { + TCGv_i32 tmp = neon_load_reg(a->vn, 0); + widenfn(rn0_64, tmp); + tcg_temp_free_i32(tmp); + } + rm = neon_load_reg(a->vm, 0); + + widenfn(rm_64, rm); + tcg_temp_free_i32(rm); + opfn(rn0_64, rn0_64, rm_64); + + /* + * Load second pass inputs before storing the first pass result, to + * avoid incorrect results if a narrow input overlaps with the result. 
+ */ + if (src1_wide) { + neon_load_reg64(rn1_64, a->vn + 1); + } else { + TCGv_i32 tmp = neon_load_reg(a->vn, 1); + widenfn(rn1_64, tmp); + tcg_temp_free_i32(tmp); + } + rm = neon_load_reg(a->vm, 1); + + neon_store_reg64(rn0_64, a->vd); + + widenfn(rm_64, rm); + tcg_temp_free_i32(rm); + opfn(rn1_64, rn1_64, rm_64); + neon_store_reg64(rn1_64, a->vd + 1); + + tcg_temp_free_i64(rn0_64); + tcg_temp_free_i64(rn1_64); + tcg_temp_free_i64(rm_64); + + return true; +} + +#define DO_PREWIDEN(INSN, S, EXT, OP, SRC1WIDE) \ + static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \ + { \ + static NeonGenWidenFn * const widenfn[] = { \ + gen_helper_neon_widen_##S##8, \ + gen_helper_neon_widen_##S##16, \ + tcg_gen_##EXT##_i32_i64, \ + NULL, \ + }; \ + static NeonGenTwo64OpFn * const addfn[] = { \ + gen_helper_neon_##OP##l_u16, \ + gen_helper_neon_##OP##l_u32, \ + tcg_gen_##OP##_i64, \ + NULL, \ + }; \ + return do_prewiden_3d(s, a, widenfn[a->size], \ + addfn[a->size], SRC1WIDE); \ + } + +DO_PREWIDEN(VADDL_S, s, ext, add, false) +DO_PREWIDEN(VADDL_U, u, extu, add, false) +DO_PREWIDEN(VSUBL_S, s, ext, sub, false) +DO_PREWIDEN(VSUBL_U, u, extu, sub, false) +DO_PREWIDEN(VADDW_S, s, ext, add, true) +DO_PREWIDEN(VADDW_U, u, extu, add, true) +DO_PREWIDEN(VSUBW_S, s, ext, sub, true) +DO_PREWIDEN(VSUBW_U, u, extu, sub, true) diff --git a/target/arm/translate.c b/target/arm/translate.c index bcdfec34d2..9376534441 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -5241,7 +5241,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) /* Three registers of different lengths. 
*/ int src1_wide; int src2_wide; - int prewiden; /* undefreq: bit 0 : UNDEF if size == 0 * bit 1 : UNDEF if size == 1 * bit 2 : UNDEF if size == 2 @@ -5251,10 +5250,10 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) int undefreq; /* prewiden, src1_wide, src2_wide, undefreq */ static const int neon_3reg_wide[16][4] = { - {1, 0, 0, 0}, /* VADDL */ - {1, 1, 0, 0}, /* VADDW */ - {1, 0, 0, 0}, /* VSUBL */ - {1, 1, 0, 0}, /* VSUBW */ + {0, 0, 0, 7}, /* VADDL: handled by decodetree */ + {0, 0, 0, 7}, /* VADDW: handled by decodetree */ + {0, 0, 0, 7}, /* VSUBL: handled by decodetree */ + {0, 0, 0, 7}, /* VSUBW: handled by decodetree */ {0, 1, 1, 0}, /* VADDHN */ {0, 0, 0, 0}, /* VABAL */ {0, 1, 1, 0}, /* VSUBHN */ @@ -5269,7 +5268,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) {0, 0, 0, 7}, /* Reserved: always UNDEF */ }; - prewiden = neon_3reg_wide[op][0]; src1_wide = neon_3reg_wide[op][1]; src2_wide = neon_3reg_wide[op][2]; undefreq = neon_3reg_wide[op][3]; @@ -5322,9 +5320,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) } else { tmp = neon_load_reg(rn, pass); } - if (prewiden) { - gen_neon_widen(cpu_V0, tmp, size, u); - } } if (src2_wide) { neon_load_reg64(cpu_V1, rm + pass); @@ -5335,9 +5330,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) } else { tmp2 = neon_load_reg(rm, pass); } - if (prewiden) { - gen_neon_widen(cpu_V1, tmp2, size, u); - } } switch (op) { case 0: case 1: case 4: /* VADDL, VADDW, VADDHN, VRADDHN */