diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h index c165205a49..c5e2dab9f6 100644 --- a/fpu/softfloat-specialize.h +++ b/fpu/softfloat-specialize.h @@ -419,6 +419,82 @@ static int pickNaN(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, } #endif +/*---------------------------------------------------------------------------- +| Select which NaN to propagate for a three-input operation. +| For the moment we assume that no CPU needs the 'larger significand' +| information. +| Return values : 0 : a; 1 : b; 2 : c; 3 : default-NaN +*----------------------------------------------------------------------------*/ +#if defined(TARGET_ARM) +static int pickNaNMulAdd(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, + flag cIsQNaN, flag cIsSNaN, flag infzero STATUS_PARAM) +{ + /* For ARM, the (inf,zero,qnan) case sets InvalidOp and returns + * the default NaN + */ + if (infzero && cIsQNaN) { + float_raise(float_flag_invalid STATUS_VAR); + return 3; + } + + /* This looks different from the ARM ARM pseudocode, because the ARM ARM + * puts the operands to a fused mac operation (a*b)+c in the order c,a,b. + */ + if (cIsSNaN) { + return 2; + } else if (aIsSNaN) { + return 0; + } else if (bIsSNaN) { + return 1; + } else if (cIsQNaN) { + return 2; + } else if (aIsQNaN) { + return 0; + } else { + return 1; + } +} +#elif defined(TARGET_PPC) +static int pickNaNMulAdd(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, + flag cIsQNaN, flag cIsSNaN, flag infzero STATUS_PARAM) +{ + /* For PPC, the (inf,zero,qnan) case sets InvalidOp, but we prefer + * to return an input NaN if we have one (ie c) rather than generating + * a default NaN + */ + if (infzero) { + float_raise(float_flag_invalid STATUS_VAR); + return 2; + } + + /* If fRA is a NaN return it; otherwise if fRB is a NaN return it; + * otherwise return fRC. Note that muladd on PPC is (fRA * fRC) + frB + */ + if (aIsSNaN || aIsQNaN) { + return 0; + } else if (cIsSNaN || cIsQNaN) { + return 2; + } else { + return 1; + } +} +#else +/* A default implementation: prefer a to b to c. + * This is unlikely to actually match any real implementation. + */ +static int pickNaNMulAdd(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN, + flag cIsQNaN, flag cIsSNaN, flag infzero STATUS_PARAM) +{ + if (aIsSNaN || aIsQNaN) { + return 0; + } else if (bIsSNaN || bIsQNaN) { + return 1; + } else { + return 2; + } +} +#endif + /*---------------------------------------------------------------------------- | Takes two single-precision floating-point values `a' and `b', one of which | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a @@ -459,6 +535,57 @@ static float32 propagateFloat32NaN( float32 a, float32 b STATUS_PARAM) } } +/*---------------------------------------------------------------------------- +| Takes three single-precision floating-point values `a', `b' and `c', one of +| which is a NaN, and returns the appropriate NaN result. If any of `a', +| `b' or `c' is a signaling NaN, the invalid exception is raised. +| The input infzero indicates whether a*b was 0*inf or inf*0 (in which case +| obviously c is a NaN, and whether to propagate c or some other NaN is +| implementation defined). +*----------------------------------------------------------------------------*/ + +static float32 propagateFloat32MulAddNaN(float32 a, float32 b, + float32 c, flag infzero STATUS_PARAM) +{ + flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN, + cIsQuietNaN, cIsSignalingNaN; + int which; + + aIsQuietNaN = float32_is_quiet_nan(a); + aIsSignalingNaN = float32_is_signaling_nan(a); + bIsQuietNaN = float32_is_quiet_nan(b); + bIsSignalingNaN = float32_is_signaling_nan(b); + cIsQuietNaN = float32_is_quiet_nan(c); + cIsSignalingNaN = float32_is_signaling_nan(c); + + if (aIsSignalingNaN | bIsSignalingNaN | cIsSignalingNaN) { + float_raise(float_flag_invalid STATUS_VAR); + } + + which = pickNaNMulAdd(aIsQuietNaN, aIsSignalingNaN, + bIsQuietNaN, bIsSignalingNaN, + cIsQuietNaN, cIsSignalingNaN, infzero STATUS_VAR); + + if (STATUS(default_nan_mode)) { + /* Note that this check is after pickNaNMulAdd so that function + * has an opportunity to set the Invalid flag. + */ + return float32_default_nan; + } + + switch (which) { + case 0: + return float32_maybe_silence_nan(a); + case 1: + return float32_maybe_silence_nan(b); + case 2: + return float32_maybe_silence_nan(c); + case 3: + default: + return float32_default_nan; + } +} + /*---------------------------------------------------------------------------- | Returns 1 if the double-precision floating-point value `a' is a quiet | NaN; otherwise returns 0. @@ -595,6 +722,57 @@ static float64 propagateFloat64NaN( float64 a, float64 b STATUS_PARAM) } } +/*---------------------------------------------------------------------------- +| Takes three double-precision floating-point values `a', `b' and `c', one of +| which is a NaN, and returns the appropriate NaN result. If any of `a', +| `b' or `c' is a signaling NaN, the invalid exception is raised. +| The input infzero indicates whether a*b was 0*inf or inf*0 (in which case +| obviously c is a NaN, and whether to propagate c or some other NaN is +| implementation defined). +*----------------------------------------------------------------------------*/ + +static float64 propagateFloat64MulAddNaN(float64 a, float64 b, + float64 c, flag infzero STATUS_PARAM) +{ + flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN, + cIsQuietNaN, cIsSignalingNaN; + int which; + + aIsQuietNaN = float64_is_quiet_nan(a); + aIsSignalingNaN = float64_is_signaling_nan(a); + bIsQuietNaN = float64_is_quiet_nan(b); + bIsSignalingNaN = float64_is_signaling_nan(b); + cIsQuietNaN = float64_is_quiet_nan(c); + cIsSignalingNaN = float64_is_signaling_nan(c); + + if (aIsSignalingNaN | bIsSignalingNaN | cIsSignalingNaN) { + float_raise(float_flag_invalid STATUS_VAR); + } + + which = pickNaNMulAdd(aIsQuietNaN, aIsSignalingNaN, + bIsQuietNaN, bIsSignalingNaN, + cIsQuietNaN, cIsSignalingNaN, infzero STATUS_VAR); + + if (STATUS(default_nan_mode)) { + /* Note that this check is after pickNaNMulAdd so that function + * has an opportunity to set the Invalid flag. + */ + return float64_default_nan; + } + + switch (which) { + case 0: + return float64_maybe_silence_nan(a); + case 1: + return float64_maybe_silence_nan(b); + case 2: + return float64_maybe_silence_nan(c); + case 3: + default: + return float64_default_nan; + } +} + /*---------------------------------------------------------------------------- | Returns 1 if the extended double-precision floating-point value `a' is a | quiet NaN; otherwise returns 0. This slightly differs from the same diff --git a/fpu/softfloat.c b/fpu/softfloat.c index 3aafa81d58..81a7d1ae09 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -2117,6 +2117,213 @@ float32 float32_rem( float32 a, float32 b STATUS_PARAM ) } +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the single-precision floating-point values +| `a' and `b' then adding 'c', with no intermediate rounding step after the +| multiplication. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic 754-2008. +| The flags argument allows the caller to select negation of the +| addend, the intermediate product, or the final result. (The difference +| between this and having the caller do a separate negation is that negating +| externally will flip the sign bit on NaNs.) +*----------------------------------------------------------------------------*/ + +float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM) +{ + flag aSign, bSign, cSign, zSign; + int aExp, bExp, cExp, pExp, zExp, expDiff; + uint32_t aSig, bSig, cSig; + flag pInf, pZero, pSign; + uint64_t pSig64, cSig64, zSig64; + uint32_t pSig; + int shiftcount; + flag signflip, infzero; + + a = float32_squash_input_denormal(a STATUS_VAR); + b = float32_squash_input_denormal(b STATUS_VAR); + c = float32_squash_input_denormal(c STATUS_VAR); + aSig = extractFloat32Frac(a); + aExp = extractFloat32Exp(a); + aSign = extractFloat32Sign(a); + bSig = extractFloat32Frac(b); + bExp = extractFloat32Exp(b); + bSign = extractFloat32Sign(b); + cSig = extractFloat32Frac(c); + cExp = extractFloat32Exp(c); + cSign = extractFloat32Sign(c); + + infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) || + (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0)); + + /* It is implementation-defined whether the cases of (0,inf,qnan) + * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN + * they return if they do), so we have to hand this information + * off to the target-specific pick-a-NaN routine. + */ + if (((aExp == 0xff) && aSig) || + ((bExp == 0xff) && bSig) || + ((cExp == 0xff) && cSig)) { + return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR); + } + + if (infzero) { + float_raise(float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + + if (flags & float_muladd_negate_c) { + cSign ^= 1; + } + + signflip = (flags & float_muladd_negate_result) ? 1 : 0; + + /* Work out the sign and type of the product */ + pSign = aSign ^ bSign; + if (flags & float_muladd_negate_product) { + pSign ^= 1; + } + pInf = (aExp == 0xff) || (bExp == 0xff); + pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); + + if (cExp == 0xff) { + if (pInf && (pSign ^ cSign)) { + /* addition of opposite-signed infinities => InvalidOperation */ + float_raise(float_flag_invalid STATUS_VAR); + return float32_default_nan; + } + /* Otherwise generate an infinity of the same sign */ + return packFloat32(cSign ^ signflip, 0xff, 0); + } + + if (pInf) { + return packFloat32(pSign ^ signflip, 0xff, 0); + } + + if (pZero) { + if (cExp == 0) { + if (cSig == 0) { + /* Adding two exact zeroes */ + if (pSign == cSign) { + zSign = pSign; + } else if (STATUS(float_rounding_mode) == float_round_down) { + zSign = 1; + } else { + zSign = 0; + } + return packFloat32(zSign ^ signflip, 0, 0); + } + /* Exact zero plus a denorm */ + if (STATUS(flush_to_zero)) { + float_raise(float_flag_output_denormal STATUS_VAR); + return packFloat32(cSign ^ signflip, 0, 0); + } + } + /* Zero plus something non-zero : just return the something */ + return c ^ (signflip << 31); + } + + if (aExp == 0) { + normalizeFloat32Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + normalizeFloat32Subnormal(bSig, &bExp, &bSig); + } + + /* Calculate the actual result a * b + c */ + + /* Multiply first; this is easy. */ + /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f + * because we want the true exponent, not the "one-less-than" + * flavour that roundAndPackFloat32() takes. + */ + pExp = aExp + bExp - 0x7e; + aSig = (aSig | 0x00800000) << 7; + bSig = (bSig | 0x00800000) << 8; + pSig64 = (uint64_t)aSig * bSig; + if ((int64_t)(pSig64 << 1) >= 0) { + pSig64 <<= 1; + pExp--; + } + + zSign = pSign ^ signflip; + + /* Now pSig64 is the significand of the multiply, with the explicit bit in + * position 62. + */ + if (cExp == 0) { + if (!cSig) { + /* Throw out the special case of c being an exact zero now */ + shift64RightJamming(pSig64, 32, &pSig64); + pSig = pSig64; + return roundAndPackFloat32(zSign, pExp - 1, + pSig STATUS_VAR); + } + normalizeFloat32Subnormal(cSig, &cExp, &cSig); + } + + cSig64 = (uint64_t)cSig << (62 - 23); + cSig64 |= LIT64(0x4000000000000000); + expDiff = pExp - cExp; + + if (pSign == cSign) { + /* Addition */ + if (expDiff > 0) { + /* scale c to match p */ + shift64RightJamming(cSig64, expDiff, &cSig64); + zExp = pExp; + } else if (expDiff < 0) { + /* scale p to match c */ + shift64RightJamming(pSig64, -expDiff, &pSig64); + zExp = cExp; + } else { + /* no scaling needed */ + zExp = cExp; + } + /* Add significands and make sure explicit bit ends up in posn 62 */ + zSig64 = pSig64 + cSig64; + if ((int64_t)zSig64 < 0) { + shift64RightJamming(zSig64, 1, &zSig64); + } else { + zExp--; + } + } else { + /* Subtraction */ + if (expDiff > 0) { + shift64RightJamming(cSig64, expDiff, &cSig64); + zSig64 = pSig64 - cSig64; + zExp = pExp; + } else if (expDiff < 0) { + shift64RightJamming(pSig64, -expDiff, &pSig64); + zSig64 = cSig64 - pSig64; + zExp = cExp; + zSign ^= 1; + } else { + zExp = pExp; + if (cSig64 < pSig64) { + zSig64 = pSig64 - cSig64; + } else if (pSig64 < cSig64) { + zSig64 = cSig64 - pSig64; + zSign ^= 1; + } else { + /* Exact zero */ + zSign = signflip; + if (STATUS(float_rounding_mode) == float_round_down) { + zSign ^= 1; + } + return packFloat32(zSign, 0, 0); + } + } + --zExp; + /* Normalize to put the explicit bit back into bit 62. */ + shiftcount = countLeadingZeros64(zSig64) - 1; + zSig64 <<= shiftcount; + zExp -= shiftcount; + } + shift64RightJamming(zSig64, 32, &zSig64); + return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR); +} + + /*---------------------------------------------------------------------------- | Returns the square root of the single-precision floating-point value `a'. | The operation is performed according to the IEC/IEEE Standard for Binary @@ -3464,6 +3671,226 @@ float64 float64_rem( float64 a, float64 b STATUS_PARAM ) } +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the double-precision floating-point values +| `a' and `b' then adding 'c', with no intermediate rounding step after the +| multiplication. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic 754-2008. +| The flags argument allows the caller to select negation of the +| addend, the intermediate product, or the final result. (The difference +| between this and having the caller do a separate negation is that negating +| externally will flip the sign bit on NaNs.) +*----------------------------------------------------------------------------*/ + +float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM) +{ + flag aSign, bSign, cSign, zSign; + int aExp, bExp, cExp, pExp, zExp, expDiff; + uint64_t aSig, bSig, cSig; + flag pInf, pZero, pSign; + uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1; + int shiftcount; + flag signflip, infzero; + + a = float64_squash_input_denormal(a STATUS_VAR); + b = float64_squash_input_denormal(b STATUS_VAR); + c = float64_squash_input_denormal(c STATUS_VAR); + aSig = extractFloat64Frac(a); + aExp = extractFloat64Exp(a); + aSign = extractFloat64Sign(a); + bSig = extractFloat64Frac(b); + bExp = extractFloat64Exp(b); + bSign = extractFloat64Sign(b); + cSig = extractFloat64Frac(c); + cExp = extractFloat64Exp(c); + cSign = extractFloat64Sign(c); + + infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) || + (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0)); + + /* It is implementation-defined whether the cases of (0,inf,qnan) + * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN + * they return if they do), so we have to hand this information + * off to the target-specific pick-a-NaN routine. + */ + if (((aExp == 0x7ff) && aSig) || + ((bExp == 0x7ff) && bSig) || + ((cExp == 0x7ff) && cSig)) { + return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR); + } + + if (infzero) { + float_raise(float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + + if (flags & float_muladd_negate_c) { + cSign ^= 1; + } + + signflip = (flags & float_muladd_negate_result) ? 1 : 0; + + /* Work out the sign and type of the product */ + pSign = aSign ^ bSign; + if (flags & float_muladd_negate_product) { + pSign ^= 1; + } + pInf = (aExp == 0x7ff) || (bExp == 0x7ff); + pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); + + if (cExp == 0x7ff) { + if (pInf && (pSign ^ cSign)) { + /* addition of opposite-signed infinities => InvalidOperation */ + float_raise(float_flag_invalid STATUS_VAR); + return float64_default_nan; + } + /* Otherwise generate an infinity of the same sign */ + return packFloat64(cSign ^ signflip, 0x7ff, 0); + } + + if (pInf) { + return packFloat64(pSign ^ signflip, 0x7ff, 0); + } + + if (pZero) { + if (cExp == 0) { + if (cSig == 0) { + /* Adding two exact zeroes */ + if (pSign == cSign) { + zSign = pSign; + } else if (STATUS(float_rounding_mode) == float_round_down) { + zSign = 1; + } else { + zSign = 0; + } + return packFloat64(zSign ^ signflip, 0, 0); + } + /* Exact zero plus a denorm */ + if (STATUS(flush_to_zero)) { + float_raise(float_flag_output_denormal STATUS_VAR); + return packFloat64(cSign ^ signflip, 0, 0); + } + } + /* Zero plus something non-zero : just return the something */ + return c ^ ((uint64_t)signflip << 63); + } + + if (aExp == 0) { + normalizeFloat64Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + normalizeFloat64Subnormal(bSig, &bExp, &bSig); + } + + /* Calculate the actual result a * b + c */ + + /* Multiply first; this is easy. */ + /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff + * because we want the true exponent, not the "one-less-than" + * flavour that roundAndPackFloat64() takes. + */ + pExp = aExp + bExp - 0x3fe; + aSig = (aSig | LIT64(0x0010000000000000))<<10; + bSig = (bSig | LIT64(0x0010000000000000))<<11; + mul64To128(aSig, bSig, &pSig0, &pSig1); + if ((int64_t)(pSig0 << 1) >= 0) { + shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1); + pExp--; + } + + zSign = pSign ^ signflip; + + /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit + * bit in position 126. + */ + if (cExp == 0) { + if (!cSig) { + /* Throw out the special case of c being an exact zero now */ + shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1); + return roundAndPackFloat64(zSign, pExp - 1, + pSig1 STATUS_VAR); + } + normalizeFloat64Subnormal(cSig, &cExp, &cSig); + } + + /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the + * significand of the addend, with the explicit bit in position 126. + */ + cSig0 = cSig << (126 - 64 - 52); + cSig1 = 0; + cSig0 |= LIT64(0x4000000000000000); + expDiff = pExp - cExp; + + if (pSign == cSign) { + /* Addition */ + if (expDiff > 0) { + /* scale c to match p */ + shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); + zExp = pExp; + } else if (expDiff < 0) { + /* scale p to match c */ + shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); + zExp = cExp; + } else { + /* no scaling needed */ + zExp = cExp; + } + /* Add significands and make sure explicit bit ends up in posn 126 */ + add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); + if ((int64_t)zSig0 < 0) { + shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1); + } else { + zExp--; + } + shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1); + return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR); + } else { + /* Subtraction */ + if (expDiff > 0) { + shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); + sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); + zExp = pExp; + } else if (expDiff < 0) { + shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); + sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); + zExp = cExp; + zSign ^= 1; + } else { + zExp = pExp; + if (lt128(cSig0, cSig1, pSig0, pSig1)) { + sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); + } else if (lt128(pSig0, pSig1, cSig0, cSig1)) { + sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); + zSign ^= 1; + } else { + /* Exact zero */ + zSign = signflip; + if (STATUS(float_rounding_mode) == float_round_down) { + zSign ^= 1; + } + return packFloat64(zSign, 0, 0); + } + } + --zExp; + /* Do the equivalent of normalizeRoundAndPackFloat64() but + * starting with the significand in a pair of uint64_t. + */ + if (zSig0) { + shiftcount = countLeadingZeros64(zSig0) - 1; + shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1); + if (zSig1) { + zSig0 |= 1; + } + zExp -= shiftcount; + } else { + shiftcount = countLeadingZeros64(zSig1) - 1; + zSig0 = zSig1 << shiftcount; + zExp -= (shiftcount + 64); + } + return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR); + } +} + /*---------------------------------------------------------------------------- | Returns the square root of the double-precision floating-point value `a'. | The operation is performed according to the IEC/IEEE Standard for Binary diff --git a/fpu/softfloat.h b/fpu/softfloat.h index 618ddee569..07c2929613 100644 --- a/fpu/softfloat.h +++ b/fpu/softfloat.h @@ -211,6 +211,18 @@ void set_floatx80_rounding_precision(int val STATUS_PARAM); *----------------------------------------------------------------------------*/ void float_raise( int8 flags STATUS_PARAM); +/*---------------------------------------------------------------------------- +| Options to indicate which negations to perform in float*_muladd() +| Using these differs from negating an input or output before calling +| the muladd function in that this means that a NaN doesn't have its +| sign bit inverted before it is propagated. +*----------------------------------------------------------------------------*/ +enum { + float_muladd_negate_c = 1, + float_muladd_negate_product = 2, + float_muladd_negate_result = 3, +}; + /*---------------------------------------------------------------------------- | Software IEC/IEEE integer-to-floating-point conversion routines. *----------------------------------------------------------------------------*/ @@ -269,6 +281,7 @@ float32 float32_sub( float32, float32 STATUS_PARAM ); float32 float32_mul( float32, float32 STATUS_PARAM ); float32 float32_div( float32, float32 STATUS_PARAM ); float32 float32_rem( float32, float32 STATUS_PARAM ); +float32 float32_muladd(float32, float32, float32, int STATUS_PARAM); float32 float32_sqrt( float32 STATUS_PARAM ); float32 float32_exp2( float32 STATUS_PARAM ); float32 float32_log2( float32 STATUS_PARAM ); @@ -375,6 +388,7 @@ float64 float64_sub( float64, float64 STATUS_PARAM ); float64 float64_mul( float64, float64 STATUS_PARAM ); float64 float64_div( float64, float64 STATUS_PARAM ); float64 float64_rem( float64, float64 STATUS_PARAM ); +float64 float64_muladd(float64, float64, float64, int STATUS_PARAM); float64 float64_sqrt( float64 STATUS_PARAM ); float64 float64_log2( float64 STATUS_PARAM ); int float64_eq( float64, float64 STATUS_PARAM ); diff --git a/target-arm/cpu.h b/target-arm/cpu.h index 6ab780d7ef..c4d742f084 100644 --- a/target-arm/cpu.h +++ b/target-arm/cpu.h @@ -366,7 +366,7 @@ enum arm_features { ARM_FEATURE_VFP3, ARM_FEATURE_VFP_FP16, ARM_FEATURE_NEON, - ARM_FEATURE_DIV, + ARM_FEATURE_THUMB_DIV, /* divide supported in Thumb encoding */ ARM_FEATURE_M, /* Microcontroller profile. */ ARM_FEATURE_OMAPCP, /* OMAP specific CP15 ops handling. */ ARM_FEATURE_THUMB2EE, @@ -375,6 +375,8 @@ enum arm_features { ARM_FEATURE_V5, ARM_FEATURE_STRONGARM, ARM_FEATURE_VAPA, /* cp15 VA to PA lookups */ + ARM_FEATURE_ARM_DIV, /* divide supported in ARM encoding */ + ARM_FEATURE_VFP4, /* VFPv4 (implies that NEON is v2) */ }; static inline int arm_feature(CPUARMState *env, int feature) diff --git a/target-arm/helper.c b/target-arm/helper.c index e2428eb7b2..97af4d0bba 100644 --- a/target-arm/helper.c +++ b/target-arm/helper.c @@ -193,7 +193,7 @@ static void cpu_reset_model_id(CPUARMState *env, uint32_t id) set_feature(env, ARM_FEATURE_THUMB2); set_feature(env, ARM_FEATURE_V7); set_feature(env, ARM_FEATURE_M); - set_feature(env, ARM_FEATURE_DIV); + set_feature(env, ARM_FEATURE_THUMB_DIV); break; case ARM_CPUID_ANY: /* For userspace emulation. */ set_feature(env, ARM_FEATURE_V4T); @@ -204,10 +204,11 @@ static void cpu_reset_model_id(CPUARMState *env, uint32_t id) set_feature(env, ARM_FEATURE_THUMB2); set_feature(env, ARM_FEATURE_VFP); set_feature(env, ARM_FEATURE_VFP3); + set_feature(env, ARM_FEATURE_VFP4); set_feature(env, ARM_FEATURE_VFP_FP16); set_feature(env, ARM_FEATURE_NEON); set_feature(env, ARM_FEATURE_THUMB2EE); - set_feature(env, ARM_FEATURE_DIV); + set_feature(env, ARM_FEATURE_ARM_DIV); set_feature(env, ARM_FEATURE_V7MP); break; case ARM_CPUID_TI915T: @@ -261,6 +262,9 @@ static void cpu_reset_model_id(CPUARMState *env, uint32_t id) if (arm_feature(env, ARM_FEATURE_V7)) { set_feature(env, ARM_FEATURE_VAPA); } + if (arm_feature(env, ARM_FEATURE_ARM_DIV)) { + set_feature(env, ARM_FEATURE_THUMB_DIV); + } } void cpu_reset(CPUARMState *env) @@ -471,7 +475,7 @@ static uint32_t cpu_arm_find_by_name(const char *name) void cpu_arm_close(CPUARMState *env) { - free(env); + g_free(env); } uint32_t cpsr_read(CPUARMState *env) @@ -3039,8 +3043,7 @@ float32 HELPER(rsqrte_f32)(float32 a, CPUState *env) val64 = float64_val(f64); - val = ((val64 >> 63) & 0x80000000) - | ((result_exp & 0xff) << 23) + val = ((result_exp & 0xff) << 23) | ((val64 >> 29) & 0x7fffff); return make_float32(val); } @@ -3082,6 +3085,19 @@ uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUState *env) return 0x80000000 | ((float64_val(f64) >> 21) & 0x7fffffff); } +/* VFPv4 fused multiply-accumulate */ +float32 VFP_HELPER(muladd, s)(float32 a, float32 b, float32 c, void *fpstp) +{ + float_status *fpst = fpstp; + return float32_muladd(a, b, c, 0, fpst); +} + +float64 VFP_HELPER(muladd, d)(float64 a, float64 b, float64 c, void *fpstp) +{ + float_status *fpst = fpstp; + return float64_muladd(a, b, c, 0, fpst); +} + void HELPER(set_teecr)(CPUState *env, uint32_t val) { val &= 1; diff --git a/target-arm/helper.h b/target-arm/helper.h index 3ad1cb0881..16dd5fcc89 100644 --- a/target-arm/helper.h +++ b/target-arm/helper.h @@ -132,6 +132,9 @@ DEF_HELPER_2(vfp_fcvt_f32_to_f16, i32, f32, env) DEF_HELPER_2(neon_fcvt_f16_to_f32, f32, i32, env) DEF_HELPER_2(neon_fcvt_f32_to_f16, i32, f32, env) +DEF_HELPER_4(vfp_muladdd, f64, f64, f64, f64, ptr) +DEF_HELPER_4(vfp_muladds, f32, f32, f32, f32, ptr) + DEF_HELPER_3(recps_f32, f32, f32, f32, env) DEF_HELPER_3(rsqrts_f32, f32, f32, f32, env) DEF_HELPER_2(recpe_f32, f32, f32, env) diff --git a/target-arm/machine.c b/target-arm/machine.c index 7d4fc545a6..aaee9b9c11 100644 --- a/target-arm/machine.c +++ b/target-arm/machine.c @@ -189,7 +189,7 @@ int cpu_load(QEMUFile *f, void *opaque, int version_id) env->vfp.vec_stride = qemu_get_be32(f); if (arm_feature(env, ARM_FEATURE_VFP3)) { - for (i = 0; i < 16; i++) { + for (i = 16; i < 32; i++) { CPU_DoubleU u; u.l.upper = qemu_get_be32(f); u.l.lower = qemu_get_be32(f); diff --git a/target-arm/translate.c b/target-arm/translate.c index 75c0ad413a..0f35b60946 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -3141,6 +3141,57 @@ static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn) case 8: /* div: fn / fm */ gen_vfp_div(dp); break; + case 10: /* VFNMA : fd = muladd(-fd, fn, fm) */ + case 11: /* VFNMS : fd = muladd(-fd, -fn, fm) */ + case 12: /* VFMA : fd = muladd( fd, fn, fm) */ + case 13: /* VFMS : fd = muladd( fd, -fn, fm) */ + /* These are fused multiply-add, and must be done as one + * floating point operation with no rounding between the + * multiplication and addition steps. + * NB that doing the negations here as separate steps is + * correct : an input NaN should come out with its sign bit + * flipped if it is a negated-input. + */ + if (!arm_feature(env, ARM_FEATURE_VFP4)) { + return 1; + } + if (dp) { + TCGv_ptr fpst; + TCGv_i64 frd; + if (op & 1) { + /* VFNMS, VFMS */ + gen_helper_vfp_negd(cpu_F0d, cpu_F0d); + } + frd = tcg_temp_new_i64(); + tcg_gen_ld_f64(frd, cpu_env, vfp_reg_offset(dp, rd)); + if (op & 2) { + /* VFNMA, VFNMS */ + gen_helper_vfp_negd(frd, frd); + } + fpst = get_fpstatus_ptr(0); + gen_helper_vfp_muladdd(cpu_F0d, cpu_F0d, + cpu_F1d, frd, fpst); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i64(frd); + } else { + TCGv_ptr fpst; + TCGv_i32 frd; + if (op & 1) { + /* VFNMS, VFMS */ + gen_helper_vfp_negs(cpu_F0s, cpu_F0s); + } + frd = tcg_temp_new_i32(); + tcg_gen_ld_f32(frd, cpu_env, vfp_reg_offset(dp, rd)); + if (op & 2) { + gen_helper_vfp_negs(frd, frd); + } + fpst = get_fpstatus_ptr(0); + gen_helper_vfp_muladds(cpu_F0s, cpu_F0s, + cpu_F1s, frd, fpst); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(frd); + } + break; case 14: /* fconst */ if (!arm_feature(env, ARM_FEATURE_VFP3)) return 1; @@ -4417,6 +4468,7 @@ static void gen_neon_narrow_op(int op, int u, int size, TCGv dest, TCGv_i64 src) #define NEON_3R_VPMIN 21 #define NEON_3R_VQDMULH_VQRDMULH 22 #define NEON_3R_VPADD 23 +#define NEON_3R_VFM 25 /* VFMA, VFMS : float fused multiply-add */ #define NEON_3R_FLOAT_ARITH 26 /* float VADD, VSUB, VPADD, VABD */ #define NEON_3R_FLOAT_MULTIPLY 27 /* float VMLA, VMLS, VMUL */ #define NEON_3R_FLOAT_CMP 28 /* float VCEQ, VCGE, VCGT */ @@ -4449,6 +4501,7 @@ static const uint8_t neon_3r_sizes[] = { [NEON_3R_VPMIN] = 0x7, [NEON_3R_VQDMULH_VQRDMULH] = 0x6, [NEON_3R_VPADD] = 0x7, + [NEON_3R_VFM] = 0x5, /* size bit 1 encodes op */ [NEON_3R_FLOAT_ARITH] = 0x5, /* size bit 1 encodes op */ [NEON_3R_FLOAT_MULTIPLY] = 0x5, /* size bit 1 encodes op */ [NEON_3R_FLOAT_CMP] = 0x5, /* size bit 1 encodes op */ @@ -4726,6 +4779,11 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) return 1; } break; + case NEON_3R_VFM: + if (!arm_feature(env, ARM_FEATURE_VFP4) || u) { + return 1; + } + break; default: break; } @@ -5006,6 +5064,20 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) else gen_helper_rsqrts_f32(tmp, tmp, tmp2, cpu_env); break; + case NEON_3R_VFM: + { + /* VFMA, VFMS: fused multiply-add */ + TCGv_ptr fpstatus = get_fpstatus_ptr(1); + TCGv_i32 tmp3 = neon_load_reg(rd, pass); + if (size) { + /* VFMS */ + gen_helper_vfp_negs(tmp, tmp); + } + gen_helper_vfp_muladds(tmp, tmp, tmp2, tmp3, fpstatus); + tcg_temp_free_i32(tmp3); + tcg_temp_free_ptr(fpstatus); + break; + } default: abort(); } @@ -7569,11 +7641,16 @@ static void disas_arm_insn(CPUState * env, DisasContext *s) } break; case 2: /* Multiplies (Type 3). */ - tmp = load_reg(s, rm); - tmp2 = load_reg(s, rs); - if (insn & (1 << 20)) { + switch ((insn >> 20) & 0x7) { + case 5: + if (((insn >> 6) ^ (insn >> 7)) & 1) { + /* op2 not 00x or 11x : UNDEF */ + goto illegal_op; + } /* Signed multiply most significant [accumulate]. (SMMUL, SMMLA, SMMLS) */ + tmp = load_reg(s, rm); + tmp2 = load_reg(s, rs); tmp64 = gen_muls_i64_i32(tmp, tmp2); if (rd != 15) { @@ -7592,7 +7669,15 @@ static void disas_arm_insn(CPUState * env, DisasContext *s) tcg_gen_trunc_i64_i32(tmp, tmp64); tcg_temp_free_i64(tmp64); store_reg(s, rn, tmp); - } else { + break; + case 0: + case 4: + /* SMLAD, SMUAD, SMLSD, SMUSD, SMLALD, SMLSLD */ + if (insn & (1 << 7)) { + goto illegal_op; + } + tmp = load_reg(s, rm); + tmp2 = load_reg(s, rs); if (insn & (1 << 5)) gen_swap_half(tmp2); gen_smul_dual(tmp, tmp2); @@ -7625,6 +7710,28 @@ static void disas_arm_insn(CPUState * env, DisasContext *s) } store_reg(s, rn, tmp); } + break; + case 1: + case 3: + /* SDIV, UDIV */ + if (!arm_feature(env, ARM_FEATURE_ARM_DIV)) { + goto illegal_op; + } + if (((insn >> 5) & 7) || (rd != 15)) { + goto illegal_op; + } + tmp = load_reg(s, rm); + tmp2 = load_reg(s, rs); + if (insn & (1 << 21)) { + gen_helper_udiv(tmp, tmp, tmp2); + } else { + gen_helper_sdiv(tmp, tmp, tmp2); + } + tcg_temp_free_i32(tmp2); + store_reg(s, rn, tmp); + break; + default: + goto illegal_op; } break; case 3: @@ -8497,8 +8604,9 @@ static int disas_thumb2_insn(CPUState *env, DisasContext *s, uint16_t insn_hw1) tmp2 = load_reg(s, rm); if ((op & 0x50) == 0x10) { /* sdiv, udiv */ - if (!arm_feature(env, ARM_FEATURE_DIV)) + if (!arm_feature(env, ARM_FEATURE_THUMB_DIV)) { goto illegal_op; + } if (op & 0x20) gen_helper_udiv(tmp, tmp, tmp2); else