hash: Add SHA-NI implementation of SHA-256 (#15152)

* hash: Add SSE2 implementation of SHA-256

Implementation taken from
tarsnap/libcperciva@661752aee8.

Co-authored-by: Christoph M. Becker <cmbecker69@gmx.de>
Co-authored-by: Niels Dossche <7771979+nielsdos@users.noreply.github.com>

* zend_cpuinfo: Add ZEND_CPU_FEATURE_SHA

* hash: Add SHA-NI implementation of SHA-256

Implementation taken from
tarsnap/libcperciva@661752aee8.

Co-authored-by: Christoph M. Becker <cmbecker69@gmx.de>

* NEWS / UPGRADING

---------

Co-authored-by: Christoph M. Becker <cmbecker69@gmx.de>
Co-authored-by: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
This commit is contained in:
Tim Düsterhus 2024-08-08 22:19:33 +02:00 committed by GitHub
parent a355c3572e
commit 6eca7839af
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 483 additions and 3 deletions

2
NEWS
View File

@ -26,6 +26,8 @@ PHP NEWS
- Hash: - Hash:
. Deprecated passing incorrect data types for options to ext/hash functions. . Deprecated passing incorrect data types for options to ext/hash functions.
(nielsdos) (nielsdos)
. Added SSE2 and SHA-NI implementation of SHA-256. (timwolla, Colin Percival,
Graham Percival)
- PHPDBG: - PHPDBG:
. array out of bounds, stack overflow handled for segfault handler on windows. . array out of bounds, stack overflow handled for segfault handler on windows.

View File

@ -18,7 +18,7 @@
18. avifinfo (ext/standard/libavifinfo) see ext/standard/libavifinfo/LICENSE 18. avifinfo (ext/standard/libavifinfo) see ext/standard/libavifinfo/LICENSE
19. xxHash (ext/hash/xxhash) 19. xxHash (ext/hash/xxhash)
20. Lexbor (ext/dom/lexbor/lexbor) see ext/dom/lexbor/LICENSE 20. Lexbor (ext/dom/lexbor/lexbor) see ext/dom/lexbor/LICENSE
21. Portions of libcperciva (ext/hash/hash_sha_{ni,sse2}.c) see the header in the source file
3. pcre2lib (ext/pcre) 3. pcre2lib (ext/pcre)

View File

@ -955,6 +955,10 @@ PHP 8.4 UPGRADE NOTES
. Improved the performance of FTP uploads up to a factor of 10x for large . Improved the performance of FTP uploads up to a factor of 10x for large
uploads. uploads.
- Hash:
. Added SSE2 and SHA-NI implementations of SHA-256. This improves the performance
on supported CPUs by ~1.3x (SSE2) and 3x - 5x (SHA-NI).
- MBString: - MBString:
. The performance of strspn() and strcspn() is greatly improved. . The performance of strspn() and strcspn() is greatly improved.
They now run in linear time instead of being bounded by quadratic time. They now run in linear time instead of being bounded by quadratic time.

View File

@ -64,6 +64,7 @@ typedef enum _zend_cpu_feature {
ZEND_CPU_FEATURE_AVX512F = (1<<16 | ZEND_CPU_EBX_MASK), ZEND_CPU_FEATURE_AVX512F = (1<<16 | ZEND_CPU_EBX_MASK),
ZEND_CPU_FEATURE_AVX512DQ = (1<<17 | ZEND_CPU_EBX_MASK), ZEND_CPU_FEATURE_AVX512DQ = (1<<17 | ZEND_CPU_EBX_MASK),
ZEND_CPU_FEATURE_AVX512CD = (1<<28 | ZEND_CPU_EBX_MASK), ZEND_CPU_FEATURE_AVX512CD = (1<<28 | ZEND_CPU_EBX_MASK),
ZEND_CPU_FEATURE_SHA = (1<<29 | ZEND_CPU_EBX_MASK),
/* intentionally don't support = (1<<30 | ZEND_CPU_EBX_MASK) */ /* intentionally don't support = (1<<30 | ZEND_CPU_EBX_MASK) */
/* intentionally don't support = (1<<31 | ZEND_CPU_EBX_MASK) */ /* intentionally don't support = (1<<31 | ZEND_CPU_EBX_MASK) */

View File

@ -34,7 +34,7 @@ else
PHP_HASH_CFLAGS="$PHP_HASH_CFLAGS -I@ext_srcdir@/$SHA3_DIR -DKeccakP200_excluded -DKeccakP400_excluded -DKeccakP800_excluded -DZEND_ENABLE_STATIC_TSRMLS_CACHE=1" PHP_HASH_CFLAGS="$PHP_HASH_CFLAGS -I@ext_srcdir@/$SHA3_DIR -DKeccakP200_excluded -DKeccakP400_excluded -DKeccakP800_excluded -DZEND_ENABLE_STATIC_TSRMLS_CACHE=1"
fi fi
EXT_HASH_SOURCES="hash.c hash_md.c hash_sha.c hash_ripemd.c hash_haval.c \ EXT_HASH_SOURCES="hash.c hash_md.c hash_sha.c hash_sha_sse2.c hash_sha_ni.c hash_ripemd.c hash_haval.c \
hash_tiger.c hash_gost.c hash_snefru.c hash_whirlpool.c hash_adler32.c \ hash_tiger.c hash_gost.c hash_snefru.c hash_whirlpool.c hash_adler32.c \
hash_crc32.c hash_fnv.c hash_joaat.c $EXT_HASH_SHA3_SOURCES hash_crc32.c hash_fnv.c hash_joaat.c $EXT_HASH_SHA3_SOURCES
murmur/PMurHash.c murmur/PMurHash128.c hash_murmur.c hash_xxhash.c" murmur/PMurHash.c murmur/PMurHash128.c hash_murmur.c hash_xxhash.c"

View File

@ -9,7 +9,7 @@ if (PHP_MHASH != 'no') {
PHP_HASH = 'yes'; PHP_HASH = 'yes';
EXTENSION('hash', 'hash.c hash_md.c hash_sha.c hash_ripemd.c hash_haval.c ' + EXTENSION('hash', 'hash.c hash_md.c hash_sha.c hash_sha_sse2.c hash_sha_ni.c hash_ripemd.c hash_haval.c ' +
'hash_tiger.c hash_gost.c hash_snefru.c hash_whirlpool.c ' + 'hash_tiger.c hash_gost.c hash_snefru.c hash_whirlpool.c ' +
'hash_adler32.c hash_crc32.c hash_joaat.c hash_fnv.c ' + 'hash_adler32.c hash_crc32.c hash_joaat.c hash_fnv.c ' +
'hash_sha3.c hash_murmur.c hash_xxhash.c', false); 'hash_sha3.c hash_murmur.c hash_xxhash.c', false);

View File

@ -17,6 +17,7 @@
#include "php_hash.h" #include "php_hash.h"
#include "php_hash_sha.h" #include "php_hash_sha.h"
#include "Zend/zend_cpuinfo.h"
static const unsigned char PADDING[128] = static const unsigned char PADDING[128] =
{ {
@ -160,6 +161,24 @@ PHP_HASH_API void PHP_SHA256InitArgs(PHP_SHA256_CTX * context, ZEND_ATTRIBUTE_UN
*/ */
static void SHA256Transform(uint32_t state[8], const unsigned char block[64]) static void SHA256Transform(uint32_t state[8], const unsigned char block[64])
{ {
#if defined(PHP_HASH_INTRIN_SHA_NATIVE)
SHA256_Transform_shani(state, block);
return;
#elif defined(PHP_HASH_INTRIN_SHA_RESOLVER)
if (zend_cpu_supports(ZEND_CPU_FEATURE_SSSE3) && zend_cpu_supports(ZEND_CPU_FEATURE_SHA)) {
SHA256_Transform_shani(state, block);
return;
}
#endif
#if defined(__SSE2__)
uint32_t tmp32[72];
SHA256_Transform_sse2(state, block, &tmp32[0], &tmp32[64]);
ZEND_SECURE_ZERO((unsigned char*) tmp32, sizeof(tmp32));
return;
#endif
uint32_t a = state[0], b = state[1], c = state[2], d = state[3]; uint32_t a = state[0], b = state[1], c = state[2], d = state[3];
uint32_t e = state[4], f = state[5], g = state[6], h = state[7]; uint32_t e = state[4], f = state[5], g = state[6], h = state[7];
uint32_t x[16], T1, T2, W[64]; uint32_t x[16], T1, T2, W[64];

176
ext/hash/hash_sha_ni.c Normal file
View File

@ -0,0 +1,176 @@
/*-
* Copyright 2018 Tarsnap Backup Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "php_hash.h"
#include "php_hash_sha.h"
#if (defined(__i386__) || defined(__x86_64__)) && defined(HAVE_IMMINTRIN_H)
# include <immintrin.h>
# if PHP_HASH_INTRIN_SHA_RESOLVER
static __m128i be32dec_128(const uint8_t * src) __attribute__((target("ssse3")));
void SHA256_Transform_shani(uint32_t state[PHP_STATIC_RESTRICT 8], const uint8_t block[PHP_STATIC_RESTRICT 64]) __attribute__((target("ssse3,sha")));
# endif
/* Original implementation from libcperciva follows.
*
* Modified to use `PHP_STATIC_RESTRICT` for MSVC compatibility.
*/
/**
* This code uses intrinsics from the following feature sets:
* SHANI: _mm_sha256msg1_epu32, _mm_sha256msg2_epu32, _mm_sha256rnds2_epu32
* SSSE3: _mm_shuffle_epi8, _mm_alignr_epi8
* SSE2: Everything else
*
* The SSSE3 intrinsics could be avoided at a slight cost by using a few SSE2
* instructions in their place; we have not done this since to our knowledge
* there are presently no CPUs which support the SHANI instruction set but do
* not support SSSE3.
*/
/* Load 32-bit big-endian words. */
static __m128i
be32dec_128(const uint8_t * src)
{
const __m128i SHUF = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11,
4, 5, 6, 7, 0, 1, 2, 3);
__m128i x;
/* Load four 32-bit words. */
x = _mm_loadu_si128((const __m128i *)src);
/* Reverse the order of the bytes in each word. */
return (_mm_shuffle_epi8(x, SHUF));
}
/* Convert an unsigned 32-bit immediate into a signed value. */
#define I32(a) ((UINT32_C(a) >= UINT32_C(0x80000000)) ? \
-(int32_t)(UINT32_C(0xffffffff) - UINT32_C(a)) - 1 : (int32_t)INT32_C(a))
/* Load four unsigned 32-bit immediates into a vector register. */
#define IMM4(a, b, c, d) _mm_set_epi32(I32(a), I32(b), I32(c), I32(d))
/* Run four rounds of SHA256. */
#define RND4(S, W, K0, K1, K2, K3) do { \
__m128i M; \
\
/* Add the next four words of message schedule and round constants. */ \
M = _mm_add_epi32(W, IMM4(K3, K2, K1, K0)); \
\
/* Perform two rounds of SHA256, using the low two words in M. */ \
S[1] = _mm_sha256rnds2_epu32(S[1], S[0], M); \
\
/* Shift the two words of M down and perform the next two rounds. */ \
M = _mm_srli_si128(M, 8); \
S[0] = _mm_sha256rnds2_epu32(S[0], S[1], M); \
} while (0)
/* Compute the ith set of four words of message schedule. */
#define MSG4(W, i) do { \
W[(i + 0) % 4] = _mm_sha256msg1_epu32(W[(i + 0) % 4], W[(i + 1) % 4]); \
W[(i + 0) % 4] = _mm_add_epi32(W[(i + 0) % 4], \
_mm_alignr_epi8(W[(i + 3) % 4], W[(i + 2) % 4], 4)); \
W[(i + 0) % 4] = _mm_sha256msg2_epu32(W[(i + 0) % 4], W[(i + 3) % 4]); \
} while (0)
/* Perform 4 rounds of SHA256 and generate more message schedule if needed. */
#define RNDMSG(S, W, i, K0, K1, K2, K3) do { \
RND4(S, W[i % 4], K0, K1, K2, K3); \
if (i < 12) \
MSG4(W, i + 4); \
} while (0)
/**
* SHA256_Transform_shani(state, block):
* Compute the SHA256 block compression function, transforming ${state} using
* the data in ${block}. This implementation uses x86 SHANI and SSSE3
* instructions, and should only be used if CPUSUPPORT_X86_SHANI and _SSSE3
* are defined and cpusupport_x86_shani() and _ssse3() return nonzero.
*/
void
SHA256_Transform_shani(uint32_t state[PHP_STATIC_RESTRICT 8],
const uint8_t block[PHP_STATIC_RESTRICT 64])
{
__m128i S3210, S7654;
__m128i S0123, S4567;
__m128i S0145, S2367;
__m128i W[4];
__m128i S[2];
/* Load state. */
S3210 = _mm_loadu_si128((const __m128i *)&state[0]);
S7654 = _mm_loadu_si128((const __m128i *)&state[4]);
/* Shuffle the 8 32-bit values into the order we need them. */
S0123 = _mm_shuffle_epi32(S3210, 0x1B);
S4567 = _mm_shuffle_epi32(S7654, 0x1B);
S0145 = _mm_unpackhi_epi64(S4567, S0123);
S2367 = _mm_unpacklo_epi64(S4567, S0123);
/* Load input block; this is the start of the message schedule. */
W[0] = be32dec_128(&block[0]);
W[1] = be32dec_128(&block[16]);
W[2] = be32dec_128(&block[32]);
W[3] = be32dec_128(&block[48]);
/* Initialize working variables. */
S[0] = S0145;
S[1] = S2367;
/* Perform 64 rounds, 4 at a time. */
RNDMSG(S, W, 0, 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5);
RNDMSG(S, W, 1, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5);
RNDMSG(S, W, 2, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3);
RNDMSG(S, W, 3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174);
RNDMSG(S, W, 4, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc);
RNDMSG(S, W, 5, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da);
RNDMSG(S, W, 6, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7);
RNDMSG(S, W, 7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967);
RNDMSG(S, W, 8, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13);
RNDMSG(S, W, 9, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85);
RNDMSG(S, W, 10, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3);
RNDMSG(S, W, 11, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070);
RNDMSG(S, W, 12, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5);
RNDMSG(S, W, 13, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3);
RNDMSG(S, W, 14, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208);
RNDMSG(S, W, 15, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2);
/* Mix local working variables into global state. */
S0145 = _mm_add_epi32(S0145, S[0]);
S2367 = _mm_add_epi32(S2367, S[1]);
/* Shuffle state back to the original word order and store. */
S0123 = _mm_unpackhi_epi64(S2367, S0145);
S4567 = _mm_unpacklo_epi64(S2367, S0145);
S3210 = _mm_shuffle_epi32(S0123, 0x1B);
S7654 = _mm_shuffle_epi32(S4567, 0x1B);
_mm_storeu_si128((__m128i *)&state[0], S3210);
_mm_storeu_si128((__m128i *)&state[4], S7654);
}
#endif

257
ext/hash/hash_sha_sse2.c Normal file
View File

@ -0,0 +1,257 @@
/*-
* Copyright 2021 Tarsnap Backup Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "php_hash.h"
#include "php_hash_sha.h"
#ifdef __SSE2__
# include <emmintrin.h>
/* Original implementation from libcperciva follows.
*
* Modified to use `PHP_STATIC_RESTRICT` for MSVC compatibility.
*/
/**
* mm_bswap_epi32(a):
* Byte-swap each 32-bit word.
*/
static inline __m128i
mm_bswap_epi32(__m128i a)
{
/* Swap bytes in each 16-bit word. */
a = _mm_or_si128(_mm_slli_epi16(a, 8), _mm_srli_epi16(a, 8));
/* Swap all 16-bit words. */
a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 3, 0, 1));
a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2, 3, 0, 1));
return (a);
}
/* SHA256 round constants. */
static const uint32_t Krnd[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
h += S1(e) + Ch(e, f, g) + k; \
d += h; \
h += S0(a) + Maj(a, b, c)
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, ii) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i + ii] + Krnd[i + ii])
/* Message schedule computation */
#define SHR32(x, n) (_mm_srli_epi32(x, n))
#define ROTR32(x, n) (_mm_or_si128(SHR32(x, n), _mm_slli_epi32(x, (32-n))))
#define s0_128(x) _mm_xor_si128(_mm_xor_si128( \
ROTR32(x, 7), ROTR32(x, 18)), SHR32(x, 3))
static inline __m128i
s1_128_high(__m128i a)
{
__m128i b;
__m128i c;
/* ROTR, loading data as {B, B, A, A}; lanes 1 & 3 will be junk. */
b = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 0, 0));
c = _mm_xor_si128(_mm_srli_epi64(b, 17), _mm_srli_epi64(b, 19));
/* Shift and XOR with rotated data; lanes 1 & 3 will be junk. */
c = _mm_xor_si128(c, _mm_srli_epi32(b, 10));
/* Shuffle good data back and zero unwanted lanes. */
c = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 0, 2, 0));
c = _mm_slli_si128(c, 8);
return (c);
}
static inline __m128i
s1_128_low(__m128i a)
{
__m128i b;
__m128i c;
/* ROTR, loading data as {B, B, A, A}; lanes 1 & 3 will be junk. */
b = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2));
c = _mm_xor_si128(_mm_srli_epi64(b, 17), _mm_srli_epi64(b, 19));
/* Shift and XOR with rotated data; lanes 1 & 3 will be junk. */
c = _mm_xor_si128(c, _mm_srli_epi32(b, 10));
/* Shuffle good data back and zero unwanted lanes. */
c = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 0, 2, 0));
c = _mm_srli_si128(c, 8);
return (c);
}
/**
* SPAN_ONE_THREE(a, b):
* Combine the upper three words of ${a} with the lowest word of ${b}. This
* could also be thought of returning bits [159:32] of the 256-bit value
* consisting of (b[127:0] a[127:0]). In other words, set:
* dst[31:0] := a[63:32]
* dst[63:32] := a[95:64]
* dst[95:64] := a[127:96]
* dst[127:96] := b[31:0]
*/
#define SPAN_ONE_THREE(a, b) (_mm_shuffle_epi32(_mm_castps_si128( \
_mm_move_ss(_mm_castsi128_ps(a), _mm_castsi128_ps(b))), \
_MM_SHUFFLE(0, 3, 2, 1)))
/**
* MSG4(X0, X1, X2, X3):
* Calculate the next four values of the message schedule. If we define
* ${W[j]} as the first unknown value in the message schedule, then the input
* arguments are:
* X0 = W[j - 16] : W[j - 13]
* X1 = W[j - 12] : W[j - 9]
* X2 = W[j - 8] : W[j - 5]
* X3 = W[j - 4] : W[j - 1]
* This function therefore calculates:
* X4 = W[j + 0] : W[j + 3]
*/
static inline __m128i
MSG4(__m128i X0, __m128i X1, __m128i X2, __m128i X3)
{
__m128i X4;
__m128i Xj_minus_seven, Xj_minus_fifteen;
/* Set up variables which span X values. */
Xj_minus_seven = SPAN_ONE_THREE(X2, X3);
Xj_minus_fifteen = SPAN_ONE_THREE(X0, X1);
/* Begin computing X4. */
X4 = _mm_add_epi32(X0, Xj_minus_seven);
X4 = _mm_add_epi32(X4, s0_128(Xj_minus_fifteen));
/* First half of s1. */
X4 = _mm_add_epi32(X4, s1_128_low(X3));
/* Second half of s1; this depends on the above value of X4. */
X4 = _mm_add_epi32(X4, s1_128_high(X4));
return (X4);
}
/**
* SHA256_Transform_sse2(state, block, W, S):
* Compute the SHA256 block compression function, transforming ${state} using
* the data in ${block}. This implementation uses x86 SSE2 instructions, and
* should only be used if _SSE2 is defined and cpusupport_x86_sse2() returns
* nonzero. The arrays W and S may be filled with sensitive data, and should
* be cleared by the callee.
*/
void
SHA256_Transform_sse2(uint32_t state[PHP_STATIC_RESTRICT 8],
const uint8_t block[PHP_STATIC_RESTRICT 64], uint32_t W[PHP_STATIC_RESTRICT 64],
uint32_t S[PHP_STATIC_RESTRICT 8])
{
__m128i Y[4];
int i;
/* 1. Prepare the first part of the message schedule W. */
Y[0] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[0]));
_mm_storeu_si128((__m128i *)&W[0], Y[0]);
Y[1] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[16]));
_mm_storeu_si128((__m128i *)&W[4], Y[1]);
Y[2] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[32]));
_mm_storeu_si128((__m128i *)&W[8], Y[2]);
Y[3] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[48]));
_mm_storeu_si128((__m128i *)&W[12], Y[3]);
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
for (i = 0; i < 64; i += 16) {
RNDr(S, W, 0, i);
RNDr(S, W, 1, i);
RNDr(S, W, 2, i);
RNDr(S, W, 3, i);
RNDr(S, W, 4, i);
RNDr(S, W, 5, i);
RNDr(S, W, 6, i);
RNDr(S, W, 7, i);
RNDr(S, W, 8, i);
RNDr(S, W, 9, i);
RNDr(S, W, 10, i);
RNDr(S, W, 11, i);
RNDr(S, W, 12, i);
RNDr(S, W, 13, i);
RNDr(S, W, 14, i);
RNDr(S, W, 15, i);
if (i == 48)
break;
Y[0] = MSG4(Y[0], Y[1], Y[2], Y[3]);
_mm_storeu_si128((__m128i *)&W[16 + i + 0], Y[0]);
Y[1] = MSG4(Y[1], Y[2], Y[3], Y[0]);
_mm_storeu_si128((__m128i *)&W[16 + i + 4], Y[1]);
Y[2] = MSG4(Y[2], Y[3], Y[0], Y[1]);
_mm_storeu_si128((__m128i *)&W[16 + i + 8], Y[2]);
Y[3] = MSG4(Y[3], Y[0], Y[1], Y[2]);
_mm_storeu_si128((__m128i *)&W[16 + i + 12], Y[3]);
}
/* 4. Mix local working variables into global state. */
for (i = 0; i < 8; i++)
state[i] += S[i];
}
#endif

View File

@ -45,6 +45,27 @@ typedef struct {
#define PHP_SHA256Init(ctx) PHP_SHA256InitArgs(ctx, NULL) #define PHP_SHA256Init(ctx) PHP_SHA256InitArgs(ctx, NULL)
PHP_HASH_API void PHP_SHA256InitArgs(PHP_SHA256_CTX *, ZEND_ATTRIBUTE_UNUSED HashTable *); PHP_HASH_API void PHP_SHA256InitArgs(PHP_SHA256_CTX *, ZEND_ATTRIBUTE_UNUSED HashTable *);
PHP_HASH_API void PHP_SHA256Update(PHP_SHA256_CTX *, const unsigned char *, size_t); PHP_HASH_API void PHP_SHA256Update(PHP_SHA256_CTX *, const unsigned char *, size_t);
#ifdef _MSC_VER
# define PHP_STATIC_RESTRICT
#else
# define PHP_STATIC_RESTRICT static restrict
#endif
#if defined(__SSE2__)
void SHA256_Transform_sse2(uint32_t state[PHP_STATIC_RESTRICT 8], const uint8_t block[PHP_STATIC_RESTRICT 64], uint32_t W[PHP_STATIC_RESTRICT 64], uint32_t S[PHP_STATIC_RESTRICT 8]);
#endif
#if (defined(__i386__) || defined(__x86_64__)) && defined(HAVE_IMMINTRIN_H)
# if defined(__SSSE3__) && defined(__SHA__)
# define PHP_HASH_INTRIN_SHA_NATIVE 1
# elif defined(HAVE_FUNC_ATTRIBUTE_TARGET)
# define PHP_HASH_INTRIN_SHA_RESOLVER 1
# endif
void SHA256_Transform_shani(uint32_t state[PHP_STATIC_RESTRICT 8], const uint8_t block[PHP_STATIC_RESTRICT 64]);
#endif
PHP_HASH_API void PHP_SHA256Final(unsigned char[32], PHP_SHA256_CTX *); PHP_HASH_API void PHP_SHA256Final(unsigned char[32], PHP_SHA256_CTX *);
/* SHA384 context */ /* SHA384 context */