From c3299d7dab15aeed52a34535f1967d426f5327de Mon Sep 17 00:00:00 2001 From: Frank Du Date: Wed, 2 Sep 2020 10:53:29 +0000 Subject: [PATCH] X86: Fast CRC32 computation using PCLMULQDQ instruction Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 Signed-off-by: Frank Du Closes GH-6018 --- Zend/zend_cpuinfo.h | 15 ++ Zend/zend_portability.h | 56 +++++ configure.ac | 1 + ext/hash/hash_crc32.c | 25 +- ext/hash/tests/crc32.phpt | 86 ++++++- ext/standard/basic_functions.c | 5 + ext/standard/config.m4 | 2 +- ext/standard/config.w32 | 2 +- ext/standard/crc32.c | 9 +- ext/standard/crc32_x86.c | 349 ++++++++++++++++++++++++++ ext/standard/crc32_x86.h | 52 ++++ ext/standard/tests/strings/crc32.phpt | 28 +++ 12 files changed, 620 insertions(+), 10 deletions(-) create mode 100644 ext/standard/crc32_x86.c create mode 100644 ext/standard/crc32_x86.h diff --git a/Zend/zend_cpuinfo.h b/Zend/zend_cpuinfo.h index b8b08e7ad06..0baec57c23a 100644 --- a/Zend/zend_cpuinfo.h +++ b/Zend/zend_cpuinfo.h @@ -159,6 +159,17 @@ static zend_always_inline int zend_cpu_supports_sse42() { return __builtin_cpu_supports("sse4.2"); } +/* __builtin_cpu_supports has pclmul from gcc9 */ +#if (!defined(__GNUC__) || (ZEND_GCC_VERSION >= 9000)) +ZEND_NO_SANITIZE_ADDRESS +static zend_always_inline int zend_cpu_supports_pclmul() { +#if PHP_HAVE_BUILTIN_CPU_INIT + __builtin_cpu_init(); +#endif + return __builtin_cpu_supports("pclmul"); +} +#endif + ZEND_NO_SANITIZE_ADDRESS static zend_always_inline int zend_cpu_supports_avx() { #if PHP_HAVE_BUILTIN_CPU_INIT @@ -196,6 +207,10 @@ static zend_always_inline int zend_cpu_supports_sse42() { return zend_cpu_supports(ZEND_CPU_FEATURE_SSE42); } +static zend_always_inline int zend_cpu_supports_pclmul() { + return zend_cpu_supports(ZEND_CPU_FEATURE_PCLMULQDQ); +} + static zend_always_inline int zend_cpu_supports_avx() { return zend_cpu_supports(ZEND_CPU_FEATURE_AVX); } diff --git a/Zend/zend_portability.h b/Zend/zend_portability.h index e94560d18f0..f4acc9930e8 100644 --- a/Zend/zend_portability.h +++ b/Zend/zend_portability.h @@ -487,6 +487,10 @@ extern "C++" { # define PHP_HAVE_SSE4_2 # endif +# if defined(HAVE_WMMINTRIN_H) +# define PHP_HAVE_PCLMUL +# endif + /* * AVX2 support was added in gcc 4.7, but AVX2 intrinsics don't work in * __attribute__((target("avx2"))) functions until gcc 4.9. @@ -547,6 +551,58 @@ extern "C++" { # define ZEND_INTRIN_SSE4_2_FUNC_DECL(func) #endif +#ifdef __PCLMUL__ +/* Instructions compiled directly. */ +# define ZEND_INTRIN_PCLMUL_NATIVE 1 +#elif (defined(HAVE_FUNC_ATTRIBUTE_TARGET) && defined(PHP_HAVE_PCLMUL)) || defined(ZEND_WIN32) +/* Function resolved by ifunc or MINIT. */ +# define ZEND_INTRIN_PCLMUL_RESOLVER 1 +#endif + +/* Do not use for conditional declaration of API functions! */ +#if defined(ZEND_INTRIN_PCLMUL_RESOLVER) && defined(ZEND_INTRIN_HAVE_IFUNC_TARGET) && (!defined(__GNUC__) || (ZEND_GCC_VERSION >= 9000)) +/* __builtin_cpu_supports has pclmul from gcc9 */ +# define ZEND_INTRIN_PCLMUL_FUNC_PROTO 1 +#elif defined(ZEND_INTRIN_PCLMUL_RESOLVER) +# define ZEND_INTRIN_PCLMUL_FUNC_PTR 1 +#endif + +#ifdef ZEND_INTRIN_PCLMUL_RESOLVER +# ifdef HAVE_FUNC_ATTRIBUTE_TARGET +# define ZEND_INTRIN_PCLMUL_FUNC_DECL(func) ZEND_API func __attribute__((target("pclmul"))) +# else +# define ZEND_INTRIN_PCLMUL_FUNC_DECL(func) func +# endif +#else +# define ZEND_INTRIN_PCLMUL_FUNC_DECL(func) +#endif + +#if defined(ZEND_INTRIN_SSE4_2_NATIVE) && defined(ZEND_INTRIN_PCLMUL_NATIVE) +/* Instructions compiled directly. */ +# define ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE 1 +#elif (defined(HAVE_FUNC_ATTRIBUTE_TARGET) && defined(PHP_HAVE_SSE4_2) && defined(PHP_HAVE_PCLMUL)) || defined(ZEND_WIN32) +/* Function resolved by ifunc or MINIT. */ +# define ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER 1 +#endif + +/* Do not use for conditional declaration of API functions! */ +#if defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER) && defined(ZEND_INTRIN_HAVE_IFUNC_TARGET) && (!defined(__GNUC__) || (ZEND_GCC_VERSION >= 9000)) +/* __builtin_cpu_supports has pclmul from gcc9 */ +# define ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO 1 +#elif defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER) +# define ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR 1 +#endif + +#ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER +# ifdef HAVE_FUNC_ATTRIBUTE_TARGET +# define ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(func) ZEND_API func __attribute__((target("sse4.2,pclmul"))) +# else +# define ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(func) func +# endif +#else +# define ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(func) +#endif + #ifdef __AVX2__ # define ZEND_INTRIN_AVX2_NATIVE 1 #elif (defined(HAVE_FUNC_ATTRIBUTE_TARGET) && defined(PHP_HAVE_AVX2)) || defined(ZEND_WIN32) diff --git a/configure.ac b/configure.ac index 7e7f3edec48..3ff9a91bf9f 100644 --- a/configure.ac +++ b/configure.ac @@ -409,6 +409,7 @@ sys/ipc.h \ dlfcn.h \ tmmintrin.h \ nmmintrin.h \ +wmmintrin.h \ immintrin.h ],[],[],[ #ifdef HAVE_SYS_PARAM_H diff --git a/ext/hash/hash_crc32.c b/ext/hash/hash_crc32.c index de270522d7e..ade32a3b35c 100644 --- a/ext/hash/hash_crc32.c +++ b/ext/hash/hash_crc32.c @@ -18,6 +18,7 @@ #include "php_hash.h" #include "php_hash_crc32.h" #include "php_hash_crc32_tables.h" +#include "ext/standard/crc32_x86.h" PHP_HASH_API void PHP_CRC32Init(PHP_CRC32_CTX *context) { @@ -26,27 +27,39 @@ PHP_HASH_API void PHP_CRC32Init(PHP_CRC32_CTX *context) PHP_HASH_API void PHP_CRC32Update(PHP_CRC32_CTX *context, const unsigned char *input, size_t len) { - size_t i; + size_t i = 0; - for (i = 0; i < len; ++i) { +#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER + i += crc32_x86_simd_update(X86_CRC32, &context->state, input, len); +#endif + + for (; i < len; ++i) { context->state = (context->state << 8) ^ crc32_table[(context->state >> 24) ^ (input[i] & 0xff)]; } } PHP_HASH_API void PHP_CRC32BUpdate(PHP_CRC32_CTX *context, const unsigned char *input, size_t len) { - size_t i; + size_t i = 0; - for (i = 0; i < len; ++i) { +#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER + i += crc32_x86_simd_update(X86_CRC32B, &context->state, input, len); +#endif + + for (; i < len; ++i) { context->state = (context->state >> 8) ^ crc32b_table[(context->state ^ input[i]) & 0xff]; } } PHP_HASH_API void PHP_CRC32CUpdate(PHP_CRC32_CTX *context, const unsigned char *input, size_t len) { - size_t i; + size_t i = 0; - for (i = 0; i < len; ++i) { +#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER + i += crc32_x86_simd_update(X86_CRC32C, &context->state, input, len); +#endif + + for (; i < len; ++i) { context->state = (context->state >> 8) ^ crc32c_table[(context->state ^ input[i]) & 0xff]; } } diff --git a/ext/hash/tests/crc32.phpt b/ext/hash/tests/crc32.phpt index f99152b49e0..fda63c71360 100644 --- a/ext/hash/tests/crc32.phpt +++ b/ext/hash/tests/crc32.phpt @@ -10,6 +10,20 @@ echo hash('crc32', 'message digest'), "\n"; echo hash('crc32', 'abcdefghijklmnopqrstuvwxyz'), "\n"; echo hash('crc32', 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'), "\n"; echo hash('crc32', '12345678901234567890123456789012345678901234567890123456789012345678901234567890'), "\n"; +echo hash('crc32', '1234567890123456'), "\n"; +echo hash('crc32', '1234567890123456abc'), "\n"; +echo hash('crc32', '12345678901234561234567890123456'), "\n"; +echo hash('crc32', '12345678901234561234567890123456abc'), "\n"; +echo hash('crc32', '123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32', '123456789012345612345678901234561234567890123456abc'), "\n"; +echo hash('crc32', '1234567890123456123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32', '1234567890123456123456789012345612345678901234561234567890123456abc'), "\n"; +echo hash('crc32', '12345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32', '12345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n"; +echo hash('crc32', '12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32', '12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n"; +echo hash('crc32', '123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32', '123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n"; echo "crc32b\n"; echo hash('crc32b', ''), "\n"; @@ -19,6 +33,20 @@ echo hash('crc32b', 'message digest'), "\n"; echo hash('crc32b', 'abcdefghijklmnopqrstuvwxyz'), "\n"; echo hash('crc32b', 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'), "\n"; echo hash('crc32b', '12345678901234567890123456789012345678901234567890123456789012345678901234567890'), "\n"; +echo hash('crc32b', '1234567890123456'), "\n"; +echo hash('crc32b', '1234567890123456abc'), "\n"; +echo hash('crc32b', '12345678901234561234567890123456'), "\n"; +echo hash('crc32b', '12345678901234561234567890123456abc'), "\n"; +echo hash('crc32b', '123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32b', '123456789012345612345678901234561234567890123456abc'), "\n"; +echo hash('crc32b', '1234567890123456123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32b', '1234567890123456123456789012345612345678901234561234567890123456abc'), "\n"; +echo hash('crc32b', '12345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32b', '12345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n"; +echo hash('crc32b', '12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32b', '12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n"; +echo hash('crc32b', '123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32b', '123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n"; echo "crc32c\n"; echo hash('crc32c', ''), "\n"; @@ -59,6 +87,20 @@ echo hash('crc32c', "Even if I could be Shakespeare, I think I should still choo echo hash('crc32c', "The fugacity of a constituent in a mixture of gases at a given temperature is proportional to its mole fraction. Lewis-Randall Rule"), "\n"; echo hash('crc32c', "How can you write a big system without C++? -Paul Glick"), "\n"; echo hash('crc32c', "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#\$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff"), "\n"; +echo hash('crc32c', '1234567890123456'), "\n"; +echo hash('crc32c', '1234567890123456abc'), "\n"; +echo hash('crc32c', '12345678901234561234567890123456'), "\n"; +echo hash('crc32c', '12345678901234561234567890123456abc'), "\n"; +echo hash('crc32c', '123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32c', '123456789012345612345678901234561234567890123456abc'), "\n"; +echo hash('crc32c', '1234567890123456123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32c', '1234567890123456123456789012345612345678901234561234567890123456abc'), "\n"; +echo hash('crc32c', '12345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32c', '12345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n"; +echo hash('crc32c', '12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32c', '12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n"; +echo hash('crc32c', '123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456'), "\n"; +echo hash('crc32c', '123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc'), "\n"; ?> --EXPECT-- @@ -70,6 +112,20 @@ crc32 9693bf77 882174a0 96790816 +98b0e78d +a6f33d71 +900a1d38 +396978fe +adfc6afe +d3ef9388 +c53911dc +37006f1b +4a54af3a +98d05c71 +5a26f5b4 +b9108715 +cc684112 +b2ac45af crc32b 00000000 e8b7be43 @@ -78,6 +134,20 @@ e8b7be43 4c2750bd 1fc2e6d2 7ca94a72 +1e5fcdb7 +70b54c2f +094fb11e +38210c49 +7399c6ef +83e98d04 +1f26a94e +e2e8634a +0642542d +43b42c9b +262e1ded +b7a463c4 +dfa1bbae +4022d57a crc32c 00000000 c1d04330 @@ -116,4 +186,18 @@ de2e65c5 297a88ed 66ed1d8b dcded527 -9c44184b \ No newline at end of file +9c44184b +9aa4287f +ab2761c5 +cd486b4b +c19c4a41 +1ea5b441 +36d20512 +31d11ffa +65d5bb9e +a0e3e317 +8dc10a7c +7ab04135 +c292a38d +e3e558ec +b6c5e13e diff --git a/ext/standard/basic_functions.c b/ext/standard/basic_functions.c index 18e4985da2b..7051e4a456b 100755 --- a/ext/standard/basic_functions.c +++ b/ext/standard/basic_functions.c @@ -33,6 +33,7 @@ #include "ext/standard/php_dns.h" #include "ext/standard/php_uuencode.h" #include "ext/standard/php_mt_rand.h" +#include "ext/standard/crc32_x86.h" #ifdef PHP_WIN32 #include "win32/php_win32_globals.h" @@ -363,6 +364,10 @@ PHP_MINIT_FUNCTION(basic) /* {{{ */ BASIC_MINIT_SUBMODULE(string_intrin) #endif +#if ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR + BASIC_MINIT_SUBMODULE(crc32_x86_intrin) +#endif + #if ZEND_INTRIN_AVX2_FUNC_PTR || ZEND_INTRIN_SSSE3_FUNC_PTR BASIC_MINIT_SUBMODULE(base64_intrin) #endif diff --git a/ext/standard/config.m4 b/ext/standard/config.m4 index 04b9d0aea1d..1cb0af79f68 100644 --- a/ext/standard/config.m4 +++ b/ext/standard/config.m4 @@ -449,7 +449,7 @@ PHP_NEW_EXTENSION(standard, array.c base64.c basic_functions.c browscap.c crc32. http_fopen_wrapper.c php_fopen_wrapper.c credits.c css.c \ var_unserializer.c ftok.c sha1.c user_filters.c uuencode.c \ filters.c proc_open.c streamsfuncs.c http.c password.c \ - random.c net.c hrtime.c,,, + random.c net.c hrtime.c crc32_x86.c,,, -DZEND_ENABLE_STATIC_TSRMLS_CACHE=1) PHP_ADD_MAKEFILE_FRAGMENT diff --git a/ext/standard/config.w32 b/ext/standard/config.w32 index a5737ea8538..d23bfd9db26 100644 --- a/ext/standard/config.w32 +++ b/ext/standard/config.w32 @@ -25,7 +25,7 @@ ADD_FLAG("LIBS_STANDARD", "iphlpapi.lib"); EXTENSION("standard", "array.c base64.c basic_functions.c browscap.c \ crc32.c crypt.c crypt_freesec.c crypt_blowfish.c crypt_sha256.c \ - crypt_sha512.c php_crypt_r.c \ + crypt_sha512.c php_crypt_r.c crc32_x86.c \ datetime.c dir.c dl.c dns.c dns_win32.c exec.c \ file.c filestat.c formatted_print.c fsock.c head.c html.c image.c \ info.c iptc.c lcg.c link.c mail.c math.c md5.c metaphone.c microtime.c \ diff --git a/ext/standard/crc32.c b/ext/standard/crc32.c index 6ab5c317bfc..40d9404053e 100644 --- a/ext/standard/crc32.c +++ b/ext/standard/crc32.c @@ -17,6 +17,7 @@ #include "php.h" #include "basic_functions.h" #include "crc32.h" +#include "crc32_x86.h" #if HAVE_AARCH64_CRC32 # include @@ -74,7 +75,7 @@ PHP_FUNCTION(crc32) char *p; size_t nr; uint32_t crcinit = 0; - register uint32_t crc; + uint32_t crc; ZEND_PARSE_PARAMETERS_START(1, 1) Z_PARAM_STRING(p, nr) @@ -89,6 +90,12 @@ PHP_FUNCTION(crc32) } #endif +#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER + size_t nr_simd = crc32_x86_simd_update(X86_CRC32B, &crc, (const unsigned char *)p, nr); + nr -= nr_simd; + p += nr_simd; +#endif + for (; nr--; ++p) { crc = ((crc >> 8) & 0x00FFFFFF) ^ crc32tab[(crc ^ (*p)) & 0xFF ]; } diff --git a/ext/standard/crc32_x86.c b/ext/standard/crc32_x86.c new file mode 100644 index 00000000000..296eadeb949 --- /dev/null +++ b/ext/standard/crc32_x86.c @@ -0,0 +1,349 @@ +/* + +----------------------------------------------------------------------+ + | Copyright (c) The PHP Group | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | https://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Author: Frank Du | + +----------------------------------------------------------------------+ + | Compute the crc32 of the buffer. Based on: | + | "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ" | + | V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 | +*/ + +#include "crc32_x86.h" + +#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER +# include +# include +#endif + +#if ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER +# include "Zend/zend_cpuinfo.h" +#endif + +#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER + +typedef struct _crc32_pclmul_bit_consts { + uint64_t k1k2[2]; + uint64_t k3k4[2]; + uint64_t k5k6[2]; + uint64_t uPx[2]; +} crc32_pclmul_consts; + +static const crc32_pclmul_consts crc32_pclmul_consts_maps[X86_CRC32_MAX] = { + { /* X86_CRC32, polynomial: 0x04C11DB7 */ + {0x00e6228b11, 0x008833794c}, /* endianness swap */ + {0x00e8a45605, 0x00c5b9cd4c}, /* endianness swap */ + {0x00490d678d, 0x00f200aa66}, /* endianness swap */ + {0x0104d101df, 0x0104c11db7} + }, + { /* X86_CRC32B, polynomial: 0x04C11DB7 with reversed ordering */ + {0x0154442bd4, 0x01c6e41596}, + {0x01751997d0, 0x00ccaa009e}, + {0x0163cd6124, 0x01db710640}, + {0x01f7011641, 0x01db710641}, + }, + { /* X86_CRC32C, polynomial: 0x1EDC6F41 with reversed ordering */ + {0x00740eef02, 0x009e4addf8}, + {0x00f20c0dfe, 0x014cd00bd6}, + {0x00dd45aab8, 0x0000000000}, + {0x00dea713f1, 0x0105ec76f0} + } +}; + +static uint8_t pclmul_shuf_mask_table[16] = { + 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, + 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, +}; + +/* Folding of 128-bit data chunks */ +#define CRC32_FOLDING_BLOCK_SIZE (16) + +/* PCLMUL version of non-relfected crc32 */ +ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)); +size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts) +{ + size_t nr_in = nr; + __m128i x0, x1, x2, k, shuf_mask; + + if (nr < CRC32_FOLDING_BLOCK_SIZE) { + return 0; + } + + shuf_mask = _mm_loadu_si128((__m128i *)(pclmul_shuf_mask_table)); + x0 = _mm_cvtsi32_si128(*crc); + x1 = _mm_loadu_si128((__m128i *)(p + 0x00)); + x0 = _mm_slli_si128(x0, 12); + x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */ + x0 = _mm_xor_si128(x1, x0); + p += CRC32_FOLDING_BLOCK_SIZE; + nr -= CRC32_FOLDING_BLOCK_SIZE; + + if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) { + __m128i x3, x4; + + x1 = _mm_loadu_si128((__m128i *)(p + 0x00)); + x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */ + x2 = _mm_loadu_si128((__m128i *)(p + 0x10)); + x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */ + x3 = _mm_loadu_si128((__m128i *)(p + 0x20)); + x3 = _mm_shuffle_epi8(x3, shuf_mask); /* endianness swap */ + p += CRC32_FOLDING_BLOCK_SIZE * 3; + nr -= CRC32_FOLDING_BLOCK_SIZE * 3; + + k = _mm_loadu_si128((__m128i *)consts->k1k2); + /* parallel folding by 4 */ + while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) { + __m128i x5, x6, x7, x8, x9, x10, x11; + x4 = _mm_clmulepi64_si128(x0, k, 0x00); + x5 = _mm_clmulepi64_si128(x1, k, 0x00); + x6 = _mm_clmulepi64_si128(x2, k, 0x00); + x7 = _mm_clmulepi64_si128(x3, k, 0x00); + x0 = _mm_clmulepi64_si128(x0, k, 0x11); + x1 = _mm_clmulepi64_si128(x1, k, 0x11); + x2 = _mm_clmulepi64_si128(x2, k, 0x11); + x3 = _mm_clmulepi64_si128(x3, k, 0x11); + x8 = _mm_loadu_si128((__m128i *)(p + 0x00)); + x8 = _mm_shuffle_epi8(x8, shuf_mask); /* endianness swap */ + x9 = _mm_loadu_si128((__m128i *)(p + 0x10)); + x9 = _mm_shuffle_epi8(x9, shuf_mask); /* endianness swap */ + x10 = _mm_loadu_si128((__m128i *)(p + 0x20)); + x10 = _mm_shuffle_epi8(x10, shuf_mask); /* endianness swap */ + x11 = _mm_loadu_si128((__m128i *)(p + 0x30)); + x11 = _mm_shuffle_epi8(x11, shuf_mask); /* endianness swap */ + x0 = _mm_xor_si128(x0, x4); + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_xor_si128(x2, x6); + x3 = _mm_xor_si128(x3, x7); + x0 = _mm_xor_si128(x0, x8); + x1 = _mm_xor_si128(x1, x9); + x2 = _mm_xor_si128(x2, x10); + x3 = _mm_xor_si128(x3, x11); + + p += CRC32_FOLDING_BLOCK_SIZE * 4; + nr -= CRC32_FOLDING_BLOCK_SIZE * 4; + } + + k = _mm_loadu_si128((__m128i *)consts->k3k4); + /* fold 4 to 1, [x1, x2, x3] -> x0 */ + x4 = _mm_clmulepi64_si128(x0, k, 0x00); + x0 = _mm_clmulepi64_si128(x0, k, 0x11); + x0 = _mm_xor_si128(x0, x1); + x0 = _mm_xor_si128(x0, x4); + x4 = _mm_clmulepi64_si128(x0, k, 0x00); + x0 = _mm_clmulepi64_si128(x0, k, 0x11); + x0 = _mm_xor_si128(x0, x2); + x0 = _mm_xor_si128(x0, x4); + x4 = _mm_clmulepi64_si128(x0, k, 0x00); + x0 = _mm_clmulepi64_si128(x0, k, 0x11); + x0 = _mm_xor_si128(x0, x3); + x0 = _mm_xor_si128(x0, x4); + } + + k = _mm_loadu_si128((__m128i *)consts->k3k4); + /* folding by 1 */ + while (nr >= CRC32_FOLDING_BLOCK_SIZE) { + /* load next to x2, fold to x0, x1 */ + x2 = _mm_loadu_si128((__m128i *)(p + 0x00)); + x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */ + x1 = _mm_clmulepi64_si128(x0, k, 0x00); + x0 = _mm_clmulepi64_si128(x0, k, 0x11); + x0 = _mm_xor_si128(x0, x2); + x0 = _mm_xor_si128(x0, x1); + p += CRC32_FOLDING_BLOCK_SIZE; + nr -= CRC32_FOLDING_BLOCK_SIZE; + } + + /* reduce 128-bits(final fold) to 96-bits */ + k = _mm_loadu_si128((__m128i*)consts->k5k6); + x1 = _mm_clmulepi64_si128(x0, k, 0x11); + x0 = _mm_slli_si128(x0, 8); + x0 = _mm_srli_si128(x0, 4); + x0 = _mm_xor_si128(x0, x1); + /* reduce 96-bits to 64-bits */ + x1 = _mm_clmulepi64_si128(x0, k, 0x01); + x0 = _mm_xor_si128(x0, x1); + + /* barrett reduction */ + k = _mm_loadu_si128((__m128i*)consts->uPx); + x1 = _mm_move_epi64(x0); + x1 = _mm_srli_si128(x1, 4); + x1 = _mm_clmulepi64_si128(x1, k, 0x00); + x1 = _mm_srli_si128(x1, 4); + x1 = _mm_clmulepi64_si128(x1, k, 0x10); + x0 = _mm_xor_si128(x1, x0); + *crc = _mm_extract_epi32(x0, 0); + return (nr_in - nr); /* the nr processed */ +} + +/* PCLMUL version of relfected crc32 */ +ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)); +size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts) +{ + size_t nr_in = nr; + __m128i x0, x1, x2, k; + + if (nr < CRC32_FOLDING_BLOCK_SIZE) { + return 0; + } + + x0 = _mm_loadu_si128((__m128i *)(p + 0x00)); + x0 = _mm_xor_si128(x0, _mm_cvtsi32_si128(*crc)); + p += CRC32_FOLDING_BLOCK_SIZE; + nr -= CRC32_FOLDING_BLOCK_SIZE; + if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) { + __m128i x3, x4; + + x1 = _mm_loadu_si128((__m128i *)(p + 0x00)); + x2 = _mm_loadu_si128((__m128i *)(p + 0x10)); + x3 = _mm_loadu_si128((__m128i *)(p + 0x20)); + p += CRC32_FOLDING_BLOCK_SIZE * 3; + nr -= CRC32_FOLDING_BLOCK_SIZE * 3; + + k = _mm_loadu_si128((__m128i *)consts->k1k2); + /* parallel folding by 4 */ + while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) { + __m128i x5, x6, x7, x8, x9, x10, x11; + x4 = _mm_clmulepi64_si128(x0, k, 0x00); + x5 = _mm_clmulepi64_si128(x1, k, 0x00); + x6 = _mm_clmulepi64_si128(x2, k, 0x00); + x7 = _mm_clmulepi64_si128(x3, k, 0x00); + x0 = _mm_clmulepi64_si128(x0, k, 0x11); + x1 = _mm_clmulepi64_si128(x1, k, 0x11); + x2 = _mm_clmulepi64_si128(x2, k, 0x11); + x3 = _mm_clmulepi64_si128(x3, k, 0x11); + x8 = _mm_loadu_si128((__m128i *)(p + 0x00)); + x9 = _mm_loadu_si128((__m128i *)(p + 0x10)); + x10 = _mm_loadu_si128((__m128i *)(p + 0x20)); + x11 = _mm_loadu_si128((__m128i *)(p + 0x30)); + x0 = _mm_xor_si128(x0, x4); + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_xor_si128(x2, x6); + x3 = _mm_xor_si128(x3, x7); + x0 = _mm_xor_si128(x0, x8); + x1 = _mm_xor_si128(x1, x9); + x2 = _mm_xor_si128(x2, x10); + x3 = _mm_xor_si128(x3, x11); + + p += CRC32_FOLDING_BLOCK_SIZE * 4; + nr -= CRC32_FOLDING_BLOCK_SIZE * 4; + } + + k = _mm_loadu_si128((__m128i *)consts->k3k4); + /* fold 4 to 1, [x1, x2, x3] -> x0 */ + x4 = _mm_clmulepi64_si128(x0, k, 0x00); + x0 = _mm_clmulepi64_si128(x0, k, 0x11); + x0 = _mm_xor_si128(x0, x1); + x0 = _mm_xor_si128(x0, x4); + x4 = _mm_clmulepi64_si128(x0, k, 0x00); + x0 = _mm_clmulepi64_si128(x0, k, 0x11); + x0 = _mm_xor_si128(x0, x2); + x0 = _mm_xor_si128(x0, x4); + x4 = _mm_clmulepi64_si128(x0, k, 0x00); + x0 = _mm_clmulepi64_si128(x0, k, 0x11); + x0 = _mm_xor_si128(x0, x3); + x0 = _mm_xor_si128(x0, x4); + } + + k = _mm_loadu_si128((__m128i *)consts->k3k4); + /* folding by 1 */ + while (nr >= CRC32_FOLDING_BLOCK_SIZE) { + /* load next to x2, fold to x0, x1 */ + x2 = _mm_loadu_si128((__m128i *)(p + 0x00)); + x1 = _mm_clmulepi64_si128(x0, k, 0x00); + x0 = _mm_clmulepi64_si128(x0, k, 0x11); + x0 = _mm_xor_si128(x0, x2); + x0 = _mm_xor_si128(x0, x1); + p += CRC32_FOLDING_BLOCK_SIZE; + nr -= CRC32_FOLDING_BLOCK_SIZE; + } + + /* reduce 128-bits(final fold) to 96-bits */ + x1 = _mm_clmulepi64_si128(x0, k, 0x10); + x0 = _mm_srli_si128(x0, 8); + x0 = _mm_xor_si128(x0, x1); + /* reduce 96-bits to 64-bits */ + x1 = _mm_shuffle_epi32(x0, 0xfc); + x0 = _mm_shuffle_epi32(x0, 0xf9); + k = _mm_loadu_si128((__m128i*)consts->k5k6); + x1 = _mm_clmulepi64_si128(x1, k, 0x00); + x0 = _mm_xor_si128(x0, x1); + + /* barrett reduction */ + x1 = _mm_shuffle_epi32(x0, 0xf3); + x0 = _mm_slli_si128(x0, 4); + k = _mm_loadu_si128((__m128i*)consts->uPx); + x1 = _mm_clmulepi64_si128(x1, k, 0x00); + x1 = _mm_clmulepi64_si128(x1, k, 0x10); + x0 = _mm_xor_si128(x1, x0); + *crc = _mm_extract_epi32(x0, 2); + return (nr_in - nr); /* the nr processed */ +} + +# if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE +size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) +# else /* ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER */ +size_t crc32_sse42_pclmul_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) +# endif +{ + if (type > X86_CRC32_MAX) { + return 0; + } + const crc32_pclmul_consts *consts = &crc32_pclmul_consts_maps[type]; + + switch (type) { + case X86_CRC32: + return crc32_pclmul_batch(crc, p, nr, consts); + case X86_CRC32B: + case X86_CRC32C: + return crc32_pclmul_reflected_batch(crc, p, nr, consts); + default: + return 0; + } +} +#endif + +#if ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER +static size_t crc32_x86_simd_update_default(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) +{ + return 0; +} + +# if ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO +size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) __attribute__((ifunc("resolve_crc32_x86_simd_update"))); + +typedef size_t (*crc32_x86_simd_func_t)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr); + +ZEND_NO_SANITIZE_ADDRESS +ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */ +static crc32_x86_simd_func_t resolve_crc32_x86_simd_update() { + if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) { + return crc32_sse42_pclmul_update; + } + return crc32_x86_simd_update_default; +} +# else /* ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR */ +static size_t (*crc32_x86_simd_ptr)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) = crc32_x86_simd_update_default; + +size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) { + return crc32_x86_simd_ptr(type, crc, p, nr); +} + +/* {{{ PHP_MINIT_FUNCTION */ +PHP_MINIT_FUNCTION(crc32_x86_intrin) +{ + if (zend_cpu_supports(ZEND_CPU_FEATURE_SSE42) && zend_cpu_supports(ZEND_CPU_FEATURE_PCLMULQDQ)) { + crc32_x86_simd_ptr = crc32_sse42_pclmul_update; + } + return SUCCESS; +} +/* }}} */ +# endif +#endif diff --git a/ext/standard/crc32_x86.h b/ext/standard/crc32_x86.h new file mode 100644 index 00000000000..6420030dbf5 --- /dev/null +++ b/ext/standard/crc32_x86.h @@ -0,0 +1,52 @@ +/* + +----------------------------------------------------------------------+ + | Copyright (c) The PHP Group | + +----------------------------------------------------------------------+ + | This source file is subject to version 3.01 of the PHP license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | https://www.php.net/license/3_01.txt | + | If you did not receive a copy of the PHP license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@php.net so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Author: Frank Du | + +----------------------------------------------------------------------+ +*/ + +#ifndef _CRC32_X86_HEADER_H_ +#define _CRC32_X86_HEADER_H_ + +#include "php.h" + +typedef enum { + /* polynomial: 0x04C11DB7, used by bzip */ + X86_CRC32 = 0, + /* + polynomial: 0x04C11DB7 with reversed ordering, + used by ethernet (IEEE 802.3), gzip, zip, png, etc + */ + X86_CRC32B, + /* + polynomial: 0x1EDC6F41 with reversed ordering, + used by iSCSI, SCTP, Btrfs, ext4, etc + */ + X86_CRC32C, + X86_CRC32_MAX, +} X86_CRC32_TYPE; + +#if ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR +PHP_MINIT_FUNCTION(crc32_x86_intrin); +#endif + +#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER +/* Return the size processed by SIMD routine */ +size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr); +#else +static inline size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) +{ + return 0; +} +#endif + +#endif diff --git a/ext/standard/tests/strings/crc32.phpt b/ext/standard/tests/strings/crc32.phpt index 2fa8be59d87..da86d067bf6 100644 --- a/ext/standard/tests/strings/crc32.phpt +++ b/ext/standard/tests/strings/crc32.phpt @@ -6,9 +6,37 @@ $input = array("foo", "bar", "baz", "grldsajkopallkjasd"); foreach($input AS $i) { printf("%u\n", crc32($i)); } +printf("%u\n", crc32("1234567890123456")); +printf("%u\n", crc32("1234567890123456abc")); +printf("%u\n", crc32("12345678901234561234567890123456")); +printf("%u\n", crc32("12345678901234561234567890123456abc")); +printf("%u\n", crc32("123456789012345612345678901234561234567890123456")); +printf("%u\n", crc32("123456789012345612345678901234561234567890123456abc")); +printf("%u\n", crc32("1234567890123456123456789012345612345678901234561234567890123456")); +printf("%u\n", crc32("1234567890123456123456789012345612345678901234561234567890123456abc")); +printf("%u\n", crc32("12345678901234561234567890123456123456789012345612345678901234561234567890123456")); +printf("%u\n", crc32("12345678901234561234567890123456123456789012345612345678901234561234567890123456abc")); +printf("%u\n", crc32("12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456")); +printf("%u\n", crc32("12345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc")); +printf("%u\n", crc32("123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456")); +printf("%u\n", crc32("123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456123456789012345612345678901234561234567890123456abc")); ?> --EXPECT-- 2356372769 1996459178 2015626392 824412087 +509595063 +1890929711 +156217630 +941689929 +1939457775 +2213121284 +522627406 +3806880586 +105010221 +1135881371 +640556525 +3081003972 +3751918510 +1076024698