x86_64 assembly pack: "optimize" for Knights Landing, add AVX-512 results.

"Optimize" is in quotes because it's rather a "salvage operation" for now. Idea is to identify processor capability flags that drive Knights Landing to suboptimial code paths and mask them. Two flags were identified, XSAVE and ADCX/ADOX. Former affects choice of AES-NI code path specific for Silvermont (Knights Landing is of Silvermont "ancestry"). And 64-bit ADCX/ADOX instructions are effectively mishandled at decode time. In both cases we are looking at ~2x improvement. AVX-512 results cover even Skylake-X :-) Hardware used for benchmarking courtesy of Atos, experiments run by Romain Dolbeau <romain.dolbeau@atos.net>. Kudos! Reviewed-by: Rich Salz <rsalz@openssl.org>
2024-11-28 12:35:22 +08:00 · 2017-07-20 09:48:35 +02:00 · 2017-07-20 09:48:35 +02:00 · 64d92d7498
commit 64d92d7498
parent bbb4ceb86e
8 changed files with 33 additions and 5 deletions
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@ -179,6 +179,7 @@
 # Haswell	4.44/0.63	0.63	0.73	0.63	0.70
 # Skylake	2.62/0.63	0.63	0.63	0.63
 # Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
+# Knights L	2.54/0.77	0.78	0.85	-	1.50
 # Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
 # Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
 # Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
--- a/crypto/chacha/asm/chacha-x86_64.pl
+++ b/crypto/chacha/asm/chacha-x86_64.pl
@ -24,7 +24,7 @@
 #
 # Performance in cycles per byte out of large buffer.
 #
-#		IALU/gcc 4.8(i)	1xSSSE3/SSE2	4xSSSE3	    8xAVX2
+#		IALU/gcc 4.8(i)	1xSSSE3/SSE2	4xSSSE3	    NxAVX(v)
 #
 # P4		9.48/+99%	-/22.7(ii)	-
 # Core2		7.83/+55%	7.90/8.08	4.35
@ -32,8 +32,9 @@
 # Sandy Bridge	8.31/+42%	5.45/6.76	2.72
 # Ivy Bridge	6.71/+46%	5.40/6.49	2.41
 # Haswell	5.92/+43%	5.20/6.45	2.42	    1.23
-# Skylake	5.87/+39%	4.70/-		2.31	    1.19
+# Skylake[-X]	5.87/+39%	4.70/-		2.31	    1.19[0.57]
 # Silvermont	12.0/+33%	7.75/7.40	7.03(iii)
+# Knights L	11.7/-		-		9.60(iii)   0.80
 # Goldmont	10.6/+17%	5.10/-		3.28
 # Sledgehammer	7.28/+52%	-/14.2(ii)	-
 # Bulldozer	9.66/+28%	9.85/11.1	3.06(iv)
@ -50,6 +51,7 @@
 #	limitations, SSE2 can do better, but gain is considered too
 #	low to justify the [maintenance] effort;
 # (iv)	Bulldozer actually executes 4xXOP code path that delivers 2.20;
+# (v)	8xAVX2 or 16xAVX-512, whichever best applicable;

 $flavour = shift;
 $output  = shift;
--- a/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/modes/asm/aesni-gcm-x86_64.pl
@ -35,6 +35,8 @@
 # Applications using the EVP interface will observe a few percent
 # worse performance.]
 #
+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
+#
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@ -74,6 +74,7 @@
 # Skylake	0.44(+110%)(if system doesn't support AVX)
 # Bulldozer	1.49(+27%)
 # Silvermont	2.88(+13%)
+# Knights L	2.12(-)    (if system doesn't support AVX)
 # Goldmont	1.08(+24%)

 # March 2013
@ -86,6 +87,8 @@
 # it performs in 0.41 cycles per byte on Haswell processor, in
 # 0.29 on Broadwell, and in 0.36 on Skylake.
 #
+# Knights Landing achieves 1.09 cpb.
+#
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

 $flavour = shift;
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@ -27,14 +27,15 @@
 # Numbers are cycles per processed byte with poly1305_blocks alone,
 # measured with rdtsc at fixed clock frequency.
 #
-#		IALU/gcc-4.8(*)	AVX(**)		AVX2
+#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
 # P4		4.46/+120%	-
 # Core 2	2.41/+90%	-
 # Westmere	1.88/+120%	-
 # Sandy Bridge	1.39/+140%	1.10
 # Haswell	1.14/+175%	1.11		0.65
-# Skylake	1.13/+120%	0.96		0.51
+# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
 # Silvermont	2.83/+95%	-
+# Knights L	3.60/-		1.65		1.10	(***)
 # Goldmont	1.70/+180%	-
 # VIA Nano	1.82/+150%	-
 # Sledgehammer	1.38/+160%	-
@ -49,6 +50,8 @@
 #	Core processors, 50-30%, less newer processor is, but slower on
 #	contemporary ones, for example almost 2x slower on Atom, and as
 #	former are naturally disappearing, SSE2 is deemed unnecessary;
+# (***)	Current AVX-512 code requires BW and VL extensions and can not
+#	execute on Knights Landing;

 $flavour = shift;
 $output  = shift;
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@ -86,6 +86,7 @@
 # VIA Nano	9.32		7.15/+30%
 # Atom		10.3		9.17/+12%
 # Silvermont	13.1(*)		9.37/+40%
+# Knights L	13.2(*)		9.68/+36%	8.30/+59%
 # Goldmont	8.13		6.42/+27%	1.70/+380%(**)
 #
 # (*)	obviously suboptimal result, nothing was done about it,
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl
@ -99,6 +99,7 @@
 # VIA Nano	23.0	16.5(+39%)  -		    14.7    -
 # Atom		23.0	18.9(+22%)  -		    14.7    -
 # Silvermont	27.4	20.6(+33%)  -               17.5    -
+# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
 # Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
 #
 # (*)	whichever best applicable, including SHAEXT;
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@ -145,8 +145,19 @@ OPENSSL_ia32_cpuid:
 	or	\$0x40000000,%edx	# set reserved bit#30 on Intel CPUs
 	and	\$15,%ah
 	cmp	\$15,%ah		# examine Family ID
-	jne	.Lnotintel
+	jne	.LnotP4
 	or	\$0x00100000,%edx	# set reserved bit#20 to engage RC4_CHAR
+.LnotP4:
+	cmp	\$6,%ah
+	jne	.Lnotintel
+	and	\$0x0ffff0f0,%eax
+	cmp	\$0x00050670,%eax	# Knights Landing
+	je	.Lknights
+	cmp	\$0x00080650,%eax	# Knights Mill (according to sde)
+	jne	.Lnotintel
+.Lknights:
+	and	\$0xfbffffff,%ecx	# clear XSAVE flag to mimic Silvermont
+
 .Lnotintel:
 	bt	\$28,%edx		# test hyper-threading bit
 	jnc	.Lgeneric
@ -171,6 +182,10 @@ OPENSSL_ia32_cpuid:
 	mov	\$7,%eax
 	xor	%ecx,%ecx
 	cpuid
+	bt	\$26,%r9d		# check XSAVE bit, cleared on Knights
+	jc	.Lnotknights
+	and	\$0xfff7ffff,%ebx	# clear ADCX/ADOX flag
+.Lnotknights:
 	mov	%ebx,8(%rdi)		# save extended feature flags
 .Lno_extended_info: