Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto update from Herbert Xu:

 - Do not idle omap device between crypto operations in one session.
 - Added sha224/sha384 shims for SSSE3.
 - More optimisations for camellia-aesni-avx2.
 - Removed defunct blowfish/twofish AVX2 implementations.
 - Added unaligned buffer self-tests.
 - Added PCLMULQDQ optimisation for CRCT10DIF.
 - Added support for Freescale's DCP co-processor
 - Misc fixes.

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (44 commits)
  crypto: testmgr - test hash implementations with unaligned buffers
  crypto: testmgr - test AEADs with unaligned buffers
  crypto: testmgr - test skciphers with unaligned buffers
  crypto: testmgr - check that entries in alg_test_descs are in correct order
  Revert "crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher"
  Revert "crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher"
  crypto: camellia-aesni-avx2 - tune assembly code for more performance
  hwrng: bcm2835 - fix MODULE_LICENSE tag
  hwrng: nomadik - use clk_prepare_enable()
  crypto: picoxcell - replace strict_strtoul() with kstrtoul()
  crypto: dcp - Staticize local symbols
  crypto: dcp - Use NULL instead of 0
  crypto: dcp - Use devm_* APIs
  crypto: dcp - Remove redundant platform_set_drvdata()
  hwrng: use platform_{get,set}_drvdata()
  crypto: omap-aes - Don't idle/start AES device between Encrypt operations
  crypto: crct10dif - Use PTR_RET
  crypto: ux500 - Cocci spatch "resource_size.spatch"
  crypto: sha256_ssse3 - add sha224 support
  crypto: sha512_ssse3 - add sha384 support
  ...
2025-01-18 20:04:16 +08:00 · 2013-07-05 12:12:33 -07:00 · 2013-07-05 12:12:33 -07:00 · b2c311075d
commit b2c311075d
parent 45175476ae 02c0241b60
48 changed files with 2541 additions and 2563 deletions
--- a/arch/arm/boot/dts/imx28.dtsi
+++ b/arch/arm/boot/dts/imx28.dtsi
@@ -736,7 +736,7 @@
 			dcp@80028000 {
 				reg = <0x80028000 0x2000>;
 				interrupts = <52 53 54>;
-				status = "disabled";
+				compatible = "fsl-dcp";
 			};

 			pxp@8002a000 {
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -3,8 +3,6 @@
 #

 avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
-avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
-					$(comma)4)$(comma)%ymm2,yes,no)

 obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
 obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@@ -29,6 +27,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
 obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o

 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
@@ -42,10 +41,8 @@ endif

 # These modules require assembler to support AVX2.
 ifeq ($(avx2_supported),yes)
-	obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
 	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
-	obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
 endif

 aes-i586-y := aes-i586-asm_32.o aes_glue.o
@@ -73,10 +70,8 @@ ifeq ($(avx_supported),yes)
 endif

 ifeq ($(avx2_supported),yes)
-	blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
 	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
-	twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
 endif

 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
@@ -87,3 +82,4 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
 crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
 sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
--- a/arch/x86/crypto/blowfish-avx2-asm_64.S
+++ b/arch/x86/crypto/blowfish-avx2-asm_64.S
@@ -1,449 +0,0 @@
-/*
- * x86_64/AVX2 assembler optimized version of Blowfish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-#include <linux/linkage.h>
-
-.file "blowfish-avx2-asm_64.S"
-
-.data
-.align 32
-
-.Lprefetch_mask:
-.long 0*64
-.long 1*64
-.long 2*64
-.long 3*64
-.long 4*64
-.long 5*64
-.long 6*64
-.long 7*64
-
-.Lbswap32_mask:
-.long 0x00010203
-.long 0x04050607
-.long 0x08090a0b
-.long 0x0c0d0e0f
-
-.Lbswap128_mask:
-	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.Lbswap_iv_mask:
-	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
-
-.text
-/* structure of crypto context */
-#define p	0
-#define s0	((16 + 2) * 4)
-#define s1	((16 + 2 + (1 * 256)) * 4)
-#define s2	((16 + 2 + (2 * 256)) * 4)
-#define s3	((16 + 2 + (3 * 256)) * 4)
-
-/* register macros */
-#define CTX	%rdi
-#define RIO	 %rdx
-
-#define RS0	%rax
-#define RS1	%r8
-#define RS2	%r9
-#define RS3	%r10
-
-#define RLOOP	%r11
-#define RLOOPd	%r11d
-
-#define RXr0	%ymm8
-#define RXr1	%ymm9
-#define RXr2	%ymm10
-#define RXr3	%ymm11
-#define RXl0	%ymm12
-#define RXl1	%ymm13
-#define RXl2	%ymm14
-#define RXl3	%ymm15
-
-/* temp regs */
-#define RT0	%ymm0
-#define RT0x	%xmm0
-#define RT1	%ymm1
-#define RT1x	%xmm1
-#define RIDX0	%ymm2
-#define RIDX1	%ymm3
-#define RIDX1x	%xmm3
-#define RIDX2	%ymm4
-#define RIDX3	%ymm5
-
-/* vpgatherdd mask and '-1' */
-#define RNOT	%ymm6
-
-/* byte mask, (-1 >> 24) */
-#define RBYTE	%ymm7
-
-/***********************************************************************
- * 32-way AVX2 blowfish
- ***********************************************************************/
-#define F(xl, xr) \
-	vpsrld $24, xl, RIDX0; \
-	vpsrld $16, xl, RIDX1; \
-	vpsrld $8, xl, RIDX2; \
-	vpand RBYTE, RIDX1, RIDX1; \
-	vpand RBYTE, RIDX2, RIDX2; \
-	vpand RBYTE, xl, RIDX3; \
-	\
-	vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpcmpeqd RIDX0, RIDX0, RIDX0; \
-	\
-	vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
-	vpcmpeqd RIDX1, RIDX1, RIDX1; \
-	vpaddd RT0, RT1, RT0; \
-	\
-	vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
-	vpxor RT0, RT1, RT0; \
-	\
-	vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpaddd RT0, RT1, RT0; \
-	\
-	vpxor RT0, xr, xr;
-
-#define add_roundkey(xl, nmem) \
-	vpbroadcastd nmem, RT0; \
-	vpxor RT0, xl ## 0, xl ## 0; \
-	vpxor RT0, xl ## 1, xl ## 1; \
-	vpxor RT0, xl ## 2, xl ## 2; \
-	vpxor RT0, xl ## 3, xl ## 3;
-
-#define round_enc() \
-	add_roundkey(RXr, p(CTX,RLOOP,4)); \
-	F(RXl0, RXr0); \
-	F(RXl1, RXr1); \
-	F(RXl2, RXr2); \
-	F(RXl3, RXr3); \
-	\
-	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
-	F(RXr0, RXl0); \
-	F(RXr1, RXl1); \
-	F(RXr2, RXl2); \
-	F(RXr3, RXl3);
-
-#define round_dec() \
-	add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
-	F(RXl0, RXr0); \
-	F(RXl1, RXr1); \
-	F(RXl2, RXr2); \
-	F(RXl3, RXr3); \
-	\
-	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
-	F(RXr0, RXl0); \
-	F(RXr1, RXl1); \
-	F(RXr2, RXl2); \
-	F(RXr3, RXl3);
-
-#define init_round_constants() \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	leaq s0(CTX), RS0; \
-	leaq s1(CTX), RS1; \
-	leaq s2(CTX), RS2; \
-	leaq s3(CTX), RS3; \
-	vpsrld $24, RNOT, RBYTE;
-
-#define transpose_2x2(x0, x1, t0) \
-	vpunpckldq x0, x1, t0; \
-	vpunpckhdq x0, x1, x1; \
-	\
-	vpunpcklqdq t0, x1, x0; \
-	vpunpckhqdq t0, x1, x1;
-
-#define read_block(xl, xr) \
-	vbroadcasti128 .Lbswap32_mask, RT1; \
-	\
-	vpshufb RT1, xl ## 0, xl ## 0; \
-	vpshufb RT1, xr ## 0, xr ## 0; \
-	vpshufb RT1, xl ## 1, xl ## 1; \
-	vpshufb RT1, xr ## 1, xr ## 1; \
-	vpshufb RT1, xl ## 2, xl ## 2; \
-	vpshufb RT1, xr ## 2, xr ## 2; \
-	vpshufb RT1, xl ## 3, xl ## 3; \
-	vpshufb RT1, xr ## 3, xr ## 3; \
-	\
-	transpose_2x2(xl ## 0, xr ## 0, RT0); \
-	transpose_2x2(xl ## 1, xr ## 1, RT0); \
-	transpose_2x2(xl ## 2, xr ## 2, RT0); \
-	transpose_2x2(xl ## 3, xr ## 3, RT0);
-
-#define write_block(xl, xr) \
-	vbroadcasti128 .Lbswap32_mask, RT1; \
-	\
-	transpose_2x2(xl ## 0, xr ## 0, RT0); \
-	transpose_2x2(xl ## 1, xr ## 1, RT0); \
-	transpose_2x2(xl ## 2, xr ## 2, RT0); \
-	transpose_2x2(xl ## 3, xr ## 3, RT0); \
-	\
-	vpshufb RT1, xl ## 0, xl ## 0; \
-	vpshufb RT1, xr ## 0, xr ## 0; \
-	vpshufb RT1, xl ## 1, xl ## 1; \
-	vpshufb RT1, xr ## 1, xr ## 1; \
-	vpshufb RT1, xl ## 2, xl ## 2; \
-	vpshufb RT1, xr ## 2, xr ## 2; \
-	vpshufb RT1, xl ## 3, xl ## 3; \
-	vpshufb RT1, xr ## 3, xr ## 3;
-
-.align 8
-__blowfish_enc_blk32:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RXl0..4, RXr0..4: plaintext
-	 * output:
-	 *	RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped)
-	 */
-	init_round_constants();
-
-	read_block(RXl, RXr);
-
-	movl $1, RLOOPd;
-	add_roundkey(RXl, p+4*(0)(CTX));
-
-.align 4
-.L__enc_loop:
-	round_enc();
-
-	leal 2(RLOOPd), RLOOPd;
-	cmpl $17, RLOOPd;
-	jne .L__enc_loop;
-
-	add_roundkey(RXr, p+4*(17)(CTX));
-
-	write_block(RXl, RXr);
-
-	ret;
-ENDPROC(__blowfish_enc_blk32)
-
-.align 8
-__blowfish_dec_blk32:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RXl0..4, RXr0..4: ciphertext
-	 * output:
-	 *	RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped)
-	 */
-	init_round_constants();
-
-	read_block(RXl, RXr);
-
-	movl $14, RLOOPd;
-	add_roundkey(RXl, p+4*(17)(CTX));
-
-.align 4
-.L__dec_loop:
-	round_dec();
-
-	addl $-2, RLOOPd;
-	jns .L__dec_loop;
-
-	add_roundkey(RXr, p+4*(0)(CTX));
-
-	write_block(RXl, RXr);
-
-	ret;
-ENDPROC(__blowfish_dec_blk32)
-
-ENTRY(blowfish_ecb_enc_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-
-	vmovdqu 0*32(%rdx), RXl0;
-	vmovdqu 1*32(%rdx), RXr0;
-	vmovdqu 2*32(%rdx), RXl1;
-	vmovdqu 3*32(%rdx), RXr1;
-	vmovdqu 4*32(%rdx), RXl2;
-	vmovdqu 5*32(%rdx), RXr2;
-	vmovdqu 6*32(%rdx), RXl3;
-	vmovdqu 7*32(%rdx), RXr3;
-
-	call __blowfish_enc_blk32;
-
-	vmovdqu RXr0, 0*32(%rsi);
-	vmovdqu RXl0, 1*32(%rsi);
-	vmovdqu RXr1, 2*32(%rsi);
-	vmovdqu RXl1, 3*32(%rsi);
-	vmovdqu RXr2, 4*32(%rsi);
-	vmovdqu RXl2, 5*32(%rsi);
-	vmovdqu RXr3, 6*32(%rsi);
-	vmovdqu RXl3, 7*32(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_ecb_enc_32way)
-
-ENTRY(blowfish_ecb_dec_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-
-	vmovdqu 0*32(%rdx), RXl0;
-	vmovdqu 1*32(%rdx), RXr0;
-	vmovdqu 2*32(%rdx), RXl1;
-	vmovdqu 3*32(%rdx), RXr1;
-	vmovdqu 4*32(%rdx), RXl2;
-	vmovdqu 5*32(%rdx), RXr2;
-	vmovdqu 6*32(%rdx), RXl3;
-	vmovdqu 7*32(%rdx), RXr3;
-
-	call __blowfish_dec_blk32;
-
-	vmovdqu RXr0, 0*32(%rsi);
-	vmovdqu RXl0, 1*32(%rsi);
-	vmovdqu RXr1, 2*32(%rsi);
-	vmovdqu RXl1, 3*32(%rsi);
-	vmovdqu RXr2, 4*32(%rsi);
-	vmovdqu RXl2, 5*32(%rsi);
-	vmovdqu RXr3, 6*32(%rsi);
-	vmovdqu RXl3, 7*32(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_ecb_dec_32way)
-
-ENTRY(blowfish_cbc_dec_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-
-	vmovdqu 0*32(%rdx), RXl0;
-	vmovdqu 1*32(%rdx), RXr0;
-	vmovdqu 2*32(%rdx), RXl1;
-	vmovdqu 3*32(%rdx), RXr1;
-	vmovdqu 4*32(%rdx), RXl2;
-	vmovdqu 5*32(%rdx), RXr2;
-	vmovdqu 6*32(%rdx), RXl3;
-	vmovdqu 7*32(%rdx), RXr3;
-
-	call __blowfish_dec_blk32;
-
-	/* xor with src */
-	vmovq (%rdx), RT0x;
-	vpshufd $0x4f, RT0x, RT0x;
-	vinserti128 $1, 8(%rdx), RT0, RT0;
-	vpxor RT0, RXr0, RXr0;
-	vpxor 0*32+24(%rdx), RXl0, RXl0;
-	vpxor 1*32+24(%rdx), RXr1, RXr1;
-	vpxor 2*32+24(%rdx), RXl1, RXl1;
-	vpxor 3*32+24(%rdx), RXr2, RXr2;
-	vpxor 4*32+24(%rdx), RXl2, RXl2;
-	vpxor 5*32+24(%rdx), RXr3, RXr3;
-	vpxor 6*32+24(%rdx), RXl3, RXl3;
-
-	vmovdqu RXr0, (0*32)(%rsi);
-	vmovdqu RXl0, (1*32)(%rsi);
-	vmovdqu RXr1, (2*32)(%rsi);
-	vmovdqu RXl1, (3*32)(%rsi);
-	vmovdqu RXr2, (4*32)(%rsi);
-	vmovdqu RXl2, (5*32)(%rsi);
-	vmovdqu RXr3, (6*32)(%rsi);
-	vmovdqu RXl3, (7*32)(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_cbc_dec_32way)
-
-ENTRY(blowfish_ctr_32way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: iv (big endian, 64bit)
-	 */
-
-	vzeroupper;
-
-	vpcmpeqd RT0, RT0, RT0;
-	vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */
-
-	vpcmpeqd RT1x, RT1x, RT1x;
-	vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */
-	vpxor RIDX0, RIDX0, RIDX0;
-	vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */
-
-	vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */
-
-	vpcmpeqd RT1, RT1, RT1;
-	vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */
-	vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */
-
-	vbroadcasti128 .Lbswap_iv_mask, RIDX0;
-	vbroadcasti128 .Lbswap128_mask, RIDX1;
-
-	/* load IV and byteswap */
-	vmovq (%rcx), RT1x;
-	vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */
-	vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */
-
-	/* construct IVs */
-	vpsubq RT0, RT1, RT1;		/* a: le1, b: le0, c: le3, d: le2 */
-	vpshufb RIDX1, RT1, RXl0;	/* a: be0, b: be1, c: be2, d: be3 */
-	vpsubq RIDX2, RT1, RT1;		/* le5, le4, le7, le6 */
-	vpshufb RIDX1, RT1, RXr0;	/* be4, be5, be6, be7 */
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXl1;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXr1;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXl2;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXr2;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXl3;
-	vpsubq RIDX2, RT1, RT1;
-	vpshufb RIDX1, RT1, RXr3;
-
-	/* store last IV */
-	vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */
-	vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */
-	vmovq RT1x, (%rcx);
-
-	call __blowfish_enc_blk32;
-
-	/* dst = src ^ iv */
-	vpxor 0*32(%rdx), RXr0, RXr0;
-	vpxor 1*32(%rdx), RXl0, RXl0;
-	vpxor 2*32(%rdx), RXr1, RXr1;
-	vpxor 3*32(%rdx), RXl1, RXl1;
-	vpxor 4*32(%rdx), RXr2, RXr2;
-	vpxor 5*32(%rdx), RXl2, RXl2;
-	vpxor 6*32(%rdx), RXr3, RXr3;
-	vpxor 7*32(%rdx), RXl3, RXl3;
-	vmovdqu RXr0, (0*32)(%rsi);
-	vmovdqu RXl0, (1*32)(%rsi);
-	vmovdqu RXr1, (2*32)(%rsi);
-	vmovdqu RXl1, (3*32)(%rsi);
-	vmovdqu RXr2, (4*32)(%rsi);
-	vmovdqu RXl2, (5*32)(%rsi);
-	vmovdqu RXr3, (6*32)(%rsi);
-	vmovdqu RXl3, (7*32)(%rsi);
-
-	vzeroupper;
-
-	ret;
-ENDPROC(blowfish_ctr_32way)
--- a/arch/x86/crypto/blowfish_avx2_glue.c
+++ b/arch/x86/crypto/blowfish_avx2_glue.c
@@ -1,585 +0,0 @@
-/*
- * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
- *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/crypto.h>
-#include <linux/err.h>
-#include <crypto/algapi.h>
-#include <crypto/blowfish.h>
-#include <crypto/cryptd.h>
-#include <crypto/ctr.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
-#include <asm/crypto/blowfish.h>
-#include <asm/crypto/ablk_helper.h>
-#include <crypto/scatterwalk.h>
-
-#define BF_AVX2_PARALLEL_BLOCKS 32
-
-/* 32-way AVX2 parallel cipher functions */
-asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst,
-				       const u8 *src);
-asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst,
-				       const u8 *src);
-asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst,
-				       const u8 *src);
-asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src,
-				   __be64 *iv);
-
-static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes)
-{
-	if (fpu_enabled)
-		return true;
-
-	/* FPU is only used when chunk to be processed is large enough, so
-	 * do not enable FPU until it is necessary.
-	 */
-	if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS)
-		return false;
-
-	kernel_fpu_begin();
-	return true;
-}
-
-static inline void bf_fpu_end(bool fpu_enabled)
-{
-	if (fpu_enabled)
-		kernel_fpu_end();
-}
-
-static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
-		     bool enc)
-{
-	bool fpu_enabled = false;
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes;
-	int err;
-
-	err = blkcipher_walk_virt(desc, walk);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk->nbytes)) {
-		u8 *wsrc = walk->src.virt.addr;
-		u8 *wdst = walk->dst.virt.addr;
-
-		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
-
-		/* Process multi-block AVX2 batch */
-		if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
-			do {
-				if (enc)
-					blowfish_ecb_enc_32way(ctx, wdst, wsrc);
-				else
-					blowfish_ecb_dec_32way(ctx, wdst, wsrc);
-
-				wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS;
-				wdst += bsize * BF_AVX2_PARALLEL_BLOCKS;
-				nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
-			} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
-
-			if (nbytes < bsize)
-				goto done;
-		}
-
-		/* Process multi-block batch */
-		if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
-			do {
-				if (enc)
-					blowfish_enc_blk_4way(ctx, wdst, wsrc);
-				else
-					blowfish_dec_blk_4way(ctx, wdst, wsrc);
-
-				wsrc += bsize * BF_PARALLEL_BLOCKS;
-				wdst += bsize * BF_PARALLEL_BLOCKS;
-				nbytes -= bsize * BF_PARALLEL_BLOCKS;
-			} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
-
-			if (nbytes < bsize)
-				goto done;
-		}
-
-		/* Handle leftovers */
-		do {
-			if (enc)
-				blowfish_enc_blk(ctx, wdst, wsrc);
-			else
-				blowfish_dec_blk(ctx, wdst, wsrc);
-
-			wsrc += bsize;
-			wdst += bsize;
-			nbytes -= bsize;
-		} while (nbytes >= bsize);
-
-done:
-		err = blkcipher_walk_done(desc, walk, nbytes);
-	}
-
-	bf_fpu_end(fpu_enabled);
-	return err;
-}
-
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, true);
-}
-
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, false);
-}
-
-static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u64 *src = (u64 *)walk->src.virt.addr;
-	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 *iv = (u64 *)walk->iv;
-
-	do {
-		*dst = *src ^ *iv;
-		blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
-		iv = dst;
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-	*(u64 *)walk->iv = *iv;
-	return nbytes;
-}
-
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
-
-	while ((nbytes = walk.nbytes)) {
-		nbytes = __cbc_encrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	return err;
-}
-
-static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u64 *src = (u64 *)walk->src.virt.addr;
-	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 last_iv;
-	int i;
-
-	/* Start of the last block. */
-	src += nbytes / bsize - 1;
-	dst += nbytes / bsize - 1;
-
-	last_iv = *src;
-
-	/* Process multi-block AVX2 batch */
-	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
-		do {
-			nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1);
-			src -= BF_AVX2_PARALLEL_BLOCKS - 1;
-			dst -= BF_AVX2_PARALLEL_BLOCKS - 1;
-
-			blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src);
-
-			nbytes -= bsize;
-			if (nbytes < bsize)
-				goto done;
-
-			*dst ^= *(src - 1);
-			src -= 1;
-			dst -= 1;
-		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Process multi-block batch */
-	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
-		u64 ivs[BF_PARALLEL_BLOCKS - 1];
-
-		do {
-			nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1);
-			src -= BF_PARALLEL_BLOCKS - 1;
-			dst -= BF_PARALLEL_BLOCKS - 1;
-
-			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
-				ivs[i] = src[i];
-
-			blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
-
-			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
-				dst[i + 1] ^= ivs[i];
-
-			nbytes -= bsize;
-			if (nbytes < bsize)
-				goto done;
-
-			*dst ^= *(src - 1);
-			src -= 1;
-			dst -= 1;
-		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	for (;;) {
-		blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
-
-		nbytes -= bsize;
-		if (nbytes < bsize)
-			break;
-
-		*dst ^= *(src - 1);
-		src -= 1;
-		dst -= 1;
-	}
-
-done:
-	*dst ^= *(u64 *)walk->iv;
-	*(u64 *)walk->iv = last_iv;
-
-	return nbytes;
-}
-
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	bool fpu_enabled = false;
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk.nbytes)) {
-		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
-		nbytes = __cbc_decrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	bf_fpu_end(fpu_enabled);
-	return err;
-}
-
-static void ctr_crypt_final(struct blkcipher_desc *desc,
-			    struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	u8 *ctrblk = walk->iv;
-	u8 keystream[BF_BLOCK_SIZE];
-	u8 *src = walk->src.virt.addr;
-	u8 *dst = walk->dst.virt.addr;
-	unsigned int nbytes = walk->nbytes;
-
-	blowfish_enc_blk(ctx, keystream, ctrblk);
-	crypto_xor(keystream, src, nbytes);
-	memcpy(dst, keystream, nbytes);
-
-	crypto_inc(ctrblk, BF_BLOCK_SIZE);
-}
-
-static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
-				struct blkcipher_walk *walk)
-{
-	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = BF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u64 *src = (u64 *)walk->src.virt.addr;
-	u64 *dst = (u64 *)walk->dst.virt.addr;
-	int i;
-
-	/* Process multi-block AVX2 batch */
-	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
-		do {
-			blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src,
-					   (__be64 *)walk->iv);
-
-			src += BF_AVX2_PARALLEL_BLOCKS;
-			dst += BF_AVX2_PARALLEL_BLOCKS;
-			nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
-		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Process four block batch */
-	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
-		__be64 ctrblocks[BF_PARALLEL_BLOCKS];
-		u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
-
-		do {
-			/* create ctrblks for parallel encrypt */
-			for (i = 0; i < BF_PARALLEL_BLOCKS; i++) {
-				if (dst != src)
-					dst[i] = src[i];
-
-				ctrblocks[i] = cpu_to_be64(ctrblk++);
-			}
-
-			blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
-						  (u8 *)ctrblocks);
-
-			src += BF_PARALLEL_BLOCKS;
-			dst += BF_PARALLEL_BLOCKS;
-			nbytes -= bsize * BF_PARALLEL_BLOCKS;
-		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
-
-		*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	do {
-		u64 ctrblk;
-
-		if (dst != src)
-			*dst = *src;
-
-		ctrblk = *(u64 *)walk->iv;
-		be64_add_cpu((__be64 *)walk->iv, 1);
-
-		blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
-
-		src += 1;
-		dst += 1;
-	} while ((nbytes -= bsize) >= bsize);
-
-done:
-	return nbytes;
-}
-
-static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		     struct scatterlist *src, unsigned int nbytes)
-{
-	bool fpu_enabled = false;
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
-		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
-		nbytes = __ctr_crypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	bf_fpu_end(fpu_enabled);
-
-	if (walk.nbytes) {
-		ctr_crypt_final(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, 0);
-	}
-
-	return err;
-}
-
-static struct crypto_alg bf_algs[6] = { {
-	.cra_name		= "__ecb-blowfish-avx2",
-	.cra_driver_name	= "__driver-ecb-blowfish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= BF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct bf_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.setkey		= blowfish_setkey,
-			.encrypt	= ecb_encrypt,
-			.decrypt	= ecb_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "__cbc-blowfish-avx2",
-	.cra_driver_name	= "__driver-cbc-blowfish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= BF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct bf_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.setkey		= blowfish_setkey,
-			.encrypt	= cbc_encrypt,
-			.decrypt	= cbc_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "__ctr-blowfish-avx2",
-	.cra_driver_name	= "__driver-ctr-blowfish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct bf_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.ivsize		= BF_BLOCK_SIZE,
-			.setkey		= blowfish_setkey,
-			.encrypt	= ctr_crypt,
-			.decrypt	= ctr_crypt,
-		},
-	},
-}, {
-	.cra_name		= "ecb(blowfish)",
-	.cra_driver_name	= "ecb-blowfish-avx2",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= BF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "cbc(blowfish)",
-	.cra_driver_name	= "cbc-blowfish-avx2",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= BF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.ivsize		= BF_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= __ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "ctr(blowfish)",
-	.cra_driver_name	= "ctr-blowfish-avx2",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= BF_MIN_KEY_SIZE,
-			.max_keysize	= BF_MAX_KEY_SIZE,
-			.ivsize		= BF_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_encrypt,
-			.geniv		= "chainiv",
-		},
-	},
-} };
-
-
-static int __init init(void)
-{
-	u64 xcr0;
-
-	if (!cpu_has_avx2 || !cpu_has_osxsave) {
-		pr_info("AVX2 instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
-		return -ENODEV;
-	}
-
-	return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs));
-}
-
-static void __exit fini(void)
-{
-	crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs));
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized");
-MODULE_ALIAS("blowfish");
-MODULE_ALIAS("blowfish-asm");
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -1,7 +1,7 @@
 /*
 * Glue Code for assembler optimized version of Blowfish
 *
- * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
 *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
@@ -32,24 +32,40 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <crypto/algapi.h>
-#include <asm/crypto/blowfish.h>

 /* regular block cipher functions */
 asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
 				   bool xor);
-EXPORT_SYMBOL_GPL(__blowfish_enc_blk);
-
 asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
-EXPORT_SYMBOL_GPL(blowfish_dec_blk);

 /* 4-way parallel cipher functions */
 asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
 					const u8 *src, bool xor);
-EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way);
-
 asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
 				      const u8 *src);
-EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way);
+
+static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, true);
+}
+
+static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+					 const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
+				      const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, true);
+}

 static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -51,16 +51,6 @@
 #define ymm14_x xmm14
 #define ymm15_x xmm15

-/*
- * AES-NI instructions do not support ymmX registers, so we need splitting and
- * merging.
- */
-#define vaesenclast256(zero, yreg, tmp) \
-	vextracti128 $1, yreg, tmp##_x; \
-	vaesenclast zero##_x, yreg##_x, yreg##_x; \
-	vaesenclast zero##_x, tmp##_x, tmp##_x; \
-	vinserti128 $1, tmp##_x, yreg, yreg;
-
 /**********************************************************************
  32-way camellia
 **********************************************************************/
@ -79,46 +69,70 @@
 	 * S-function with AES subbytes \
 	 */ \
 	vbroadcasti128 .Linv_shift_row, t4; \
-	vpbroadcastb .L0f0f0f0f, t7; \
-	vbroadcasti128 .Lpre_tf_lo_s1, t0; \
-	vbroadcasti128 .Lpre_tf_hi_s1, t1; \
+	vpbroadcastd .L0f0f0f0f, t7; \
+	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
+	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
+	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
+	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
 	\
 	/* AES inverse shift rows */ \
 	vpshufb t4, x0, x0; \
 	vpshufb t4, x7, x7; \
-	vpshufb t4, x1, x1; \
-	vpshufb t4, x4, x4; \
-	vpshufb t4, x2, x2; \
-	vpshufb t4, x5, x5; \
 	vpshufb t4, x3, x3; \
 	vpshufb t4, x6, x6; \
+	vpshufb t4, x2, x2; \
+	vpshufb t4, x5, x5; \
+	vpshufb t4, x1, x1; \
+	vpshufb t4, x4, x4; \
 	\
 	/* prefilter sboxes 1, 2 and 3 */ \
-	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
-	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
-	filter_8bit(x0, t0, t1, t7, t6); \
-	filter_8bit(x7, t0, t1, t7, t6); \
-	filter_8bit(x1, t0, t1, t7, t6); \
-	filter_8bit(x4, t0, t1, t7, t6); \
-	filter_8bit(x2, t0, t1, t7, t6); \
-	filter_8bit(x5, t0, t1, t7, t6); \
-	\
 	/* prefilter sbox 4 */ \
+	filter_8bit(x0, t5, t6, t7, t4); \
+	filter_8bit(x7, t5, t6, t7, t4); \
+	vextracti128 $1, x0, t0##_x; \
+	vextracti128 $1, x7, t1##_x; \
+	filter_8bit(x3, t2, t3, t7, t4); \
+	filter_8bit(x6, t2, t3, t7, t4); \
+	vextracti128 $1, x3, t3##_x; \
+	vextracti128 $1, x6, t2##_x; \
+	filter_8bit(x2, t5, t6, t7, t4); \
+	filter_8bit(x5, t5, t6, t7, t4); \
+	filter_8bit(x1, t5, t6, t7, t4); \
+	filter_8bit(x4, t5, t6, t7, t4); \
+	\
 	vpxor t4##_x, t4##_x, t4##_x; \
-	filter_8bit(x3, t2, t3, t7, t6); \
-	filter_8bit(x6, t2, t3, t7, t6); \
 	\
 	/* AES subbytes + AES shift rows */ \
+	vextracti128 $1, x2, t6##_x; \
+	vextracti128 $1, x5, t5##_x; \
+	vaesenclast t4##_x, x0##_x, x0##_x; \
+	vaesenclast t4##_x, t0##_x, t0##_x; \
+	vinserti128 $1, t0##_x, x0, x0; \
+	vaesenclast t4##_x, x7##_x, x7##_x; \
+	vaesenclast t4##_x, t1##_x, t1##_x; \
+	vinserti128 $1, t1##_x, x7, x7; \
+	vaesenclast t4##_x, x3##_x, x3##_x; \
+	vaesenclast t4##_x, t3##_x, t3##_x; \
+	vinserti128 $1, t3##_x, x3, x3; \
+	vaesenclast t4##_x, x6##_x, x6##_x; \
+	vaesenclast t4##_x, t2##_x, t2##_x; \
+	vinserti128 $1, t2##_x, x6, x6; \
+	vextracti128 $1, x1, t3##_x; \
+	vextracti128 $1, x4, t2##_x; \
 	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
 	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
-	vaesenclast256(t4, x0, t5); \
-	vaesenclast256(t4, x7, t5); \
-	vaesenclast256(t4, x1, t5); \
-	vaesenclast256(t4, x4, t5); \
-	vaesenclast256(t4, x2, t5); \
-	vaesenclast256(t4, x5, t5); \
-	vaesenclast256(t4, x3, t5); \
-	vaesenclast256(t4, x6, t5); \
+	vaesenclast t4##_x, x2##_x, x2##_x; \
+	vaesenclast t4##_x, t6##_x, t6##_x; \
+	vinserti128 $1, t6##_x, x2, x2; \
+	vaesenclast t4##_x, x5##_x, x5##_x; \
+	vaesenclast t4##_x, t5##_x, t5##_x; \
+	vinserti128 $1, t5##_x, x5, x5; \
+	vaesenclast t4##_x, x1##_x, x1##_x; \
+	vaesenclast t4##_x, t3##_x, t3##_x; \
+	vinserti128 $1, t3##_x, x1, x1; \
+	vaesenclast t4##_x, x4##_x, x4##_x; \
+	vaesenclast t4##_x, t2##_x, t2##_x; \
+	vinserti128 $1, t2##_x, x4, x4; \
 	\
 	/* postfilter sboxes 1 and 4 */ \
 	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
@ -139,22 +153,12 @@
 	/* postfilter sbox 2 */ \
 	filter_8bit(x1, t4, t5, t7, t2); \
 	filter_8bit(x4, t4, t5, t7, t2); \
+	vpxor t7, t7, t7; \
 	\
 	vpsrldq $1, t0, t1; \
 	vpsrldq $2, t0, t2; \
+	vpshufb t7, t1, t1; \
 	vpsrldq $3, t0, t3; \
-	vpsrldq $4, t0, t4; \
-	vpsrldq $5, t0, t5; \
-	vpsrldq $6, t0, t6; \
-	vpsrldq $7, t0, t7; \
-	vpbroadcastb t0##_x, t0; \
-	vpbroadcastb t1##_x, t1; \
-	vpbroadcastb t2##_x, t2; \
-	vpbroadcastb t3##_x, t3; \
-	vpbroadcastb t4##_x, t4; \
-	vpbroadcastb t6##_x, t6; \
-	vpbroadcastb t5##_x, t5; \
-	vpbroadcastb t7##_x, t7; \
 	\
 	/* P-function */ \
 	vpxor x5, x0, x0; \
@ -162,11 +166,21 @@
 	vpxor x7, x2, x2; \
 	vpxor x4, x3, x3; \
 	\
+	vpshufb t7, t2, t2; \
+	vpsrldq $4, t0, t4; \
+	vpshufb t7, t3, t3; \
+	vpsrldq $5, t0, t5; \
+	vpshufb t7, t4, t4; \
+	\
 	vpxor x2, x4, x4; \
 	vpxor x3, x5, x5; \
 	vpxor x0, x6, x6; \
 	vpxor x1, x7, x7; \
 	\
+	vpsrldq $6, t0, t6; \
+	vpshufb t7, t5, t5; \
+	vpshufb t7, t6, t6; \
+	\
 	vpxor x7, x0, x0; \
 	vpxor x4, x1, x1; \
 	vpxor x5, x2, x2; \
@ -179,12 +193,16 @@
 	\
 	/* Add key material and result to CD (x becomes new CD) */ \
 	\
-	vpxor t7, x0, x0; \
-	vpxor 4 * 32(mem_cd), x0, x0; \
-	\
 	vpxor t6, x1, x1; \
 	vpxor 5 * 32(mem_cd), x1, x1; \
 	\
+	vpsrldq $7, t0, t6; \
+	vpshufb t7, t0, t0; \
+	vpshufb t7, t6, t7; \
+	\
+	vpxor t7, x0, x0; \
+	vpxor 4 * 32(mem_cd), x0, x0; \
+	\
 	vpxor t5, x2, x2; \
 	vpxor 6 * 32(mem_cd), x2, x2; \
 	\
@ -204,7 +222,7 @@
 	vpxor 3 * 32(mem_cd), x7, x7;

 /*
- * Size optimization... with inlined roundsm16 binary would be over 5 times
+ * Size optimization... with inlined roundsm32 binary would be over 5 times
 * larger and would only marginally faster.
 */
 .align 8
@ -324,13 +342,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 */ \
 	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
 	vpxor tt0, tt0, tt0; \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpand l0, t0, t0; \
 	vpand l1, t1, t1; \
@ -340,6 +358,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
 	\
 	vpxor l4, t0, l4; \
+	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
 	vmovdqu l4, 4 * 32(l); \
 	vpxor l5, t1, l5; \
 	vmovdqu l5, 5 * 32(l); \
@ -354,14 +373,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 * rl ^= t2; \
 	 */ \
 	\
-	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpor 4 * 32(r), t0, t0; \
 	vpor 5 * 32(r), t1, t1; \
@ -373,6 +391,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vpxor 2 * 32(r), t2, t2; \
 	vpxor 3 * 32(r), t3, t3; \
 	vmovdqu t0, 0 * 32(r); \
+	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
 	vmovdqu t1, 1 * 32(r); \
 	vmovdqu t2, 2 * 32(r); \
 	vmovdqu t3, 3 * 32(r); \
@ -382,14 +401,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 * t2 &= rl; \
 	 * rr ^= rol32(t2, 1); \
 	 */ \
-	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpand 0 * 32(r), t0, t0; \
 	vpand 1 * 32(r), t1, t1; \
@ -403,6 +421,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vpxor 6 * 32(r), t2, t2; \
 	vpxor 7 * 32(r), t3, t3; \
 	vmovdqu t0, 4 * 32(r); \
+	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
 	vmovdqu t1, 5 * 32(r); \
 	vmovdqu t2, 6 * 32(r); \
 	vmovdqu t3, 7 * 32(r); \
@ -413,14 +432,13 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	 * ll ^= t0; \
 	 */ \
 	\
-	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
-	vpbroadcastb t0##_x, t3; \
+	vpshufb tt0, t0, t3; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t2; \
+	vpshufb tt0, t0, t2; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t1; \
+	vpshufb tt0, t0, t1; \
 	vpsrldq $1, t0, t0; \
-	vpbroadcastb t0##_x, t0; \
+	vpshufb tt0, t0, t0; \
 	\
 	vpor l4, t0, t0; \
 	vpor l5, t1, t1; \
--- a/arch/x86/crypto/crct10dif-pcl-asm_64.S
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@ -0,0 +1,643 @@
+########################################################################
+# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+#
+# Copyright (c) 2013, Intel Corporation
+#
+# Authors:
+#     Erdinc Ozturk <erdinc.ozturk@intel.com>
+#     Vinodh Gopal <vinodh.gopal@intel.com>
+#     James Guilford <james.guilford@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the
+#   distribution.
+#
+# * Neither the name of the Intel Corporation nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+#       Function API:
+#       UINT16 crc_t10dif_pcl(
+#               UINT16 init_crc, //initial CRC value, 16 bits
+#               const unsigned char *buf, //buffer pointer to calculate CRC on
+#               UINT64 len //buffer length in bytes (64-bit data)
+#       );
+#
+#       Reference paper titled "Fast CRC Computation for Generic
+#	Polynomials Using PCLMULQDQ Instruction"
+#       URL: http://www.intel.com/content/dam/www/public/us/en/documents
+#  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+#
+#
+
+#include <linux/linkage.h>
+
+.text
+
+#define        arg1 %rdi
+#define        arg2 %rsi
+#define        arg3 %rdx
+
+#define        arg1_low32 %edi
+
+ENTRY(crc_t10dif_pcl)
+.align 16
+
+	# adjust the 16-bit initial_crc value, scale it to 32 bits
+	shl	$16, arg1_low32
+
+	# Allocate Stack Space
+	mov     %rsp, %rcx
+	sub	$16*2, %rsp
+	# align stack to 16 byte boundary
+	and     $~(0x10 - 1), %rsp
+
+	# check if smaller than 256
+	cmp	$256, arg3
+
+	# for sizes less than 256, we can't fold 128B at a time...
+	# (despite its name, _less_than_128 is taken for any length below 256)
+	jl	_less_than_128
+
+
+	# load the initial crc value
+	movd	arg1_low32, %xmm10	# initial crc
+
+	# crc value does not need to be byte-reflected, but it needs
+	# to be moved to the high part of the register.
+	# because data will be byte-reflected and will align with
+	# initial crc at correct place.
+	pslldq	$12, %xmm10
+
+	movdqa  SHUF_MASK(%rip), %xmm11
+	# receive the initial 128B of data, xor the initial crc value
+	movdqu	16*0(arg2), %xmm0
+	movdqu	16*1(arg2), %xmm1
+	movdqu	16*2(arg2), %xmm2
+	movdqu	16*3(arg2), %xmm3
+	movdqu	16*4(arg2), %xmm4
+	movdqu	16*5(arg2), %xmm5
+	movdqu	16*6(arg2), %xmm6
+	movdqu	16*7(arg2), %xmm7
+
+	pshufb	%xmm11, %xmm0
+	# XOR the initial_crc value
+	pxor	%xmm10, %xmm0
+	pshufb	%xmm11, %xmm1
+	pshufb	%xmm11, %xmm2
+	pshufb	%xmm11, %xmm3
+	pshufb	%xmm11, %xmm4
+	pshufb	%xmm11, %xmm5
+	pshufb	%xmm11, %xmm6
+	pshufb	%xmm11, %xmm7
+
+	movdqa	rk3(%rip), %xmm10	#xmm10 has rk3 and rk4
+					#imm value of pclmulqdq instruction
+					#will determine which constant to use
+
+	#################################################################
+	# we subtract 256 instead of 128 to save one instruction from the loop
+	sub	$256, arg3
+
+	# at this section of the code, there is 128*x+y (0<=y<128) bytes of
+	# buffer. The _fold_64_B_loop (despite its name) will fold 128B at a
+	# time until we have 128+y Bytes of buffer
+
+
+	# fold 128B at a time. This section of the code folds 8 xmm
+	# registers in parallel
+_fold_64_B_loop:
+
+	# update the buffer pointer: each iteration consumes 128 bytes
+	add	$128, arg2		#    buf += 64#
+
+	movdqu	16*0(arg2), %xmm9
+	movdqu	16*1(arg2), %xmm12
+	pshufb	%xmm11, %xmm9
+	pshufb	%xmm11, %xmm12
+	movdqa	%xmm0, %xmm8
+	movdqa	%xmm1, %xmm13
+	pclmulqdq	$0x0 , %xmm10, %xmm0
+	pclmulqdq	$0x11, %xmm10, %xmm8
+	pclmulqdq	$0x0 , %xmm10, %xmm1
+	pclmulqdq	$0x11, %xmm10, %xmm13
+	pxor	%xmm9 , %xmm0
+	xorps	%xmm8 , %xmm0
+	pxor	%xmm12, %xmm1
+	xorps	%xmm13, %xmm1
+
+	movdqu	16*2(arg2), %xmm9
+	movdqu	16*3(arg2), %xmm12
+	pshufb	%xmm11, %xmm9
+	pshufb	%xmm11, %xmm12
+	movdqa	%xmm2, %xmm8
+	movdqa	%xmm3, %xmm13
+	pclmulqdq	$0x0, %xmm10, %xmm2
+	pclmulqdq	$0x11, %xmm10, %xmm8
+	pclmulqdq	$0x0, %xmm10, %xmm3
+	pclmulqdq	$0x11, %xmm10, %xmm13
+	pxor	%xmm9 , %xmm2
+	xorps	%xmm8 , %xmm2
+	pxor	%xmm12, %xmm3
+	xorps	%xmm13, %xmm3
+
+	movdqu	16*4(arg2), %xmm9
+	movdqu	16*5(arg2), %xmm12
+	pshufb	%xmm11, %xmm9
+	pshufb	%xmm11, %xmm12
+	movdqa	%xmm4, %xmm8
+	movdqa	%xmm5, %xmm13
+	pclmulqdq	$0x0,  %xmm10, %xmm4
+	pclmulqdq	$0x11, %xmm10, %xmm8
+	pclmulqdq	$0x0,  %xmm10, %xmm5
+	pclmulqdq	$0x11, %xmm10, %xmm13
+	pxor	%xmm9 ,  %xmm4
+	xorps	%xmm8 ,  %xmm4
+	pxor	%xmm12,  %xmm5
+	xorps	%xmm13,  %xmm5
+
+	movdqu	16*6(arg2), %xmm9
+	movdqu	16*7(arg2), %xmm12
+	pshufb	%xmm11, %xmm9
+	pshufb	%xmm11, %xmm12
+	movdqa	%xmm6 , %xmm8
+	movdqa	%xmm7 , %xmm13
+	pclmulqdq	$0x0 , %xmm10, %xmm6
+	pclmulqdq	$0x11, %xmm10, %xmm8
+	pclmulqdq	$0x0 , %xmm10, %xmm7
+	pclmulqdq	$0x11, %xmm10, %xmm13
+	pxor	%xmm9 , %xmm6
+	xorps	%xmm8 , %xmm6
+	pxor	%xmm12, %xmm7
+	xorps	%xmm13, %xmm7
+
+	sub	$128, arg3
+
+	# check if there is another 128B in the buffer to be able to fold
+	jge	_fold_64_B_loop
+	##################################################################
+
+
+	add	$128, arg2
+	# at this point, the buffer pointer is pointing at the last y Bytes
+	# of the buffer. The 128B of folded data is in 8 of the xmm
+	# registers: xmm0 through xmm7
+
+
+	# fold the 8 xmm registers to 1 xmm register with different constants
+
+	movdqa	rk9(%rip), %xmm10
+	movdqa	%xmm0, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm0
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	xorps	%xmm0, %xmm7
+
+	movdqa	rk11(%rip), %xmm10
+	movdqa	%xmm1, %xmm8
+	pclmulqdq	 $0x11, %xmm10, %xmm1
+	pclmulqdq	 $0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	xorps	%xmm1, %xmm7
+
+	movdqa	rk13(%rip), %xmm10
+	movdqa	%xmm2, %xmm8
+	pclmulqdq	 $0x11, %xmm10, %xmm2
+	pclmulqdq	 $0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	pxor	%xmm2, %xmm7
+
+	movdqa	rk15(%rip), %xmm10
+	movdqa	%xmm3, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm3
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	xorps	%xmm3, %xmm7
+
+	movdqa	rk17(%rip), %xmm10
+	movdqa	%xmm4, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm4
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	pxor	%xmm4, %xmm7
+
+	movdqa	rk19(%rip), %xmm10
+	movdqa	%xmm5, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm5
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	xorps	%xmm5, %xmm7
+
+	movdqa	rk1(%rip), %xmm10	#xmm10 has rk1 and rk2
+					#imm value of pclmulqdq instruction
+					#will determine which constant to use
+	movdqa	%xmm6, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm6
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	pxor	%xmm6, %xmm7
+
+
+	# instead of 128, we add 112 (128-16) to the loop counter to save 1
+	# instruction from the loop. Instead of a cmp instruction, we use the
+	# negative flag with the jl instruction
+	add	$128-16, arg3
+	jl	_final_reduction_for_128
+
+	# now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
+	# and the rest is in memory. We can fold 16 bytes at a time if y>=16
+	# continue folding 16B at a time
+
+_16B_reduction_loop:
+	movdqa	%xmm7, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm7
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	movdqu	(arg2), %xmm0
+	pshufb	%xmm11, %xmm0
+	pxor	%xmm0 , %xmm7
+	add	$16, arg2
+	sub	$16, arg3
+	# instead of a cmp instruction, we utilize the flags with the
+	# jge instruction equivalent of: cmp arg3, 16-16
+	# check if there is any more 16B in the buffer to be able to fold
+	jge	_16B_reduction_loop
+
+	#now we have 16+z bytes left to reduce, where 0<= z < 16.
+	#first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+	# check if any more data to fold. If not, compute the CRC of
+	# the final 128 bits
+	add	$16, arg3
+	je	_128_done
+
+	# here we are getting data that is less than 16 bytes.
+	# since we know that there was data before the pointer, we can
+	# offset the input pointer before the actual point, to receive
+	# exactly 16 bytes. after that the registers need to be adjusted.
+_get_last_two_xmms:
+	movdqa	%xmm7, %xmm2
+
+	movdqu	-16(arg2, arg3), %xmm1
+	pshufb	%xmm11, %xmm1
+
+	# get rid of the extra data that was loaded before
+	# load the shift constant
+	lea	pshufb_shf_table+16(%rip), %rax
+	sub	arg3, %rax
+	movdqu	(%rax), %xmm0
+
+	# shift xmm2 to the left by arg3 bytes
+	pshufb	%xmm0, %xmm2
+
+	# shift xmm7 to the right by 16-arg3 bytes
+	pxor	mask1(%rip), %xmm0
+	pshufb	%xmm0, %xmm7
+	pblendvb	%xmm2, %xmm1	#xmm0 is implicit
+
+	# fold 16 Bytes
+	movdqa	%xmm1, %xmm2
+	movdqa	%xmm7, %xmm8
+	pclmulqdq	$0x11, %xmm10, %xmm7
+	pclmulqdq	$0x0 , %xmm10, %xmm8
+	pxor	%xmm8, %xmm7
+	pxor	%xmm2, %xmm7
+
+_128_done:
+	# compute crc of a 128-bit value
+	movdqa	rk5(%rip), %xmm10	# rk5 and rk6 in xmm10
+	movdqa	%xmm7, %xmm0
+
+	#64b fold
+	pclmulqdq	$0x1, %xmm10, %xmm7
+	pslldq	$8   ,  %xmm0
+	pxor	%xmm0,  %xmm7
+
+	#32b fold
+	movdqa	%xmm7, %xmm0
+
+	pand	mask2(%rip), %xmm0
+
+	psrldq	$12, %xmm7
+	pclmulqdq	$0x10, %xmm10, %xmm7
+	pxor	%xmm0, %xmm7
+
+	#barrett reduction
+_barrett:
+	movdqa	rk7(%rip), %xmm10	# rk7 and rk8 in xmm10
+	movdqa	%xmm7, %xmm0
+	pclmulqdq	$0x01, %xmm10, %xmm7
+	pslldq	$4, %xmm7
+	pclmulqdq	$0x11, %xmm10, %xmm7
+
+	pslldq	$4, %xmm7
+	pxor	%xmm0, %xmm7
+	pextrd	$1, %xmm7, %eax
+
+_cleanup:
+	# scale the result back to 16 bits
+	shr	$16, %eax
+	mov     %rcx, %rsp
+	ret
+
+########################################################################
+
+.align 16
+_less_than_128:
+
+	# check if there is enough buffer to be able to fold 16B at a time
+	cmp	$32, arg3
+	jl	_less_than_32
+	movdqa  SHUF_MASK(%rip), %xmm11
+
+	# now if there is, load the constants
+	movdqa	rk1(%rip), %xmm10	# rk1 and rk2 in xmm10
+
+	movd	arg1_low32, %xmm0	# get the initial crc value
+	pslldq	$12, %xmm0	# align it to its correct place
+	movdqu	(arg2), %xmm7	# load the plaintext
+	pshufb	%xmm11, %xmm7	# byte-reflect the plaintext
+	pxor	%xmm0, %xmm7
+
+
+	# update the buffer pointer
+	add	$16, arg2
+
+	# update the counter. subtract 32 instead of 16 to save one
+	# instruction from the loop
+	sub	$32, arg3
+
+	jmp	_16B_reduction_loop
+
+
+.align 16
+_less_than_32:
+	# mov initial crc to the return value. this is necessary for
+	# zero-length buffers.
+	mov	arg1_low32, %eax
+	test	arg3, arg3
+	je	_cleanup
+
+	movdqa  SHUF_MASK(%rip), %xmm11
+
+	movd	arg1_low32, %xmm0	# get the initial crc value
+	pslldq	$12, %xmm0	# align it to its correct place
+
+	cmp	$16, arg3
+	je	_exact_16_left
+	jl	_less_than_16_left
+
+	movdqu	(arg2), %xmm7	# load the plaintext
+	pshufb	%xmm11, %xmm7	# byte-reflect the plaintext
+	pxor	%xmm0 , %xmm7	# xor the initial crc value
+	add	$16, arg2
+	sub	$16, arg3
+	movdqa	rk1(%rip), %xmm10	# rk1 and rk2 in xmm10
+	jmp	_get_last_two_xmms
+
+
+.align 16
+_less_than_16_left:
+	# use stack space to load data less than 16 bytes, zero-out
+	# the 16B in memory first.
+
+	pxor	%xmm1, %xmm1
+	mov	%rsp, %r11
+	movdqa	%xmm1, (%r11)
+
+	cmp	$4, arg3
+	jl	_only_less_than_4
+
+	# backup the counter value
+	mov	arg3, %r9
+	cmp	$8, arg3
+	jl	_less_than_8_left
+
+	# load 8 Bytes
+	mov	(arg2), %rax
+	mov	%rax, (%r11)
+	add	$8, %r11
+	sub	$8, arg3
+	add	$8, arg2
+_less_than_8_left:
+
+	cmp	$4, arg3
+	jl	_less_than_4_left
+
+	# load 4 Bytes
+	mov	(arg2), %eax
+	mov	%eax, (%r11)
+	add	$4, %r11
+	sub	$4, arg3
+	add	$4, arg2
+_less_than_4_left:
+
+	cmp	$2, arg3
+	jl	_less_than_2_left
+
+	# load 2 Bytes
+	mov	(arg2), %ax
+	mov	%ax, (%r11)
+	add	$2, %r11
+	sub	$2, arg3
+	add	$2, arg2
+_less_than_2_left:
+	cmp     $1, arg3
+        jl      _zero_left
+
+	# load 1 Byte
+	mov	(arg2), %al
+	mov	%al, (%r11)
+_zero_left:
+	movdqa	(%rsp), %xmm7
+	pshufb	%xmm11, %xmm7
+	pxor	%xmm0 , %xmm7	# xor the initial crc value
+
+	# shl r9, 4
+	lea	pshufb_shf_table+16(%rip), %rax
+	sub	%r9, %rax
+	movdqu	(%rax), %xmm0
+	pxor	mask1(%rip), %xmm0
+
+	pshufb	%xmm0, %xmm7
+	jmp	_128_done
+
+.align 16
+_exact_16_left:
+	movdqu	(arg2), %xmm7
+	pshufb	%xmm11, %xmm7
+	pxor	%xmm0 , %xmm7   # xor the initial crc value
+
+	jmp	_128_done
+
+_only_less_than_4:
+	cmp	$3, arg3
+	jl	_only_less_than_3
+
+	# load 3 Bytes
+	mov	(arg2), %al
+	mov	%al, (%r11)
+
+	mov	1(arg2), %al
+	mov	%al, 1(%r11)
+
+	mov	2(arg2), %al
+	mov	%al, 2(%r11)
+
+	movdqa	 (%rsp), %xmm7
+	pshufb	 %xmm11, %xmm7
+	pxor	 %xmm0 , %xmm7  # xor the initial crc value
+
+	psrldq	$5, %xmm7
+
+	jmp	_barrett
+_only_less_than_3:
+	cmp	$2, arg3
+	jl	_only_less_than_2
+
+	# load 2 Bytes
+	mov	(arg2), %al
+	mov	%al, (%r11)
+
+	mov	1(arg2), %al
+	mov	%al, 1(%r11)
+
+	movdqa	(%rsp), %xmm7
+	pshufb	%xmm11, %xmm7
+	pxor	%xmm0 , %xmm7   # xor the initial crc value
+
+	psrldq	$6, %xmm7
+
+	jmp	_barrett
+_only_less_than_2:
+
+	# load 1 Byte
+	mov	(arg2), %al
+	mov	%al, (%r11)
+
+	movdqa	(%rsp), %xmm7
+	pshufb	%xmm11, %xmm7
+	pxor	%xmm0 , %xmm7   # xor the initial crc value
+
+	psrldq	$7, %xmm7
+
+	jmp	_barrett
+
+ENDPROC(crc_t10dif_pcl)
+
+.data
+
+# precomputed constants
+# these constants are precomputed from the poly:
+# 0x8bb70000 (0x8bb7 scaled to 32 bits)
+.align 16
+# Q = 0x18BB70000
+# rk1 = 2^(32*3) mod Q << 32
+# rk2 = 2^(32*5) mod Q << 32
+# rk3 = 2^(32*15) mod Q << 32
+# rk4 = 2^(32*17) mod Q << 32
+# rk5 = 2^(32*3) mod Q << 32
+# rk6 = 2^(32*2) mod Q << 32
+# rk7 = floor(2^64/Q)
+# rk8 = Q
+rk1:
+.quad 0x2d56000000000000
+rk2:
+.quad 0x06df000000000000
+rk3:
+.quad 0x9d9d000000000000
+rk4:
+.quad 0x7cf5000000000000
+rk5:
+.quad 0x2d56000000000000
+rk6:
+.quad 0x1368000000000000
+rk7:
+.quad 0x00000001f65a57f8
+rk8:
+.quad 0x000000018bb70000
+
+rk9:
+.quad 0xceae000000000000
+rk10:
+.quad 0xbfd6000000000000
+rk11:
+.quad 0x1e16000000000000
+rk12:
+.quad 0x713c000000000000
+rk13:
+.quad 0xf7f9000000000000
+rk14:
+.quad 0x80a6000000000000
+rk15:
+.quad 0x044c000000000000
+rk16:
+.quad 0xe658000000000000
+rk17:
+.quad 0xad18000000000000
+rk18:
+.quad 0xa497000000000000
+rk19:
+.quad 0x6ee3000000000000
+rk20:
+.quad 0xe7b5000000000000
+
+
+
+mask1:
+.octa 0x80808080808080808080808080808080
+mask2:
+.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
+
+SHUF_MASK:
+.octa 0x000102030405060708090A0B0C0D0E0F
+
+pshufb_shf_table:
+# use these values for shift constants for the pshufb instruction
+# different alignments result in values as shown:
+#	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
+#	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
+#	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
+#	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
+#	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
+#	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
+#	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
+#	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
+#	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
+#	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
+#	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
+#	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
+#	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
+#	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
+#	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
+.octa 0x8f8e8d8c8b8a89888786858483828100
+.octa 0x000e0d0c0b0a09080706050403020100
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@ -0,0 +1,151 @@
+/*
+ * Cryptographic API.
+ *
+ * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc-t10dif.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <asm/i387.h>
+#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
+				size_t len);
+
+/* Per-descriptor state used by the chksum_* callbacks in this file. */
+struct chksum_desc_ctx {
+	__u16 crc;	/* running CRC; zeroed by chksum_init() */
+};
+
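+/*
+ * shash callbacks. The running CRC lives in the descriptor context. Data is
+ * handed to the PCLMULQDQ routine crc_t10dif_pcl() when irq_fpu_usable()
+ * allows it, and to crc_t10dif_generic() otherwise.
+ */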
+
+/* Start a new computation: the CRC kept in the descriptor context is zeroed. */
+static int chksum_init(struct shash_desc *desc)
+{
+	struct chksum_desc_ctx *state = shash_desc_ctx(desc);
+
+	state->crc = 0;
+	return 0;
+}
+
+/*
+ * Fold @length bytes at @data into the running CRC kept in the descriptor
+ * context. Always returns 0.
+ */
+static int chksum_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	struct chksum_desc_ctx *state = shash_desc_ctx(desc);
+
+	/* irq_fpu_usable() said no: take the generic routine instead. */
+	if (!irq_fpu_usable()) {
+		state->crc = crc_t10dif_generic(state->crc, data, length);
+		return 0;
+	}
+
+	/* The PCLMULQDQ routine runs bracketed by kernel_fpu_begin()/end(). */
+	kernel_fpu_begin();
+	state->crc = crc_t10dif_pcl(state->crc, data, length);
+	kernel_fpu_end();
+
+	return 0;
+}
+
+/* Store the accumulated 16-bit CRC from the descriptor context at @out. */
+static int chksum_final(struct shash_desc *desc, u8 *out)
+{
+	struct chksum_desc_ctx *state = shash_desc_ctx(desc);
+	__u16 *digest = (__u16 *)out;
+
+	*digest = state->crc;
+	return 0;
+}
+
+/*
+ * Extend the CRC read from *@crcp over @len bytes at @data and store the
+ * 16-bit result at @out. Nothing is written back through @crcp. Always
+ * returns 0.
+ */
+static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
+			u8 *out)
+{
+	__u16 result;
+
+	if (irq_fpu_usable()) {
+		kernel_fpu_begin();
+		result = crc_t10dif_pcl(*crcp, data, len);
+		kernel_fpu_end();
+	} else {
+		result = crc_t10dif_generic(*crcp, data, len);
+	}
+
+	*(__u16 *)out = result;
+	return 0;
+}
+
+/*
+ * Last chunk and result in one call: extend the CRC held in the descriptor
+ * context over @data and store the outcome at @out.
+ */
+static int chksum_finup(struct shash_desc *desc, const u8 *data,
+			unsigned int len, u8 *out)
+{
+	struct chksum_desc_ctx *state = shash_desc_ctx(desc);
+	__u16 *running_crc = &state->crc;
+
+	return __chksum_finup(running_crc, data, len, out);
+}
+
+/*
+ * One-shot digest: compute the CRC of @length bytes at @data in a single call
+ * and store the 16-bit result at @out. Always returns 0.
+ *
+ * ->digest() may be invoked on a descriptor that never went through
+ * chksum_init(), so the CRC field of the descriptor context cannot be used as
+ * the seed here. Start from 0, the same value chksum_init() establishes.
+ */
+static int chksum_digest(struct shash_desc *desc, const u8 *data,
+			 unsigned int length, u8 *out)
+{
+	__u16 crc = 0;
+
+	return __chksum_finup(&crc, data, length, out);
+}
+
+/*
+ * shash descriptor for this implementation: offered under the algorithm name
+ * "crct10dif" with driver name "crct10dif-pclmul" at priority 200.
+ */
+static struct shash_alg alg = {
+	.digestsize	= CRC_T10DIF_DIGEST_SIZE,
+	.init		= chksum_init,
+	.update		= chksum_update,
+	.final		= chksum_final,
+	.finup		= chksum_finup,
+	.digest		= chksum_digest,
+	.descsize	= sizeof(struct chksum_desc_ctx),
+	.base		= {
+		.cra_name		= "crct10dif",
+		.cra_driver_name	= "crct10dif-pclmul",
+		.cra_priority		= 200,
+		.cra_blocksize		= CRC_T10DIF_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+	},
+};
+
+/* CPU match table: a single entry keyed on the PCLMULQDQ feature flag. */
+static const struct x86_cpu_id crct10dif_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id);
+
+/*
+ * Module init: register the shash only when the running CPU matches
+ * crct10dif_cpu_id; otherwise fail with -ENODEV.
+ */
+static int __init crct10dif_intel_mod_init(void)
+{
+	if (x86_match_cpu(crct10dif_cpu_id))
+		return crypto_register_shash(&alg);
+
+	return -ENODEV;
+}
+
+/* Module exit: unregister the shash registered by crct10dif_intel_mod_init(). */
+static void __exit crct10dif_intel_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(crct10dif_intel_mod_init);
+module_exit(crct10dif_intel_mod_fini);
+
+MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
+MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ.");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS("crct10dif");
+MODULE_ALIAS("crct10dif-pclmul");
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@ -187,7 +187,36 @@ static int sha256_ssse3_import(struct shash_desc *desc, const void *in)
 	return 0;
 }

-static struct shash_alg alg = {
+/*
+ * Prepare a descriptor for SHA-224: clear the count field and load the eight
+ * SHA224_H* initial values into the state words.
+ */
+static int sha224_ssse3_init(struct shash_desc *desc)
+{
+	struct sha256_state *st = shash_desc_ctx(desc);
+
+	st->count = 0;
+
+	st->state[0] = SHA224_H0;
+	st->state[1] = SHA224_H1;
+	st->state[2] = SHA224_H2;
+	st->state[3] = SHA224_H3;
+	st->state[4] = SHA224_H4;
+	st->state[5] = SHA224_H5;
+	st->state[6] = SHA224_H6;
+	st->state[7] = SHA224_H7;
+
+	return 0;
+}
+
+/*
+ * SHA-224 finalisation: run the SHA-256 final step into a scratch buffer,
+ * hand back its first SHA224_DIGEST_SIZE bytes, then clear the scratch copy.
+ */
+static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash)
+{
+	u8 full_digest[SHA256_DIGEST_SIZE];
+
+	sha256_ssse3_final(desc, full_digest);
+
+	/* Only the leading part of the full-size digest is returned. */
+	memcpy(hash, full_digest, SHA224_DIGEST_SIZE);
+	memset(full_digest, 0, sizeof(full_digest));
+
+	return 0;
+}
+
+static struct shash_alg algs[] = { {
 	.digestsize	=	SHA256_DIGEST_SIZE,
 	.init		=	sha256_ssse3_init,
 	.update		=	sha256_ssse3_update,
@ -204,7 +233,24 @@ static struct shash_alg alg = {
 		.cra_blocksize	=	SHA256_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
-};
+}, {
+	.digestsize	=	SHA224_DIGEST_SIZE,
+	.init		=	sha224_ssse3_init,
+	.update		=	sha256_ssse3_update,
+	.final		=	sha224_ssse3_final,
+	.export		=	sha256_ssse3_export,
+	.import		=	sha256_ssse3_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha224",
+		.cra_driver_name =	"sha224-ssse3",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA224_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+} };

 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
@ -227,7 +273,7 @@ static bool __init avx_usable(void)

 static int __init sha256_ssse3_mod_init(void)
 {
-	/* test for SSE3 first */
+	/* test for SSSE3 first */
 	if (cpu_has_ssse3)
 		sha256_transform_asm = sha256_transform_ssse3;

@ -254,7 +300,7 @@ static int __init sha256_ssse3_mod_init(void)
 		else
 #endif
 			pr_info("Using SSSE3 optimized SHA-256 implementation\n");
-		return crypto_register_shash(&alg);
+		return crypto_register_shashes(algs, ARRAY_SIZE(algs));
 	}
 	pr_info("Neither AVX nor SSSE3 is available/usable.\n");

@ -263,7 +309,7 @@ static int __init sha256_ssse3_mod_init(void)

 static void __exit sha256_ssse3_mod_fini(void)
 {
-	crypto_unregister_shash(&alg);
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
 }

 module_init(sha256_ssse3_mod_init);
@ -273,3 +319,4 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");

 MODULE_ALIAS("sha256");
+MODULE_ALIAS("sha224");
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@ -194,7 +194,37 @@ static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
 	return 0;
 }

-static struct shash_alg alg = {
+static int sha384_ssse3_init(struct shash_desc *desc)
+{
+	struct sha512_state *st = shash_desc_ctx(desc);
+
+	/* Clear count[] and load the SHA-384 initial hash words into state[]. */
+	st->count[0] = 0;
+	st->count[1] = 0;
+	st->state[0] = SHA384_H0;
+	st->state[1] = SHA384_H1;
+	st->state[2] = SHA384_H2;
+	st->state[3] = SHA384_H3;
+	st->state[4] = SHA384_H4;
+	st->state[5] = SHA384_H5;
+	st->state[6] = SHA384_H6;
+	st->state[7] = SHA384_H7;
+	return 0;
+}
+
+static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash)
+{
+	u8 full[SHA512_DIGEST_SIZE];
+
+	/* Finish as SHA-512, then hand back only the leading SHA-384 bytes. */
+	sha512_ssse3_final(desc, full);
+	memcpy(hash, full, SHA384_DIGEST_SIZE);
+	/* Wipe the on-stack copy of the full-length digest. */
+	memset(full, 0, sizeof(full));
+	return 0;
+}
+
+static struct shash_alg algs[] = { {
 	.digestsize	=	SHA512_DIGEST_SIZE,
 	.init		=	sha512_ssse3_init,
 	.update		=	sha512_ssse3_update,
@ -211,7 +241,24 @@ static struct shash_alg alg = {
 		.cra_blocksize	=	SHA512_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
 	}
-};
+},  {
+	.digestsize	=	SHA384_DIGEST_SIZE,
+	.init		=	sha384_ssse3_init,
+	.update		=	sha512_ssse3_update,
+	.final		=	sha384_ssse3_final,
+	.export		=	sha512_ssse3_export,
+	.import		=	sha512_ssse3_import,
+	.descsize	=	sizeof(struct sha512_state),
+	.statesize	=	sizeof(struct sha512_state),
+	.base		=	{
+		.cra_name	=	"sha384",
+		.cra_driver_name =	"sha384-ssse3",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA384_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+} };

 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
@ -234,7 +281,7 @@ static bool __init avx_usable(void)

 static int __init sha512_ssse3_mod_init(void)
 {
-	/* test for SSE3 first */
+	/* test for SSSE3 first */
 	if (cpu_has_ssse3)
 		sha512_transform_asm = sha512_transform_ssse3;

@ -261,7 +308,7 @@ static int __init sha512_ssse3_mod_init(void)
 		else
 #endif
 			pr_info("Using SSSE3 optimized SHA-512 implementation\n");
-		return crypto_register_shash(&alg);
+		return crypto_register_shashes(algs, ARRAY_SIZE(algs));
 	}
 	pr_info("Neither AVX nor SSSE3 is available/usable.\n");

@ -270,7 +317,7 @@ static int __init sha512_ssse3_mod_init(void)

 static void __exit sha512_ssse3_mod_fini(void)
 {
-	crypto_unregister_shash(&alg);
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
 }

 module_init(sha512_ssse3_mod_init);
@ -280,3 +327,4 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");

 MODULE_ALIAS("sha512");
+MODULE_ALIAS("sha384");
--- a/arch/x86/crypto/twofish-avx2-asm_64.S
+++ b/arch/x86/crypto/twofish-avx2-asm_64.S
@ -1,600 +0,0 @@
-/*
- * x86_64/AVX2 assembler optimized version of Twofish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-#include <linux/linkage.h>
-#include "glue_helper-asm-avx2.S"
-
-.file "twofish-avx2-asm_64.S"
-
-.data
-.align 16
-
-.Lvpshufb_mask0:
-.long 0x80808000
-.long 0x80808004
-.long 0x80808008
-.long 0x8080800c
-
-.Lbswap128_mask:
-	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.Lxts_gf128mul_and_shl1_mask_0:
-	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
-.Lxts_gf128mul_and_shl1_mask_1:
-	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
-
-.text
-
-/* structure of crypto context */
-#define s0	0
-#define s1	1024
-#define s2	2048
-#define s3	3072
-#define w	4096
-#define	k	4128
-
-/* register macros */
-#define CTX	%rdi
-
-#define RS0	CTX
-#define RS1	%r8
-#define RS2	%r9
-#define RS3	%r10
-#define RK	%r11
-#define RW	%rax
-#define RROUND  %r12
-#define RROUNDd %r12d
-
-#define RA0	%ymm8
-#define RB0	%ymm9
-#define RC0	%ymm10
-#define RD0	%ymm11
-#define RA1	%ymm12
-#define RB1	%ymm13
-#define RC1	%ymm14
-#define RD1	%ymm15
-
-/* temp regs */
-#define RX0	%ymm0
-#define RY0	%ymm1
-#define RX1	%ymm2
-#define RY1	%ymm3
-#define RT0	%ymm4
-#define RIDX	%ymm5
-
-#define RX0x	%xmm0
-#define RY0x	%xmm1
-#define RX1x	%xmm2
-#define RY1x	%xmm3
-#define RT0x	%xmm4
-
-/* vpgatherdd mask and '-1' */
-#define RNOT	%ymm6
-
-/* byte mask, (-1 >> 24) */
-#define RBYTE	%ymm7
-
-/**********************************************************************
-  16-way AVX2 twofish
- **********************************************************************/
-#define init_round_constants() \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpsrld $24, RNOT, RBYTE; \
-	leaq k(CTX), RK; \
-	leaq w(CTX), RW; \
-	leaq s1(CTX), RS1; \
-	leaq s2(CTX), RS2; \
-	leaq s3(CTX), RS3; \
-
-#define g16(ab, rs0, rs1, rs2, rs3, xy) \
-	vpand RBYTE, ab ## 0, RIDX; \
-	vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-		\
-		vpand RBYTE, ab ## 1, RIDX; \
-		vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
-		vpcmpeqd RNOT, RNOT, RNOT; \
-	\
-	vpsrld $8, ab ## 0, RIDX; \
-	vpand RBYTE, RIDX, RIDX; \
-	vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpxor RT0, xy ## 0, xy ## 0; \
-		\
-		vpsrld $8, ab ## 1, RIDX; \
-		vpand RBYTE, RIDX, RIDX; \
-		vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
-		vpcmpeqd RNOT, RNOT, RNOT; \
-		vpxor RT0, xy ## 1, xy ## 1; \
-	\
-	vpsrld $16, ab ## 0, RIDX; \
-	vpand RBYTE, RIDX, RIDX; \
-	vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpxor RT0, xy ## 0, xy ## 0; \
-		\
-		vpsrld $16, ab ## 1, RIDX; \
-		vpand RBYTE, RIDX, RIDX; \
-		vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
-		vpcmpeqd RNOT, RNOT, RNOT; \
-		vpxor RT0, xy ## 1, xy ## 1; \
-	\
-	vpsrld $24, ab ## 0, RIDX; \
-	vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
-	vpcmpeqd RNOT, RNOT, RNOT; \
-	vpxor RT0, xy ## 0, xy ## 0; \
-		\
-		vpsrld $24, ab ## 1, RIDX; \
-		vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
-		vpcmpeqd RNOT, RNOT, RNOT; \
-		vpxor RT0, xy ## 1, xy ## 1;
-
-#define g1_16(a, x) \
-	g16(a, RS0, RS1, RS2, RS3, x);
-
-#define g2_16(b, y) \
-	g16(b, RS1, RS2, RS3, RS0, y);
-
-#define encrypt_round_end16(a, b, c, d, nk) \
-	vpaddd RY0, RX0, RX0; \
-	vpaddd RX0, RY0, RY0; \
-	vpbroadcastd nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RX0, RX0; \
-	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RY0, RY0; \
-	\
-	vpxor RY0, d ## 0, d ## 0; \
-	\
-	vpxor RX0, c ## 0, c ## 0; \
-	vpsrld $1, c ## 0, RT0; \
-	vpslld $31, c ## 0, c ## 0; \
-	vpor RT0, c ## 0, c ## 0; \
-	\
-		vpaddd RY1, RX1, RX1; \
-		vpaddd RX1, RY1, RY1; \
-		vpbroadcastd nk(RK,RROUND,8), RT0; \
-		vpaddd RT0, RX1, RX1; \
-		vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
-		vpaddd RT0, RY1, RY1; \
-		\
-		vpxor RY1, d ## 1, d ## 1; \
-		\
-		vpxor RX1, c ## 1, c ## 1; \
-		vpsrld $1, c ## 1, RT0; \
-		vpslld $31, c ## 1, c ## 1; \
-		vpor RT0, c ## 1, c ## 1; \
-
-#define encrypt_round16(a, b, c, d, nk) \
-	g2_16(b, RY); \
-	\
-	vpslld $1, b ## 0, RT0; \
-	vpsrld $31, b ## 0, b ## 0; \
-	vpor RT0, b ## 0, b ## 0; \
-	\
-		vpslld $1, b ## 1, RT0; \
-		vpsrld $31, b ## 1, b ## 1; \
-		vpor RT0, b ## 1, b ## 1; \
-	\
-	g1_16(a, RX); \
-	\
-	encrypt_round_end16(a, b, c, d, nk);
-
-#define encrypt_round_first16(a, b, c, d, nk) \
-	vpslld $1, d ## 0, RT0; \
-	vpsrld $31, d ## 0, d ## 0; \
-	vpor RT0, d ## 0, d ## 0; \
-	\
-		vpslld $1, d ## 1, RT0; \
-		vpsrld $31, d ## 1, d ## 1; \
-		vpor RT0, d ## 1, d ## 1; \
-	\
-	encrypt_round16(a, b, c, d, nk);
-
-#define encrypt_round_last16(a, b, c, d, nk) \
-	g2_16(b, RY); \
-	\
-	g1_16(a, RX); \
-	\
-	encrypt_round_end16(a, b, c, d, nk);
-
-#define decrypt_round_end16(a, b, c, d, nk) \
-	vpaddd RY0, RX0, RX0; \
-	vpaddd RX0, RY0, RY0; \
-	vpbroadcastd nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RX0, RX0; \
-	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
-	vpaddd RT0, RY0, RY0; \
-	\
-	vpxor RX0, c ## 0, c ## 0; \
-	\
-	vpxor RY0, d ## 0, d ## 0; \
-	vpsrld $1, d ## 0, RT0; \
-	vpslld $31, d ## 0, d ## 0; \
-	vpor RT0, d ## 0, d ## 0; \
-	\
-		vpaddd RY1, RX1, RX1; \
-		vpaddd RX1, RY1, RY1; \
-		vpbroadcastd nk(RK,RROUND,8), RT0; \
-		vpaddd RT0, RX1, RX1; \
-		vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
-		vpaddd RT0, RY1, RY1; \
-		\
-		vpxor RX1, c ## 1, c ## 1; \
-		\
-		vpxor RY1, d ## 1, d ## 1; \
-		vpsrld $1, d ## 1, RT0; \
-		vpslld $31, d ## 1, d ## 1; \
-		vpor RT0, d ## 1, d ## 1;
-
-#define decrypt_round16(a, b, c, d, nk) \
-	g1_16(a, RX); \
-	\
-	vpslld $1, a ## 0, RT0; \
-	vpsrld $31, a ## 0, a ## 0; \
-	vpor RT0, a ## 0, a ## 0; \
-	\
-		vpslld $1, a ## 1, RT0; \
-		vpsrld $31, a ## 1, a ## 1; \
-		vpor RT0, a ## 1, a ## 1; \
-	\
-	g2_16(b, RY); \
-	\
-	decrypt_round_end16(a, b, c, d, nk);
-
-#define decrypt_round_first16(a, b, c, d, nk) \
-	vpslld $1, c ## 0, RT0; \
-	vpsrld $31, c ## 0, c ## 0; \
-	vpor RT0, c ## 0, c ## 0; \
-	\
-		vpslld $1, c ## 1, RT0; \
-		vpsrld $31, c ## 1, c ## 1; \
-		vpor RT0, c ## 1, c ## 1; \
-	\
-	decrypt_round16(a, b, c, d, nk)
-
-#define decrypt_round_last16(a, b, c, d, nk) \
-	g1_16(a, RX); \
-	\
-	g2_16(b, RY); \
-	\
-	decrypt_round_end16(a, b, c, d, nk);
-
-#define encrypt_cycle16() \
-	encrypt_round16(RA, RB, RC, RD, 0); \
-	encrypt_round16(RC, RD, RA, RB, 8);
-
-#define encrypt_cycle_first16() \
-	encrypt_round_first16(RA, RB, RC, RD, 0); \
-	encrypt_round16(RC, RD, RA, RB, 8);
-
-#define encrypt_cycle_last16() \
-	encrypt_round16(RA, RB, RC, RD, 0); \
-	encrypt_round_last16(RC, RD, RA, RB, 8);
-
-#define decrypt_cycle16(n) \
-	decrypt_round16(RC, RD, RA, RB, 8); \
-	decrypt_round16(RA, RB, RC, RD, 0);
-
-#define decrypt_cycle_first16(n) \
-	decrypt_round_first16(RC, RD, RA, RB, 8); \
-	decrypt_round16(RA, RB, RC, RD, 0);
-
-#define decrypt_cycle_last16(n) \
-	decrypt_round16(RC, RD, RA, RB, 8); \
-	decrypt_round_last16(RA, RB, RC, RD, 0);
-
-#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
-	vpunpckhdq x1, x0, t2; \
-	vpunpckldq x1, x0, x0; \
-	\
-	vpunpckldq x3, x2, t1; \
-	vpunpckhdq x3, x2, x2; \
-	\
-	vpunpckhqdq t1,	x0, x1; \
-	vpunpcklqdq t1,	x0, x0; \
-	\
-	vpunpckhqdq x2, t2, x3; \
-	vpunpcklqdq x2,	t2, x2;
-
-#define read_blocks8(offs,a,b,c,d) \
-	transpose_4x4(a, b, c, d, RX0, RY0);
-
-#define write_blocks8(offs,a,b,c,d) \
-	transpose_4x4(a, b, c, d, RX0, RY0);
-
-#define inpack_enc8(a,b,c,d) \
-	vpbroadcastd 4*0(RW), RT0; \
-	vpxor RT0, a, a; \
-	\
-	vpbroadcastd 4*1(RW), RT0; \
-	vpxor RT0, b, b; \
-	\
-	vpbroadcastd 4*2(RW), RT0; \
-	vpxor RT0, c, c; \
-	\
-	vpbroadcastd 4*3(RW), RT0; \
-	vpxor RT0, d, d;
-
-#define outunpack_enc8(a,b,c,d) \
-	vpbroadcastd 4*4(RW), RX0; \
-	vpbroadcastd 4*5(RW), RY0; \
-	vpxor RX0, c, RX0; \
-	vpxor RY0, d, RY0; \
-	\
-	vpbroadcastd 4*6(RW), RT0; \
-	vpxor RT0, a, c; \
-	vpbroadcastd 4*7(RW), RT0; \
-	vpxor RT0, b, d; \
-	\
-	vmovdqa RX0, a; \
-	vmovdqa RY0, b;
-
-#define inpack_dec8(a,b,c,d) \
-	vpbroadcastd 4*4(RW), RX0; \
-	vpbroadcastd 4*5(RW), RY0; \
-	vpxor RX0, a, RX0; \
-	vpxor RY0, b, RY0; \
-	\
-	vpbroadcastd 4*6(RW), RT0; \
-	vpxor RT0, c, a; \
-	vpbroadcastd 4*7(RW), RT0; \
-	vpxor RT0, d, b; \
-	\
-	vmovdqa RX0, c; \
-	vmovdqa RY0, d;
-
-#define outunpack_dec8(a,b,c,d) \
-	vpbroadcastd 4*0(RW), RT0; \
-	vpxor RT0, a, a; \
-	\
-	vpbroadcastd 4*1(RW), RT0; \
-	vpxor RT0, b, b; \
-	\
-	vpbroadcastd 4*2(RW), RT0; \
-	vpxor RT0, c, c; \
-	\
-	vpbroadcastd 4*3(RW), RT0; \
-	vpxor RT0, d, d;
-
-#define read_blocks16(a,b,c,d) \
-	read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
-	read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define write_blocks16(a,b,c,d) \
-	write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
-	write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define xor_blocks16(a,b,c,d) \
-	xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
-	xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define inpack_enc16(a,b,c,d) \
-	inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
-	inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define outunpack_enc16(a,b,c,d) \
-	outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
-	outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define inpack_dec16(a,b,c,d) \
-	inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
-	inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
-
-#define outunpack_dec16(a,b,c,d) \
-	outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
-	outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
-
-.align 8
-__twofish_enc_blk16:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
-	 * output:
-	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
-	 */
-	init_round_constants();
-
-	read_blocks16(RA, RB, RC, RD);
-	inpack_enc16(RA, RB, RC, RD);
-
-	xorl RROUNDd, RROUNDd;
-	encrypt_cycle_first16();
-	movl $2, RROUNDd;
-
-.align 4
-.L__enc_loop:
-	encrypt_cycle16();
-
-	addl $2, RROUNDd;
-	cmpl $14, RROUNDd;
-	jne .L__enc_loop;
-
-	encrypt_cycle_last16();
-
-	outunpack_enc16(RA, RB, RC, RD);
-	write_blocks16(RA, RB, RC, RD);
-
-	ret;
-ENDPROC(__twofish_enc_blk16)
-
-.align 8
-__twofish_dec_blk16:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
-	 * output:
-	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
-	 */
-	init_round_constants();
-
-	read_blocks16(RA, RB, RC, RD);
-	inpack_dec16(RA, RB, RC, RD);
-
-	movl $14, RROUNDd;
-	decrypt_cycle_first16();
-	movl $12, RROUNDd;
-
-.align 4
-.L__dec_loop:
-	decrypt_cycle16();
-
-	addl $-2, RROUNDd;
-	jnz .L__dec_loop;
-
-	decrypt_cycle_last16();
-
-	outunpack_dec16(RA, RB, RC, RD);
-	write_blocks16(RA, RB, RC, RD);
-
-	ret;
-ENDPROC(__twofish_dec_blk16)
-
-ENTRY(twofish_ecb_enc_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	call __twofish_enc_blk16;
-
-	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_ecb_enc_16way)
-
-ENTRY(twofish_ecb_dec_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	call __twofish_dec_blk16;
-
-	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_ecb_dec_16way)
-
-ENTRY(twofish_cbc_dec_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	call __twofish_dec_blk16;
-
-	store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1,
-			RX0);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_cbc_dec_16way)
-
-ENTRY(twofish_ctr_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
-	 *	%rcx: iv (little endian, 128bit)
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
-		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
-		       RBYTE);
-
-	call __twofish_enc_blk16;
-
-	store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_ctr_16way)
-
-.align 8
-twofish_xts_crypt_16way:
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
-	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
-	 *	%r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16
-	 */
-
-	vzeroupper;
-	pushq %r12;
-
-	load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
-		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
-		       .Lxts_gf128mul_and_shl1_mask_0,
-		       .Lxts_gf128mul_and_shl1_mask_1);
-
-	call *%r8;
-
-	store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
-
-	popq %r12;
-	vzeroupper;
-
-	ret;
-ENDPROC(twofish_xts_crypt_16way)
-
-ENTRY(twofish_xts_enc_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
-	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
-	 */
-	leaq __twofish_enc_blk16, %r8;
-	jmp twofish_xts_crypt_16way;
-ENDPROC(twofish_xts_enc_16way)
-
-ENTRY(twofish_xts_dec_16way)
-	/* input:
-	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
-	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
-	 */
-	leaq __twofish_dec_blk16, %r8;
-	jmp twofish_xts_crypt_16way;
-ENDPROC(twofish_xts_dec_16way)
--- a/arch/x86/crypto/twofish_avx2_glue.c
+++ b/arch/x86/crypto/twofish_avx2_glue.c
@ -1,584 +0,0 @@
-/*
- * Glue Code for x86_64/AVX2 assembler optimized version of Twofish
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/crypto.h>
-#include <linux/err.h>
-#include <crypto/algapi.h>
-#include <crypto/ctr.h>
-#include <crypto/twofish.h>
-#include <crypto/lrw.h>
-#include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
-#include <asm/crypto/twofish.h>
-#include <asm/crypto/ablk_helper.h>
-#include <asm/crypto/glue_helper.h>
-#include <crypto/scatterwalk.h>
-
-#define TF_AVX2_PARALLEL_BLOCKS 16
-
-/* 16-way AVX2 parallel cipher functions */
-asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst,
-				      const u8 *src);
-asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst,
-				      const u8 *src);
-asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
-
-asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src,
-				  le128 *iv);
-
-asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst,
-				      const u8 *src, le128 *iv);
-asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst,
-				      const u8 *src, le128 *iv);
-
-static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__twofish_enc_blk_3way(ctx, dst, src, false);
-}
-
-static const struct common_glue_ctx twofish_enc = {
-	.num_funcs = 4,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
-	}, {
-		.num_blocks = 3,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_ctr = {
-	.num_funcs = 4,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) }
-	},  {
-		.num_blocks = 8,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
-	}, {
-		.num_blocks = 3,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_enc_xts = {
-	.num_funcs = 3,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_dec = {
-	.num_funcs = 4,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
-	}, {
-		.num_blocks = 3,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_dec_cbc = {
-	.num_funcs = 4,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
-	}, {
-		.num_blocks = 3,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
-	} }
-};
-
-static const struct common_glue_ctx twofish_dec_xts = {
-	.num_funcs = 3,
-	.fpu_blocks_limit = 8,
-
-	.funcs = { {
-		.num_blocks = 16,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) }
-	}, {
-		.num_blocks = 8,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
-	} }
-};
-
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
-}
-
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
-}
-
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
-				       dst, src, nbytes);
-}
-
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
-				       nbytes);
-}
-
-static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		     struct scatterlist *src, unsigned int nbytes)
-{
-	return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
-}
-
-static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
-{
-	/* since reusing AVX functions, starts using FPU at 8 parallel blocks */
-	return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes);
-}
-
-static inline void twofish_fpu_end(bool fpu_enabled)
-{
-	glue_fpu_end(fpu_enabled);
-}
-
-struct crypt_priv {
-	struct twofish_ctx *ctx;
-	bool fpu_enabled;
-};
-
-static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
-{
-	const unsigned int bsize = TF_BLOCK_SIZE;
-	struct crypt_priv *ctx = priv;
-	int i;
-
-	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
-
-	while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
-		twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
-		nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
-	}
-
-	while (nbytes >= 8 * bsize) {
-		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * 8;
-		nbytes -= bsize * 8;
-	}
-
-	while (nbytes >= 3 * bsize) {
-		twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * 3;
-		nbytes -= bsize * 3;
-	}
-
-	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
-		twofish_enc_blk(ctx->ctx, srcdst, srcdst);
-}
-
-static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
-{
-	const unsigned int bsize = TF_BLOCK_SIZE;
-	struct crypt_priv *ctx = priv;
-	int i;
-
-	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
-
-	while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
-		twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
-		nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
-	}
-
-	while (nbytes >= 8 * bsize) {
-		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * 8;
-		nbytes -= bsize * 8;
-	}
-
-	while (nbytes >= 3 * bsize) {
-		twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
-		srcdst += bsize * 3;
-		nbytes -= bsize * 3;
-	}
-
-	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
-		twofish_dec_blk(ctx->ctx, srcdst, srcdst);
-}
-
-static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[TF_AVX2_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->twofish_ctx,
-		.fpu_enabled = false,
-	};
-	struct lrw_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.table_ctx = &ctx->lrw_table,
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = encrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = lrw_crypt(desc, dst, src, nbytes, &req);
-	twofish_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
-}
-
-static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[TF_AVX2_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->twofish_ctx,
-		.fpu_enabled = false,
-	};
-	struct lrw_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.table_ctx = &ctx->lrw_table,
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = decrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = lrw_crypt(desc, dst, src, nbytes, &req);
-	twofish_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
-}
-
-static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-
-	return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes,
-				     XTS_TWEAK_CAST(twofish_enc_blk),
-				     &ctx->tweak_ctx, &ctx->crypt_ctx);
-}
-
-static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-
-	return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes,
-				     XTS_TWEAK_CAST(twofish_enc_blk),
-				     &ctx->tweak_ctx, &ctx->crypt_ctx);
-}
-
-static struct crypto_alg tf_algs[10] = { {
-	.cra_name		= "__ecb-twofish-avx2",
-	.cra_driver_name	= "__driver-ecb-twofish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct twofish_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.setkey		= twofish_setkey,
-			.encrypt	= ecb_encrypt,
-			.decrypt	= ecb_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "__cbc-twofish-avx2",
-	.cra_driver_name	= "__driver-cbc-twofish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct twofish_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.setkey		= twofish_setkey,
-			.encrypt	= cbc_encrypt,
-			.decrypt	= cbc_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "__ctr-twofish-avx2",
-	.cra_driver_name	= "__driver-ctr-twofish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct twofish_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= twofish_setkey,
-			.encrypt	= ctr_crypt,
-			.decrypt	= ctr_crypt,
-		},
-	},
-}, {
-	.cra_name		= "__lrw-twofish-avx2",
-	.cra_driver_name	= "__driver-lrw-twofish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_exit		= lrw_twofish_exit_tfm,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE +
-					  TF_BLOCK_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE +
-					  TF_BLOCK_SIZE,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= lrw_twofish_setkey,
-			.encrypt	= lrw_encrypt,
-			.decrypt	= lrw_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "__xts-twofish-avx2",
-	.cra_driver_name	= "__driver-xts-twofish-avx2",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct twofish_xts_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.blkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE * 2,
-			.max_keysize	= TF_MAX_KEY_SIZE * 2,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= xts_twofish_setkey,
-			.encrypt	= xts_encrypt,
-			.decrypt	= xts_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "ecb(twofish)",
-	.cra_driver_name	= "ecb-twofish-avx2",
-	.cra_priority		= 500,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "cbc(twofish)",
-	.cra_driver_name	= "cbc-twofish-avx2",
-	.cra_priority		= 500,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= __ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "ctr(twofish)",
-	.cra_driver_name	= "ctr-twofish-avx2",
-	.cra_priority		= 500,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_encrypt,
-			.geniv		= "chainiv",
-		},
-	},
-}, {
-	.cra_name		= "lrw(twofish)",
-	.cra_driver_name	= "lrw-twofish-avx2",
-	.cra_priority		= 500,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE +
-					  TF_BLOCK_SIZE,
-			.max_keysize	= TF_MAX_KEY_SIZE +
-					  TF_BLOCK_SIZE,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-}, {
-	.cra_name		= "xts(twofish)",
-	.cra_driver_name	= "xts-twofish-avx2",
-	.cra_priority		= 500,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= TF_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_u = {
-		.ablkcipher = {
-			.min_keysize	= TF_MIN_KEY_SIZE * 2,
-			.max_keysize	= TF_MAX_KEY_SIZE * 2,
-			.ivsize		= TF_BLOCK_SIZE,
-			.setkey		= ablk_set_key,
-			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
-		},
-	},
-} };
-
-static int __init init(void)
-{
-	u64 xcr0;
-
-	if (!cpu_has_avx2 || !cpu_has_osxsave) {
-		pr_info("AVX2 instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX2 detected but unusable.\n");
-		return -ENODEV;
-	}
-
-	return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs));
-}
-
-static void __exit fini(void)
-{
-	crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs));
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized");
-MODULE_ALIAS("twofish");
-MODULE_ALIAS("twofish-asm");
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@ -50,26 +50,18 @@
 /* 8-way parallel cipher functions */
 asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
-EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way);
-
 asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
-EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way);

 asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
-EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way);
-
 asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
 				 const u8 *src, le128 *iv);
-EXPORT_SYMBOL_GPL(twofish_ctr_8way);

 asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src, le128 *iv);
-EXPORT_SYMBOL_GPL(twofish_xts_enc_8way);
 asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src, le128 *iv);
-EXPORT_SYMBOL_GPL(twofish_xts_dec_8way);

 static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 					const u8 *src)
@ -77,19 +69,17 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 	__twofish_enc_blk_3way(ctx, dst, src, false);
 }

-void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(twofish_enc_blk));
 }
-EXPORT_SYMBOL_GPL(twofish_xts_enc);

-void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(twofish_dec_blk));
 }
-EXPORT_SYMBOL_GPL(twofish_xts_dec);


 static const struct common_glue_ctx twofish_enc = {
--- a/arch/x86/include/asm/crypto/blowfish.h
+++ b/arch/x86/include/asm/crypto/blowfish.h
@ -1,43 +0,0 @@
-#ifndef ASM_X86_BLOWFISH_H
-#define ASM_X86_BLOWFISH_H
-
-#include <linux/crypto.h>
-#include <crypto/blowfish.h>
-
-#define BF_PARALLEL_BLOCKS 4
-
-/* regular block cipher functions */
-asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
-				   bool xor);
-asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
-
-/* 4-way parallel cipher functions */
-asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
-					const u8 *src, bool xor);
-asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
-				      const u8 *src);
-
-static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
-{
-	__blowfish_enc_blk(ctx, dst, src, false);
-}
-
-static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__blowfish_enc_blk(ctx, dst, src, true);
-}
-
-static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
-					 const u8 *src)
-{
-	__blowfish_enc_blk_4way(ctx, dst, src, false);
-}
-
-static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	__blowfish_enc_blk_4way(ctx, dst, src, true);
-}
-
-#endif
--- a/arch/x86/include/asm/crypto/twofish.h
+++ b/arch/x86/include/asm/crypto/twofish.h
@ -28,20 +28,6 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);

-/* 8-way parallel cipher functions */
-asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
-				     const u8 *src);
-asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
-				     const u8 *src);
-asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
-				     const u8 *src);
-asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
-				 const u8 *src, le128 *iv);
-asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
-				     const u8 *src, le128 *iv);
-asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
-				     const u8 *src, le128 *iv);
-
 /* helpers from twofish_x86_64-3way module */
 extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
 extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
@ -57,8 +43,4 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm);
 extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
 			      unsigned int keylen);

-/* helpers from twofish-avx module */
-extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
-extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
-
 #endif /* ASM_X86_TWOFISH_H */
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@ -376,6 +376,25 @@ config CRYPTO_CRC32_PCLMUL
 	  which will enable any routine to use the CRC-32-IEEE 802.3 checksum
 	  and gain better performance as compared with the table implementation.

+config CRYPTO_CRCT10DIF
+	tristate "CRCT10DIF algorithm"
+	select CRYPTO_HASH
+	help
+	  CRC T10 Data Integrity Field computation is being cast as
+	  a crypto transform.  This allows for faster CRC T10 DIF
+	  transforms to be used if they are available.
+
+config CRYPTO_CRCT10DIF_PCLMUL
+	tristate "CRCT10DIF PCLMULQDQ hardware acceleration"
+	depends on X86 && 64BIT && CRC_T10DIF
+	select CRYPTO_HASH
+	help
+	  For x86_64 processors with SSE4.2 and PCLMULQDQ supported,
+	  CRC T10 DIF computation can be hardware accelerated using
+	  the PCLMULQDQ instruction. This option will create the
+	  'crct10dif-pclmul' module, which is faster when computing the
+	  crct10dif checksum as compared with the generic table implementation.
+
 config CRYPTO_GHASH
 	tristate "GHASH digest algorithm"
 	select CRYPTO_GF128MUL
@ -820,25 +839,6 @@ config CRYPTO_BLOWFISH_X86_64
 	  See also:
 	  <http://www.schneier.com/blowfish.html>

-config CRYPTO_BLOWFISH_AVX2_X86_64
-	tristate "Blowfish cipher algorithm (x86_64/AVX2)"
-	depends on X86 && 64BIT
-	depends on BROKEN
-	select CRYPTO_ALGAPI
-	select CRYPTO_CRYPTD
-	select CRYPTO_ABLK_HELPER_X86
-	select CRYPTO_BLOWFISH_COMMON
-	select CRYPTO_BLOWFISH_X86_64
-	help
-	  Blowfish cipher algorithm (x86_64/AVX2), by Bruce Schneier.
-
-	  This is a variable key length cipher which can use keys from 32
-	  bits to 448 bits in length.  It's fast, simple and specifically
-	  designed for use on "large microprocessors".
-
-	  See also:
-	  <http://www.schneier.com/blowfish.html>
-
 config CRYPTO_CAMELLIA
 	tristate "Camellia cipher algorithms"
 	depends on CRYPTO
@ -1297,31 +1297,6 @@ config CRYPTO_TWOFISH_AVX_X86_64
 	  See also:
 	  <http://www.schneier.com/twofish.html>

-config CRYPTO_TWOFISH_AVX2_X86_64
-	tristate "Twofish cipher algorithm (x86_64/AVX2)"
-	depends on X86 && 64BIT
-	depends on BROKEN
-	select CRYPTO_ALGAPI
-	select CRYPTO_CRYPTD
-	select CRYPTO_ABLK_HELPER_X86
-	select CRYPTO_GLUE_HELPER_X86
-	select CRYPTO_TWOFISH_COMMON
-	select CRYPTO_TWOFISH_X86_64
-	select CRYPTO_TWOFISH_X86_64_3WAY
-	select CRYPTO_TWOFISH_AVX_X86_64
-	select CRYPTO_LRW
-	select CRYPTO_XTS
-	help
-	  Twofish cipher algorithm (x86_64/AVX2).
-
-	  Twofish was submitted as an AES (Advanced Encryption Standard)
-	  candidate cipher by researchers at CounterPane Systems.  It is a
-	  16 round block cipher supporting key sizes of 128, 192, and 256
-	  bits.
-
-	  See also:
-	  <http://www.schneier.com/twofish.html>
-
 comment "Compression"

 config CRYPTO_DEFLATE
--- a/crypto/Makefile
+++ b/crypto/Makefile
@ -83,6 +83,7 @@ obj-$(CONFIG_CRYPTO_ZLIB) += zlib.o
 obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
 obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o
 obj-$(CONFIG_CRYPTO_CRC32) += crc32.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif.o
 obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o
 obj-$(CONFIG_CRYPTO_LZO) += lzo.o
 obj-$(CONFIG_CRYPTO_842) += 842.o
--- a/crypto/crct10dif.c
+++ b/crypto/crct10dif.c
@ -0,0 +1,178 @@
+/*
+ * Cryptographic API.
+ *
+ * T10 Data Integrity Field CRC16 Crypto Transform
+ *
+ * Copyright (c) 2007 Oracle Corporation.  All rights reserved.
+ * Written by Martin K. Petersen <martin.petersen@oracle.com>
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc-t10dif.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+
+struct chksum_desc_ctx {	/* state kept in the shash descriptor context */
+	__u16 crc;	/* running CRC16 T10 DIF value carried between shash calls */
+};
+
+/* Table generated using the following polynomial:
+ * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1
+ * gt: 0x8bb7
+ */
+static const __u16 t10_dif_crc_table[256] = {
+	0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
+	0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
+	0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
+	0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
+	0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
+	0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
+	0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
+	0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
+	0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
+	0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
+	0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
+	0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
+	0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
+	0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
+	0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
+	0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
+	0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
+	0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
+	0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
+	0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
+	0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
+	0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
+	0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
+	0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
+	0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
+	0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
+	0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
+	0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
+	0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
+	0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
+	0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
+	0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
+};
+
+/*
+ * Table-driven CRC16 T10 DIF update.  Folds @len bytes from @buffer into
+ * the running @crc one byte at a time, most-significant byte first (the
+ * CRC is not bit-reflected), and returns the updated value.
+ */
+__u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, size_t len)
+{
+	size_t i;	/* same width as @len so the index cannot wrap early */
+
+	for (i = 0; i < len; i++)
+		crc = (crc << 8) ^ t10_dif_crc_table[((crc >> 8) ^ buffer[i]) & 0xff];
+
+	return crc;
+}
+EXPORT_SYMBOL(crc_t10dif_generic);
+
+/* shash ->init: begin a new computation with a CRC seed of zero. */
+static int chksum_init(struct shash_desc *desc)
+{
+	struct chksum_desc_ctx *state = shash_desc_ctx(desc);
+
+	state->crc = 0;
+	return 0;
+}
+
+/* shash ->update: fold @length bytes of @data into the running CRC. */
+static int chksum_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	struct chksum_desc_ctx *state = shash_desc_ctx(desc);
+	__u16 running = state->crc;
+
+	state->crc = crc_t10dif_generic(running, data, length);
+	return 0;
+}
+
+/* shash ->final: store the accumulated CRC into @out in CPU byte order. */
+static int chksum_final(struct shash_desc *desc, u8 *out)
+{
+	struct chksum_desc_ctx *state = shash_desc_ctx(desc);
+	__u16 *result = (__u16 *)out;
+
+	*result = state->crc;
+	return 0;
+}
+
+/*
+ * Continue the CRC held in *@crcp over @len bytes of @data and write the
+ * 16-bit result to @out.  *@crcp is only read, never updated.
+ */
+static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
+			u8 *out)
+{
+	const __u16 seed = *crcp;
+	__u16 *result = (__u16 *)out;
+
+	*result = crc_t10dif_generic(seed, data, len);
+	return 0;
+}
+
+/* shash ->finup: process the last @len bytes of @data and emit the CRC. */
+static int chksum_finup(struct shash_desc *desc, const u8 *data,
+			unsigned int len, u8 *out)
+{
+	struct chksum_desc_ctx *state = shash_desc_ctx(desc);
+	__u16 *running = &state->crc;
+
+	return __chksum_finup(running, data, len, out);
+}
+
+/*
+ * shash ->digest: one-shot CRC of @length bytes of @data, written to @out.
+ *
+ * The crypto API invokes ->digest without a preceding ->init, so the
+ * descriptor context has not been set up and must not be used as the
+ * seed.  Start from 0 instead, the same seed chksum_init() establishes.
+ */
+static int chksum_digest(struct shash_desc *desc, const u8 *data,
+			 unsigned int length, u8 *out)
+{
+	__u16 seed = 0;
+
+	return __chksum_finup(&seed, data, length, out);
+}
+
+/* Generic table-driven "crct10dif" shash, registered at priority 100. */
+static struct shash_alg alg = {
+	.base = {
+		.cra_name	 = "crct10dif",
+		.cra_driver_name = "crct10dif-generic",
+		.cra_priority	 = 100,
+		.cra_blocksize	 = CRC_T10DIF_BLOCK_SIZE,
+		.cra_module	 = THIS_MODULE,
+	},
+	.descsize	= sizeof(struct chksum_desc_ctx),
+	.digestsize	= CRC_T10DIF_DIGEST_SIZE,
+	.init		= chksum_init,
+	.update		= chksum_update,
+	.final		= chksum_final,
+	.finup		= chksum_finup,
+	.digest		= chksum_digest,
+};
+
+/* Module load: register the generic crct10dif shash with the crypto API. */
+static int __init crct10dif_mod_init(void)
+{
+	return crypto_register_shash(&alg);
+}
+
+/* Module unload: unregister the shash registered by crct10dif_mod_init(). */
+static void __exit crct10dif_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(crct10dif_mod_init);
+module_exit(crct10dif_mod_fini);
+
+MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
+MODULE_DESCRIPTION("T10 DIF CRC calculation.");
+MODULE_LICENSE("GPL");
--- a/crypto/sha512_generic.c
+++ b/crypto/sha512_generic.c
@ -251,6 +251,7 @@ static struct shash_alg sha512_algs[2] = { {
 	.descsize	=	sizeof(struct sha512_state),
 	.base		=	{
 		.cra_name	=	"sha512",
+		.cra_driver_name =	"sha512-generic",
 		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
 		.cra_blocksize	=	SHA512_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
@ -263,6 +264,7 @@ static struct shash_alg sha512_algs[2] = { {
 	.descsize	=	sizeof(struct sha512_state),
 	.base		=	{
 		.cra_name	=	"sha384",
+		.cra_driver_name =	"sha384-generic",
 		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
 		.cra_blocksize	=	SHA384_BLOCK_SIZE,
 		.cra_module	=	THIS_MODULE,
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@ -1174,6 +1174,10 @@ static int do_test(int m)
 		ret += tcrypt_test("ghash");
 		break;

+	case 47:
+		ret += tcrypt_test("crct10dif");
+		break;
+
 	case 100:
 		ret += tcrypt_test("hmac(md5)");
 		break;
@ -1498,6 +1502,10 @@ static int do_test(int m)
 		test_hash_speed("crc32c", sec, generic_hash_speed_template);
 		if (mode > 300 && mode < 400) break;

+	case 320:
+		test_hash_speed("crct10dif", sec, generic_hash_speed_template);
+		if (mode > 300 && mode < 400) break;
+
 	case 399:
 		break;

--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@ -184,8 +184,9 @@ static int do_one_async_hash_op(struct ahash_request *req,
 	return ret;
 }

-static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
-		     unsigned int tcount, bool use_digest)
+static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
+		       unsigned int tcount, bool use_digest,
+		       const int align_offset)
 {
 	const char *algo = crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm));
 	unsigned int i, j, k, temp;
@ -216,10 +217,15 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 		if (template[i].np)
 			continue;

+		ret = -EINVAL;
+		if (WARN_ON(align_offset + template[i].psize > PAGE_SIZE))
+			goto out;
+
 		j++;
 		memset(result, 0, 64);

 		hash_buff = xbuf[0];
+		hash_buff += align_offset;

 		memcpy(hash_buff, template[i].plaintext, template[i].psize);
 		sg_init_one(&sg[0], hash_buff, template[i].psize);
@ -281,6 +287,10 @@ static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,

 	j = 0;
 	for (i = 0; i < tcount; i++) {
+		/* alignment tests are only done with contiguous buffers */
+		if (align_offset != 0)
+			break;
+
 		if (template[i].np) {
 			j++;
 			memset(result, 0, 64);
@ -358,9 +368,36 @@ out_nobuf:
 	return ret;
 }

+/*
+ * Run the hash test vectors with three buffer offsets: 0, one byte, and
+ * (only when the algorithm declares an alignmask) alignmask + 1 bytes.
+ * Stops at the first failing pass and returns its error code; returns 0
+ * when every pass succeeds.
+ */
+static int test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
+		     unsigned int tcount, bool use_digest)
+{
+	unsigned int mask;
+	int err;
+
+	/* baseline: no extra offset */
+	err = __test_hash(tfm, template, tcount, use_digest, 0);
+	if (!err) {
+		/* test unaligned buffers, check with one byte offset */
+		err = __test_hash(tfm, template, tcount, use_digest, 1);
+	}
+	if (err)
+		return err;
+
+	mask = crypto_tfm_alg_alignmask(&tfm->base);
+	if (!mask)
+		return 0;
+
+	/* Check if alignment mask for tfm is correctly set. */
+	return __test_hash(tfm, template, tcount, use_digest, mask + 1);
+}
+
 static int __test_aead(struct crypto_aead *tfm, int enc,
 		       struct aead_testvec *template, unsigned int tcount,
-		       const bool diff_dst)
+		       const bool diff_dst, const int align_offset)
 {
 	const char *algo = crypto_tfm_alg_driver_name(crypto_aead_tfm(tfm));
 	unsigned int i, j, k, n, temp;
@ -423,15 +460,16 @@ static int __test_aead(struct crypto_aead *tfm, int enc,
 		if (!template[i].np) {
 			j++;

-			/* some tepmplates have no input data but they will
+			/* some templates have no input data but they will
 			 * touch input
 			 */
 			input = xbuf[0];
+			input += align_offset;
 			assoc = axbuf[0];

 			ret = -EINVAL;
-			if (WARN_ON(template[i].ilen > PAGE_SIZE ||
-				    template[i].alen > PAGE_SIZE))
+			if (WARN_ON(align_offset + template[i].ilen >
+				    PAGE_SIZE || template[i].alen > PAGE_SIZE))
 				goto out;

 			memcpy(input, template[i].input, template[i].ilen);
@ -470,6 +508,7 @@ static int __test_aead(struct crypto_aead *tfm, int enc,

 			if (diff_dst) {
 				output = xoutbuf[0];
+				output += align_offset;
 				sg_init_one(&sgout[0], output,
 					    template[i].ilen +
 						(enc ? authsize : 0));
@ -530,6 +569,10 @@ static int __test_aead(struct crypto_aead *tfm, int enc,
 	}

 	for (i = 0, j = 0; i < tcount; i++) {
+		/* alignment tests are only done with contiguous buffers */
+		if (align_offset != 0)
+			break;
+
 		if (template[i].np) {
 			j++;

@ -732,15 +775,34 @@ out_noxbuf:
 static int test_aead(struct crypto_aead *tfm, int enc,
 		     struct aead_testvec *template, unsigned int tcount)
 {
+	unsigned int alignmask;
 	int ret;

 	/* test 'dst == src' case */
-	ret = __test_aead(tfm, enc, template, tcount, false);
+	ret = __test_aead(tfm, enc, template, tcount, false, 0);
 	if (ret)
 		return ret;

 	/* test 'dst != src' case */
-	return __test_aead(tfm, enc, template, tcount, true);
+	ret = __test_aead(tfm, enc, template, tcount, true, 0);
+	if (ret)
+		return ret;
+
+	/* test unaligned buffers, check with one byte offset */
+	ret = __test_aead(tfm, enc, template, tcount, true, 1);
+	if (ret)
+		return ret;
+
+	alignmask = crypto_tfm_alg_alignmask(&tfm->base);
+	if (alignmask) {
+		/* Check if alignment mask for tfm is correctly set. */
+		ret = __test_aead(tfm, enc, template, tcount, true,
+				  alignmask + 1);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
 }

 static int test_cipher(struct crypto_cipher *tfm, int enc,
@ -820,7 +882,7 @@ out_nobuf:

 static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc,
 			   struct cipher_testvec *template, unsigned int tcount,
-			   const bool diff_dst)
+			   const bool diff_dst, const int align_offset)
 {
 	const char *algo =
 		crypto_tfm_alg_driver_name(crypto_ablkcipher_tfm(tfm));
@ -876,10 +938,12 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc,
 			j++;

 			ret = -EINVAL;
-			if (WARN_ON(template[i].ilen > PAGE_SIZE))
+			if (WARN_ON(align_offset + template[i].ilen >
+				    PAGE_SIZE))
 				goto out;

 			data = xbuf[0];
+			data += align_offset;
 			memcpy(data, template[i].input, template[i].ilen);

 			crypto_ablkcipher_clear_flags(tfm, ~0);
@ -900,6 +964,7 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc,
 			sg_init_one(&sg[0], data, template[i].ilen);
 			if (diff_dst) {
 				data = xoutbuf[0];
+				data += align_offset;
 				sg_init_one(&sgout[0], data, template[i].ilen);
 			}

@ -941,6 +1006,9 @@ static int __test_skcipher(struct crypto_ablkcipher *tfm, int enc,

 	j = 0;
 	for (i = 0; i < tcount; i++) {
+		/* alignment tests are only done with contiguous buffers */
+		if (align_offset != 0)
+			break;

 		if (template[i].iv)
 			memcpy(iv, template[i].iv, MAX_IVLEN);
@ -1075,15 +1143,34 @@ out_nobuf:
 static int test_skcipher(struct crypto_ablkcipher *tfm, int enc,
 			 struct cipher_testvec *template, unsigned int tcount)
 {
+	unsigned int alignmask;
 	int ret;

 	/* test 'dst == src' case */
-	ret = __test_skcipher(tfm, enc, template, tcount, false);
+	ret = __test_skcipher(tfm, enc, template, tcount, false, 0);
 	if (ret)
 		return ret;

 	/* test 'dst != src' case */
-	return __test_skcipher(tfm, enc, template, tcount, true);
+	ret = __test_skcipher(tfm, enc, template, tcount, true, 0);
+	if (ret)
+		return ret;
+
+	/* test unaligned buffers, check with one byte offset */
+	ret = __test_skcipher(tfm, enc, template, tcount, true, 1);
+	if (ret)
+		return ret;
+
+	alignmask = crypto_tfm_alg_alignmask(&tfm->base);
+	if (alignmask) {
+		/* Check if alignment mask for tfm is correctly set. */
+		ret = __test_skcipher(tfm, enc, template, tcount, true,
+				      alignmask + 1);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
 }

 static int test_comp(struct crypto_comp *tfm, struct comp_testvec *ctemplate,
@ -1653,16 +1740,10 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__cbc-twofish-avx",
 		.test = alg_test_null,
-	}, {
-		.alg = "__cbc-twofish-avx2",
-		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-aes-aesni",
 		.test = alg_test_null,
 		.fips_allowed = 1,
-	}, {
-		.alg = "__driver-cbc-blowfish-avx2",
-		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-camellia-aesni",
 		.test = alg_test_null,
@ -1687,16 +1768,10 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__driver-cbc-twofish-avx",
 		.test = alg_test_null,
-	}, {
-		.alg = "__driver-cbc-twofish-avx2",
-		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-aes-aesni",
 		.test = alg_test_null,
 		.fips_allowed = 1,
-	}, {
-		.alg = "__driver-ecb-blowfish-avx2",
-		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-camellia-aesni",
 		.test = alg_test_null,
@ -1721,9 +1796,6 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__driver-ecb-twofish-avx",
 		.test = alg_test_null,
-	}, {
-		.alg = "__driver-ecb-twofish-avx2",
-		.test = alg_test_null,
 	}, {
 		.alg = "__ghash-pclmulqdqni",
 		.test = alg_test_null,
@ -1973,13 +2045,20 @@ static const struct alg_test_desc alg_test_descs[] = {
 				.count = CRC32C_TEST_VECTORS
 			}
 		}
+	}, {
+		.alg = "crct10dif",
+		.test = alg_test_hash,
+		.fips_allowed = 1,
+		.suite = {
+			.hash = {
+				.vecs = crct10dif_tv_template,
+				.count = CRCT10DIF_TEST_VECTORS
+			}
+		}
 	}, {
 		.alg = "cryptd(__driver-cbc-aes-aesni)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
-	}, {
-		.alg = "cryptd(__driver-cbc-blowfish-avx2)",
-		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-cbc-camellia-aesni)",
 		.test = alg_test_null,
@ -1993,9 +2072,6 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "cryptd(__driver-ecb-aes-aesni)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
-	}, {
-		.alg = "cryptd(__driver-ecb-blowfish-avx2)",
-		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-camellia-aesni)",
 		.test = alg_test_null,
@ -2020,9 +2096,6 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "cryptd(__driver-ecb-twofish-avx)",
 		.test = alg_test_null,
-	}, {
-		.alg = "cryptd(__driver-ecb-twofish-avx2)",
-		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-gcm-aes-aesni)",
 		.test = alg_test_null,
@ -3068,6 +3141,35 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}
 };

+/* Set once the ordering check below has run, so it is not repeated. */
+static bool alg_test_descs_checked;
+
+/*
+ * Walk alg_test_descs[] once and warn about any adjacent pair of entries
+ * whose .alg names are out of strcmp() order or identical.  Purely
+ * diagnostic: it only emits warnings and never modifies the table.
+ */
+static void alg_test_descs_check_order(void)
+{
+	int idx;
+
+	if (alg_test_descs_checked)
+		return;
+	alg_test_descs_checked = true;
+
+	for (idx = 1; idx < ARRAY_SIZE(alg_test_descs); idx++) {
+		const char *prev = alg_test_descs[idx - 1].alg;
+		const char *curr = alg_test_descs[idx].alg;
+		int order = strcmp(prev, curr);
+
+		if (WARN_ON(order > 0))
+			pr_warn("testmgr: alg_test_descs entries in wrong order: '%s' before '%s'\n",
+				prev, curr);
+
+		if (WARN_ON(order == 0))
+			pr_warn("testmgr: duplicate alg_test_descs entry: '%s'\n",
+				curr);
+	}
+}
+
 static int alg_find_test(const char *alg)
 {
 	int start = 0;
@ -3099,6 +3201,8 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask)
 	int j;
 	int rc;

+	alg_test_descs_check_order();
+
 	if ((type & CRYPTO_ALG_TYPE_MASK) == CRYPTO_ALG_TYPE_CIPHER) {
 		char nalg[CRYPTO_MAX_ALG_NAME];

--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@ -450,6 +450,39 @@ static struct hash_testvec rmd320_tv_template[] = {
 	}
 };

+#define CRCT10DIF_TEST_VECTORS	3	/* number of entries in crct10dif_tv_template[] */
+static struct hash_testvec crct10dif_tv_template[] = {
+	{
+		.plaintext = "abc",
+		.psize  = 3,
+#ifdef __LITTLE_ENDIAN	/* digest bytes are the 16-bit CRC in CPU byte order */
+		.digest = "\x3b\x44",
+#else
+		.digest = "\x44\x3b",
+#endif
+	}, {
+		.plaintext = "1234567890123456789012345678901234567890"
+			     "123456789012345678901234567890123456789",
+		.psize	= 79,
+#ifdef __LITTLE_ENDIAN
+		.digest	= "\x70\x4b",
+#else
+		.digest	= "\x4b\x70",
+#endif
+	}, {
+		.plaintext =
+		"abcddddddddddddddddddddddddddddddddddddddddddddddddddddd",
+		.psize  = 56,
+#ifdef __LITTLE_ENDIAN
+		.digest = "\xe3\x9c",
+#else
+		.digest = "\x9c\xe3",
+#endif
+		.np     = 2,	/* NOTE(review): presumably feeds the input as two chunks sized by .tap - confirm */
+		.tap    = { 28, 28 }
+	}
+};
+
 /*
 * SHA1 test vectors  from from FIPS PUB 180-1
 * Long vector from CAVS 5.0
--- a/drivers/char/hw_random/atmel-rng.c
+++ b/drivers/char/hw_random/atmel-rng.c
@ -108,8 +108,6 @@ static int atmel_trng_remove(struct platform_device *pdev)
 	clk_disable(trng->clk);
 	clk_put(trng->clk);

-	platform_set_drvdata(pdev, NULL);
-
 	return 0;
 }

--- a/drivers/char/hw_random/bcm63xx-rng.c
+++ b/drivers/char/hw_random/bcm63xx-rng.c
@ -137,7 +137,6 @@ static int bcm63xx_rng_probe(struct platform_device *pdev)
 out_clk_disable:
 	clk_disable(clk);
 out_free_rng:
-	platform_set_drvdata(pdev, NULL);
 	kfree(rng);
 out_free_priv:
 	kfree(priv);
@ -154,7 +153,6 @@ static int bcm63xx_rng_remove(struct platform_device *pdev)
 	clk_disable(priv->clk);
 	kfree(priv);
 	kfree(rng);
-	platform_set_drvdata(pdev, NULL);

 	return 0;
 }
--- a/drivers/char/hw_random/n2-drv.c
+++ b/drivers/char/hw_random/n2-drv.c
@ -700,7 +700,7 @@ static int n2rng_probe(struct platform_device *op)
 	if (err)
 		goto out_free_units;

-	dev_set_drvdata(&op->dev, np);
+	platform_set_drvdata(op, np);

 	schedule_delayed_work(&np->work, 0);

@ -721,7 +721,7 @@ out:

 static int n2rng_remove(struct platform_device *op)
 {
-	struct n2rng *np = dev_get_drvdata(&op->dev);
+	struct n2rng *np = platform_get_drvdata(op);

 	np->flags |= N2RNG_FLAG_SHUTDOWN;

@ -736,8 +736,6 @@ static int n2rng_remove(struct platform_device *op)

 	kfree(np);

-	dev_set_drvdata(&op->dev, NULL);
-
 	return 0;
 }

--- a/drivers/char/hw_random/nomadik-rng.c
+++ b/drivers/char/hw_random/nomadik-rng.c
@ -51,7 +51,7 @@ static int nmk_rng_probe(struct amba_device *dev, const struct amba_id *id)
 		return ret;
 	}

-	clk_enable(rng_clk);
+	clk_prepare_enable(rng_clk);

 	ret = amba_request_regions(dev, dev->dev.init_name);
 	if (ret)
--- a/drivers/char/hw_random/octeon-rng.c
+++ b/drivers/char/hw_random/octeon-rng.c
@ -96,7 +96,7 @@ static int octeon_rng_probe(struct platform_device *pdev)

 	rng->ops = ops;

-	dev_set_drvdata(&pdev->dev, &rng->ops);
+	platform_set_drvdata(pdev, &rng->ops);
 	ret = hwrng_register(&rng->ops);
 	if (ret)
 		return -ENOENT;
@ -108,7 +108,7 @@ static int octeon_rng_probe(struct platform_device *pdev)

 static int __exit octeon_rng_remove(struct platform_device *pdev)
 {
-	struct hwrng *rng = dev_get_drvdata(&pdev->dev);
+	struct hwrng *rng = platform_get_drvdata(pdev);

 	hwrng_unregister(rng);

--- a/drivers/char/hw_random/omap-rng.c
+++ b/drivers/char/hw_random/omap-rng.c
@ -116,7 +116,7 @@ static int omap_rng_probe(struct platform_device *pdev)
 	};

 	omap_rng_ops.priv = (unsigned long)priv;
-	dev_set_drvdata(&pdev->dev, priv);
+	platform_set_drvdata(pdev, priv);

 	priv->mem_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	priv->base = devm_ioremap_resource(&pdev->dev, priv->mem_res);
@ -124,7 +124,7 @@ static int omap_rng_probe(struct platform_device *pdev)
 		ret = PTR_ERR(priv->base);
 		goto err_ioremap;
 	}
-	dev_set_drvdata(&pdev->dev, priv);
+	platform_set_drvdata(pdev, priv);

 	pm_runtime_enable(&pdev->dev);
 	pm_runtime_get_sync(&pdev->dev);
@ -151,7 +151,7 @@ err_ioremap:

 static int __exit omap_rng_remove(struct platform_device *pdev)
 {
-	struct omap_rng_private_data *priv = dev_get_drvdata(&pdev->dev);
+	struct omap_rng_private_data *priv = platform_get_drvdata(pdev);

 	hwrng_unregister(&omap_rng_ops);

--- a/drivers/char/hw_random/timeriomem-rng.c
+++ b/drivers/char/hw_random/timeriomem-rng.c
@ -192,7 +192,6 @@ out_release_io:
 out_timer:
 	del_timer_sync(&priv->timer);
 out_free:
-	platform_set_drvdata(pdev, NULL);
 	kfree(priv);
 	return err;
 }
@ -209,7 +208,6 @@ static int timeriomem_rng_remove(struct platform_device *pdev)
 	del_timer_sync(&priv->timer);
 	iounmap(priv->io_base);
 	release_mem_region(res->start, resource_size(res));
-	platform_set_drvdata(pdev, NULL);
 	kfree(priv);

 	return 0;
--- a/drivers/char/hw_random/tx4939-rng.c
+++ b/drivers/char/hw_random/tx4939-rng.c
@ -154,7 +154,6 @@ static int __exit tx4939_rng_remove(struct platform_device *dev)
 	struct tx4939_rng *rngdev = platform_get_drvdata(dev);

 	hwrng_unregister(&rngdev->rng);
-	platform_set_drvdata(dev, NULL);
 	return 0;
 }

--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@ -278,7 +278,7 @@ config CRYPTO_DEV_PICOXCELL

 config CRYPTO_DEV_SAHARA
 	tristate "Support for SAHARA crypto accelerator"
-	depends on ARCH_MXC && EXPERIMENTAL && OF
+	depends on ARCH_MXC && OF
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_AES
 	select CRYPTO_ECB
@ -286,6 +286,16 @@ config CRYPTO_DEV_SAHARA
 	  This option enables support for the SAHARA HW crypto accelerator
 	  found in some Freescale i.MX chips.

+config CRYPTO_DEV_DCP
+	tristate "Support for the DCP engine"
+	depends on ARCH_MXS && OF
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_AES
+	select CRYPTO_CBC
+	help
+	  This option enables support for the hardware crypto-acceleration
+	  capabilities of the DCP co-processor.
+
 config CRYPTO_DEV_S5P
 	tristate "Support for Samsung S5PV210 crypto accelerator"
 	depends on ARCH_S5PV210
--- a/drivers/crypto/Makefile
+++ b/drivers/crypto/Makefile
@ -13,6 +13,7 @@ obj-$(CONFIG_CRYPTO_DEV_OMAP_SHAM) += omap-sham.o
 obj-$(CONFIG_CRYPTO_DEV_OMAP_AES) += omap-aes.o
 obj-$(CONFIG_CRYPTO_DEV_PICOXCELL) += picoxcell_crypto.o
 obj-$(CONFIG_CRYPTO_DEV_SAHARA) += sahara.o
+obj-$(CONFIG_CRYPTO_DEV_DCP) += dcp.o
 obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o
 obj-$(CONFIG_CRYPTO_DEV_TEGRA_AES) += tegra-aes.o
 obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@ -202,6 +202,7 @@ static int caam_probe(struct platform_device *pdev)
 #ifdef CONFIG_DEBUG_FS
 	struct caam_perfmon *perfmon;
 #endif
+	u64 cha_vid;

 	ctrlpriv = kzalloc(sizeof(struct caam_drv_private), GFP_KERNEL);
 	if (!ctrlpriv)
@ -293,11 +294,14 @@ static int caam_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	}

+	cha_vid = rd_reg64(&topregs->ctrl.perfmon.cha_id);
+
 	/*
-	 * RNG4 based SECs (v5+) need special initialization prior
-	 * to executing any descriptors
+	 * If the SEC has RNG version >= 4 and the RNG state handle has not
+	 * already been instantiated, do the RNG instantiation
 	 */
-	if (of_device_is_compatible(nprop, "fsl,sec-v5.0")) {
+	if ((cha_vid & CHA_ID_RNG_MASK) >> CHA_ID_RNG_SHIFT >= 4 &&
+	    !(rd_reg32(&topregs->ctrl.r4tst[0].rdsta) & RDSTA_IF0)) {
 		kick_trng(pdev);
 		ret = instantiate_rng(ctrlpriv->jrdev[0]);
 		if (ret) {
--- a/drivers/crypto/caam/desc.h
+++ b/drivers/crypto/caam/desc.h
@ -231,7 +231,12 @@ struct sec4_sg_entry {
 #define LDST_SRCDST_WORD_PKHA_B_SZ	(0x11 << LDST_SRCDST_SHIFT)
 #define LDST_SRCDST_WORD_PKHA_N_SZ	(0x12 << LDST_SRCDST_SHIFT)
 #define LDST_SRCDST_WORD_PKHA_E_SZ	(0x13 << LDST_SRCDST_SHIFT)
+#define LDST_SRCDST_WORD_CLASS_CTX	(0x20 << LDST_SRCDST_SHIFT)
 #define LDST_SRCDST_WORD_DESCBUF	(0x40 << LDST_SRCDST_SHIFT)
+#define LDST_SRCDST_WORD_DESCBUF_JOB	(0x41 << LDST_SRCDST_SHIFT)
+#define LDST_SRCDST_WORD_DESCBUF_SHARED	(0x42 << LDST_SRCDST_SHIFT)
+#define LDST_SRCDST_WORD_DESCBUF_JOB_WE	(0x45 << LDST_SRCDST_SHIFT)
+#define LDST_SRCDST_WORD_DESCBUF_SHARED_WE (0x46 << LDST_SRCDST_SHIFT)
 #define LDST_SRCDST_WORD_INFO_FIFO	(0x7a << LDST_SRCDST_SHIFT)

 /* Offset in source/destination */
@ -366,6 +371,7 @@ struct sec4_sg_entry {
 #define FIFOLD_TYPE_LAST2FLUSH1 (0x05 << FIFOLD_TYPE_SHIFT)
 #define FIFOLD_TYPE_LASTBOTH	(0x06 << FIFOLD_TYPE_SHIFT)
 #define FIFOLD_TYPE_LASTBOTHFL	(0x07 << FIFOLD_TYPE_SHIFT)
+#define FIFOLD_TYPE_NOINFOFIFO	(0x0F << FIFOLD_TYPE_SHIFT)

 #define FIFOLDST_LEN_MASK	0xffff
 #define FIFOLDST_EXT_LEN_MASK	0xffffffff
@ -1294,10 +1300,10 @@ struct sec4_sg_entry {
 #define SQOUT_SGF	0x01000000

 /* Appends to a previous pointer */
-#define SQOUT_PRE	0x00800000
+#define SQOUT_PRE	SQIN_PRE

 /* Restore sequence with pointer/length */
-#define SQOUT_RTO	0x00200000
+#define SQOUT_RTO	 SQIN_RTO

 /* Use extended length following pointer */
 #define SQOUT_EXT	0x00400000
@ -1359,6 +1365,7 @@ struct sec4_sg_entry {
 #define MOVE_DEST_MATH3		(0x07 << MOVE_DEST_SHIFT)
 #define MOVE_DEST_CLASS1INFIFO	(0x08 << MOVE_DEST_SHIFT)
 #define MOVE_DEST_CLASS2INFIFO	(0x09 << MOVE_DEST_SHIFT)
+#define MOVE_DEST_INFIFO_NOINFO (0x0a << MOVE_DEST_SHIFT)
 #define MOVE_DEST_PK_A		(0x0c << MOVE_DEST_SHIFT)
 #define MOVE_DEST_CLASS1KEY	(0x0d << MOVE_DEST_SHIFT)
 #define MOVE_DEST_CLASS2KEY	(0x0e << MOVE_DEST_SHIFT)
@ -1411,6 +1418,7 @@ struct sec4_sg_entry {
 #define MATH_SRC0_REG2		(0x02 << MATH_SRC0_SHIFT)
 #define MATH_SRC0_REG3		(0x03 << MATH_SRC0_SHIFT)
 #define MATH_SRC0_IMM		(0x04 << MATH_SRC0_SHIFT)
+#define MATH_SRC0_DPOVRD	(0x07 << MATH_SRC0_SHIFT)
 #define MATH_SRC0_SEQINLEN	(0x08 << MATH_SRC0_SHIFT)
 #define MATH_SRC0_SEQOUTLEN	(0x09 << MATH_SRC0_SHIFT)
 #define MATH_SRC0_VARSEQINLEN	(0x0a << MATH_SRC0_SHIFT)
@ -1425,6 +1433,7 @@ struct sec4_sg_entry {
 #define MATH_SRC1_REG2		(0x02 << MATH_SRC1_SHIFT)
 #define MATH_SRC1_REG3		(0x03 << MATH_SRC1_SHIFT)
 #define MATH_SRC1_IMM		(0x04 << MATH_SRC1_SHIFT)
+#define MATH_SRC1_DPOVRD	(0x07 << MATH_SRC1_SHIFT)
 #define MATH_SRC1_INFIFO	(0x0a << MATH_SRC1_SHIFT)
 #define MATH_SRC1_OUTFIFO	(0x0b << MATH_SRC1_SHIFT)
 #define MATH_SRC1_ONE		(0x0c << MATH_SRC1_SHIFT)
@ -1600,4 +1609,13 @@ struct sec4_sg_entry {
 #define NFIFOENTRY_PLEN_SHIFT	0
 #define NFIFOENTRY_PLEN_MASK	(0xFF << NFIFOENTRY_PLEN_SHIFT)

+/* Append Load Immediate Command */
+#define FD_CMD_APPEND_LOAD_IMMEDIATE			0x80000000
+
+/* Set SEQ LIODN equal to the Non-SEQ LIODN for the job */
+#define FD_CMD_SET_SEQ_LIODN_EQUAL_NONSEQ_LIODN		0x40000000
+
+/* Frame Descriptor Command for Replacement Job Descriptor */
+#define FD_CMD_REPLACE_JOB_DESC				0x20000000
+
 #endif /* DESC_H */
--- a/drivers/crypto/caam/desc_constr.h
+++ b/drivers/crypto/caam/desc_constr.h
@ -110,6 +110,26 @@ static inline void append_cmd(u32 *desc, u32 command)
 	(*desc)++;
 }

+#define append_u32 append_cmd
+
+/*
+ * Append a 64-bit value at the current end of the descriptor as two
+ * consecutive 32-bit words, upper half first, then add two to the word
+ * stored at *desc.
+ */
+static inline void append_u64(u32 *desc, u64 data)
+{
+	u32 *words = desc_end(desc);
+
+	words[0] = upper_32_bits(data);
+	words[1] = lower_32_bits(data);
+
+	*desc += 2;
+}
+
+/*
+ * Store @command at @desc without touching the descriptor header and
+ * return the address of the word that follows the one just written.
+ */
+static inline u32 *write_cmd(u32 *desc, u32 command)
+{
+	*desc++ = command;
+
+	return desc;
+}
+
 static inline void append_cmd_ptr(u32 *desc, dma_addr_t ptr, int len,
 				  u32 command)
 {
@ -122,7 +142,8 @@ static inline void append_cmd_ptr_extlen(u32 *desc, dma_addr_t ptr,
 					 unsigned int len, u32 command)
 {
 	append_cmd(desc, command);
-	append_ptr(desc, ptr);
+	if (!(command & (SQIN_RTO | SQIN_PRE)))
+		append_ptr(desc, ptr);
 	append_cmd(desc, len);
 }

@ -176,17 +197,36 @@ static inline void append_##cmd(u32 *desc, dma_addr_t ptr, unsigned int len, \
 }
 APPEND_CMD_PTR(key, KEY)
 APPEND_CMD_PTR(load, LOAD)
-APPEND_CMD_PTR(store, STORE)
 APPEND_CMD_PTR(fifo_load, FIFO_LOAD)
 APPEND_CMD_PTR(fifo_store, FIFO_STORE)

+/*
+ * Append a STORE command built from @options and @len. The pointer word
+ * is appended as well, except when the source/destination field of
+ * @options selects one of the descriptor-buffer variants listed below.
+ */
+static inline void append_store(u32 *desc, dma_addr_t ptr, unsigned int len,
+				u32 options)
+{
+	append_cmd(desc, CMD_STORE | options | len);
+
+	switch (options & LDST_SRCDST_MASK) {
+	case LDST_SRCDST_WORD_DESCBUF_SHARED:
+	case LDST_SRCDST_WORD_DESCBUF_JOB:
+	case LDST_SRCDST_WORD_DESCBUF_JOB_WE:
+	case LDST_SRCDST_WORD_DESCBUF_SHARED_WE:
+		/* The following options do not require pointer */
+		break;
+	default:
+		append_ptr(desc, ptr);
+		break;
+	}
+}
+
 #define APPEND_SEQ_PTR_INTLEN(cmd, op) \
 static inline void append_seq_##cmd##_ptr_intlen(u32 *desc, dma_addr_t ptr, \
 						 unsigned int len, \
 						 u32 options) \
 { \
 	PRINT_POS; \
-	append_cmd_ptr(desc, ptr, len, CMD_SEQ_##op##_PTR | options); \
+	if (options & (SQIN_RTO | SQIN_PRE)) \
+		append_cmd(desc, CMD_SEQ_##op##_PTR | len | options); \
+	else \
+		append_cmd_ptr(desc, ptr, len, CMD_SEQ_##op##_PTR | options); \
 }
 APPEND_SEQ_PTR_INTLEN(in, IN)
 APPEND_SEQ_PTR_INTLEN(out, OUT)
@ -259,7 +299,7 @@ APPEND_CMD_RAW_IMM(load, LOAD, u32);
 */
 #define APPEND_MATH(op, desc, dest, src_0, src_1, len) \
 append_cmd(desc, CMD_MATH | MATH_FUN_##op | MATH_DEST_##dest | \
-	   MATH_SRC0_##src_0 | MATH_SRC1_##src_1 | (u32) (len & MATH_LEN_MASK));
+	MATH_SRC0_##src_0 | MATH_SRC1_##src_1 | (u32)len);

 #define append_math_add(desc, dest, src0, src1, len) \
 	APPEND_MATH(ADD, desc, dest, src0, src1, len)
@ -279,6 +319,8 @@ append_cmd(desc, CMD_MATH | MATH_FUN_##op | MATH_DEST_##dest | \
 	APPEND_MATH(LSHIFT, desc, dest, src0, src1, len)
 #define append_math_rshift(desc, dest, src0, src1, len) \
 	APPEND_MATH(RSHIFT, desc, dest, src0, src1, len)
+#define append_math_ldshift(desc, dest, src0, src1, len) \
+	APPEND_MATH(SHLD, desc, dest, src0, src1, len)

 /* Exactly one source is IMM. Data is passed in as u32 value */
 #define APPEND_MATH_IMM_u32(op, desc, dest, src_0, src_1, data) \
@ -305,3 +347,34 @@ do { \
 	APPEND_MATH_IMM_u32(LSHIFT, desc, dest, src0, src1, data)
 #define append_math_rshift_imm_u32(desc, dest, src0, src1, data) \
 	APPEND_MATH_IMM_u32(RSHIFT, desc, dest, src0, src1, data)
+
+/*
+ * Exactly one source is IMM. Data is passed in as u64 value.
+ *
+ * If the upper 32 bits of the data are zero, MATH_IFB is set in the
+ * command and a single 32-bit word is appended; otherwise both words
+ * are appended via append_u64().
+ *
+ * @data is parenthesized so that compound expressions expand correctly;
+ * it is still expanded more than once and must be free of side effects.
+ */
+#define APPEND_MATH_IMM_u64(op, desc, dest, src_0, src_1, data) \
+do { \
+	u32 upper = ((data) >> 16) >> 16; \
+	APPEND_MATH(op, desc, dest, src_0, src_1, CAAM_CMD_SZ * 2 | \
+		    (upper ? 0 : MATH_IFB)); \
+	if (upper) \
+		append_u64(desc, (data)); \
+	else \
+		append_u32(desc, (data)); \
+} while (0)
+
+#define append_math_add_imm_u64(desc, dest, src0, src1, data) \
+	APPEND_MATH_IMM_u64(ADD, desc, dest, src0, src1, data)
+#define append_math_sub_imm_u64(desc, dest, src0, src1, data) \
+	APPEND_MATH_IMM_u64(SUB, desc, dest, src0, src1, data)
+#define append_math_add_c_imm_u64(desc, dest, src0, src1, data) \
+	APPEND_MATH_IMM_u64(ADDC, desc, dest, src0, src1, data)
+#define append_math_sub_b_imm_u64(desc, dest, src0, src1, data) \
+	APPEND_MATH_IMM_u64(SUBB, desc, dest, src0, src1, data)
+#define append_math_and_imm_u64(desc, dest, src0, src1, data) \
+	APPEND_MATH_IMM_u64(AND, desc, dest, src0, src1, data)
+#define append_math_or_imm_u64(desc, dest, src0, src1, data) \
+	APPEND_MATH_IMM_u64(OR, desc, dest, src0, src1, data)
+#define append_math_xor_imm_u64(desc, dest, src0, src1, data) \
+	APPEND_MATH_IMM_u64(XOR, desc, dest, src0, src1, data)
+#define append_math_lshift_imm_u64(desc, dest, src0, src1, data) \
+	APPEND_MATH_IMM_u64(LSHIFT, desc, dest, src0, src1, data)
+#define append_math_rshift_imm_u64(desc, dest, src0, src1, data) \
+	APPEND_MATH_IMM_u64(RSHIFT, desc, dest, src0, src1, data)
--- a/drivers/crypto/caam/pdb.h
+++ b/drivers/crypto/caam/pdb.h
@ -44,6 +44,7 @@
 #define PDBOPTS_ESP_IPHDRSRC	0x08 /* IP header comes from PDB (encap) */
 #define PDBOPTS_ESP_INCIPHDR	0x04 /* Prepend IP header to output frame */
 #define PDBOPTS_ESP_IPVSN	0x02 /* process IPv6 header */
+#define PDBOPTS_ESP_AOFL	0x04 /* adjust out frame len (decap, SEC>=5.3)*/
 #define PDBOPTS_ESP_TUNNEL	0x01 /* tunnel mode next-header byte */
 #define PDBOPTS_ESP_IPV6	0x02 /* ip header version is V6 */
 #define PDBOPTS_ESP_DIFFSERV	0x40 /* copy TOS/TC from inner iphdr */
--- a/drivers/crypto/caam/regs.h
+++ b/drivers/crypto/caam/regs.h
@ -117,6 +117,43 @@ struct jr_outentry {
 #define CHA_NUM_DECONUM_SHIFT	56
 #define CHA_NUM_DECONUM_MASK	(0xfull << CHA_NUM_DECONUM_SHIFT)

+/* CHA Version IDs */
+#define CHA_ID_AES_SHIFT	0
+#define CHA_ID_AES_MASK		(0xfull << CHA_ID_AES_SHIFT)
+
+#define CHA_ID_DES_SHIFT	4
+#define CHA_ID_DES_MASK		(0xfull << CHA_ID_DES_SHIFT)
+
+#define CHA_ID_ARC4_SHIFT	8
+#define CHA_ID_ARC4_MASK	(0xfull << CHA_ID_ARC4_SHIFT)
+
+#define CHA_ID_MD_SHIFT		12
+#define CHA_ID_MD_MASK		(0xfull << CHA_ID_MD_SHIFT)
+
+#define CHA_ID_RNG_SHIFT	16
+#define CHA_ID_RNG_MASK		(0xfull << CHA_ID_RNG_SHIFT)
+
+#define CHA_ID_SNW8_SHIFT	20
+#define CHA_ID_SNW8_MASK	(0xfull << CHA_ID_SNW8_SHIFT)
+
+#define CHA_ID_KAS_SHIFT	24
+#define CHA_ID_KAS_MASK		(0xfull << CHA_ID_KAS_SHIFT)
+
+#define CHA_ID_PK_SHIFT		28
+#define CHA_ID_PK_MASK		(0xfull << CHA_ID_PK_SHIFT)
+
+#define CHA_ID_CRC_SHIFT	32
+#define CHA_ID_CRC_MASK		(0xfull << CHA_ID_CRC_SHIFT)
+
+#define CHA_ID_SNW9_SHIFT	36
+#define CHA_ID_SNW9_MASK	(0xfull << CHA_ID_SNW9_SHIFT)
+
+#define CHA_ID_DECO_SHIFT	56
+#define CHA_ID_DECO_MASK	(0xfull << CHA_ID_DECO_SHIFT)
+
+#define CHA_ID_JR_SHIFT		60
+#define CHA_ID_JR_MASK		(0xfull << CHA_ID_JR_SHIFT)
+
 struct sec_vid {
 	u16 ip_id;
 	u8 maj_rev;
@ -228,7 +265,10 @@ struct rng4tst {
 		u32 rtfrqmax;	/* PRGM=1: freq. count max. limit register */
 		u32 rtfrqcnt;	/* PRGM=0: freq. count register */
 	};
-	u32 rsvd1[56];
+	u32 rsvd1[40];
+#define RDSTA_IF0 0x00000001
+	u32 rdsta;
+	u32 rsvd2[15];
 };

 /*
--- a/drivers/crypto/dcp.c
+++ b/drivers/crypto/dcp.c
@ -0,0 +1,912 @@
+/*
+ * Cryptographic API.
+ *
+ * Support for DCP cryptographic accelerator.
+ *
+ * Copyright (c) 2013
+ * Author: Tobias Rauter <tobias.rauter@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Based on tegra-aes.c, dcp.c (from freescale SDK) and sahara.c
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/dma-mapping.h>
+#include <linux/io.h>
+#include <linux/mutex.h>
+#include <linux/interrupt.h>
+#include <linux/completion.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/crypto.h>
+#include <linux/miscdevice.h>
+
+#include <crypto/scatterwalk.h>
+#include <crypto/aes.h>
+
+
+/* IOCTL for DCP OTP Key AES - taken from Freescale's SDK*/
+#define DBS_IOCTL_BASE   'd'
+#define DBS_ENC	_IOW(DBS_IOCTL_BASE, 0x00, uint8_t[16])
+#define DBS_DEC _IOW(DBS_IOCTL_BASE, 0x01, uint8_t[16])
+
+/* DCP channel used for AES */
+#define USED_CHANNEL 1
+/* Ring Buffers' maximum size */
+#define DCP_MAX_PKG 20
+
+/* Control Register */
+#define DCP_REG_CTRL 0x000
+/* bit 31 needs an unsigned constant: 1<<31 overflows a signed int */
+#define DCP_CTRL_SFRST (1U<<31)
+#define DCP_CTRL_CLKGATE (1<<30)
+#define DCP_CTRL_CRYPTO_PRESENT (1<<29)
+#define DCP_CTRL_SHA_PRESENT (1<<28)
+#define DCP_CTRL_GATHER_RES_WRITE (1<<23)
+#define DCP_CTRL_ENABLE_CONTEXT_CACHE (1<<22)
+#define DCP_CTRL_ENABLE_CONTEXT_SWITCH (1<<21)
+#define DCP_CTRL_CH_IRQ_E_0 0x01
+#define DCP_CTRL_CH_IRQ_E_1 0x02
+#define DCP_CTRL_CH_IRQ_E_2 0x04
+#define DCP_CTRL_CH_IRQ_E_3 0x08
+
+/* Status register */
+#define DCP_REG_STAT 0x010
+#define DCP_STAT_OTP_KEY_READY (1<<28)
+/* field extractors; arguments are parenthesized so expressions expand safely */
+#define DCP_STAT_CUR_CHANNEL(stat) (((stat) >> 24) & 0x0F)
+#define DCP_STAT_READY_CHANNEL(stat) (((stat) >> 16) & 0x0F)
+#define DCP_STAT_IRQ(stat) ((stat) & 0x0F)
+#define DCP_STAT_CHAN_0 (0x01)
+#define DCP_STAT_CHAN_1 (0x02)
+#define DCP_STAT_CHAN_2 (0x04)
+#define DCP_STAT_CHAN_3 (0x08)
+
+/* Channel Control Register */
+#define DCP_REG_CHAN_CTRL 0x020
+#define DCP_CHAN_CTRL_CH0_IRQ_MERGED (1<<16)
+#define DCP_CHAN_CTRL_HIGH_PRIO_0 (0x0100)
+#define DCP_CHAN_CTRL_HIGH_PRIO_1 (0x0200)
+#define DCP_CHAN_CTRL_HIGH_PRIO_2 (0x0400)
+#define DCP_CHAN_CTRL_HIGH_PRIO_3 (0x0800)
+#define DCP_CHAN_CTRL_ENABLE_0 (0x01)
+#define DCP_CHAN_CTRL_ENABLE_1 (0x02)
+#define DCP_CHAN_CTRL_ENABLE_2 (0x04)
+#define DCP_CHAN_CTRL_ENABLE_3 (0x08)
+
+/*
+ * Channel Registers:
+ * The DCP has 4 channels. Each of these channels
+ * has 4 registers (command pointer, semaphore, status and options).
+ * The address of register REG of channel CHAN is obtained by
+ * dcp_chan_reg(REG, CHAN)
+ */
+#define DCP_REG_CHAN_PTR	0x00000100
+#define DCP_REG_CHAN_SEMA	0x00000110
+#define DCP_REG_CHAN_STAT	0x00000120
+#define DCP_REG_CHAN_OPT	0x00000130
+
+#define DCP_CHAN_STAT_NEXT_CHAIN_IS_0	0x010000
+#define DCP_CHAN_STAT_NO_CHAIN		0x020000
+#define DCP_CHAN_STAT_CONTEXT_ERROR	0x030000
+#define DCP_CHAN_STAT_PAYLOAD_ERROR	0x040000
+#define DCP_CHAN_STAT_INVALID_MODE	0x050000
+#define DCP_CHAN_STAT_PAGEFAULT		0x40
+#define DCP_CHAN_STAT_DST		0x20
+#define DCP_CHAN_STAT_SRC		0x10
+#define DCP_CHAN_STAT_PACKET		0x08
+#define DCP_CHAN_STAT_SETUP		0x04
+#define DCP_CHAN_STAT_MISMATCH		0x02
+
+/* hw packet control*/
+
+#define DCP_PKT_PAYLOAD_KEY	(1<<11)
+#define DCP_PKT_OTP_KEY		(1<<10)
+#define DCP_PKT_CIPHER_INIT	(1<<9)
+#define DCP_PKG_CIPHER_ENCRYPT	(1<<8)
+#define DCP_PKT_CIPHER_ENABLE	(1<<5)
+#define DCP_PKT_DECR_SEM	(1<<1)
+#define DCP_PKT_CHAIN		(1<<2)
+#define DCP_PKT_IRQ		1
+
+#define DCP_PKT_MODE_CBC	(1<<4)
+#define DCP_PKT_KEYSELECT_OTP	(0xFF<<8)
+
+/* cipher flags */
+#define DCP_ENC		0x0001
+#define DCP_DEC		0x0002
+#define DCP_ECB		0x0004
+#define DCP_CBC		0x0008
+#define DCP_CBC_INIT	0x0010
+#define DCP_NEW_KEY	0x0040
+#define DCP_OTP_KEY	0x0080
+#define DCP_AES		0x1000
+
+/* DCP Flags */
+#define DCP_FLAG_BUSY	0x01
+#define DCP_FLAG_PRODUCING	0x02
+
+/* clock defines */
+#define CLOCK_ON	1
+#define CLOCK_OFF	0
+
+/* Per-request context: carries the mode flags chosen by the entry point. */
+struct dcp_dev_req_ctx {
+	int mode;
+};
+
+/*
+ * State of one cipher operation. Used as the transform context of the
+ * registered algorithms and also allocated by the bootstream ioctl.
+ */
+struct dcp_op {
+	/* cipher flags (DCP_ENC, DCP_CBC, DCP_NEW_KEY, ...) */
+	unsigned int		flags;
+	u8			key[AES_KEYSIZE_128];
+	/* length given to the most recent setkey call */
+	int			keylen;
+
+	/* request currently being processed */
+	struct ablkcipher_request	*req;
+	/* used instead of the hardware when keylen != AES_KEYSIZE_128 */
+	struct crypto_ablkcipher	*fallback;
+
+	/* accumulated channel status / error code of the operation */
+	uint32_t stat;
+	/* control words copied into every hardware packet of the operation */
+	uint32_t pkt1;
+	uint32_t pkt2;
+	struct ablkcipher_walk walk;
+};
+
+/* Driver state for the single DCP instance (see global_dev). */
+struct dcp_dev {
+	struct device *dev;
+	/* ioremapped register window */
+	void __iomem *dcp_regs_base;
+
+	int dcp_vmi_irq;
+	int dcp_irq;
+
+	/* protects queue */
+	spinlock_t queue_lock;
+	struct crypto_queue queue;
+
+	/* running packet counters; indices into hw_pkg are taken modulo DCP_MAX_PKG */
+	uint32_t pkt_produced;
+	uint32_t pkt_consumed;
+
+	/* CPU pointers into one coherent allocation; hw_phys_pkg is its bus address */
+	struct dcp_hw_packet *hw_pkg[DCP_MAX_PKG];
+	dma_addr_t hw_phys_pkg;
+
+	/* [KEY][IV] Both with 16 Bytes */
+	u8 *payload_base;
+	dma_addr_t payload_base_dma;
+
+
+	struct tasklet_struct	done_task;
+	struct tasklet_struct	queue_task;
+	struct timer_list	watchdog;
+
+	/* DCP_FLAG_* values, used as bit numbers with the atomic bitops */
+	unsigned long		flags;
+
+	/* operation currently owning the engine */
+	struct dcp_op *ctx;
+
+	struct miscdevice dcp_bootstream_misc;
+};
+
+/*
+ * One work packet as laid out in the coherent descriptor ring.
+ * NOTE(review): presumably this mirrors the hardware descriptor format,
+ * so the field order must not change - confirm against the DCP manual.
+ */
+struct dcp_hw_packet {
+	/* bus address of the following packet in the ring (set up in probe) */
+	uint32_t next;
+	uint32_t pkt1;
+	uint32_t pkt2;
+	uint32_t src;
+	uint32_t dst;
+	uint32_t size;
+	/* bus address of the key/IV buffer */
+	uint32_t payload;
+	/* cleared before submission; the done tasklet treats 1 as finished */
+	uint32_t stat;
+};
+
+static struct dcp_dev *global_dev;
+
+/* Offset of channel register @reg for channel @chan (0x40 bytes per channel). */
+static inline u32 dcp_chan_reg(u32 reg, int chan)
+{
+	const u32 chan_stride = 0x40;
+
+	return reg + chan * chan_stride;
+}
+
+/* Write @data to the register at offset @reg. */
+static inline void dcp_write(struct dcp_dev *dev, u32 data, u32 reg)
+{
+	void __iomem *addr = dev->dcp_regs_base + reg;
+
+	writel(data, addr);
+}
+
+/* Write @data to the address formed by OR-ing 0x04 into offset @reg. */
+static inline void dcp_set(struct dcp_dev *dev, u32 data, u32 reg)
+{
+	void __iomem *addr = dev->dcp_regs_base + (reg | 0x04);
+
+	writel(data, addr);
+}
+
+/* Write @data to the address formed by OR-ing 0x08 into offset @reg. */
+static inline void dcp_clear(struct dcp_dev *dev, u32 data, u32 reg)
+{
+	void __iomem *addr = dev->dcp_regs_base + (reg | 0x08);
+
+	writel(data, addr);
+}
+
+/* Write @data to the address formed by OR-ing 0x0C into offset @reg. */
+static inline void dcp_toggle(struct dcp_dev *dev, u32 data, u32 reg)
+{
+	void __iomem *addr = dev->dcp_regs_base + (reg | 0x0C);
+
+	writel(data, addr);
+}
+
+/* Read and return the register at offset @reg. */
+static inline unsigned int dcp_read(struct dcp_dev *dev, u32 reg)
+{
+	void __iomem *addr = dev->dcp_regs_base + reg;
+
+	return readl(addr);
+}
+
+/*
+ * Release the source and destination page mappings recorded in @pkt.
+ */
+static void dcp_dma_unmap(struct dcp_dev *dev, struct dcp_hw_packet *pkt)
+{
+	dma_unmap_page(dev->dev, pkt->src, pkt->size, DMA_TO_DEVICE);
+	dma_unmap_page(dev->dev, pkt->dst, pkt->size, DMA_FROM_DEVICE);
+	/* print the pointer with %p rather than truncating it through a cast */
+	dev_dbg(dev->dev, "unmap packet %p", pkt);
+}
+
+/*
+ * Map the current walk step for DMA and record it in @pkt.
+ *
+ * The packet size is the walk's byte count rounded down to a multiple
+ * of 16. Returns 0 on success or -ENOMEM if either page could not be
+ * mapped; on failure no mapping is left behind.
+ */
+static int dcp_dma_map(struct dcp_dev *dev,
+	struct ablkcipher_walk *walk, struct dcp_hw_packet *pkt)
+{
+	dma_addr_t addr;
+
+	dev_dbg(dev->dev, "map packet %p", pkt);
+	/* align to length = 16 */
+	pkt->size = walk->nbytes - (walk->nbytes % 16);
+
+	addr = dma_map_page(dev->dev, walk->src.page, walk->src.offset,
+		pkt->size, DMA_TO_DEVICE);
+	/* a failed mapping is reported by dma_mapping_error(), not by 0 */
+	if (dma_mapping_error(dev->dev, addr)) {
+		dev_err(dev->dev, "Unable to map src");
+		return -ENOMEM;
+	}
+	pkt->src = addr;
+
+	addr = dma_map_page(dev->dev, walk->dst.page, walk->dst.offset,
+		pkt->size, DMA_FROM_DEVICE);
+	if (dma_mapping_error(dev->dev, addr)) {
+		dev_err(dev->dev, "Unable to map dst");
+		dma_unmap_page(dev->dev, pkt->src, pkt->size, DMA_TO_DEVICE);
+		return -ENOMEM;
+	}
+	pkt->dst = addr;
+
+	return 0;
+}
+
+/*
+ * Fill in the control fields of @pkt from the current operation, re-arm
+ * the watchdog and hand the packet to the channel by writing 1 to its
+ * semaphore register. @last is non-zero for the final packet, which is
+ * the only one submitted without DCP_PKT_CHAIN.
+ */
+static void dcp_op_one(struct dcp_dev *dev, struct dcp_hw_packet *pkt,
+			uint8_t last)
+{
+	struct dcp_op *op = dev->ctx;
+	uint32_t ctrl = op->pkt1;
+
+	/* the cipher-init request is consumed by the first packet only */
+	if (op->flags & DCP_CBC_INIT) {
+		ctrl |= DCP_PKT_CIPHER_INIT;
+		op->flags &= ~DCP_CBC_INIT;
+	}
+
+	ctrl |= DCP_PKT_IRQ;
+	if (!last)
+		ctrl |= DCP_PKT_CHAIN;
+
+	pkt->pkt1 = ctrl;
+	pkt->pkt2 = op->pkt2;
+	pkt->payload = (u32) dev->payload_base_dma;
+	pkt->stat = 0;
+
+	mod_timer(&dev->watchdog, jiffies + msecs_to_jiffies(500));
+
+	dev->pkt_produced++;
+
+	dcp_write(dev, 1,
+		dcp_chan_reg(DCP_REG_CHAN_SEMA, USED_CHANNEL));
+}
+
+/*
+ * Produce hardware packets for the current operation for as long as the
+ * walk has data and the ring has room, then clear DCP_FLAG_PRODUCING.
+ */
+static void dcp_op_proceed(struct dcp_dev *dev)
+{
+	struct dcp_op *ctx = dev->ctx;
+	struct dcp_hw_packet *pkt;
+
+	while (ctx->walk.nbytes) {
+		int err = 0;
+
+		/* next free ring slot */
+		pkt = dev->hw_pkg[dev->pkt_produced % DCP_MAX_PKG];
+		err = dcp_dma_map(dev, &ctx->walk, pkt);
+		if (err) {
+			/* NOTE(review): a negative errno is OR-ed into the status word */
+			dev->ctx->stat |= err;
+			/* start timer to wait for already set up calls */
+			mod_timer(&dev->watchdog,
+				jiffies + msecs_to_jiffies(500));
+			break;
+		}
+
+
+		/* bytes of this step not covered by the packet are handed back to the walk */
+		err = ctx->walk.nbytes - pkt->size;
+		ablkcipher_walk_done(dev->ctx->req, &dev->ctx->walk, err);
+
+		/* the packet is the last one when the walk has nothing left */
+		dcp_op_one(dev, pkt, ctx->walk.nbytes == 0);
+		/* we have to wait if no space is left in buffer */
+		if (dev->pkt_produced - dev->pkt_consumed == DCP_MAX_PKG)
+			break;
+	}
+	clear_bit(DCP_FLAG_PRODUCING, &dev->flags);
+}
+
+/*
+ * Start the operation in dev->ctx: load a new key into the payload
+ * buffer if one is pending, derive the packet control words from the
+ * cipher flags, reset the ring counters and channel status, and submit
+ * either the walked request (@use_walk != 0) or the single pre-filled
+ * packet hw_pkg[0].
+ */
+static void dcp_op_start(struct dcp_dev *dev, uint8_t use_walk)
+{
+	struct dcp_op *op = dev->ctx;
+	uint32_t ctrl = DCP_PKT_CIPHER_ENABLE | DCP_PKT_DECR_SEM;
+
+	if (op->flags & DCP_NEW_KEY) {
+		memcpy(dev->payload_base, op->key, op->keylen);
+		op->flags &= ~DCP_NEW_KEY;
+	}
+
+	ctrl |= (op->flags & DCP_OTP_KEY) ? DCP_PKT_OTP_KEY
+					  : DCP_PKT_PAYLOAD_KEY;
+	if (op->flags & DCP_ENC)
+		ctrl |= DCP_PKG_CIPHER_ENCRYPT;
+
+	op->pkt1 = ctrl;
+	op->pkt2 = (op->flags & DCP_CBC) ? DCP_PKT_MODE_CBC : 0;
+
+	dev->pkt_produced = 0;
+	dev->pkt_consumed = 0;
+
+	op->stat = 0;
+	dcp_clear(dev, -1, dcp_chan_reg(DCP_REG_CHAN_STAT, USED_CHANNEL));
+	dcp_write(dev, (u32) dev->hw_phys_pkg,
+		dcp_chan_reg(DCP_REG_CHAN_PTR, USED_CHANNEL));
+
+	set_bit(DCP_FLAG_PRODUCING, &dev->flags);
+
+	if (!use_walk) {
+		/* caller prepared hw_pkg[0]; it is the one and only packet */
+		dcp_op_one(dev, dev->hw_pkg[0], 1);
+		clear_bit(DCP_FLAG_PRODUCING, &dev->flags);
+		return;
+	}
+
+	ablkcipher_walk_init(&op->walk, op->req->dst,
+			op->req->src, op->req->nbytes);
+	ablkcipher_walk_phys(op->req, &op->walk);
+	dcp_op_proceed(dev);
+}
+
+/*
+ * Completion handler: runs as the done tasklet and is also called
+ * directly by the watchdog. Reaps finished packets from the ring, then
+ * either continues producing packets or finishes the operation.
+ */
+static void dcp_done_task(unsigned long data)
+{
+	struct dcp_dev *dev = (struct dcp_dev *)data;
+	struct dcp_hw_packet *last_packet;
+	int fin;
+	fin = 0;
+
+	/* consume ring slots in order for as long as their status word reads 1 */
+	for (last_packet = dev->hw_pkg[(dev->pkt_consumed) % DCP_MAX_PKG];
+		last_packet->stat == 1;
+		last_packet =
+			dev->hw_pkg[++(dev->pkt_consumed) % DCP_MAX_PKG]) {
+
+		dcp_dma_unmap(dev, last_packet);
+		last_packet->stat = 0;
+		fin++;
+	}
+	/* the last call of this function already consumed this IRQ's packet */
+	if (fin == 0)
+		return;
+
+	dev_dbg(dev->dev,
+		"Packet(s) done with status %x; finished: %d, produced:%d, complete consumed: %d",
+		dev->ctx->stat, fin, dev->pkt_produced, dev->pkt_consumed);
+
+	/*
+	 * No error so far and the most recently finished packet was chained:
+	 * the operation is not over, so keep producing unless a producer is
+	 * already running.
+	 */
+	last_packet = dev->hw_pkg[(dev->pkt_consumed - 1) % DCP_MAX_PKG];
+	if (!dev->ctx->stat && last_packet->pkt1 & DCP_PKT_CHAIN) {
+		if (!test_and_set_bit(DCP_FLAG_PRODUCING, &dev->flags))
+			dcp_op_proceed(dev);
+		return;
+	}
+
+	/* operation is ending: unmap packets that were produced but not reaped */
+	while (unlikely(dev->pkt_consumed < dev->pkt_produced)) {
+		dcp_dma_unmap(dev,
+			dev->hw_pkg[dev->pkt_consumed++ % DCP_MAX_PKG]);
+	}
+
+	if (dev->ctx->flags & DCP_OTP_KEY) {
+		/* we used the miscdevice, no walk to finish */
+		clear_bit(DCP_FLAG_BUSY, &dev->flags);
+		return;
+	}
+
+	/* report the accumulated status to the request's completion callback */
+	ablkcipher_walk_complete(&dev->ctx->walk);
+	dev->ctx->req->base.complete(&dev->ctx->req->base,
+			dev->ctx->stat);
+	dev->ctx->req = NULL;
+	/* in case there are other requests in the queue */
+	tasklet_schedule(&dev->queue_task);
+}
+
+/*
+ * Watchdog timer callback: fold the channel status into the operation
+ * status (or -ETIMEDOUT when the result is still zero), log it and run
+ * the completion handler.
+ */
+static void dcp_watchdog(unsigned long data)
+{
+	struct dcp_dev *dev = (struct dcp_dev *)data;
+	struct dcp_op *op = dev->ctx;
+
+	op->stat |= dcp_read(dev,
+			dcp_chan_reg(DCP_REG_CHAN_STAT, USED_CHANNEL));
+
+	dev_err(dev->dev, "Timeout, Channel status: %x", op->stat);
+
+	if (op->stat == 0)
+		op->stat = -ETIMEDOUT;
+
+	dcp_done_task(data);
+}
+
+
+/*
+ * Shared body of both interrupt handlers: stop the watchdog, acknowledge
+ * the pending channel interrupts, record the channel status in the
+ * current operation and schedule the done tasklet for channel 1.
+ * Returns IRQ_NONE when no channel interrupt was pending.
+ */
+static irqreturn_t dcp_common_irq(int irq, void *context)
+{
+	struct dcp_dev *dev = context;
+	u32 pending;
+
+	del_timer(&dev->watchdog);
+
+	pending = DCP_STAT_IRQ(dcp_read(dev, DCP_REG_STAT));
+	dcp_clear(dev, pending, DCP_REG_STAT);
+	if (!pending)
+		return IRQ_NONE;
+
+	dev->ctx->stat |= dcp_read(dev,
+			dcp_chan_reg(DCP_REG_CHAN_STAT, USED_CHANNEL));
+
+	if (pending & DCP_STAT_CHAN_1)
+		tasklet_schedule(&dev->done_task);
+
+	return IRQ_HANDLED;
+}
+
+/* Handler for the first IRQ resource; forwards to dcp_common_irq(). */
+static irqreturn_t dcp_vmi_irq(int irq, void *context)
+{
+	return dcp_common_irq(irq, context);
+}
+
+/* Handler for the second IRQ resource; forwards to dcp_common_irq(). */
+static irqreturn_t dcp_irq(int irq, void *context)
+{
+	return dcp_common_irq(irq, context);
+}
+
+/*
+ * Make @ctx the current operation and start it through the walk path.
+ * For CBC requests that carry an IV, the IV is copied into the second
+ * 16-byte slot of the payload buffer and cipher init is requested.
+ */
+static void dcp_crypt(struct dcp_dev *dev, struct dcp_op *ctx)
+{
+	u8 *iv_slot = dev->payload_base + AES_KEYSIZE_128;
+
+	dev->ctx = ctx;
+
+	if ((ctx->flags & DCP_CBC) && ctx->req->info) {
+		ctx->flags |= DCP_CBC_INIT;
+		memcpy(iv_slot, ctx->req->info, AES_KEYSIZE_128);
+	}
+
+	dcp_op_start(dev, 1);
+}
+
+/*
+ * Queue tasklet: take the next request off the crypto queue and start
+ * it on the engine. When there is nothing to start, DCP_FLAG_BUSY is
+ * cleared so that the next enqueue schedules this tasklet again.
+ */
+static void dcp_queue_task(unsigned long data)
+{
+	struct dcp_dev *dev = (struct dcp_dev *) data;
+	struct crypto_async_request *async_req, *backlog;
+	struct crypto_ablkcipher *tfm;
+	struct dcp_op *ctx;
+	struct dcp_dev_req_ctx *rctx;
+	struct ablkcipher_request *req;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->queue_lock, flags);
+
+	backlog = crypto_get_backlog(&dev->queue);
+	async_req = crypto_dequeue_request(&dev->queue);
+
+	spin_unlock_irqrestore(&dev->queue_lock, flags);
+
+	if (!async_req)
+		goto ret_nothing_done;
+
+	if (backlog)
+		backlog->complete(backlog, -EINPROGRESS);
+
+	req = ablkcipher_request_cast(async_req);
+	tfm = crypto_ablkcipher_reqtfm(req);
+	rctx = ablkcipher_request_ctx(req);
+	ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (!req->src || !req->dst) {
+		/*
+		 * The request has already left the queue; complete it with
+		 * an error so that its submitter is not left waiting forever.
+		 */
+		async_req->complete(async_req, -EINVAL);
+		goto ret_nothing_done;
+	}
+
+	/*
+	 * The mode bits belong to the request, not to the transform: drop
+	 * whatever the previous request left behind, otherwise a decrypt
+	 * following an encrypt would still carry DCP_ENC.
+	 */
+	ctx->flags &= ~(DCP_AES | DCP_ENC | DCP_DEC | DCP_ECB | DCP_CBC);
+	ctx->flags |= rctx->mode;
+	ctx->req = req;
+
+	dcp_crypt(dev, ctx);
+
+	return;
+
+ret_nothing_done:
+	clear_bit(DCP_FLAG_BUSY, &dev->flags);
+}
+
+
+/*
+ * Transform constructor: set the per-request context size and allocate
+ * a fallback cipher of the same algorithm name. Returns 0 on success or
+ * the error encoded in the fallback pointer.
+ */
+static int dcp_cra_init(struct crypto_tfm *tfm)
+{
+	struct dcp_op *ctx = crypto_tfm_ctx(tfm);
+	const char *alg_name = tfm->__crt_alg->cra_name;
+	struct crypto_ablkcipher *fb;
+
+	tfm->crt_ablkcipher.reqsize = sizeof(struct dcp_dev_req_ctx);
+
+	fb = crypto_alloc_ablkcipher(alg_name, 0,
+				CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+	ctx->fallback = fb;
+
+	if (!IS_ERR(fb))
+		return 0;
+
+	dev_err(global_dev->dev, "Error allocating fallback algo %s\n",
+		alg_name);
+	return PTR_ERR(fb);
+}
+
+/* Transform destructor: release the fallback cipher, if there is one. */
+static void dcp_cra_exit(struct crypto_tfm *tfm)
+{
+	struct dcp_op *ctx = crypto_tfm_ctx(tfm);
+	struct crypto_ablkcipher *fb = ctx->fallback;
+
+	ctx->fallback = NULL;
+
+	if (fb)
+		crypto_free_ablkcipher(fb);
+}
+
+/* async interface */
+/*
+ * Set the cipher key.
+ *
+ * A 128-bit key is stored in the context and flagged with DCP_NEW_KEY
+ * when it differs from the stored one. Any other length is passed to
+ * the fallback cipher, whose result flags are mirrored back on failure.
+ *
+ * Returns 0 on success or the negative error from the fallback setkey.
+ */
+static int dcp_aes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
+		unsigned int len)
+{
+	struct dcp_op *ctx = crypto_ablkcipher_ctx(tfm);
+	/* signed: carries a negative errno from the fallback */
+	int ret;
+
+	ctx->keylen = len;
+	ctx->flags = 0;
+	if (len == AES_KEYSIZE_128) {
+		if (memcmp(ctx->key, key, AES_KEYSIZE_128)) {
+			memcpy(ctx->key, key, len);
+			ctx->flags |= DCP_NEW_KEY;
+		}
+		return 0;
+	}
+
+	/* hand the caller's request flags to the fallback */
+	ctx->fallback->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
+	ctx->fallback->base.crt_flags |=
+		(tfm->base.crt_flags & CRYPTO_TFM_REQ_MASK);
+
+	ret = crypto_ablkcipher_setkey(ctx->fallback, key, len);
+	if (ret) {
+		struct crypto_tfm *tfm_aux = crypto_ablkcipher_tfm(tfm);
+
+		/* propagate the fallback's result flags to the caller */
+		tfm_aux->crt_flags &= ~CRYPTO_TFM_RES_MASK;
+		tfm_aux->crt_flags |=
+			(ctx->fallback->base.crt_flags & CRYPTO_TFM_RES_MASK);
+	}
+	return ret;
+}
+
+/*
+ * Queue @req for processing with the given @mode flags.
+ *
+ * Returns -EINVAL when the request length is not a multiple of the AES
+ * block size, otherwise the result of ablkcipher_enqueue_request().
+ * The queue tasklet is scheduled only when the engine was idle.
+ */
+static int dcp_aes_cbc_crypt(struct ablkcipher_request *req, int mode)
+{
+	struct dcp_dev_req_ctx *rctx = ablkcipher_request_ctx(req);
+	struct dcp_dev *dev = global_dev;
+	unsigned long flags;
+	int err;
+
+	if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE))
+		return -EINVAL;
+
+	rctx->mode = mode;
+
+	spin_lock_irqsave(&dev->queue_lock, flags);
+	err = ablkcipher_enqueue_request(&dev->queue, req);
+	spin_unlock_irqrestore(&dev->queue_lock, flags);
+
+	/*
+	 * test_and_set_bit() only promises zero / non-zero, so test the
+	 * result directly instead of masking it with the bit number.
+	 */
+	if (!test_and_set_bit(DCP_FLAG_BUSY, &dev->flags))
+		tasklet_schedule(&dev->queue_task);
+
+	return err;
+}
+
+/*
+ * CBC-AES encrypt entry point. Requests whose key is not 128 bits long
+ * are run on the fallback cipher; all others are queued for the engine.
+ */
+static int dcp_aes_cbc_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *ablk = crypto_ablkcipher_reqtfm(req);
+	struct crypto_tfm *base = crypto_ablkcipher_tfm(ablk);
+	struct dcp_op *ctx = crypto_ablkcipher_ctx(ablk);
+	int err;
+
+	if (likely(ctx->keylen == AES_KEYSIZE_128))
+		return dcp_aes_cbc_crypt(req, DCP_AES | DCP_ENC | DCP_CBC);
+
+	/* borrow the request for the fallback, then restore its owner */
+	ablkcipher_request_set_tfm(req, ctx->fallback);
+	err = crypto_ablkcipher_encrypt(req);
+	ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(base));
+
+	return err;
+}
+
+/*
+ * CBC-AES decrypt entry point. Requests whose key is not 128 bits long
+ * are run on the fallback cipher; all others are queued for the engine.
+ */
+static int dcp_aes_cbc_decrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *ablk = crypto_ablkcipher_reqtfm(req);
+	struct crypto_tfm *base = crypto_ablkcipher_tfm(ablk);
+	struct dcp_op *ctx = crypto_ablkcipher_ctx(ablk);
+	int err;
+
+	if (likely(ctx->keylen == AES_KEYSIZE_128))
+		return dcp_aes_cbc_crypt(req, DCP_AES | DCP_DEC | DCP_CBC);
+
+	/* borrow the request for the fallback, then restore its owner */
+	ablkcipher_request_set_tfm(req, ctx->fallback);
+	err = crypto_ablkcipher_decrypt(req);
+	ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(base));
+
+	return err;
+}
+
+/*
+ * Algorithms offered by the driver. The remaining crypto_alg fields
+ * (context size, module, init/exit hooks) are filled in by dcp_probe().
+ */
+static struct crypto_alg algs[] = {
+	{
+		.cra_name = "cbc(aes)",
+		.cra_driver_name = "dcp-cbc-aes",
+		.cra_priority = 300,
+		.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC |
+			  CRYPTO_ALG_NEED_FALLBACK,
+		.cra_blocksize = AES_KEYSIZE_128,
+		.cra_alignmask = 3,
+		.cra_type = &crypto_ablkcipher_type,
+		.cra_u.ablkcipher = {
+			.min_keysize = AES_KEYSIZE_128,
+			.max_keysize = AES_KEYSIZE_128,
+			.ivsize = AES_KEYSIZE_128,
+			.setkey = dcp_aes_setkey,
+			.encrypt = dcp_aes_cbc_encrypt,
+			.decrypt = dcp_aes_cbc_decrypt,
+		}
+
+	},
+};
+
+/* DCP bootstream verification interface: uses OTP key for crypto */
+/*
+ * Replace the pointer found in file->private_data by the dcp_dev that
+ * embeds it as dcp_bootstream_misc.
+ */
+static int dcp_bootstream_open(struct inode *inode, struct file *file)
+{
+	struct miscdevice *misc = file->private_data;
+
+	file->private_data = container_of(misc, struct dcp_dev,
+					  dcp_bootstream_misc);
+	return 0;
+}
+
+/*
+ * DBS_ENC / DBS_DEC: process one 16-byte block from user space in place
+ * with the OTP key and copy the result back.
+ *
+ * Returns -EBADF without a device, -EINVAL for an unknown command,
+ * -EAGAIN when the engine is busy, -EFAULT on a failed user copy,
+ * -ENOMEM when the context cannot be allocated, otherwise the status
+ * recorded for the operation.
+ */
+static long dcp_bootstream_ioctl(struct file *file,
+					 unsigned int cmd, unsigned long arg)
+{
+	struct dcp_dev *dev = (struct dcp_dev *) file->private_data;
+	void __user *argp = (void __user *)arg;
+	int ret;
+
+	if (dev == NULL)
+		return -EBADF;
+
+	if (cmd != DBS_ENC && cmd != DBS_DEC)
+		return -EINVAL;
+
+	/*
+	 * Claim the engine first: the payload buffer also holds the key and
+	 * IV of a running cipher request and must not be overwritten while
+	 * somebody else owns the hardware.
+	 */
+	if (test_and_set_bit(DCP_FLAG_BUSY, &dev->flags))
+		return -EAGAIN;
+
+	if (copy_from_user(dev->payload_base, argp, 16)) {
+		clear_bit(DCP_FLAG_BUSY, &dev->flags);
+		return -EFAULT;
+	}
+
+	dev->ctx = kzalloc(sizeof(struct dcp_op), GFP_KERNEL);
+	if (!dev->ctx) {
+		dev_err(dev->dev,
+			"cannot allocate context for OTP crypto");
+		clear_bit(DCP_FLAG_BUSY, &dev->flags);
+		return -ENOMEM;
+	}
+
+	dev->ctx->flags = DCP_AES | DCP_ECB | DCP_OTP_KEY | DCP_CBC_INIT;
+	dev->ctx->flags |= (cmd == DBS_ENC) ? DCP_ENC : DCP_DEC;
+	/* in-place operation on the payload buffer itself */
+	dev->hw_pkg[0]->src = dev->payload_base_dma;
+	dev->hw_pkg[0]->dst = dev->payload_base_dma;
+	dev->hw_pkg[0]->size = 16;
+
+	dcp_op_start(dev, 0);
+
+	/* the done handler clears DCP_FLAG_BUSY for OTP operations */
+	while (test_bit(DCP_FLAG_BUSY, &dev->flags))
+		cpu_relax();
+
+	ret = dev->ctx->stat;
+	if (!ret && copy_to_user(argp, dev->payload_base, 16))
+		ret =  -EFAULT;
+
+	kfree(dev->ctx);
+
+	return ret;
+}
+
+/* File operations of the "dcpboot" misc device registered in dcp_probe(). */
+static const struct file_operations dcp_bootstream_fops = {
+	.owner =		THIS_MODULE,
+	.unlocked_ioctl =	dcp_bootstream_ioctl,
+	.open =			dcp_bootstream_open,
+};
+
+/*
+ * Bind the driver to a DCP platform device: map the registers, reset
+ * and configure the engine for channel 1, request both interrupts,
+ * allocate the packet ring and the key/IV buffer, and register the misc
+ * device and the crypto algorithms.
+ *
+ * Returns 0 on success or a negative errno; on failure everything
+ * acquired so far is released again.
+ */
+static int dcp_probe(struct platform_device *pdev)
+{
+	struct dcp_dev *dev = NULL;
+	struct resource *r;
+	int i, ret, j;
+
+	dev = devm_kzalloc(&pdev->dev, sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	global_dev = dev;
+	dev->dev = &pdev->dev;
+
+	platform_set_drvdata(pdev, dev);
+
+	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!r) {
+		dev_err(&pdev->dev, "failed to get IORESOURCE_MEM\n");
+		return -ENXIO;
+	}
+	dev->dcp_regs_base = devm_ioremap(&pdev->dev, r->start,
+					  resource_size(r));
+	/* every register access below goes through this mapping */
+	if (!dev->dcp_regs_base) {
+		dev_err(&pdev->dev, "failed to ioremap registers\n");
+		return -ENOMEM;
+	}
+
+	/* soft reset, then release reset and clock gate */
+	dcp_set(dev, DCP_CTRL_SFRST, DCP_REG_CTRL);
+	udelay(10);
+	dcp_clear(dev, DCP_CTRL_SFRST | DCP_CTRL_CLKGATE, DCP_REG_CTRL);
+
+	dcp_write(dev, DCP_CTRL_GATHER_RES_WRITE |
+		DCP_CTRL_ENABLE_CONTEXT_CACHE | DCP_CTRL_CH_IRQ_E_1,
+		DCP_REG_CTRL);
+
+	dcp_write(dev, DCP_CHAN_CTRL_ENABLE_1, DCP_REG_CHAN_CTRL);
+
+	for (i = 0; i < 4; i++)
+		dcp_clear(dev, -1, dcp_chan_reg(DCP_REG_CHAN_STAT, i));
+
+	dcp_clear(dev, -1, DCP_REG_STAT);
+
+	r = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!r) {
+		dev_err(&pdev->dev, "can't get IRQ resource (0)\n");
+		return -EIO;
+	}
+	dev->dcp_vmi_irq = r->start;
+	ret = request_irq(dev->dcp_vmi_irq, dcp_vmi_irq, 0, "dcp", dev);
+	if (ret != 0) {
+		dev_err(&pdev->dev, "can't request_irq (0)\n");
+		return -EIO;
+	}
+
+	r = platform_get_resource(pdev, IORESOURCE_IRQ, 1);
+	if (!r) {
+		dev_err(&pdev->dev, "can't get IRQ resource (1)\n");
+		ret = -EIO;
+		goto err_free_irq0;
+	}
+	dev->dcp_irq = r->start;
+	ret = request_irq(dev->dcp_irq, dcp_irq, 0, "dcp", dev);
+	if (ret != 0) {
+		dev_err(&pdev->dev, "can't request_irq (1)\n");
+		ret = -EIO;
+		goto err_free_irq0;
+	}
+
+	dev->hw_pkg[0] = dma_alloc_coherent(&pdev->dev,
+			DCP_MAX_PKG * sizeof(struct dcp_hw_packet),
+			&dev->hw_phys_pkg,
+			GFP_KERNEL);
+	if (!dev->hw_pkg[0]) {
+		dev_err(&pdev->dev, "Could not allocate hw descriptors\n");
+		ret = -ENOMEM;
+		goto err_free_irq1;
+	}
+
+	/* link the packets into a ring: each one points at its successor */
+	for (i = 1; i < DCP_MAX_PKG; i++) {
+		dev->hw_pkg[i - 1]->next = dev->hw_phys_pkg
+				+ i * sizeof(struct dcp_hw_packet);
+		dev->hw_pkg[i] = dev->hw_pkg[i - 1] + 1;
+	}
+	/* ... and the last one points back at the first */
+	dev->hw_pkg[i - 1]->next = dev->hw_phys_pkg;
+
+	dev->payload_base = dma_alloc_coherent(&pdev->dev, 2 * AES_KEYSIZE_128,
+			&dev->payload_base_dma, GFP_KERNEL);
+	if (!dev->payload_base) {
+		dev_err(&pdev->dev, "Could not allocate memory for key\n");
+		ret = -ENOMEM;
+		goto err_free_hw_packet;
+	}
+	tasklet_init(&dev->queue_task, dcp_queue_task,
+		(unsigned long) dev);
+	tasklet_init(&dev->done_task, dcp_done_task,
+		(unsigned long) dev);
+	spin_lock_init(&dev->queue_lock);
+
+	crypto_init_queue(&dev->queue, 10);
+
+	init_timer(&dev->watchdog);
+	dev->watchdog.function = &dcp_watchdog;
+	dev->watchdog.data = (unsigned long)dev;
+
+	dev->dcp_bootstream_misc.minor = MISC_DYNAMIC_MINOR;
+	dev->dcp_bootstream_misc.name = "dcpboot";
+	dev->dcp_bootstream_misc.fops = &dcp_bootstream_fops;
+	ret = misc_register(&dev->dcp_bootstream_misc);
+	if (ret != 0) {
+		dev_err(dev->dev, "Unable to register misc device\n");
+		goto err_free_key_iv;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(algs); i++) {
+		algs[i].cra_priority = 300;
+		algs[i].cra_ctxsize = sizeof(struct dcp_op);
+		algs[i].cra_module = THIS_MODULE;
+		algs[i].cra_init = dcp_cra_init;
+		algs[i].cra_exit = dcp_cra_exit;
+		ret = crypto_register_alg(&algs[i]);
+		if (ret) {
+			dev_err(&pdev->dev, "register algorithm failed\n");
+			goto err_unregister;
+		}
+	}
+	dev_notice(&pdev->dev, "DCP crypto enabled.!\n");
+
+	return 0;
+
+err_unregister:
+	/* only the first i algorithms were registered */
+	for (j = 0; j < i; j++)
+		crypto_unregister_alg(&algs[j]);
+	/* the misc device was registered before the algorithms */
+	misc_deregister(&dev->dcp_bootstream_misc);
+err_free_key_iv:
+	dma_free_coherent(&pdev->dev, 2 * AES_KEYSIZE_128, dev->payload_base,
+			dev->payload_base_dma);
+err_free_hw_packet:
+	dma_free_coherent(&pdev->dev, DCP_MAX_PKG *
+		sizeof(struct dcp_hw_packet), dev->hw_pkg[0],
+		dev->hw_phys_pkg);
+err_free_irq1:
+	free_irq(dev->dcp_irq, dev);
+err_free_irq0:
+	free_irq(dev->dcp_vmi_irq, dev);
+
+	return ret;
+}
+
+/*
+ * Unbind the driver. Teardown runs from the outside in: first remove
+ * the user-visible interfaces so no new work arrives, then silence the
+ * interrupt, timer and tasklet contexts, and only then free the DMA
+ * buffers those contexts access.
+ */
+static int dcp_remove(struct platform_device *pdev)
+{
+	struct dcp_dev *dev;
+	int j;
+	dev = platform_get_drvdata(pdev);
+
+	/* stop new requests from reaching the driver */
+	for (j = 0; j < ARRAY_SIZE(algs); j++)
+		crypto_unregister_alg(&algs[j]);
+
+	misc_deregister(&dev->dcp_bootstream_misc);
+
+	/* quiesce everything that may still touch the ring or payload */
+	free_irq(dev->dcp_irq, dev);
+	free_irq(dev->dcp_vmi_irq, dev);
+
+	del_timer_sync(&dev->watchdog);
+
+	tasklet_kill(&dev->done_task);
+	tasklet_kill(&dev->queue_task);
+
+	dma_free_coherent(&pdev->dev,
+			DCP_MAX_PKG * sizeof(struct dcp_hw_packet),
+			dev->hw_pkg[0],	dev->hw_phys_pkg);
+
+	dma_free_coherent(&pdev->dev, 2 * AES_KEYSIZE_128, dev->payload_base,
+			dev->payload_base_dma);
+
+	return 0;
+}
+
+/* Device-tree match table; terminated by the empty entry. */
+static struct of_device_id fs_dcp_of_match[] = {
+	{	.compatible = "fsl-dcp"},
+	{},
+};
+
+/* Platform driver glue; registered through module_platform_driver(). */
+static struct platform_driver fs_dcp_driver = {
+	.probe = dcp_probe,
+	.remove = dcp_remove,
+	.driver = {
+		.name = "fsl-dcp",
+		.owner = THIS_MODULE,
+		.of_match_table = fs_dcp_of_match
+	}
+};
+
+module_platform_driver(fs_dcp_driver);
+
+
+MODULE_AUTHOR("Tobias Rauter <tobias.rauter@gmail.com>");
+MODULE_DESCRIPTION("Freescale DCP Crypto Driver");
+MODULE_LICENSE("GPL");
--- a/drivers/crypto/hifn_795x.c
+++ b/drivers/crypto/hifn_795x.c
@ -2676,7 +2676,7 @@ err_out_stop_device:
 	hifn_reset_dma(dev, 1);
 	hifn_stop_device(dev);
 err_out_free_irq:
-	free_irq(dev->irq, dev->name);
+	free_irq(dev->irq, dev);
 	tasklet_kill(&dev->tasklet);
 err_out_free_desc:
 	pci_free_consistent(pdev, sizeof(struct hifn_dma),
@ -2711,7 +2711,7 @@ static void hifn_remove(struct pci_dev *pdev)
 		hifn_reset_dma(dev, 1);
 		hifn_stop_device(dev);

-		free_irq(dev->irq, dev->name);
+		free_irq(dev->irq, dev);
 		tasklet_kill(&dev->tasklet);

 		hifn_flush(dev);
--- a/drivers/crypto/mv_cesa.c
+++ b/drivers/crypto/mv_cesa.c
@ -1146,7 +1146,6 @@ err_unmap_reg:
 err:
 	kfree(cp);
 	cpg = NULL;
-	platform_set_drvdata(pdev, NULL);
 	return ret;
 }

--- a/drivers/crypto/omap-aes.c
+++ b/drivers/crypto/omap-aes.c
@ -203,13 +203,6 @@ static void omap_aes_write_n(struct omap_aes_dev *dd, u32 offset,

 static int omap_aes_hw_init(struct omap_aes_dev *dd)
 {
-	/*
-	 * clocks are enabled when request starts and disabled when finished.
-	 * It may be long delays between requests.
-	 * Device might go to off mode to save power.
-	 */
-	pm_runtime_get_sync(dd->dev);
-
 	if (!(dd->flags & FLAGS_INIT)) {
 		dd->flags |= FLAGS_INIT;
 		dd->err = 0;
@ -636,7 +629,6 @@ static void omap_aes_finish_req(struct omap_aes_dev *dd, int err)

 	pr_debug("err: %d\n", err);

-	pm_runtime_put(dd->dev);
 	dd->flags &= ~FLAGS_BUSY;

 	req->base.complete(&req->base, err);
@ -837,8 +829,16 @@ static int omap_aes_ctr_decrypt(struct ablkcipher_request *req)

 static int omap_aes_cra_init(struct crypto_tfm *tfm)
 {
-	pr_debug("enter\n");
+	struct omap_aes_dev *dd = NULL;

+	/* Find AES device, currently picks the first device */
+	spin_lock_bh(&list_lock);
+	list_for_each_entry(dd, &dev_list, list) {
+		break;
+	}
+	spin_unlock_bh(&list_lock);
+
+	pm_runtime_get_sync(dd->dev);
 	tfm->crt_ablkcipher.reqsize = sizeof(struct omap_aes_reqctx);

 	return 0;
@ -846,7 +846,16 @@ static int omap_aes_cra_init(struct crypto_tfm *tfm)

 static void omap_aes_cra_exit(struct crypto_tfm *tfm)
 {
-	pr_debug("enter\n");
+	struct omap_aes_dev *dd = NULL;	/* NOTE(review): see deref below */
+
+	/* Find AES device, currently picks the first device */
+	spin_lock_bh(&list_lock);	/* dev_list is walked under list_lock */
+	list_for_each_entry(dd, &dev_list, list) {
+		break;	/* stop at the first entry; dd keeps pointing at it */
+	}
+	spin_unlock_bh(&list_lock);
+
+	pm_runtime_put_sync(dd->dev);	/* NOTE(review): dd unchecked; pairs with get_sync in cra_init */
 }

 /* ********************** ALGS ************************************ */
@ -1125,10 +1134,9 @@ static int omap_aes_probe(struct platform_device *pdev)
 	if (err)
 		goto err_res;

-	dd->io_base = devm_request_and_ioremap(dev, &res);
-	if (!dd->io_base) {
-		dev_err(dev, "can't ioremap\n");
-		err = -ENOMEM;
+	dd->io_base = devm_ioremap_resource(dev, &res);
+	if (IS_ERR(dd->io_base)) {
+		err = PTR_ERR(dd->io_base);
 		goto err_res;
 	}
 	dd->phys_base = res.start;
--- a/drivers/crypto/omap-sham.c
+++ b/drivers/crypto/omap-sham.c
@ -1686,10 +1686,9 @@ static int omap_sham_probe(struct platform_device *pdev)
 	if (err)
 		goto res_err;

-	dd->io_base = devm_request_and_ioremap(dev, &res);
-	if (!dd->io_base) {
-		dev_err(dev, "can't ioremap\n");
-		err = -ENOMEM;
+	dd->io_base = devm_ioremap_resource(dev, &res);
+	if (IS_ERR(dd->io_base)) {
+		err = PTR_ERR(dd->io_base);
 		goto res_err;
 	}
 	dd->phys_base = res.start;
--- a/drivers/crypto/picoxcell_crypto.c
+++ b/drivers/crypto/picoxcell_crypto.c
@ -1298,7 +1298,7 @@ static ssize_t spacc_stat_irq_thresh_store(struct device *dev,
 	struct spacc_engine *engine = spacc_dev_to_engine(dev);
 	unsigned long thresh;

-	if (strict_strtoul(buf, 0, &thresh))
+	if (kstrtoul(buf, 0, &thresh))
 		return -EINVAL;

 	thresh = clamp(thresh, 1UL, engine->fifo_sz - 1);
--- a/drivers/crypto/s5p-sss.c
+++ b/drivers/crypto/s5p-sss.c
@ -647,7 +647,6 @@ static int s5p_aes_probe(struct platform_device *pdev)
 	clk_disable(pdata->clk);

 	s5p_dev = NULL;
-	platform_set_drvdata(pdev, NULL);

 	return err;
 }
@ -668,7 +667,6 @@ static int s5p_aes_remove(struct platform_device *pdev)
 	clk_disable(pdata->clk);

 	s5p_dev = NULL;
-	platform_set_drvdata(pdev, NULL);

 	return 0;
 }
--- a/drivers/crypto/ux500/cryp/cryp_core.c
+++ b/drivers/crypto/ux500/cryp/cryp_core.c
@ -1629,7 +1629,7 @@ static int ux500_cryp_remove(struct platform_device *pdev)

 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (res)
-		release_mem_region(res->start, res->end - res->start + 1);
+		release_mem_region(res->start, resource_size(res));

 	kfree(device_data);

--- a/include/linux/crc-t10dif.h
+++ b/include/linux/crc-t10dif.h
@ -3,6 +3,10 @@

 #include <linux/types.h>

+#define CRC_T10DIF_DIGEST_SIZE 2
+#define CRC_T10DIF_BLOCK_SIZE 1
+
+__u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, size_t len);
 __u16 crc_t10dif(unsigned char const *, size_t);

 #endif
--- a/lib/Kconfig
+++ b/lib/Kconfig
@ -66,6 +66,8 @@ config CRC16

 config CRC_T10DIF
 	tristate "CRC calculation for the T10 Data Integrity Field"
+	select CRYPTO
+	select CRYPTO_CRCT10DIF
 	help
 	  This option is only needed if a module that's not in the
 	  kernel tree needs to calculate CRC checks for use with the
--- a/lib/crc-t10dif.c
+++ b/lib/crc-t10dif.c
@ -11,57 +11,44 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/crc-t10dif.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <crypto/hash.h>

-/* Table generated using the following polynomium:
- * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1
- * gt: 0x8bb7
- */
-static const __u16 t10_dif_crc_table[256] = {
-	0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
-	0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
-	0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
-	0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
-	0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
-	0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
-	0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
-	0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
-	0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
-	0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
-	0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
-	0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
-	0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
-	0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
-	0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
-	0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
-	0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
-	0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
-	0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
-	0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
-	0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
-	0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
-	0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
-	0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
-	0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
-	0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
-	0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
-	0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
-	0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
-	0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
-	0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
-	0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
-};
+static struct crypto_shash *crct10dif_tfm;

 __u16 crc_t10dif(const unsigned char *buffer, size_t len)
 {
-	__u16 crc = 0;
-	unsigned int i;
+	struct {	/* on-stack shash descriptor followed by its context bytes */
+		struct shash_desc shash;
+		char ctx[2];	/* read and written below as a single __u16 */
+	} desc;
+	int err;

-	for (i = 0 ; i < len ; i++)
-		crc = (crc << 8) ^ t10_dif_crc_table[((crc >> 8) ^ buffer[i]) & 0xff];
+	desc.shash.tfm = crct10dif_tfm;	/* tfm allocated in crc_t10dif_mod_init() */
+	desc.shash.flags = 0;
+	*(__u16 *)desc.ctx = 0;	/* presumably the running CRC, seeded with 0 */

-	return crc;
+	err = crypto_shash_update(&desc.shash, buffer, len);
+	BUG_ON(err);	/* any update failure is treated as fatal */
+
+	return *(__u16 *)desc.ctx;	/* result is read back out of the context */
 }
 EXPORT_SYMBOL(crc_t10dif);

+/*
+ * Module init: allocate the "crct10dif" shash transform that crc_t10dif()
+ * uses.  Returns 0 on success, or the error code carried by the pointer
+ * that crypto_alloc_shash() returned.
+ */
+static int __init crc_t10dif_mod_init(void)
+{
+	struct crypto_shash *tfm = crypto_alloc_shash("crct10dif", 0, 0);
+
+	/* The global is assigned unconditionally, error pointer included. */
+	crct10dif_tfm = tfm;
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	return 0;
+}
+
+static void __exit crc_t10dif_mod_fini(void)
+{
+	crypto_free_shash(crct10dif_tfm);	/* release the tfm allocated at module init */
+}
+
+module_init(crc_t10dif_mod_init);
+module_exit(crc_t10dif_mod_fini);
+
 MODULE_DESCRIPTION("T10 DIF CRC calculation");
 MODULE_LICENSE("GPL");