ARM: add support for bit sliced AES using NEON instructions

Bit sliced AES gives around 45% speedup on Cortex-A15 for encryption
and around 25% for decryption. This implementation of the AES algorithm
does not rely on any lookup tables so it is believed to be invulnerable
to cache timing attacks.

This algorithm processes up to 8 blocks in parallel in constant time. This
means that it is not usable by chaining modes that are strictly sequential
in nature, such as CBC encryption. CBC decryption, however, can benefit from
this implementation and runs about 25% faster. The other chaining modes
implemented in this module, XTS and CTR, can execute fully in parallel in
both directions.

The core code has been adopted from the OpenSSL project (in collaboration
with the original author, on cc). For ease of maintenance, this version is
identical to the upstream OpenSSL code, i.e., all modifications that were
required to make it suitable for inclusion into the kernel have been made
upstream. The original can be found here:

    http://git.openssl.org/gitweb/?p=openssl.git;a=commit;h=6f6a6130

Note to integrators:
While this implementation is significantly faster than the existing table
based ones (generic or ARM asm), especially in CTR mode, the effects on
power efficiency are unclear as of yet. This code does fundamentally more
work, by calculating values that the table based code obtains by a simple
lookup; only by doing all of that work in a SIMD fashion, it manages to
perform better.

Cc: Andy Polyakov <appro@openssl.org>
Acked-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
This commit is contained in:
Ard Biesheuvel 2013-09-16 18:31:38 +02:00
parent 5ce26f3b5a
commit e4e7f10bfc
5 changed files with 5473 additions and 2 deletions

View File

@ -3,7 +3,17 @@
#
obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
aes-arm-y := aes-armv4.o aes_glue.o
aes-arm-bs-y := aesbs-core.o aesbs-glue.o
sha1-arm-y := sha1-armv4-large.o sha1_glue.o
quiet_cmd_perl = PERL $@
cmd_perl = $(PERL) $(<) > $(@)
$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
$(call cmd,perl)
.PRECIOUS: $(obj)/aesbs-core.S

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,434 @@
/*
* linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
*
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <asm/neon.h>
#include <crypto/aes.h>
#include <crypto/ablk_helper.h>
#include <crypto/algapi.h>
#include <linux/module.h>
#include "aes_glue.h"
#define BIT_SLICED_KEY_MAXSIZE (128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
struct BS_KEY {
struct AES_KEY rk;
int converted;
u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE];
} __aligned(8);
asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
struct BS_KEY *key, u8 iv[]);
asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
struct BS_KEY *key, u8 const iv[]);
asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
struct BS_KEY *key, u8 tweak[]);
asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
struct BS_KEY *key, u8 tweak[]);
struct aesbs_cbc_ctx {
struct AES_KEY enc;
struct BS_KEY dec;
};
struct aesbs_ctr_ctx {
struct BS_KEY enc;
};
struct aesbs_xts_ctx {
struct BS_KEY enc;
struct BS_KEY dec;
struct AES_KEY twkey;
};
static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
int bits = key_len * 8;
if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
ctx->dec.rk = ctx->enc;
private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
ctx->dec.converted = 0;
return 0;
}
static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
int bits = key_len * 8;
if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
ctx->enc.converted = 0;
return 0;
}
static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
int bits = key_len * 4;
if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
ctx->dec.rk = ctx->enc.rk;
private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
ctx->enc.converted = ctx->dec.converted = 0;
return 0;
}
static int aesbs_cbc_encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
while (walk.nbytes) {
u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
u8 *src = walk.src.virt.addr;
if (walk.dst.virt.addr == walk.src.virt.addr) {
u8 *iv = walk.iv;
do {
crypto_xor(src, iv, AES_BLOCK_SIZE);
AES_encrypt(src, src, &ctx->enc);
iv = src;
src += AES_BLOCK_SIZE;
} while (--blocks);
memcpy(walk.iv, iv, AES_BLOCK_SIZE);
} else {
u8 *dst = walk.dst.virt.addr;
do {
crypto_xor(walk.iv, src, AES_BLOCK_SIZE);
AES_encrypt(walk.iv, dst, &ctx->enc);
memcpy(walk.iv, dst, AES_BLOCK_SIZE);
src += AES_BLOCK_SIZE;
dst += AES_BLOCK_SIZE;
} while (--blocks);
}
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static int aesbs_cbc_decrypt(struct blkcipher_desc *desc,
struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) {
kernel_neon_begin();
bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
walk.nbytes, &ctx->dec, walk.iv);
kernel_neon_end();
err = blkcipher_walk_done(desc, &walk, 0);
}
while (walk.nbytes) {
u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
u8 *dst = walk.dst.virt.addr;
u8 *src = walk.src.virt.addr;
u8 bk[2][AES_BLOCK_SIZE];
u8 *iv = walk.iv;
do {
if (walk.dst.virt.addr == walk.src.virt.addr)
memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE);
AES_decrypt(src, dst, &ctx->dec.rk);
crypto_xor(dst, iv, AES_BLOCK_SIZE);
if (walk.dst.virt.addr == walk.src.virt.addr)
iv = bk[blocks & 1];
else
iv = src;
dst += AES_BLOCK_SIZE;
src += AES_BLOCK_SIZE;
} while (--blocks);
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static void inc_be128_ctr(__be32 ctr[], u32 addend)
{
int i;
for (i = 3; i >= 0; i--, addend = 1) {
u32 n = be32_to_cpu(ctr[i]) + addend;
ctr[i] = cpu_to_be32(n);
if (n >= addend)
break;
}
}
static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst, struct scatterlist *src,
unsigned int nbytes)
{
struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
u32 blocks;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
u32 tail = walk.nbytes % AES_BLOCK_SIZE;
__be32 *ctr = (__be32 *)walk.iv;
u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
/* avoid 32 bit counter overflow in the NEON code */
if (unlikely(headroom < blocks)) {
blocks = headroom + 1;
tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
}
kernel_neon_begin();
bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
walk.dst.virt.addr, blocks,
&ctx->enc, walk.iv);
kernel_neon_end();
inc_be128_ctr(ctr, blocks);
nbytes -= blocks * AES_BLOCK_SIZE;
if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE)
break;
err = blkcipher_walk_done(desc, &walk, tail);
}
if (walk.nbytes) {
u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
u8 ks[AES_BLOCK_SIZE];
AES_encrypt(walk.iv, ks, &ctx->enc.rk);
if (tdst != tsrc)
memcpy(tdst, tsrc, nbytes);
crypto_xor(tdst, ks, nbytes);
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static int aesbs_xts_encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
/* generate the initial tweak */
AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
while (walk.nbytes) {
kernel_neon_begin();
bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
walk.nbytes, &ctx->enc, walk.iv);
kernel_neon_end();
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static int aesbs_xts_decrypt(struct blkcipher_desc *desc,
struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
struct blkcipher_walk walk;
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
/* generate the initial tweak */
AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
while (walk.nbytes) {
kernel_neon_begin();
bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
walk.nbytes, &ctx->dec, walk.iv);
kernel_neon_end();
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static struct crypto_alg aesbs_algs[] = { {
.cra_name = "__cbc-aes-neonbs",
.cra_driver_name = "__driver-cbc-aes-neonbs",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_blkcipher = {
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = aesbs_cbc_set_key,
.encrypt = aesbs_cbc_encrypt,
.decrypt = aesbs_cbc_decrypt,
},
}, {
.cra_name = "__ctr-aes-neonbs",
.cra_driver_name = "__driver-ctr-aes-neonbs",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_blkcipher = {
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = aesbs_ctr_set_key,
.encrypt = aesbs_ctr_encrypt,
.decrypt = aesbs_ctr_encrypt,
},
}, {
.cra_name = "__xts-aes-neonbs",
.cra_driver_name = "__driver-xts-aes-neonbs",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct aesbs_xts_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_blkcipher = {
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = aesbs_xts_set_key,
.encrypt = aesbs_xts_encrypt,
.decrypt = aesbs_xts_decrypt,
},
}, {
.cra_name = "cbc(aes)",
.cra_driver_name = "cbc-aes-neonbs",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_ablkcipher = {
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = __ablk_encrypt,
.decrypt = ablk_decrypt,
}
}, {
.cra_name = "ctr(aes)",
.cra_driver_name = "ctr-aes-neonbs",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_ablkcipher = {
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
}
}, {
.cra_name = "xts(aes)",
.cra_driver_name = "xts-aes-neonbs",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct async_helper_ctx),
.cra_alignmask = 7,
.cra_type = &crypto_ablkcipher_type,
.cra_module = THIS_MODULE,
.cra_init = ablk_init,
.cra_exit = ablk_exit,
.cra_ablkcipher = {
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = ablk_set_key,
.encrypt = ablk_encrypt,
.decrypt = ablk_decrypt,
}
} };
static int __init aesbs_mod_init(void)
{
if (!cpu_has_neon())
return -ENODEV;
return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
}
static void __exit aesbs_mod_exit(void)
{
crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
}
module_init(aesbs_mod_init);
module_exit(aesbs_mod_exit);
MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL");

File diff suppressed because it is too large Load Diff

View File

@ -776,6 +776,22 @@ config CRYPTO_AES_ARM
See <http://csrc.nist.gov/encryption/aes/> for more information.
config CRYPTO_AES_ARM_BS
tristate "Bit sliced AES using NEON instructions"
depends on ARM && KERNEL_MODE_NEON
select CRYPTO_ALGAPI
select CRYPTO_AES_ARM
select CRYPTO_ABLK_HELPER
help
Use a faster and more secure NEON based implementation of AES in CBC,
CTR and XTS modes
Bit sliced AES gives around 45% speedup on Cortex-A15 for CTR mode
and for XTS mode encryption, CBC and XTS mode decryption speedup is
around 25%. (CBC encryption speed is not affected by this driver.)
This implementation does not rely on any lookup tables so it is
believed to be invulnerable to cache timing attacks.
config CRYPTO_ANUBIS
tristate "Anubis cipher algorithm"
select CRYPTO_ALGAPI