2016-05-21 20:23:39 +08:00
|
|
|
#! /usr/bin/env perl
|
2020-04-23 20:55:52 +08:00
|
|
|
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
|
2016-05-21 20:23:39 +08:00
|
|
|
#
|
2018-12-06 20:46:12 +08:00
|
|
|
# Licensed under the Apache License 2.0 (the "License"). You may not use
|
2016-05-21 20:23:39 +08:00
|
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
|
|
# in the file LICENSE in the source distribution or at
|
|
|
|
# https://www.openssl.org/source/license.html
|
|
|
|
|
2010-05-04 02:23:29 +08:00
|
|
|
#
|
|
|
|
# ====================================================================
|
|
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
|
|
# project. The module is, however, dual licensed under OpenSSL and
|
|
|
|
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
|
|
# details see http://www.openssl.org/~appro/cryptogams/.
|
|
|
|
# ====================================================================
|
|
|
|
#
|
|
|
|
# April 2010
|
|
|
|
#
|
|
|
|
# The module implements "4-bit" GCM GHASH function and underlying
|
|
|
|
# single multiplication operation in GF(2^128). "4-bit" means that it
|
|
|
|
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
|
|
|
|
# experimental performance data available yet. The only approximation
|
|
|
|
# that can be made at this point is based on code size. Inner loop is
|
|
|
|
# 32 instructions long and on single-issue core should execute in <40
|
|
|
|
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
|
|
|
|
# loop, this assembler loop body was found to be ~3x smaller than
|
|
|
|
# compiler-generated one...
|
|
|
|
#
|
2010-07-13 22:03:31 +08:00
|
|
|
# July 2010
|
|
|
|
#
|
|
|
|
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
|
|
|
|
# Cortex A8 core and ~25 cycles per processed byte (which was observed
|
|
|
|
# to be ~3 times faster than gcc-generated code:-)
|
|
|
|
#
|
2011-04-02 04:58:34 +08:00
|
|
|
# February 2011
|
|
|
|
#
|
|
|
|
# Profiler-assisted and platform-specific optimization resulted in 7%
|
|
|
|
# improvement on Cortex A8 core and ~23.5 cycles per byte.
|
|
|
|
#
|
|
|
|
# March 2011
|
|
|
|
#
|
|
|
|
# Add NEON implementation featuring polynomial multiplication, i.e. no
|
|
|
|
# lookup tables involved. On Cortex A8 it was measured to process one
|
|
|
|
# byte in 15 cycles or 55% faster than integer-only code.
|
2014-04-24 16:16:58 +08:00
|
|
|
#
|
|
|
|
# April 2014
|
|
|
|
#
|
|
|
|
# Switch to multiplication algorithm suggested in paper referred
|
|
|
|
# below and combine it with reduction algorithm from x86 module.
|
|
|
|
# Performance improvement over previous version varies from 65% on
|
|
|
|
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
|
2015-03-04 04:44:53 +08:00
|
|
|
# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
|
|
|
|
# Snapdragon S4 - in 9.33.
|
2014-04-24 16:16:58 +08:00
|
|
|
#
|
2015-07-13 22:53:37 +08:00
|
|
|
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
|
2014-04-24 16:16:58 +08:00
|
|
|
# Polynomial Multiplication on ARM Processors using the NEON Engine.
|
2016-10-11 00:01:24 +08:00
|
|
|
#
|
2014-04-24 16:16:58 +08:00
|
|
|
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
|
2011-04-02 04:58:34 +08:00
|
|
|
|
|
|
|
# ====================================================================
|
2010-07-02 16:14:12 +08:00
|
|
|
# Note about "528B" variant. In ARM case it makes lesser sense to
|
|
|
|
# implement it for following reasons:
|
|
|
|
#
|
|
|
|
# - performance improvement won't be anywhere near 50%, because 128-
|
|
|
|
# bit shift operation is neatly fused with 128-bit xor here, and
|
|
|
|
# "528B" variant would eliminate only 4-5 instructions out of 32
|
|
|
|
# in the inner loop (meaning that estimated improvement is ~15%);
|
|
|
|
# - ARM-based systems are often embedded ones and extra memory
|
|
|
|
# consumption might be unappreciated (for so little improvement);
|
|
|
|
#
|
2010-05-04 02:23:29 +08:00
|
|
|
# Byte order [in]dependence. =========================================
|
|
|
|
#
|
|
|
|
# Caller is expected to maintain specific *dword* order in Htable,
|
|
|
|
# namely with *least* significant dword of 128-bit value at *lower*
|
|
|
|
# address. This differs completely from C code and has everything to
|
|
|
|
# do with ldm instruction and order in which dwords are "consumed" by
|
|
|
|
# algorithm. *Byte* order within these dwords in turn is whatever
|
|
|
|
# *native* byte order on current platform. See gcm128.c for working
|
|
|
|
# example...
|
|
|
|
|
Unify all assembler file generators
They now generally conform to the following argument sequence:
script.pl "$(PERLASM_SCHEME)" [ C preprocessor arguments ... ] \
$(PROCESSOR) <output file>
However, in the spirit of being able to use these scripts manually,
they also allow for no argument, or for only the flavour, or for only
the output file. This is done by only using the last argument as
output file if it's a file (it has an extension), and only using the
first argument as flavour if it isn't a file (it doesn't have an
extension).
While we're at it, we make all $xlate calls the same, i.e. the $output
argument is always quoted, and we always die on error when trying to
start $xlate.
There's a perl lesson in this, regarding operator priority...
This will always succeed, even when it fails:
open FOO, "something" || die "ERR: $!";
The reason is that '||' has higher priority than list operators (a
function is essentially a list operator and gobbles up everything
following it that isn't lower priority), and since a non-empty string
is always true, so that ends up being exactly the same as:
open FOO, "something";
This, however, will fail if "something" can't be opened:
open FOO, "something" or die "ERR: $!";
The reason is that 'or' has lower priority than list operators,
i.e. it's performed after the 'open' call.
Reviewed-by: Matt Caswell <matt@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/9884)
2019-09-13 06:06:46 +08:00
|
|
|
# $output is the last argument if it looks like a file (it has an extension)
|
|
|
|
# $flavour is the first argument if it doesn't look like a file
|
|
|
|
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
|
|
|
|
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
|
2015-04-02 16:17:42 +08:00
|
|
|
|
|
|
|
if ($flavour && $flavour ne "void") {
|
|
|
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
|
|
|
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
|
|
|
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
|
|
|
die "can't locate arm-xlate.pl";
|
|
|
|
|
Unify all assembler file generators
They now generally conform to the following argument sequence:
script.pl "$(PERLASM_SCHEME)" [ C preprocessor arguments ... ] \
$(PROCESSOR) <output file>
However, in the spirit of being able to use these scripts manually,
they also allow for no argument, or for only the flavour, or for only
the output file. This is done by only using the last argument as
output file if it's a file (it has an extension), and only using the
first argument as flavour if it isn't a file (it doesn't have an
extension).
While we're at it, we make all $xlate calls the same, i.e. the $output
argument is always quoted, and we always die on error when trying to
start $xlate.
There's a perl lesson in this, regarding operator priority...
This will always succeed, even when it fails:
open FOO, "something" || die "ERR: $!";
The reason is that '||' has higher priority than list operators (a
function is essentially a list operator and gobbles up everything
following it that isn't lower priority), and since a non-empty string
is always true, so that ends up being exactly the same as:
open FOO, "something";
This, however, will fail if "something" can't be opened:
open FOO, "something" or die "ERR: $!";
The reason is that 'or' has lower priority than list operators,
i.e. it's performed after the 'open' call.
Reviewed-by: Matt Caswell <matt@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/9884)
2019-09-13 06:06:46 +08:00
|
|
|
open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
|
|
|
|
or die "can't call $xlate: $!";
|
2015-04-02 16:17:42 +08:00
|
|
|
} else {
|
Unify all assembler file generators
They now generally conform to the following argument sequence:
script.pl "$(PERLASM_SCHEME)" [ C preprocessor arguments ... ] \
$(PROCESSOR) <output file>
However, in the spirit of being able to use these scripts manually,
they also allow for no argument, or for only the flavour, or for only
the output file. This is done by only using the last argument as
output file if it's a file (it has an extension), and only using the
first argument as flavour if it isn't a file (it doesn't have an
extension).
While we're at it, we make all $xlate calls the same, i.e. the $output
argument is always quoted, and we always die on error when trying to
start $xlate.
There's a perl lesson in this, regarding operator priority...
This will always succeed, even when it fails:
open FOO, "something" || die "ERR: $!";
The reason is that '||' has higher priority than list operators (a
function is essentially a list operator and gobbles up everything
following it that isn't lower priority), and since a non-empty string
is always true, so that ends up being exactly the same as:
open FOO, "something";
This, however, will fail if "something" can't be opened:
open FOO, "something" or die "ERR: $!";
The reason is that 'or' has lower priority than list operators,
i.e. it's performed after the 'open' call.
Reviewed-by: Matt Caswell <matt@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/9884)
2019-09-13 06:06:46 +08:00
|
|
|
# No translator flavour: send the generated code straight to $output when an
# output file was given, otherwise keep the inherited STDOUT.  Die if the
# file cannot be opened instead of silently writing to the old STDOUT,
# matching the error handling of the xlate pipe in the branch above.
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
|
2015-04-02 16:17:42 +08:00
|
|
|
}
|
2010-07-08 23:03:42 +08:00
|
|
|
|
2010-05-04 02:23:29 +08:00
|
|
|
$Xi="r0"; # argument block
|
|
|
|
$Htbl="r1";
|
|
|
|
$inp="r2";
|
|
|
|
$len="r3";
|
2011-04-02 04:58:34 +08:00
|
|
|
|
2010-05-04 02:23:29 +08:00
|
|
|
$Zll="r4"; # variables
|
|
|
|
$Zlh="r5";
|
|
|
|
$Zhl="r6";
|
|
|
|
$Zhh="r7";
|
|
|
|
$Tll="r8";
|
|
|
|
$Tlh="r9";
|
|
|
|
$Thl="r10";
|
|
|
|
$Thh="r11";
|
|
|
|
$nlo="r12";
|
|
|
|
################# r13 is stack pointer
|
|
|
|
$nhi="r14";
|
|
|
|
################# r15 is program counter
|
|
|
|
|
|
|
|
$rem_4bit=$inp; # used in gcm_gmult_4bit
|
|
|
|
$cnt=$len;
|
|
|
|
|
|
|
|
# Zsmash(@insns)
#
# Appends to $code the instructions that store the four accumulator words
# $Zll, $Zlh, $Zhl, $Zhh to offsets 12, 8, 4 and 0 of the buffer addressed
# by $Xi.  Three variants are emitted under preprocessor control:
#   - ARMv7+ little-endian: byte-swap the register in place (rev), then str;
#   - big-endian:           plain str;
#   - anything else:        four strb stores, most significant byte at the
#                           lowest address, using $Tlh/$Thl/$Thh as scratch.
#
# After each word's store sequence one caller-supplied instruction string
# is emitted (prefixed with a tab), so callers can interleave unrelated
# work with the stores.  When fewer than four strings are supplied, the
# remaining slots produce an empty "\t\n" line.
#
# The sub deliberately has no prototype: it is called with arguments.
sub Zsmash {
    my @insns = @_;
    my $off   = 12;		# byte offset of the word being stored

    for my $reg ($Zll,$Zlh,$Zhl,$Zhh) {
	$code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev $reg,$reg
str $reg,[$Xi,#$off]
#elif defined(__ARMEB__)
str $reg,[$Xi,#$off]
#else
mov $Tlh,$reg,lsr#8
strb $reg,[$Xi,#$off+3]
mov $Thl,$reg,lsr#16
strb $Tlh,[$Xi,#$off+2]
mov $Thh,$reg,lsr#24
strb $Thl,[$Xi,#$off+1]
strb $Thh,[$Xi,#$off]
#endif
___
	# Interleaved caller instruction; an exhausted list yields an empty slot.
	my $insn = shift(@insns);
	$code.="\t".(defined($insn) ? $insn : "")."\n";
	$off-=4;
    }
}
|
|
|
|
|
|
|
|
$code=<<___;
|
2011-04-02 04:58:34 +08:00
|
|
|
#include "arm_arch.h"
|
|
|
|
|
2016-08-28 02:22:03 +08:00
|
|
|
#if defined(__thumb2__) || defined(__clang__)
|
2015-09-24 00:41:27 +08:00
|
|
|
.syntax unified
|
2018-06-29 23:48:54 +08:00
|
|
|
#define ldrplb ldrbpl
|
|
|
|
#define ldrneb ldrbne
|
2016-08-28 02:22:03 +08:00
|
|
|
#endif
|
|
|
|
#if defined(__thumb2__)
|
2015-09-24 00:41:27 +08:00
|
|
|
.thumb
|
|
|
|
#else
|
2010-05-04 02:23:29 +08:00
|
|
|
.code 32
|
2015-09-24 00:41:27 +08:00
|
|
|
#endif
|
2010-05-04 02:23:29 +08:00
|
|
|
|
2019-02-15 16:44:39 +08:00
|
|
|
.text
|
|
|
|
|
2010-05-04 02:23:29 +08:00
|
|
|
.type rem_4bit,%object
|
|
|
|
.align 5
|
|
|
|
rem_4bit:
|
|
|
|
.short 0x0000,0x1C20,0x3840,0x2460
|
|
|
|
.short 0x7080,0x6CA0,0x48C0,0x54E0
|
|
|
|
.short 0xE100,0xFD20,0xD940,0xC560
|
|
|
|
.short 0x9180,0x8DA0,0xA9C0,0xB5E0
|
|
|
|
.size rem_4bit,.-rem_4bit
|
|
|
|
|
|
|
|
.type rem_4bit_get,%function
|
|
|
|
rem_4bit_get:
|
2015-09-24 00:41:27 +08:00
|
|
|
#if defined(__thumb2__)
|
|
|
|
adr $rem_4bit,rem_4bit
|
|
|
|
#else
|
|
|
|
sub $rem_4bit,pc,#8+32 @ &rem_4bit
|
|
|
|
#endif
|
2010-05-04 02:23:29 +08:00
|
|
|
b .Lrem_4bit_got
|
|
|
|
nop
|
2015-09-24 00:41:27 +08:00
|
|
|
nop
|
2010-05-04 02:23:29 +08:00
|
|
|
.size rem_4bit_get,.-rem_4bit_get
|
|
|
|
|
|
|
|
.global gcm_ghash_4bit
|
|
|
|
.type gcm_ghash_4bit,%function
|
2015-09-24 00:41:27 +08:00
|
|
|
.align 4
|
2010-05-04 02:23:29 +08:00
|
|
|
gcm_ghash_4bit:
|
2015-09-24 00:41:27 +08:00
|
|
|
#if defined(__thumb2__)
|
|
|
|
adr r12,rem_4bit
|
|
|
|
#else
|
|
|
|
sub r12,pc,#8+48 @ &rem_4bit
|
|
|
|
#endif
|
2010-05-04 02:23:29 +08:00
|
|
|
add $len,$inp,$len @ $len to point at the end
|
|
|
|
stmdb sp!,{r3-r11,lr} @ save $len/end too
|
|
|
|
|
|
|
|
ldmia r12,{r4-r11} @ copy rem_4bit ...
|
|
|
|
stmdb sp!,{r4-r11} @ ... to stack
|
|
|
|
|
|
|
|
ldrb $nlo,[$inp,#15]
|
|
|
|
ldrb $nhi,[$Xi,#15]
|
|
|
|
.Louter:
|
|
|
|
eor $nlo,$nlo,$nhi
|
|
|
|
and $nhi,$nlo,#0xf0
|
|
|
|
and $nlo,$nlo,#0x0f
|
|
|
|
mov $cnt,#14
|
|
|
|
|
|
|
|
add $Zhh,$Htbl,$nlo,lsl#4
|
|
|
|
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
|
2010-07-13 22:03:31 +08:00
|
|
|
add $Thh,$Htbl,$nhi
|
2010-05-04 02:23:29 +08:00
|
|
|
ldrb $nlo,[$inp,#14]
|
|
|
|
|
|
|
|
and $nhi,$Zll,#0xf @ rem
|
|
|
|
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
|
2010-07-13 22:03:31 +08:00
|
|
|
add $nhi,$nhi,$nhi
|
2010-05-04 02:23:29 +08:00
|
|
|
eor $Zll,$Tll,$Zll,lsr#4
|
|
|
|
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
|
|
|
|
eor $Zll,$Zll,$Zlh,lsl#28
|
|
|
|
ldrb $nhi,[$Xi,#14]
|
|
|
|
eor $Zlh,$Tlh,$Zlh,lsr#4
|
|
|
|
eor $Zlh,$Zlh,$Zhl,lsl#28
|
|
|
|
eor $Zhl,$Thl,$Zhl,lsr#4
|
|
|
|
eor $Zhl,$Zhl,$Zhh,lsl#28
|
|
|
|
eor $Zhh,$Thh,$Zhh,lsr#4
|
|
|
|
eor $nlo,$nlo,$nhi
|
|
|
|
and $nhi,$nlo,#0xf0
|
|
|
|
and $nlo,$nlo,#0x0f
|
2010-07-13 22:03:31 +08:00
|
|
|
eor $Zhh,$Zhh,$Tll,lsl#16
|
2010-05-04 02:23:29 +08:00
|
|
|
|
2011-04-02 04:58:34 +08:00
|
|
|
.Linner:
|
2010-05-04 02:23:29 +08:00
|
|
|
add $Thh,$Htbl,$nlo,lsl#4
|
|
|
|
and $nlo,$Zll,#0xf @ rem
|
2011-04-02 04:58:34 +08:00
|
|
|
subs $cnt,$cnt,#1
|
2010-05-04 02:23:29 +08:00
|
|
|
add $nlo,$nlo,$nlo
|
2011-04-02 04:58:34 +08:00
|
|
|
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
|
2010-05-04 02:23:29 +08:00
|
|
|
eor $Zll,$Tll,$Zll,lsr#4
|
|
|
|
eor $Zll,$Zll,$Zlh,lsl#28
|
|
|
|
eor $Zlh,$Tlh,$Zlh,lsr#4
|
|
|
|
eor $Zlh,$Zlh,$Zhl,lsl#28
|
2011-04-02 04:58:34 +08:00
|
|
|
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
|
2010-05-04 02:23:29 +08:00
|
|
|
eor $Zhl,$Thl,$Zhl,lsr#4
|
2015-09-24 00:41:27 +08:00
|
|
|
#ifdef __thumb2__
|
|
|
|
it pl
|
|
|
|
#endif
|
2011-04-02 04:58:34 +08:00
|
|
|
ldrplb $nlo,[$inp,$cnt]
|
2010-05-04 02:23:29 +08:00
|
|
|
eor $Zhl,$Zhl,$Zhh,lsl#28
|
|
|
|
eor $Zhh,$Thh,$Zhh,lsr#4
|
|
|
|
|
|
|
|
add $Thh,$Htbl,$nhi
|
|
|
|
and $nhi,$Zll,#0xf @ rem
|
2011-04-02 04:58:34 +08:00
|
|
|
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
|
2010-05-04 02:23:29 +08:00
|
|
|
add $nhi,$nhi,$nhi
|
2011-04-02 04:58:34 +08:00
|
|
|
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
|
2010-05-04 02:23:29 +08:00
|
|
|
eor $Zll,$Tll,$Zll,lsr#4
|
2015-09-24 00:41:27 +08:00
|
|
|
#ifdef __thumb2__
|
|
|
|
it pl
|
|
|
|
#endif
|
2011-04-02 04:58:34 +08:00
|
|
|
ldrplb $Tll,[$Xi,$cnt]
|
2010-05-04 02:23:29 +08:00
|
|
|
eor $Zll,$Zll,$Zlh,lsl#28
|
|
|
|
eor $Zlh,$Tlh,$Zlh,lsr#4
|
2011-04-02 04:58:34 +08:00
|
|
|
ldrh $Tlh,[sp,$nhi]
|
2010-05-04 02:23:29 +08:00
|
|
|
eor $Zlh,$Zlh,$Zhl,lsl#28
|
|
|
|
eor $Zhl,$Thl,$Zhl,lsr#4
|
|
|
|
eor $Zhl,$Zhl,$Zhh,lsl#28
|
2015-09-24 00:41:27 +08:00
|
|
|
#ifdef __thumb2__
|
|
|
|
it pl
|
|
|
|
#endif
|
2011-04-02 04:58:34 +08:00
|
|
|
eorpl $nlo,$nlo,$Tll
|
2010-07-13 22:03:31 +08:00
|
|
|
eor $Zhh,$Thh,$Zhh,lsr#4
|
2015-09-24 00:41:27 +08:00
|
|
|
#ifdef __thumb2__
|
|
|
|
itt pl
|
|
|
|
#endif
|
2010-05-04 02:23:29 +08:00
|
|
|
andpl $nhi,$nlo,#0xf0
|
|
|
|
andpl $nlo,$nlo,#0x0f
|
2011-04-02 04:58:34 +08:00
|
|
|
eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
|
|
|
|
bpl .Linner
|
2010-05-04 02:23:29 +08:00
|
|
|
|
|
|
|
ldr $len,[sp,#32] @ re-load $len/end
|
|
|
|
add $inp,$inp,#16
|
|
|
|
mov $nhi,$Zll
|
|
|
|
___
|
2015-09-24 00:41:27 +08:00
|
|
|
&Zsmash("cmp\t$inp,$len","\n".
|
|
|
|
"#ifdef __thumb2__\n".
|
|
|
|
" it ne\n".
|
|
|
|
"#endif\n".
|
|
|
|
" ldrneb $nlo,[$inp,#15]");
|
2010-05-04 02:23:29 +08:00
|
|
|
$code.=<<___;
|
|
|
|
bne .Louter
|
|
|
|
|
|
|
|
add sp,sp,#36
|
2011-04-02 04:58:34 +08:00
|
|
|
#if __ARM_ARCH__>=5
|
|
|
|
ldmia sp!,{r4-r11,pc}
|
|
|
|
#else
|
2010-05-04 02:23:29 +08:00
|
|
|
ldmia sp!,{r4-r11,lr}
|
|
|
|
tst lr,#1
|
|
|
|
moveq pc,lr @ be binary compatible with V4, yet
|
|
|
|
bx lr @ interoperable with Thumb ISA:-)
|
2011-04-02 04:58:34 +08:00
|
|
|
#endif
|
2010-05-04 02:23:29 +08:00
|
|
|
.size gcm_ghash_4bit,.-gcm_ghash_4bit
|
|
|
|
|
|
|
|
.global gcm_gmult_4bit
|
|
|
|
.type gcm_gmult_4bit,%function
|
|
|
|
gcm_gmult_4bit:
|
|
|
|
stmdb sp!,{r4-r11,lr}
|
|
|
|
ldrb $nlo,[$Xi,#15]
|
|
|
|
b rem_4bit_get
|
|
|
|
.Lrem_4bit_got:
|
|
|
|
and $nhi,$nlo,#0xf0
|
|
|
|
and $nlo,$nlo,#0x0f
|
|
|
|
mov $cnt,#14
|
|
|
|
|
|
|
|
add $Zhh,$Htbl,$nlo,lsl#4
|
|
|
|
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
|
|
|
|
ldrb $nlo,[$Xi,#14]
|
|
|
|
|
|
|
|
add $Thh,$Htbl,$nhi
|
|
|
|
and $nhi,$Zll,#0xf @ rem
|
|
|
|
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
|
2010-07-13 22:03:31 +08:00
|
|
|
add $nhi,$nhi,$nhi
|
2010-05-04 02:23:29 +08:00
|
|
|
eor $Zll,$Tll,$Zll,lsr#4
|
|
|
|
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
|
|
|
|
eor $Zll,$Zll,$Zlh,lsl#28
|
|
|
|
eor $Zlh,$Tlh,$Zlh,lsr#4
|
|
|
|
eor $Zlh,$Zlh,$Zhl,lsl#28
|
|
|
|
eor $Zhl,$Thl,$Zhl,lsr#4
|
|
|
|
eor $Zhl,$Zhl,$Zhh,lsl#28
|
|
|
|
eor $Zhh,$Thh,$Zhh,lsr#4
|
|
|
|
and $nhi,$nlo,#0xf0
|
|
|
|
eor $Zhh,$Zhh,$Tll,lsl#16
|
|
|
|
and $nlo,$nlo,#0x0f
|
|
|
|
|
2011-04-02 04:58:34 +08:00
|
|
|
.Loop:
|
2010-05-04 02:23:29 +08:00
|
|
|
add $Thh,$Htbl,$nlo,lsl#4
|
|
|
|
and $nlo,$Zll,#0xf @ rem
|
2011-04-02 04:58:34 +08:00
|
|
|
subs $cnt,$cnt,#1
|
2010-05-04 02:23:29 +08:00
|
|
|
add $nlo,$nlo,$nlo
|
2011-04-02 04:58:34 +08:00
|
|
|
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
|
2010-05-04 02:23:29 +08:00
|
|
|
eor $Zll,$Tll,$Zll,lsr#4
|
|
|
|
eor $Zll,$Zll,$Zlh,lsl#28
|
|
|
|
eor $Zlh,$Tlh,$Zlh,lsr#4
|
|
|
|
eor $Zlh,$Zlh,$Zhl,lsl#28
|
2011-04-02 04:58:34 +08:00
|
|
|
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
|
2010-05-04 02:23:29 +08:00
|
|
|
eor $Zhl,$Thl,$Zhl,lsr#4
|
2015-09-24 00:41:27 +08:00
|
|
|
#ifdef __thumb2__
|
|
|
|
it pl
|
|
|
|
#endif
|
2011-04-02 04:58:34 +08:00
|
|
|
ldrplb $nlo,[$Xi,$cnt]
|
2010-05-04 02:23:29 +08:00
|
|
|
eor $Zhl,$Zhl,$Zhh,lsl#28
|
|
|
|
eor $Zhh,$Thh,$Zhh,lsr#4
|
|
|
|
|
|
|
|
add $Thh,$Htbl,$nhi
|
|
|
|
and $nhi,$Zll,#0xf @ rem
|
2011-04-02 04:58:34 +08:00
|
|
|
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
|
2010-05-04 02:23:29 +08:00
|
|
|
add $nhi,$nhi,$nhi
|
2011-04-02 04:58:34 +08:00
|
|
|
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
|
2010-05-04 02:23:29 +08:00
|
|
|
eor $Zll,$Tll,$Zll,lsr#4
|
|
|
|
eor $Zll,$Zll,$Zlh,lsl#28
|
|
|
|
eor $Zlh,$Tlh,$Zlh,lsr#4
|
2011-04-02 04:58:34 +08:00
|
|
|
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
|
2010-05-04 02:23:29 +08:00
|
|
|
eor $Zlh,$Zlh,$Zhl,lsl#28
|
|
|
|
eor $Zhl,$Thl,$Zhl,lsr#4
|
|
|
|
eor $Zhl,$Zhl,$Zhh,lsl#28
|
|
|
|
eor $Zhh,$Thh,$Zhh,lsr#4
|
2015-09-24 00:41:27 +08:00
|
|
|
#ifdef __thumb2__
|
|
|
|
itt pl
|
|
|
|
#endif
|
2010-05-04 02:23:29 +08:00
|
|
|
andpl $nhi,$nlo,#0xf0
|
|
|
|
andpl $nlo,$nlo,#0x0f
|
2010-07-13 22:03:31 +08:00
|
|
|
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
|
2011-04-02 04:58:34 +08:00
|
|
|
bpl .Loop
|
2010-05-04 02:23:29 +08:00
|
|
|
___
|
|
|
|
&Zsmash();
|
|
|
|
$code.=<<___;
|
2011-04-02 04:58:34 +08:00
|
|
|
#if __ARM_ARCH__>=5
|
|
|
|
ldmia sp!,{r4-r11,pc}
|
|
|
|
#else
|
2010-05-04 02:23:29 +08:00
|
|
|
ldmia sp!,{r4-r11,lr}
|
|
|
|
tst lr,#1
|
|
|
|
moveq pc,lr @ be binary compatible with V4, yet
|
|
|
|
bx lr @ interoperable with Thumb ISA:-)
|
2011-04-02 04:58:34 +08:00
|
|
|
#endif
|
2010-05-04 02:23:29 +08:00
|
|
|
.size gcm_gmult_4bit,.-gcm_gmult_4bit
|
2011-04-02 04:58:34 +08:00
|
|
|
___
|
|
|
|
{
|
2014-04-24 16:16:58 +08:00
|
|
|
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
|
|
|
|
# Scratch q-registers for clmul64x64 and the reduction: exactly four names
# (q8..q11) for the four variables.
my ($t0,$t1,$t2,$t3)=map("q$_",(8..11));
|
|
|
|
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
|
2011-04-02 04:58:34 +08:00
|
|
|
|
2014-04-24 16:16:58 +08:00
|
|
|
# clmul64x64($res, $lhs, $rhs)
#
# Appends to $code a NEON sequence that computes a 64x64-bit carry-less
# (polynomial) product from eight vmull.p8 multiplications of byte-rotated
# copies of the operands (A1..A3, B1..B4 in the emitted comments), whose
# partial products are masked, shifted and xor-ed into the plain product D.
#
#   $res  - name of the q-register receiving the result (also used as
#           scratch while the partial products are formed);
#   $lhs  - name of the d-register holding operand A;
#   $rhs  - name of the d-register holding operand B (may be written in
#           "qN#lo"/"qN#hi" form, resolved by the output post-processing).
#
# Uses $t0..$t3 as scratch and expects the masks $k48, $k32 and $k16 to be
# loaded by the caller.  The operand names are deliberately not called
# $a/$b, which are Perl's special sort variables.
sub clmul64x64 {
    my ($res,$lhs,$rhs)=@_;

    $code.=<<___;
vext.8 $t0#lo, $lhs, $lhs, #1 @ A1
vmull.p8 $t0, $t0#lo, $rhs @ F = A1*B
vext.8 $res#lo, $rhs, $rhs, #1 @ B1
vmull.p8 $res, $lhs, $res#lo @ E = A*B1
vext.8 $t1#lo, $lhs, $lhs, #2 @ A2
vmull.p8 $t1, $t1#lo, $rhs @ H = A2*B
vext.8 $t3#lo, $rhs, $rhs, #2 @ B2
vmull.p8 $t3, $lhs, $t3#lo @ G = A*B2
vext.8 $t2#lo, $lhs, $lhs, #3 @ A3
veor $t0, $t0, $res @ L = E + F
vmull.p8 $t2, $t2#lo, $rhs @ J = A3*B
vext.8 $res#lo, $rhs, $rhs, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $res, $lhs, $res#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $rhs, $rhs, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $lhs, $t3#lo @ K = A*B4
veor $t2, $t2, $res @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $res, $lhs, $rhs @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $res, $res, $t0
veor $res, $res, $t2
___
}
|
2011-04-02 04:58:34 +08:00
|
|
|
|
|
|
|
$code.=<<___;
|
2014-11-08 05:48:22 +08:00
|
|
|
#if __ARM_MAX_ARCH__>=7
|
|
|
|
.arch armv7-a
|
2011-04-02 04:58:34 +08:00
|
|
|
.fpu neon
|
|
|
|
|
2014-04-24 16:16:58 +08:00
|
|
|
.global gcm_init_neon
|
|
|
|
.type gcm_init_neon,%function
|
|
|
|
.align 4
|
|
|
|
gcm_init_neon:
|
2015-04-02 16:17:42 +08:00
|
|
|
vld1.64 $IN#hi,[r1]! @ load H
|
2014-04-24 16:16:58 +08:00
|
|
|
vmov.i8 $t0,#0xe1
|
2015-04-02 16:17:42 +08:00
|
|
|
vld1.64 $IN#lo,[r1]
|
2014-04-24 16:16:58 +08:00
|
|
|
vshl.i64 $t0#hi,#57
|
|
|
|
vshr.u64 $t0#lo,#63 @ t0=0xc2....01
|
|
|
|
vdup.8 $t1,$IN#hi[7]
|
|
|
|
vshr.u64 $Hlo,$IN#lo,#63
|
|
|
|
vshr.s8 $t1,#7 @ broadcast carry bit
|
|
|
|
vshl.i64 $IN,$IN,#1
|
|
|
|
vand $t0,$t0,$t1
|
|
|
|
vorr $IN#hi,$Hlo @ H<<<=1
|
|
|
|
veor $IN,$IN,$t0 @ twisted H
|
|
|
|
vstmia r0,{$IN}
|
|
|
|
|
2014-06-07 03:27:18 +08:00
|
|
|
ret @ bx lr
|
2014-04-24 16:16:58 +08:00
|
|
|
.size gcm_init_neon,.-gcm_init_neon
|
|
|
|
|
2011-04-02 04:58:34 +08:00
|
|
|
.global gcm_gmult_neon
|
|
|
|
.type gcm_gmult_neon,%function
|
|
|
|
.align 4
|
|
|
|
gcm_gmult_neon:
|
2015-04-02 16:17:42 +08:00
|
|
|
vld1.64 $IN#hi,[$Xi]! @ load Xi
|
|
|
|
vld1.64 $IN#lo,[$Xi]!
|
2014-04-24 16:16:58 +08:00
|
|
|
vmov.i64 $k48,#0x0000ffffffffffff
|
|
|
|
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
|
|
|
|
vmov.i64 $k32,#0x00000000ffffffff
|
2011-04-02 04:58:34 +08:00
|
|
|
#ifdef __ARMEL__
|
|
|
|
vrev64.8 $IN,$IN
|
|
|
|
#endif
|
2014-04-24 16:16:58 +08:00
|
|
|
vmov.i64 $k16,#0x000000000000ffff
|
|
|
|
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
|
2011-04-02 04:58:34 +08:00
|
|
|
mov $len,#16
|
2014-04-24 16:16:58 +08:00
|
|
|
b .Lgmult_neon
|
2011-04-02 04:58:34 +08:00
|
|
|
.size gcm_gmult_neon,.-gcm_gmult_neon
|
|
|
|
|
|
|
|
.global gcm_ghash_neon
|
|
|
|
.type gcm_ghash_neon,%function
|
|
|
|
.align 4
|
|
|
|
gcm_ghash_neon:
|
2015-04-02 16:17:42 +08:00
|
|
|
vld1.64 $Xl#hi,[$Xi]! @ load Xi
|
|
|
|
vld1.64 $Xl#lo,[$Xi]!
|
2014-04-24 16:16:58 +08:00
|
|
|
vmov.i64 $k48,#0x0000ffffffffffff
|
|
|
|
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
|
|
|
|
vmov.i64 $k32,#0x00000000ffffffff
|
2011-04-02 04:58:34 +08:00
|
|
|
#ifdef __ARMEL__
|
2014-04-24 16:16:58 +08:00
|
|
|
vrev64.8 $Xl,$Xl
|
2011-04-02 04:58:34 +08:00
|
|
|
#endif
|
2014-04-24 16:16:58 +08:00
|
|
|
vmov.i64 $k16,#0x000000000000ffff
|
|
|
|
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
|
|
|
|
|
|
|
|
.Loop_neon:
|
|
|
|
vld1.64 $IN#hi,[$inp]! @ load inp
|
|
|
|
vld1.64 $IN#lo,[$inp]!
|
2011-04-02 04:58:34 +08:00
|
|
|
#ifdef __ARMEL__
|
|
|
|
vrev64.8 $IN,$IN
|
|
|
|
#endif
|
2014-04-24 16:16:58 +08:00
|
|
|
veor $IN,$Xl @ inp^=Xi
|
|
|
|
.Lgmult_neon:
|
|
|
|
___
|
2015-07-13 22:53:37 +08:00
|
|
|
&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
|
2014-04-24 16:16:58 +08:00
|
|
|
$code.=<<___;
|
|
|
|
veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
|
|
|
|
___
|
2015-07-13 22:53:37 +08:00
|
|
|
&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
|
|
|
|
&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
|
2014-04-24 16:16:58 +08:00
|
|
|
$code.=<<___;
|
|
|
|
veor $Xm,$Xm,$Xl @ Karatsuba post-processing
|
|
|
|
veor $Xm,$Xm,$Xh
|
|
|
|
veor $Xl#hi,$Xl#hi,$Xm#lo
|
|
|
|
veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
|
|
|
|
|
|
|
|
@ equivalent of reduction_avx from ghash-x86_64.pl
|
|
|
|
vshl.i64 $t1,$Xl,#57 @ 1st phase
|
|
|
|
vshl.i64 $t2,$Xl,#62
|
|
|
|
veor $t2,$t2,$t1 @
|
|
|
|
vshl.i64 $t1,$Xl,#63
|
|
|
|
veor $t2, $t2, $t1 @
|
|
|
|
veor $Xl#hi,$Xl#hi,$t2#lo @
|
|
|
|
veor $Xh#lo,$Xh#lo,$t2#hi
|
|
|
|
|
|
|
|
vshr.u64 $t2,$Xl,#1 @ 2nd phase
|
|
|
|
veor $Xh,$Xh,$Xl
|
|
|
|
veor $Xl,$Xl,$t2 @
|
|
|
|
vshr.u64 $t2,$t2,#6
|
|
|
|
vshr.u64 $Xl,$Xl,#1 @
|
|
|
|
veor $Xl,$Xl,$Xh @
|
|
|
|
veor $Xl,$Xl,$t2 @
|
|
|
|
|
2011-04-02 04:58:34 +08:00
|
|
|
subs $len,#16
|
2014-04-24 16:16:58 +08:00
|
|
|
bne .Loop_neon
|
2011-04-02 04:58:34 +08:00
|
|
|
|
|
|
|
#ifdef __ARMEL__
|
2014-04-24 16:16:58 +08:00
|
|
|
vrev64.8 $Xl,$Xl
|
2011-04-02 04:58:34 +08:00
|
|
|
#endif
|
2016-10-11 00:01:24 +08:00
|
|
|
sub $Xi,#16
|
2015-04-02 16:17:42 +08:00
|
|
|
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
|
|
|
|
vst1.64 $Xl#lo,[$Xi]
|
2011-04-02 04:58:34 +08:00
|
|
|
|
2014-06-07 03:27:18 +08:00
|
|
|
ret @ bx lr
|
2011-04-02 04:58:34 +08:00
|
|
|
.size gcm_ghash_neon,.-gcm_ghash_neon
|
|
|
|
#endif
|
|
|
|
___
|
|
|
|
}
|
|
|
|
$code.=<<___;
|
|
|
|
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
|
2010-05-04 02:23:29 +08:00
|
|
|
.align 2
|
|
|
|
___
|
|
|
|
|
2014-04-24 16:16:58 +08:00
|
|
|
# Post-process the accumulated assembly one line at a time and print it.
for (split("\n",$code)) {
    # Evaluate any `...` expression embedded in the line.
    s/\`([^\`]*)\`/eval $1/ge;

    # At most one of the following rewrites is applied per line, in this
    # order of preference:
    #   1. qN#lo / qN#hi  ->  d(2N) / d(2N+1)
    #   2. ret            ->  bx lr
    #   3. bx lr          ->  raw opcode word
    # A "ret" rewritten by step 2 is therefore never turned into a .word.
    my $half_done = s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge;
    my $ret_done  = !$half_done && s/\bret\b/bx lr/g;
    unless ($half_done || $ret_done) {
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
    }

    print $_,"\n";
}
|
2020-02-17 10:17:53 +08:00
|
|
|
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
|