2008-03-25 08:54:53 +08:00
|
|
|
/* sha1.c - Functions to compute SHA1 message digest of files or
|
|
|
|
memory blocks according to the NIST specification FIPS-180-1.
|
|
|
|
|
2024-01-03 19:19:35 +08:00
|
|
|
Copyright (C) 2000-2024 Free Software Foundation, Inc.
|
2008-03-25 08:54:53 +08:00
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it
|
|
|
|
under the terms of the GNU General Public License as published by the
|
|
|
|
Free Software Foundation; either version 2, or (at your option) any
|
|
|
|
later version.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with this program; if not, write to the Free Software Foundation,
|
|
|
|
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
|
|
|
|
|
|
|
|
/* Written by Scott G. Miller
|
|
|
|
Credits:
|
|
|
|
Robert Klep <robert@ilse.nl> -- Expansion function fix
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <config.h>
|
|
|
|
|
|
|
|
#include "sha1.h"
|
|
|
|
|
|
|
|
#include <stddef.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
libiberty: Use x86 HW optimized sha1
Nick has approved this patch (+ small ld change to use it for --build-id=),
so I'm committing it to GCC as master as well.
If anyone from ARM would be willing to implement it similarly with
vsha1{cq,mq,pq,h,su0q,su1q}_u32 intrinsics, it could be a useful linker
speedup on those hosts as well, the intent in sha1.c was that
sha1_hw_process_bytes, sha1_hw_process_block functions
would be defined whenever
defined (HAVE_X86_SHA1_HW_SUPPORT) || defined (HAVE_WHATEVERELSE_SHA1_HW_SUPPORT)
but the body of sha1_hw_process_block and sha1_choose_process_bytes
would then have #elif defined (HAVE_WHATEVERELSE_SHA1_HW_SUPPORT) for
the other arch support, similarly for any target attributes on
sha1_hw_process_block if needed.
2023-11-28 Jakub Jelinek <jakub@redhat.com>
include/
* sha1.h (sha1_process_bytes_fn): New typedef.
(sha1_choose_process_bytes): Declare.
libiberty/
* configure.ac (HAVE_X86_SHA1_HW_SUPPORT): New check.
* sha1.c: If HAVE_X86_SHA1_HW_SUPPORT is defined, include x86intrin.h
and cpuid.h.
(sha1_hw_process_bytes, sha1_hw_process_block,
sha1_choose_process_bytes): New functions.
* config.in: Regenerated.
* configure: Regenerated.
2023-11-28 20:14:05 +08:00
|
|
|
#ifdef HAVE_X86_SHA1_HW_SUPPORT
|
|
|
|
# include <x86intrin.h>
|
|
|
|
# include <cpuid.h>
|
|
|
|
#endif
|
|
|
|
|
2008-03-25 08:54:53 +08:00
|
|
|
#if USE_UNLOCKED_IO
|
|
|
|
# include "unlocked-io.h"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* SWAP (n) converts a 32-bit word between host byte order and big-endian
   byte order: it is the identity when WORDS_BIGENDIAN is defined and a
   full byte reversal otherwise.  The argument is expanded several times
   in the byte-reversing variant, so it must be free of side effects.  */
#ifdef WORDS_BIGENDIAN
|
|
|
|
# define SWAP(n) (n)
|
|
|
|
#else
|
|
|
|
# define SWAP(n) \
|
|
|
|
(((n) << 24) | (((n) & 0xff00) << 8) | (((n) >> 8) & 0xff00) | ((n) >> 24))
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Number of bytes sha1_stream reads from the stream per iteration.  The
   check below rejects any value that is not a multiple of the 64-byte
   SHA1 block size.  */
#define BLOCKSIZE 4096
|
|
|
|
#if BLOCKSIZE % 64 != 0
|
|
|
|
# error "invalid BLOCKSIZE"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* This array contains the bytes used to pad the buffer to the next
   64-byte boundary: a single 0x80 byte followed by zeros (the elements
   without an initializer are zero).  The "RFC 1321, 3.1: Step 1"
   reference kept from md5.c describes the same padding step; for SHA1
   see FIPS 180-1, section 4.  */
|
|
|
|
static const unsigned char fillbuf[64] = { 0x80, 0 /* , 0, 0, ... */ };
|
|
|
|
|
|
|
|
|
|
|
|
/* Take a pointer to a 160 bit block of data (five 32 bit ints) and
|
|
|
|
initialize it to the start constants of the SHA1 algorithm. This
|
|
|
|
must be called before using hash in the call to sha1_hash. */
|
|
|
|
void
|
|
|
|
sha1_init_ctx (struct sha1_ctx *ctx)
|
|
|
|
{
|
|
|
|
ctx->A = 0x67452301;
|
|
|
|
ctx->B = 0xefcdab89;
|
|
|
|
ctx->C = 0x98badcfe;
|
|
|
|
ctx->D = 0x10325476;
|
|
|
|
ctx->E = 0xc3d2e1f0;
|
|
|
|
|
|
|
|
ctx->total[0] = ctx->total[1] = 0;
|
|
|
|
ctx->buflen = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Put result from CTX in first 20 bytes following RESBUF. The result
|
|
|
|
must be in little endian byte order.
|
|
|
|
|
|
|
|
IMPORTANT: On some systems it is required that RESBUF is correctly
|
|
|
|
aligned for a 32-bit value. */
|
|
|
|
void *
|
|
|
|
sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf)
|
|
|
|
{
|
|
|
|
((sha1_uint32 *) resbuf)[0] = SWAP (ctx->A);
|
|
|
|
((sha1_uint32 *) resbuf)[1] = SWAP (ctx->B);
|
|
|
|
((sha1_uint32 *) resbuf)[2] = SWAP (ctx->C);
|
|
|
|
((sha1_uint32 *) resbuf)[3] = SWAP (ctx->D);
|
|
|
|
((sha1_uint32 *) resbuf)[4] = SWAP (ctx->E);
|
|
|
|
|
|
|
|
return resbuf;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Process the remaining bytes in the internal buffer and the usual
|
|
|
|
prolog according to the standard and write the result to RESBUF.
|
|
|
|
|
|
|
|
IMPORTANT: On some systems it is required that RESBUF is correctly
|
|
|
|
aligned for a 32-bit value. */
|
|
|
|
void *
|
|
|
|
sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf)
|
|
|
|
{
|
|
|
|
/* Take yet unprocessed bytes into account. */
|
|
|
|
sha1_uint32 bytes = ctx->buflen;
|
|
|
|
size_t size = (bytes < 56) ? 64 / 4 : 64 * 2 / 4;
|
|
|
|
|
|
|
|
/* Now count remaining bytes. */
|
|
|
|
ctx->total[0] += bytes;
|
|
|
|
if (ctx->total[0] < bytes)
|
|
|
|
++ctx->total[1];
|
|
|
|
|
|
|
|
/* Put the 64-bit file length in *bits* at the end of the buffer. */
|
|
|
|
ctx->buffer[size - 2] = SWAP ((ctx->total[1] << 3) | (ctx->total[0] >> 29));
|
|
|
|
ctx->buffer[size - 1] = SWAP (ctx->total[0] << 3);
|
|
|
|
|
|
|
|
memcpy (&((char *) ctx->buffer)[bytes], fillbuf, (size - 2) * 4 - bytes);
|
|
|
|
|
|
|
|
/* Process last bytes. */
|
|
|
|
sha1_process_block (ctx->buffer, size * 4, ctx);
|
|
|
|
|
|
|
|
return sha1_read_ctx (ctx, resbuf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Compute SHA1 message digest for bytes read from STREAM. The
|
|
|
|
resulting message digest number will be written into the 16 bytes
|
|
|
|
beginning at RESBLOCK. */
|
|
|
|
int
|
|
|
|
sha1_stream (FILE *stream, void *resblock)
|
|
|
|
{
|
|
|
|
struct sha1_ctx ctx;
|
|
|
|
char buffer[BLOCKSIZE + 72];
|
|
|
|
size_t sum;
|
|
|
|
|
|
|
|
/* Initialize the computation context. */
|
|
|
|
sha1_init_ctx (&ctx);
|
|
|
|
|
|
|
|
/* Iterate over full file contents. */
|
|
|
|
while (1)
|
|
|
|
{
|
|
|
|
/* We read the file in blocks of BLOCKSIZE bytes. One call of the
|
|
|
|
computation function processes the whole buffer so that with the
|
|
|
|
next round of the loop another block can be read. */
|
|
|
|
size_t n;
|
|
|
|
sum = 0;
|
|
|
|
|
|
|
|
/* Read block. Take care for partial reads. */
|
|
|
|
while (1)
|
|
|
|
{
|
|
|
|
n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
|
|
|
|
|
|
|
|
sum += n;
|
|
|
|
|
|
|
|
if (sum == BLOCKSIZE)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (n == 0)
|
|
|
|
{
|
|
|
|
/* Check for the error flag IFF N == 0, so that we don't
|
|
|
|
exit the loop after a partial read due to e.g., EAGAIN
|
|
|
|
or EWOULDBLOCK. */
|
|
|
|
if (ferror (stream))
|
|
|
|
return 1;
|
|
|
|
goto process_partial_block;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* We've read at least one byte, so ignore errors. But always
|
|
|
|
check for EOF, since feof may be true even though N > 0.
|
|
|
|
Otherwise, we could end up calling fread after EOF. */
|
|
|
|
if (feof (stream))
|
|
|
|
goto process_partial_block;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Process buffer with BLOCKSIZE bytes. Note that
|
|
|
|
BLOCKSIZE % 64 == 0
|
|
|
|
*/
|
|
|
|
sha1_process_block (buffer, BLOCKSIZE, &ctx);
|
|
|
|
}
|
|
|
|
|
|
|
|
process_partial_block:;
|
|
|
|
|
|
|
|
/* Process any remaining bytes. */
|
|
|
|
if (sum > 0)
|
|
|
|
sha1_process_bytes (buffer, sum, &ctx);
|
|
|
|
|
|
|
|
/* Construct result in desired memory. */
|
|
|
|
sha1_finish_ctx (&ctx, resblock);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Compute SHA1 message digest for LEN bytes beginning at BUFFER. The
|
|
|
|
result is always in little endian byte order, so that a byte-wise
|
|
|
|
output yields to the wanted ASCII representation of the message
|
|
|
|
digest. */
|
|
|
|
void *
|
|
|
|
sha1_buffer (const char *buffer, size_t len, void *resblock)
|
|
|
|
{
|
|
|
|
struct sha1_ctx ctx;
|
|
|
|
|
|
|
|
/* Initialize the computation context. */
|
|
|
|
sha1_init_ctx (&ctx);
|
|
|
|
|
|
|
|
/* Process whole buffer but last len % 64 bytes. */
|
|
|
|
sha1_process_bytes (buffer, len, &ctx);
|
|
|
|
|
|
|
|
/* Put result in desired memory area. */
|
|
|
|
return sha1_finish_ctx (&ctx, resblock);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
sha1_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx)
|
|
|
|
{
|
|
|
|
/* When we already have some bits in our internal buffer concatenate
|
|
|
|
both inputs first. */
|
|
|
|
if (ctx->buflen != 0)
|
|
|
|
{
|
|
|
|
size_t left_over = ctx->buflen;
|
|
|
|
size_t add = 128 - left_over > len ? len : 128 - left_over;
|
|
|
|
|
|
|
|
memcpy (&((char *) ctx->buffer)[left_over], buffer, add);
|
|
|
|
ctx->buflen += add;
|
|
|
|
|
|
|
|
if (ctx->buflen > 64)
|
|
|
|
{
|
|
|
|
sha1_process_block (ctx->buffer, ctx->buflen & ~63, ctx);
|
|
|
|
|
|
|
|
ctx->buflen &= 63;
|
|
|
|
/* The regions in the following copy operation cannot overlap. */
|
|
|
|
memcpy (ctx->buffer,
|
|
|
|
&((char *) ctx->buffer)[(left_over + add) & ~63],
|
|
|
|
ctx->buflen);
|
|
|
|
}
|
|
|
|
|
|
|
|
buffer = (const char *) buffer + add;
|
|
|
|
len -= add;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Process available complete blocks. */
|
|
|
|
if (len >= 64)
|
|
|
|
{
|
|
|
|
#if !_STRING_ARCH_unaligned
|
|
|
|
# define alignof(type) offsetof (struct { char c; type x; }, x)
|
|
|
|
# define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0)
|
|
|
|
if (UNALIGNED_P (buffer))
|
|
|
|
while (len > 64)
|
|
|
|
{
|
|
|
|
sha1_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx);
|
|
|
|
buffer = (const char *) buffer + 64;
|
|
|
|
len -= 64;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
sha1_process_block (buffer, len & ~63, ctx);
|
|
|
|
buffer = (const char *) buffer + (len & ~63);
|
|
|
|
len &= 63;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Move remaining bytes in internal buffer. */
|
|
|
|
if (len > 0)
|
|
|
|
{
|
|
|
|
size_t left_over = ctx->buflen;
|
|
|
|
|
|
|
|
memcpy (&((char *) ctx->buffer)[left_over], buffer, len);
|
|
|
|
left_over += len;
|
|
|
|
if (left_over >= 64)
|
|
|
|
{
|
|
|
|
sha1_process_block (ctx->buffer, 64, ctx);
|
|
|
|
left_over -= 64;
|
2021-03-16 22:43:17 +08:00
|
|
|
memmove (ctx->buffer, &ctx->buffer[16], left_over);
|
2008-03-25 08:54:53 +08:00
|
|
|
}
|
|
|
|
ctx->buflen = left_over;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* --- Code below is the primary difference between md5.c and sha1.c --- */
|
|
|
|
|
|
|
|
/* SHA1 round constants.  sha1_process_block uses K1 for rounds 0..19,
   K2 for 20..39, K3 for 40..59 and K4 for 60..79.  */
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

/* Round functions, paired with K1..K4 in the same order.

   F1 selects, bit by bit, C where B is set and D where B is clear.
   F2 is the bitwise parity (XOR) of its three arguments.
   F3 is the bitwise majority of its three arguments.
   F4 is the same function as F2 and is defined in terms of it.

   Every argument is parenthesized, so an arbitrary expression (for
   example one using `|') can be passed without changing the meaning.
   Arguments may be expanded more than once and must therefore be free
   of side effects.  */
#define F1(B,C,D) ((D) ^ ((B) & ((C) ^ (D))))
#define F2(B,C,D) ((B) ^ (C) ^ (D))
#define F3(B,C,D) (((B) & (C)) | ((D) & ((B) | (C))))
#define F4(B,C,D) F2 (B, C, D)
|
|
|
|
|
|
|
|
/* Process LEN bytes of BUFFER, accumulating context into CTX.
|
|
|
|
It is assumed that LEN % 64 == 0.
|
|
|
|
Most of this code comes from GnuPG's cipher/sha1.c. */
|
|
|
|
|
|
|
|
void
|
|
|
|
sha1_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
|
|
|
|
{
|
|
|
|
const sha1_uint32 *words = (const sha1_uint32*) buffer;
|
|
|
|
size_t nwords = len / sizeof (sha1_uint32);
|
|
|
|
const sha1_uint32 *endp = words + nwords;
|
|
|
|
sha1_uint32 x[16];
|
|
|
|
sha1_uint32 a = ctx->A;
|
|
|
|
sha1_uint32 b = ctx->B;
|
|
|
|
sha1_uint32 c = ctx->C;
|
|
|
|
sha1_uint32 d = ctx->D;
|
|
|
|
sha1_uint32 e = ctx->E;
|
|
|
|
|
|
|
|
/* First increment the byte count. RFC 1321 specifies the possible
|
|
|
|
length of the file up to 2^64 bits. Here we only compute the
|
|
|
|
number of bytes. Do a double word increment. */
|
|
|
|
ctx->total[0] += len;
|
2013-01-31 16:17:58 +08:00
|
|
|
ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len);
|
2008-03-25 08:54:53 +08:00
|
|
|
|
|
|
|
#define rol(x, n) (((x) << (n)) | ((sha1_uint32) (x) >> (32 - (n))))
|
|
|
|
|
|
|
|
#define M(I) ( tm = x[I&0x0f] ^ x[(I-14)&0x0f] \
|
|
|
|
^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f] \
|
|
|
|
, (x[I&0x0f] = rol(tm, 1)) )
|
|
|
|
|
|
|
|
#define R(A,B,C,D,E,F,K,M) do { E += rol( A, 5 ) \
|
|
|
|
+ F( B, C, D ) \
|
|
|
|
+ K \
|
|
|
|
+ M; \
|
|
|
|
B = rol( B, 30 ); \
|
|
|
|
} while(0)
|
|
|
|
|
|
|
|
while (words < endp)
|
|
|
|
{
|
|
|
|
sha1_uint32 tm;
|
|
|
|
int t;
|
|
|
|
for (t = 0; t < 16; t++)
|
|
|
|
{
|
|
|
|
x[t] = SWAP (*words);
|
|
|
|
words++;
|
|
|
|
}
|
|
|
|
|
|
|
|
R( a, b, c, d, e, F1, K1, x[ 0] );
|
|
|
|
R( e, a, b, c, d, F1, K1, x[ 1] );
|
|
|
|
R( d, e, a, b, c, F1, K1, x[ 2] );
|
|
|
|
R( c, d, e, a, b, F1, K1, x[ 3] );
|
|
|
|
R( b, c, d, e, a, F1, K1, x[ 4] );
|
|
|
|
R( a, b, c, d, e, F1, K1, x[ 5] );
|
|
|
|
R( e, a, b, c, d, F1, K1, x[ 6] );
|
|
|
|
R( d, e, a, b, c, F1, K1, x[ 7] );
|
|
|
|
R( c, d, e, a, b, F1, K1, x[ 8] );
|
|
|
|
R( b, c, d, e, a, F1, K1, x[ 9] );
|
|
|
|
R( a, b, c, d, e, F1, K1, x[10] );
|
|
|
|
R( e, a, b, c, d, F1, K1, x[11] );
|
|
|
|
R( d, e, a, b, c, F1, K1, x[12] );
|
|
|
|
R( c, d, e, a, b, F1, K1, x[13] );
|
|
|
|
R( b, c, d, e, a, F1, K1, x[14] );
|
|
|
|
R( a, b, c, d, e, F1, K1, x[15] );
|
|
|
|
R( e, a, b, c, d, F1, K1, M(16) );
|
|
|
|
R( d, e, a, b, c, F1, K1, M(17) );
|
|
|
|
R( c, d, e, a, b, F1, K1, M(18) );
|
|
|
|
R( b, c, d, e, a, F1, K1, M(19) );
|
|
|
|
R( a, b, c, d, e, F2, K2, M(20) );
|
|
|
|
R( e, a, b, c, d, F2, K2, M(21) );
|
|
|
|
R( d, e, a, b, c, F2, K2, M(22) );
|
|
|
|
R( c, d, e, a, b, F2, K2, M(23) );
|
|
|
|
R( b, c, d, e, a, F2, K2, M(24) );
|
|
|
|
R( a, b, c, d, e, F2, K2, M(25) );
|
|
|
|
R( e, a, b, c, d, F2, K2, M(26) );
|
|
|
|
R( d, e, a, b, c, F2, K2, M(27) );
|
|
|
|
R( c, d, e, a, b, F2, K2, M(28) );
|
|
|
|
R( b, c, d, e, a, F2, K2, M(29) );
|
|
|
|
R( a, b, c, d, e, F2, K2, M(30) );
|
|
|
|
R( e, a, b, c, d, F2, K2, M(31) );
|
|
|
|
R( d, e, a, b, c, F2, K2, M(32) );
|
|
|
|
R( c, d, e, a, b, F2, K2, M(33) );
|
|
|
|
R( b, c, d, e, a, F2, K2, M(34) );
|
|
|
|
R( a, b, c, d, e, F2, K2, M(35) );
|
|
|
|
R( e, a, b, c, d, F2, K2, M(36) );
|
|
|
|
R( d, e, a, b, c, F2, K2, M(37) );
|
|
|
|
R( c, d, e, a, b, F2, K2, M(38) );
|
|
|
|
R( b, c, d, e, a, F2, K2, M(39) );
|
|
|
|
R( a, b, c, d, e, F3, K3, M(40) );
|
|
|
|
R( e, a, b, c, d, F3, K3, M(41) );
|
|
|
|
R( d, e, a, b, c, F3, K3, M(42) );
|
|
|
|
R( c, d, e, a, b, F3, K3, M(43) );
|
|
|
|
R( b, c, d, e, a, F3, K3, M(44) );
|
|
|
|
R( a, b, c, d, e, F3, K3, M(45) );
|
|
|
|
R( e, a, b, c, d, F3, K3, M(46) );
|
|
|
|
R( d, e, a, b, c, F3, K3, M(47) );
|
|
|
|
R( c, d, e, a, b, F3, K3, M(48) );
|
|
|
|
R( b, c, d, e, a, F3, K3, M(49) );
|
|
|
|
R( a, b, c, d, e, F3, K3, M(50) );
|
|
|
|
R( e, a, b, c, d, F3, K3, M(51) );
|
|
|
|
R( d, e, a, b, c, F3, K3, M(52) );
|
|
|
|
R( c, d, e, a, b, F3, K3, M(53) );
|
|
|
|
R( b, c, d, e, a, F3, K3, M(54) );
|
|
|
|
R( a, b, c, d, e, F3, K3, M(55) );
|
|
|
|
R( e, a, b, c, d, F3, K3, M(56) );
|
|
|
|
R( d, e, a, b, c, F3, K3, M(57) );
|
|
|
|
R( c, d, e, a, b, F3, K3, M(58) );
|
|
|
|
R( b, c, d, e, a, F3, K3, M(59) );
|
|
|
|
R( a, b, c, d, e, F4, K4, M(60) );
|
|
|
|
R( e, a, b, c, d, F4, K4, M(61) );
|
|
|
|
R( d, e, a, b, c, F4, K4, M(62) );
|
|
|
|
R( c, d, e, a, b, F4, K4, M(63) );
|
|
|
|
R( b, c, d, e, a, F4, K4, M(64) );
|
|
|
|
R( a, b, c, d, e, F4, K4, M(65) );
|
|
|
|
R( e, a, b, c, d, F4, K4, M(66) );
|
|
|
|
R( d, e, a, b, c, F4, K4, M(67) );
|
|
|
|
R( c, d, e, a, b, F4, K4, M(68) );
|
|
|
|
R( b, c, d, e, a, F4, K4, M(69) );
|
|
|
|
R( a, b, c, d, e, F4, K4, M(70) );
|
|
|
|
R( e, a, b, c, d, F4, K4, M(71) );
|
|
|
|
R( d, e, a, b, c, F4, K4, M(72) );
|
|
|
|
R( c, d, e, a, b, F4, K4, M(73) );
|
|
|
|
R( b, c, d, e, a, F4, K4, M(74) );
|
|
|
|
R( a, b, c, d, e, F4, K4, M(75) );
|
|
|
|
R( e, a, b, c, d, F4, K4, M(76) );
|
|
|
|
R( d, e, a, b, c, F4, K4, M(77) );
|
|
|
|
R( c, d, e, a, b, F4, K4, M(78) );
|
|
|
|
R( b, c, d, e, a, F4, K4, M(79) );
|
|
|
|
|
|
|
|
a = ctx->A += a;
|
|
|
|
b = ctx->B += b;
|
|
|
|
c = ctx->C += c;
|
|
|
|
d = ctx->D += d;
|
|
|
|
e = ctx->E += e;
|
|
|
|
}
|
|
|
|
}
|
libiberty: Use x86 HW optimized sha1
Nick has approved this patch (+ small ld change to use it for --build-id=),
so I'm committing it to GCC as master as well.
If anyone from ARM would be willing to implement it similarly with
vsha1{cq,mq,pq,h,su0q,su1q}_u32 intrinsics, it could be a useful linker
speedup on those hosts as well, the intent in sha1.c was that
sha1_hw_process_bytes, sha1_hw_process_block functions
would be defined whenever
defined (HAVE_X86_SHA1_HW_SUPPORT) || defined (HAVE_WHATEVERELSE_SHA1_HW_SUPPORT)
but the body of sha1_hw_process_block and sha1_choose_process_bytes
would then have #elif defined (HAVE_WHATEVERELSE_SHA1_HW_SUPPORT) for
the other arch support, similarly for any target attributes on
sha1_hw_process_block if needed.
2023-11-28 Jakub Jelinek <jakub@redhat.com>
include/
* sha1.h (sha1_process_bytes_fn): New typedef.
(sha1_choose_process_bytes): Declare.
libiberty/
* configure.ac (HAVE_X86_SHA1_HW_SUPPORT): New check.
* sha1.c: If HAVE_X86_SHA1_HW_SUPPORT is defined, include x86intrin.h
and cpuid.h.
(sha1_hw_process_bytes, sha1_hw_process_block,
sha1_choose_process_bytes): New functions.
* config.in: Regenerated.
* configure: Regenerated.
2023-11-28 20:14:05 +08:00
|
|
|
|
|
|
|
#if defined(HAVE_X86_SHA1_HW_SUPPORT)
|
|
|
|
/* HW specific version of sha1_process_bytes. */
|
|
|
|
|
|
|
|
static void sha1_hw_process_block (const void *, size_t, struct sha1_ctx *);
|
|
|
|
|
|
|
|
static void
|
|
|
|
sha1_hw_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx)
|
|
|
|
{
|
|
|
|
/* When we already have some bits in our internal buffer concatenate
|
|
|
|
both inputs first. */
|
|
|
|
if (ctx->buflen != 0)
|
|
|
|
{
|
|
|
|
size_t left_over = ctx->buflen;
|
|
|
|
size_t add = 128 - left_over > len ? len : 128 - left_over;
|
|
|
|
|
|
|
|
memcpy (&((char *) ctx->buffer)[left_over], buffer, add);
|
|
|
|
ctx->buflen += add;
|
|
|
|
|
|
|
|
if (ctx->buflen > 64)
|
|
|
|
{
|
|
|
|
sha1_hw_process_block (ctx->buffer, ctx->buflen & ~63, ctx);
|
|
|
|
|
|
|
|
ctx->buflen &= 63;
|
|
|
|
/* The regions in the following copy operation cannot overlap. */
|
|
|
|
memcpy (ctx->buffer,
|
|
|
|
&((char *) ctx->buffer)[(left_over + add) & ~63],
|
|
|
|
ctx->buflen);
|
|
|
|
}
|
|
|
|
|
|
|
|
buffer = (const char *) buffer + add;
|
|
|
|
len -= add;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Process available complete blocks. */
|
|
|
|
if (len >= 64)
|
|
|
|
{
|
|
|
|
#if !_STRING_ARCH_unaligned
|
|
|
|
# define alignof(type) offsetof (struct { char c; type x; }, x)
|
|
|
|
# define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0)
|
|
|
|
if (UNALIGNED_P (buffer))
|
|
|
|
while (len > 64)
|
|
|
|
{
|
|
|
|
sha1_hw_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx);
|
|
|
|
buffer = (const char *) buffer + 64;
|
|
|
|
len -= 64;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
sha1_hw_process_block (buffer, len & ~63, ctx);
|
|
|
|
buffer = (const char *) buffer + (len & ~63);
|
|
|
|
len &= 63;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Move remaining bytes in internal buffer. */
|
|
|
|
if (len > 0)
|
|
|
|
{
|
|
|
|
size_t left_over = ctx->buflen;
|
|
|
|
|
|
|
|
memcpy (&((char *) ctx->buffer)[left_over], buffer, len);
|
|
|
|
left_over += len;
|
|
|
|
if (left_over >= 64)
|
|
|
|
{
|
|
|
|
sha1_hw_process_block (ctx->buffer, 64, ctx);
|
|
|
|
left_over -= 64;
|
|
|
|
memmove (ctx->buffer, &ctx->buffer[16], left_over);
|
|
|
|
}
|
|
|
|
ctx->buflen = left_over;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Hash the LEN bytes at BUFFER into the state held in CTX using CPU
   specific intrinsics, and add LEN to the byte counter in CTX.  Each
   loop iteration consumes 64 bytes; the callers in this file always
   pass a multiple of 64.  The input is read with unaligned loads.  */

#ifdef HAVE_X86_SHA1_HW_SUPPORT
__attribute__((__target__ ("sse4.1,sha")))
#endif
static void
sha1_hw_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
#ifdef HAVE_X86_SHA1_HW_SUPPORT
  /* Implemented from
     https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html */
  const __m128i *chunk = (const __m128i *) buffer;
  const __m128i *chunk_end = (const __m128i *) ((const char *) buffer + len);
  __m128i abcd, abcd_save, e0, e0_save, e1, msg0, msg1, msg2, msg3;
  /* Shuffle control that reverses the 16 bytes of a vector.  */
  const __m128i byte_swap
    = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
  /* Compile-time layout check: the array size is negative, and the
     build fails, unless A, B, C and D are adjacent in struct sha1_ctx,
     which the 16-byte load and store through &ctx->A rely on.  */
  char layout_ok[((offsetof (struct sha1_ctx, B)
                   == offsetof (struct sha1_ctx, A) + sizeof (ctx->A))
                  && (offsetof (struct sha1_ctx, C)
                      == offsetof (struct sha1_ctx, A) + 2 * sizeof (ctx->A))
                  && (offsetof (struct sha1_ctx, D)
                      == offsetof (struct sha1_ctx, A) + 3 * sizeof (ctx->A)))
                 ? 1 : -1];

/* Four rounds in the steady state of the algorithm.  E is the E input
   for these rounds and is combined with schedule vector M0; ENEXT
   captures the state for the following group; M1, M3 and M2 receive the
   three message schedule updates.  FN selects the round function and
   constant.  The statement order matches the hand-written sequence.  */
#define SHA1_HW_STEP(E, ENEXT, M0, M1, M2, M3, FN)		\
  do {								\
    E = _mm_sha1nexte_epu32 (E, M0);				\
    ENEXT = abcd;						\
    M1 = _mm_sha1msg2_epu32 (M1, M0);				\
    abcd = _mm_sha1rnds4_epu32 (abcd, E, FN);			\
    M3 = _mm_sha1msg1_epu32 (M3, M0);				\
    M2 = _mm_xor_si128 (M2, M0);				\
  } while (0)

  /* Add LEN to the two-word byte counter: total[0] is the low word,
     total[1] receives the bits of LEN above bit 31 plus the carry out
     of the low word.  */
  ctx->total[0] += len;
  ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len);

  /* Reference the check array so that it does not count as unused.  */
  (void) &layout_ok[0];

  /* Load A..D with one vector load and reverse the element order; E
     goes into the top element of its own vector.  */
  abcd = _mm_loadu_si128 ((const __m128i *) &ctx->A);
  e0 = _mm_set_epi32 (ctx->E, 0, 0, 0);
  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */

  while (chunk < chunk_end)
    {
      /* Remember the incoming state for the final addition.  */
      abcd_save = abcd;
      e0_save = e0;

      /* Rounds 0..3.  */
      msg0 = _mm_loadu_si128 (chunk);
      msg0 = _mm_shuffle_epi8 (msg0, byte_swap);
      e0 = _mm_add_epi32 (e0, msg0);
      e1 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);

      /* Rounds 4..7.  */
      msg1 = _mm_loadu_si128 (chunk + 1);
      msg1 = _mm_shuffle_epi8 (msg1, byte_swap);
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);

      /* Rounds 8..11.  */
      msg2 = _mm_loadu_si128 (chunk + 2);
      msg2 = _mm_shuffle_epi8 (msg2, byte_swap);
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* Rounds 12..15: the last load, followed by a regular step.  */
      msg3 = _mm_loadu_si128 (chunk + 3);
      msg3 = _mm_shuffle_epi8 (msg3, byte_swap);
      SHA1_HW_STEP (e1, e0, msg3, msg0, msg1, msg2, 0);

      /* Rounds 16..67, four per step.  */
      SHA1_HW_STEP (e0, e1, msg0, msg1, msg2, msg3, 0);	/* 16..19 */
      SHA1_HW_STEP (e1, e0, msg1, msg2, msg3, msg0, 1);	/* 20..23 */
      SHA1_HW_STEP (e0, e1, msg2, msg3, msg0, msg1, 1);	/* 24..27 */
      SHA1_HW_STEP (e1, e0, msg3, msg0, msg1, msg2, 1);	/* 28..31 */
      SHA1_HW_STEP (e0, e1, msg0, msg1, msg2, msg3, 1);	/* 32..35 */
      SHA1_HW_STEP (e1, e0, msg1, msg2, msg3, msg0, 1);	/* 36..39 */
      SHA1_HW_STEP (e0, e1, msg2, msg3, msg0, msg1, 2);	/* 40..43 */
      SHA1_HW_STEP (e1, e0, msg3, msg0, msg1, msg2, 2);	/* 44..47 */
      SHA1_HW_STEP (e0, e1, msg0, msg1, msg2, msg3, 2);	/* 48..51 */
      SHA1_HW_STEP (e1, e0, msg1, msg2, msg3, msg0, 2);	/* 52..55 */
      SHA1_HW_STEP (e0, e1, msg2, msg3, msg0, msg1, 2);	/* 56..59 */
      SHA1_HW_STEP (e1, e0, msg3, msg0, msg1, msg2, 3);	/* 60..63 */
      SHA1_HW_STEP (e0, e1, msg0, msg1, msg2, msg3, 3);	/* 64..67 */

      /* Rounds 68..71: the schedule updates start to wind down.  */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* Rounds 72..75.  */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);

      /* Rounds 76..79.  */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);

      /* Combine with the state saved at the top of the iteration.  */
      e0 = _mm_sha1nexte_epu32 (e0, e0_save);
      abcd = _mm_add_epi32 (abcd, abcd_save);

      chunk = chunk + 4;
    }

  /* Undo the element reversal and write the state back to CTX.  */
  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */
  _mm_storeu_si128 ((__m128i *) &ctx->A, abcd);
  ctx->E = _mm_extract_epi32 (e0, 3);

#undef SHA1_HW_STEP
#endif
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Return sha1_process_bytes or some hardware optimized version thereof
|
|
|
|
depending on current CPU. */
|
|
|
|
|
|
|
|
sha1_process_bytes_fn
|
|
|
|
sha1_choose_process_bytes (void)
|
|
|
|
{
|
|
|
|
#ifdef HAVE_X86_SHA1_HW_SUPPORT
|
|
|
|
unsigned int eax, ebx, ecx, edx;
|
|
|
|
if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
|
|
|
|
&& (ebx & bit_SHA) != 0
|
|
|
|
&& __get_cpuid (1, &eax, &ebx, &ecx, &edx)
|
|
|
|
&& (ecx & bit_SSE4_1) != 0)
|
|
|
|
return sha1_hw_process_bytes;
|
|
|
|
#endif
|
|
|
|
return sha1_process_bytes;
|
|
|
|
}
|