mirror of
https://github.com/python/cpython.git
synced 2024-11-25 02:44:06 +08:00
444 lines
13 KiB
C
444 lines
13 KiB
C
/* Set of hash utility functions to help maintain the invariant that
    if a==b then hash(a)==hash(b)

   All the utility functions (_Py_Hash*()) return "-1" to signify an error.
*/
|
|
#include "Python.h"
|
|
|
|
#ifdef __APPLE__
|
|
# include <libkern/OSByteOrder.h>
|
|
#elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
|
|
# include <endian.h>
|
|
#elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
|
|
# include <sys/endian.h>
|
|
#endif
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
_Py_HashSecret_t _Py_HashSecret = {{0}};
|
|
|
|
#if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
|
|
extern PyHash_FuncDef PyHash_Func;
|
|
#else
|
|
static PyHash_FuncDef PyHash_Func;
|
|
#endif
|
|
|
|
/* Count _Py_HashBytes() calls */
|
|
#ifdef Py_HASH_STATS
|
|
#define Py_HASH_STATS_MAX 32
|
|
static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
|
|
#endif
|
|
|
|
/* For numeric types, the hash of a number x is based on the reduction
|
|
of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that
|
|
hash(x) == hash(y) whenever x and y are numerically equal, even if
|
|
x and y have different types.
|
|
|
|
A quick summary of the hashing strategy:
|
|
|
|
(1) First define the 'reduction of x modulo P' for any rational
|
|
number x; this is a standard extension of the usual notion of
|
|
reduction modulo P for integers. If x == p/q (written in lowest
|
|
terms), the reduction is interpreted as the reduction of p times
|
|
the inverse of the reduction of q, all modulo P; if q is exactly
|
|
divisible by P then define the reduction to be infinity. So we've
|
|
got a well-defined map
|
|
|
|
reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
|
|
|
|
(2) Now for a rational number x, define hash(x) by:
|
|
|
|
reduce(x) if x >= 0
|
|
-reduce(-x) if x < 0
|
|
|
|
If the result of the reduction is infinity (this is impossible for
|
|
integers, floats and Decimals) then use the predefined hash value
|
|
_PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
|
|
_PyHASH_INF and -_PyHASH_INF are also used for the
|
|
hashes of float and Decimal infinities.
|
|
|
|
NaNs hash with a pointer hash. Having distinct hash values prevents
|
|
catastrophic pileups from distinct NaN instances which used to always
|
|
have the same hash value but would compare unequal.
|
|
|
|
A selling point for the above strategy is that it makes it possible
|
|
to compute hashes of decimal and binary floating-point numbers
|
|
efficiently, even if the exponent of the binary or decimal number
|
|
is large. The key point is that
|
|
|
|
reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
|
|
|
|
provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a
|
|
binary or decimal float is never infinity, since the denominator is a power
|
|
of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have,
|
|
for nonnegative x,
|
|
|
|
reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
|
|
|
|
reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
|
|
|
|
and reduce(10**e) can be computed efficiently by the usual modular
|
|
exponentiation algorithm. For reduce(2**e) it's even better: since
|
|
P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
|
|
by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
|
|
|
|
*/
|
|
|
|
Py_hash_t _Py_HashPointer(const void *);
|
|
|
|
Py_hash_t
|
|
_Py_HashDouble(PyObject *inst, double v)
|
|
{
|
|
int e, sign;
|
|
double m;
|
|
Py_uhash_t x, y;
|
|
|
|
if (!Py_IS_FINITE(v)) {
|
|
if (Py_IS_INFINITY(v))
|
|
return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
|
|
else
|
|
return _Py_HashPointer(inst);
|
|
}
|
|
|
|
m = frexp(v, &e);
|
|
|
|
sign = 1;
|
|
if (m < 0) {
|
|
sign = -1;
|
|
m = -m;
|
|
}
|
|
|
|
/* process 28 bits at a time; this should work well both for binary
|
|
and hexadecimal floating point. */
|
|
x = 0;
|
|
while (m) {
|
|
x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
|
|
m *= 268435456.0; /* 2**28 */
|
|
e -= 28;
|
|
y = (Py_uhash_t)m; /* pull out integer part */
|
|
m -= y;
|
|
x += y;
|
|
if (x >= _PyHASH_MODULUS)
|
|
x -= _PyHASH_MODULUS;
|
|
}
|
|
|
|
/* adjust for the exponent; first reduce it modulo _PyHASH_BITS */
|
|
e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
|
|
x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
|
|
|
|
x = x * sign;
|
|
if (x == (Py_uhash_t)-1)
|
|
x = (Py_uhash_t)-2;
|
|
return (Py_hash_t)x;
|
|
}
|
|
|
|
Py_hash_t
|
|
_Py_HashPointerRaw(const void *p)
|
|
{
|
|
size_t y = (size_t)p;
|
|
/* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
|
|
excessive hash collisions for dicts and sets */
|
|
y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
|
|
return (Py_hash_t)y;
|
|
}
|
|
|
|
Py_hash_t
|
|
_Py_HashPointer(const void *p)
|
|
{
|
|
Py_hash_t x = _Py_HashPointerRaw(p);
|
|
if (x == -1) {
|
|
x = -2;
|
|
}
|
|
return x;
|
|
}
|
|
|
|
Py_hash_t
|
|
_Py_HashBytes(const void *src, Py_ssize_t len)
|
|
{
|
|
Py_hash_t x;
|
|
/*
|
|
We make the hash of the empty string be 0, rather than using
|
|
(prefix ^ suffix), since this slightly obfuscates the hash secret
|
|
*/
|
|
if (len == 0) {
|
|
return 0;
|
|
}
|
|
|
|
#ifdef Py_HASH_STATS
|
|
hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
|
|
#endif
|
|
|
|
#if Py_HASH_CUTOFF > 0
|
|
if (len < Py_HASH_CUTOFF) {
|
|
/* Optimize hashing of very small strings with inline DJBX33A. */
|
|
Py_uhash_t hash;
|
|
const unsigned char *p = src;
|
|
hash = 5381; /* DJBX33A starts with 5381 */
|
|
|
|
switch(len) {
|
|
/* ((hash << 5) + hash) + *p == hash * 33 + *p */
|
|
case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
|
|
case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
|
|
case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
|
|
case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
|
|
case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
|
|
case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
|
|
case 1: hash = ((hash << 5) + hash) + *p++; break;
|
|
default:
|
|
Py_UNREACHABLE();
|
|
}
|
|
hash ^= len;
|
|
hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
|
|
x = (Py_hash_t)hash;
|
|
}
|
|
else
|
|
#endif /* Py_HASH_CUTOFF */
|
|
x = PyHash_Func.hash(src, len);
|
|
|
|
if (x == -1)
|
|
return -2;
|
|
return x;
|
|
}
|
|
|
|
/* When Py_HASH_STATS is defined, dump the _Py_HashBytes() call counters to
 * stderr: one row per tracked length with the count for that length and a
 * running total, then a final row for slot 0 (the overflow bucket).
 * Without Py_HASH_STATS this function does nothing.
 */
void
_PyHash_Fini(void)
{
#ifdef Py_HASH_STATS
    Py_ssize_t running = 0;
    int length = 1;

    fprintf(stderr, "len calls total\n");
    while (length <= Py_HASH_STATS_MAX) {
        const Py_ssize_t calls = hashstats[length];

        running += calls;
        fprintf(stderr, "%2i %8zd %8zd\n", length, calls, running);
        length++;
    }
    running += hashstats[0];
    fprintf(stderr, "> %8zd %8zd\n", hashstats[0], running);
#endif
}
|
|
|
|
PyHash_FuncDef *
|
|
PyHash_GetFuncDef(void)
|
|
{
|
|
return &PyHash_Func;
|
|
}
|
|
|
|
/* Optimized memcpy() for Windows
 *
 * PY_UHASH_CPY(dst, src) copies SIZEOF_PY_UHASH_T bytes from src to dst.
 * Both arguments are parenthesized in the expansion so that expressions
 * such as `p + n` can be passed safely.  The _MSC_VER variants still
 * evaluate each argument several times, so do not pass expressions with
 * side effects.
 */
#ifdef _MSC_VER
#  if SIZEOF_PY_UHASH_T == 4
#    define PY_UHASH_CPY(dst, src) do {                                    \
       (dst)[0] = (src)[0]; (dst)[1] = (src)[1];                           \
       (dst)[2] = (src)[2]; (dst)[3] = (src)[3];                           \
    } while(0)
#  elif SIZEOF_PY_UHASH_T == 8
#    define PY_UHASH_CPY(dst, src) do {                                    \
       (dst)[0] = (src)[0]; (dst)[1] = (src)[1];                           \
       (dst)[2] = (src)[2]; (dst)[3] = (src)[3];                           \
       (dst)[4] = (src)[4]; (dst)[5] = (src)[5];                           \
       (dst)[6] = (src)[6]; (dst)[7] = (src)[7];                           \
    } while(0)
#  else
#    error SIZEOF_PY_UHASH_T must be 4 or 8
#  endif /* SIZEOF_PY_UHASH_T */
#else /* not Windows */
#  define PY_UHASH_CPY(dst, src) memcpy((dst), (src), SIZEOF_PY_UHASH_T)
#endif /* _MSC_VER */
|
|
|
|
|
|
#if Py_HASH_ALGORITHM == Py_HASH_FNV
|
|
/* **************************************************************************
|
|
* Modified Fowler-Noll-Vo (FNV) hash function
|
|
*/
|
|
static Py_hash_t
|
|
fnv(const void *src, Py_ssize_t len)
|
|
{
|
|
const unsigned char *p = src;
|
|
Py_uhash_t x;
|
|
Py_ssize_t remainder, blocks;
|
|
union {
|
|
Py_uhash_t value;
|
|
unsigned char bytes[SIZEOF_PY_UHASH_T];
|
|
} block;
|
|
|
|
#ifdef Py_DEBUG
|
|
assert(_Py_HashSecret_Initialized);
|
|
#endif
|
|
remainder = len % SIZEOF_PY_UHASH_T;
|
|
if (remainder == 0) {
|
|
/* Process at least one block byte by byte to reduce hash collisions
|
|
* for strings with common prefixes. */
|
|
remainder = SIZEOF_PY_UHASH_T;
|
|
}
|
|
blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
|
|
|
|
x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
|
|
x ^= (Py_uhash_t) *p << 7;
|
|
while (blocks--) {
|
|
PY_UHASH_CPY(block.bytes, p);
|
|
x = (_PyHASH_MULTIPLIER * x) ^ block.value;
|
|
p += SIZEOF_PY_UHASH_T;
|
|
}
|
|
/* add remainder */
|
|
for (; remainder > 0; remainder--)
|
|
x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
|
|
x ^= (Py_uhash_t) len;
|
|
x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
|
|
if (x == (Py_uhash_t) -1) {
|
|
x = (Py_uhash_t) -2;
|
|
}
|
|
return x;
|
|
}
|
|
|
|
static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
|
|
16 * SIZEOF_PY_HASH_T};
|
|
|
|
#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
|
|
|
|
|
|
/* **************************************************************************
|
|
<MIT License>
|
|
Copyright (c) 2013 Marek Majkowski <marek@popcount.org>
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
</MIT License>
|
|
|
|
Original location:
|
|
https://github.com/majek/csiphash/
|
|
|
|
Solution inspired by code from:
|
|
Samuel Neves (supercop/crypto_auth/siphash24/little)
|
|
djb (supercop/crypto_auth/siphash24/little2)
|
|
Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
|
|
|
|
Modified for Python by Christian Heimes:
|
|
- C89 / MSVC compatibility
|
|
- _rotl64() on Windows
|
|
- letoh64() fallback
|
|
*/
|
|
|
|
/* byte swap little endian to host endian
 * Endian conversion not only ensures that the hash function returns the same
 * value on all platforms. It is also required for a good dispersion of
 * the hash values' least significant bits.
 *
 * _le64toh(x) yields x as a uint64_t converted from little-endian to host
 * byte order.  le64toh() is used only under the same conditions under which
 * this file includes <endian.h> or <sys/endian.h> (HAVE_LE64TOH plus one of
 * the header macros); otherwise the portable shift-and-mask swap is used.
 * The fallback evaluates its argument more than once, so pass a plain
 * variable or constant.
 */
#if PY_LITTLE_ENDIAN
#  define _le64toh(x) ((uint64_t)(x))
#elif defined(__APPLE__)
#  define _le64toh(x) OSSwapLittleToHostInt64(x)
#elif defined(HAVE_LE64TOH) && \
      (defined(HAVE_ENDIAN_H) || defined(HAVE_SYS_ENDIAN_H))
#  define _le64toh(x) le64toh(x)
#else
#  define _le64toh(x) (((uint64_t)(x) << 56) | \
                      (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
                      (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
                      (((uint64_t)(x) << 8)  & 0xff00000000ULL) | \
                      (((uint64_t)(x) >> 8)  & 0xff000000ULL) | \
                      (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
                      (((uint64_t)(x) >> 40) & 0xff00ULL) | \
                      ((uint64_t)(x)  >> 56))
#endif
|
|
|
|
|
|
/* ROTATE(x, b): rotate the 64-bit value x left by b bits.
   The portable form requires 0 < b < 64. */
#ifdef _MSC_VER
#  define ROTATE(x, b)  _rotl64(x, b)
#else
#  define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
#endif

/* HALF_ROUND: one half of a SipHash round over the four state words.
   a, b, c and d must be modifiable uint64_t lvalues; s and t are the
   rotation amounts for b and d.  Wrapped in do { } while (0) so that the
   whole expansion behaves as a single statement. */
#define HALF_ROUND(a,b,c,d,s,t)         \
    do {                                \
        a += b; c += d;                 \
        b = ROTATE(b, s) ^ a;           \
        d = ROTATE(d, t) ^ c;           \
        a = ROTATE(a, 32);              \
    } while (0)

/* DOUBLE_ROUND: two full SipHash rounds (four half rounds) applied to the
   state v0..v3.  Also a single statement; terminate the invocation with
   a semicolon. */
#define DOUBLE_ROUND(v0,v1,v2,v3)       \
    do {                                \
        HALF_ROUND(v0,v1,v2,v3,13,16);  \
        HALF_ROUND(v2,v1,v0,v3,17,21);  \
        HALF_ROUND(v0,v1,v2,v3,13,16);  \
        HALF_ROUND(v2,v1,v0,v3,17,21);  \
    } while (0)
|
|
|
|
|
|
static uint64_t
|
|
siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
|
|
uint64_t b = (uint64_t)src_sz << 56;
|
|
const uint8_t *in = (const uint8_t*)src;
|
|
|
|
uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
|
|
uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
|
|
uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
|
|
uint64_t v3 = k1 ^ 0x7465646279746573ULL;
|
|
|
|
uint64_t t;
|
|
uint8_t *pt;
|
|
|
|
while (src_sz >= 8) {
|
|
uint64_t mi;
|
|
memcpy(&mi, in, sizeof(mi));
|
|
mi = _le64toh(mi);
|
|
in += sizeof(mi);
|
|
src_sz -= sizeof(mi);
|
|
v3 ^= mi;
|
|
DOUBLE_ROUND(v0,v1,v2,v3);
|
|
v0 ^= mi;
|
|
}
|
|
|
|
t = 0;
|
|
pt = (uint8_t *)&t;
|
|
switch (src_sz) {
|
|
case 7: pt[6] = in[6]; /* fall through */
|
|
case 6: pt[5] = in[5]; /* fall through */
|
|
case 5: pt[4] = in[4]; /* fall through */
|
|
case 4: memcpy(pt, in, sizeof(uint32_t)); break;
|
|
case 3: pt[2] = in[2]; /* fall through */
|
|
case 2: pt[1] = in[1]; /* fall through */
|
|
case 1: pt[0] = in[0]; /* fall through */
|
|
}
|
|
b |= _le64toh(t);
|
|
|
|
v3 ^= b;
|
|
DOUBLE_ROUND(v0,v1,v2,v3);
|
|
v0 ^= b;
|
|
v2 ^= 0xff;
|
|
DOUBLE_ROUND(v0,v1,v2,v3);
|
|
DOUBLE_ROUND(v0,v1,v2,v3);
|
|
|
|
/* modified */
|
|
t = (v0 ^ v1) ^ (v2 ^ v3);
|
|
return t;
|
|
}
|
|
|
|
uint64_t
|
|
_Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
|
|
{
|
|
return siphash24(key, 0, src, src_sz);
|
|
}
|
|
|
|
|
|
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
|
|
static Py_hash_t
|
|
pysiphash(const void *src, Py_ssize_t src_sz) {
|
|
return (Py_hash_t)siphash24(
|
|
_le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
|
|
src, src_sz);
|
|
}
|
|
|
|
static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
|
|
#endif
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|