zstd/lib/common/entropy_common.c

/* ******************************************************************
* Common functions of New Generation Entropy library
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* You can contact the author at :
* - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
* - Public forum : https://groups.google.com/forum/#!forum/lz4c
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
****************************************************************** */
/* *************************************
* Dependencies
***************************************/
#include "mem.h"
#include "error_private.h" /* ERR_*, ERROR */
#define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */
#include "fse.h"
#include "huf.h"
#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */
/*=== Version ===*/
unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
/*=== Error Management ===*/
unsigned FSE_isError(size_t code) { return ERR_isError(code); }
const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
unsigned HUF_isError(size_t code) { return ERR_isError(code); }
const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
/*-**************************************************************
* FSE NCount encoding-decoding
****************************************************************/
FORCE_INLINE_TEMPLATE
size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
const void* headerBuffer, size_t hbSize)
{
const BYTE* const istart = (const BYTE*) headerBuffer;
const BYTE* const iend = istart + hbSize;
const BYTE* ip = istart;
int nbBits;
int remaining;
int threshold;
U32 bitStream;
int bitCount;
unsigned charnum = 0;
unsigned const maxSV1 = *maxSVPtr + 1;
int previous0 = 0;
if (hbSize < 8) {
/* This function only works when hbSize >= 8 */
char buffer[8] = {0};
ZSTD_memcpy(buffer, headerBuffer, hbSize);
{ size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr,
buffer, sizeof(buffer));
if (FSE_isError(countSize)) return countSize;
if (countSize > hbSize) return ERROR(corruption_detected);
return countSize;
} }
assert(hbSize >= 8);
/* init */
ZSTD_memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0])); /* all symbols not present in NCount have a frequency of 0 */
bitStream = MEM_readLE32(ip);
nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG; /* extract tableLog */
if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
bitStream >>= 4;
bitCount = 4;
*tableLogPtr = nbBits;
remaining = (1<<nbBits)+1;
threshold = 1<<nbBits;
nbBits++;
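/* remaining starts at (1<<tableLog)+1 and is reduced by the absolute value
 * of every decoded count; it must land exactly on 1 once the table is complete.
 * threshold is kept so that threshold <= remaining < 2*threshold whenever a
 * count is decoded, which decides whether the next count fits in nbBits-1 bits
 * or needs all nbBits. */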
for (;;) {
if (previous0) {
/* Count the number of repeats. Each time the
* 2-bit repeat code is 0b11 there is another
* repeat.
* Avoid UB by setting the high bit to 1.
*/
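/* countTrailingZeros32(~bitStream) counts the trailing 1-bits of bitStream;
 * each full pair of 1-bits is one 0b11 repeat code, hence the >> 1. */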
int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
while (repeats >= 12) {
charnum += 3 * 12;
if (LIKELY(ip <= iend-7)) {
ip += 3;
} else {
bitCount -= (int)(8 * (iend - 7 - ip));
bitCount &= 31;
ip = iend - 4;
}
bitStream = MEM_readLE32(ip) >> bitCount;
repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
}
charnum += 3 * repeats;
bitStream >>= 2 * repeats;
bitCount += 2 * repeats;
/* Add the final repeat which isn't 0b11. */
assert((bitStream & 3) < 3);
charnum += bitStream & 3;
bitCount += 2;
/* This is an error, but break and return an error
* at the end, because returning out of a loop makes
* it harder for the compiler to optimize.
*/
if (charnum >= maxSV1) break;
/* We don't need to set the normalized count to 0
* because we already memset the whole buffer to 0.
*/
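/* Refill the 32-bit bitStream : either advance ip by the bytes fully
 * consumed, or, near the end of the buffer, pin ip to iend-4 and fold the
 * excess into bitCount so the 4-byte read below stays in bounds. */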
if (LIKELY(ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
assert((bitCount >> 3) <= 3); /* For first condition to work */
ip += bitCount>>3;
bitCount &= 7;
} else {
bitCount -= (int)(8 * (iend - 4 - ip));
bitCount &= 31;
ip = iend - 4;
}
bitStream = MEM_readLE32(ip) >> bitCount;
}
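/* decode the next count with a truncated-binary style code over
 * [0, remaining] : the first max values fit in nbBits-1 bits,
 * the others need the full nbBits. */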
{
int const max = (2*threshold-1) - remaining;
int count;
if ((bitStream & (threshold-1)) < (U32)max) {
count = bitStream & (threshold-1);
bitCount += nbBits-1;
} else {
count = bitStream & (2*threshold-1);
if (count >= threshold) count -= max;
bitCount += nbBits;
}
count--; /* extra accuracy */
/* When it matters (small blocks), this is a
* predictable branch, because we don't use -1.
*/
if (count >= 0) {
remaining -= count;
} else {
assert(count == -1);
remaining += count;
}
normalizedCounter[charnum++] = (short)count;
previous0 = !count;
assert(threshold > 1);
if (remaining < threshold) {
/* This branch can be folded into the
* threshold update condition because we
* know that threshold > 1.
*/
if (remaining <= 1) break;
nbBits = ZSTD_highbit32(remaining) + 1;
threshold = 1 << (nbBits - 1);
}
if (charnum >= maxSV1) break;
if (LIKELY(ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
ip += bitCount>>3;
bitCount &= 7;
} else {
bitCount -= (int)(8 * (iend - 4 - ip));
bitCount &= 31;
ip = iend - 4;
}
bitStream = MEM_readLE32(ip) >> bitCount;
} }
if (remaining != 1) return ERROR(corruption_detected);
/* Only possible when there are too many zeros. */
if (charnum > maxSV1) return ERROR(maxSymbolValue_tooSmall);
if (bitCount > 32) return ERROR(corruption_detected);
*maxSVPtr = charnum-1;
ip += (bitCount+7)>>3;
return ip-istart;
}
/* Avoids the FORCE_INLINE of the _body() function. */
static size_t FSE_readNCount_body_default(
short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
const void* headerBuffer, size_t hbSize)
{
return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
}
#if DYNAMIC_BMI2
BMI2_TARGET_ATTRIBUTE static size_t FSE_readNCount_body_bmi2(
short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
const void* headerBuffer, size_t hbSize)
{
return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
}
#endif
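/* FSE_readNCount_bmi2() :
 * Same as FSE_readNCount(), but dispatches at run time to the BMI2-compiled
 * body when `bmi2` is non-zero and DYNAMIC_BMI2 is enabled. */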
size_t FSE_readNCount_bmi2(
short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
const void* headerBuffer, size_t hbSize, int bmi2)
{
#if DYNAMIC_BMI2
if (bmi2) {
return FSE_readNCount_body_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
}
#endif
(void)bmi2;
return FSE_readNCount_body_default(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
}
size_t FSE_readNCount(
short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
const void* headerBuffer, size_t hbSize)
{
return FSE_readNCount_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize, /* bmi2 */ 0);
}
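/* Illustrative usage sketch (not part of the library) : how a caller might
 * decode an FSE table description with FSE_readNCount(). Buffer names and
 * sizes below are hypothetical.
 *
 *   short norm[256];
 *   unsigned maxSymbolValue = 255;   // in : largest symbol accepted; out : largest symbol present
 *   unsigned tableLog;
 *   size_t const headerSize = FSE_readNCount(norm, &maxSymbolValue, &tableLog,
 *                                            srcBuffer, srcBufferSize);
 *   if (FSE_isError(headerSize)) return headerSize;
 *   // norm[0..maxSymbolValue] now holds the normalized counts (possibly -1),
 *   // summing in absolute value to 1<<tableLog;
 *   // headerSize bytes were consumed from srcBuffer.
 */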
/*! HUF_readStats() :
Read compact Huffman tree, saved by HUF_writeCTable().
`huffWeight` is destination buffer.
`rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX+1 U32.
@return : size read from `src`, or an error code.
Note : Needed by HUF_readCTable() and HUF_readDTableX?().
(An illustrative usage sketch follows the HUF_readStats() definition below.)
*/
size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize)
{
U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0);
}
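/* Illustrative usage sketch (not part of the library) : how a caller might
 * read Huffman weights with HUF_readStats(). Buffer names below are
 * hypothetical.
 *
 *   BYTE weights[HUF_SYMBOLVALUE_MAX + 1];
 *   U32  rankStats[HUF_TABLELOG_MAX + 1];
 *   U32  nbSymbols, tableLog;
 *   size_t const read = HUF_readStats(weights, sizeof(weights), rankStats,
 *                                     &nbSymbols, &tableLog, src, srcSize);
 *   if (HUF_isError(read)) return read;
 *   // weights[0..nbSymbols-1] holds one weight per symbol (the last one was
 *   // implied), tableLog is the maximum code length, and `read` is the number
 *   // of bytes consumed from src.
 */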
FORCE_INLINE_TEMPLATE size_t
HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats,
U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize,
void* workSpace, size_t wkspSize,
int bmi2)
{
U32 weightTotal;
const BYTE* ip = (const BYTE*) src;
size_t iSize;
size_t oSize;
if (!srcSize) return ERROR(srcSize_wrong);
iSize = ip[0];
/* ZSTD_memset(huffWeight, 0, hwSize); */ /* not necessary, even though some analyzers complain ... */
if (iSize >= 128) { /* special header */
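/* weights are stored raw : the header byte is 127 + number of weights,
 * and each weight takes 4 bits, two per byte (high nibble first) */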
oSize = iSize - 127;
iSize = ((oSize+1)/2);
if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
if (oSize >= hwSize) return ERROR(corruption_detected);
ip += 1;
{ U32 n;
for (n=0; n<oSize; n+=2) {
huffWeight[n] = ip[n/2] >> 4;
huffWeight[n+1] = ip[n/2] & 15;
} } }
else { /* header compressed with FSE (normal case) */
if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
/* max (hwSize-1) values decoded, as last one is implied */
oSize = FSE_decompress_wksp_bmi2(huffWeight, hwSize-1, ip+1, iSize, 6, workSpace, wkspSize, bmi2);
if (FSE_isError(oSize)) return oSize;
}
/* collect weight stats */
ZSTD_memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
weightTotal = 0;
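/* a symbol of weight w contributes 2^(w-1) to weightTotal;
 * weight 0 (symbol not present) contributes nothing */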
{ U32 n; for (n=0; n<oSize; n++) {
if (huffWeight[n] > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
rankStats[huffWeight[n]]++;
weightTotal += (1 << huffWeight[n]) >> 1;
} }
if (weightTotal == 0) return ERROR(corruption_detected);
/* get last non-null symbol weight (implied, total must be 2^n) */
{ U32 const tableLog = ZSTD_highbit32(weightTotal) + 1;
if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
*tableLogPtr = tableLog;
/* determine last weight */
{ U32 const total = 1 << tableLog;
U32 const rest = total - weightTotal;
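/* the last weight is not transmitted : it is the one that brings the total
 * to exactly 2^tableLog, so rest must itself be a power of 2 and the
 * implied weight is highbit32(rest)+1 */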
U32 const verif = 1 << ZSTD_highbit32(rest);
U32 const lastWeight = ZSTD_highbit32(rest) + 1;
if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */
huffWeight[oSize] = (BYTE)lastWeight;
rankStats[lastWeight]++;
} }
/* check tree construction validity */
if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected); /* by construction : at least 2 elts of rank 1, must be even */
/* results */
*nbSymbolsPtr = (U32)(oSize+1);
return iSize+1;
}
/* Avoids the FORCE_INLINE of the _body() function. */
static size_t HUF_readStats_body_default(BYTE* huffWeight, size_t hwSize, U32* rankStats,
U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize,
void* workSpace, size_t wkspSize)
{
return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 0);
}
#if DYNAMIC_BMI2
static BMI2_TARGET_ATTRIBUTE size_t HUF_readStats_body_bmi2(BYTE* huffWeight, size_t hwSize, U32* rankStats,
U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize,
void* workSpace, size_t wkspSize)
{
return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 1);
}
#endif
size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats,
U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize,
void* workSpace, size_t wkspSize,
int flags)
{
#if DYNAMIC_BMI2
if (flags & HUF_flags_bmi2) {
return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
}
#endif
(void)flags;
return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
}