From 796699c0bc12ae1efa405418b14cb6dd0101cc6f Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 11 Jan 2023 15:11:51 -0800 Subject: [PATCH 1/2] fix root cause of #3416 A minor change in 5434de0 changed a `<=` into a `<`, and as an indirect consequence allowed compression attempt of literals when there are only 6 literals to compress (previous limit was effectively 7 literals). This is not in itself a problem, as the threshold is merely an heuristic, but it emerged a bug that has always been there, and was just never triggered so far due to the previous limit. This bug would make the literal compressor believes that all literals are the same symbol, but for the exact case where nbLiterals==6, plus a pretty wild combination of other limit conditions, this outcome could be false, resulting in data corruption. Replaced the blind heuristic by an actual test for all limit cases, so that even if the threshold is changed again in the future, the detection of RLE mode will remain reliable. --- lib/common/huf.h | 4 +-- lib/compress/huf_compress.c | 6 ++--- lib/compress/zstd_compress_literals.c | 38 +++++++++++++++++++-------- lib/compress/zstd_compress_literals.h | 4 +++ lib/decompress/zstd_decompress.c | 1 + 5 files changed, 37 insertions(+), 16 deletions(-) diff --git a/lib/common/huf.h b/lib/common/huf.h index f7316f9b2..9683f915a 100644 --- a/lib/common/huf.h +++ b/lib/common/huf.h @@ -221,7 +221,7 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, - unsigned suspectUncompressible, HUF_depth_mode depthMode); + int suspectUncompressible, HUF_depth_mode depthMode); /** HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buffer. @@ -328,7 +328,7 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, - unsigned suspectUncompressible, HUF_depth_mode depthMode); + int suspectUncompressible, HUF_depth_mode depthMode); size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ #ifndef HUF_FORCE_DECOMPRESS_X1 diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c index f7ca2d3bb..75200851b 100644 --- a/lib/compress/huf_compress.c +++ b/lib/compress/huf_compress.c @@ -1318,7 +1318,7 @@ HUF_compress_internal (void* dst, size_t dstSize, HUF_nbStreams_e nbStreams, void* workSpace, size_t wkspSize, HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, - const int bmi2, unsigned suspectUncompressible, HUF_depth_mode depthMode) + const int bmi2, int suspectUncompressible, HUF_depth_mode depthMode) { HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); BYTE* const ostart = (BYTE*)dst; @@ -1439,7 +1439,7 @@ size_t HUF_compress1X_repeat (void* dst, size_t dstSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, - int bmi2, unsigned suspectUncompressible, HUF_depth_mode depthMode) + int bmi2, int suspectUncompressible, HUF_depth_mode depthMode) { DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); return HUF_compress_internal(dst, dstSize, src, srcSize, @@ -1472,7 +1472,7 @@ size_t HUF_compress4X_repeat (void* dst, size_t dstSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, - unsigned suspectUncompressible, HUF_depth_mode depthMode) + int suspectUncompressible, HUF_depth_mode depthMode) { DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); return HUF_compress_internal(dst, dstSize, src, srcSize, diff --git a/lib/compress/zstd_compress_literals.c b/lib/compress/zstd_compress_literals.c index 666e5315d..e84781873 100644 --- a/lib/compress/zstd_compress_literals.c +++ b/lib/compress/zstd_compress_literals.c @@ -24,9 +24,9 @@ static size_t showHexa(const void* src, size_t srcSize) const BYTE* const ip = (const BYTE*)src; size_t u; for (u=0; u= 1); + assert(src != NULL); + { const BYTE b = ((const BYTE*)src)[0]; + size_t p; + for (p=1; p31) + (srcSize>4095); - (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ + assert(dstCapacity >= 4); (void)dstCapacity; + assert(allBytesIdentical(src, srcSize)); switch(flSize) { @@ -88,7 +102,7 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* } ostart[flSize] = *(const BYTE*)src; - DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); + DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); return flSize+1; } @@ -105,8 +119,8 @@ ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) /* btultra2 : min 8 bytes; * then 2x larger for each successive compression strategy * max threshold 64 bytes */ - { int const shift = MIN(9-strategy, 3); - size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : 8 << shift; + { int const shift = MIN(9-(int)strategy, 3); + size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); return mintc; } @@ -148,7 +162,7 @@ size_t ZSTD_compressLiterals ( { HUF_repeat repeat = prevHuf->repeatMode; int const preferRepeat = (strategy < ZSTD_lazy) ? srcSize <= 1024 : 0; HUF_depth_mode const depthMode = (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD) ? HUF_depth_optimal : HUF_depth_fast; - typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int, int, unsigned, HUF_depth_mode); + typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int, int, int, HUF_depth_mode); huf_compress_f huf_compress; if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; @@ -159,9 +173,10 @@ size_t ZSTD_compressLiterals ( (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible, depthMode); + DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); if (repeat != HUF_repeat_none) { /* reused the existing table */ - DEBUGLOG(5, "Reusing previous huffman table"); + DEBUGLOG(5, "reusing statistics from previous huffman block"); hType = set_repeat; } } @@ -172,9 +187,10 @@ size_t ZSTD_compressLiterals ( return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); } } if (cLitSize==1) { - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); - } + if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); + } } if (hType == set_compressed) { /* using a newly constructed table */ diff --git a/lib/compress/zstd_compress_literals.h b/lib/compress/zstd_compress_literals.h index 9eb74729d..b060c8ad2 100644 --- a/lib/compress/zstd_compress_literals.h +++ b/lib/compress/zstd_compress_literals.h @@ -16,6 +16,10 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); +/* ZSTD_compressRleLiteralsBlock() : + * Conditions : + * - All bytes in @src are identical + * - dstCapacity >= 4 */ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); /* ZSTD_compressLiterals(): diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index f00ef3a67..84577b6e1 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -981,6 +981,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, } ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); /* Allow caller to get size read */ + DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); *srcPtr = ip; *srcSizePtr = remainingSrcSize; return (size_t)(op-ostart); From ac45e078a5bc22e06d1be1375c25795b7ef3c1ff Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 12 Jan 2023 15:49:01 -0800 Subject: [PATCH 2/2] add explanation about new test as requested by @terrelln --- lib/compress/zstd_compress_literals.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/compress/zstd_compress_literals.c b/lib/compress/zstd_compress_literals.c index e84781873..0dbc2c4d4 100644 --- a/lib/compress/zstd_compress_literals.c +++ b/lib/compress/zstd_compress_literals.c @@ -187,6 +187,11 @@ size_t ZSTD_compressLiterals ( return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); } } if (cLitSize==1) { + /* A return value of 1 signals that the alphabet consists of a single symbol. + * However, in some rare circumstances, it could be the compressed size (a single byte). + * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. + * (it's also necessary to not generate statistics). + * Therefore, in such a case, actively check that all bytes are identical. */ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);