Merge pull request #1443 from lz4/dictAPI

promote dictionary API to stable
This commit is contained in:
Yann Collet 2024-07-16 16:08:28 -07:00 committed by GitHub
commit 76c48c1e43
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 142 additions and 132 deletions

View File

@ -129,8 +129,8 @@ extern "C" {
/*------ Version ------*/
#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */
#define LZ4_VERSION_MINOR 9 /* for new (non-breaking) interface capabilities */
#define LZ4_VERSION_RELEASE 5 /* for tweaks, bug-fixes, or development */
#define LZ4_VERSION_MINOR 10 /* for new (non-breaking) interface capabilities */
#define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */
#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE)
@ -370,7 +370,7 @@ LZ4LIB_API void LZ4_resetStream_fast (LZ4_stream_t* streamPtr);
*/
LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
/*! LZ4_loadDictSlow() : v1.9.5+
/*! LZ4_loadDictSlow() : v1.10.0+
* Same as LZ4_loadDict(),
* but uses a bit more cpu to reference the dictionary content more thoroughly.
* This is expected to slightly improve compression ratio.
@ -379,6 +379,42 @@ LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, in
*/
LZ4LIB_API int LZ4_loadDictSlow(LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
/*! LZ4_attach_dictionary() : stable since v1.10.0
*
* This allows efficient re-use of a static dictionary multiple times.
*
* Rather than re-loading the dictionary buffer into a working context before
* each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a
* working LZ4_stream_t, this function introduces a no-copy setup mechanism,
* in which the working stream references @dictionaryStream in-place.
*
* Several assumptions are made about the state of @dictionaryStream.
* Currently, only states which have been prepared by LZ4_loadDict() or
* LZ4_loadDictSlow() should be expected to work.
*
* Alternatively, the provided @dictionaryStream may be NULL,
* in which case any existing dictionary stream is unset.
*
* If a dictionary is provided, it replaces any pre-existing stream history.
* The dictionary contents are the only history that can be referenced and
* logically immediately precede the data compressed in the first subsequent
* compression call.
*
* The dictionary will only remain attached to the working stream through the
* first compression call, at the end of which it is cleared.
* @dictionaryStream stream (and source buffer) must remain in-place / accessible / unchanged
* through the completion of the compression session.
*
* Note: there is no equivalent LZ4_attach_*() method on the decompression side
* because there is no initialization cost, hence no need to share the cost across multiple sessions.
* To decompress LZ4 blocks using dictionary, attached or not,
* just employ the regular LZ4_setStreamDecode() for streaming,
* or the stateless LZ4_decompress_safe_usingDict() for one-shot decompression.
*/
LZ4LIB_API void
LZ4_attach_dictionary(LZ4_stream_t* workingStream,
const LZ4_stream_t* dictionaryStream);
/*! LZ4_compress_fast_continue() :
* Compress 'src' content using data from previously compressed blocks, for better compression ratio.
* 'dst' buffer must be already allocated.
@ -580,37 +616,6 @@ LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset (void* state, const c
*/
int LZ4_compress_destSize_extState(void* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize, int acceleration);
/*! LZ4_attach_dictionary() :
* This is an experimental API that allows
* efficient use of a static dictionary many times.
*
* Rather than re-loading the dictionary buffer into a working context before
* each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a
* working LZ4_stream_t, this function introduces a no-copy setup mechanism,
* in which the working stream references the dictionary stream in-place.
*
* Several assumptions are made about the state of the dictionary stream.
* Currently, only streams which have been prepared by LZ4_loadDict() should
* be expected to work.
*
* Alternatively, the provided dictionaryStream may be NULL,
* in which case any existing dictionary stream is unset.
*
* If a dictionary is provided, it replaces any pre-existing stream history.
* The dictionary contents are the only history that can be referenced and
* logically immediately precede the data compressed in the first subsequent
* compression call.
*
* The dictionary will only remain attached to the working stream through the
* first compression call, at the end of which it is cleared. The dictionary
* stream (and source buffer) must remain in-place / accessible / unchanged
* through the completion of the first compression call on the stream.
*/
LZ4LIB_STATIC_API void
LZ4_attach_dictionary(LZ4_stream_t* workingStream,
const LZ4_stream_t* dictionaryStream);
/*! In-place compression and decompression
*
* It's possible to have input and output sharing the same buffer,

View File

@ -513,6 +513,109 @@ LZ4F_decompress(LZ4F_dctx* dctx,
LZ4FLIB_API void LZ4F_resetDecompressionContext(LZ4F_dctx* dctx); /* always successful */
/**********************************
* Dictionary compression API
*********************************/
/* A Dictionary is useful for the compression of small messages (KB range).
* It dramatically improves compression efficiency.
*
* LZ4 can ingest any input as dictionary, though only the last 64 KB are useful.
* Better results are generally achieved by using Zstandard's Dictionary Builder
* to generate a high-quality dictionary from a set of samples.
*
* The same dictionary will have to be used on the decompression side
* for decoding to be successful.
* To help identify the correct dictionary at decoding stage,
* the frame header allows optional embedding of a dictID field.
*/
/*! LZ4F_compressBegin_usingDict() : stable since v1.10
* Inits dictionary compression streaming, and writes the frame header into dstBuffer.
* @dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
* @prefsPtr is optional : one may provide NULL as argument,
* however, it's the only way to provide dictID in the frame header.
* @dictBuffer must outlive the compression session.
* @return : number of bytes written into dstBuffer for the header,
* or an error code (which can be tested using LZ4F_isError())
* NOTE: The LZ4Frame spec allows each independent block to be compressed with the dictionary,
* but this entry supports a more limited scenario, where only the first block uses the dictionary.
* This is still useful for small data, which only need one block anyway.
* For larger inputs, one may be more interested in LZ4F_compressFrame_usingCDict() below.
*/
LZ4FLIB_API size_t
LZ4F_compressBegin_usingDict(LZ4F_cctx* cctx,
void* dstBuffer, size_t dstCapacity,
const void* dictBuffer, size_t dictSize,
const LZ4F_preferences_t* prefsPtr);
/*! LZ4F_decompress_usingDict() : stable since v1.10
* Same as LZ4F_decompress(), using a predefined dictionary.
* Dictionary is used "in place", without any preprocessing.
** It must remain accessible throughout the entire frame decoding. */
LZ4FLIB_API size_t
LZ4F_decompress_usingDict(LZ4F_dctx* dctxPtr,
void* dstBuffer, size_t* dstSizePtr,
const void* srcBuffer, size_t* srcSizePtr,
const void* dict, size_t dictSize,
const LZ4F_decompressOptions_t* decompressOptionsPtr);
/*****************************************
* Bulk processing dictionary compression
*****************************************/
/* Loading a dictionary has a cost, since it involves construction of tables.
* The Bulk processing dictionary API makes it possible to share this cost
* over an arbitrary number of compression jobs, even concurrently,
* markedly improving compression latency for these cases.
*
* Note that there is no corresponding bulk API for the decompression side,
* because dictionary does not carry any initialization cost for decompression.
* Use the regular LZ4F_decompress_usingDict() there.
*/
typedef struct LZ4F_CDict_s LZ4F_CDict;
/*! LZ4_createCDict() : stable since v1.10
* When compressing multiple messages / blocks using the same dictionary, it's recommended to initialize it just once.
* LZ4_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
* LZ4_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
* @dictBuffer can be released after LZ4_CDict creation, since its content is copied within CDict. */
LZ4FLIB_API LZ4F_CDict* LZ4F_createCDict(const void* dictBuffer, size_t dictSize);
LZ4FLIB_API void LZ4F_freeCDict(LZ4F_CDict* CDict);
/*! LZ4_compressFrame_usingCDict() : stable since v1.10
* Compress an entire srcBuffer into a valid LZ4 frame using a digested Dictionary.
* @cctx must point to a context created by LZ4F_createCompressionContext().
* If @cdict==NULL, compress without a dictionary.
* @dstBuffer MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr).
* If this condition is not respected, function will fail (@return an errorCode).
* The LZ4F_preferences_t structure is optional : one may provide NULL as argument,
* but it's not recommended, as it's the only way to provide @dictID in the frame header.
* @return : number of bytes written into dstBuffer.
* or an error code if it fails (can be tested using LZ4F_isError())
* Note: for larger inputs generating multiple independent blocks,
* this entry point uses the dictionary for each block. */
LZ4FLIB_API size_t
LZ4F_compressFrame_usingCDict(LZ4F_cctx* cctx,
void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
const LZ4F_CDict* cdict,
const LZ4F_preferences_t* preferencesPtr);
/*! LZ4F_compressBegin_usingCDict() : stable since v1.10
* Inits streaming dictionary compression, and writes the frame header into dstBuffer.
* @dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
* @prefsPtr is optional : one may provide NULL as argument,
* note however that it's the only way to insert a @dictID in the frame header.
* @cdict must outlive the compression session.
* @return : number of bytes written into dstBuffer for the header,
* or an error code, which can be tested using LZ4F_isError(). */
LZ4FLIB_API size_t
LZ4F_compressBegin_usingCDict(LZ4F_cctx* cctx,
void* dstBuffer, size_t dstCapacity,
const LZ4F_CDict* cdict,
const LZ4F_preferences_t* prefsPtr);
#if defined (__cplusplus)
}
@ -612,104 +715,6 @@ LZ4F_uncompressedUpdate(LZ4F_cctx* cctx,
const void* srcBuffer, size_t srcSize,
const LZ4F_compressOptions_t* cOptPtr);
/**********************************
* Dictionary compression API
*********************************/
/* A Dictionary is useful for the compression of small messages (KB range).
* It dramatically improves compression efficiency.
*
* LZ4 can ingest any input as dictionary, though only the last 64 KB are useful.
* Better results are generally achieved by using Zstandard's Dictionary Builder
* to generate a high-quality dictionary from a set of samples.
*
* The same dictionary will have to be used on the decompression side
* for decoding to be successful.
* To help identify the correct dictionary at decoding stage,
* the frame header allows optional embedding of a dictID field.
*/
/*! LZ4F_compressBegin_usingDict() :
* Inits dictionary compression streaming, and writes the frame header into dstBuffer.
* `dstCapacity` must be >= LZ4F_HEADER_SIZE_MAX bytes.
* `prefsPtr` is optional : you may provide NULL as argument,
* however, it's the only way to provide dictID in the frame header.
* `dictBuffer` must outlive the compression session.
* @return : number of bytes written into dstBuffer for the header,
* or an error code (which can be tested using LZ4F_isError())
* NOTE: this entry point doesn't fully exploit the spec,
* which allows each independent block to be compressed with the dictionary.
* Currently, only the first block uses the dictionary.
* This is still technically compliant, but less efficient for large inputs.
*/
LZ4FLIB_STATIC_API size_t
LZ4F_compressBegin_usingDict(LZ4F_cctx* cctx,
void* dstBuffer, size_t dstCapacity,
const void* dictBuffer, size_t dictSize,
const LZ4F_preferences_t* prefsPtr);
/*! LZ4F_decompress_usingDict() :
* Same as LZ4F_decompress(), using a predefined dictionary.
* Dictionary is used "in place", without any preprocessing.
** It must remain accessible throughout the entire frame decoding. */
LZ4FLIB_STATIC_API size_t
LZ4F_decompress_usingDict(LZ4F_dctx* dctxPtr,
void* dstBuffer, size_t* dstSizePtr,
const void* srcBuffer, size_t* srcSizePtr,
const void* dict, size_t dictSize,
const LZ4F_decompressOptions_t* decompressOptionsPtr);
/**********************************
* Bulk processing dictionary API
*********************************/
/* Loading a dictionary has a cost, since it involves construction of tables.
* The Bulk processing dictionary API makes it possible to share this cost
* over an arbitrary number of compression jobs, even concurrently,
* markedly improving compression latency for these cases.
*/
typedef struct LZ4F_CDict_s LZ4F_CDict;
/*! LZ4_createCDict() :
* When compressing multiple messages / blocks using the same dictionary, it's recommended to load it just once.
* LZ4_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
* LZ4_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
* `dictBuffer` can be released after LZ4_CDict creation, since its content is copied within CDict */
LZ4FLIB_STATIC_API LZ4F_CDict* LZ4F_createCDict(const void* dictBuffer, size_t dictSize);
LZ4FLIB_STATIC_API void LZ4F_freeCDict(LZ4F_CDict* CDict);
/*! LZ4_compressFrame_usingCDict() :
* Compress an entire srcBuffer into a valid LZ4 frame using a digested Dictionary.
* cctx must point to a context created by LZ4F_createCompressionContext().
* If cdict==NULL, compress without a dictionary.
* dstBuffer MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr).
* If this condition is not respected, function will fail (@return an errorCode).
* The LZ4F_preferences_t structure is optional : you may provide NULL as argument,
* but it's not recommended, as it's the only way to provide dictID in the frame header.
* @return : number of bytes written into dstBuffer.
* or an error code if it fails (can be tested using LZ4F_isError()) */
LZ4FLIB_STATIC_API size_t
LZ4F_compressFrame_usingCDict(LZ4F_cctx* cctx,
void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
const LZ4F_CDict* cdict,
const LZ4F_preferences_t* preferencesPtr);
/*! LZ4F_compressBegin_usingCDict() :
* Inits streaming dictionary compression, and writes the frame header into dstBuffer.
* `dstCapacity` must be >= LZ4F_HEADER_SIZE_MAX bytes.
* `prefsPtr` is optional : you may provide NULL as argument,
* however, it's the only way to provide dictID in the frame header.
* `cdict` must outlive the compression session.
* @return : number of bytes written into dstBuffer for the header,
* or an error code (which can be tested using LZ4F_isError()) */
LZ4FLIB_STATIC_API size_t
LZ4F_compressBegin_usingCDict(LZ4F_cctx* cctx,
void* dstBuffer, size_t dstCapacity,
const LZ4F_CDict* cdict,
const LZ4F_preferences_t* prefsPtr);
/**********************************
* Custom memory allocation
*********************************/