Add --size-hint=# option

This commit is contained in:
Nick Magerko 2019-08-19 08:52:08 -07:00
parent c9072ee674
commit dffbac5f89
8 changed files with 155 additions and 82 deletions

View File

@ -324,6 +324,7 @@ size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx);
* ZSTD_c_forceAttachDict
* ZSTD_c_literalCompressionMode
* ZSTD_c_targetCBlockSize
* ZSTD_c_srcSizeHint
* Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
* note : never ever use experimentalParam? names directly;
* also, the enums values themselves are unstable and can still change.
@ -334,6 +335,7 @@ size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx);
ZSTD_c_experimentalParam4=1001,
ZSTD_c_experimentalParam5=1002,
ZSTD_c_experimentalParam6=1003,
ZSTD_c_experimentalParam7=1004,
} ZSTD_cParameter;
</b></pre><BR>
<pre><b>typedef struct {

View File

@ -392,6 +392,11 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX;
return bounds;
case ZSTD_c_srcSizeHint:
bounds.lowerBound = 0;
bounds.upperBound = ZSTD_SRCSIZEHINT_MAX;
return bounds;
default:
{ ZSTD_bounds const boundError = { ERROR(parameter_unsupported), 0, 0 };
return boundError;
@ -448,6 +453,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
case ZSTD_c_forceAttachDict:
case ZSTD_c_literalCompressionMode:
case ZSTD_c_targetCBlockSize:
case ZSTD_c_srcSizeHint:
default:
return 0;
}
@ -494,6 +500,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
case ZSTD_c_ldmMinMatch:
case ZSTD_c_ldmBucketSizeLog:
case ZSTD_c_targetCBlockSize:
case ZSTD_c_srcSizeHint:
break;
default: RETURN_ERROR(parameter_unsupported);
@ -674,6 +681,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
CCtxParams->targetCBlockSize = value;
return CCtxParams->targetCBlockSize;
case ZSTD_c_srcSizeHint :
if (value!=0) /* 0 ==> default */
BOUNDCHECK(ZSTD_c_srcSizeHint, value);
CCtxParams->srcSizeHint = value;
return CCtxParams->srcSizeHint;
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
}
}
@ -779,6 +792,8 @@ size_t ZSTD_CCtxParams_getParameter(
case ZSTD_c_targetCBlockSize :
*value = (int)CCtxParams->targetCBlockSize;
break;
case ZSTD_c_srcSizeHint :
*value = (int)CCtxParams->srcSizeHint;
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
}
return 0;
@ -1029,7 +1044,11 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize)
{
ZSTD_compressionParameters cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize);
ZSTD_compressionParameters cParams;
if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) {
srcSizeHint = CCtxParams->srcSizeHint;
}
cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize);
if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog;
if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog;

View File

@ -203,6 +203,9 @@ struct ZSTD_CCtx_params_s {
size_t targetCBlockSize; /* Tries to fit compressed block size to be around targetCBlockSize.
* No target when targetCBlockSize == 0.
* There is no guarantee on compressed block size */
size_t srcSizeHint; /* User's best guess of source size.
* Hint is not valid when srcSizeHint == 0.
* There is no guarantee that hint is close to actual source size */
ZSTD_dictAttachPref_e attachDictPref;
ZSTD_literalCompressionMode_e literalCompressionMode;

View File

@ -386,6 +386,7 @@ typedef enum {
* ZSTD_c_forceAttachDict
* ZSTD_c_literalCompressionMode
* ZSTD_c_targetCBlockSize
* ZSTD_c_srcSizeHint
* Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
* note : never ever use experimentalParam? names directly;
* also, the enums values themselves are unstable and can still change.
@ -396,6 +397,7 @@ typedef enum {
ZSTD_c_experimentalParam4=1001,
ZSTD_c_experimentalParam5=1002,
ZSTD_c_experimentalParam6=1003,
ZSTD_c_experimentalParam7=1004,
} ZSTD_cParameter;
typedef struct {
@ -1063,6 +1065,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
/* Advanced parameter bounds */
#define ZSTD_TARGETCBLOCKSIZE_MIN 64
#define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX
#define ZSTD_SRCSIZEHINT_MAX 1e9 /* 1 GB */
/* internal */
#define ZSTD_HASHLOG3_MAX 17
@ -1441,6 +1444,11 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre
* There is no guarantee on compressed block size (default:0) */
#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
/* User's best guess of source size.
* Hint is not valid when srcSizeHint == 0.
* There is no guarantee that hint is close to actual source size */
#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7
/*! ZSTD_CCtx_getParameter() :
* Get the requested compression parameter value, selected by enum ZSTD_cParameter,
* and store it into int* value.

View File

@ -305,6 +305,7 @@ struct FIO_prefs_s {
int ldmBucketSizeLog;
int ldmHashRateLog;
size_t targetCBlockSize;
size_t srcSizeHint;
ZSTD_literalCompressionMode_e literalCompressionMode;
/* IO preferences */
@ -350,6 +351,7 @@ FIO_prefs_t* FIO_createPreferences(void)
ret->ldmBucketSizeLog = FIO_LDM_PARAM_NOTSET;
ret->ldmHashRateLog = FIO_LDM_PARAM_NOTSET;
ret->targetCBlockSize = 0;
ret->srcSizeHint = 0;
ret->literalCompressionMode = ZSTD_lcm_auto;
return ret;
}
@ -422,6 +424,10 @@ void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize)
prefs->targetCBlockSize = targetCBlockSize;
}
void FIO_setSrcSizeHint(FIO_prefs_t* const prefs, size_t srcSizeHint) {
prefs->srcSizeHint = srcSizeHint;
}
void FIO_setLiteralCompressionMode(
FIO_prefs_t* const prefs,
ZSTD_literalCompressionMode_e mode) {
@ -667,6 +673,8 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs,
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_compressionLevel, cLevel) );
/* max compressed block size */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_targetCBlockSize, (int)prefs->targetCBlockSize) );
/* source size hint */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_srcSizeHint, (int)prefs->srcSizeHint) );
/* long distance matching */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_enableLongDistanceMatching, prefs->ldmFlag) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmHashLog, prefs->ldmHashLog) );

View File

@ -72,6 +72,7 @@ void FIO_setRemoveSrcFile(FIO_prefs_t* const prefs, unsigned flag);
void FIO_setSparseWrite(FIO_prefs_t* const prefs, unsigned sparse); /**< 0: no sparse; 1: disable on stdout; 2: always enabled */
void FIO_setRsyncable(FIO_prefs_t* const prefs, int rsyncable);
void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize);
void FIO_setSrcSizeHint(FIO_prefs_t* const prefs, size_t srcSizeHint);
void FIO_setLiteralCompressionMode(
FIO_prefs_t* const prefs,
ZSTD_literalCompressionMode_e mode);

View File

@ -141,6 +141,7 @@ static int usage_advanced(const char* programName)
DISPLAY( "--long[=#]: enable long distance matching with given window log (default: %u)\n", g_defaultMaxWindowLog);
DISPLAY( "--fast[=#]: switch to ultra fast compression level (default: %u)\n", 1);
DISPLAY( "--adapt : dynamically adapt compression level to I/O conditions \n");
DISPLAY( "--size-hint=# optimize compression parameters for streaming input of approximately this size\n");
DISPLAY( "--target-compressed-block-size=# : make compressed block near targeted size \n");
#ifdef ZSTD_MULTITHREAD
DISPLAY( " -T# : spawns # compression threads (default: 1, 0==# cores) \n");
@ -589,6 +590,7 @@ int main(int argCount, const char* argv[])
unsigned maxDictSize = g_defaultMaxDictSize;
unsigned dictID = 0;
size_t targetCBlockSize = 0;
size_t srcSizeHint = 0;
int dictCLevel = g_defaultDictCLevel;
unsigned dictSelect = g_defaultSelectivityLevel;
#ifdef UTIL_HAS_CREATEFILELIST
@ -746,6 +748,7 @@ int main(int argCount, const char* argv[])
if (longCommandWArg(&argument, "--dictID=")) { dictID = readU32FromChar(&argument); continue; }
if (longCommandWArg(&argument, "--zstd=")) { if (!parseCompressionParameters(argument, &compressionParams)) CLEAN_RETURN(badusage(programName)); continue; }
if (longCommandWArg(&argument, "--target-compressed-block-size=")) { targetCBlockSize = readU32FromChar(&argument); continue; }
if (longCommandWArg(&argument, "--size-hint=")) { srcSizeHint = readU32FromChar(&argument); continue; }
if (longCommandWArg(&argument, "--long")) {
unsigned ldmWindowLog = 0;
ldmFlag = 1;
@ -1151,6 +1154,7 @@ int main(int argCount, const char* argv[])
FIO_setAdaptMax(prefs, adaptMax);
FIO_setRsyncable(prefs, rsyncable);
FIO_setTargetCBlockSize(prefs, targetCBlockSize);
FIO_setSrcSizeHint(prefs, srcSizeHint);
FIO_setLiteralCompressionMode(prefs, literalCompressionMode);
if (adaptMin > cLevel) cLevel = adaptMin;
if (adaptMax < cLevel) cLevel = adaptMax;
@ -1160,7 +1164,7 @@ int main(int argCount, const char* argv[])
else
operationResult = FIO_compressMultipleFilenames(prefs, filenameTable, filenameIdx, outFileName, suffix, dictFileName, cLevel, compressionParams);
#else
(void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; /* not used when ZSTD_NOCOMPRESS set */
(void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; (void)srcSizeHint; /* not used when ZSTD_NOCOMPRESS set */
DISPLAY("Compression not supported \n");
#endif
} else { /* decompression or test */

View File

@ -409,6 +409,34 @@ println "compress multiple files including a missing one (notHere) : "
$ZSTD -f tmp1 notHere tmp2 && die "missing file not detected!"
println "\n===> size-hint mode"
./datagen -g11000 > tmp
println "test : basic file compression vs streaming compression vs hinted streaming compression"
$ZSTD -14 -f tmp -o tmp.zst 2>&1 | tee file.out
cat tmp | $ZSTD -14 -f -o tmp.zst # only run for convenience of comparison
cat tmp | $ZSTD -14 -f -o tmp.zst --size-hint=11000 2>&1 | tee stream_sized.out
file_ratio=$(cat file.out | awk '{print $4}' | sed 's/%//g')
stream_sized_ratio=$(cat stream_sized.out | awk '{print $4}' | sed 's/%//g')
rm file.out stream_sized.out
ratio_diff=$(echo $stream_sized_ratio - $file_ratio | bc)
if [ $(echo "(100 * $ratio_diff) > 1" | bc -l) -eq 1 ]
then
die "hinted compression greater than 0.01% larger than file compression"
fi
println "test : hinted streaming compression and decompression"
cat tmp | $ZSTD -14 -f -o tmp.zst --size-hint=11000
$ZSTD -df tmp.zst -o tmp_decompress
cmp tmp tmp_decompress || die "difference between original and decompressed file"
println "test : incorrect hinted stream sizes"
cat tmp | $ZSTD -14 -f -o tmp.zst --size-hint=11050 # slightly too high
cat tmp | $ZSTD -14 -f -o tmp.zst --size-hint=10950 # slightly too low
cat tmp | $ZSTD -14 -f -o tmp.zst --size-hint=22000 # considerably too high
cat tmp | $ZSTD -14 -f -o tmp.zst --size-hint=5500 # considerably too low
println "\n===> dictionary tests "
println "- test with raw dict (content only) "