diff options
Diffstat (limited to 'contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c')
-rw-r--r-- | contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c | 442 |
1 files changed, 442 insertions, 0 deletions
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c new file mode 100644 index 0000000000000..cd943797bdead --- /dev/null +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c @@ -0,0 +1,442 @@ +#include <stdio.h> /* fprintf */ +#include <stdlib.h> /* malloc, free, qsort */ +#include <string.h> /* strcmp, strlen */ +#include <errno.h> /* errno */ +#include <ctype.h> +#include <time.h> +#include "random.h" +#include "dictBuilder.h" +#include "zstd_internal.h" /* includes zstd.h */ +#include "io.h" +#include "util.h" +#include "zdict.h" + + + +/*-************************************* +* Console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + + +/*-************************************* +* Constants +***************************************/ +static const unsigned g_defaultMaxDictSize = 110 KB; +#define DEFAULT_CLEVEL 3 +#define DEFAULT_DISPLAYLEVEL 2 + + +/*-************************************* +* Struct +***************************************/ +typedef struct { + const void* dictBuffer; + size_t dictSize; +} dictInfo; + + +/*-************************************* +* Dictionary related operations +***************************************/ +/** createDictFromFiles() : + * Based on type of param given, train dictionary using the corresponding algorithm + * @return dictInfo containing dictionary buffer and dictionary size + */ +dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize, + ZDICT_random_params_t *randomParams, ZDICT_cover_params_t *coverParams, + ZDICT_legacy_params_t *legacyParams, ZDICT_fastCover_params_t *fastParams) { + unsigned const displayLevel = randomParams ? randomParams->zParams.notificationLevel : + coverParams ? coverParams->zParams.notificationLevel : + legacyParams ? legacyParams->zParams.notificationLevel : + fastParams ? fastParams->zParams.notificationLevel : + DEFAULT_DISPLAYLEVEL; /* no dict */ + void* const dictBuffer = malloc(maxDictSize); + + dictInfo* dInfo = NULL; + + /* Checks */ + if (!dictBuffer) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + + { size_t dictSize; + if(randomParams) { + dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *randomParams); + }else if(coverParams) { + /* Run the optimize version if either k or d is not provided */ + if (!coverParams->d || !coverParams->k){ + dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, coverParams); + } else { + dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *coverParams); + } + } else if(legacyParams) { + dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *legacyParams); + } else if(fastParams) { + /* Run the optimize version if either k or d is not provided */ + if (!fastParams->d || !fastParams->k) { + dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, fastParams); + } else { + dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *fastParams); + } + } else { + dictSize = 0; + } + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + free(dictBuffer); + return dInfo; + } + dInfo = (dictInfo *)malloc(sizeof(dictInfo)); + dInfo->dictBuffer = dictBuffer; + dInfo->dictSize = dictSize; + } + return dInfo; +} + + +/** compressWithDict() : + * Compress samples from sample buffer given dictionary stored on dictionary buffer and compression level + * @return compression ratio + */ +double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLevel, int displayLevel) { + /* Local variables */ + size_t totalCompressedSize = 0; + size_t totalOriginalSize = 0; + const unsigned hasDict = dInfo->dictSize > 0 ? 1 : 0; + double cRatio; + size_t dstCapacity; + int i; + + /* Pointers */ + ZSTD_CDict *cdict = NULL; + ZSTD_CCtx* cctx = NULL; + size_t *offsets = NULL; + void* dst = NULL; + + /* Allocate dst with enough space to compress the maximum sized sample */ + { + size_t maxSampleSize = 0; + for (i = 0; i < srcInfo->nbSamples; i++) { + maxSampleSize = MAX(srcInfo->samplesSizes[i], maxSampleSize); + } + dstCapacity = ZSTD_compressBound(maxSampleSize); + dst = malloc(dstCapacity); + } + + /* Calculate offset for each sample */ + offsets = (size_t *)malloc((srcInfo->nbSamples + 1) * sizeof(size_t)); + offsets[0] = 0; + for (i = 1; i <= srcInfo->nbSamples; i++) { + offsets[i] = offsets[i - 1] + srcInfo->samplesSizes[i - 1]; + } + + /* Create the cctx */ + cctx = ZSTD_createCCtx(); + if(!cctx || !dst) { + cRatio = -1; + goto _cleanup; + } + + /* Create CDict if there's a dictionary stored on buffer */ + if (hasDict) { + cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel); + if(!cdict) { + cRatio = -1; + goto _cleanup; + } + } + + /* Compress each sample and sum their sizes*/ + const BYTE *const samples = (const BYTE *)srcInfo->srcBuffer; + for (i = 0; i < srcInfo->nbSamples; i++) { + size_t compressedSize; + if(hasDict) { + compressedSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, samples + offsets[i], srcInfo->samplesSizes[i], cdict); + } else { + compressedSize = ZSTD_compressCCtx(cctx, dst, dstCapacity,samples + offsets[i], srcInfo->samplesSizes[i], compressionLevel); + } + if (ZSTD_isError(compressedSize)) { + cRatio = -1; + goto _cleanup; + } + totalCompressedSize += compressedSize; + } + + /* Sum original sizes */ + for (i = 0; i<srcInfo->nbSamples; i++) { + totalOriginalSize += srcInfo->samplesSizes[i]; + } + + /* Calculate compression ratio */ + DISPLAYLEVEL(2, "original size is %lu\n", totalOriginalSize); + DISPLAYLEVEL(2, "compressed size is %lu\n", totalCompressedSize); + cRatio = (double)totalOriginalSize/(double)totalCompressedSize; + +_cleanup: + free(dst); + free(offsets); + ZSTD_freeCCtx(cctx); + ZSTD_freeCDict(cdict); + return cRatio; +} + + +/** FreeDictInfo() : + * Free memory allocated for dictInfo + */ +void freeDictInfo(dictInfo* info) { + if (!info) return; + if (info->dictBuffer) free((void*)(info->dictBuffer)); + free(info); +} + + + +/*-******************************************************** + * Benchmarking functions +**********************************************************/ +/** benchmarkDictBuilder() : + * Measure how long a dictionary builder takes and compression ratio with the dictionary built + * @return 0 if benchmark successfully, 1 otherwise + */ +int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam, + ZDICT_cover_params_t *coverParam, ZDICT_legacy_params_t *legacyParam, + ZDICT_fastCover_params_t *fastParam) { + /* Local variables */ + const unsigned displayLevel = randomParam ? randomParam->zParams.notificationLevel : + coverParam ? coverParam->zParams.notificationLevel : + legacyParam ? legacyParam->zParams.notificationLevel : + fastParam ? fastParam->zParams.notificationLevel: + DEFAULT_DISPLAYLEVEL; /* no dict */ + const char* name = randomParam ? "RANDOM" : + coverParam ? "COVER" : + legacyParam ? "LEGACY" : + fastParam ? "FAST": + "NODICT"; /* no dict */ + const unsigned cLevel = randomParam ? randomParam->zParams.compressionLevel : + coverParam ? coverParam->zParams.compressionLevel : + legacyParam ? legacyParam->zParams.compressionLevel : + fastParam ? fastParam->zParams.compressionLevel: + DEFAULT_CLEVEL; /* no dict */ + int result = 0; + + /* Calculate speed */ + const UTIL_time_t begin = UTIL_getTime(); + dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, coverParam, legacyParam, fastParam); + const U64 timeMicro = UTIL_clockSpanMicro(begin); + const double timeSec = timeMicro / (double)SEC_TO_MICRO; + if (!dInfo) { + DISPLAYLEVEL(1, "%s does not train successfully\n", name); + result = 1; + goto _cleanup; + } + DISPLAYLEVEL(1, "%s took %f seconds to execute \n", name, timeSec); + + /* Calculate compression ratio */ + const double cRatio = compressWithDict(srcInfo, dInfo, cLevel, displayLevel); + if (cRatio < 0) { + DISPLAYLEVEL(1, "Compressing with %s dictionary does not work\n", name); + result = 1; + goto _cleanup; + + } + DISPLAYLEVEL(1, "Compression ratio with %s dictionary is %f\n", name, cRatio); + +_cleanup: + freeDictInfo(dInfo); + return result; +} + + + +int main(int argCount, const char* argv[]) +{ + const int displayLevel = DEFAULT_DISPLAYLEVEL; + const char* programName = argv[0]; + int result = 0; + + /* Initialize arguments to default values */ + unsigned k = 200; + unsigned d = 8; + unsigned f; + unsigned accel; + unsigned i; + const unsigned cLevel = DEFAULT_CLEVEL; + const unsigned dictID = 0; + const unsigned maxDictSize = g_defaultMaxDictSize; + + /* Initialize table to store input files */ + const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); + unsigned filenameIdx = 0; + + char* fileNamesBuf = NULL; + unsigned fileNamesNb = filenameIdx; + const int followLinks = 0; + const char** extendedFileList = NULL; + + /* Parse arguments */ + for (i = 1; i < argCount; i++) { + const char* argument = argv[i]; + if (longCommandWArg(&argument, "in=")) { + filenameTable[filenameIdx] = argument; + filenameIdx++; + continue; + } + DISPLAYLEVEL(1, "benchmark: Incorrect parameters\n"); + return 1; + } + + /* Get the list of all files recursively (because followLinks==0)*/ + extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, + &fileNamesNb, followLinks); + if (extendedFileList) { + unsigned u; + for (u=0; u<fileNamesNb; u++) DISPLAYLEVEL(4, "%u %s\n", u, extendedFileList[u]); + free((void*)filenameTable); + filenameTable = extendedFileList; + filenameIdx = fileNamesNb; + } + + /* get sampleInfo */ + size_t blockSize = 0; + sampleInfo* srcInfo= getSampleInfo(filenameTable, + filenameIdx, blockSize, maxDictSize, displayLevel); + + /* set up zParams */ + ZDICT_params_t zParams; + zParams.compressionLevel = cLevel; + zParams.notificationLevel = displayLevel; + zParams.dictID = dictID; + + /* with no dict */ + { + const int noDictResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, NULL); + if(noDictResult) { + result = 1; + goto _cleanup; + } + } + + /* for random */ + { + ZDICT_random_params_t randomParam; + randomParam.zParams = zParams; + randomParam.k = k; + const int randomResult = benchmarkDictBuilder(srcInfo, maxDictSize, &randomParam, NULL, NULL, NULL); + DISPLAYLEVEL(2, "k=%u\n", randomParam.k); + if(randomResult) { + result = 1; + goto _cleanup; + } + } + + /* for legacy */ + { + ZDICT_legacy_params_t legacyParam; + legacyParam.zParams = zParams; + legacyParam.selectivityLevel = 9; + const int legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam, NULL); + DISPLAYLEVEL(2, "selectivityLevel=%u\n", legacyParam.selectivityLevel); + if(legacyResult) { + result = 1; + goto _cleanup; + } + } + + /* for cover */ + { + /* for cover (optimizing k and d) */ + ZDICT_cover_params_t coverParam; + memset(&coverParam, 0, sizeof(coverParam)); + coverParam.zParams = zParams; + coverParam.splitPoint = 1.0; + coverParam.steps = 40; + coverParam.nbThreads = 1; + const int coverOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL); + DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParam.k, coverParam.d, coverParam.steps, (unsigned)(coverParam.splitPoint * 100)); + if(coverOptResult) { + result = 1; + goto _cleanup; + } + + /* for cover (with k and d provided) */ + const int coverResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL); + DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParam.k, coverParam.d, coverParam.steps, (unsigned)(coverParam.splitPoint * 100)); + if(coverResult) { + result = 1; + goto _cleanup; + } + + } + + /* for fastCover */ + for (f = 15; f < 25; f++){ + DISPLAYLEVEL(2, "current f is %u\n", f); + for (accel = 1; accel < 11; accel++) { + DISPLAYLEVEL(2, "current accel is %u\n", accel); + /* for fastCover (optimizing k and d) */ + ZDICT_fastCover_params_t fastParam; + memset(&fastParam, 0, sizeof(fastParam)); + fastParam.zParams = zParams; + fastParam.f = f; + fastParam.steps = 40; + fastParam.nbThreads = 1; + fastParam.accel = accel; + const int fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); + DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100), fastParam.accel); + if(fastOptResult) { + result = 1; + goto _cleanup; + } + + /* for fastCover (with k and d provided) */ + for (i = 0; i < 5; i++) { + const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); + DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100), fastParam.accel); + if(fastResult) { + result = 1; + goto _cleanup; + } + } + } + } + + + /* Free allocated memory */ +_cleanup: + UTIL_freeFileList(extendedFileList, fileNamesBuf); + freeSampleInfo(srcInfo); + return result; +} |