summaryrefslogtreecommitdiff
path: root/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c')
-rw-r--r--contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c442
1 files changed, 442 insertions, 0 deletions
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
new file mode 100644
index 0000000000000..cd943797bdead
--- /dev/null
+++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
@@ -0,0 +1,442 @@
+#include <stdio.h> /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* strcmp, strlen */
+#include <errno.h> /* errno */
+#include <ctype.h>
+#include <time.h>
+#include "random.h"
+#include "dictBuilder.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#include "io.h"
+#include "util.h"
+#include "zdict.h"
+
+
+
+/*-*************************************
+* Console display
+***************************************/
+#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+ if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+ { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+ if (displayLevel>=4) fflush(stderr); } } }
+
+
+/*-*************************************
+* Exceptions
+***************************************/
+#ifndef DEBUG
+# define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...) \
+{ \
+ DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+ DISPLAY("Error %i : ", error); \
+ DISPLAY(__VA_ARGS__); \
+ DISPLAY("\n"); \
+ exit(error); \
+}
+
+
+/*-*************************************
+* Constants
+***************************************/
+/* Capacity, in bytes, of the dictionary buffer that main() requests from every
+ * builder. NOTE(review): the `KB` suffix macro is not defined in this file;
+ * presumably it comes from zstd_internal.h - confirm. */
+static const unsigned g_defaultMaxDictSize = 110 KB;
+/* Compression level used by main(), and by benchmarkDictBuilder() when no
+ * parameter set is supplied (the "no dictionary" run). */
+#define DEFAULT_CLEVEL 3
+/* Notification level used by main(), and by the dictionary helpers when no
+ * parameter set is supplied. */
+#define DEFAULT_DISPLAYLEVEL 2
+
+
+/*-*************************************
+* Struct
+***************************************/
+/* Outcome of one training run, filled in by createDictFromFiles(). */
+typedef struct {
+ /* Dictionary content; released by freeDictInfo(). */
+ const void* dictBuffer;
+ /* Size value returned by the trainer; compressWithDict() treats 0 as
+  * "compress without a dictionary". */
+ size_t dictSize;
+} dictInfo;
+
+
+/*-*************************************
+* Dictionary related operations
+***************************************/
+/** createDictFromFiles() :
+ *  Train a dictionary with the algorithm selected by the first non-NULL
+ *  parameter set, tested in this order : random, cover, legacy, fastCover.
+ *  For cover and fastCover, the optimizing trainer is used when either `k` or
+ *  `d` is zero; it receives the parameter struct by pointer (presumably so the
+ *  selected values can be reported back - see zdict.h).
+ *  When every parameter set is NULL, no training happens and the returned
+ *  dictInfo has dictSize == 0.
+ *  Exits the process through EXM_THROW if the dictionary buffer cannot be
+ *  allocated.
+ * @return a heap-allocated dictInfo owning the dictionary buffer, to be
+ *         released with freeDictInfo(); NULL if training fails or the
+ *         dictInfo itself cannot be allocated.
+ */
+dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize,
+                  ZDICT_random_params_t *randomParams, ZDICT_cover_params_t *coverParams,
+                  ZDICT_legacy_params_t *legacyParams, ZDICT_fastCover_params_t *fastParams) {
+    unsigned const displayLevel = randomParams ? randomParams->zParams.notificationLevel :
+                                  coverParams ? coverParams->zParams.notificationLevel :
+                                  legacyParams ? legacyParams->zParams.notificationLevel :
+                                  fastParams ? fastParams->zParams.notificationLevel :
+                                  DEFAULT_DISPLAYLEVEL;   /* no dict */
+    void* const dictBuffer = malloc(maxDictSize);
+    dictInfo* dInfo = NULL;
+    size_t dictSize = 0;   /* stays 0 when no parameter set is given */
+
+    /* Checks */
+    if (!dictBuffer) {
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+    }
+
+    /* Train with the selected algorithm */
+    if (randomParams) {
+        dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer,
+                                                info->samplesSizes, info->nbSamples, *randomParams);
+    } else if (coverParams) {
+        /* Run the optimize version if either k or d is not provided */
+        if (!coverParams->d || !coverParams->k) {
+            dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                           info->samplesSizes, info->nbSamples, coverParams);
+        } else {
+            dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                   info->samplesSizes, info->nbSamples, *coverParams);
+        }
+    } else if (legacyParams) {
+        dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer,
+                                                info->samplesSizes, info->nbSamples, *legacyParams);
+    } else if (fastParams) {
+        /* Run the optimize version if either k or d is not provided */
+        if (!fastParams->d || !fastParams->k) {
+            dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                               info->samplesSizes, info->nbSamples, fastParams);
+        } else {
+            dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                       info->samplesSizes, info->nbSamples, *fastParams);
+        }
+    }
+
+    if (ZDICT_isError(dictSize)) {
+        DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
+        free(dictBuffer);
+        return NULL;
+    }
+
+    /* Hand the buffer over to a dictInfo; on allocation failure the buffer
+     * must be released here because no one else owns it yet. */
+    dInfo = (dictInfo *)malloc(sizeof(*dInfo));
+    if (!dInfo) {
+        DISPLAYLEVEL(1, "not enough memory for dictInfo \n");
+        free(dictBuffer);
+        return NULL;
+    }
+    dInfo->dictBuffer = dictBuffer;
+    dInfo->dictSize = dictSize;
+    return dInfo;
+}
+
+
+/** compressWithDict() :
+ *  Compress every sample of `srcInfo` independently at `compressionLevel` and
+ *  compare the summed compressed size with the summed original size.
+ *  Samples are read back to back from srcInfo->srcBuffer, using
+ *  srcInfo->samplesSizes to locate each one. When dInfo->dictSize is non-zero
+ *  a CDict built from dInfo->dictBuffer is used; otherwise samples are
+ *  compressed without a dictionary.
+ *  Both totals are printed when `displayLevel` is 2 or more.
+ * @return original size / compressed size, or -1 if an allocation fails, a
+ *         compression call reports an error, or nothing was compressed.
+ */
+double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLevel, int displayLevel) {
+    /* Local variables */
+    size_t totalCompressedSize = 0;
+    size_t totalOriginalSize = 0;
+    const unsigned hasDict = dInfo->dictSize > 0 ? 1 : 0;
+    const BYTE* const samples = (const BYTE*)srcInfo->srcBuffer;
+    double cRatio = -1;   /* every failure path leaves through _cleanup with this value */
+    size_t dstCapacity;
+    int i;
+
+    /* Pointers */
+    ZSTD_CDict* cdict = NULL;
+    ZSTD_CCtx* cctx = NULL;
+    size_t* offsets = NULL;
+    void* dst = NULL;
+
+    /* Allocate dst with enough space to compress the maximum sized sample */
+    {   size_t maxSampleSize = 0;
+        for (i = 0; i < srcInfo->nbSamples; i++) {
+            maxSampleSize = MAX(srcInfo->samplesSizes[i], maxSampleSize);
+        }
+        dstCapacity = ZSTD_compressBound(maxSampleSize);
+        dst = malloc(dstCapacity);
+        if (!dst) goto _cleanup;
+    }
+
+    /* Calculate offset for each sample; checked before the first write */
+    offsets = (size_t*)malloc((srcInfo->nbSamples + 1) * sizeof(size_t));
+    if (!offsets) goto _cleanup;
+    offsets[0] = 0;
+    for (i = 1; i <= srcInfo->nbSamples; i++) {
+        offsets[i] = offsets[i - 1] + srcInfo->samplesSizes[i - 1];
+    }
+
+    /* Create the cctx */
+    cctx = ZSTD_createCCtx();
+    if (!cctx) goto _cleanup;
+
+    /* Create CDict if there's a dictionary stored on buffer */
+    if (hasDict) {
+        cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel);
+        if (!cdict) goto _cleanup;
+    }
+
+    /* Compress each sample and sum both original and compressed sizes */
+    for (i = 0; i < srcInfo->nbSamples; i++) {
+        size_t const sampleSize = srcInfo->samplesSizes[i];
+        size_t compressedSize;
+        if (hasDict) {
+            compressedSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, samples + offsets[i], sampleSize, cdict);
+        } else {
+            compressedSize = ZSTD_compressCCtx(cctx, dst, dstCapacity, samples + offsets[i], sampleSize, compressionLevel);
+        }
+        if (ZSTD_isError(compressedSize)) goto _cleanup;
+        totalCompressedSize += compressedSize;
+        totalOriginalSize += sampleSize;
+    }
+
+    /* Calculate compression ratio (%zu is the size_t conversion) */
+    DISPLAYLEVEL(2, "original size is %zu\n", totalOriginalSize);
+    DISPLAYLEVEL(2, "compressed size is %zu\n", totalCompressedSize);
+    if (totalCompressedSize == 0) goto _cleanup;   /* no sample : ratio is undefined */
+    cRatio = (double)totalOriginalSize / (double)totalCompressedSize;
+
+_cleanup:
+    free(dst);
+    free(offsets);
+    ZSTD_freeCCtx(cctx);
+    ZSTD_freeCDict(cdict);
+    return cRatio;
+}
+
+
+/** freeDictInfo() :
+ *  Release a dictInfo together with the dictionary buffer it refers to.
+ *  A NULL `info` is accepted and ignored.
+ */
+void freeDictInfo(dictInfo* info) {
+    if (info != NULL) {
+        /* dictBuffer is declared const void*, hence the cast; free(NULL) is a no-op */
+        free((void*)(info->dictBuffer));
+        free(info);
+    }
+}
+
+
+
+/*-********************************************************
+ * Benchmarking functions
+**********************************************************/
+/** benchmarkDictBuilder() :
+ *  Time one dictionary builder, then report the compression ratio obtained on
+ *  `srcInfo` with the dictionary it produced.
+ *  The builder is the first non-NULL parameter set among random, cover, legacy
+ *  and fastCover; with none of them the run is labelled "NODICT" and uses
+ *  DEFAULT_DISPLAYLEVEL and DEFAULT_CLEVEL.
+ * @return 0 when both training and compression succeed, 1 otherwise
+ */
+int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam,
+                        ZDICT_cover_params_t *coverParam, ZDICT_legacy_params_t *legacyParam,
+                        ZDICT_fastCover_params_t *fastParam) {
+    /* Settings of the selected builder; defaults apply to the no-dict run */
+    unsigned displayLevel = DEFAULT_DISPLAYLEVEL;
+    const char* name = "NODICT";
+    unsigned cLevel = DEFAULT_CLEVEL;
+    int status = 0;
+
+    if (randomParam) {
+        displayLevel = randomParam->zParams.notificationLevel;
+        name = "RANDOM";
+        cLevel = randomParam->zParams.compressionLevel;
+    } else if (coverParam) {
+        displayLevel = coverParam->zParams.notificationLevel;
+        name = "COVER";
+        cLevel = coverParam->zParams.compressionLevel;
+    } else if (legacyParam) {
+        displayLevel = legacyParam->zParams.notificationLevel;
+        name = "LEGACY";
+        cLevel = legacyParam->zParams.compressionLevel;
+    } else if (fastParam) {
+        displayLevel = fastParam->zParams.notificationLevel;
+        name = "FAST";
+        cLevel = fastParam->zParams.compressionLevel;
+    }
+
+    {   /* Measure the training time only */
+        const UTIL_time_t start = UTIL_getTime();
+        dictInfo* const trained = createDictFromFiles(srcInfo, maxDictSize, randomParam, coverParam, legacyParam, fastParam);
+        const U64 elapsedMicro = UTIL_clockSpanMicro(start);
+        const double elapsedSec = elapsedMicro / (double)SEC_TO_MICRO;
+
+        if (trained == NULL) {
+            DISPLAYLEVEL(1, "%s does not train successfully\n", name);
+            status = 1;
+        } else {
+            DISPLAYLEVEL(1, "%s took %f seconds to execute \n", name, elapsedSec);
+
+            {   /* A negative ratio is compressWithDict()'s failure marker */
+                const double ratio = compressWithDict(srcInfo, trained, cLevel, displayLevel);
+                if (ratio < 0) {
+                    DISPLAYLEVEL(1, "Compressing with %s dictionary does not work\n", name);
+                    status = 1;
+                } else {
+                    DISPLAYLEVEL(1, "Compression ratio with %s dictionary is %f\n", name, ratio);
+                }
+            }
+        }
+
+        freeDictInfo(trained);
+    }
+    return status;
+}
+
+
+
+/** main() :
+ *  Benchmark driver. Every argument must have the form `in=<path>`; anything
+ *  else is rejected. The collected paths are expanded with
+ *  UTIL_createFileList(), loaded through getSampleInfo(), and then each
+ *  builder is benchmarked in turn : no dictionary, random, legacy, cover
+ *  (optimizing run, then a run reusing the same parameter struct) and
+ *  fastCover for f in [15, 25) and accel in [1, 11) (one optimizing run, then
+ *  five runs reusing the same parameter struct).
+ *  The first failing benchmark stops the program.
+ * @return 0 when every benchmark succeeds, 1 otherwise
+ */
+int main(int argCount, const char* argv[])
+{
+    const int displayLevel = DEFAULT_DISPLAYLEVEL;
+    int result = 0;
+
+    /* Initialize arguments to default values */
+    const unsigned k = 200;
+    const unsigned cLevel = DEFAULT_CLEVEL;
+    const unsigned dictID = 0;
+    const unsigned maxDictSize = g_defaultMaxDictSize;
+    unsigned f;
+    unsigned accel;
+    unsigned run;
+    int argNb;
+
+    /* Table to store input files; at most one entry per argument */
+    const size_t tableCapacity = (argCount > 0) ? (size_t)argCount : 1;
+    const char** filenameTable = (const char**)malloc(tableCapacity * sizeof(const char*));
+    unsigned filenameIdx = 0;
+
+    char* fileNamesBuf = NULL;
+    unsigned fileNamesNb = 0;
+    const int followLinks = 0;
+    const char** extendedFileList = NULL;
+
+    sampleInfo* srcInfo = NULL;
+    const size_t blockSize = 0;
+    ZDICT_params_t zParams;
+
+    if (!filenameTable) {
+        DISPLAYLEVEL(1, "benchmark: not enough memory for file table\n");
+        return 1;
+    }
+
+    /* Parse arguments */
+    for (argNb = 1; argNb < argCount; argNb++) {
+        const char* argument = argv[argNb];
+        if (longCommandWArg(&argument, "in=")) {
+            filenameTable[filenameIdx] = argument;
+            filenameIdx++;
+            continue;
+        }
+        DISPLAYLEVEL(1, "benchmark: Incorrect parameters\n");
+        result = 1;
+        goto _cleanup;   /* releases filenameTable instead of leaking it */
+    }
+
+    /* Get the list of all files recursively (because followLinks==0)*/
+    extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf,
+                                           &fileNamesNb, followLinks);
+    if (extendedFileList) {
+        unsigned u;
+        for (u = 0; u < fileNamesNb; u++) {
+            DISPLAYLEVEL(4, "%u %s\n", u, extendedFileList[u]);
+        }
+        free((void*)filenameTable);
+        filenameTable = extendedFileList;
+        filenameIdx = fileNamesNb;
+    }
+
+    /* get sampleInfo */
+    srcInfo = getSampleInfo(filenameTable, filenameIdx, blockSize, maxDictSize, displayLevel);
+    if (!srcInfo) {
+        /* NOTE(review): getSampleInfo() is declared elsewhere; this guards
+         * against a NULL result in case it can return one - confirm. */
+        DISPLAYLEVEL(1, "benchmark: unable to load samples\n");
+        result = 1;
+        goto _cleanup;
+    }
+
+    /* set up zParams; zeroed first, like the cover and fastCover parameters */
+    memset(&zParams, 0, sizeof(zParams));
+    zParams.compressionLevel = cLevel;
+    zParams.notificationLevel = displayLevel;
+    zParams.dictID = dictID;
+
+    /* with no dict */
+    {
+        const int noDictResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, NULL);
+        if (noDictResult) {
+            result = 1;
+            goto _cleanup;
+        }
+    }
+
+    /* for random */
+    {
+        ZDICT_random_params_t randomParam;
+        int randomResult;
+        memset(&randomParam, 0, sizeof(randomParam));
+        randomParam.zParams = zParams;
+        randomParam.k = k;
+        randomResult = benchmarkDictBuilder(srcInfo, maxDictSize, &randomParam, NULL, NULL, NULL);
+        DISPLAYLEVEL(2, "k=%u\n", randomParam.k);
+        if (randomResult) {
+            result = 1;
+            goto _cleanup;
+        }
+    }
+
+    /* for legacy */
+    {
+        ZDICT_legacy_params_t legacyParam;
+        int legacyResult;
+        memset(&legacyParam, 0, sizeof(legacyParam));
+        legacyParam.zParams = zParams;
+        legacyParam.selectivityLevel = 9;
+        legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam, NULL);
+        DISPLAYLEVEL(2, "selectivityLevel=%u\n", legacyParam.selectivityLevel);
+        if (legacyResult) {
+            result = 1;
+            goto _cleanup;
+        }
+    }
+
+    /* for cover */
+    {
+        /* for cover (optimizing k and d) */
+        ZDICT_cover_params_t coverParam;
+        int coverOptResult;
+        int coverResult;
+        memset(&coverParam, 0, sizeof(coverParam));
+        coverParam.zParams = zParams;
+        coverParam.splitPoint = 1.0;
+        coverParam.steps = 40;
+        coverParam.nbThreads = 1;
+        coverOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL);
+        DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParam.k, coverParam.d, coverParam.steps, (unsigned)(coverParam.splitPoint * 100));
+        if (coverOptResult) {
+            result = 1;
+            goto _cleanup;
+        }
+
+        /* for cover (with k and d provided) : same struct as the run above */
+        coverResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL, NULL);
+        DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParam.k, coverParam.d, coverParam.steps, (unsigned)(coverParam.splitPoint * 100));
+        if (coverResult) {
+            result = 1;
+            goto _cleanup;
+        }
+    }
+
+    /* for fastCover */
+    for (f = 15; f < 25; f++) {
+        DISPLAYLEVEL(2, "current f is %u\n", f);
+        for (accel = 1; accel < 11; accel++) {
+            /* for fastCover (optimizing k and d) */
+            ZDICT_fastCover_params_t fastParam;
+            int fastOptResult;
+            DISPLAYLEVEL(2, "current accel is %u\n", accel);
+            memset(&fastParam, 0, sizeof(fastParam));
+            fastParam.zParams = zParams;
+            fastParam.f = f;
+            fastParam.steps = 40;
+            fastParam.nbThreads = 1;
+            fastParam.accel = accel;
+            fastOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+            DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100), fastParam.accel);
+            if (fastOptResult) {
+                result = 1;
+                goto _cleanup;
+            }
+
+            /* for fastCover (with k and d provided) : same struct, five runs */
+            for (run = 0; run < 5; run++) {
+                const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
+                DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100), fastParam.accel);
+                if (fastResult) {
+                    result = 1;
+                    goto _cleanup;
+                }
+            }
+        }
+    }
+
+    /* Free allocated memory */
+_cleanup:
+    /* filenameTable still designates the malloc'ed table unless it was
+     * replaced (and the original freed) by the expanded list above. */
+    if (filenameTable != extendedFileList) {
+        free((void*)filenameTable);
+    }
+    UTIL_freeFileList(extendedFileList, fileNamesBuf);
+    if (srcInfo) {
+        freeSampleInfo(srcInfo);
+    }
+    return result;
+}