11 files changed, 1444 insertions, 583 deletions
diff --git a/programs/Makefile b/programs/Makefile
index 1475cb6109163..0c920a87bcbd8 100644
--- a/programs/Makefile
+++ b/programs/Makefile
@@ -18,6 +18,19 @@
 
 ZSTDDIR = ../lib
 
+# Version numbers
+LIBVER_SRC := $(ZSTDDIR)/zstd.h
+LIBVER_MAJOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MAJOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
+LIBVER_MINOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MINOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
+LIBVER_PATCH_SCRIPT:=`sed -n '/define ZSTD_VERSION_RELEASE/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
+LIBVER_SCRIPT:= $(LIBVER_MAJOR_SCRIPT).$(LIBVER_MINOR_SCRIPT).$(LIBVER_PATCH_SCRIPT)
+LIBVER_MAJOR := $(shell echo $(LIBVER_MAJOR_SCRIPT))
+LIBVER_MINOR := $(shell echo $(LIBVER_MINOR_SCRIPT))
+LIBVER_PATCH := $(shell echo $(LIBVER_PATCH_SCRIPT))
+LIBVER  := $(shell echo $(LIBVER_SCRIPT))
+
+ZSTD_VERSION=$(LIBVER)
+
 ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version "), 1)
 ALIGN_LOOP = -falign-loops=32
 else
@@ -69,10 +82,23 @@ else
 EXT =
 endif
 
+VOID = /dev/null
+
+# thread detection
+NO_THREAD_MSG := ==> no threads, building without multithreading support
+HAVE_PTHREAD := $(shell printf '\#include <pthread.h>\nint main(void) { return 0; }' | $(CC) $(FLAGS) -o have_pthread$(EXT) -x c - -pthread 2> $(VOID) && rm have_pthread$(EXT) && echo 1 || echo 0)
+HAVE_THREAD := $(shell [ "$(HAVE_PTHREAD)" -eq "1" -o -n "$(filter Windows%,$(OS))" ] && echo 1 || echo 0)
+ifeq ($(HAVE_THREAD), 1)
+THREAD_MSG := ==> building with threading support
+THREAD_CPP := -DZSTD_MULTITHREAD
+THREAD_LD := -pthread
+else
+THREAD_MSG := $(NO_THREAD_MSG)
+endif
+
 # zlib detection
 NO_ZLIB_MSG := ==> no zlib, building zstd without .gz support
-VOID = /dev/null
-HAVE_ZLIB := $(shell printf '\#include <zlib.h>\nint main(){}' | $(CC) -o have_zlib -x c - -lz 2> $(VOID) && rm have_zlib$(EXT) && echo 1 || echo 0)
+HAVE_ZLIB := $(shell printf '\#include <zlib.h>\nint main(void) { return 0; }' | $(CC) $(FLAGS) -o have_zlib$(EXT) -x c - -lz 2> $(VOID) && rm have_zlib$(EXT) && echo 1 || echo 0)
 ifeq ($(HAVE_ZLIB), 1)
 ZLIB_MSG := ==> building zstd with .gz compression support
 ZLIBCPP = -DZSTD_GZCOMPRESS -DZSTD_GZDECOMPRESS
@@ -80,9 +106,10 @@ ZLIBLD = -lz
 else
 ZLIB_MSG := $(NO_ZLIB_MSG)
 endif
+
 # lzma detection
 NO_LZMA_MSG := ==> no liblzma, building zstd without .xz/.lzma support
-HAVE_LZMA := $(shell printf '\#include <lzma.h>\nint main(){}' | $(CC) -o have_lzma -x c - -llzma 2> $(VOID) && rm have_lzma$(EXT) && echo 1 || echo 0)
+HAVE_LZMA := $(shell printf '\#include <lzma.h>\nint main(void) { return 0; }' | $(CC) $(FLAGS) -o have_lzma$(EXT) -x c - -llzma 2> $(VOID) && rm have_lzma$(EXT) && echo 1 || echo 0)
 ifeq ($(HAVE_LZMA), 1)
 LZMA_MSG := ==> building zstd with .xz/.lzma compression support
 LZMACPP = -DZSTD_LZMACOMPRESS -DZSTD_LZMADECOMPRESS
@@ -91,6 +118,16 @@ else
 LZMA_MSG := $(NO_LZMA_MSG)
 endif
 
+# lz4 detection
+NO_LZ4_MSG := ==> no liblz4, building zstd without .lz4 support
+HAVE_LZ4 := $(shell printf '\#include <lz4frame.h>\n\#include <lz4.h>\nint main(void) { return 0; }' | $(CC) $(FLAGS) -o have_lz4$(EXT) -x c - -llz4 2> $(VOID) && rm have_lz4$(EXT) && echo 1 || echo 0)
+ifeq ($(HAVE_LZ4), 1)
+LZ4_MSG := ==> building zstd with .lz4 compression support
+LZ4CPP = -DZSTD_LZ4COMPRESS -DZSTD_LZ4DECOMPRESS
+LZ4LD = -llz4
+else
+LZ4_MSG := $(NO_LZ4_MSG)
+endif
 
 .PHONY: default all clean clean_decomp_o install uninstall generate_res
 
@@ -100,17 +137,20 @@ all: zstd
 
 $(ZSTDDECOMP_O): CFLAGS += $(ALIGN_LOOP)
 
-zstd : CPPFLAGS += $(ZLIBCPP)
-zstd : LDFLAGS += $(ZLIBLD)
-zstd : LZMA_MSG := $(NO_LZMA_MSG)
-zstd-nogz : ZLIB_MSG := $(NO_ZLIB_MSG)
-zstd-nogz : LZMA_MSG := $(NO_LZMA_MSG)
-xzstd : CPPFLAGS += $(ZLIBCPP) $(LZMACPP)
-xzstd : LDFLAGS += $(ZLIBLD) $(LZMALD)
-zstd zstd-nogz xzstd : CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
-zstd zstd-nogz xzstd : $(ZSTDLIB_OBJ) zstdcli.o fileio.o bench.o datagen.o dibio.o
+zstd xzstd zstd4 xzstd4 : CPPFLAGS += $(THREAD_CPP) $(ZLIBCPP)
+zstd xzstd zstd4 xzstd4 : LDFLAGS += $(THREAD_LD) $(ZLIBLD)
+xzstd xzstd4 : CPPFLAGS += $(LZMACPP)
+xzstd xzstd4 : LDFLAGS += $(LZMALD)
+zstd4 xzstd4 : CPPFLAGS += $(LZ4CPP)
+zstd4 xzstd4 : LDFLAGS += $(LZ4LD)
+zstd zstd4 : LZMA_MSG := - xz/lzma support is disabled
+zstd xzstd : LZ4_MSG := - lz4 support is disabled
+zstd xzstd zstd4 xzstd4 : CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
+zstd xzstd zstd4 xzstd4 : $(ZSTDLIB_FILES) zstdcli.o fileio.o bench.o datagen.o dibio.o
+	@echo "$(THREAD_MSG)"
 	@echo "$(ZLIB_MSG)"
 	@echo "$(LZMA_MSG)"
+	@echo "$(LZ4_MSG)"
 ifneq (,$(filter Windows%,$(OS)))
 	windres/generate_res.bat
 endif
@@ -126,10 +166,20 @@ ifneq (,$(filter Windows%,$(OS)))
 endif
 	$(CC) -m32 $(FLAGS) $^ $(RES32_FILE) -o $@$(EXT)
 
-
 zstd-nolegacy : clean_decomp_o
 	$(MAKE) zstd ZSTD_LEGACY_SUPPORT=0
 
+zstd-nomt : THREAD_CPP :=
+zstd-nomt : THREAD_LD :=
+zstd-nomt : THREAD_MSG := - multi-threading disabled
+zstd-nomt : zstd
+
+zstd-nogz : ZLIBCPP :=
+zstd-nogz : ZLIBLD :=
+zstd-nogz : ZLIB_MSG := - gzip support is disabled
+zstd-nogz : zstd
+
+
 zstd-pgo : MOREFLAGS = -fprofile-generate
 zstd-pgo : clean zstd
 	./zstd -b19i1 $(PROFILE_WITH)
@@ -142,22 +192,18 @@ zstd-pgo : clean zstd
 	$(RM) $(ZSTDDECOMP_O)
 	$(MAKE) zstd MOREFLAGS=-fprofile-use
 
-zstd-frugal: $(ZSTD_FILES) zstdcli.c fileio.c
+# minimal targets : only zstd compression and decompression. no bench. no legacy. zstd-small is zstd-frugal built with -Os -s (left unquoted : make keeps quotes literally, which would hand both flags to the compiler as one argument)
+zstd-small: CFLAGS = -Os -s
+zstd-frugal zstd-small: $(ZSTD_FILES) zstdcli.c fileio.c
 	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT $^ -o zstd$(EXT)
 
-zstd-small:
-	CFLAGS="-Os -s" $(MAKE) zstd-frugal
-
 zstd-decompress: $(ZSTDCOMMON_FILES) $(ZSTDDECOMP_FILES) zstdcli.c fileio.c
 	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT -DZSTD_NOCOMPRESS $^ -o $@$(EXT)
 
 zstd-compress: $(ZSTDCOMMON_FILES) $(ZSTDCOMP_FILES) zstdcli.c fileio.c
 	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT -DZSTD_NODECOMPRESS $^ -o $@$(EXT)
 
-zstdmt: CPPFLAGS += -DZSTD_MULTITHREAD
-ifeq (,$(filter Windows%,$(OS)))
-zstdmt: LDFLAGS += -lpthread
-endif
+# zstd is now built with Multi-threading by default
 zstdmt: zstd
 
 generate_res:
@@ -174,6 +220,19 @@ clean:
 clean_decomp_o:
 	@$(RM) $(ZSTDDECOMP_O)
 
+MD2ROFF = ronn
+MD2ROFF_FLAGS = --roff --warnings --manual="User Commands" --organization="zstd $(ZSTD_VERSION)"
+# zstd.1 is generated from zstd.1.md ; the sed filter drops roff comment lines (dot-backslash-quote) from the output
+zstd.1: zstd.1.md
+	cat $^ | $(MD2ROFF) $(MD2ROFF_FLAGS) | sed -n '/^\.\\\".*/!p' > $@
+
+man: zstd.1
+
+clean-man:
+	$(RM) zstd.1
+
+preview-man: clean-man man
+	man ./zstd.1
 
 #-----------------------------------------------------------------------------
 # make install is validated only for Linux, OSX, BSD, Hurd and Solaris targets
@@ -206,6 +265,7 @@ install: zstd
 	@$(INSTALL_PROGRAM) zstd $(DESTDIR)$(BINDIR)/zstd
 	@ln -sf zstd $(DESTDIR)$(BINDIR)/zstdcat
 	@ln -sf zstd $(DESTDIR)$(BINDIR)/unzstd
+	@ln -sf zstd $(DESTDIR)$(BINDIR)/zstdmt
 	@$(INSTALL_SCRIPT) zstdless $(DESTDIR)$(BINDIR)/zstdless
 	@$(INSTALL_SCRIPT) zstdgrep $(DESTDIR)$(BINDIR)/zstdgrep
 	@echo Installing man pages
diff --git a/programs/README.md b/programs/README.md
index 203fd7b49bcee..d7922a0969e4c 100644
--- a/programs/README.md
+++ b/programs/README.md
@@ -11,8 +11,29 @@ There are however other Makefile targets that create different variations of CLI
 - `zstd-decompress` : decompressor-only version of CLI; without dictionary builder, benchmark, and support for decompression of legacy zstd versions
 
 
+#### Compilation variables
+`zstd` tries to detect and use the following features automatically :
+
+- __HAVE_THREAD__ : multithreading is automatically enabled when `pthread` is detected.
+  Multithreading support can be disabled either by building the `zstd-nomt` target or by setting the variable `HAVE_THREAD=0`.
+  Example : `make zstd HAVE_THREAD=0`
+  It's also possible to force compilation with multithreading support, using `HAVE_THREAD=1`.
+  In that case, the linking stage will fail if the `pthread` library cannot be found.
+  This can be useful to prevent the feature from being silently disabled.
+
+- __HAVE_ZLIB__ : `zstd` can compress and decompress files in `.gz` format.
+  This is done through the command `--format=gzip`.
+  Alternatively, symlinks named `gzip` or `gunzip` will mimic the intended behavior.
+  `.gz` support is automatically enabled when the `zlib` library is detected at build time.
+  `.gz` support can be disabled either by building the `zstd-nogz` target or by setting the variable `HAVE_ZLIB=0`.
+  Example : `make zstd HAVE_ZLIB=0`
+  It's also possible to force compilation with zlib support, using `HAVE_ZLIB=1`.
+  In that case, the linking stage will fail if the `zlib` library cannot be found.
+  This can be useful to prevent the feature from being silently disabled.
+
+
 #### Aggregation of parameters
-CLI supports aggregation of parameters i.e. `-b1`, `-e18`, and `-i1` can be joined into `-b1e18i1`. 
+CLI supports aggregation of parameters i.e. `-b1`, `-e18`, and `-i1` can be joined into `-b1e18i1`.
 
 
 #### Dictionary builder in Command Line Interface
@@ -23,7 +44,7 @@ which can be loaded before compression and decompression.
 
 Using a dictionary, the compression ratio achievable on small data improves dramatically.
 These compression gains are achieved while simultaneously providing faster compression and decompression speeds.
-Dictionary work if there is some correlation in a family of small data (there is no universal dictionary). 
+Dictionaries work if there is some correlation in a family of small data (there is no universal dictionary).
 Hence, deploying one dictionary per type of data will provide the greater benefits.
 Dictionary gains are mostly effective in the first few KB. Then, the compression algorithm
 will rely more and more on previously decoded content to compress the rest of the file.
@@ -35,7 +56,6 @@ Usage of the dictionary builder and created dictionaries with CLI:
 3. Decompress with the dictionary: `zstd --decompress FILE.zst -D dictionaryName`
 
 
-
 #### Benchmark in Command Line Interface
 CLI includes in-memory compression benchmark module for zstd.
 The benchmark is conducted using given filenames. The files are read into memory and joined together.
@@ -48,7 +68,6 @@ One can select compression levels starting from `-b` and ending with `-e`.
 The `-i` parameter selects minimal time used for each of tested levels.
 
 
-
 #### Usage of Command Line Interface
 The full list of options can be obtained with `-h` or `-H` parameter:
 ```
@@ -62,33 +81,40 @@ Arguments :
  -d     : decompression
  -D file: use `file` as Dictionary
  -o file: result stored into `file` (only if 1 input file)
- -f     : overwrite output without prompting
+ -f     : overwrite output without prompting and (de)compress links
 --rm    : remove source file(s) after successful de/compression
  -k     : preserve source file(s) (default)
  -h/-H  : display help/long help and exit
 
 Advanced arguments :
  -V     : display Version number and exit
- -v     : verbose mode; specify multiple times to increase log level (default:2)
+ -v     : verbose mode; specify multiple times to increase verbosity
  -q     : suppress warnings; specify twice to suppress errors too
  -c     : force write to standard output, even if it is the console
- -r     : operate recursively on directories
 --ultra : enable levels beyond 19, up to 22 (requires more memory)
+ -T#    : use # threads for compression (default:1)
+ -B#    : select size of each job (default:0==automatic)
 --no-dictID : don't write dictID into header (dictionary compression)
 --[no-]check : integrity check (default:enabled)
+ -r     : operate recursively on directories
+--format=gzip : compress files to the .gz format
 --test  : test compressed file integrity
---[no-]sparse : sparse mode (default:enabled on file, disabled on stdout)
+--[no-]sparse : sparse mode (default:disabled)
+ -M#    : Set a memory usage limit for decompression
+--      : All arguments after "--" are treated as files
 
 Dictionary builder :
 --train ## : create a dictionary from a training set of files
+--train-cover[=k=#,d=#,steps=#] : use the cover algorithm with optional args
+--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: 9)
  -o file : `file` is dictionary name (default: dictionary)
---maxdict ## : limit dictionary to specified size (default : 112640)
- -s#    : dictionary selectivity level (default: 9)
---dictID ## : force dictionary ID to specified value (default: random)
+--maxdict=# : limit dictionary to specified size (default : 112640)
+--dictID=# : force dictionary ID to specified value (default: random)
 
 Benchmark arguments :
  -b#    : benchmark file(s), using # compression level (default : 1)
  -e#    : test all compression levels from -bX to # (default: 1)
  -i#    : minimum evaluation time in seconds (default : 3s)
  -B#    : cut file into independent blocks of size # (default: no block)
- ```
-\ No newline at end of file
+--priority=rt : set process priority to real-time
+```
diff --git a/programs/bench.c b/programs/bench.c
index 2dd1cfb0fab10..22b871952b8e4 100644
--- a/programs/bench.c
+++ b/programs/bench.c
@@ -70,12 +70,12 @@ static U32 g_compressibilityDefault = 50;
 ***************************************/
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
 #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
-static U32 g_displayLevel = 2;   /* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : + progression;   4 : + information */
+static int g_displayLevel = 2;   /* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : + progression;   4 : + information */
 
 #define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
             if ((clock() - g_time > refreshRate) || (g_displayLevel>=4)) \
             { g_time = clock(); DISPLAY(__VA_ARGS__); \
-            if (g_displayLevel>=4) fflush(stdout); } }
+            if (g_displayLevel>=4) fflush(stderr); } }
 static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
 static clock_t g_time = 0;
 
@@ -89,7 +89,7 @@ static clock_t g_time = 0;
 #define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
 #define EXM_THROW(error, ...)                                             \
 {                                                                         \
-    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__); \
     DISPLAYLEVEL(1, "Error %i : ", error);                                \
     DISPLAYLEVEL(1, __VA_ARGS__);                                         \
     DISPLAYLEVEL(1, " \n");                                               \
@@ -146,17 +146,20 @@ typedef struct {
 } blockParam_t;
 
 
-#define MIN(a,b) ((a)<(b) ? (a) : (b))
-#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
+#undef MIN
+#undef MAX
+#define MIN(a,b)    ((a) < (b) ? (a) : (b))
+#define MAX(a,b)    ((a) > (b) ? (a) : (b))
 
 static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
                         const char* displayName, int cLevel,
                         const size_t* fileSizes, U32 nbFiles,
                         const void* dictBuffer, size_t dictBufferSize,
-                        ZSTD_compressionParameters *comprParams)
+                        const ZSTD_compressionParameters* comprParams)
 {
     size_t const blockSize = ((g_blockSize>=32 && !g_decodeOnly) ? g_blockSize : srcSize) + (!srcSize) /* avoid div by 0 */ ;
-    size_t const avgSize = MIN(g_blockSize, (srcSize / nbFiles));
+    size_t const avgSize = MIN(blockSize, (srcSize / nbFiles));
     U32 const maxNbBlocks = (U32) ((srcSize + (blockSize-1)) / blockSize) + nbFiles;
     blockParam_t* const blockTable = (blockParam_t*) malloc(maxNbBlocks * sizeof(blockParam_t));
     size_t const maxCompressedSize = ZSTD_compressBound(srcSize) + (maxNbBlocks * 1024);   /* add some room for safety */
@@ -176,22 +179,21 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
         EXM_THROW(31, "allocation error : not enough memory");
 
     /* init */
-    if (strlen(displayName)>17) displayName += strlen(displayName)-17;   /* can only display 17 characters */
+    if (strlen(displayName)>17) displayName += strlen(displayName)-17;   /* display last 17 characters */
     UTIL_initTimer(&ticksPerSecond);
 
-    if (g_decodeOnly) {
-        const char* srcPtr = (const char*) srcBuffer;
-        U64 dSize64 = 0;
+    if (g_decodeOnly) {  /* benchmark only decompression : source must be already compressed */
+        const char* srcPtr = (const char*)srcBuffer;
+        U64 totalDSize64 = 0;
         U32 fileNb;
         for (fileNb=0; fileNb<nbFiles; fileNb++) {
             U64 const fSize64 = ZSTD_findDecompressedSize(srcPtr, fileSizes[fileNb]);
             if (fSize64==0) EXM_THROW(32, "Impossible to determine original size ");
-            dSize64 += fSize64;
+            totalDSize64 += fSize64;
             srcPtr += fileSizes[fileNb];
         }
-        {   size_t const decodedSize = (size_t)dSize64;
-            if (dSize64 > decodedSize) EXM_THROW(32, "original size is too large");
-            if (decodedSize==0) EXM_THROW(32, "Impossible to determine original size ");
+        {   size_t const decodedSize = (size_t)totalDSize64;
+            if (totalDSize64 > decodedSize) EXM_THROW(32, "original size is too large");   /* size_t overflow */
             free(resultBuffer);
             resultBuffer = malloc(decodedSize);
             if (!resultBuffer) EXM_THROW(33, "not enough memory");
@@ -260,12 +262,11 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
                 UTIL_getTime(&clockStart);
 
                 if (!cCompleted) {   /* still some time to do compression tests */
-                    ZSTD_parameters zparams = ZSTD_getParams(cLevel, avgSize, dictBufferSize);
                     ZSTD_customMem const cmem = { NULL, NULL, NULL };
-                    U64 clockLoop = g_nbSeconds ? TIMELOOP_MICROSEC : 1;
+                    U64 const clockLoop = g_nbSeconds ? TIMELOOP_MICROSEC : 1;
                     U32 nbLoops = 0;
-                    ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, 1, zparams, cmem);
-                    if (cdict==NULL) EXM_THROW(1, "ZSTD_createCDict_advanced() allocation failure");
+                    ZSTD_parameters zparams = ZSTD_getParams(cLevel, avgSize, dictBufferSize);
+                    ZSTD_CDict* cdict;
                     if (comprParams->windowLog) zparams.cParams.windowLog = comprParams->windowLog;
                     if (comprParams->chainLog) zparams.cParams.chainLog = comprParams->chainLog;
                     if (comprParams->hashLog) zparams.cParams.hashLog = comprParams->hashLog;
@@ -273,6 +274,8 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
                     if (comprParams->searchLength) zparams.cParams.searchLength = comprParams->searchLength;
                     if (comprParams->targetLength) zparams.cParams.targetLength = comprParams->targetLength;
                     if (comprParams->strategy) zparams.cParams.strategy = (ZSTD_strategy)(comprParams->strategy - 1);
+                    cdict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, 1, zparams.cParams, cmem);
+                    if (cdict==NULL) EXM_THROW(1, "ZSTD_createCDict_advanced() allocation failure");
                     do {
                         U32 blockNb;
                         size_t rSize;
diff --git a/programs/dibio.c b/programs/dibio.c
index 5ef202c8abad5..aac36425cf752 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -53,12 +53,12 @@ static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_
 ***************************************/
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
 #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
-static unsigned g_displayLevel = 0;   /* 0 : no display;   1: errors;   2: default;  4: full information */
+static int g_displayLevel = 0;   /* 0 : no display;   1: errors;   2: default;  4: full information */
 
 #define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
             if ((DIB_clockSpan(g_time) > refreshRate) || (g_displayLevel>=4)) \
             { g_time = clock(); DISPLAY(__VA_ARGS__); \
-            if (g_displayLevel>=4) fflush(stdout); } }
+            if (g_displayLevel>=4) fflush(stderr); } }
 static const clock_t refreshRate = CLOCKS_PER_SEC * 2 / 10;
 static clock_t g_time = 0;
 
@@ -89,7 +89,8 @@ unsigned DiB_isError(size_t errorCode) { return ERR_isError(errorCode); }
 
 const char* DiB_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
 
-#define MIN(a,b)   ( (a) < (b) ? (a) : (b) )
+#undef MIN
+#define MIN(a,b)    ((a) < (b) ? (a) : (b))
 
 
 /* ********************************************************
diff --git a/programs/fileio.c b/programs/fileio.c
index e6481f1fa726b..e188936b21f55 100644
--- a/programs/fileio.c
+++ b/programs/fileio.c
@@ -31,6 +31,11 @@
 #include <time.h>       /* clock */
 #include <errno.h>      /* errno */
 
+#if defined (_MSC_VER)
+#  include <sys/stat.h>
+#  include <io.h>
+#endif
+
 #include "mem.h"
 #include "fileio.h"
 #define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_magicNumber, ZSTD_frameHeaderSize_max */
@@ -48,6 +53,12 @@
 #  include <lzma.h>
 #endif
 
+#define LZ4_MAGICNUMBER 0x184D2204
+#if defined(ZSTD_LZ4COMPRESS) || defined(ZSTD_LZ4DECOMPRESS)
+#  include <lz4frame.h>
+#  include <lz4.h>
+#endif
+
 
 /*-*************************************
 *  Constants
@@ -71,7 +82,7 @@
 
 #define CACHELINE 64
 
-#define MAX_DICT_SIZE (8 MB)   /* protection against large input (attack scenario) */
+#define DICTSIZE_MAX (32 MB)   /* protection against large input (attack scenario) */
 
 #define FNSPACE 30
 
@@ -81,18 +92,20 @@
 ***************************************/
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
 #define DISPLAYLEVEL(l, ...) { if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } }
-static U32 g_displayLevel = 2;   /* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : + progression;   4 : + information */
+static int g_displayLevel = 2;   /* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : + progression;   4 : + information */
 void FIO_setNotificationLevel(unsigned level) { g_displayLevel=level; }
 
 #define DISPLAYUPDATE(l, ...) { if (g_displayLevel>=l) { \
             if ((clock() - g_time > refreshRate) || (g_displayLevel>=4)) \
             { g_time = clock(); DISPLAY(__VA_ARGS__); \
-            if (g_displayLevel>=4) fflush(stdout); } } }
+            if (g_displayLevel>=4) fflush(stderr); } } }
 static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
 static clock_t g_time = 0;
 
+#undef MIN
 #define MIN(a,b)    ((a) < (b) ? (a) : (b))
 
+
 /* ************************************************************
 * Avoid fseek()'s 2GiB barrier with MSVC, MacOS, *BSD, MinGW
 ***************************************************************/
@@ -188,6 +201,18 @@ void FIO_setOverlapLog(unsigned overlapLog){
 /*-*************************************
 *  Functions
 ***************************************/
+/** FIO_remove() :
+ * @result : return value of remove(`path`) ; on Windows, `path` is first made writable so that read-only files can be unlinked too */
+static int FIO_remove(const char* path)
+{
+#if defined(_WIN32) || defined(WIN32)
+    /* Windows does not allow removing read-only files : try to make the file writable first.
+     * The result of chmod() is not checked ; remove() below reports the final outcome. */
+    chmod(path, _S_IWRITE);
+#endif
+    return remove(path);
+}
+
 /** FIO_openSrcFile() :
  * condition : `dstFileName` must be non-NULL.
  * @result : FILE* to `dstFileName`, or NULL if it fails */
@@ -227,23 +252,32 @@ static FILE* FIO_openDstFile(const char* dstFileName)
             DISPLAYLEVEL(4, "Sparse File Support is automatically disabled on stdout ; try --sparse \n");
         }
     } else {
-        if (!g_overwrite && strcmp (dstFileName, nulmark)) {  /* Check if destination file already exists */
+        if (g_sparseFileSupport == 1) {
+            g_sparseFileSupport = ZSTD_SPARSE_DEFAULT;
+        }
+        if (strcmp (dstFileName, nulmark)) {  /* Check if destination file already exists */
             f = fopen( dstFileName, "rb" );
             if (f != 0) {  /* dest file exists, prompt for overwrite authorization */
                 fclose(f);
-                if (g_displayLevel <= 1) {
-                    /* No interaction possible */
-                    DISPLAY("zstd: %s already exists; not overwritten  \n", dstFileName);
-                    return NULL;
-                }
-                DISPLAY("zstd: %s already exists; do you wish to overwrite (y/N) ? ", dstFileName);
-                {   int ch = getchar();
-                    if ((ch!='Y') && (ch!='y')) {
-                        DISPLAY("    not overwritten  \n");
+                if (!g_overwrite) {
+                    if (g_displayLevel <= 1) {
+                        /* No interaction possible */
+                        DISPLAY("zstd: %s already exists; not overwritten  \n", dstFileName);
                         return NULL;
                     }
-                    while ((ch!=EOF) && (ch!='\n')) ch = getchar();  /* flush rest of input line */
-        }   }   }
+                    DISPLAY("zstd: %s already exists; do you wish to overwrite (y/N) ? ", dstFileName);
+                    {   int ch = getchar();
+                        if ((ch!='Y') && (ch!='y')) {
+                            DISPLAY("    not overwritten  \n");
+                            return NULL;
+                        }
+                        while ((ch!=EOF) && (ch!='\n')) ch = getchar();  /* flush rest of input line */
+                    }
+                }
+
+                /* need to unlink */
+                FIO_remove(dstFileName);
+        }   }
         f = fopen( dstFileName, "wb" );
         if (f==NULL) DISPLAYLEVEL(1, "zstd: %s: %s\n", dstFileName, strerror(errno));
     }
@@ -252,13 +286,13 @@ static FILE* FIO_openDstFile(const char* dstFileName)
 }
 
 
-/*! FIO_loadFile() :
-*   creates a buffer, pointed by `*bufferPtr`,
-*   loads `filename` content into it,
-*   up to MAX_DICT_SIZE bytes.
-*   @return : loaded size
-*/
-static size_t FIO_loadFile(void** bufferPtr, const char* fileName)
+/*! FIO_createDictBuffer() :
+ *  creates a buffer, pointed by `*bufferPtr`,
+ *  loads `filename` content into it, up to DICTSIZE_MAX bytes.
+ *  @return : loaded size
+ *  if fileName==NULL, returns 0 and a NULL pointer
+ */
+static size_t FIO_createDictBuffer(void** bufferPtr, const char* fileName)
 {
     FILE* fileHandle;
     U64 fileSize;
@@ -270,14 +304,7 @@ static size_t FIO_loadFile(void** bufferPtr, const char* fileName)
     fileHandle = fopen(fileName, "rb");
     if (fileHandle==0) EXM_THROW(31, "zstd: %s: %s", fileName, strerror(errno));
     fileSize = UTIL_getFileSize(fileName);
-    if (fileSize > MAX_DICT_SIZE) {
-        int seekResult;
-        if (fileSize > 1 GB) EXM_THROW(32, "Dictionary file %s is too large", fileName);   /* avoid extreme cases */
-        DISPLAYLEVEL(2,"Dictionary %s is too large : using last %u bytes only \n", fileName, (U32)MAX_DICT_SIZE);
-        seekResult = fseek(fileHandle, (long int)(fileSize-MAX_DICT_SIZE), SEEK_SET);   /* use end of file */
-        if (seekResult != 0) EXM_THROW(33, "zstd: %s: %s", fileName, strerror(errno));
-        fileSize = MAX_DICT_SIZE;
-    }
+    if (fileSize > DICTSIZE_MAX) EXM_THROW(32, "Dictionary file %s is too large (> %u MB)", fileName, DICTSIZE_MAX >> 20);   /* avoid extreme cases */
     *bufferPtr = malloc((size_t)fileSize);
     if (*bufferPtr==NULL) EXM_THROW(34, "zstd: %s", strerror(errno));
     { size_t const readSize = fread(*bufferPtr, 1, (size_t)fileSize, fileHandle);
@@ -330,7 +357,7 @@ static cRess_t FIO_createCResources(const char* dictFileName, int cLevel,
 
     /* dictionary */
     {   void* dictBuffer;
-        size_t const dictBuffSize = FIO_loadFile(&dictBuffer, dictFileName);
+        size_t const dictBuffSize = FIO_createDictBuffer(&dictBuffer, dictFileName);   /* works with dictFileName==NULL */
         if (dictFileName && (dictBuffer==NULL)) EXM_THROW(32, "zstd: allocation error : can't create dictBuffer");
         {   ZSTD_parameters params = ZSTD_getParams(cLevel, srcSize, dictBuffSize);
             params.fParams.contentSizeFlag = srcRegFile;
@@ -342,7 +369,7 @@ static cRess_t FIO_createCResources(const char* dictFileName, int cLevel,
             if (comprParams->searchLog) params.cParams.searchLog = comprParams->searchLog;
             if (comprParams->searchLength) params.cParams.searchLength = comprParams->searchLength;
             if (comprParams->targetLength) params.cParams.targetLength = comprParams->targetLength;
-            if (comprParams->strategy) params.cParams.strategy = (ZSTD_strategy)(comprParams->strategy - 1);
+            if (comprParams->strategy) params.cParams.strategy = (ZSTD_strategy)(comprParams->strategy - 1);   /* 0 means : do not change */
 #ifdef ZSTD_MULTITHREAD
             {   size_t const errorCode = ZSTDMT_initCStream_advanced(ress.cctx, dictBuffer, dictBuffSize, params, srcSize);
                 if (ZSTD_isError(errorCode)) EXM_THROW(33, "Error initializing CStream : %s", ZSTD_getErrorName(errorCode));
@@ -494,6 +521,84 @@ static unsigned long long FIO_compressLzmaFrame(cRess_t* ress, const char* srcFi
 }
 #endif
 
+#ifdef ZSTD_LZ4COMPRESS
+static int FIO_LZ4_GetBlockSize_FromBlockId (int id) { return (1 << (8 + (2 * id))); }   /* block size in bytes : 2^(8 + 2*id) */
+static unsigned long long FIO_compressLz4Frame(cRess_t* ress, const char* srcFileName, U64 const srcFileSize, int compressionLevel, U64* readsize)
+{   /* compresses ress->srcFile into ress->dstFile as a single lz4 frame ; stores the number of bytes read into *readsize ; @return : number of bytes written */
+    unsigned long long inFileSize = 0, outFileSize = 0;   /* running totals : bytes read from source / bytes written to destination */
+
+    LZ4F_preferences_t prefs;
+    LZ4F_compressionContext_t ctx;
+
+    LZ4F_errorCode_t const errorCode = LZ4F_createCompressionContext(&ctx, LZ4F_VERSION);
+    if (LZ4F_isError(errorCode)) EXM_THROW(31, "zstd: failed to create lz4 compression context");
+
+    memset(&prefs, 0, sizeof(prefs));
+    /* when LZ4_VERSION_NUMBER <= 10600, the LZ4F_-prefixed names used below are mapped onto their unprefixed spellings */
+#if LZ4_VERSION_NUMBER <= 10600
+#define LZ4F_blockIndependent blockIndependent
+#define LZ4F_max4MB max4MB
+#endif
+
+    prefs.autoFlush = 1;
+    prefs.compressionLevel = compressionLevel;
+    prefs.frameInfo.blockMode = LZ4F_blockIndependent; /* stick to defaults for lz4 cli */
+    prefs.frameInfo.blockSizeID = LZ4F_max4MB;
+    prefs.frameInfo.contentChecksumFlag = (contentChecksum_t)g_checksumFlag;
+#if LZ4_VERSION_NUMBER >= 10600
+    prefs.frameInfo.contentSize = srcFileSize;
+#endif
+
+    {
+        size_t blockSize = FIO_LZ4_GetBlockSize_FromBlockId(LZ4F_max4MB);   /* NOTE(review): each fread() below asks for up to blockSize bytes into ress->srcBuffer, and each chunk is compressed into ress->dstBuffer -- confirm both buffers are large enough for this block size */
+        size_t readSize;
+        size_t headerSize = LZ4F_compressBegin(ctx, ress->dstBuffer, ress->dstBufferSize, &prefs);
+        if (LZ4F_isError(headerSize)) EXM_THROW(33, "File header generation failed : %s", LZ4F_getErrorName(headerSize));
+        { size_t const sizeCheck = fwrite(ress->dstBuffer, 1, headerSize, ress->dstFile);
+          if (sizeCheck!=headerSize) EXM_THROW(34, "Write error : cannot write header"); }
+        outFileSize += headerSize;
+
+        /* Read first block */
+        readSize  = fread(ress->srcBuffer, (size_t)1, (size_t)blockSize, ress->srcFile);
+        inFileSize += readSize;
+
+        /* Main Loop : one compress + write per chunk read, until fread() returns 0 */
+        while (readSize>0) {
+            size_t outSize;
+
+            /* Compress Block */
+            outSize = LZ4F_compressUpdate(ctx, ress->dstBuffer, ress->dstBufferSize, ress->srcBuffer, readSize, NULL);
+            if (LZ4F_isError(outSize)) EXM_THROW(35, "zstd: %s: lz4 compression failed : %s", srcFileName, LZ4F_getErrorName(outSize));
+            outFileSize += outSize;
+            if (!srcFileSize) DISPLAYUPDATE(2, "\rRead : %u MB ==> %.2f%%", (U32)(inFileSize>>20), (double)outFileSize/inFileSize*100)
+            else DISPLAYUPDATE(2, "\rRead : %u / %u MB ==> %.2f%%", (U32)(inFileSize>>20), (U32)(srcFileSize>>20), (double)outFileSize/inFileSize*100);
+
+            /* Write Block */
+            { size_t const sizeCheck = fwrite(ress->dstBuffer, 1, outSize, ress->dstFile);
+              if (sizeCheck!=outSize) EXM_THROW(36, "Write error : cannot write compressed block"); }
+
+            /* Read next block */
+            readSize  = fread(ress->srcBuffer, (size_t)1, (size_t)blockSize, ress->srcFile);
+            inFileSize += readSize;
+        }
+        if (ferror(ress->srcFile)) EXM_THROW(37, "Error reading %s ", srcFileName);   /* a zero-length read ends the loop on end-of-file and on error alike : tell them apart here */
+
+        /* End of Stream mark (headerSize is reused to hold the size of the data produced by LZ4F_compressEnd()) */
+        headerSize = LZ4F_compressEnd(ctx, ress->dstBuffer, ress->dstBufferSize, NULL);
+        if (LZ4F_isError(headerSize)) EXM_THROW(38, "zstd: %s: lz4 end of file generation failed : %s", srcFileName, LZ4F_getErrorName(headerSize));
+
+        { size_t const sizeCheck = fwrite(ress->dstBuffer, 1, headerSize, ress->dstFile);
+          if (sizeCheck!=headerSize) EXM_THROW(39, "Write error : cannot write end of stream"); }
+        outFileSize += headerSize;
+    }
+
+    *readsize = inFileSize;
+    LZ4F_freeCompressionContext(ctx);
+
+    return outFileSize;
+}
+#endif
+
 
 /*! FIO_compressFilename_internal() :
  *  same as FIO_compressFilename_extRess(), with `ress.desFile` already opened.
@@ -512,6 +617,7 @@ static int FIO_compressFilename_internal(cRess_t ress,
     switch (g_compressionType) {
         case FIO_zstdCompression:
             break;
+
         case FIO_gzipCompression:
 #ifdef ZSTD_GZCOMPRESS
             compressedfilesize = FIO_compressGzFrame(&ress, srcFileName, fileSize, compressionLevel, &readsize);
@@ -520,6 +626,7 @@ static int FIO_compressFilename_internal(cRess_t ress,
             EXM_THROW(20, "zstd: %s: file cannot be compressed as gzip (zstd compiled without ZSTD_GZCOMPRESS) -- ignored \n", srcFileName);
 #endif
             goto finish;
+
         case FIO_xzCompression:
         case FIO_lzmaCompression:
 #ifdef ZSTD_LZMACOMPRESS
@@ -529,6 +636,15 @@ static int FIO_compressFilename_internal(cRess_t ress,
             EXM_THROW(20, "zstd: %s: file cannot be compressed as xz/lzma (zstd compiled without ZSTD_LZMACOMPRESS) -- ignored \n", srcFileName);
 #endif
             goto finish;
+
+        case FIO_lz4Compression:
+#ifdef ZSTD_LZ4COMPRESS
+            compressedfilesize = FIO_compressLz4Frame(&ress, srcFileName, fileSize, compressionLevel, &readsize);
+#else
+            (void)compressionLevel;
+            EXM_THROW(20, "zstd: %s: file cannot be compressed as lz4 (zstd compiled without ZSTD_LZ4COMPRESS) -- ignored \n", srcFileName);
+#endif
+            goto finish;
     }
 
     /* init */
@@ -548,8 +664,8 @@ static int FIO_compressFilename_internal(cRess_t ress,
         readsize += inSize;
 
         {   ZSTD_inBuffer  inBuff = { ress.srcBuffer, inSize, 0 };
-            while (inBuff.pos != inBuff.size) {   /* note : is there any possibility of endless loop ? for example, if outBuff is not large enough ? */
-                ZSTD_outBuffer outBuff= { ress.dstBuffer, ress.dstBufferSize, 0 };
+            while (inBuff.pos != inBuff.size) {
+                ZSTD_outBuffer outBuff = { ress.dstBuffer, ress.dstBufferSize, 0 };
 #ifdef ZSTD_MULTITHREAD
                 size_t const result = ZSTDMT_compressStream(ress.cctx, &outBuff, &inBuff);
 #else
@@ -563,13 +679,13 @@ static int FIO_compressFilename_internal(cRess_t ress,
                     if (sizeCheck!=outBuff.pos) EXM_THROW(25, "Write error : cannot write compressed block into %s", dstFileName);
                     compressedfilesize += outBuff.pos;
         }   }   }
-#ifdef ZSTD_MULTITHREAD
-        if (!fileSize) DISPLAYUPDATE(2, "\rRead : %u MB", (U32)(readsize>>20))
-        else DISPLAYUPDATE(2, "\rRead : %u / %u MB", (U32)(readsize>>20), (U32)(fileSize>>20));
-#else
-        if (!fileSize) DISPLAYUPDATE(2, "\rRead : %u MB ==> %.2f%%", (U32)(readsize>>20), (double)compressedfilesize/readsize*100)
-        else DISPLAYUPDATE(2, "\rRead : %u / %u MB ==> %.2f%%", (U32)(readsize>>20), (U32)(fileSize>>20), (double)compressedfilesize/readsize*100);
-#endif
+        if (g_nbThreads > 1) {
+            if (!fileSize) DISPLAYUPDATE(2, "\rRead : %u MB", (U32)(readsize>>20))
+            else DISPLAYUPDATE(2, "\rRead : %u / %u MB", (U32)(readsize>>20), (U32)(fileSize>>20));
+        } else {
+            if (!fileSize) DISPLAYUPDATE(2, "\rRead : %u MB ==> %.2f%%", (U32)(readsize>>20), (double)compressedfilesize/readsize*100)
+            else DISPLAYUPDATE(2, "\rRead : %u / %u MB ==> %.2f%%", (U32)(readsize>>20), (U32)(fileSize>>20), (double)compressedfilesize/readsize*100);
+        }
     }
 
     /* End of Frame */
@@ -750,7 +866,7 @@ static dRess_t FIO_createDResources(const char* dictFileName)
 
     /* dictionary */
     {   void* dictBuffer;
-        size_t const dictBufferSize = FIO_loadFile(&dictBuffer, dictFileName);
+        size_t const dictBufferSize = FIO_createDictBuffer(&dictBuffer, dictFileName);
         size_t const initError = ZSTD_initDStream_usingDict(ress.dctx, dictBuffer, dictBufferSize);
         if (ZSTD_isError(initError)) EXM_THROW(61, "ZSTD_initDStream_usingDict error : %s", ZSTD_getErrorName(initError));
         free(dictBuffer);
@@ -1019,6 +1135,66 @@ static unsigned long long FIO_decompressLzmaFrame(dRess_t* ress, FILE* srcFile,
 }
 #endif
 
+#ifdef ZSTD_LZ4DECOMPRESS
+static unsigned long long FIO_decompressLz4Frame(dRess_t* ress, FILE* srcFile, const char* srcFileName)
+{   /* Decodes lz4 frame data read from srcFile into ress->dstFile ; returns the number of decoded bytes written */
+    unsigned long long filesize = 0;
+    LZ4F_errorCode_t nextToLoad;   /* latest LZ4F_decompress() result : used below as the next read size, 0 ends the loop */
+    LZ4F_decompressionContext_t dCtx;
+    LZ4F_errorCode_t const errorCode = LZ4F_createDecompressionContext(&dCtx, LZ4F_VERSION);
+
+    if (LZ4F_isError(errorCode)) EXM_THROW(61, "zstd: failed to create lz4 decompression context");
+
+    /* Init feed with magic number (already consumed from FILE* srcFile) */
+    {   size_t inSize = 4;
+        size_t outSize= 0;
+        MEM_writeLE32(ress->srcBuffer, LZ4_MAGICNUMBER);
+        nextToLoad = LZ4F_decompress(dCtx, ress->dstBuffer, &outSize, ress->srcBuffer, &inSize, NULL);
+        if (LZ4F_isError(nextToLoad)) EXM_THROW(62, "zstd: %s: lz4 header error : %s", srcFileName, LZ4F_getErrorName(nextToLoad));
+    }
+
+    /* Main Loop */
+    for (;nextToLoad;) {
+        size_t readSize;
+        size_t pos = 0;   /* offset of the first not-yet-decoded byte within srcBuffer */
+        size_t decodedBytes = ress->dstBufferSize;
+
+        /* Read input */
+        if (nextToLoad > ress->srcBufferSize) nextToLoad = ress->srcBufferSize;   /* never read more than srcBuffer can hold */
+        readSize = fread(ress->srcBuffer, 1, nextToLoad, srcFile);
+        if (!readSize) break;   /* reached end of file or stream */
+
+        while ((pos < readSize) || (decodedBytes == ress->dstBufferSize)) {  /* still to read, or still to flush */
+            /* Decode Input (at least partially) */
+            size_t remaining = readSize - pos;   /* in : bytes available ; after the call : added to pos as bytes consumed */
+            decodedBytes = ress->dstBufferSize;   /* in : dstBuffer capacity ; after the call : bytes to write out */
+            nextToLoad = LZ4F_decompress(dCtx, ress->dstBuffer, &decodedBytes, (char*)(ress->srcBuffer)+pos, &remaining, NULL);
+            if (LZ4F_isError(nextToLoad)) EXM_THROW(66, "zstd: %s: decompression error : %s", srcFileName, LZ4F_getErrorName(nextToLoad));
+            pos += remaining;
+
+            /* Write Block */
+            if (decodedBytes) {
+                if (fwrite(ress->dstBuffer, 1, decodedBytes, ress->dstFile) != decodedBytes) EXM_THROW(63, "Write error : cannot write to output file");
+                filesize += decodedBytes;
+                DISPLAYUPDATE(2, "\rDecompressed : %u MB  ", (unsigned)(filesize>>20));
+            }
+
+            if (!nextToLoad) break;   /* decoder asks for nothing more : leave the inner loop, the outer loop then stops too */
+        }
+    }
+    /* can be out because readSize == 0, which could be an fread() error */
+    if (ferror(srcFile)) EXM_THROW(67, "zstd: %s: read error", srcFileName);
+
+    if (nextToLoad!=0) EXM_THROW(68, "zstd: %s: unfinished stream", srcFileName);   /* input ended while the decoder still expected data */
+
+    LZ4F_freeDecompressionContext(dCtx);
+    ress->srcBufferLoaded = 0; /* LZ4F will go to the frame boundary */
+
+    return filesize;
+}
+#endif
+
+
 
 /** FIO_decompressSrcFile() :
     Decompression `srcFileName` into `ress.dstFile`
@@ -1071,6 +1247,15 @@ static int FIO_decompressSrcFile(dRess_t ress, const char* dstFileName, const ch
             DISPLAYLEVEL(1, "zstd: %s: xz/lzma file cannot be uncompressed (zstd compiled without ZSTD_LZMADECOMPRESS) -- ignored \n", srcFileName);
             return 1;
 #endif
+        } else if (MEM_readLE32(buf) == LZ4_MAGICNUMBER) {
+#ifdef ZSTD_LZ4DECOMPRESS
+            unsigned long long const result = FIO_decompressLz4Frame(&ress, srcFile, srcFileName);
+            if (result == 0) return 1;
+            filesize += result;
+#else
+            DISPLAYLEVEL(1, "zstd: %s: lz4 file cannot be uncompressed (zstd compiled without ZSTD_LZ4DECOMPRESS) -- ignored \n", srcFileName);
+            return 1;
+#endif
         } else {
             if (!ZSTD_isFrame(ress.srcBuffer, toRead)) {
                 if ((g_overwrite) && !strcmp (dstFileName, stdoutmark)) {  /* pass-through mode */
@@ -1179,7 +1364,7 @@ int FIO_decompressMultipleFilenames(const char** srcNamesTable, unsigned nbFiles
                 dstFileName = (char*)malloc(dfnSize);
                 if (dstFileName==NULL) EXM_THROW(74, "not enough memory for dstFileName");
             }
-            if (sfnSize <= suffixSize || (strcmp(suffixPtr, GZ_EXTENSION) && strcmp(suffixPtr, XZ_EXTENSION) && strcmp(suffixPtr, ZSTD_EXTENSION) && strcmp(suffixPtr, LZMA_EXTENSION))) {
+            if (sfnSize <= suffixSize || (strcmp(suffixPtr, GZ_EXTENSION) && strcmp(suffixPtr, XZ_EXTENSION) && strcmp(suffixPtr, ZSTD_EXTENSION) && strcmp(suffixPtr, LZMA_EXTENSION) && strcmp(suffixPtr, LZ4_EXTENSION))) {
                 DISPLAYLEVEL(1, "zstd: %s: unknown suffix (%s/%s/%s/%s expected) -- ignored \n", srcFileName, GZ_EXTENSION, XZ_EXTENSION, ZSTD_EXTENSION, LZMA_EXTENSION);
                 skippedFiles++;
                 continue;
diff --git a/programs/fileio.h b/programs/fileio.h
index 0dd58d625d44a..65da98d7fa88a 100644
--- a/programs/fileio.h
+++ b/programs/fileio.h
@@ -33,12 +33,13 @@ extern "C" {
 #define XZ_EXTENSION    ".xz"
 #define GZ_EXTENSION    ".gz"
 #define ZSTD_EXTENSION  ".zst"
+#define LZ4_EXTENSION   ".lz4"
 
 
 /*-*************************************
 *  Types
 ***************************************/
-typedef enum { FIO_zstdCompression, FIO_gzipCompression, FIO_xzCompression, FIO_lzmaCompression } FIO_compressionType_t;
+typedef enum { FIO_zstdCompression, FIO_gzipCompression, FIO_xzCompression, FIO_lzmaCompression, FIO_lz4Compression } FIO_compressionType_t;   /* compressed format to produce ; FIO_zstdCompression is first, hence value 0 */
 
 
 /*-*************************************
diff --git a/programs/platform.h b/programs/platform.h
index 89a9f6cd42a01..74412cde332ea 100644
--- a/programs/platform.h
+++ b/programs/platform.h
@@ -100,9 +100,18 @@ extern "C" {
 #if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 1)) || (PLATFORM_POSIX_VERSION >= 200112L) || defined(__DJGPP__)
 #  include <unistd.h>   /* isatty */
 #  define IS_CONSOLE(stdStream) isatty(fileno(stdStream))
-#elif defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
+#elif defined(MSDOS) || defined(OS2) || defined(__CYGWIN__)
 #  include <io.h>       /* _isatty */
 #  define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
+#elif defined(WIN32) || defined(_WIN32)
+#  include <io.h>      /* _isatty, _get_osfhandle */
+#  include <windows.h> /* GetConsoleMode, DWORD, HANDLE */
+#  include <stdio.h>   /* FILE */
+static __inline int IS_CONSOLE(FILE* stdStream)
+{   /* non-zero only when _isatty() accepts the descriptor AND GetConsoleMode() succeeds on its OS handle */
+    DWORD dummy;   /* receives the console mode ; the value itself is never read */
+    return _isatty(_fileno(stdStream)) && GetConsoleMode((HANDLE)_get_osfhandle(_fileno(stdStream)), &dummy);
+}
 #else
 #  define IS_CONSOLE(stdStream) 0
 #endif
@@ -129,6 +138,14 @@ extern "C" {
 #endif
 
 
+#ifndef ZSTD_SPARSE_DEFAULT   /* a value already defined (e.g. by the build) takes precedence over the defaults below */
+#  if (defined(__APPLE__) && defined(__MACH__))
+#    define ZSTD_SPARSE_DEFAULT 0   /* Apple/Mach targets. NOTE(review): presumably the default for sparse-file output -- confirm where it is consumed */
+#  else
+#    define ZSTD_SPARSE_DEFAULT 1   /* every other target */
+#  endif
+#endif
+
 
 #if defined (__cplusplus)
 }
diff --git a/programs/util.h b/programs/util.h
index 59e19d027ccd2..5f437b2b268c4 100644
--- a/programs/util.h
+++ b/programs/util.h
@@ -1,6 +1,6 @@
 /**
  * util.h - utility functions
- * 
+ *
  * Copyright (c) 2016-present, Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
@@ -25,6 +25,7 @@ extern "C" {
 #include <stdlib.h>       /* malloc */
 #include <stddef.h>       /* size_t, ptrdiff_t */
 #include <stdio.h>        /* fprintf */
+#include <string.h>       /* strncmp */
 #include <sys/types.h>    /* stat, utime */
 #include <sys/stat.h>     /* stat */
 #if defined(_MSC_VER)
@@ -166,8 +167,8 @@ UTIL_STATIC void UTIL_waitForNextTick(UTIL_freq_t ticksPerSecond)
 *  File functions
 ******************************************/
 #if defined(_MSC_VER)
-	#define chmod _chmod
-	typedef struct __stat64 stat_t;
+    #define chmod _chmod
+    typedef struct __stat64 stat_t;
 #else
     typedef struct stat stat_t;
 #endif
@@ -178,9 +179,9 @@ UTIL_STATIC int UTIL_setFileStat(const char *filename, stat_t *statbuf)
     int res = 0;
     struct utimbuf timebuf;
 
-	timebuf.actime = time(NULL);
-	timebuf.modtime = statbuf->st_mtime;
-	res += utime(filename, &timebuf);  /* set access and modification times */
+    timebuf.actime = time(NULL);
+    timebuf.modtime = statbuf->st_mtime;
+    res += utime(filename, &timebuf);  /* set access and modification times */
 
 #if !defined(_WIN32)
     res += chown(filename, statbuf->st_uid, statbuf->st_gid);  /* Copy ownership */
@@ -228,6 +229,20 @@ UTIL_STATIC U32 UTIL_isDirectory(const char* infilename)
     return 0;
 }
 
+UTIL_STATIC U32 UTIL_isLink(const char* infilename)
+{
+    /* Tells whether infilename itself is a symbolic link :
+     * returns 1 when lstat() succeeds and reports S_ISLNK, 0 in every other case. */
+#if defined(_WIN32)
+    (void)infilename;   /* no symlinks on windows */
+    return 0;
+#else
+    stat_t linkStat;
+    if (lstat(infilename, &linkStat) != 0) return 0;   /* lstat() failed : not reported as a link */
+    return S_ISLNK(linkStat.st_mode) ? 1 : 0;
+#endif
+}
+
 
 UTIL_STATIC U64 UTIL_getFileSize(const char* infilename)
 {
@@ -271,11 +286,14 @@ UTIL_STATIC void *UTIL_realloc(void *ptr, size_t size)
     return NULL;
 }
 
+static int g_utilDisplayLevel;   /* threshold read by UTIL_DISPLAYLEVEL ; static in a header, so each including file gets its own zero-initialized copy */
+#define UTIL_DISPLAY(...)         fprintf(stderr, __VA_ARGS__)   /* printf-style message sent to stderr */
+#define UTIL_DISPLAYLEVEL(l, ...) { if (g_utilDisplayLevel>=l) { UTIL_DISPLAY(__VA_ARGS__); } }   /* prints only when g_utilDisplayLevel >= l */
 
 #ifdef _WIN32
 #  define UTIL_HAS_CREATEFILELIST
 
-UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd)
+UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks)
 {
     char* path;
     int dirLength, fnameLength, pathLength, nbFiles = 0;
@@ -311,7 +329,7 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_
             if (strcmp (cFile.cFileName, "..") == 0 ||
                 strcmp (cFile.cFileName, ".") == 0) continue;
 
-            nbFiles += UTIL_prepareFileList(path, bufStart, pos, bufEnd);  /* Recursively call "UTIL_prepareFileList" with the new path. */
+            nbFiles += UTIL_prepareFileList(path, bufStart, pos, bufEnd, followLinks);  /* Recursively call "UTIL_prepareFileList" with the new path. */
             if (*bufStart == NULL) { free(path); FindClose(hFile); return 0; }
         }
         else if ((cFile.dwFileAttributes & FILE_ATTRIBUTE_NORMAL) || (cFile.dwFileAttributes & FILE_ATTRIBUTE_ARCHIVE) || (cFile.dwFileAttributes & FILE_ATTRIBUTE_COMPRESSED)) {
@@ -339,7 +357,7 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_
 #  include <dirent.h>       /* opendir, readdir */
 #  include <string.h>       /* strerror, memcpy */
 
-UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd)
+UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks)
 {
     DIR *dir;
     struct dirent *entry;
@@ -360,13 +378,19 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_
         path = (char*) malloc(dirLength + fnameLength + 2);
         if (!path) { closedir(dir); return 0; }
         memcpy(path, dirName, dirLength);
+
         path[dirLength] = '/';
         memcpy(path+dirLength+1, entry->d_name, fnameLength);
         pathLength = dirLength+1+fnameLength;
         path[pathLength] = 0;
 
+        if (!followLinks && UTIL_isLink(path)) {
+            UTIL_DISPLAYLEVEL(2, "Warning : %s is a symbolic link, ignoring\n", path);
+            free(path); continue;   /* path is malloc'ed for each entry : release it before skipping the symlink */
+        }
+
         if (UTIL_isDirectory(path)) {
-            nbFiles += UTIL_prepareFileList(path, bufStart, pos, bufEnd);  /* Recursively call "UTIL_prepareFileList" with the new path. */
+            nbFiles += UTIL_prepareFileList(path, bufStart, pos, bufEnd, followLinks);  /* Recursively call "UTIL_prepareFileList" with the new path. */
             if (*bufStart == NULL) { free(path); closedir(dir); return 0; }
         } else {
             if (*bufStart + *pos + pathLength >= *bufEnd) {
@@ -396,7 +420,7 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_
 
 #else
 
-UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd)
+UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks)
 {
     (void)bufStart; (void)bufEnd; (void)pos;
     fprintf(stderr, "Directory %s ignored (compiled without _WIN32 or _POSIX_C_SOURCE)\n", dirName);
@@ -411,7 +435,7 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_
  * After finishing usage of the list the structures should be freed with UTIL_freeFileList(params: return value, allocatedBuffer)
  * In case of error UTIL_createFileList returns NULL and UTIL_freeFileList should not be called.
  */
-UTIL_STATIC const char** UTIL_createFileList(const char **inputNames, unsigned inputNamesNb, char** allocatedBuffer, unsigned* allocatedNamesNb)
+UTIL_STATIC const char** UTIL_createFileList(const char **inputNames, unsigned inputNamesNb, char** allocatedBuffer, unsigned* allocatedNamesNb, int followLinks)
 {
     size_t pos;
     unsigned i, nbFiles;
@@ -436,7 +460,7 @@ UTIL_STATIC const char** UTIL_createFileList(const char **inputNames, unsigned i
                 nbFiles++;
             }
         } else {
-            nbFiles += UTIL_prepareFileList(inputNames[i], &buf, &pos, &bufend);
+            nbFiles += UTIL_prepareFileList(inputNames[i], &buf, &pos, &bufend, followLinks);
             if (buf == NULL) return NULL;
     }   }
 
@@ -465,6 +489,201 @@ UTIL_STATIC void UTIL_freeFileList(const char** filenameTable, char* allocatedBu
     if (filenameTable) free((void*)filenameTable);
 }
 
+/* count the number of physical cores */
+#if defined(_WIN32) || defined(WIN32)
+
+#include <windows.h>
+
+typedef BOOL(WINAPI* LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
+
+UTIL_STATIC int UTIL_countPhysicalCores(void)
+{
+    static int numPhysicalCores = 0;   /* cached result : 0 means "not computed yet" */
+    if (numPhysicalCores != 0) return numPhysicalCores;
+    /* Count the RelationProcessorCore entries reported by GetLogicalProcessorInformation() */
+    {   LPFN_GLPI glpi;
+        BOOL done = FALSE;
+        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
+        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = NULL;
+        DWORD returnLength = 0;
+        size_t byteOffset = 0;
+
+        glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")),
+                                         "GetLogicalProcessorInformation");
+
+        if (glpi == NULL) {
+            goto failed;   /* function could not be resolved at run time */
+        }
+
+        while(!done) {   /* retry with a buffer of returnLength bytes until the call succeeds */
+            DWORD rc = glpi(buffer, &returnLength);
+            if (FALSE == rc) {
+                if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
+                    /* free(NULL) is a no-op : the previous, too small, buffer is released unconditionally */
+                    free(buffer);
+                    buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(returnLength);
+
+                    if (buffer == NULL) {
+                        perror("zstd");
+                        exit(1);
+                    }
+                } else {
+                    free(buffer);   /* some other error : do not leak a buffer allocated by an earlier iteration */
+                    goto failed;
+                }
+            } else {
+                done = TRUE;
+            }
+        }
+
+        ptr = buffer;
+
+        while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength) {
+
+            if (ptr->Relationship == RelationProcessorCore) {
+                numPhysicalCores++;
+            }
+
+            ptr++;
+            byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+        }
+
+        free(buffer);
+
+        return numPhysicalCores;
+    }
+
+failed:
+    /* try to fall back on GetSystemInfo */
+    {   SYSTEM_INFO sysinfo;
+        GetSystemInfo(&sysinfo);
+        numPhysicalCores = sysinfo.dwNumberOfProcessors;
+        if (numPhysicalCores == 0) numPhysicalCores = 1; /* just in case */
+    }
+    return numPhysicalCores;
+}
+
+#elif defined(__APPLE__)
+
+#include <sys/sysctl.h>
+
+/* Apple : query the "hw.physicalcpu" entry through sysctlbyname()
+ * see: man 3 sysctl */
+UTIL_STATIC int UTIL_countPhysicalCores(void)
+{
+    static S32 numPhysicalCores = 0;   /* cached across calls ; apple specifies int32_t */
+    size_t valueSize = sizeof(S32);
+
+    /* a non-zero cached value means the query already ran */
+    if (numPhysicalCores != 0) return numPhysicalCores;
+
+    if (sysctlbyname("hw.physicalcpu", &numPhysicalCores, &valueSize, NULL, 0) == 0)
+        return numPhysicalCores;
+
+    /* query failed : only a missing entry is tolerated */
+    if (errno != ENOENT) {
+        perror("zstd: can't get number of physical cpus");
+        exit(1);
+    }
+    /* entry not present, fall back on 1 */
+    numPhysicalCores = 1;
+    return numPhysicalCores;
+}
+
+#elif defined(__linux__)
+
+/* Linux : start from sysconf(_SC_NPROCESSORS_ONLN), then parse /proc/cpuinfo :
+ * siblings / cpu cores should give hyperthreading ratio, used to scale that count down ;
+ * otherwise fall back on the sysconf value (or on 1). Never returns less than 1. */
+UTIL_STATIC int UTIL_countPhysicalCores(void)
+{
+    static int numPhysicalCores = 0;   /* cached result : 0 means "not computed yet" */
+
+    if (numPhysicalCores != 0) return numPhysicalCores;
+
+    numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN);
+    if (numPhysicalCores == -1) {
+        /* value not queryable, fall back on 1 */
+        return numPhysicalCores = 1;
+    }
+
+    /* try to determine if there's hyperthreading */
+    {   FILE* const cpuinfo = fopen("/proc/cpuinfo", "r");
+        /* plain fixed-size array : sizing it from a `size_t const` local would make it a VLA in C */
+        char buff[80];
+        int siblings = 0;
+        int cpu_cores = 0;
+        int ratio = 1;
+
+        if (cpuinfo == NULL) {
+            /* fall back on the sysconf value */
+            return numPhysicalCores;
+        }
+
+        /* assume the cpu cores/siblings values will be constant across all
+         * present processors */
+        while (!feof(cpuinfo)) {
+            if (fgets(buff, (int)sizeof(buff), cpuinfo) != NULL) {
+                if (strncmp(buff, "siblings", 8) == 0) {
+                    const char* const sep = strchr(buff, ':');
+                    if (sep == NULL || *sep == '\0') {
+                        /* strchr() returns NULL when ':' is missing : formatting was broken */
+                        goto failed;
+                    }
+
+                    siblings = atoi(sep + 1);
+                }
+                if (strncmp(buff, "cpu cores", 9) == 0) {
+                    const char* const sep = strchr(buff, ':');
+                    if (sep == NULL || *sep == '\0') {
+                        /* strchr() returns NULL when ':' is missing : formatting was broken */
+                        goto failed;
+                    }
+
+                    cpu_cores = atoi(sep + 1);
+                }
+            } else if (ferror(cpuinfo)) {
+                /* fall back on the sysconf value */
+                goto failed;
+            }
+        }
+        if (siblings > 0 && cpu_cores > 0 && siblings >= cpu_cores) {   /* keeps ratio >= 1 : no division by zero below */
+            ratio = siblings / cpu_cores;
+        }
+failed:
+        fclose(cpuinfo);
+        numPhysicalCores /= ratio;
+        return numPhysicalCores = (numPhysicalCores < 1) ? 1 : numPhysicalCores;   /* never report 0 cores */
+    }
+}
+
+#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
+
+/* Use sysconf(_SC_NPROCESSORS_ONLN) ; the value is returned as is, without
+ * any hyperthreading correction. see: man 3 sysconf */
+UTIL_STATIC int UTIL_countPhysicalCores(void)
+{
+    static int numPhysicalCores = 0;   /* cached across calls */
+
+    if (numPhysicalCores == 0) {
+        /* nothing cached yet : ask the system */
+        int const onlineCpus = (int)sysconf(_SC_NPROCESSORS_ONLN);
+        /* -1 : value not queryable, fall back on 1 */
+        numPhysicalCores = (onlineCpus == -1) ? 1 : onlineCpus;
+    }
+
+    return numPhysicalCores;
+}
+
+#else
+
+UTIL_STATIC int UTIL_countPhysicalCores(void)
+{
+    /* no detection method is implemented for this platform : assume 1 */
+    return 1;
+}
+
+#endif
 
 #if defined (__cplusplus)
 }
diff --git a/programs/zstd.1 b/programs/zstd.1
index 684fb868aa309..6cc5f7e9d2f62 100644
--- a/programs/zstd.1
+++ b/programs/zstd.1
@@ -1,408 +1,334 @@
-\"
-\" zstd.1: This is a manual page for 'zstd' program. This file is part of the
-\" zstd <http://www.zstd.net/> project.
-\" Author: Yann Collet
-\"
-
-\" No hyphenation
-.hy 0
-.nr HY 0
-
-.TH zstd "1" "2015-08-22" "zstd" "User Commands"
-.SH NAME
-\fBzstd, unzstd, zstdcat\fR - Compress or decompress .zst files
-
-.SH SYNOPSIS
-.TP 5
-\fBzstd\fR [\fBOPTIONS\fR] [-|INPUT-FILE] [-o <OUTPUT-FILE>]
-.PP
-.B unzstd
-is equivalent to
-.BR "zstd \-d"
-.br
-.B zstdcat
-is equivalent to
-.BR "zstd \-dcf"
-.br
-
-.SH DESCRIPTION
-.PP
-\fBzstd\fR is a fast lossless compression algorithm
-and data compression tool,
-with command line syntax similar to \fB gzip (1) \fR and \fB xz (1) \fR .
-It is based on the \fBLZ77\fR family, with further FSE & huff0 entropy stages.
-\fBzstd\fR offers highly configurable compression speed,
-with fast modes at > 200 MB/s per core,
-and strong modes nearing lzma compression ratios.
-It also features a very fast decoder, with speeds > 500 MB/s per core.
-
-\fBzstd\fR command line syntax is generally similar to gzip,
-but features the following differences :
- - Source files are preserved by default.
-   It's possible to remove them automatically by using \fB--rm\fR command.
- - When compressing a single file, \fBzstd\fR displays progress notifications and result summary by default.
-   Use \fB-q\fR to turn them off
-
-.PP
-.B zstd
-compresses or decompresses each
-.I file
-according to the selected operation mode.
-If no
-.I files
-are given or
-.I file
-is
-.BR \- ,
-.B zstd
-reads from standard input and writes the processed data
-to standard output.
-.B zstd
-will refuse (display an error and skip the
-.IR file )
-to write compressed data to standard output if it is a terminal.
-Similarly,
-.B zstd
-will refuse to read compressed data
-from standard input if it is a terminal.
-
-.PP
-Unless
-.B \-\-stdout
-or
-.B \-o
-is specified,
-.I files
-are written to a new file whose name is derived from the source
-.I file
-name:
-.IP \(bu 3
-When compressing, the suffix
-.B .zst
-is appended to the source filename to get the target filename.
-.IP \(bu 3
-When decompressing, the
-.B .zst
-suffix is removed from the filename to get the target filename.
-
-.SS "Concatenation with .zst files"
-It is possible to concatenate
-.B .zst
-files as is.
-.B zstd
-will decompress such files as if they were a single
-.B .zst
-file.
-
-
-
-.SH OPTIONS
-
+.
+.TH "ZSTD" "1" "May 2017" "zstd 1.2.0" "User Commands"
+.
+.SH "NAME"
+\fBzstd\fR \- zstd, zstdmt, unzstd, zstdcat \- Compress or decompress \.zst files
+.
+.SH "SYNOPSIS"
+\fBzstd\fR [\fIOPTIONS\fR] [\-|\fIINPUT\-FILE\fR] [\-o \fIOUTPUT\-FILE\fR]
+.
+.P
+\fBzstdmt\fR is equivalent to \fBzstd \-T0\fR
+.
+.P
+\fBunzstd\fR is equivalent to \fBzstd \-d\fR
+.
+.P
+\fBzstdcat\fR is equivalent to \fBzstd \-dcf\fR
+.
+.SH "DESCRIPTION"
+\fBzstd\fR is a fast lossless compression algorithm and data compression tool, with command line syntax similar to \fBgzip (1)\fR and \fBxz (1)\fR\. It is based on the \fBLZ77\fR family, with further FSE & huff0 entropy stages\. \fBzstd\fR offers highly configurable compression speed, with fast modes at > 200 MB/s per core, and strong modes nearing lzma compression ratios\. It also features a very fast decoder, with speeds > 500 MB/s per core\.
+.
+.P
+\fBzstd\fR command line syntax is generally similar to gzip, but features the following differences :
+.
+.IP "\(bu" 4
+Source files are preserved by default\. It\'s possible to remove them automatically by using the \fB\-\-rm\fR command\.
+.
+.IP "\(bu" 4
+When compressing a single file, \fBzstd\fR displays progress notifications and result summary by default\. Use \fB\-q\fR to turn them off\.
+.
+.IP "\(bu" 4
+\fBzstd\fR does not accept input from console, but it properly accepts \fBstdin\fR when it\'s not the console\.
+.
+.IP "\(bu" 4
+\fBzstd\fR displays a short help page when the command line is invalid\. Use \fB\-q\fR to turn it off\.
+.
+.IP "" 0
+.
+.P
+\fBzstd\fR compresses or decompresses each \fIfile\fR according to the selected operation mode\. If no \fIfiles\fR are given or \fIfile\fR is \fB\-\fR, \fBzstd\fR reads from standard input and writes the processed data to standard output\. \fBzstd\fR will refuse to write compressed data to standard output if it is a terminal : it will display an error message and skip the \fIfile\fR\. Similarly, \fBzstd\fR will refuse to read compressed data from standard input if it is a terminal\.
+.
+.P
+Unless \fB\-\-stdout\fR or \fB\-o\fR is specified, \fIfiles\fR are written to a new file whose name is derived from the source \fIfile\fR name:
+.
+.IP "\(bu" 4
+When compressing, the suffix \fB\.zst\fR is appended to the source filename to get the target filename\.
+.
+.IP "\(bu" 4
+When decompressing, the \fB\.zst\fR suffix is removed from the source filename to get the target filename\.
+.
+.IP "" 0
+.
+.SS "Concatenation with \.zst files"
+It is possible to concatenate \fB\.zst\fR files as is\. \fBzstd\fR will decompress such files as if they were a single \fB\.zst\fR file\.
+.
+.SH "OPTIONS"
 .
 .SS "Integer suffixes and special values"
-In most places where an integer argument is expected,
-an optional suffix is supported to easily indicate large integers.
-There must be no space between the integer and the suffix.
-.TP
-.B KiB
-Multiply the integer by 1,024 (2^10).
-.BR Ki ,
-.BR K ,
-and
-.B KB
-are accepted as synonyms for
-.BR KiB .
-.TP
-.B MiB
-Multiply the integer by 1,048,576 (2^20).
-.BR Mi ,
-.BR M ,
-and
-.B MB
-are accepted as synonyms for
-.BR MiB .
-
+In most places where an integer argument is expected, an optional suffix is supported to easily indicate large integers\. There must be no space between the integer and the suffix\.
+.
+.TP
+\fBKiB\fR
+Multiply the integer by 1,024 (2^10)\. \fBKi\fR, \fBK\fR, and \fBKB\fR are accepted as synonyms for \fBKiB\fR\.
+.
+.TP
+\fBMiB\fR
+Multiply the integer by 1,048,576 (2^20)\. \fBMi\fR, \fBM\fR, and \fBMB\fR are accepted as synonyms for \fBMiB\fR\.
 .
 .SS "Operation mode"
-If multiple operation mode options are given,
-the last one takes effect.
-.TP
-.BR \-z ", " \-\-compress
-Compress.
-This is the default operation mode when no operation mode option
-is specified and no other operation mode is implied from
-the command name (for example,
-.B unzstd
-implies
-.BR \-\-decompress ).
-.TP
-.BR \-d ", " \-\-decompress ", " \-\-uncompress
-Decompress.
-.TP
-.BR \-t ", " \-\-test
-Test the integrity of compressed
-.IR files .
-This option is equivalent to
-.B "\-\-decompress \-\-stdout"
-except that the decompressed data is discarded instead of being
-written to standard output.
-No files are created or removed.
-.TP
-.B \-b#
- benchmark file(s) using compression level #
-.TP
-.B \--train FILEs
- use FILEs as training set to create a dictionary. The training set should contain a lot of small files (> 100).
-
+If multiple operation mode options are given, the last one takes effect\.
+.
+.TP
+\fB\-z\fR, \fB\-\-compress\fR
+Compress\. This is the default operation mode when no operation mode option is specified and no other operation mode is implied from the command name (for example, \fBunzstd\fR implies \fB\-\-decompress\fR)\.
+.
+.TP
+\fB\-d\fR, \fB\-\-decompress\fR, \fB\-\-uncompress\fR
+Decompress\.
+.
+.TP
+\fB\-t\fR, \fB\-\-test\fR
+Test the integrity of compressed \fIfiles\fR\. This option is equivalent to \fB\-\-decompress \-\-stdout\fR except that the decompressed data is discarded instead of being written to standard output\. No files are created or removed\.
+.
+.TP
+\fB\-b#\fR
+Benchmark file(s) using compression level #
+.
+.TP
+\fB\-\-train FILEs\fR
+Use FILEs as a training set to create a dictionary\. The training set should contain a lot of small files (> 100)\.
 .
 .SS "Operation modifiers"
+.
 .TP
-.B \-#
- # compression level [1-19] (default:3)
-.TP
-.BR \--ultra
- unlocks high compression levels 20+ (maximum 22), using a lot more memory.
-Note that decompression will also require more memory when using these levels.
-.TP
-.B \-D file
- use `file` as Dictionary to compress or decompress FILE(s)
-.TP
-.BR \--no-dictID
- do not store dictionary ID within frame header (dictionary compression).
- The decoder will have to rely on implicit knowledge about which dictionary to use,
-it won't be able to check if it's correct.
-.TP
-.B \-o file
- save result into `file` (only possible with a single INPUT-FILE)
-.TP
-.BR \-f ", " --force
- overwrite output without prompting
-.TP
-.BR \-c ", " --stdout
- force write to standard output, even if it is the console
-.TP
-.BR \--[no-]sparse
- enable / disable sparse FS support, to make files with many zeroes smaller on disk.
- Creating sparse files may save disk space and speed up the decompression
-by reducing the amount of disk I/O.
- default : enabled when output is into a file, and disabled when output is stdout.
- This setting overrides default and can force sparse mode over stdout.
-.TP
-.BR \--rm
- remove source file(s) after successful compression or decompression
-.TP
-.BR \-k ", " --keep
- keep source file(s) after successful compression or decompression.
- This is the default behavior.
-.TP
-.BR \-r
- operate recursively on directories
-.TP
-.BR \-h/\-H ", " --help
- display help/long help and exit
-.TP
-.BR \-V ", " --version
- display Version number and exit
-.TP
-.BR \-v ", " --verbose
- verbose mode
-.TP
-.BR \-q ", " --quiet
- suppress warnings, interactivity and notifications.
- specify twice to suppress errors too.
-.TP
-.BR \-C ", " --[no-]check
- add integrity check computed from uncompressed data (default : enabled)
-.TP
-.BR \-t ", " --test
- Test the integrity of compressed files. This option is equivalent to \fB--decompress --stdout > /dev/null\fR.
- No files are created or removed.
-.TP
-.BR --
- All arguments after -- are treated as files
-
-
-.SH DICTIONARY BUILDER
-.PP
-\fBzstd\fR offers \fIdictionary\fR compression, useful for very small files and messages.
-It's possible to train \fBzstd\fR with some samples, the result of which is saved into a file called `dictionary`.
-Then during compression and decompression, make reference to the same dictionary.
-It will improve compression ratio of small files.
-Typical gains range from ~10% (at 64KB) to x5 better (at <1KB).
-.TP
-.B \--train FILEs
- use FILEs as training set to create a dictionary. The training set should contain a lot of small files (> 100),
-and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)
-.TP
-.B \-o file
- dictionary saved into `file` (default: dictionary)
-.TP
-.B \--maxdict #
- limit dictionary to specified size (default : 112640)
-.TP
-.B \--dictID #
- A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary.
- By default, zstd will create a 4-bytes random number ID.
- It's possible to give a precise number instead.
- Short numbers have an advantage : an ID < 256 will only need 1 byte in the compressed frame header,
- and an ID < 65536 will only need 2 bytes. This compares favorably to 4 bytes default.
- However, it's up to the dictionary manager to not assign twice the same ID to 2 different dictionaries.
-.TP
-.B \-s#
- dictionary selectivity level (default: 9)
- the smaller the value, the denser the dictionary, improving its efficiency but reducing its possible maximum size.
-.TP
-.B \--cover=k=#,d=#
- Use alternate dictionary builder algorithm named cover with parameters \fIk\fR and \fId\fR with \fId\fR <= \fIk\fR.
- Selects segments of size \fIk\fR with the highest score to put in the dictionary.
- The score of a segment is computed by the sum of the frequencies of all the subsegments of of size \fId\fR.
- Generally \fId\fR should be in the range [6, 24].
- Good values for \fIk\fR vary widely based on the input data, but a safe range is [32, 2048].
- Example: \fB--train --cover=k=64,d=8 FILEs\fR.
-.TP
-.B \--optimize-cover[=steps=#,k=#,d=#]
- If \fIsteps\fR is not specified, the default value of 32 is used.
- If \fIk\fR is not specified, \fIsteps\fR values in [16, 2048] are checked for each value of \fId\fR.
- If \fId\fR is not specified, the values checked are [6, 8, ..., 16].
-
- Runs the cover dictionary builder for each parameter set saves the optimal parameters and dictionary.
- Prints the optimal parameters and writes the optimal dictionary to the output file.
- Supports multithreading if \fBzstd\fR is compiled with threading support.
-
- The parameter \fIk\fR is more sensitve than \fId\fR, and is faster to optimize over.
- Suggested use is to run with a \fIsteps\fR <= 32 with neither \fIk\fR nor \fId\fR set.
- Once it completes, use the value of \fId\fR it selects with a higher \fIsteps\fR (in the range [256, 1024]).
- \fBzstd --train --optimize-cover FILEs
- \fBzstd --train --optimize-cover=d=d,steps=512 FILEs
-.TP
-
-.SH BENCHMARK
-.TP
-.B \-b#
- benchmark file(s) using compression level #
-.TP
-.B \-e#
- benchmark file(s) using multiple compression levels, from -b# to -e# (included).
-.TP
-.B \-i#
- minimum evaluation time, in seconds (default : 3s), benchmark mode only
-.TP
-.B \-B#
- cut file into independent blocks of size # (default: no block)
-.B \--priority=rt
- set process priority to real-time
-
-.SH ADVANCED COMPRESSION OPTIONS
-.TP
-.B \--zstd[=\fIoptions\fR]
-.PD
-\fBzstd\fR provides 22 predefined compression levels. The selected or default predefined compression level can be changed with advanced compression options.
-The \fIoptions\fR are provided as a comma-separated list. You may specify only the \fIoptions\fR you want to change and the rest will be taken from the selected or default compression level.
-The list of available \fIoptions\fR:
-.RS
-
-.TP
-.BI strategy= strat
-.PD 0
-.TP
-.BI strat= strat
-.PD
-Specify a strategy used by a match finder.
-.IP ""
-There are 8 strategies numbered from 0 to 7, from faster to stronger:
-0=ZSTD_fast, 1=ZSTD_dfast, 2=ZSTD_greedy, 3=ZSTD_lazy, 4=ZSTD_lazy2, 5=ZSTD_btlazy2, 6=ZSTD_btopt, 7=ZSTD_btopt2.
-.IP ""
-
-.TP
-.BI windowLog= wlog
-.PD 0
-.TP
-.BI wlog= wlog
-.PD
-Specify the maximum number of bits for a match distance.
-.IP ""
-The higher number of bits increases the chance to find a match what usually improves compression ratio.
-It also increases memory requirements for compressor and decompressor.
-.IP ""
-The minimum \fIwlog\fR is 10 (1 KiB) and the maximum is 25 (32 MiB) for 32-bit compilation and 27 (128 MiB) for 64-bit compilation.
-.IP ""
-
-.TP
-.BI hashLog= hlog
-.PD 0
-.TP
-.BI hlog= hlog
-.PD
-Specify the maximum number of bits for a hash table.
-.IP ""
-The bigger hash table causes less collisions what usually make compression faster but requires more memory during compression.
-.IP ""
-The minimum \fIhlog\fR is 6 (64 B) and the maximum is 25 (32 MiB) for 32-bit compilation and 27 (128 MiB) for 64-bit compilation.
-
-.TP
-.BI chainLog= clog
-.PD 0
-.TP
-.BI clog= clog
-.PD
-Specify the maximum number of bits for a hash chain or a binary tree.
-.IP ""
-The higher number of bits increases the chance to find a match what usually improves compression ratio.
-It also slows down compression speed and increases memory requirements for compression.
-This option is ignored for the ZSTD_fast strategy.
-.IP ""
-The minimum \fIclog\fR is 6 (64 B) and the maximum is 26 (64 MiB) for 32-bit compilation and 28 (256 MiB) for 64-bit compilation.
-.IP ""
-
-.TP
-.BI searchLog= slog
-.PD 0
-.TP
-.BI slog= slog
-.PD
-Specify the maximum number of searches in a hash chain or a binary tree using logarithmic scale.
-.IP ""
-The bigger number of searches increases the chance to find a match what usually improves compression ratio but decreases compression speed.
-.IP ""
-The minimum \fIslog\fR is 1 and the maximum is 24 for 32-bit compilation and 26 for 64-bit compilation.
-.IP ""
-
-.TP
-.BI searchLength= slen
-.PD 0
-.TP
-.BI slen= slen
-.PD
-Specify the minimum searched length of a match in a hash table.
-.IP ""
-The bigger search length usually decreases compression ratio but improves decompression speed.
-.IP ""
-The minimum \fIslen\fR is 3 and the maximum is 7.
-.IP ""
-
-.TP
-.BI targetLength= tlen
-.PD 0
-.TP
-.BI tlen= tlen
-.PD
-Specify the minimum match length that causes a match finder to interrupt searching of better matches.
-.IP ""
-The bigger minimum match length usually improves compression ratio but decreases compression speed.
-This option is used only with ZSTD_btopt and ZSTD_btopt2 strategies.
-.IP ""
-The minimum \fItlen\fR is 4 and the maximum is 999.
-.IP ""
-
-.PP
-.B An example
-.br
-The following parameters sets advanced compression options to predefined level 19 for files bigger than 256 KB:
-.IP ""
-\fB--zstd=\fRwindowLog=23,chainLog=23,hashLog=22,searchLog=6,searchLength=3,targetLength=48,strategy=6
-
-.SH BUGS
-Report bugs at:- https://github.com/facebook/zstd/issues
-
-.SH AUTHOR
+\fB\-#\fR
+\fB#\fR compression level [1\-19] (default: 3)
+.
+.TP
+\fB\-\-ultra\fR
+unlocks high compression levels 20+ (maximum 22), using a lot more memory\. Note that decompression will also require more memory when using these levels\.
+.
+.TP
+\fB\-T#\fR, \fB\-\-threads=#\fR
+Compress using \fB#\fR threads (default: 1)\. If \fB#\fR is 0, attempt to detect and use the number of physical CPU cores\. This modifier does nothing if \fBzstd\fR is compiled without multithread support\.
+.
+.TP
+\fB\-D file\fR
+use \fBfile\fR as Dictionary to compress or decompress FILE(s)
+.
+.TP
+\fB\-\-no\-dictID\fR
+do not store dictionary ID within frame header (dictionary compression)\. The decoder will have to rely on implicit knowledge about which dictionary to use, it won\'t be able to check if it\'s correct\.
+.
+.TP
+\fB\-o file\fR
+save result into \fBfile\fR (only possible with a single \fIINPUT\-FILE\fR)
+.
+.TP
+\fB\-f\fR, \fB\-\-force\fR
+overwrite output without prompting, and (de)compress symbolic links
+.
+.TP
+\fB\-c\fR, \fB\-\-stdout\fR
+force write to standard output, even if it is the console
+.
+.TP
+\fB\-\-[no\-]sparse\fR
+enable / disable sparse FS support, to make files with many zeroes smaller on disk\. Creating sparse files may save disk space and speed up decompression by reducing the amount of disk I/O\. default : enabled when output is into a file, and disabled when output is stdout\. This setting overrides default and can force sparse mode over stdout\.
+.
+.TP
+\fB\-\-rm\fR
+remove source file(s) after successful compression or decompression
+.
+.TP
+\fB\-k\fR, \fB\-\-keep\fR
+keep source file(s) after successful compression or decompression\. This is the default behavior\.
+.
+.TP
+\fB\-r\fR
+operate recursively on directories
+.
+.TP
+\fB\-h\fR/\fB\-H\fR, \fB\-\-help\fR
+display help/long help and exit
+.
+.TP
+\fB\-V\fR, \fB\-\-version\fR
+display version number and exit
+.
+.TP
+\fB\-v\fR
+verbose mode
+.
+.TP
+\fB\-q\fR, \fB\-\-quiet\fR
+suppress warnings, interactivity, and notifications\. specify twice to suppress errors too\.
+.
+.TP
+\fB\-C\fR, \fB\-\-[no\-]check\fR
+add integrity check computed from uncompressed data (default : enabled)
+.
+.TP
+\fB\-\-\fR
+All arguments after \fB\-\-\fR are treated as files
+.
+.SH "DICTIONARY BUILDER"
+\fBzstd\fR offers \fIdictionary\fR compression, useful for very small files and messages\. It\'s possible to train \fBzstd\fR with some samples, the result of which is saved into a file called a \fBdictionary\fR\. Then during compression and decompression, reference the same dictionary\. It will improve compression ratio of small files\. Typical gains range from 10% (at 64KB) to x5 better (at <1KB)\.
+.
+.TP
+\fB\-\-train FILEs\fR
+Use FILEs as training set to create a dictionary\. The training set should contain a lot of small files (> 100), and typically weigh 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)\.
+.
+.IP
+Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-cover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. \fB\-\-train\fR is equivalent to \fB\-\-train\-cover=d=8,steps=4\fR\.
+.
+.TP
+\fB\-o file\fR
+Dictionary saved into \fBfile\fR (default name: dictionary)\.
+.
+.TP
+\fB\-\-maxdict=#\fR
+Limit dictionary to specified size (default: 112640)\.
+.
+.TP
+\fB\-\-dictID=#\fR
+A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary\. By default, zstd will create a 4\-bytes random number ID\. It\'s possible to give a precise number instead\. Short numbers have an advantage : an ID < 256 will only need 1 byte in the compressed frame header, and an ID < 65536 will only need 2 bytes\. This compares favorably to 4 bytes default\. However, it\'s up to the dictionary manager to not assign twice the same ID to 2 different dictionaries\.
+.
+.TP
+\fB\-\-train\-cover[=k=#,d=#,steps=#]\fR
+Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. Requires that \fId\fR <= \fIk\fR\.
+.
+.IP
+Selects segments of size \fIk\fR with highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], occasionally up to 16, but the algorithm will run faster with \fId\fR <= 8\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [2 * \fId\fR, 2000]\. Supports multithreading if \fBzstd\fR is compiled with threading support\.
+.
+.IP
+Examples:
+.
+.IP
+\fBzstd \-\-train\-cover FILEs\fR
+.
+.IP
+\fBzstd \-\-train\-cover=k=50,d=8 FILEs\fR
+.
+.IP
+\fBzstd \-\-train\-cover=d=8,steps=500 FILEs\fR
+.
+.IP
+\fBzstd \-\-train\-cover=k=50 FILEs\fR
+.
+.TP
+\fB\-\-train\-legacy[=selectivity=#]\fR
+Use legacy dictionary builder algorithm with the given dictionary \fIselectivity\fR (default: 9)\. The smaller the \fIselectivity\fR value, the denser the dictionary, improving its efficiency but reducing its possible maximum size\. \fB\-\-train\-legacy=s=#\fR is also accepted\.
+.
+.IP
+Examples:
+.
+.IP
+\fBzstd \-\-train\-legacy FILEs\fR
+.
+.IP
+\fBzstd \-\-train\-legacy=selectivity=8 FILEs\fR
+.
+.SH "BENCHMARK"
+.
+.TP
+\fB\-b#\fR
+benchmark file(s) using compression level #
+.
+.TP
+\fB\-e#\fR
+benchmark file(s) using multiple compression levels, from \fB\-b#\fR to \fB\-e#\fR (inclusive)
+.
+.TP
+\fB\-i#\fR
+minimum evaluation time, in seconds (default : 3s), benchmark mode only
+.
+.TP
+\fB\-B#\fR
+cut file into independent blocks of size # (default: no block)
+.
+.TP
+\fB\-\-priority=rt\fR
+set process priority to real\-time
+.
+.SH "ADVANCED COMPRESSION OPTIONS"
+.
+.SS "\-\-zstd[=options]:"
+\fBzstd\fR provides 22 predefined compression levels\. The selected or default predefined compression level can be changed with advanced compression options\. The \fIoptions\fR are provided as a comma\-separated list\. You may specify only the options you want to change and the rest will be taken from the selected or default compression level\. The list of available \fIoptions\fR:
+.
+.TP
+\fBstrategy\fR=\fIstrat\fR, \fBstrat\fR=\fIstrat\fR
+Specify a strategy used by a match finder\.
+.
+.IP
+There are 8 strategies numbered from 0 to 7, from faster to stronger: 0=ZSTD_fast, 1=ZSTD_dfast, 2=ZSTD_greedy, 3=ZSTD_lazy, 4=ZSTD_lazy2, 5=ZSTD_btlazy2, 6=ZSTD_btopt, 7=ZSTD_btopt2\.
+.
+.TP
+\fBwindowLog\fR=\fIwlog\fR, \fBwlog\fR=\fIwlog\fR
+Specify the maximum number of bits for a match distance\.
+.
+.IP
+A higher number of bits increases the chance to find a match, which usually improves compression ratio\. It also increases memory requirements for the compressor and decompressor\. The minimum \fIwlog\fR is 10 (1 KiB) and the maximum is 27 (128 MiB)\.
+.
+.TP
+\fBhashLog\fR=\fIhlog\fR, \fBhlog\fR=\fIhlog\fR
+Specify the maximum number of bits for a hash table\.
+.
+.IP
+Bigger hash tables cause fewer collisions, which usually makes compression faster, but require more memory during compression\.
+.
+.IP
+The minimum \fIhlog\fR is 6 (64 B) and the maximum is 26 (128 MiB)\.
+.
+.TP
+\fBchainLog\fR=\fIclog\fR, \fBclog\fR=\fIclog\fR
+Specify the maximum number of bits for a hash chain or a binary tree\.
+.
+.IP
+A higher number of bits increases the chance to find a match, which usually improves compression ratio\. It also slows down compression speed and increases memory requirements for compression\. This option is ignored for the ZSTD_fast strategy\.
+.
+.IP
+The minimum \fIclog\fR is 6 (64 B) and the maximum is 28 (256 MiB)\.
+.
+.TP
+\fBsearchLog\fR=\fIslog\fR, \fBslog\fR=\fIslog\fR
+Specify the maximum number of searches in a hash chain or a binary tree using logarithmic scale\.
+.
+.IP
+More searches increase the chance to find a match, which usually increases compression ratio but decreases compression speed\.
+.
+.IP
+The minimum \fIslog\fR is 1 and the maximum is 26\.
+.
+.TP
+\fBsearchLength\fR=\fIslen\fR, \fBslen\fR=\fIslen\fR
+Specify the minimum searched length of a match in a hash table\.
+.
+.IP
+Larger search lengths usually decrease compression ratio but improve decompression speed\.
+.
+.IP
+The minimum \fIslen\fR is 3 and the maximum is 7\.
+.
+.TP
+\fBtargetLength\fR=\fItlen\fR, \fBtlen\fR=\fItlen\fR
+Specify the minimum match length that causes a match finder to stop searching for better matches\.
+.
+.IP
+A larger minimum match length usually improves compression ratio but decreases compression speed\. This option is only used with strategies ZSTD_btopt and ZSTD_btopt2\.
+.
+.IP
+The minimum \fItlen\fR is 4 and the maximum is 999\.
+.
+.TP
+\fBoverlapLog\fR=\fIovlog\fR, \fBovlog\fR=\fIovlog\fR
+Determine \fBoverlapSize\fR, amount of data reloaded from previous job\. This parameter is only available when multithreading is enabled\. Reloading more data improves compression ratio, but decreases speed\.
+.
+.IP
+The minimum \fIovlog\fR is 0, and the maximum is 9\. 0 means "no overlap", hence completely independent jobs\. 9 means "full overlap", meaning up to \fBwindowSize\fR is reloaded from previous job\. Reducing \fIovlog\fR by 1 reduces the amount of reload by a factor 2\. Default \fIovlog\fR is 6, which means "reload \fBwindowSize / 8\fR"\. Exception : the maximum compression level (22) has a default \fIovlog\fR of 9\.
+.
+.SS "\-B#:"
+Select the size of each compression job\. This parameter is available only when multi\-threading is enabled\. Default value is \fB4 * windowSize\fR, which means it varies depending on compression level\. \fB\-B#\fR makes it possible to select a custom value\. Note that job size must respect a minimum value which is enforced transparently\. This minimum is either 1 MB, or \fBoverlapSize\fR, whichever is largest\.
+.
+.SS "Example"
+The following parameters set advanced compression options to those of predefined level 19 for files bigger than 256 KB:
+.
+.P
+\fB\-\-zstd\fR=windowLog=23,chainLog=23,hashLog=22,searchLog=6,searchLength=3,targetLength=48,strategy=6
+.
+.SH "BUGS"
+Report bugs at: https://github\.com/facebook/zstd/issues
+.
+.SH "AUTHOR"
 Yann Collet
diff --git a/programs/zstd.1.md b/programs/zstd.1.md
new file mode 100644
index 0000000000000..118c9f2f8ee58
--- /dev/null
+++ b/programs/zstd.1.md
@@ -0,0 +1,343 @@
+zstd(1) -- zstd, zstdmt, unzstd, zstdcat - Compress or decompress .zst files
+============================================================================
+
+SYNOPSIS
+--------
+
+`zstd` [*OPTIONS*] [-|_INPUT-FILE_] [-o _OUTPUT-FILE_]
+
+`zstdmt` is equivalent to `zstd -T0`
+
+`unzstd` is equivalent to `zstd -d`
+
+`zstdcat` is equivalent to `zstd -dcf`
+
+
+DESCRIPTION
+-----------
+`zstd` is a fast lossless compression algorithm and data compression tool,
+with command line syntax similar to `gzip (1)` and `xz (1)`.
+It is based on the **LZ77** family, with further FSE & huff0 entropy stages.
+`zstd` offers highly configurable compression speed,
+with fast modes at > 200 MB/s per core,
+and strong modes nearing lzma compression ratios.
+It also features a very fast decoder, with speeds > 500 MB/s per core.
+
+`zstd` command line syntax is generally similar to gzip,
+but features the following differences :
+
+  - Source files are preserved by default.
+    It's possible to remove them automatically by using the `--rm` command.
+  - When compressing a single file, `zstd` displays progress notifications
+    and result summary by default.
+    Use `-q` to turn them off.
+  - `zstd` does not accept input from console,
+    but it properly accepts `stdin` when it's not the console.
+  - `zstd` displays a short help page when the command line contains an error.
+    Use `-q` to turn it off.
+
+`zstd` compresses or decompresses each _file_ according to the selected
+operation mode.
+If no _files_ are given or _file_ is `-`, `zstd` reads from standard input
+and writes the processed data to standard output.
+`zstd` will refuse to write compressed data to standard output
+if it is a terminal : it will display an error message and skip the _file_.
+Similarly, `zstd` will refuse to read compressed data from standard input
+if it is a terminal.
+
+Unless `--stdout` or `-o` is specified, _files_ are written to a new file
+whose name is derived from the source _file_ name:
+
+* When compressing, the suffix `.zst` is appended to the source filename to
+  get the target filename.
+* When decompressing, the `.zst` suffix is removed from the source filename to
+  get the target filename.
+
+### Concatenation with .zst files
+It is possible to concatenate `.zst` files as is.
+`zstd` will decompress such files as if they were a single `.zst` file.
+
+OPTIONS
+-------
+
+### Integer suffixes and special values
+In most places where an integer argument is expected,
+an optional suffix is supported to easily indicate large integers.
+There must be no space between the integer and the suffix.
+
+* `KiB`:
+    Multiply the integer by 1,024 (2\^10).
+    `Ki`, `K`, and `KB` are accepted as synonyms for `KiB`.
+* `MiB`:
+    Multiply the integer by 1,048,576 (2\^20).
+    `Mi`, `M`, and `MB` are accepted as synonyms for `MiB`.
+
+### Operation mode
+If multiple operation mode options are given,
+the last one takes effect.
+
+* `-z`, `--compress`:
+    Compress.
+    This is the default operation mode when no operation mode option is specified
+    and no other operation mode is implied from the command name
+    (for example, `unzstd` implies `--decompress`).
+* `-d`, `--decompress`, `--uncompress`:
+    Decompress.
+* `-t`, `--test`:
+    Test the integrity of compressed _files_.
+    This option is equivalent to `--decompress --stdout` except that the
+    decompressed data is discarded instead of being written to standard output.
+    No files are created or removed.
+* `-b#`:
+    Benchmark file(s) using compression level #
+* `--train FILEs`:
+    Use FILEs as a training set to create a dictionary.
+    The training set should contain a lot of small files (> 100).
+
+### Operation modifiers
+
+* `-#`:
+    `#` compression level \[1-19] (default: 3)
+* `--ultra`:
+    unlocks high compression levels 20+ (maximum 22), using a lot more memory.
+    Note that decompression will also require more memory when using these levels.
+* `-T#`, `--threads=#`:
+    Compress using `#` threads (default: 1).
+    If `#` is 0, attempt to detect and use the number of physical CPU cores.
+    This modifier does nothing if `zstd` is compiled without multithread support.
+* `-D file`:
+    use `file` as Dictionary to compress or decompress FILE(s)
+* `--no-dictID`:
+    do not store dictionary ID within frame header (dictionary compression).
+    The decoder will have to rely on implicit knowledge about which dictionary to use,
+    it won't be able to check if it's correct.
+* `-o file`:
+    save result into `file` (only possible with a single _INPUT-FILE_)
+* `-f`, `--force`:
+    overwrite output without prompting, and (de)compress symbolic links
+* `-c`, `--stdout`:
+    force write to standard output, even if it is the console
+* `--[no-]sparse`:
+    enable / disable sparse FS support,
+    to make files with many zeroes smaller on disk.
+    Creating sparse files may save disk space and speed up decompression by
+    reducing the amount of disk I/O.
+    default : enabled when output is into a file,
+    and disabled when output is stdout.
+    This setting overrides default and can force sparse mode over stdout.
+* `--rm`:
+    remove source file(s) after successful compression or decompression
+* `-k`, `--keep`:
+    keep source file(s) after successful compression or decompression.
+    This is the default behavior.
+* `-r`:
+    operate recursively on directories
+* `-h`/`-H`, `--help`:
+    display help/long help and exit
+* `-V`, `--version`:
+    display version number and exit
+* `-v`:
+    verbose mode
+* `-q`, `--quiet`:
+    suppress warnings, interactivity, and notifications.
+    specify twice to suppress errors too.
+* `-C`, `--[no-]check`:
+    add integrity check computed from uncompressed data (default : enabled)
+* `--`:
+    All arguments after `--` are treated as files
+
+
+DICTIONARY BUILDER
+------------------
+`zstd` offers _dictionary_ compression,
+useful for very small files and messages.
+It's possible to train `zstd` with some samples,
+the result of which is saved into a file called a `dictionary`.
+Then during compression and decompression, reference the same dictionary.
+It will improve compression ratio of small files.
+Typical gains range from 10% (at 64KB) to x5 better (at <1KB).
+
+* `--train FILEs`:
+    Use FILEs as training set to create a dictionary.
+    The training set should contain a lot of small files (> 100),
+    and typically weigh 100x the target dictionary size
+    (for example, 10 MB for a 100 KB dictionary).
+
+    Supports multithreading if `zstd` is compiled with threading support.
+    Additional parameters can be specified with `--train-cover`.
+    The legacy dictionary builder can be accessed with `--train-legacy`.
+    `--train` is equivalent to `--train-cover=d=8,steps=4`.
+* `-o file`:
+    Dictionary saved into `file` (default name: dictionary).
+* `--maxdict=#`:
+    Limit dictionary to specified size (default: 112640).
+* `--dictID=#`:
+    A dictionary ID is a locally unique ID that a decoder can use to verify it is
+    using the right dictionary.
+    By default, zstd will create a 4-bytes random number ID.
+    It's possible to give a precise number instead.
+    Short numbers have an advantage : an ID < 256 will only need 1 byte in the
+    compressed frame header, and an ID < 65536 will only need 2 bytes.
+    This compares favorably to 4 bytes default.
+    However, it's up to the dictionary manager to not assign twice the same ID to
+    2 different dictionaries.
+* `--train-cover[=k=#,d=#,steps=#]`:
+    Select parameters for the default dictionary builder algorithm named cover.
+    If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8.
+    If _k_ is not specified, then it tries _steps_ values in the range [50, 2000].
+    If _steps_ is not specified, then the default value of 40 is used.
+    Requires that _d_ <= _k_.
+
+    Selects segments of size _k_ with highest score to put in the dictionary.
+    The score of a segment is computed by the sum of the frequencies of all the
+    subsegments of size _d_.
+    Generally _d_ should be in the range [6, 8], occasionally up to 16, but the
+    algorithm will run faster with _d_ <= 8.
+    Good values for _k_ vary widely based on the input data, but a safe range is
+    [2 * _d_, 2000].
+    Supports multithreading if `zstd` is compiled with threading support.
+
+    Examples:
+
+    `zstd --train-cover FILEs`
+
+    `zstd --train-cover=k=50,d=8 FILEs`
+
+    `zstd --train-cover=d=8,steps=500 FILEs`
+
+    `zstd --train-cover=k=50 FILEs`
+
+* `--train-legacy[=selectivity=#]`:
+    Use legacy dictionary builder algorithm with the given dictionary
+    _selectivity_ (default: 9).
+    The smaller the _selectivity_ value, the denser the dictionary,
+    improving its efficiency but reducing its possible maximum size.
+    `--train-legacy=s=#` is also accepted.
+
+    Examples:
+
+    `zstd --train-legacy FILEs`
+
+    `zstd --train-legacy=selectivity=8 FILEs`
+
+
+BENCHMARK
+---------
+
+* `-b#`:
+    benchmark file(s) using compression level #
+* `-e#`:
+    benchmark file(s) using multiple compression levels, from `-b#` to `-e#` (inclusive)
+* `-i#`:
+    minimum evaluation time, in seconds (default : 3s), benchmark mode only
+* `-B#`:
+    cut file into independent blocks of size # (default: no block)
+* `--priority=rt`:
+    set process priority to real-time
+
+
+ADVANCED COMPRESSION OPTIONS
+----------------------------
+### --zstd[=options]:
+`zstd` provides 22 predefined compression levels.
+The selected or default predefined compression level can be changed with
+advanced compression options.
+The _options_ are provided as a comma-separated list.
+You may specify only the options you want to change and the rest will be
+taken from the selected or default compression level.
+The list of available _options_:
+
+- `strategy`=_strat_, `strat`=_strat_:
+    Specify a strategy used by a match finder.
+
+    There are 8 strategies numbered from 0 to 7, from faster to stronger:
+    0=ZSTD\_fast, 1=ZSTD\_dfast, 2=ZSTD\_greedy, 3=ZSTD\_lazy,
+    4=ZSTD\_lazy2, 5=ZSTD\_btlazy2, 6=ZSTD\_btopt, 7=ZSTD\_btopt2.
+
+- `windowLog`=_wlog_, `wlog`=_wlog_:
+    Specify the maximum number of bits for a match distance.
+
+    A higher number of bits increases the chance to find a match, which usually
+    improves compression ratio.
+    It also increases memory requirements for the compressor and decompressor.
+    The minimum _wlog_ is 10 (1 KiB) and the maximum is 27 (128 MiB).
+
+- `hashLog`=_hlog_, `hlog`=_hlog_:
+    Specify the maximum number of bits for a hash table.
+
+    Bigger hash tables cause fewer collisions, which usually makes compression
+    faster, but require more memory during compression.
+
+    The minimum _hlog_ is 6 (64 B) and the maximum is 26 (128 MiB).
+
+- `chainLog`=_clog_, `clog`=_clog_:
+    Specify the maximum number of bits for a hash chain or a binary tree.
+
+    A higher number of bits increases the chance to find a match, which usually
+    improves compression ratio.
+    It also slows down compression speed and increases memory requirements for
+    compression.
+    This option is ignored for the ZSTD_fast strategy.
+
+    The minimum _clog_ is 6 (64 B) and the maximum is 28 (256 MiB).
+
+- `searchLog`=_slog_, `slog`=_slog_:
+    Specify the maximum number of searches in a hash chain or a binary tree
+    using logarithmic scale.
+
+    More searches increase the chance to find a match, which usually increases
+    compression ratio but decreases compression speed.
+
+    The minimum _slog_ is 1 and the maximum is 26.
+
+- `searchLength`=_slen_, `slen`=_slen_:
+    Specify the minimum searched length of a match in a hash table.
+
+    Larger search lengths usually decrease compression ratio but improve
+    decompression speed.
+
+    The minimum _slen_ is 3 and the maximum is 7.
+
+- `targetLength`=_tlen_, `tlen`=_tlen_:
+    Specify the minimum match length that causes a match finder to stop
+    searching for better matches.
+
+    A larger minimum match length usually improves compression ratio but
+    decreases compression speed.
+    This option is only used with strategies ZSTD_btopt and ZSTD_btopt2.
+
+    The minimum _tlen_ is 4 and the maximum is 999.
+
+- `overlapLog`=_ovlog_,  `ovlog`=_ovlog_:
+    Determine `overlapSize`, amount of data reloaded from previous job.
+    This parameter is only available when multithreading is enabled.
+    Reloading more data improves compression ratio, but decreases speed.
+
+    The minimum _ovlog_ is 0, and the maximum is 9.
+    0 means "no overlap", hence completely independent jobs.
+    9 means "full overlap", meaning up to `windowSize` is reloaded from previous job.
+    Reducing _ovlog_ by 1 reduces the amount of reload by a factor 2.
+    Default _ovlog_ is 6, which means "reload `windowSize / 8`".
+    Exception : the maximum compression level (22) has a default _ovlog_ of 9.
+
+### -B#:
+Select the size of each compression job.
+This parameter is available only when multi-threading is enabled.
+Default value is `4 * windowSize`, which means it varies depending on compression level.
+`-B#` makes it possible to select a custom value.
+Note that job size must respect a minimum value which is enforced transparently.
+This minimum is either 1 MB, or `overlapSize`, whichever is largest.
+
+### Example
+The following parameters set advanced compression options to those of
+predefined level 19 for files bigger than 256 KB:
+
+`--zstd`=windowLog=23,chainLog=23,hashLog=22,searchLog=6,searchLength=3,targetLength=48,strategy=6
+
+BUGS
+----
+Report bugs at: https://github.com/facebook/zstd/issues
+
+AUTHOR
+------
+Yann Collet
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index ae49da7b1e72d..32fef99930e91 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -49,6 +49,7 @@
 #define AUTHOR "Yann Collet"
 #define WELCOME_MESSAGE "*** %s %i-bits %s, by %s ***\n", COMPRESSOR_NAME, (int)(sizeof(size_t)*8), ZSTD_VERSION, AUTHOR
 
+#define ZSTD_ZSTDMT "zstdmt"
 #define ZSTD_UNZSTD "unzstd"
 #define ZSTD_CAT "zstdcat"
 #define ZSTD_GZ "gzip"
@@ -74,10 +75,10 @@ static U32 g_overlapLog = OVERLAP_LOG_DEFAULT;
 /*-************************************
 *  Display Macros
 **************************************/
-#define DISPLAY(...)           fprintf(displayOut, __VA_ARGS__)
-#define DISPLAYLEVEL(l, ...)   if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
-static FILE* displayOut;
-static unsigned displayLevel = DEFAULT_DISPLAY_LEVEL;   /* 0 : no display,  1: errors,  2 : + result + interaction + warnings,  3 : + progression,  4 : + information */
+#define DISPLAY(...)         fprintf(g_displayOut, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) { if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } }
+static int g_displayLevel = DEFAULT_DISPLAY_LEVEL;   /* 0 : no display,  1: errors,  2 : + result + interaction + warnings,  3 : + progression,  4 : + information */
+static FILE* g_displayOut;
 
 
 /*-************************************
@@ -99,7 +100,7 @@ static int usage(const char* programName)
 #endif
     DISPLAY( " -D file: use `file` as Dictionary \n");
     DISPLAY( " -o file: result stored into `file` (only if 1 input file) \n");
-    DISPLAY( " -f     : overwrite output without prompting \n");
+    DISPLAY( " -f     : overwrite output without prompting and (de)compress links \n");
     DISPLAY( "--rm    : remove source file(s) after successful de/compression \n");
     DISPLAY( " -k     : preserve source file(s) (default) \n");
     DISPLAY( " -h/-H  : display help/long help and exit\n");
@@ -113,19 +114,20 @@ static int usage_advanced(const char* programName)
     DISPLAY( "\n");
     DISPLAY( "Advanced arguments :\n");
     DISPLAY( " -V     : display Version number and exit\n");
-    DISPLAY( " -v     : verbose mode; specify multiple times to increase log level (default:%d)\n", DEFAULT_DISPLAY_LEVEL);
+    DISPLAY( " -v     : verbose mode; specify multiple times to increase verbosity\n");
     DISPLAY( " -q     : suppress warnings; specify twice to suppress errors too\n");
     DISPLAY( " -c     : force write to standard output, even if it is the console\n");
-#ifdef UTIL_HAS_CREATEFILELIST
-    DISPLAY( " -r     : operate recursively on directories \n");
-#endif
 #ifndef ZSTD_NOCOMPRESS
     DISPLAY( "--ultra : enable levels beyond %i, up to %i (requires more memory)\n", ZSTDCLI_CLEVEL_MAX, ZSTD_maxCLevel());
-    DISPLAY( "--no-dictID : don't write dictID into header (dictionary compression)\n");
-    DISPLAY( "--[no-]check : integrity check (default:enabled) \n");
 #ifdef ZSTD_MULTITHREAD
     DISPLAY( " -T#    : use # threads for compression (default:1) \n");
-    DISPLAY( " -B#    : select size of independent sections (default:0==automatic) \n");
+    DISPLAY( " -B#    : select size of each job (default:0==automatic) \n");
+#endif
+    DISPLAY( "--no-dictID : don't write dictID into header (dictionary compression)\n");
+    DISPLAY( "--[no-]check : integrity check (default:enabled) \n");
+#endif
+#ifdef UTIL_HAS_CREATEFILELIST
+    DISPLAY( " -r     : operate recursively on directories \n");
 #endif
 #ifdef ZSTD_GZCOMPRESS
     DISPLAY( "--format=gzip : compress files to the .gz format \n");
@@ -134,10 +136,16 @@ static int usage_advanced(const char* programName)
     DISPLAY( "--format=xz : compress files to the .xz format \n");
     DISPLAY( "--format=lzma : compress files to the .lzma format \n");
 #endif
+#ifdef ZSTD_LZ4COMPRESS
+    DISPLAY( "--format=lz4 : compress files to the .lz4 format \n");
 #endif
 #ifndef ZSTD_NODECOMPRESS
     DISPLAY( "--test  : test compressed file integrity \n");
+#if ZSTD_SPARSE_DEFAULT
     DISPLAY( "--[no-]sparse : sparse mode (default:enabled on file, disabled on stdout)\n");
+#else
+    DISPLAY( "--[no-]sparse : sparse mode (default:disabled)\n");
+#endif
 #endif
     DISPLAY( " -M#    : Set a memory usage limit for decompression \n");
     DISPLAY( "--      : All arguments after \"--\" are treated as files \n");
@@ -145,12 +153,11 @@ static int usage_advanced(const char* programName)
     DISPLAY( "\n");
     DISPLAY( "Dictionary builder :\n");
     DISPLAY( "--train ## : create a dictionary from a training set of files \n");
-    DISPLAY( "--cover=k=#,d=# : use the cover algorithm with parameters k and d \n");
-    DISPLAY( "--optimize-cover[=steps=#,k=#,d=#] : optimize cover parameters with optional parameters\n");
+    DISPLAY( "--train-cover[=k=#,d=#,steps=#] : use the cover algorithm with optional args\n");
+    DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel);
     DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
-    DISPLAY( "--maxdict ## : limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
-    DISPLAY( " -s#    : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel);
-    DISPLAY( "--dictID ## : force dictionary ID to specified value (default: random)\n");
+    DISPLAY( "--maxdict=# : limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
+    DISPLAY( "--dictID=# : force dictionary ID to specified value (default: random)\n");
 #endif
 #ifndef ZSTD_NOBENCH
     DISPLAY( "\n");
@@ -167,7 +174,7 @@ static int usage_advanced(const char* programName)
 static int badusage(const char* programName)
 {
     DISPLAYLEVEL(1, "Incorrect parameters\n");
-    if (displayLevel >= 1) usage(programName);
+    if (g_displayLevel >= 2) usage(programName);
     return 1;
 }
 
@@ -179,6 +186,23 @@ static void waitEnter(void)
     (void)unused;
 }
 
+static const char* lastNameFromPath(const char* path)
+{   /* keeps what follows the last '/' ; then, within that remainder, what follows the last '\\' (windows) */
+    const char* const lastSlash = strrchr(path, '/');
+    const char* const afterSlash = lastSlash ? lastSlash + 1 : path;
+    const char* const lastBackslash = strrchr(afterSlash, '\\');
+    return lastBackslash ? lastBackslash + 1 : afterSlash;
+}
+
+/*! exeNameMatch() :
+ *  @return : 1 if exeName is exactly `test`, or `test` immediately followed by '.' (an extension) ; 0 otherwise */
+static int exeNameMatch(const char* exeName, const char* test)
+{
+    size_t const testLen = strlen(test);
+    if (strncmp(exeName, test, testLen) != 0) return 0;   /* exeName does not start with test */
+    return (exeName[testLen] == '\0') || (exeName[testLen] == '.');
+}
+
 /*! readU32FromChar() :
     @return : unsigned integer value read from input in `char` format
     allows and interprets K, KB, KiB, M, MB and MiB suffix.
@@ -216,11 +240,11 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
 #ifndef ZSTD_NODICT
 /**
  * parseCoverParameters() :
- * reads cover parameters from *stringPtr (e.g. "--cover=smoothing=100,kmin=48,kstep=4,kmax=64,d=8") into *params
+ * reads cover parameters from *stringPtr (e.g. "--train-cover=k=48,d=8,steps=32") into *params
  * @return 1 means that cover parameters were correct
  * @return 0 in case of malformed parameters
  */
-static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t *params)
+static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t* params)
 {
     memset(params, 0, sizeof(*params));
     for (; ;) {
@@ -230,9 +254,33 @@ static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t *para
         return 0;
     }
     if (stringPtr[0] != 0) return 0;
-    DISPLAYLEVEL(4, "k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps);
+    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps);
+    return 1;
+}
+
+/**
+ * parseLegacyParameters() :
+ * reads legacy dictionary builder parameters from *stringPtr (e.g. "--train-legacy=selectivity=8") into *selectivity ; accepts "s=#" or "selectivity=#"
+ * @return 1 means that legacy dictionary builder parameters were correct
+ * @return 0 in case of malformed parameters (unknown key, or characters left after the value ; in the latter case *selectivity has already been written)
+ */
+static unsigned parseLegacyParameters(const char* stringPtr, unsigned* selectivity)
+{
+    if (!longCommandWArg(&stringPtr, "s=") && !longCommandWArg(&stringPtr, "selectivity=")) { return 0; }
+    *selectivity = readU32FromChar(&stringPtr);
+    if (stringPtr[0] != 0) return 0;
+    DISPLAYLEVEL(4, "legacy: selectivity=%u\n", *selectivity);
     return 1;
 }
+
+static COVER_params_t defaultCoverParams(void)
+{   /* cover defaults : every field zeroed, except d=8 and steps=4 */
+    COVER_params_t defaults;
+    memset(&defaults, 0, sizeof(defaults));
+    defaults.steps = 4;
+    defaults.d = 8;
+    return defaults;
+}
 #endif
 
 
@@ -270,6 +318,7 @@ int main(int argCount, const char* argv[])
 {
     int argNb,
         forceStdout=0,
+        followLinks=0,
         main_pause=0,
         nextEntryIsDictionary=0,
         operationResult=0,
@@ -305,8 +354,8 @@ int main(int argCount, const char* argv[])
     unsigned fileNamesNb;
 #endif
 #ifndef ZSTD_NODICT
-    COVER_params_t coverParams;
-    int cover = 0;
+    COVER_params_t coverParams = defaultCoverParams();
+    int cover = 1;
 #endif
 
     /* init */
@@ -316,21 +365,19 @@ int main(int argCount, const char* argv[])
     (void)memLimit;   /* not used when ZSTD_NODECOMPRESS set */
     if (filenameTable==NULL) { DISPLAY("zstd: %s \n", strerror(errno)); exit(1); }
     filenameTable[0] = stdinmark;
-    displayOut = stderr;
-    /* Pick out program name from path. Don't rely on stdlib because of conflicting behavior */
-    {   size_t pos;
-        for (pos = (int)strlen(programName); pos > 0; pos--) { if (programName[pos] == '/') { pos++; break; } }
-        programName += pos;
-    }
+    g_displayOut = stderr;
+
+    programName = lastNameFromPath(programName);
 
     /* preset behaviors */
-    if (!strcmp(programName, ZSTD_UNZSTD)) operation=zom_decompress;
-    if (!strcmp(programName, ZSTD_CAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; displayLevel=1; }
-    if (!strcmp(programName, ZSTD_GZ)) { suffix = GZ_EXTENSION; FIO_setCompressionType(FIO_gzipCompression); FIO_setRemoveSrcFile(1); }    /* behave like gzip */
-    if (!strcmp(programName, ZSTD_GUNZIP)) { operation=zom_decompress; FIO_setRemoveSrcFile(1); }                                          /* behave like gunzip */
-    if (!strcmp(programName, ZSTD_GZCAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; displayLevel=1; }  /* behave like gzcat */
-    if (!strcmp(programName, ZSTD_LZMA)) { suffix = LZMA_EXTENSION; FIO_setCompressionType(FIO_lzmaCompression); FIO_setRemoveSrcFile(1); }    /* behave like lzma */
-    if (!strcmp(programName, ZSTD_XZ)) { suffix = XZ_EXTENSION; FIO_setCompressionType(FIO_xzCompression); FIO_setRemoveSrcFile(1); }    /* behave like xz */
+    if (exeNameMatch(programName, ZSTD_ZSTDMT)) nbThreads=0;
+    if (exeNameMatch(programName, ZSTD_UNZSTD)) operation=zom_decompress;
+    if (exeNameMatch(programName, ZSTD_CAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; g_displayLevel=1; }
+    if (exeNameMatch(programName, ZSTD_GZ)) { suffix = GZ_EXTENSION; FIO_setCompressionType(FIO_gzipCompression); FIO_setRemoveSrcFile(1); }    /* behave like gzip */
+    if (exeNameMatch(programName, ZSTD_GUNZIP)) { operation=zom_decompress; FIO_setRemoveSrcFile(1); }                                          /* behave like gunzip */
+    if (exeNameMatch(programName, ZSTD_GZCAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; g_displayLevel=1; }  /* behave like gzcat */
+    if (exeNameMatch(programName, ZSTD_LZMA)) { suffix = LZMA_EXTENSION; FIO_setCompressionType(FIO_lzmaCompression); FIO_setRemoveSrcFile(1); }    /* behave like lzma */
+    if (exeNameMatch(programName, ZSTD_XZ)) { suffix = XZ_EXTENSION; FIO_setCompressionType(FIO_xzCompression); FIO_setRemoveSrcFile(1); }    /* behave like xz */
     memset(&compressionParams, 0, sizeof(compressionParams));
 
     /* command switches */
@@ -344,7 +391,7 @@ int main(int argCount, const char* argv[])
                 if (!filenameIdx) {
                     filenameIdx=1, filenameTable[0]=stdinmark;
                     outFileName=stdoutmark;
-                    displayLevel-=(displayLevel==2);
+                    g_displayLevel-=(g_displayLevel==2);
                     continue;
             }   }
 
@@ -357,12 +404,12 @@ int main(int argCount, const char* argv[])
                     if (!strcmp(argument, "--compress")) { operation=zom_compress; continue; }
                     if (!strcmp(argument, "--decompress")) { operation=zom_decompress; continue; }
                     if (!strcmp(argument, "--uncompress")) { operation=zom_decompress; continue; }
-                    if (!strcmp(argument, "--force")) { FIO_overwriteMode(); continue; }
-                    if (!strcmp(argument, "--version")) { displayOut=stdout; DISPLAY(WELCOME_MESSAGE); CLEAN_RETURN(0); }
-                    if (!strcmp(argument, "--help")) { displayOut=stdout; CLEAN_RETURN(usage_advanced(programName)); }
-                    if (!strcmp(argument, "--verbose")) { displayLevel++; continue; }
-                    if (!strcmp(argument, "--quiet")) { displayLevel--; continue; }
-                    if (!strcmp(argument, "--stdout")) { forceStdout=1; outFileName=stdoutmark; displayLevel-=(displayLevel==2); continue; }
+                    if (!strcmp(argument, "--force")) { FIO_overwriteMode(); forceStdout=1; followLinks=1; continue; }
+                    if (!strcmp(argument, "--version")) { g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); CLEAN_RETURN(0); }
+                    if (!strcmp(argument, "--help")) { g_displayOut=stdout; CLEAN_RETURN(usage_advanced(programName)); }
+                    if (!strcmp(argument, "--verbose")) { g_displayLevel++; continue; }
+                    if (!strcmp(argument, "--quiet")) { g_displayLevel--; continue; }
+                    if (!strcmp(argument, "--stdout")) { forceStdout=1; outFileName=stdoutmark; g_displayLevel-=(g_displayLevel==2); continue; }
                     if (!strcmp(argument, "--ultra")) { ultra=1; continue; }
                     if (!strcmp(argument, "--check")) { FIO_setChecksumFlag(2); continue; }
                     if (!strcmp(argument, "--no-check")) { FIO_setChecksumFlag(0); continue; }
@@ -370,8 +417,8 @@ int main(int argCount, const char* argv[])
                     if (!strcmp(argument, "--no-sparse")) { FIO_setSparseWrite(0); continue; }
                     if (!strcmp(argument, "--test")) { operation=zom_test; continue; }
                     if (!strcmp(argument, "--train")) { operation=zom_train; outFileName=g_defaultDictName; continue; }
-                    if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; lastCommand=1; continue; }
-                    if (!strcmp(argument, "--dictID")) { nextArgumentIsDictID=1; lastCommand=1; continue; }
+                    if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; lastCommand=1; continue; }  /* kept available for compatibility with old syntax ; will be removed one day */
+                    if (!strcmp(argument, "--dictID")) { nextArgumentIsDictID=1; lastCommand=1; continue; }  /* kept available for compatibility with old syntax ; will be removed one day */
                     if (!strcmp(argument, "--no-dictID")) { FIO_setDictIDFlag(0); continue; }
                     if (!strcmp(argument, "--keep")) { FIO_setRemoveSrcFile(0); continue; }
                     if (!strcmp(argument, "--rm")) { FIO_setRemoveSrcFile(1); continue; }
@@ -383,26 +430,40 @@ int main(int argCount, const char* argv[])
                     if (!strcmp(argument, "--format=lzma")) { suffix = LZMA_EXTENSION; FIO_setCompressionType(FIO_lzmaCompression);  continue; }
                     if (!strcmp(argument, "--format=xz")) { suffix = XZ_EXTENSION; FIO_setCompressionType(FIO_xzCompression);  continue; }
 #endif
+#ifdef ZSTD_LZ4COMPRESS
+                    if (!strcmp(argument, "--format=lz4")) { suffix = LZ4_EXTENSION; FIO_setCompressionType(FIO_lz4Compression);  continue; }
+#endif
 
                     /* long commands with arguments */
 #ifndef ZSTD_NODICT
-                    if (longCommandWArg(&argument, "--cover=")) {
-                      cover=1; if (!parseCoverParameters(argument, &coverParams)) CLEAN_RETURN(badusage(programName));
-                      continue;
-                    }
-                    if (longCommandWArg(&argument, "--optimize-cover")) {
-                      cover=2;
+                    if (longCommandWArg(&argument, "--train-cover")) {
+                      operation = zom_train;
+                      outFileName = g_defaultDictName;
+                      cover = 1;
                       /* Allow optional arguments following an = */
                       if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); }
                       else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
                       else if (!parseCoverParameters(argument, &coverParams)) { CLEAN_RETURN(badusage(programName)); }
                       continue;
                     }
+                    if (longCommandWArg(&argument, "--train-legacy")) {
+                      operation = zom_train;
+                      outFileName = g_defaultDictName;
+                      cover = 0;
+                      /* Allow optional arguments following an = */
+                      if (*argument == 0) { continue; }
+                      else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
+                      else if (!parseLegacyParameters(argument, &dictSelect)) { CLEAN_RETURN(badusage(programName)); }
+                      continue;
+                    }
 #endif
+                    if (longCommandWArg(&argument, "--threads=")) { nbThreads = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--memlimit=")) { memLimit = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--memory=")) { memLimit = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--memlimit-decompress=")) { memLimit = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--block-size=")) { blockSize = readU32FromChar(&argument); continue; }
+                    if (longCommandWArg(&argument, "--maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
+                    if (longCommandWArg(&argument, "--dictID=")) { dictID = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--zstd=")) { if (!parseCompressionParameters(argument, &compressionParams)) CLEAN_RETURN(badusage(programName)); continue; }
                     /* fall-through, will trigger bad_usage() later on */
                 }
@@ -424,9 +485,9 @@ int main(int argCount, const char* argv[])
                     switch(argument[0])
                     {
                         /* Display help */
-                    case 'V': displayOut=stdout; DISPLAY(WELCOME_MESSAGE); CLEAN_RETURN(0);   /* Version Only */
+                    case 'V': g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); CLEAN_RETURN(0);   /* Version Only */
                     case 'H':
-                    case 'h': displayOut=stdout; CLEAN_RETURN(usage_advanced(programName));
+                    case 'h': g_displayOut=stdout; CLEAN_RETURN(usage_advanced(programName));
 
                          /* Compress */
                     case 'z': operation=zom_compress; argument++; break;
@@ -445,19 +506,19 @@ int main(int argCount, const char* argv[])
                     case 'D': nextEntryIsDictionary = 1; lastCommand = 1; argument++; break;
 
                         /* Overwrite */
-                    case 'f': FIO_overwriteMode(); forceStdout=1; argument++; break;
+                    case 'f': FIO_overwriteMode(); forceStdout=1; followLinks=1; argument++; break;
 
                         /* Verbose mode */
-                    case 'v': displayLevel++; argument++; break;
+                    case 'v': g_displayLevel++; argument++; break;
 
                         /* Quiet mode */
-                    case 'q': displayLevel--; argument++; break;
+                    case 'q': g_displayLevel--; argument++; break;
 
-                        /* keep source file (default); for gzip/xz compatibility */
+                        /* keep source file (default) */
                     case 'k': FIO_setRemoveSrcFile(0); argument++; break;
 
                         /* Checksum */
-                    case 'C': argument++; FIO_setChecksumFlag(2); break;
+                    case 'C': FIO_setChecksumFlag(2); argument++; break;
 
                         /* test compressed file */
                     case 't': operation=zom_test; argument++; break;
@@ -532,14 +593,14 @@ int main(int argCount, const char* argv[])
                 continue;
             }   /* if (argument[0]=='-') */
 
-            if (nextArgumentIsMaxDict) {
+            if (nextArgumentIsMaxDict) {  /* kept available for compatibility with old syntax ; will be removed one day */
                 nextArgumentIsMaxDict = 0;
                 lastCommand = 0;
                 maxDictSize = readU32FromChar(&argument);
                 continue;
             }
 
-            if (nextArgumentIsDictID) {
+            if (nextArgumentIsDictID) {  /* kept available for compatibility with old syntax ; will be removed one day */
                 nextArgumentIsDictID = 0;
                 lastCommand = 0;
                 dictID = readU32FromChar(&argument);
@@ -581,9 +642,27 @@ int main(int argCount, const char* argv[])
     DISPLAYLEVEL(4, "PLATFORM_POSIX_VERSION defined: %ldL\n", (long) PLATFORM_POSIX_VERSION);
 #endif
 
+    if (nbThreads == 0) {
+        /* try to guess */
+        nbThreads = UTIL_countPhysicalCores();
+        DISPLAYLEVEL(3, "Note: %d physical core(s) detected\n", nbThreads);
+    }
+
+    g_utilDisplayLevel = g_displayLevel;
+    if (!followLinks) {
+        unsigned u;
+        for (u=0, fileNamesNb=0; u<filenameIdx; u++) {
+            if (UTIL_isLink(filenameTable[u])) {
+                DISPLAYLEVEL(2, "Warning : %s is a symbolic link, ignoring\n", filenameTable[u]);
+            } else {
+                filenameTable[fileNamesNb++] = filenameTable[u];
+            }
+        }
+        filenameIdx = fileNamesNb;
+    }
 #ifdef UTIL_HAS_CREATEFILELIST
     if (recursive) {  /* at this stage, filenameTable is a list of paths, which can contain both files and directories */
-        extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb);
+        extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks);
         if (extendedFileList) {
             unsigned u;
             for (u=0; u<fileNamesNb; u++) DISPLAYLEVEL(4, "%u %s\n", u, extendedFileList[u]);
@@ -597,7 +676,7 @@ int main(int argCount, const char* argv[])
     /* Check if benchmark is selected */
     if (operation==zom_bench) {
 #ifndef ZSTD_NOBENCH
-        BMK_setNotificationLevel(displayLevel);
+        BMK_setNotificationLevel(g_displayLevel);
         BMK_setBlockSize(blockSize);
         BMK_setNbThreads(nbThreads);
         BMK_setNbSeconds(bench_nbSeconds);
@@ -611,19 +690,20 @@ int main(int argCount, const char* argv[])
     if (operation==zom_train) {
 #ifndef ZSTD_NODICT
         if (cover) {
+            int const optimize = !coverParams.k || !coverParams.d;
             coverParams.nbThreads = nbThreads;
             coverParams.compressionLevel = dictCLevel;
-            coverParams.notificationLevel = displayLevel;
+            coverParams.notificationLevel = g_displayLevel;
             coverParams.dictID = dictID;
-            DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, optimize);
         } else {
             ZDICT_params_t dictParams;
             memset(&dictParams, 0, sizeof(dictParams));
             dictParams.compressionLevel = dictCLevel;
             dictParams.selectivityLevel = dictSelect;
-            dictParams.notificationLevel = displayLevel;
+            dictParams.notificationLevel = g_displayLevel;
             dictParams.dictID = dictID;
-            DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
         }
 #endif
         goto _end;
@@ -635,7 +715,7 @@ int main(int argCount, const char* argv[])
 
     /* Check if input/output defined as console; trigger an error in this case */
     if (!strcmp(filenameTable[0], stdinmark) && IS_CONSOLE(stdin) ) CLEAN_RETURN(badusage(programName));
-    if (outFileName && !strcmp(outFileName, stdoutmark) && IS_CONSOLE(stdout) && strcmp(filenameTable[0], stdinmark) && !(forceStdout && (operation==zom_decompress)))
+    if (outFileName && !strcmp(outFileName, stdoutmark) && IS_CONSOLE(stdout) && !strcmp(filenameTable[0], stdinmark) && !forceStdout && operation!=zom_decompress)
         CLEAN_RETURN(badusage(programName));
 
     /* user-selected output filename, only possible with a single file */
@@ -654,11 +734,11 @@ int main(int argCount, const char* argv[])
 #endif
 
     /* No status message in pipe mode (stdin - stdout) or multi-files mode */
-    if (!strcmp(filenameTable[0], stdinmark) && outFileName && !strcmp(outFileName,stdoutmark) && (displayLevel==2)) displayLevel=1;
-    if ((filenameIdx>1) & (displayLevel==2)) displayLevel=1;
+    if (!strcmp(filenameTable[0], stdinmark) && outFileName && !strcmp(outFileName,stdoutmark) && (g_displayLevel==2)) g_displayLevel=1;
+    if ((filenameIdx>1) & (g_displayLevel==2)) g_displayLevel=1;
 
     /* IO Stream/File */
-    FIO_setNotificationLevel(displayLevel);
+    FIO_setNotificationLevel(g_displayLevel);
     if (operation==zom_compress) {
 #ifndef ZSTD_NOCOMPRESS
         FIO_setNbThreads(nbThreads);