From ffcbc2d7ba03067492045e4cbead519a3b3c27ef Mon Sep 17 00:00:00 2001
From: Baptiste Daroussin <bapt@FreeBSD.org>
Date: Sat, 6 May 2017 10:17:59 +0000
Subject: Import Zstandard 1.2.0 Among changes: threading support is now
 default and supports freebsd

---
 .gitignore                                  |  41 --
 .travis.yml                                 |  51 --
 Makefile                                    |  32 +-
 NEWS                                        |  32 +-
 README.md                                   |  13 +-
 appveyor.yml                                | 271 +++++++----
 circle.yml                                  |   2 +-
 contrib/cleanTabs                           |   2 +
 contrib/pzstd/Options.cpp                   |  22 +-
 contrib/pzstd/utils/test/ThreadPoolTest.cpp |   6 +-
 contrib/pzstd/utils/test/WorkQueueTest.cpp  |   7 +
 doc/educational_decoder/zstd_decompress.c   | 243 +++++-----
 doc/images/Cspeed4.png                      | Bin 35361 -> 71276 bytes
 doc/images/Dspeed4.png                      | Bin 8984 -> 24692 bytes
 doc/images/dict-cr.png                      | Bin 23323 -> 90047 bytes
 doc/images/dict-cs.png                      | Bin 25052 -> 93837 bytes
 doc/images/dict-ds.png                      | Bin 27053 -> 89590 bytes
 doc/zstd_compression_format.md              | 387 ++++++++-------
 doc/zstd_manual.html                        | 157 +++---
 examples/simple_compression.c               |   3 +-
 examples/streaming_compression.c            |   3 +-
 examples/streaming_decompression.c          |   3 +-
 lib/Makefile                                |  12 +-
 lib/README.md                               |   8 +
 lib/common/bitstream.h                      |  83 ++--
 lib/common/error_private.c                  |   3 +-
 lib/common/fse.h                            |  10 +-
 lib/common/huf.h                            |  61 ++-
 lib/common/mem.h                            |   5 +-
 lib/common/zstd_errors.h                    |   1 +
 lib/common/zstd_internal.h                  |   5 +-
 lib/compress/fse_compress.c                 |  20 +-
 lib/compress/zstd_compress.c                | 699 ++++++++++++++++----------
 lib/compress/zstd_opt.h                     |  10 +-
 lib/compress/zstdmt_compress.c              |  68 +--
 lib/decompress/zstd_decompress.c            | 334 +++++--------
 lib/dictBuilder/cover.c                     |  47 +-
 lib/dictBuilder/zdict.c                     | 108 +++--
 lib/dictBuilder/zdict.h                     |  14 +-
 lib/legacy/zstd_v01.c                       |   2 +-
 lib/legacy/zstd_v02.c                       |  42 +-
 lib/legacy/zstd_v03.c                       |  40 +-
 lib/legacy/zstd_v04.c                       |   8 +-
 lib/legacy/zstd_v05.c                       |   4 +-
 lib/legacy/zstd_v06.c                       |   4 +-
 lib/zstd.h                                  | 156 +++---
 programs/Makefile                           | 102 +++-
 programs/README.md                          |  50 +-
 programs/bench.c                            |  41 +-
 programs/dibio.c                            |   7 +-
 programs/fileio.c                           | 271 +++++++++--
 programs/fileio.h                           |   3 +-
 programs/platform.h                         |  19 +-
 programs/util.h                             | 245 +++++++++-
 programs/zstd.1                             | 726 +++++++++++++---------------
 programs/zstd.1.md                          | 343 +++++++++++++
 programs/zstdcli.c                          | 220 ++++++---
 tests/Makefile                              |  22 +-
 tests/decodecorpus.c                        |  60 +--
 tests/fullbench.c                           |   4 +-
 tests/fuzzer.c                              | 264 +++++++---
 tests/paramgrill.c                          |  82 ++--
 tests/playTests.sh                          | 168 ++++++-
 tests/test-zstd-speed.py                    |  18 +-
 tests/zbufftest.c                           |   2 +-
 tests/zstreamtest.c                         | 112 ++++-
 zlibWrapper/examples/zwrapbench.c           |  16 +-
 67 files changed, 3710 insertions(+), 2084 deletions(-)
 delete mode 100644 .gitignore
 delete mode 100644 .travis.yml
 create mode 100755 contrib/cleanTabs
 create mode 100644 programs/zstd.1.md

diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index e02119883a7d..000000000000
--- a/.gitignore
+++ /dev/null
@@ -1,41 +0,0 @@
-# Object files
-*.o
-*.ko
-*.dSYM
-
-# Libraries
-*.lib
-*.a
-
-# Shared objects (inc. Windows DLLs)
-*.dll
-*.so
-*.so.*
-*.dylib
-
-# Executables
-zstd
-zstdmt
-*.exe
-*.out
-*.app
-
-# Test artefacts
-tmp*
-dictionary*
-
-# Other files
-.directory
-_codelite/
-_zstdbench/
-.clang_complete
-*.idea
-*.swp
-.DS_Store
-googletest/
-*.d
-
-# Directories
-bin/
-.buckd/
-buck-out/
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 3f98ea04dd75..000000000000
--- a/.travis.yml
+++ /dev/null
@@ -1,51 +0,0 @@
-# Long tests: run on commits to master branch/cron builds
-
-language: c
-sudo: required
-dist: trusty
-matrix:
-  include:
-    # Ubuntu 14.04
-    - env: Cmd='make gcc6install && CC=gcc-6 make clean uasan-test'
-    - env: Cmd='make gcc6install libc6install && CC=gcc-6 make clean uasan-test32'
-    - env: Cmd='make clang38install && CC=clang-3.8 make clean msan-test'
-    - env: Cmd='make clang38install && CC=clang-3.8 make clean tsan-test-zstream'
-    - env: Cmd='make valgrindinstall && make -C tests clean valgrindTest'
-
-    - env: Cmd='make arminstall && make armtest'
-    - env: Cmd='make arminstall && make aarch64test'
-    - env: Cmd='make ppcinstall && make ppctest'
-    - env: Cmd='make ppcinstall && make ppc64test'
-
-
-    - env: Cmd='make gpp6install valgrindinstall && make -C zlibWrapper test && make -C zlibWrapper valgrindTest'
-    - env: Cmd='make -C tests versionsTest'
-    - env: Cmd='make gpp6install && cd contrib/pzstd && make test-pzstd && make test-pzstd32 && make test-pzstd-tsan && make test-pzstd-asan'
-      install:
-        - export CXX="g++-6" CC="gcc-6"
-    - env: Cmd='make gcc6install && CC=gcc-6 make uasan-test-zstd-nolegacy'
-    - env: Cmd='make gcc6install && CC=gcc-6 make uasan-test-zbuff'
-
-    # OS X Mavericks
-    - env: Cmd="make gnu90build && make clean && make test && make clean && make travis-install"
-      os: osx
-
-git:
-  depth: 1
-
-branches:
-  only:
-  - dev
-  - master
-
-script:
-  - JOB_NUMBER=$(echo $TRAVIS_JOB_NUMBER | sed -e 's:[0-9][0-9]*\.\(.*\):\1:')
-  - echo JOB_NUMBER=$JOB_NUMBER TRAVIS_BRANCH=$TRAVIS_BRANCH TRAVIS_EVENT_TYPE=$TRAVIS_EVENT_TYPE TRAVIS_PULL_REQUEST=$TRAVIS_PULL_REQUEST
-  - export FUZZERTEST=-T5mn;
-    export ZSTREAM_TESTTIME=-T5mn;
-    export DECODECORPUS_TESTTIME=-T1mn;
-    if [ "$TRAVIS_EVENT_TYPE" = "cron" ]; then
-        git fetch origin dev;
-        git checkout -f FETCH_HEAD;
-    fi;
-    sh -c "$Cmd" || travis_terminate 1;
diff --git a/Makefile b/Makefile
index 49f29d782276..54652665bc76 100644
--- a/Makefile
+++ b/Makefile
@@ -90,6 +90,10 @@ examples:
 manual:
 	$(MAKE) -C contrib/gen_html $@
 
+.PHONY: cleanTabs
+cleanTabs:
+	cd contrib; ./cleanTabs
+
 .PHONY: clean
 clean:
 	@$(MAKE) -C $(ZSTDDIR) $@ > $(VOID)
@@ -105,9 +109,15 @@ clean:
 # make install is validated only for Linux, OSX, Hurd and some BSD targets
 #------------------------------------------------------------------------------
 ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU FreeBSD DragonFly NetBSD))
+
 HOST_OS = POSIX
-.PHONY: install uninstall travis-install clangtest gpptest armtest usan asan uasan
+CMAKE_PARAMS = -DZSTD_BUILD_CONTRIB:BOOL=ON -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON -DZSTD_ZLIB_SUPPORT:BOOL=ON -DZSTD_LZMA_SUPPORT:BOOL=ON
 
+.PHONY: list
+list:
+	@$(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null | awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' | sort | egrep -v -e '^[^[:alnum:]]' -e '^$@$$' | xargs
+
+.PHONY: install uninstall travis-install clangtest gpptest armtest usan asan uasan
 install:
 	@$(MAKE) -C $(ZSTDDIR) $@
 	@$(MAKE) -C $(PRGDIR) $@
@@ -152,16 +162,16 @@ ppc64build: clean
 	CC=powerpc-linux-gnu-gcc CFLAGS="-m64 -Werror" $(MAKE) allarch
 
 armfuzz: clean
-	CC=arm-linux-gnueabi-gcc QEMU_SYS=qemu-arm-static MOREFLAGS="-static" $(MAKE) -C $(TESTDIR) fuzztest
+	CC=arm-linux-gnueabi-gcc QEMU_SYS=qemu-arm-static MOREFLAGS="-static" FUZZER_FLAGS=--no-big-tests $(MAKE) -C $(TESTDIR) fuzztest
 
 aarch64fuzz: clean
-	CC=aarch64-linux-gnu-gcc QEMU_SYS=qemu-aarch64-static MOREFLAGS="-static" $(MAKE) -C $(TESTDIR) fuzztest
+	CC=aarch64-linux-gnu-gcc QEMU_SYS=qemu-aarch64-static MOREFLAGS="-static" FUZZER_FLAGS=--no-big-tests $(MAKE) -C $(TESTDIR) fuzztest
 
 ppcfuzz: clean
-	CC=powerpc-linux-gnu-gcc QEMU_SYS=qemu-ppc-static MOREFLAGS="-static" $(MAKE) -C $(TESTDIR) fuzztest
+	CC=powerpc-linux-gnu-gcc QEMU_SYS=qemu-ppc-static MOREFLAGS="-static" FUZZER_FLAGS=--no-big-tests $(MAKE) -C $(TESTDIR) fuzztest
 
 ppc64fuzz: clean
-	CC=powerpc-linux-gnu-gcc QEMU_SYS=qemu-ppc64-static MOREFLAGS="-m64 -static" $(MAKE) -C $(TESTDIR) fuzztest
+	CC=powerpc-linux-gnu-gcc QEMU_SYS=qemu-ppc64-static MOREFLAGS="-m64 -static" FUZZER_FLAGS=--no-big-tests $(MAKE) -C $(TESTDIR) fuzztest
 
 gpptest: clean
 	CC=g++ $(MAKE) -C $(PRGDIR) all CFLAGS="-O3 -Wall -Wextra -Wundef -Wshadow -Wcast-align -Werror"
@@ -180,19 +190,19 @@ clangtest: clean
 
 armtest: clean
 	$(MAKE) -C $(TESTDIR) datagen   # use native, faster
-	$(MAKE) -C $(TESTDIR) test CC=arm-linux-gnueabi-gcc QEMU_SYS=qemu-arm-static ZSTDRTTEST= MOREFLAGS="-Werror -static"
+	$(MAKE) -C $(TESTDIR) test CC=arm-linux-gnueabi-gcc QEMU_SYS=qemu-arm-static ZSTDRTTEST= MOREFLAGS="-Werror -static" FUZZER_FLAGS=--no-big-tests
 
 aarch64test:
 	$(MAKE) -C $(TESTDIR) datagen   # use native, faster
-	$(MAKE) -C $(TESTDIR) test CC=aarch64-linux-gnu-gcc QEMU_SYS=qemu-aarch64-static ZSTDRTTEST= MOREFLAGS="-Werror -static"
+	$(MAKE) -C $(TESTDIR) test CC=aarch64-linux-gnu-gcc QEMU_SYS=qemu-aarch64-static ZSTDRTTEST= MOREFLAGS="-Werror -static" FUZZER_FLAGS=--no-big-tests
 
 ppctest: clean
 	$(MAKE) -C $(TESTDIR) datagen   # use native, faster
-	$(MAKE) -C $(TESTDIR) test CC=powerpc-linux-gnu-gcc QEMU_SYS=qemu-ppc-static ZSTDRTTEST= MOREFLAGS="-Werror -Wno-attributes -static"
+	$(MAKE) -C $(TESTDIR) test CC=powerpc-linux-gnu-gcc QEMU_SYS=qemu-ppc-static ZSTDRTTEST= MOREFLAGS="-Werror -Wno-attributes -static" FUZZER_FLAGS=--no-big-tests
 
 ppc64test: clean
 	$(MAKE) -C $(TESTDIR) datagen   # use native, faster
-	$(MAKE) -C $(TESTDIR) test CC=powerpc-linux-gnu-gcc QEMU_SYS=qemu-ppc64-static ZSTDRTTEST= MOREFLAGS="-m64 -static"
+	$(MAKE) -C $(TESTDIR) test CC=powerpc-linux-gnu-gcc QEMU_SYS=qemu-ppc64-static ZSTDRTTEST= MOREFLAGS="-m64 -static" FUZZER_FLAGS=--no-big-tests
 
 arm-ppc-compilation:
 	$(MAKE) -C $(PRGDIR) clean zstd CC=arm-linux-gnueabi-gcc QEMU_SYS=qemu-arm-static ZSTDRTTEST= MOREFLAGS="-Werror -static"
@@ -263,7 +273,7 @@ endif
 
 ifneq (,$(filter MSYS%,$(shell uname)))
 HOST_OS = MSYS
-CMAKE_PARAMS = -G"MSYS Makefiles"
+CMAKE_PARAMS = -G"MSYS Makefiles" -DZSTD_MULTITHREAD_SUPPORT:BOOL=OFF -DZSTD_BUILD_STATIC:BOOL=ON -DZSTD_BUILD_TESTS:BOOL=ON
 endif
 
 
@@ -275,7 +285,7 @@ cmakebuild:
 	cmake --version
 	$(RM) -r $(BUILDIR)/cmake/build
 	mkdir $(BUILDIR)/cmake/build
-	cd $(BUILDIR)/cmake/build ; cmake -DPREFIX:STRING=~/install_test_dir $(CMAKE_PARAMS) .. ; $(MAKE) install ; $(MAKE) uninstall
+	cd $(BUILDIR)/cmake/build ; cmake -DCMAKE_INSTALL_PREFIX:PATH=~/install_test_dir $(CMAKE_PARAMS) .. ; $(MAKE) install ; $(MAKE) uninstall
 
 c90build: clean
 	gcc -v
diff --git a/NEWS b/NEWS
index bbd9e1688386..7d9c9c94e186 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,25 @@
+v1.2.0
+cli : changed : Multithreading enabled by default (use target zstd-nomt or HAVE_THREAD=0 to disable)
+cli : new : command -T0 means "detect and use nb of cores", by Sean Purcell
+cli : new : zstdmt symlink hardwired to `zstd -T0`
+cli : new : command --threads=# (#671)
+cli : changed : cover dictionary builder by default, for improved quality, by Nick Terrell
+cli : new : commands --train-cover and --train-legacy, to select dictionary algorithm and parameters
+cli : experimental targets `zstd4` and `xzstd4`, with support for lz4 format, by Sean Purcell
+cli : fix : does not output compressed data on console
+cli : fix : ignore symbolic links unless --force specified,
+API : breaking change : ZSTD_createCDict_advanced(), only use compressionParameters as argument
+API : added : prototypes ZSTD_*_usingCDict_advanced(), for direct control over frameParameters.
+API : improved: ZSTDMT_compressCCtx() reduced memory usage
+API : fix : ZSTDMT_compressCCtx() now provides srcSize in header (#634)
+API : fix : src size stored in frame header is controlled at end of frame
+API : fix : enforced consistent rules for pledgedSrcSize==0 (#641)
+API : fix : error code "GENERIC" replaced by "dstSizeTooSmall" when appropriate
+build: improved cmake script, by @Majlen
+build: enabled Multi-threading support for *BSD, by Baptiste Daroussin
+tools: updated Paramgrill. Command -O# provides best parameters for sample and speed target.
+new : contrib/linux-kernel version, by Nick Terrell
+
 v1.1.4
 cli : new : can compress in *.gz format, using --format=gzip command, by Przemyslaw Skibinski
 cli : new : advanced benchmark command --priority=rt
@@ -5,13 +27,13 @@ cli : fix : write on sparse-enabled file systems in 32-bits mode, by @ds77
 cli : fix : --rm remains silent when input is stdin
 cli : experimental : xzstd, with support for xz/lzma decoding, by Przemyslaw Skibinski
 speed : improved decompression speed in streaming mode for single shot scenarios (+5%)
-memory : DDict (decompression dictionary) memory usage down from 150 KB to 20 KB
-arch : 32-bits variant able to generate and decode very long matches (>32 MB), by Sean Purcell
+memory: DDict (decompression dictionary) memory usage down from 150 KB to 20 KB
+arch: 32-bits variant able to generate and decode very long matches (>32 MB), by Sean Purcell
 API : new : ZSTD_findFrameCompressedSize(), ZSTD_getFrameContentSize(), ZSTD_findDecompressedSize()
 API : changed : dropped support of legacy versions <= v0.3 (can be changed by modifying ZSTD_LEGACY_SUPPORT value)
-build: new: meson build system in contrib/meson, by Dima Krasner
-build: improved cmake script, by @Majlen
-build: added -Wformat-security flag, as recommended by Padraig Brady
+build : new: meson build system in contrib/meson, by Dima Krasner
+build : improved cmake script, by @Majlen
+build : added -Wformat-security flag, as recommended by Padraig Brady
 doc : new : educational decoder, by Sean Purcell
 
 v1.1.3
diff --git a/README.md b/README.md
index 6de5a10790db..7caee5fd3f67 100644
--- a/README.md
+++ b/README.md
@@ -12,13 +12,13 @@ you can consult a list of known ports on [Zstandard homepage](http://www.zstd.ne
 |dev         | [![Build Status](https://travis-ci.org/facebook/zstd.svg?branch=dev)](https://travis-ci.org/facebook/zstd) |
 
 As a reference, several fast compression algorithms were tested and compared
-on a server running Linux Mint Debian Edition (`Linux version 4.8.0-1-amd64`),
+on a server running Linux Debian (`Linux version 4.8.0-1-amd64`),
 with a Core i7-6700K CPU @ 4.0GHz,
-using [lzbench v1.6], an open-source in-memory benchmark by @inikep
+using [lzbench], an open-source in-memory benchmark by @inikep
 compiled with GCC 6.3.0,
 on the [Silesia compression corpus].
 
-[lzbench v1.6]: https://github.com/inikep/lzbench
+[lzbench]: https://github.com/inikep/lzbench
 [Silesia compression corpus]: http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
 
 | Compressor name         | Ratio | Compression| Decompress.|
@@ -38,7 +38,12 @@ on the [Silesia compression corpus].
 Zstd can also offer stronger compression ratios at the cost of compression speed.
 Speed vs Compression trade-off is configurable by small increments. Decompression speed is preserved and remains roughly the same at all settings, a property shared by most LZ compression algorithms, such as [zlib] or lzma.
 
-The following tests were run on a Core i7-3930K CPU @ 4.5GHz, using [lzbench], an open-source in-memory benchmark by @inikep compiled with GCC 5.2.1, on the [Silesia compression corpus].
+The following tests were run
+on a server running Linux Debian (`Linux version 4.8.0-1-amd64`)
+with a Core i7-6700K CPU @ 4.0GHz,
+using [lzbench], an open-source in-memory benchmark by @inikep
+compiled with GCC 6.3.0,
+on the [Silesia compression corpus].
 
 Compression Speed vs Ratio | Decompression Speed
 ---------------------------|--------------------
diff --git a/appveyor.yml b/appveyor.yml
index 9507fec6e24c..67c4f72d7995 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,66 +1,103 @@
-version: 1.0.{build}
-environment:
-  matrix:
-  - COMPILER: "gcc"
-    PLATFORM: "mingw64"
-    MAKE_PARAMS: '"make test && make lib && make -C tests test-symbols fullbench-dll fullbench-lib"'
-  - COMPILER: "gcc"
-    PLATFORM: "mingw32"
-    MAKE_PARAMS: '"make -C tests test-zstd test-fullbench test-fuzzer test-invalidDictionaries"'
-  - COMPILER: "gcc"
-    PLATFORM: "clang"
-    MAKE_PARAMS: '"make -C tests zstd fullbench fuzzer paramgrill datagen CC=clang MOREFLAGS="--target=x86_64-w64-mingw32 -Werror -Wconversion -Wno-sign-conversion""'
-  - COMPILER: "visual"
-    CONFIGURATION: "Debug"
-    PLATFORM: "x64"
-  - COMPILER: "visual"
-    CONFIGURATION: "Debug"
-    PLATFORM: "Win32"
-  - COMPILER: "visual"
-    CONFIGURATION: "Release"
-    PLATFORM: "x64"
-  - COMPILER: "visual"
-    CONFIGURATION: "Release"
-    PLATFORM: "Win32"
+-
+  version: 1.0.{build}
+  branches:
+    only:
+    - dev
+    - master
+  environment:
+    matrix:
+    - COMPILER: "gcc"
+      HOST:     "mingw"
+      PLATFORM: "x64"
+      SCRIPT:   "make allarch && make -C tests test-symbols fullbench-dll fullbench-lib"
+      ARTIFACT: "true"
+      BUILD:    "true"
+    - COMPILER: "gcc"
+      HOST:     "mingw"
+      PLATFORM: "x86"
+      SCRIPT:   "make allarch"
+      ARTIFACT: "true"
+      BUILD:    "true"
+    - COMPILER: "clang"
+      HOST:     "mingw"
+      PLATFORM: "x64"
+      SCRIPT:   "MOREFLAGS='--target=x86_64-w64-mingw32 -Werror -Wconversion -Wno-sign-conversion' make allarch"
+      BUILD:    "true"
 
-install:
+    - COMPILER: "gcc"
+      HOST:     "mingw"
+      PLATFORM: "x64"
+      SCRIPT:   ""
+      TEST:     "cmake"
+
+    - COMPILER: "gcc"
+      HOST:     "mingw"
+      PLATFORM: "x64"
+      SCRIPT:   ""
+      TEST:     "pzstd"
+
+    - COMPILER: "visual"
+      HOST:     "visual"
+      PLATFORM: "x64"
+      CONFIGURATION: "Debug"
+    - COMPILER: "visual"
+      HOST:     "visual"
+      PLATFORM: "Win32"
+      CONFIGURATION: "Debug"
+    - COMPILER: "visual"
+      HOST:     "visual"
+      PLATFORM: "x64"
+      CONFIGURATION: "Release"
+    - COMPILER: "visual"
+      HOST:     "visual"
+      PLATFORM: "Win32"
+      CONFIGURATION: "Release"
+
+  install:
   - ECHO Installing %COMPILER% %PLATFORM% %CONFIGURATION%
-  - MKDIR bin
-  - if [%COMPILER%]==[gcc] SET PATH_ORIGINAL=%PATH%
-  - if [%COMPILER%]==[gcc] (
-      SET "PATH_MINGW32=c:\MinGW\bin;c:\MinGW\usr\bin" &&
-      SET "PATH_MINGW64=c:\msys64\mingw64\bin;c:\msys64\usr\bin" &&
-      COPY C:\msys64\usr\bin\make.exe C:\MinGW\bin\make.exe &&
-      COPY C:\MinGW\bin\gcc.exe C:\MinGW\bin\cc.exe
-    ) else (
-      IF [%PLATFORM%]==[x64] (SET ADDITIONALPARAM=/p:LibraryPath="C:\Program Files\Microsoft SDKs\Windows\v7.1\lib\x64;c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\lib\amd64;C:\Program Files (x86)\Microsoft Visual Studio 10.0\;C:\Program Files (x86)\Microsoft Visual Studio 10.0\lib\amd64;")
+  - SET PATH_ORIGINAL=%PATH%
+  - if [%HOST%]==[mingw] (
+      SET "PATH_MINGW32=C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw32\bin" &&
+      SET "PATH_MINGW64=C:\mingw-w64\x86_64-6.3.0-posix-seh-rt_v5-rev1\mingw64\bin" &&
+      COPY C:\msys64\usr\bin\make.exe C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw32\bin\make.exe &&
+      COPY C:\msys64\usr\bin\make.exe C:\mingw-w64\x86_64-6.3.0-posix-seh-rt_v5-rev1\mingw64\bin\make.exe
     )
-
-build_script:
-  - ECHO Building %COMPILER% %PLATFORM% %CONFIGURATION%
-  - if [%PLATFORM%]==[mingw32] SET PATH=%PATH_MINGW32%;%PATH_ORIGINAL%
-  - if [%PLATFORM%]==[mingw64] SET PATH=%PATH_MINGW64%;%PATH_ORIGINAL%
-  - if [%PLATFORM%]==[clang] SET PATH=%PATH_MINGW64%;%PATH_ORIGINAL%
-  - if [%COMPILER%]==[gcc] (
-      ECHO *** &&
-      ECHO *** Building %PLATFORM% &&
-      ECHO *** &&
-      make -v &&
-      cc -v &&
-      ECHO %MAKE_PARAMS% &&
-      sh -c %MAKE_PARAMS%
+  - IF [%HOST%]==[visual] IF [%PLATFORM%]==[x64] (
+      SET ADDITIONALPARAM=/p:LibraryPath="C:\Program Files\Microsoft SDKs\Windows\v7.1\lib\x64;c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\lib\amd64;C:\Program Files (x86)\Microsoft Visual Studio 10.0\;C:\Program Files (x86)\Microsoft Visual Studio 10.0\lib\amd64;"
     )
-  - if [%PLATFORM%]==[clang] COPY tests\fuzzer.exe tests\fuzzer_clang.exe
-  - if [%COMPILER%]==[gcc] if [%PLATFORM%]==[mingw64] (
-      COPY programs\zstd.exe bin\zstd.exe &&
-      appveyor PushArtifact bin\zstd.exe
+
+  build_script:
+  - if [%HOST%]==[mingw] (
+      ( if [%PLATFORM%]==[x64] (
+        SET "PATH=%PATH_MINGW64%;%PATH_ORIGINAL%"
+      ) else if [%PLATFORM%]==[x86] (
+        SET "PATH=%PATH_MINGW32%;%PATH_ORIGINAL%"
+      ) )
     )
-  - if [%COMPILER%]==[gcc] if [%PLATFORM%]==[mingw32] (
-      COPY programs\zstd.exe bin\zstd32.exe &&
-      appveyor PushArtifact bin\zstd32.exe
+  - if [%HOST%]==[mingw] if [%BUILD%]==[true] (
+      make -v &&
+      sh -c "%COMPILER% -v" &&
+      ECHO Building zlib to static link &&
+      SET "CC=%COMPILER%" &&
+      sh -c "cd .. && git clone --depth 1 --branch v1.2.11 https://github.com/madler/zlib" &&
+      sh -c "cd ../zlib && make -f win32/Makefile.gcc libz.a"
+      ECHO Building zstd &&
+      SET "CPPFLAGS=-I../../zlib" &&
+      SET "LDFLAGS=../../zlib/libz.a" &&
+      sh -c "%SCRIPT%" &&
+      ( if [%COMPILER%]==[gcc] if [%ARTIFACT%]==[true]
+          lib\dll\example\build_package.bat &&
+          make -C programs DEBUGFLAGS= clean zstd &&
+          cp programs\zstd.exe zstd_%PLATFORM%.exe &&
+          appveyor PushArtifact zstd_%PLATFORM%.exe &&
+          cp programs\zstd.exe bin\zstd.exe &&
+          make -C programs DEBUGFLAGS= clean zstdmt &&
+          cp programs\zstd.exe bin\zstdmt.exe &&
+          cd bin\ && 7z a -tzip zstd-win-release-%PLATFORM%.zip * &&
+          appveyor PushArtifact zstd-win-release-%PLATFORM%.zip
+      )
     )
-  - if [%COMPILER%]==[gcc] make clean
-  - if [%COMPILER%]==[visual] (
+  - if [%HOST%]==[visual] (
       ECHO *** &&
       ECHO *** Building Visual Studio 2008 %PLATFORM%\%CONFIGURATION% in %APPVEYOR_BUILD_FOLDER% &&
       ECHO *** &&
@@ -111,29 +148,26 @@ build_script:
       COPY build\VS2010\bin\%PLATFORM%_%CONFIGURATION%\*.exe tests\
     )
 
-test_script:
+  test_script:
   - ECHO Testing %COMPILER% %PLATFORM% %CONFIGURATION%
-  - SET FUZZERTEST=-T1mn
-  - if [%COMPILER%]==[gcc] if [%PLATFORM%]==[clang] (
-      tests\fuzzer_clang.exe %FUZZERTEST% &&
-      ECHO *** &&
-      ECHO *** Building cmake for %PLATFORM% &&
-      ECHO *** &&
+  - SET "CC=gcc"
+  - SET "CXX=g++"
+  - if [%TEST%]==[cmake] (
       mkdir build\cmake\build &&
       cd build\cmake\build &&
       cmake -G "Visual Studio 14 2015 Win64" .. &&
       cd ..\..\.. &&
-      make clean &&
-      ECHO *** &&
-      ECHO *** Building pzstd for %PLATFORM% &&
-      ECHO *** &&
+      make clean
+    )
+  - if [%TEST%]==[pzstd] (
       make -C contrib\pzstd googletest-mingw64 &&
       make -C contrib\pzstd pzstd.exe &&
       make -C contrib\pzstd tests &&
       make -C contrib\pzstd check &&
       make -C contrib\pzstd clean
     )
-  - if [%COMPILER%]==[visual] if [%CONFIGURATION%]==[Release] (
+  - SET "FUZZERTEST=-T30s"
+  - if [%HOST%]==[visual] if [%CONFIGURATION%]==[Release] (
       CD tests &&
       SET ZSTD=./zstd.exe &&
       sh -e playTests.sh --test-large-data &&
@@ -146,33 +180,76 @@ test_script:
       fuzzer_VS2015_%PLATFORM%_Release.exe %FUZZERTEST%
     )
 
-branches:
-  only:
-  - dev
-  - master
+-
+  version: 1.0.{build}
+  environment:
+    matrix:
+    - COMPILER: "gcc"
+      HOST:     "mingw"
+      PLATFORM: "x64"
+      SCRIPT:   "make allarch"
+    - COMPILER: "gcc"
+      HOST:     "mingw"
+      PLATFORM: "x86"
+      SCRIPT:   "make allarch"
+    - COMPILER: "clang"
+      HOST:     "mingw"
+      PLATFORM: "x64"
+      SCRIPT:   "MOREFLAGS='--target=x86_64-w64-mingw32 -Werror -Wconversion -Wno-sign-conversion' make allarch"
 
-artifacts:
-  - path: bin\zstd.exe
-  - path: bin\zstd32.exe
+    - COMPILER: "visual"
+      HOST:     "visual"
+      PLATFORM: "x64"
+      CONFIGURATION: "Debug"
+    - COMPILER: "visual"
+      HOST:     "visual"
+      PLATFORM: "Win32"
+      CONFIGURATION: "Debug"
+    - COMPILER: "visual"
+      HOST:     "visual"
+      PLATFORM: "x64"
+      CONFIGURATION: "Release"
+    - COMPILER: "visual"
+      HOST:     "visual"
+      PLATFORM: "Win32"
+      CONFIGURATION: "Release"
+
+  install:
+  - ECHO Installing %COMPILER% %PLATFORM% %CONFIGURATION%
+  - SET PATH_ORIGINAL=%PATH%
+  - if [%HOST%]==[mingw] (
+      SET "PATH_MINGW32=C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw32\bin" &&
+      SET "PATH_MINGW64=C:\mingw-w64\x86_64-6.3.0-posix-seh-rt_v5-rev1\mingw64\bin" &&
+      COPY C:\msys64\usr\bin\make.exe C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw32\bin\make.exe &&
+      COPY C:\msys64\usr\bin\make.exe C:\mingw-w64\x86_64-6.3.0-posix-seh-rt_v5-rev1\mingw64\bin\make.exe
+    )
+  - IF [%HOST%]==[visual] IF [%PLATFORM%]==[x64] (
+      SET ADDITIONALPARAM=/p:LibraryPath="C:\Program Files\Microsoft SDKs\Windows\v7.1\lib\x64;c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\lib\amd64;C:\Program Files (x86)\Microsoft Visual Studio 10.0\;C:\Program Files (x86)\Microsoft Visual Studio 10.0\lib\amd64;"
+    )
 
-deploy:
-- provider: GitHub
-  auth_token:
-    secure: LgJo8emYc3sFnlNWkGl4/VYK3nk/8+RagcsqDlAi3xeqNGNutnKjcftjg84uJoT4
-  artifact: bin\zstd.exe
-  force_update: true
-  on:
-    branch: autobuild
-    COMPILER: gcc
-    PLATFORM: "mingw64"
-    appveyor_repo_tag: true
-- provider: GitHub
-  auth_token:
-    secure: LgJo8emYc3sFnlNWkGl4/VYK3nk/8+RagcsqDlAi3xeqNGNutnKjcftjg84uJoT4
-  artifact: bin\zstd32.exe
-  force_update: true
-  on:
-    branch: autobuild
-    COMPILER: gcc
-    PLATFORM: "mingw32"
-    appveyor_repo_tag: true
+  build_script:
+  - ECHO Building %COMPILER% %PLATFORM% %CONFIGURATION%
+  - if [%HOST%]==[mingw] (
+      ( if [%PLATFORM%]==[x64] (
+        SET "PATH=%PATH_MINGW64%;%PATH_ORIGINAL%"
+      ) else if [%PLATFORM%]==[x86] (
+        SET "PATH=%PATH_MINGW32%;%PATH_ORIGINAL%"
+      ) ) &&
+      make -v &&
+      sh -c "%COMPILER% -v" &&
+      set "CC=%COMPILER%" &&
+      sh -c "%SCRIPT%"
+    )
+  - if [%HOST%]==[visual] (
+      ECHO *** &&
+      ECHO *** Building Visual Studio 2015 %PLATFORM%\%CONFIGURATION% &&
+      ECHO *** &&
+      msbuild "build\VS2010\zstd.sln" /m /verbosity:minimal /property:PlatformToolset=v140 /p:ForceImportBeforeCppTargets=%APPVEYOR_BUILD_FOLDER%\build\VS2010\CompileAsCpp.props /t:Clean,Build /p:Platform=%PLATFORM% /p:Configuration=%CONFIGURATION% /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" &&
+      DIR build\VS2010\bin\%PLATFORM%_%CONFIGURATION%\*.exe &&
+      MD5sum build/VS2010/bin/%PLATFORM%_%CONFIGURATION%/*.exe &&
+      msbuild "build\VS2010\zstd.sln" /m /verbosity:minimal /property:PlatformToolset=v140 /t:Clean,Build /p:Platform=%PLATFORM% /p:Configuration=%CONFIGURATION% /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" &&
+      DIR build\VS2010\bin\%PLATFORM%_%CONFIGURATION%\*.exe &&
+      MD5sum build/VS2010/bin/%PLATFORM%_%CONFIGURATION%/*.exe &&
+      COPY build\VS2010\bin\%PLATFORM%_%CONFIGURATION%\fuzzer.exe tests\fuzzer_VS2015_%PLATFORM%_%CONFIGURATION%.exe &&
+      COPY build\VS2010\bin\%PLATFORM%_%CONFIGURATION%\*.exe tests\
+    )
diff --git a/circle.yml b/circle.yml
index 298569d14551..218e33bfc333 100644
--- a/circle.yml
+++ b/circle.yml
@@ -3,7 +3,7 @@ dependencies:
     - sudo dpkg --add-architecture i386
     - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test; sudo apt-get -y -qq update
     - sudo apt-get -y install gcc-powerpc-linux-gnu gcc-arm-linux-gnueabi libc6-dev-armel-cross gcc-aarch64-linux-gnu libc6-dev-arm64-cross
-    - sudo apt-get -y install libstdc++-6-dev clang gcc g++ gcc-5 gcc-6
+    - sudo apt-get -y install libstdc++-6-dev clang gcc g++ gcc-5 gcc-6 zlib1g-dev liblzma-dev
     - sudo apt-get -y install linux-libc-dev:i386 libc6-dev-i386
 
 test:
diff --git a/contrib/cleanTabs b/contrib/cleanTabs
new file mode 100755
index 000000000000..215913a90ace
--- /dev/null
+++ b/contrib/cleanTabs
@@ -0,0 +1,2 @@
+#!/bin/sh
+sed -i '' $'s/\t/    /g' ../lib/**/*.{h,c} ../programs/*.{h,c} ../tests/*.c ./**/*.{h,cpp} ../examples/*.c ../zlibWrapper/*.{h,c}
diff --git a/contrib/pzstd/Options.cpp b/contrib/pzstd/Options.cpp
index a0d969393c9b..1f53f2bff78a 100644
--- a/contrib/pzstd/Options.cpp
+++ b/contrib/pzstd/Options.cpp
@@ -91,7 +91,7 @@ void usage() {
   std::fprintf(stderr, "  -#                     : # compression level (1-%d, default:%d)\n", kMaxNonUltraCompressionLevel, kDefaultCompressionLevel);
   std::fprintf(stderr, "  -d, --decompress       : decompression\n");
   std::fprintf(stderr, "  -o                file : result stored into `file` (only if 1 input file)\n");
-  std::fprintf(stderr, "  -f, --force            : overwrite output without prompting\n");
+  std::fprintf(stderr, "  -f, --force            : overwrite output without prompting, (de)compress links\n");
   std::fprintf(stderr, "      --rm               : remove source file(s) after successful (de)compression\n");
   std::fprintf(stderr, "  -k, --keep             : preserve source file(s) (default)\n");
   std::fprintf(stderr, "  -h, --help             : display help and exit\n");
@@ -121,6 +121,7 @@ Options::Status Options::parse(int argc, const char **argv) {
   bool recursive = false;
   bool ultra = false;
   bool forceStdout = false;
+  bool followLinks = false;
   // Local copy of input files, which are pointers into argv.
   std::vector<const char *> localInputFiles;
   for (int i = 1; i < argc; ++i) {
@@ -255,6 +256,7 @@ Options::Status Options::parse(int argc, const char **argv) {
       case 'f':
         overwrite = true;
         forceStdout = true;
+        followLinks = true;
         break;
       case 't':
         test = true;
@@ -328,13 +330,29 @@ Options::Status Options::parse(int argc, const char **argv) {
     }
   }
 
+  g_utilDisplayLevel = verbosity;
+  // Remove local input files that are symbolic links
+  if (!followLinks) {
+      std::remove_if(localInputFiles.begin(), localInputFiles.end(),
+                     [&](const char *path) {
+                        bool isLink = UTIL_isLink(path);
+                        if (isLink && verbosity >= 2) {
+                            std::fprintf(
+                                    stderr,
+                                    "Warning : %s is symbolic link, ignoring\n",
+                                    path);
+                        }
+                        return isLink;
+                    });
+  }
+
   // Translate input files/directories into files to (de)compress
   if (recursive) {
     char *scratchBuffer = nullptr;
     unsigned numFiles = 0;
     const char **files =
         UTIL_createFileList(localInputFiles.data(), localInputFiles.size(),
-                            &scratchBuffer, &numFiles);
+                            &scratchBuffer, &numFiles, followLinks);
     if (files == nullptr) {
       std::fprintf(stderr, "Error traversing directories\n");
       return Status::Failure;
diff --git a/contrib/pzstd/utils/test/ThreadPoolTest.cpp b/contrib/pzstd/utils/test/ThreadPoolTest.cpp
index 1d857aae808d..89085afd434c 100644
--- a/contrib/pzstd/utils/test/ThreadPoolTest.cpp
+++ b/contrib/pzstd/utils/test/ThreadPoolTest.cpp
@@ -10,6 +10,7 @@
 
 #include <gtest/gtest.h>
 #include <atomic>
+#include <iostream>
 #include <thread>
 #include <vector>
 
@@ -34,16 +35,19 @@ TEST(ThreadPool, AllJobsFinished) {
   std::atomic<unsigned> numFinished{0};
   std::atomic<bool> start{false};
   {
+    std::cerr << "Creating executor" << std::endl;
     ThreadPool executor(5);
     for (int i = 0; i < 10; ++i) {
       executor.add([ &numFinished, &start ] {
         while (!start.load()) {
-          // spin
+          std::this_thread::yield();
         }
         ++numFinished;
       });
     }
+    std::cerr << "Starting" << std::endl;
     start.store(true);
+    std::cerr << "Finishing" << std::endl;
   }
   EXPECT_EQ(10, numFinished.load());
 }
diff --git a/contrib/pzstd/utils/test/WorkQueueTest.cpp b/contrib/pzstd/utils/test/WorkQueueTest.cpp
index 7f58ccb3f199..8caf170d2948 100644
--- a/contrib/pzstd/utils/test/WorkQueueTest.cpp
+++ b/contrib/pzstd/utils/test/WorkQueueTest.cpp
@@ -10,6 +10,7 @@
 #include "utils/WorkQueue.h"
 
 #include <gtest/gtest.h>
+#include <iostream>
 #include <memory>
 #include <mutex>
 #include <thread>
@@ -201,11 +202,13 @@ TEST(WorkQueue, BoundedSizeMPMC) {
   WorkQueue<int> queue(10);
   std::vector<int> results(200, -1);
   std::mutex mutex;
+  std::cerr << "Creating popperThreads" << std::endl;
   std::vector<std::thread> popperThreads;
   for (int i = 0; i < 4; ++i) {
     popperThreads.emplace_back(Popper{&queue, results.data(), &mutex});
   }
 
+  std::cerr << "Creating pusherThreads" << std::endl;
   std::vector<std::thread> pusherThreads;
   for (int i = 0; i < 2; ++i) {
     auto min = i * 100;
@@ -218,15 +221,19 @@ TEST(WorkQueue, BoundedSizeMPMC) {
         });
   }
 
+  std::cerr << "Joining pusherThreads" << std::endl;
   for (auto& thread : pusherThreads) {
     thread.join();
   }
+  std::cerr << "Finishing queue" << std::endl;
   queue.finish();
 
+  std::cerr << "Joining popperThreads" << std::endl;
   for (auto& thread : popperThreads) {
     thread.join();
   }
 
+  std::cerr << "Inspecting results" << std::endl;
   for (int i = 0; i < 200; ++i) {
     EXPECT_EQ(i, results[i]);
   }
diff --git a/doc/educational_decoder/zstd_decompress.c b/doc/educational_decoder/zstd_decompress.c
index ae4eaa81c6ae..7c8d8114d401 100644
--- a/doc/educational_decoder/zstd_decompress.c
+++ b/doc/educational_decoder/zstd_decompress.c
@@ -27,16 +27,19 @@ size_t ZSTD_decompress_with_dict(void *const dst, const size_t dst_len,
 
 /// Get the decompressed size of an input stream so memory can be allocated in
 /// advance
+/// Returns -1 if the size can't be determined
 size_t ZSTD_get_decompressed_size(const void *const src, const size_t src_len);
 
 /******* UTILITY MACROS AND TYPES *********************************************/
-// Max block size decompressed size is 128 KB and literal blocks must be smaller
-// than that
+// Max block size decompressed size is 128 KB and literal blocks can't be
+// larger than their block
 #define MAX_LITERALS_SIZE ((size_t)128 * 1024)
 
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 
+/// This decoder calls exit(1) when it encounters an error, however a production
+/// library should propagate error codes
 #define ERROR(s)                                                               \
     do {                                                                       \
         fprintf(stderr, "Error: %s\n", s);                                     \
@@ -67,29 +70,31 @@ typedef int64_t i64;
 /// decompression functions.
 
 /*** IO STREAM OPERATIONS *************/
-/// These structs are the interface for IO, and do bounds checking on all
-/// operations.  They should be used opaquely to ensure safety.
 
-/// Output is always done byte-by-byte
+/// ostream_t/istream_t are used to wrap the pointers/length data passed into
+/// ZSTD_decompress, so that all IO operations are safely bounds checked
+/// They are written/read forward, and reads are treated as little-endian
+/// They should be used opaquely to ensure safety
 typedef struct {
     u8 *ptr;
     size_t len;
 } ostream_t;
 
-/// Input often reads a few bits at a time, so maintain an internal offset
 typedef struct {
     const u8 *ptr;
-    int bit_offset;
     size_t len;
+
+    // Input often reads a few bits at a time, so maintain an internal offset
+    int bit_offset;
 } istream_t;
 
 /// The following two functions are the only ones that allow the istream to be
 /// non-byte aligned
 
 /// Reads `num` bits from a bitstream, and updates the internal offset
-static inline u64 IO_read_bits(istream_t *const in, const int num);
-/// Rewinds the stream by `num` bits
-static inline void IO_rewind_bits(istream_t *const in, const int num);
+static inline u64 IO_read_bits(istream_t *const in, const int num_bits);
+/// Backs-up the stream by `num` bits so they can be read again
+static inline void IO_rewind_bits(istream_t *const in, const int num_bits);
 /// If the remaining bits in a byte will be unused, advance to the end of the
 /// byte
 static inline void IO_align_stream(istream_t *const in);
@@ -101,30 +106,31 @@ static inline void IO_write_byte(ostream_t *const out, u8 symb);
 /// be byte aligned.
 static inline size_t IO_istream_len(const istream_t *const in);
 
-/// Returns a pointer where `len` bytes can be read, and advances the internal
-/// state.  The stream must be byte aligned.
+/// Advances the stream by `len` bytes, and returns a pointer to the chunk that
+/// was skipped.  The stream must be byte aligned.
 static inline const u8 *IO_read_bytes(istream_t *const in, size_t len);
-/// Returns a pointer where `len` bytes can be written, and advances the internal
-/// state.  The stream must be byte aligned.
+/// Advances the stream by `len` bytes, and returns a pointer to the chunk that
+/// was skipped so it can be written to.
 static inline u8 *IO_write_bytes(ostream_t *const out, size_t len);
 
 /// Advance the inner state by `len` bytes.  The stream must be byte aligned.
 static inline void IO_advance_input(istream_t *const in, size_t len);
 
-/// Returns an `ostream_t` constructed from the given pointer and length
+/// Returns an `ostream_t` constructed from the given pointer and length.
 static inline ostream_t IO_make_ostream(u8 *out, size_t len);
-/// Returns an `istream_t` constructed from the given pointer and length
+/// Returns an `istream_t` constructed from the given pointer and length.
 static inline istream_t IO_make_istream(const u8 *in, size_t len);
 
-/// Returns an `istream_t` with the same base as `in`, and length `len`
-/// Then, advance `in` to account for the consumed bytes
-/// `in` must be byte aligned
+/// Returns an `istream_t` with the same base as `in`, and length `len`.
+/// Then, advance `in` to account for the consumed bytes.
+/// `in` must be byte aligned.
 static inline istream_t IO_make_sub_istream(istream_t *const in, size_t len);
 /*** END IO STREAM OPERATIONS *********/
 
 /*** BITSTREAM OPERATIONS *************/
-/// Read `num` bits (up to 64) from `src + offset`, where `offset` is in bits
-static inline u64 read_bits_LE(const u8 *src, const int num,
+/// Read `num` bits (up to 64) from `src + offset`, where `offset` is in bits,
+/// and return them interpreted as a little-endian unsigned integer.
+static inline u64 read_bits_LE(const u8 *src, const int num_bits,
                                const size_t offset);
 
 /// Read bits from the end of a HUF or FSE bitstream.  `offset` is in bits, so
@@ -136,9 +142,8 @@ static inline u64 STREAM_read_bits(const u8 *src, const int bits,
 /*** END BITSTREAM OPERATIONS *********/
 
 /*** BIT COUNTING OPERATIONS **********/
-/// Returns `x`, where `2^x` is the largest power of 2 less than or equal to
-/// `num`, or `-1` if `num == 0`.
-static inline int log2inf(const u64 num);
+/// Returns the index of the highest set bit in `num`, or `-1` if `num == 0`
+static inline int highest_set_bit(const u64 num);
 /*** END BIT COUNTING OPERATIONS ******/
 
 /*** HUFFMAN PRIMITIVES ***************/
@@ -384,8 +389,8 @@ size_t ZSTD_decompress_with_dict(void *const dst, const size_t dst_len,
         parse_dictionary(&parsed_dict, (const u8 *)dict, dict_len);
     }
 
-    istream_t in = {(const u8 *)src, 0, src_len};
-    ostream_t out = {(u8 *)dst, dst_len};
+    istream_t in = IO_make_istream(src, src_len);
+    ostream_t out = IO_make_ostream(dst, dst_len);
 
     // "A content compressed by Zstandard is transformed into a Zstandard frame.
     // Multiple frames can be appended into a single file or stream. A frame is
@@ -633,6 +638,7 @@ static void frame_context_apply_dict(frame_context_t *const ctx,
         FSE_copy_dtable(&ctx->of_dtable, &dict->of_dtable);
         FSE_copy_dtable(&ctx->ml_dtable, &dict->ml_dtable);
 
+        // Copy the repeated offsets
         memcpy(ctx->previous_offsets, dict->previous_offsets,
                sizeof(ctx->previous_offsets));
     }
@@ -668,7 +674,7 @@ static void decompress_data(frame_context_t *const ctx, ostream_t *const out,
             // number of bytes to read and copy."
             const u8 *const read_ptr = IO_read_bytes(in, block_len);
             u8 *const write_ptr = IO_write_bytes(out, block_len);
-            //
+
             // Copy the raw data into the output
             memcpy(write_ptr, read_ptr, block_len);
 
@@ -682,7 +688,7 @@ static void decompress_data(frame_context_t *const ctx, ostream_t *const out,
             const u8 *const read_ptr = IO_read_bytes(in, 1);
             u8 *const write_ptr = IO_write_bytes(out, block_len);
 
-            // Copy `block_len` copies of `streams->src[0]` to the output
+            // Copy `block_len` copies of `read_ptr[0]` to the output
             memset(write_ptr, read_ptr[0], block_len);
 
             ctx->current_total_output += block_len;
@@ -751,7 +757,7 @@ static size_t decode_literals_compressed(frame_context_t *const ctx,
                                          u8 **const literals,
                                          const int block_type,
                                          const int size_format);
-static void decode_huf_table(istream_t *const in, HUF_dtable *const dtable);
+static void decode_huf_table(HUF_dtable *const dtable, istream_t *const in);
 static void fse_decode_hufweights(ostream_t *weights, istream_t *const in,
                                     int *const num_symbs);
 
@@ -894,12 +900,12 @@ static size_t decode_literals_compressed(frame_context_t *const ctx,
     istream_t huf_stream = IO_make_sub_istream(in, compressed_size);
 
     if (block_type == 2) {
-        // Decode provided Huffman table
+        // Decode the provided Huffman table
         // "This section is only present when Literals_Block_Type type is
         // Compressed_Literals_Block (2)."
 
         HUF_free_dtable(&ctx->literals_dtable);
-        decode_huf_table(&huf_stream, &ctx->literals_dtable);
+        decode_huf_table(&ctx->literals_dtable, &huf_stream);
     } else {
         // If the previous Huffman table is being repeated, ensure it exists
         if (!ctx->literals_dtable.symbols) {
@@ -922,13 +928,13 @@ static size_t decode_literals_compressed(frame_context_t *const ctx,
 }
 
 // Decode the Huffman table description
-static void decode_huf_table(istream_t *const in, HUF_dtable *const dtable) {
-    const u8 header = IO_read_bits(in, 8);
-
+static void decode_huf_table(HUF_dtable *const dtable, istream_t *const in) {
     // "All literal values from zero (included) to last present one (excluded)
     // are represented by Weight with values from 0 to Max_Number_of_Bits."
 
     // "This is a single byte value (0-255), which describes how to decode the list of weights."
+    const u8 header = IO_read_bits(in, 8);
+
     u8 weights[HUF_MAX_SYMBS];
     memset(weights, 0, sizeof(weights));
 
@@ -997,7 +1003,7 @@ typedef struct {
     u16 ll_state;
     u16 of_state;
     u16 ml_state;
-} sequence_state_t;
+} sequence_states_t;
 
 /// Different modes to signal to decode_seq_tables what to do
 typedef enum {
@@ -1052,10 +1058,10 @@ static void decompress_sequences(frame_context_t *const ctx,
                                  istream_t *const in,
                                  sequence_command_t *const sequences,
                                  const size_t num_sequences);
-static sequence_command_t decode_sequence(sequence_state_t *const state,
+static sequence_command_t decode_sequence(sequence_states_t *const state,
                                           const u8 *const src,
                                           i64 *const offset);
-static void decode_seq_table(istream_t *const in, FSE_dtable *const table,
+static void decode_seq_table(FSE_dtable *const table, istream_t *const in,
                                const seq_part_t type, const seq_mode_t mode);
 
 static size_t decode_sequences(frame_context_t *const ctx, istream_t *in,
@@ -1131,34 +1137,33 @@ static void decompress_sequences(frame_context_t *const ctx, istream_t *in,
     // Offsets
     // Match Lengths"
     // Update the tables we have stored in the context
-    decode_seq_table(in, &ctx->ll_dtable, seq_literal_length,
+    decode_seq_table(&ctx->ll_dtable, in, seq_literal_length,
                      (compression_modes >> 6) & 3);
 
-    decode_seq_table(in, &ctx->of_dtable, seq_offset,
+    decode_seq_table(&ctx->of_dtable, in, seq_offset,
                      (compression_modes >> 4) & 3);
 
-    decode_seq_table(in, &ctx->ml_dtable, seq_match_length,
+    decode_seq_table(&ctx->ml_dtable, in, seq_match_length,
                      (compression_modes >> 2) & 3);
 
-    // Check to make sure none of the tables are uninitialized
-    if (!ctx->ll_dtable.symbols || !ctx->of_dtable.symbols ||
-        !ctx->ml_dtable.symbols) {
-        CORRUPTION();
-    }
 
-    sequence_state_t state;
-    // Copy the context's tables into the local state
-    memcpy(&state.ll_table, &ctx->ll_dtable, sizeof(FSE_dtable));
-    memcpy(&state.of_table, &ctx->of_dtable, sizeof(FSE_dtable));
-    memcpy(&state.ml_table, &ctx->ml_dtable, sizeof(FSE_dtable));
+    sequence_states_t states;
 
-    size_t len = IO_istream_len(in);
+    // Initialize the decoding tables
+    {
+        states.ll_table = ctx->ll_dtable;
+        states.of_table = ctx->of_dtable;
+        states.ml_table = ctx->ml_dtable;
+    }
+
+    const size_t len = IO_istream_len(in);
     const u8 *const src = IO_read_bytes(in, len);
 
     // "After writing the last bit containing information, the compressor writes
     // a single 1-bit and then fills the byte with 0-7 0 bits of padding."
-    const int padding = 8 - log2inf(src[len - 1]);
-    i64 offset = len * 8 - padding;
+    const int padding = 8 - highest_set_bit(src[len - 1]);
+    // The offset starts at the end because FSE streams are read backwards
+    i64 bit_offset = len * 8 - padding;
 
     // "The bitstream starts with initial state values, each using the required
     // number of bits in their respective accuracy, decoded previously from
@@ -1166,24 +1171,22 @@ static void decompress_sequences(frame_context_t *const ctx, istream_t *in,
     //
     // It starts by Literals_Length_State, followed by Offset_State, and finally
     // Match_Length_State."
-    FSE_init_state(&state.ll_table, &state.ll_state, src, &offset);
-    FSE_init_state(&state.of_table, &state.of_state, src, &offset);
-    FSE_init_state(&state.ml_table, &state.ml_state, src, &offset);
+    FSE_init_state(&states.ll_table, &states.ll_state, src, &bit_offset);
+    FSE_init_state(&states.of_table, &states.of_state, src, &bit_offset);
+    FSE_init_state(&states.ml_table, &states.ml_state, src, &bit_offset);
 
     for (size_t i = 0; i < num_sequences; i++) {
         // Decode sequences one by one
-        sequences[i] = decode_sequence(&state, src, &offset);
+        sequences[i] = decode_sequence(&states, src, &bit_offset);
     }
 
-    if (offset != 0) {
+    if (bit_offset != 0) {
         CORRUPTION();
     }
-
-    // Don't free tables so they can be used in the next block
 }
 
 // Decode a single sequence and update the state
-static sequence_command_t decode_sequence(sequence_state_t *const state,
+static sequence_command_t decode_sequence(sequence_states_t *const states,
                                           const u8 *const src,
                                           i64 *const offset) {
     // "Each symbol is a code in its own context, which specifies Baseline and
@@ -1191,9 +1194,9 @@ static sequence_command_t decode_sequence(sequence_state_t *const state,
     // additional bits in the same bitstream."
 
     // Decode symbols, but don't update states
-    const u8 of_code = FSE_peek_symbol(&state->of_table, state->of_state);
-    const u8 ll_code = FSE_peek_symbol(&state->ll_table, state->ll_state);
-    const u8 ml_code = FSE_peek_symbol(&state->ml_table, state->ml_state);
+    const u8 of_code = FSE_peek_symbol(&states->of_table, states->of_state);
+    const u8 ll_code = FSE_peek_symbol(&states->ll_table, states->ll_state);
+    const u8 ml_code = FSE_peek_symbol(&states->ml_table, states->ml_state);
 
     // Offset doesn't need a max value as it's not decoded using a table
     if (ll_code > SEQ_MAX_CODES[seq_literal_length] ||
@@ -1221,17 +1224,18 @@ static sequence_command_t decode_sequence(sequence_state_t *const state,
     // then Offset_State."
     // If the stream is complete don't read bits to update state
     if (*offset != 0) {
-        FSE_update_state(&state->ll_table, &state->ll_state, src, offset);
-        FSE_update_state(&state->ml_table, &state->ml_state, src, offset);
-        FSE_update_state(&state->of_table, &state->of_state, src, offset);
+        FSE_update_state(&states->ll_table, &states->ll_state, src, offset);
+        FSE_update_state(&states->ml_table, &states->ml_state, src, offset);
+        FSE_update_state(&states->of_table, &states->of_state, src, offset);
     }
 
     return seq;
 }
 
 /// Given a sequence part and table mode, decode the FSE distribution
-static void decode_seq_table(istream_t *const in, FSE_dtable *const table,
-                               const seq_part_t type, const seq_mode_t mode) {
+/// Errors if the mode is `seq_repeat` without a pre-existing table in `table`
+static void decode_seq_table(FSE_dtable *const table, istream_t *const in,
+                             const seq_part_t type, const seq_mode_t mode) {
     // Constant arrays indexed by seq_part_t
     const i16 *const default_distributions[] = {SEQ_LITERAL_LENGTH_DEFAULT_DIST,
                                                 SEQ_OFFSET_DEFAULT_DIST,
@@ -1272,12 +1276,17 @@ static void decode_seq_table(istream_t *const in, FSE_dtable *const table,
         // "Repeat_Mode : re-use distribution table from previous compressed
         // block."
         // Nothing to do here, table will be unchanged
+        if (!table->symbols) {
+            // This mode is invalid if we don't already have a table
+            CORRUPTION();
+        }
         break;
     default:
         // Impossible, as mode is from 0-3
         IMPOSSIBLE();
         break;
     }
+
 }
 /******* END SEQUENCE DECODING ************************************************/
 
@@ -1296,6 +1305,8 @@ static void execute_sequences(frame_context_t *const ctx, ostream_t *const out,
         const sequence_command_t seq = sequences[i];
 
         {
+            // If the sequence asks for more literals than are left, the
+            // sequence must be corrupted
             if (seq.literal_length > IO_istream_len(&litstream)) {
                 CORRUPTION();
             }
@@ -1336,7 +1347,8 @@ static void execute_sequences(frame_context_t *const ctx, ostream_t *const out,
                 // as per the exception listed above
                 offset = idx < 3 ? offset_hist[idx] : offset_hist[0] - 1;
 
-                // If idx == 1 we don't need to modify offset_hist[2]
+                // If idx == 1 we don't need to modify offset_hist[2], since
+                // we're using the second-most recent code
                 if (idx > 1) {
                     offset_hist[2] = offset_hist[1];
                 }
@@ -1344,6 +1356,8 @@ static void execute_sequences(frame_context_t *const ctx, ostream_t *const out,
                 offset_hist[0] = offset;
             }
         } else {
+            // When it's not a repeat offset:
+            // "if (Offset_Value > 3) offset = Offset_Value - 3;"
             offset = seq.offset - 3;
 
             // Shift back history
@@ -1391,11 +1405,11 @@ static void execute_sequences(frame_context_t *const ctx, ostream_t *const out,
         total_output += seq.match_length;
     }
 
+    // Copy any leftover literals
     {
         size_t len = IO_istream_len(&litstream);
         u8 *const write_ptr = IO_write_bytes(out, len);
         const u8 *const read_ptr = IO_read_bytes(&litstream, len);
-        // Copy any leftover literals
         memcpy(write_ptr, read_ptr, len);
 
         total_output += len;
@@ -1517,10 +1531,10 @@ static void parse_dictionary(dictionary_t *const dict, const u8 *src,
     // recent offsets (instead of using {1,4,8}), stored in order, 4-bytes
     // little-endian each, for a total of 12 bytes. Each recent offset must have
     // a value < dictionary size."
-    decode_huf_table(&in, &dict->literals_dtable);
-    decode_seq_table(&in, &dict->of_dtable, seq_offset, seq_fse);
-    decode_seq_table(&in, &dict->ml_dtable, seq_match_length, seq_fse);
-    decode_seq_table(&in, &dict->ll_dtable, seq_literal_length, seq_fse);
+    decode_huf_table(&dict->literals_dtable, &in);
+    decode_seq_table(&dict->of_dtable, &in, seq_offset, seq_fse);
+    decode_seq_table(&dict->ml_dtable, &in, seq_match_length, seq_fse);
+    decode_seq_table(&dict->ll_dtable, &in, seq_literal_length, seq_fse);
 
     // Read in the previous offset history
     dict->previous_offsets[0] = IO_read_bits(&in, 32);
@@ -1571,20 +1585,20 @@ static void free_dictionary(dictionary_t *const dict) {
 /******* IO STREAM OPERATIONS *************************************************/
 #define UNALIGNED() ERROR("Attempting to operate on a non-byte aligned stream")
 /// Reads `num` bits from a bitstream, and updates the internal offset
-static inline u64 IO_read_bits(istream_t *const in, const int num) {
-    if (num > 64 || num <= 0) {
+static inline u64 IO_read_bits(istream_t *const in, const int num_bits) {
+    if (num_bits > 64 || num_bits <= 0) {
         ERROR("Attempt to read an invalid number of bits");
     }
 
-    const size_t bytes = (num + in->bit_offset + 7) / 8;
-    const size_t full_bytes = (num + in->bit_offset) / 8;
+    const size_t bytes = (num_bits + in->bit_offset + 7) / 8;
+    const size_t full_bytes = (num_bits + in->bit_offset) / 8;
     if (bytes > in->len) {
         INP_SIZE();
     }
 
-    const u64 result = read_bits_LE(in->ptr, num, in->bit_offset);
+    const u64 result = read_bits_LE(in->ptr, num_bits, in->bit_offset);
 
-    in->bit_offset = (num + in->bit_offset) % 8;
+    in->bit_offset = (num_bits + in->bit_offset) % 8;
     in->ptr += full_bytes;
     in->len -= full_bytes;
 
@@ -1593,16 +1607,21 @@ static inline u64 IO_read_bits(istream_t *const in, const int num) {
 
 /// If a non-zero number of bits have been read from the current byte, advance
 /// the offset to the next byte
-static inline void IO_rewind_bits(istream_t *const in, int num) {
-    if (num < 0) {
+static inline void IO_rewind_bits(istream_t *const in, int num_bits) {
+    if (num_bits < 0) {
         ERROR("Attempting to rewind stream by a negative number of bits");
     }
 
-    const int new_offset = in->bit_offset - num;
-    const i64 bytes = (new_offset - 7) / 8;
+    // move the offset back by `num_bits` bits
+    const int new_offset = in->bit_offset - num_bits;
+    // determine the number of whole bytes we have to rewind, rounding up to an
+    // integer number (e.g. if `new_offset == -5`, `bytes == 1`)
+    const i64 bytes = -(new_offset - 7) / 8;
 
-    in->ptr += bytes;
-    in->len -= bytes;
+    in->ptr -= bytes;
+    in->len += bytes;
+    // make sure the resulting `bit_offset` is positive, as mod in C does not
+    // convert numbers from negative to positive (e.g. -22 % 8 == -6)
     in->bit_offset = ((new_offset % 8) + 8) % 8;
 }
 
@@ -1683,33 +1702,26 @@ static inline ostream_t IO_make_ostream(u8 *out, size_t len) {
 
 /// Returns an `istream_t` constructed from the given pointer and length
 static inline istream_t IO_make_istream(const u8 *in, size_t len) {
-    return (istream_t) { in, 0, len };
+    return (istream_t) { in, len, 0 };
 }
 
 /// Returns an `istream_t` with the same base as `in`, and length `len`
 /// Then, advance `in` to account for the consumed bytes
 /// `in` must be byte aligned
 static inline istream_t IO_make_sub_istream(istream_t *const in, size_t len) {
-    if (len > in->len) {
-        INP_SIZE();
-    }
-    if (in->bit_offset != 0) {
-        UNALIGNED();
-    }
-    const istream_t sub = { in->ptr, in->bit_offset, len };
+    // Consume `len` bytes of the parent stream
+    const u8 *const ptr = IO_read_bytes(in, len);
 
-    in->ptr += len;
-    in->len -= len;
-
-    return sub;
+    // Make a substream using the pointer to those `len` bytes
+    return IO_make_istream(ptr, len);
 }
 /******* END IO STREAM OPERATIONS *********************************************/
 
 /******* BITSTREAM OPERATIONS *************************************************/
 /// Read `num` bits (up to 64) from `src + offset`, where `offset` is in bits
-static inline u64 read_bits_LE(const u8 *src, const int num,
+static inline u64 read_bits_LE(const u8 *src, const int num_bits,
                                const size_t offset) {
-    if (num > 64) {
+    if (num_bits > 64) {
         ERROR("Attempt to read an invalid number of bits");
     }
 
@@ -1719,10 +1731,10 @@ static inline u64 read_bits_LE(const u8 *src, const int num,
     u64 res = 0;
 
     int shift = 0;
-    int left = num;
+    int left = num_bits;
     while (left > 0) {
         u64 mask = left >= 8 ? 0xff : (((u64)1 << left) - 1);
-        // Dead the next byte, shift it to account for the offset, and then mask
+        // Read the next byte, shift it to account for the offset, and then mask
         // out the top part if we don't need all the bits
         res += (((u64)*src++ >> bit_offset) & mask) << shift;
         shift += 8 - bit_offset;
@@ -1761,7 +1773,7 @@ static inline u64 STREAM_read_bits(const u8 *const src, const int bits,
 /******* BIT COUNTING OPERATIONS **********************************************/
 /// Returns `x`, where `2^x` is the largest power of 2 less than or equal to
 /// `num`, or `-1` if `num == 0`.
-static inline int log2inf(const u64 num) {
+static inline int highest_set_bit(const u64 num) {
     for (int i = 63; i >= 0; i--) {
         if (((u64)1 << i) <= num) {
             return i;
@@ -1813,17 +1825,18 @@ static size_t HUF_decompress_1stream(const HUF_dtable *const dtable,
     // final-bit-flag. Consequently, a last byte of 0 is not possible. And the
     // final-bit-flag itself is not part of the useful bitstream. Hence, the
     // last byte contains between 0 and 7 useful bits."
-    const int padding = 8 - log2inf(src[len - 1]);
+    const int padding = 8 - highest_set_bit(src[len - 1]);
 
-    i64 offset = len * 8 - padding;
+    // Offset starts at the end because HUF streams are read backwards
+    i64 bit_offset = len * 8 - padding;
     u16 state;
 
-    HUF_init_state(dtable, &state, src, &offset);
+    HUF_init_state(dtable, &state, src, &bit_offset);
 
     size_t symbols_written = 0;
-    while (offset > -dtable->max_bits) {
+    while (bit_offset > -dtable->max_bits) {
         // Iterate over the stream, decoding one symbol at a time
-        IO_write_byte(out, HUF_decode_symbol(dtable, &state, src, &offset));
+        IO_write_byte(out, HUF_decode_symbol(dtable, &state, src, &bit_offset));
         symbols_written++;
     }
     // "The process continues up to reading the required number of symbols per
@@ -1836,7 +1849,7 @@ static size_t HUF_decompress_1stream(const HUF_dtable *const dtable,
     // before the start of `src`
     // Therefore `offset`, the edge to start reading new bits at, should be
     // dtable->max_bits before the start of the stream
-    if (offset != -dtable->max_bits) {
+    if (bit_offset != -dtable->max_bits) {
         CORRUPTION();
     }
 
@@ -1960,7 +1973,7 @@ static void HUF_init_dtable_usingweights(HUF_dtable *const table,
     }
 
     // Find the first power of 2 larger than the sum
-    const int max_bits = log2inf(weight_sum) + 1;
+    const int max_bits = highest_set_bit(weight_sum) + 1;
     const u64 left_over = ((u64)1 << max_bits) - weight_sum;
     // If the left over isn't a power of 2, the weights are invalid
     if (left_over & (left_over - 1)) {
@@ -1969,7 +1982,7 @@ static void HUF_init_dtable_usingweights(HUF_dtable *const table,
 
     // left_over is used to find the last weight as it's not transmitted
     // by inverting 2^(weight - 1) we can determine the value of last_weight
-    const int last_weight = log2inf(left_over) + 1;
+    const int last_weight = highest_set_bit(left_over) + 1;
 
     for (int i = 0; i < num_symbs; i++) {
         // "Number_of_Bits = Number_of_Bits ? Max_Number_of_Bits + 1 - Weight : 0"
@@ -2063,7 +2076,7 @@ static size_t FSE_decompress_interleaved2(const FSE_dtable *const dtable,
     // final-bit-flag. Consequently, a last byte of 0 is not possible. And the
     // final-bit-flag itself is not part of the useful bitstream. Hence, the
     // last byte contains between 0 and 7 useful bits."
-    const int padding = 8 - log2inf(src[len - 1]);
+    const int padding = 8 - highest_set_bit(src[len - 1]);
     i64 offset = len * 8 - padding;
 
     u16 state1, state2;
@@ -2184,7 +2197,7 @@ static void FSE_init_dtable(FSE_dtable *const dtable,
         u16 next_state_desc = state_desc[symbol]++;
         // Fills in the table appropriately, next_state_desc increases by symbol
         // over time, decreasing number of bits
-        dtable->num_bits[i] = (u8)(accuracy_log - log2inf(next_state_desc));
+        dtable->num_bits[i] = (u8)(accuracy_log - highest_set_bit(next_state_desc));
         // Baseline increases until the bit threshold is passed, at which point
         // it resets to 0
         dtable->new_state_base[i] =
@@ -2235,7 +2248,7 @@ static void FSE_decode_header(FSE_dtable *const dtable, istream_t *const in,
     int symb = 0;
     while (remaining > 0 && symb < FSE_MAX_SYMBS) {
         // Log of the number of possible values we could read
-        int bits = log2inf(remaining + 1) + 1;
+        int bits = highest_set_bit(remaining + 1) + 1;
 
         u16 val = IO_read_bits(in, bits);
 
diff --git a/doc/images/Cspeed4.png b/doc/images/Cspeed4.png
index f0ca0ffba9c4..318204c00e96 100644
Binary files a/doc/images/Cspeed4.png and b/doc/images/Cspeed4.png differ
diff --git a/doc/images/Dspeed4.png b/doc/images/Dspeed4.png
index eba485d0d1ed..b7baef1ff3f7 100644
Binary files a/doc/images/Dspeed4.png and b/doc/images/Dspeed4.png differ
diff --git a/doc/images/dict-cr.png b/doc/images/dict-cr.png
index f555a46c7b99..f3a9ce2bdda6 100644
Binary files a/doc/images/dict-cr.png and b/doc/images/dict-cr.png differ
diff --git a/doc/images/dict-cs.png b/doc/images/dict-cs.png
index ccc02b0d1713..55e5ef518fed 100644
Binary files a/doc/images/dict-cs.png and b/doc/images/dict-cs.png differ
diff --git a/doc/images/dict-ds.png b/doc/images/dict-ds.png
index 858cad685509..1153f1b95fd5 100644
Binary files a/doc/images/dict-ds.png and b/doc/images/dict-ds.png differ
diff --git a/doc/zstd_compression_format.md b/doc/zstd_compression_format.md
index d4b46548a3d2..1f212fea2305 100644
--- a/doc/zstd_compression_format.md
+++ b/doc/zstd_compression_format.md
@@ -16,7 +16,8 @@ Distribution of this document is unlimited.
 
 ### Version
 
-0.2.4 (17/02/17)
+0.2.5 (31/03/17)
+
 
 Introduction
 ------------
@@ -109,7 +110,7 @@ The structure of a single Zstandard frame is following:
 
 __`Magic_Number`__
 
-4 Bytes, little-endian format.
+4 Bytes, __little-endian__ format.
 Value : 0xFD2FB528
 
 __`Frame_Header`__
@@ -127,7 +128,7 @@ An optional 32-bit checksum, only present if `Content_Checksum_flag` is set.
 The content checksum is the result
 of [xxh64() hash function](http://www.xxhash.org)
 digesting the original (decoded) data as input, and a seed of zero.
-The low 4 bytes of the checksum are stored in little endian format.
+The low 4 bytes of the checksum are stored in __little-endian__ format.
 
 ### `Frame_Header`
 
@@ -154,41 +155,42 @@ Decoding this byte is enough to tell the size of `Frame_Header`.
 | 2          | `Content_Checksum_flag`   |
 | 1-0        | `Dictionary_ID_flag`      |
 
-In this table, bit 7 the is highest bit, while bit 0 the is lowest.
+In this table, bit 7 is the highest bit, while bit 0 is the lowest one.
 
 __`Frame_Content_Size_flag`__
 
 This is a 2-bits flag (`= Frame_Header_Descriptor >> 6`),
-specifying if decompressed data size is provided within the header.
-The `Flag_Value` can be converted into `Field_Size`,
+specifying if `Frame_Content_Size` (the decompressed data size)
+is provided within the header.
+`Flag_Value` provides `FCS_Field_Size`,
 which is the number of bytes used by `Frame_Content_Size`
 according to the following table:
 
-|`Flag_Value`|    0   |  1  |  2  |  3  |
-| ---------- | ------ | --- | --- | --- |
-|`Field_Size`| 0 or 1 |  2  |  4  |  8  |
+|  `Flag_Value`  |    0   |  1  |  2  |  3  |
+| -------------- | ------ | --- | --- | --- |
+|`FCS_Field_Size`| 0 or 1 |  2  |  4  |  8  |
 
-When `Flag_Value` is `0`, `Field_Size` depends on `Single_Segment_flag` :
+When `Flag_Value` is `0`, `FCS_Field_Size` depends on `Single_Segment_flag` :
 if `Single_Segment_flag` is set, `Field_Size` is 1.
-Otherwise, `Field_Size` is 0 (content size not provided).
+Otherwise, `Field_Size` is 0 : `Frame_Content_Size` is not provided.
 
 __`Single_Segment_flag`__
 
 If this flag is set,
 data must be regenerated within a single continuous memory segment.
 
-In this case, `Frame_Content_Size` is necessarily present,
-but `Window_Descriptor` byte is skipped.
+In this case, `Window_Descriptor` byte is skipped,
+but `Frame_Content_Size` is necessarily present.
 As a consequence, the decoder must allocate a memory segment
 of size equal or bigger than `Frame_Content_Size`.
 
 In order to preserve the decoder from unreasonable memory requirements,
-a decoder can reject a compressed frame
+a decoder is allowed to reject a compressed frame
 which requests a memory size beyond decoder's authorized range.
 
 For broader compatibility, decoders are recommended to support
 memory sizes of at least 8 MB.
-This is just a recommendation,
+This is only a recommendation,
 each decoder is free to support higher or lower limits,
 depending on local limitations.
 
@@ -224,37 +226,38 @@ It also specifies the size of this field as `Field_Size`.
 
 #### `Window_Descriptor`
 
-Provides guarantees on maximum back-reference distance
-that will be used within compressed data.
+Provides guarantees on minimum memory buffer required to decompress a frame.
 This information is important for decoders to allocate enough memory.
 
-The `Window_Descriptor` byte is optional. It is absent when `Single_Segment_flag` is set.
-In this case, the maximum back-reference distance is the content size itself,
-which can be any value from 1 to 2^64-1 bytes (16 EB).
+The `Window_Descriptor` byte is optional.
+When `Single_Segment_flag` is set, `Window_Descriptor` is not present.
+In this case, `Window_Size` is `Frame_Content_Size`,
+which can be any value from 0 to 2^64-1 bytes (16 ExaBytes).
 
 | Bit numbers |     7-3    |     2-0    |
 | ----------- | ---------- | ---------- |
 | Field name  | `Exponent` | `Mantissa` |
 
-Maximum distance is given by the following formulas :
+The minimum memory buffer size is called `Window_Size`.
+It is described by the following formulas :
 ```
 windowLog = 10 + Exponent;
 windowBase = 1 << windowLog;
 windowAdd = (windowBase / 8) * Mantissa;
 Window_Size = windowBase + windowAdd;
 ```
-The minimum window size is 1 KB.
-The maximum size is `15*(1<<38)` bytes, which is 1.875 TB.
+The minimum `Window_Size` is 1 KB.
+The maximum `Window_Size` is `(1<<41) + 7*(1<<38)` bytes, which is 3.75 TB.
 
 To properly decode compressed data,
 a decoder will need to allocate a buffer of at least `Window_Size` bytes.
 
 In order to preserve decoder from unreasonable memory requirements,
-a decoder can refuse a compressed frame
+a decoder is allowed to reject a compressed frame
 which requests a memory size beyond decoder's authorized range.
 
 For improved interoperability,
-decoders are recommended to be compatible with window sizes of 8 MB,
+decoders are recommended to be compatible with `Window_Size >= 8 MB`,
 and encoders are recommended to not request more than 8 MB.
 It's merely a recommendation though,
 decoders are free to support larger or lower limits,
@@ -264,112 +267,118 @@ depending on local limitations.
 
 This is a variable size field, which contains
 the ID of the dictionary required to properly decode the frame.
-Note that this field is optional. When it's not present,
+`Dictionary_ID` field is optional. When it's not present,
 it's up to the decoder to make sure it uses the correct dictionary.
-Format is little-endian.
 
 Field size depends on `Dictionary_ID_flag`.
 1 byte can represent an ID 0-255.
 2 bytes can represent an ID 0-65535.
 4 bytes can represent an ID 0-4294967295.
+Format is __little-endian__.
 
 It's allowed to represent a small ID (for example `13`)
-with a large 4-bytes dictionary ID, losing some compacity in the process.
+with a large 4-bytes dictionary ID, even if it is less efficient.
 
 _Reserved ranges :_
 If the frame is going to be distributed in a private environment,
 any dictionary ID can be used.
 However, for public distribution of compressed frames using a dictionary,
-the following ranges are reserved for future use and should not be used :
-- low range : 1 - 32767
-- high range : >= (2^31)
-
+the following ranges are reserved and shall not be used :
+- low range  : `<= 32767`
+- high range : `>= (1 << 31)`
 
 #### `Frame_Content_Size`
 
 This is the original (uncompressed) size. This information is optional.
-The `Field_Size` is provided according to value of `Frame_Content_Size_flag`.
-The `Field_Size` can be equal to 0 (not present), 1, 2, 4 or 8 bytes.
-Format is little-endian.
-
-| `Field_Size` |    Range   |
-| ------------ | ---------- |
-|      1       |   0 - 255  |
-|      2       | 256 - 65791|
-|      4       | 0 - 2^32-1 |
-|      8       | 0 - 2^64-1 |
-
-When `Field_Size` is 1, 4 or 8 bytes, the value is read directly.
-When `Field_Size` is 2, _the offset of 256 is added_.
+`Frame_Content_Size` uses a variable number of bytes, provided by `FCS_Field_Size`.
+`FCS_Field_Size` is provided by the value of `Frame_Content_Size_flag`.
+`FCS_Field_Size` can be equal to 0 (not present), 1, 2, 4 or 8 bytes.
+
+| `FCS_Field_Size` |    Range   |
+| ---------------- | ---------- |
+|        0         |   unknown  |
+|        1         |   0 - 255  |
+|        2         | 256 - 65791|
+|        4         | 0 - 2^32-1 |
+|        8         | 0 - 2^64-1 |
+
+`Frame_Content_Size` format is __little-endian__.
+When `FCS_Field_Size` is 1, 4 or 8 bytes, the value is read directly.
+When `FCS_Field_Size` is 2, _the offset of 256 is added_.
 It's allowed to represent a small size (for example `18`) using any compatible variant.
 
+
 Blocks
 -------
-After the magic number and header of each block,
-there are some number of blocks.
-Each frame must have at least one block  but there is no upper limit
-on the number of blocks per frame.
+
+After `Magic_Number` and `Frame_Header`, there are some number of blocks.
+Each frame must have at least one block,
+but there is no upper limit on the number of blocks per frame.
 
 The structure of a block is as follows:
 
-| `Last_Block` | `Block_Type` | `Block_Size` | `Block_Content` |
-|:------------:|:------------:|:------------:|:---------------:|
-|   1 bit      |  2 bits      |  21 bits     |  n bytes        |
+| `Block_Header` | `Block_Content` |
+|:--------------:|:---------------:|
+|    3 bytes     |     n bytes     |
+
+`Block_Header` uses 3 bytes, written using __little-endian__ convention.
+It contains 3 fields :
 
-The block header (`Last_Block`, `Block_Type`, and `Block_Size`) uses 3-bytes.
+| `Last_Block` | `Block_Type` | `Block_Size` |
+|:------------:|:------------:|:------------:|
+|    bit 0     |  bits 1-2    |  bits 3-23   |
 
 __`Last_Block`__
 
 The lowest bit signals if this block is the last one.
-The frame will end after this one.
+The frame will end after this last block.
 It may be followed by an optional `Content_Checksum`
 (see [Zstandard Frames](#zstandard-frames)).
 
-__`Block_Type` and `Block_Size`__
-
-The next 2 bits represent the `Block_Type`,
-while the remaining 21 bits represent the `Block_Size`.
-Format is __little-endian__.
+__`Block_Type`__
 
+The next 2 bits represent the `Block_Type`.
 There are 4 block types :
 
-|    Value     |      0      |     1       |  2                 |    3      |
+|    Value     |      0      |      1      |         2          |     3     |
 | ------------ | ----------- | ----------- | ------------------ | --------- |
 | `Block_Type` | `Raw_Block` | `RLE_Block` | `Compressed_Block` | `Reserved`|
 
 - `Raw_Block` - this is an uncompressed block.
-  `Block_Content` contains `Block_Size` bytes to read and copy
-  as decoded data.
+  `Block_Content` contains `Block_Size` bytes.
 
-- `RLE_Block` - this is a single byte, repeated N times.
-  `Block_Content` consists of a single byte,
-  and `Block_Size` is the number of times this byte should be repeated.
+- `RLE_Block` - this is a single byte, repeated `Block_Size` times.
+  `Block_Content` consists of a single byte.
+  On the decompression side, this byte must be repeated `Block_Size` times.
 
 - `Compressed_Block` - this is a [Zstandard compressed block](#compressed-blocks),
   explained later on.
   `Block_Size` is the length of `Block_Content`, the compressed data.
-  The decompressed size is unknown,
+  The decompressed size is not known,
   but its maximum possible value is guaranteed (see below)
 
 - `Reserved` - this is not a block.
   This value cannot be used with current version of this specification.
 
+__`Block_Size`__
+
+The upper 21 bits of `Block_Header` represent the `Block_Size`.
+
 Block sizes must respect a few rules :
-- In compressed mode, compressed size is always strictly less than decompressed size.
-- Block decompressed size is always <= maximum back-reference distance.
+- For `Compressed_Block`, `Block_Size` is always strictly less than decompressed size.
+- Block decompressed size is always <= `Window_Size`
 - Block decompressed size is always <= 128 KB.
 
-A data block is not necessarily "full" :
-since an arbitrary “flush” may happen anytime,
-block decompressed content can be any size (even empty),
+A block can contain any number of bytes (even empty),
 up to `Block_Maximum_Decompressed_Size`, which is the smallest of :
-- Maximum back-reference distance
+- `Window_Size`
 - 128 KB
 
+
 Compressed Blocks
 -----------------
-To decompress a compressed block, the compressed size must be provided from
-`Block_Size` field in the block header.
+To decompress a compressed block, the compressed size must be provided
+from `Block_Size` field within `Block_Header`.
 
 A compressed block consists of 2 sections :
 - [Literals Section](#literals-section)
@@ -381,36 +390,34 @@ data in [Sequence Execution](#sequence-execution)
 #### Prerequisites
 To decode a compressed block, the following elements are necessary :
 - Previous decoded data, up to a distance of `Window_Size`,
-  or all previous data when `Single_Segment_flag` is set.
-- List of "recent offsets" from the previous compressed block.
-- Decoding tables of the previous compressed block for each symbol type
+  or all previously decoded data when `Single_Segment_flag` is set.
+- List of "recent offsets" from previous `Compressed_Block`.
+- Decoding tables of previous `Compressed_Block` for each symbol type
   (literals, literals lengths, match lengths, offsets).
 
 Literals Section
 ----------------
-During sequence execution, symbols from the literals section
-During sequence phase, literals will be entangled with match copy operations.
 All literals are regrouped in the first part of the block.
-They can be decoded first, and then copied during sequence operations,
-or they can be decoded on the flow, as needed by sequence commands.
-
-| `Literals_Section_Header` | [`Huffman_Tree_Description`] | Stream1 | [Stream2] | [Stream3] | [Stream4] |
-| ------------------------- | ---------------------------- | ------- | --------- | --------- | --------- |
+They can be decoded first, and then copied during [Sequence Execution],
+or they can be decoded on the flow during [Sequence Execution].
 
 Literals can be stored uncompressed or compressed using Huffman prefix codes.
 When compressed, an optional tree description can be present,
 followed by 1 or 4 streams.
 
+| `Literals_Section_Header` | [`Huffman_Tree_Description`] | Stream1 | [Stream2] | [Stream3] | [Stream4] |
+| ------------------------- | ---------------------------- | ------- | --------- | --------- | --------- |
+
 
 #### `Literals_Section_Header`
 
 Header is in charge of describing how literals are packed.
 It's a byte-aligned variable-size bitfield, ranging from 1 to 5 bytes,
-using little-endian convention.
+using __little-endian__ convention.
 
 | `Literals_Block_Type` | `Size_Format` | `Regenerated_Size` | [`Compressed_Size`] |
-| --------------------- | ------------- | ------------------ | ----------------- |
-|   2 bits              |  1 - 2 bits   |    5 - 20 bits     |    0 - 18 bits    |
+| --------------------- | ------------- | ------------------ | ------------------- |
+|       2 bits          |  1 - 2 bits   |    5 - 20 bits     |     0 - 18 bits     |
 
 In this representation, bits on the left are the lowest bits.
 
@@ -418,33 +425,38 @@ __`Literals_Block_Type`__
 
 This field uses 2 lowest bits of first byte, describing 4 different block types :
 
-| `Literals_Block_Type`         | Value |
-| ----------------------------- | ----- |
-| `Raw_Literals_Block`          |   0   |
-| `RLE_Literals_Block`          |   1   |
-| `Compressed_Literals_Block`   |   2   |
-| `Repeat_Stats_Literals_Block` |   3   |
+| `Literals_Block_Type`       | Value |
+| --------------------------- | ----- |
+| `Raw_Literals_Block`        |   0   |
+| `RLE_Literals_Block`        |   1   |
+| `Compressed_Literals_Block` |   2   |
+| `Treeless_Literals_Block`   |   3   |
 
 - `Raw_Literals_Block` - Literals are stored uncompressed.
-- `RLE_Literals_Block` - Literals consist of a single byte value repeated N times.
+- `RLE_Literals_Block` - Literals consist of a single byte value
+        repeated `Regenerated_Size` times.
 - `Compressed_Literals_Block` - This is a standard Huffman-compressed block,
         starting with a Huffman tree description.
         See details below.
-- `Repeat_Stats_Literals_Block` - This is a Huffman-compressed block,
+- `Treeless_Literals_Block` - This is a Huffman-compressed block,
         using Huffman tree _from previous Huffman-compressed literals block_.
-        Huffman tree description will be skipped.
-        Note: If this mode is used without any previous Huffman-table in the frame
-        (or [dictionary](#dictionary-format)), this should be treated as corruption.
+        `Huffman_Tree_Description` will be skipped.
+        Note: If this mode is triggered without any previous Huffman-table in the frame
+        (or [dictionary](#dictionary-format)), this should be treated as data corruption.
 
 __`Size_Format`__
 
 `Size_Format` is divided into 2 families :
 
-- For `Raw_Literals_Block` and `RLE_Literals_Block` it's enough to decode `Regenerated_Size`.
-- For `Compressed_Block`, its required to decode both `Compressed_Size`
-  and `Regenerated_Size` (the decompressed size). It will also decode the number of streams.
+- For `Raw_Literals_Block` and `RLE_Literals_Block`,
+  it's only necessary to decode `Regenerated_Size`.
+  There is no `Compressed_Size` field.
+- For `Compressed_Block` and `Treeless_Literals_Block`,
+  it's required to decode both `Compressed_Size`
+  and `Regenerated_Size` (the decompressed size).
+  It's also necessary to decode the number of streams (1 or 4).
 
-For values spanning several bytes, convention is little-endian.
+For values spanning several bytes, convention is __little-endian__.
 
 __`Size_Format` for `Raw_Literals_Block` and `RLE_Literals_Block`__ :
 
@@ -463,9 +475,9 @@ __`Size_Format` for `Raw_Literals_Block` and `RLE_Literals_Block`__ :
 
 Only Stream1 is present for these cases.
 Note : it's allowed to represent a short value (for example `13`)
-using a long format, accepting the increased compressed data size.
+using a long format, even if it's less efficient.
 
-__`Size_Format` for `Compressed_Literals_Block` and `Repeat_Stats_Literals_Block`__ :
+__`Size_Format` for `Compressed_Literals_Block` and `Treeless_Literals_Block`__ :
 
 - Value 00 : _A single stream_.
                Both `Regenerated_Size` and `Compressed_Size` use 10 bits (0-1023).
@@ -480,67 +492,68 @@ __`Size_Format` for `Compressed_Literals_Block` and `Repeat_Stats_Literals_Block
                Both `Regenerated_Size` and `Compressed_Size` use 18 bits (0-262143).
                `Literals_Section_Header` has 5 bytes.
 
-Both `Compressed_Size` and `Regenerated_Size` fields follow little-endian convention.
-Note: `Compressed_Size` __includes__ the size of the Huffman Tree description if it
-is present.
+Both `Compressed_Size` and `Regenerated_Size` fields follow __little-endian__ convention.
+Note: `Compressed_Size` __includes__ the size of the Huffman Tree description
+_when_ it is present.
 
 ### Raw Literals Block
-The data in Stream1 is `Regenerated_Size` bytes long, and contains the raw literals data
-to be used in sequence execution.
+The data in Stream1 is `Regenerated_Size` bytes long,
+it contains the raw literals data to be used during [Sequence Execution].
 
 ### RLE Literals Block
 Stream1 consists of a single byte which should be repeated `Regenerated_Size` times
 to generate the decoded literals.
 
-### Compressed Literals Block and Repeat Stats Literals Block
-Both of these modes contain Huffman encoded data 
+### Compressed Literals Block and Treeless Literals Block
+Both of these modes contain Huffman encoded data.
+`Treeless_Literals_Block` does not have a `Huffman_Tree_Description`.
 
 #### `Huffman_Tree_Description`
 This section is only present when `Literals_Block_Type` type is `Compressed_Literals_Block` (`2`).
 The format of the Huffman tree description can be found at [Huffman Tree description](#huffman-tree-description).
-The size Huffman Tree description will be determined during the decoding process,
-and must be used to determine where the compressed Huffman streams begin.
+The size of `Huffman_Tree_Description` is determined during decoding process,
+it must be used to determine where streams begin.
+`Total_Streams_Size = Compressed_Size - Huffman_Tree_Description_Size`.
 
-If repeat stats mode is used, the Huffman table used in the previous compressed block will
-be used to decompress this block as well.
+For `Treeless_Literals_Block`,
+the Huffman table comes from previously compressed literals block.
 
-Huffman compressed data consists either 1 or 4 Huffman-coded streams.
+Huffman compressed data consists of either 1 or 4 Huffman-coded streams.
 
 If only one stream is present, it is a single bitstream occupying the entire
-remaining portion of the literals block, encoded as described at
+remaining portion of the literals block, encoded as described within
 [Huffman-Coded Streams](#huffman-coded-streams).
 
 If there are four streams, the literals section header only provides enough
-information to know the regenerated and compressed sizes of all four streams combined.
-The regenerated size of each stream is equal to `(totalSize+3)/4`, except for the last stream,
-which may be up to 3 bytes smaller, to reach a total decompressed size match that described
-in the literals header.
+information to know the decompressed and compressed sizes of all four streams _combined_.
+The decompressed size of each stream is equal to `(Regenerated_Size+3)/4`,
+except for the last stream which may be up to 3 bytes smaller,
+to reach a total decompressed size as specified in `Regenerated_Size`.
 
-The compressed size of each stream is provided explicitly: the first 6 bytes of the compressed
-data consist of three 2-byte little endian fields, describing the compressed sizes
-of the first three streams.
-The last streams size is computed from the total compressed size and the size of the other
-three streams.
+The compressed size of each stream is provided explicitly:
+the first 6 bytes of the compressed data consist of three 2-byte __little-endian__ fields,
+describing the compressed sizes of the first three streams.
+`Stream4_Size` is computed from total `Total_Streams_Size` minus sizes of other streams.
 
-`stream4CSize = totalCSize - 6 - stream1CSize - stream2CSize - stream3CSize`.
+`Stream4_Size = Total_Streams_Size - 6 - Stream1_Size - Stream2_Size - Stream3_Size`.
 
-Note: remember that totalCSize may be smaller than the `Compressed_Size` found in the literals
-block header as `Compressed_Size` also contains the size of the Huffman Tree description if it
-is present.
+Note: remember that `Total_Streams_Size` can be smaller than `Compressed_Size` in header,
+because `Compressed_Size` also contains `Huffman_Tree_Description_Size` when it is present.
 
 Each of these 4 bitstreams is then decoded independently as a Huffman-Coded stream,
 as described at [Huffman-Coded Streams](#huffman-coded-streams)
 
+
 Sequences Section
 -----------------
 A compressed block is a succession of _sequences_ .
 A sequence is a literal copy command, followed by a match copy command.
 A literal copy command specifies a length.
-It is the number of bytes to be copied (or extracted) from the literal section.
+It is the number of bytes to be copied (or extracted) from the Literals Section.
 A match copy command specifies an offset and a length.
 
 When all _sequences_ are decoded,
-if there is are any literals left in the _literal section_,
+if there are literals left in the _literal section_,
 these bytes are added at the end of the block.
 
 This is described in more detail in [Sequence Execution](#sequence-execution)
@@ -557,7 +570,7 @@ followed by the bitstream.
 | -------------------------- | ------------------------- | ---------------- | ---------------------- | --------- |
 
 To decode the `Sequences_Section`, it's required to know its size.
-This size is deduced from `blockSize - literalSectionSize`.
+This size is deduced from `Block_Size - Literals_Section_Size`.
 
 
 #### `Sequences_Section_Header`
@@ -572,7 +585,7 @@ This is a variable size field using between 1 and 3 bytes.
 Let's call its first byte `byte0`.
 - `if (byte0 == 0)` : there are no sequences.
             The sequence section stops there.
-            Regenerated content is defined entirely by literals section.
+            Decompressed content is defined entirely as Literals Section content.
 - `if (byte0 < 128)` : `Number_of_Sequences = byte0` . Uses 1 byte.
 - `if (byte0 < 255)` : `Number_of_Sequences = ((byte0-128) << 8) + byte1` . Uses 2 bytes.
 - `if (byte0 == 255)`: `Number_of_Sequences = byte1 + (byte2<<8) + 0x7F00` . Uses 3 bytes.
@@ -581,14 +594,14 @@ __Symbol compression modes__
 
 This is a single byte, defining the compression mode of each symbol type.
 
-|Bit number|   7-6                   |   5-4          |   3-2                |     1-0    |
+|Bit number|          7-6            |      5-4       |        3-2           |     1-0    |
 | -------- | ----------------------- | -------------- | -------------------- | ---------- |
 |Field name| `Literals_Lengths_Mode` | `Offsets_Mode` | `Match_Lengths_Mode` | `Reserved` |
 
 The last field, `Reserved`, must be all-zeroes.
 
 `Literals_Lengths_Mode`, `Offsets_Mode` and `Match_Lengths_Mode` define the `Compression_Mode` of
-literals lengths, offsets, and match lengths respectively.
+literals lengths, offsets, and match lengths symbols respectively.
 
 They follow the same enumeration :
 
@@ -598,17 +611,17 @@ They follow the same enumeration :
 
 - `Predefined_Mode` : A predefined FSE distribution table is used, defined in
           [default distributions](#default-distributions).
-          The table takes no space in the compressed data.
+          No distribution table will be present.
 - `RLE_Mode` : The table description consists of a single byte.
-          This code will be repeated for every sequence.
+          This code will be repeated for all sequences.
 - `Repeat_Mode` : The table used in the previous compressed block will be used again.
           No distribution table will be present.
-          Note: this includes RLE mode, so if repeat_mode follows rle_mode the same symbol will be repeated.
+          Note: this includes RLE mode, so if `Repeat_Mode` follows `RLE_Mode`, the same symbol will be repeated.
           If this mode is used without any previous sequence table in the frame
           (or [dictionary](#dictionary-format)) to repeat, this should be treated as corruption.
 - `FSE_Compressed_Mode` : standard FSE compression.
           A distribution table will be present.
-          The format of this distribution table is described in (FSE Table Description)[#fse-table-description].
+          The format of this distribution table is described in [FSE Table Description](#fse-table-description).
           Note that the maximum allowed accuracy log for literals length and match length tables is 9,
           and the maximum accuracy log for the offsets table is 8.
 
@@ -625,7 +638,7 @@ Literals length codes are values ranging from `0` to `35` included.
 They define lengths from 0 to 131071 bytes.
 The literals length is equal to the decoded `Baseline` plus
 the result of reading `Number_of_Bits` bits from the bitstream,
-as a little-endian value.
+as a __little-endian__ value.
 
 | `Literals_Length_Code` |         0-15           |
 | ---------------------- | ---------------------- |
@@ -654,7 +667,7 @@ Match length codes are values ranging from `0` to `52` included.
 They define lengths from 3 to 131074 bytes.
 The match length is equal to the decoded `Baseline` plus
 the result of reading `Number_of_Bits` bits from the bitstream,
-as a little-endian value.
+as a __little-endian__ value.
 
 | `Match_Length_Code` |         0-31            |
 | ------------------- | ----------------------- |
@@ -685,7 +698,7 @@ Recommendation is to support at least up to `22`.
 For information, at the time of this writing.
 the reference decoder supports a maximum `N` value of `28` in 64-bits mode.
 
-An offset code is also the number of additional bits to read in little-endian fashion,
+An offset code is also the number of additional bits to read in __little-endian__ fashion,
 and can be translated into an `Offset_Value` using the following formulas :
 
 ```
@@ -720,8 +733,8 @@ begins.
 FSE decoding requires a 'state' to be carried from symbol to symbol.
 For more explanation on FSE decoding, see the [FSE section](#fse).
 
-For sequence decoding, a separate state must be kept track of for each of
-literal lengths, offsets, and match lengths.
+For sequence decoding, a separate state keeps track of each
+literal lengths, offsets, and match lengths symbols.
 Some FSE primitives are also used.
 For more details on the operation of these primitives, see the [FSE section](#fse).
 
@@ -753,8 +766,7 @@ See the [description of the codes] for how to determine these values.
 [description of the codes]: #the-codes-for-literals-lengths-match-lengths-and-offsets
 
 Decoding starts by reading the `Number_of_Bits` required to decode `Offset`.
-It then does the same for `Match_Length`,
-and then for `Literals_Length`.
+It then does the same for `Match_Length`, and then for `Literals_Length`.
 This sequence is then used for [sequence execution](#sequence-execution).
 
 If it is not the last sequence in the block,
@@ -807,6 +819,7 @@ short offsetCodes_defaultDistribution[29] =
           1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1 };
 ```
 
+
 Sequence Execution
 ------------------
 Once literals and sequences have been decoded,
@@ -826,7 +839,8 @@ in this case.
 
 The offset is defined as from the current position, so an offset of 6
 and a match length of 3 means that 3 bytes should be copied from 6 bytes back.
-Note that all offsets must be at most equal to the window size defined by the frame header.
+Note that all offsets leading to previously decoded data
+must be smaller than `Window_Size` defined in `Frame_Header_Descriptor`.
 
 #### Repeat offsets
 As seen in [Sequence Execution](#sequence-execution),
@@ -842,11 +856,10 @@ so an `offset_value` of 1 means `Repeated_Offset2`,
 an `offset_value` of 2 means `Repeated_Offset3`,
 and an `offset_value` of 3 means `Repeated_Offset1 - 1_byte`.
 
-In the first block, the offset history is populated with the following values : 1, 4 and 8 (in order).
+For the first block, the starting offset history is populated with the following values : 1, 4 and 8 (in order).
 
-Then each block gets its starting offset history from the ending values of the most recent compressed block.
-Note that non-compressed blocks are skipped,
-they do not contribute to offset history.
+Then each block gets its starting offset history from the ending values of the most recent `Compressed_Block`.
+Note that blocks which are not `Compressed_Block` are skipped, they do not contribute to offset history.
 
 [Offset Codes]: #offset-codes
 
@@ -859,6 +872,7 @@ This means that when `Repeated_Offset1` (most recent) is used, history is unmodi
 When `Repeated_Offset2` is used, it's swapped with `Repeated_Offset1`.
 If any other offset is used, it becomes `Repeated_Offset1` and the rest are shift back by one.
 
+
 Skippable Frames
 ----------------
 
@@ -878,7 +892,7 @@ Skippable frames defined in this specification are compatible with [LZ4] ones.
 
 __`Magic_Number`__
 
-4 Bytes, little-endian format.
+4 Bytes, __little-endian__ format.
 Value : 0x184D2A5?, which means any value from 0x184D2A50 to 0x184D2A5F.
 All 16 values are valid to identify a skippable frame.
 
@@ -886,13 +900,14 @@ __`Frame_Size`__
 
 This is the size, in bytes, of the following `User_Data`
 (without including the magic number nor the size field itself).
-This field is represented using 4 Bytes, little-endian format, unsigned 32-bits.
+This field is represented using 4 Bytes, __little-endian__ format, unsigned 32-bits.
 This means `User_Data` can’t be bigger than (2^32-1) bytes.
 
 __`User_Data`__
 
 The `User_Data` can be anything. Data will just be skipped by the decoder.
 
+
 Entropy Encoding
 ----------------
 Two types of entropy encoding are used by the Zstandard format:
@@ -900,7 +915,7 @@ FSE, and Huffman coding.
 
 FSE
 ---
-FSE, or FiniteStateEntropy is an entropy coding based on [ANS].
+FSE, short for Finite State Entropy, is an entropy codec based on [ANS].
 FSE encoding/decoding involves a state that is carried over between symbols,
 so decoding must be done in the opposite direction as encoding.
 Therefore, all FSE bitstreams are read from end to beginning.
@@ -909,15 +924,15 @@ For additional details on FSE, see [Finite State Entropy].
 
 [Finite State Entropy]:https://github.com/Cyan4973/FiniteStateEntropy/
 
-FSE decoding involves a decoding table which has a power of 2 size and three elements:
+FSE decoding involves a decoding table which has a power of 2 size, and contain three elements:
 `Symbol`, `Num_Bits`, and `Baseline`.
 The `log2` of the table size is its `Accuracy_Log`.
 The FSE state represents an index in this table.
-The next symbol in the stream is the symbol indicated by the table value for that state.
-To obtain the next state value,
-the decoder should consume `Num_Bits` bits from the stream as a little endian value and add it to baseline.
 
-To obtain the initial state value, consume `Accuracy_Log` bits from the stream as a little endian value.
+To obtain the initial state value, consume `Accuracy_Log` bits from the stream as a __little-endian__ value.
+The next symbol in the stream is the `Symbol` indicated in the table for that state.
+To obtain the next state value,
+the decoder should consume `Num_Bits` bits from the stream as a __little-endian__ value and add it to `Baseline`.
 
 [ANS]: https://en.wikipedia.org/wiki/Asymmetric_Numeral_Systems
 
@@ -929,7 +944,7 @@ An FSE distribution table describes the probabilities of all symbols
 from `0` to the last present one (included)
 on a normalized scale of `1 << Accuracy_Log` .
 
-It's a bitstream which is read forward, in little-endian fashion.
+It's a bitstream which is read forward, in __little-endian__ fashion.
 It's not necessary to know its exact size,
 since it will be discovered and reported by the decoding process.
 
@@ -1064,7 +1079,7 @@ Huffman Coding
 --------------
 Zstandard Huffman-coded streams are read backwards,
 similar to the FSE bitstreams.
-Therefore, to find the start of the bitstream it is therefore necessary to
+Therefore, to find the start of the bitstream, it is therefore to
 know the offset of the last byte of the Huffman-coded stream.
 
 After writing the last bit containing information, the compressor
@@ -1077,7 +1092,7 @@ byte to read. The decompressor needs to skip 0-7 initial `0`-bits and
 the first `1`-bit it occurs. Afterwards, the useful part of the bitstream
 begins.
 
-The bitstream contains Huffman-coded symbols in little-endian order,
+The bitstream contains Huffman-coded symbols in __little-endian__ order,
 with the codes defined by the method below.
 
 ### Huffman Tree Description
@@ -1182,14 +1197,14 @@ The Huffman header compression uses 2 states,
 which share the same FSE distribution table.
 The first state (`State1`) encodes the even indexed symbols,
 and the second (`State2`) encodes the odd indexes.
-State1 is initialized first, and then State2, and they take turns decoding
-a single symbol and updating their state.
+`State1` is initialized first, and then `State2`, and they take turns
+decoding a single symbol and updating their state.
 For more details on these FSE operations, see the [FSE section](#fse).
 
 The number of symbols to decode is determined
 by tracking bitStream overflow condition:
 If updating state after decoding a symbol would require more bits than
-remain in the stream, it is assumed the extra bits are 0.  Then,
+remain in the stream, it is assumed that extra bits are 0.  Then,
 the symbols for each of the final states are decoded and the process is complete.
 
 ##### Conversion from weights to Huffman prefix codes
@@ -1245,7 +1260,7 @@ it would be encoded as:
 |Encoding|`0000`|`0001`|`01`|`1`| `10000` |
 
 Starting from the end,
-it's possible to read the bitstream in a little-endian fashion,
+it's possible to read the bitstream in a __little-endian__ fashion,
 keeping track of already used bits.  Since the bitstream is encoded in reverse
 order, by starting at the end the symbols can be read in forward order.
 
@@ -1258,13 +1273,14 @@ If a bitstream is not entirely and exactly consumed,
 hence reaching exactly its beginning position with _all_ bits consumed,
 the decoding process is considered faulty.
 
+
 Dictionary Format
 -----------------
 
-Zstandard is compatible with "raw content" dictionaries, free of any format restriction,
-except that they must be at least 8 bytes.
-These dictionaries function as if they were just the `Content` block of a formatted
-dictionary.
+Zstandard is compatible with "raw content" dictionaries,
+free of any format restriction, except that they must be at least 8 bytes.
+These dictionaries function as if they were just the `Content` part
+of a formatted dictionary.
 
 But dictionaries created by `zstd --train` follow a format, described here.
 
@@ -1274,9 +1290,9 @@ __Pre-requisites__ : a dictionary has a size,
 | `Magic_Number` | `Dictionary_ID` | `Entropy_Tables` | `Content` |
 | -------------- | --------------- | ---------------- | --------- |
 
-__`Magic_Number`__ : 4 bytes ID, value 0xEC30A437, little-endian format
+__`Magic_Number`__ : 4 bytes ID, value 0xEC30A437, __little-endian__ format
 
-__`Dictionary_ID`__ : 4 bytes, stored in little-endian format.
+__`Dictionary_ID`__ : 4 bytes, stored in __little-endian__ format.
               `Dictionary_ID` can be any value, except 0 (which means no `Dictionary_ID`).
               It's used by decoders to check if they use the correct dictionary.
 
@@ -1284,9 +1300,9 @@ _Reserved ranges :_
               If the frame is going to be distributed in a private environment,
               any `Dictionary_ID` can be used.
               However, for public distribution of compressed frames,
-              the following ranges are reserved for future use and should not be used :
+              the following ranges are reserved and shall not be used :
 
-              - low range : 1 - 32767
+              - low range  : <= 32767
               - high range : >= (2^31)
 
 __`Entropy_Tables`__ : following the same format as the tables in compressed blocks.
@@ -1298,26 +1314,30 @@ __`Entropy_Tables`__ : following the same format as the tables in compressed blo
               These tables populate the Repeat Stats literals mode and
               Repeat distribution mode for sequence decoding.
               It's finally followed by 3 offset values, populating recent offsets (instead of using `{1,4,8}`),
-              stored in order, 4-bytes little-endian each, for a total of 12 bytes.
+              stored in order, 4-bytes __little-endian__ each, for a total of 12 bytes.
               Each recent offset must have a value < dictionary size.
 
 __`Content`__ : The rest of the dictionary is its content.
               The content act as a "past" in front of data to compress or decompress,
               so it can be referenced in sequence commands.
               As long as the amount of data decoded from this frame is less than or
-              equal to the window-size, sequence commands may specify offsets longer
-              than the lenght of total decoded output so far to reference back to the
-              dictionary.  After the total output has surpassed the window size however,
+              equal to `Window_Size`, sequence commands may specify offsets longer
+              than the total length of decoded output so far to reference back to the
+              dictionary.  After the total output has surpassed `Window_Size` however,
               this is no longer allowed and the dictionary is no longer accessible.
 
 [compressed blocks]: #the-format-of-compressed_block
+
+
 Appendix A - Decoding tables for predefined codes
 -------------------------------------------------
 
-This appendix contains FSE decoding tables for the predefined literal length, match length, and offset
-codes. The tables have been constructed using the algorithm as given above in the
-"from normalized distribution to decoding tables" chapter. The tables here can be used as examples
-to crosscheck that an implementation implements the decoding table generation algorithm correctly.
+This appendix contains FSE decoding tables
+for the predefined literal length, match length, and offset codes.
+The tables have been constructed using the algorithm as given above in chapter
+"from normalized distribution to decoding tables".
+The tables here can be used as examples
+to crosscheck that an implementation build its decoding tables correctly.
 
 #### Literal Length Code:
 
@@ -1496,6 +1516,7 @@ to crosscheck that an implementation implements the decoding table generation al
 
 Version changes
 ---------------
+- 0.2.5 : minor typos and clarifications
 - 0.2.4 : section restructuring, by Sean Purcell
 - 0.2.3 : clarified several details, by Sean Purcell
 - 0.2.2 : added predefined codes, by Johannes Rudolph
diff --git a/doc/zstd_manual.html b/doc/zstd_manual.html
index 204f56ea5f21..2e77e7742f60 100644
--- a/doc/zstd_manual.html
+++ b/doc/zstd_manual.html
@@ -1,10 +1,10 @@
 <html>
 <head>
 <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
-<title>zstd 1.1.4 Manual</title>
+<title>zstd 1.2.0 Manual</title>
 </head>
 <body>
-<h1>zstd 1.1.4 Manual</h1>
+<h1>zstd 1.2.0 Manual</h1>
 <hr>
 <a name="Contents"></a><h2>Contents</h2>
 <ol>
@@ -57,46 +57,46 @@
 <pre><b>size_t ZSTD_compress( void* dst, size_t dstCapacity,
                 const void* src, size_t srcSize,
                       int compressionLevel);
-</b><p>    Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
-    Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
-    @return : compressed size written into `dst` (<= `dstCapacity),
-              or an error code if it fails (which can be tested using ZSTD_isError()). 
+</b><p>  Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
+  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+  @return : compressed size written into `dst` (<= `dstCapacity),
+            or an error code if it fails (which can be tested using ZSTD_isError()). 
 </p></pre><BR>
 
 <pre><b>size_t ZSTD_decompress( void* dst, size_t dstCapacity,
                   const void* src, size_t compressedSize);
-</b><p>    `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
-    `dstCapacity` is an upper bound of originalSize.
-    If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
-    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
-              or an errorCode if it fails (which can be tested using ZSTD_isError()). 
+</b><p>  `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
+  `dstCapacity` is an upper bound of originalSize.
+  If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
+  @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+            or an errorCode if it fails (which can be tested using ZSTD_isError()). 
 </p></pre><BR>
 
 <pre><b>unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
-</b><p>   NOTE: This function is planned to be obsolete, in favour of ZSTD_getFrameContentSize.
-   ZSTD_getFrameContentSize functions the same way, returning the decompressed size of a single
-   frame, but distinguishes empty frames from frames with an unknown size, or errors.
-
-   Additionally, ZSTD_findDecompressedSize can be used instead.  It can handle multiple
-   concatenated frames in one buffer, and so is more general.
-   As a result however, it requires more computation and entire frames to be passed to it,
-   as opposed to ZSTD_getFrameContentSize which requires only a single frame's header.
-
-   'src' is the start of a zstd compressed frame.
-   @return : content size to be decompressed, as a 64-bits value _if known_, 0 otherwise.
-    note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode.
-             When `return==0`, data to decompress could be any size.
-             In which case, it's necessary to use streaming mode to decompress data.
-             Optionally, application can still use ZSTD_decompress() while relying on implied limits.
-             (For example, data may be necessarily cut into blocks <= 16 KB).
-    note 2 : decompressed size is always present when compression is done with ZSTD_compress()
-    note 3 : decompressed size can be very large (64-bits value),
-             potentially larger than what local system can handle as a single memory segment.
-             In which case, it's necessary to use streaming mode to decompress data.
-    note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
-             Always ensure result fits within application's authorized limits.
-             Each application can set its own limits.
-    note 5 : when `return==0`, if precise failure cause is needed, use ZSTD_getFrameParams() to know more. 
+</b><p>  NOTE: This function is planned to be obsolete, in favour of ZSTD_getFrameContentSize.
+  ZSTD_getFrameContentSize functions the same way, returning the decompressed size of a single
+  frame, but distinguishes empty frames from frames with an unknown size, or errors.
+
+  Additionally, ZSTD_findDecompressedSize can be used instead.  It can handle multiple
+  concatenated frames in one buffer, and so is more general.
+  As a result however, it requires more computation and entire frames to be passed to it,
+  as opposed to ZSTD_getFrameContentSize which requires only a single frame's header.
+
+  'src' is the start of a zstd compressed frame.
+  @return : content size to be decompressed, as a 64-bits value _if known_, 0 otherwise.
+   note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode.
+            When `return==0`, data to decompress could be any size.
+            In which case, it's necessary to use streaming mode to decompress data.
+            Optionally, application can still use ZSTD_decompress() while relying on implied limits.
+            (For example, data may be necessarily cut into blocks <= 16 KB).
+   note 2 : decompressed size is always present when compression is done with ZSTD_compress()
+   note 3 : decompressed size can be very large (64-bits value),
+            potentially larger than what local system can handle as a single memory segment.
+            In which case, it's necessary to use streaming mode to decompress data.
+   note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+            Always ensure result fits within application's authorized limits.
+            Each application can set its own limits.
+   note 5 : when `return==0`, if precise failure cause is needed, use ZSTD_getFrameParams() to know more. 
 </p></pre><BR>
 
 <h3>Helper functions</h3><pre></pre><b><pre>int         ZSTD_maxCLevel(void);               </b>/*!< maximum compression level available */<b>
@@ -106,28 +106,28 @@ const char* ZSTD_getErrorName(size_t code);     </b>/*!< provides readable strin
 </pre></b><BR>
 <a name="Chapter4"></a><h2>Explicit memory management</h2><pre></pre>
 
-<h3>Compression context</h3><pre>   When compressing many times,
-   it is recommended to allocate a context just once, and re-use it for each successive compression operation.
-   This will make workload friendlier for system's memory.
-   Use one context per thread for parallel execution in multi-threaded environments. 
+<h3>Compression context</h3><pre>  When compressing many times,
+  it is recommended to allocate a context just once, and re-use it for each successive compression operation.
+  This will make workload friendlier for system's memory.
+  Use one context per thread for parallel execution in multi-threaded environments. 
 </pre><b><pre>typedef struct ZSTD_CCtx_s ZSTD_CCtx;
 ZSTD_CCtx* ZSTD_createCCtx(void);
 size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);
 </pre></b><BR>
 <pre><b>size_t ZSTD_compressCCtx(ZSTD_CCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel);
-</b><p>    Same as ZSTD_compress(), requires an allocated ZSTD_CCtx (see ZSTD_createCCtx()). 
+</b><p>  Same as ZSTD_compress(), requires an allocated ZSTD_CCtx (see ZSTD_createCCtx()). 
 </p></pre><BR>
 
-<h3>Decompression context</h3><pre>   When decompressing many times,
-   it is recommended to allocate a context just once, and re-use it for each successive compression operation.
-   This will make workload friendlier for system's memory.
-   Use one context per thread for parallel execution in multi-threaded environments. 
+<h3>Decompression context</h3><pre>  When decompressing many times,
+  it is recommended to allocate a context just once, and re-use it for each successive compression operation.
+  This will make workload friendlier for system's memory.
+  Use one context per thread for parallel execution in multi-threaded environments. 
 </pre><b><pre>typedef struct ZSTD_DCtx_s ZSTD_DCtx;
 ZSTD_DCtx* ZSTD_createDCtx(void);
 size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
 </pre></b><BR>
 <pre><b>size_t ZSTD_decompressDCtx(ZSTD_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
-</b><p>   Same as ZSTD_decompress(), requires an allocated ZSTD_DCtx (see ZSTD_createDCtx()). 
+</b><p>  Same as ZSTD_decompress(), requires an allocated ZSTD_DCtx (see ZSTD_createDCtx()). 
 </p></pre><BR>
 
 <a name="Chapter5"></a><h2>Simple dictionary API</h2><pre></pre>
@@ -169,9 +169,10 @@ size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
                                 void* dst, size_t dstCapacity,
                           const void* src, size_t srcSize,
                           const ZSTD_CDict* cdict);
-</b><p>   Compression using a digested Dictionary.
-   Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
-   Note that compression level is decided during dictionary creation. 
+</b><p>  Compression using a digested Dictionary.
+  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
+  Note that compression level is decided during dictionary creation.
+  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) 
 </p></pre><BR>
 
 <pre><b>ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
@@ -399,7 +400,7 @@ typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; v
 </p></pre><BR>
 
 <pre><b>ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, unsigned byReference,
-                                      ZSTD_parameters params, ZSTD_customMem customMem);
+                                      ZSTD_compressionParameters cParams, ZSTD_customMem customMem);
 </b><p>  Create a ZSTD_CDict using external alloc and free, and customized compression parameters 
 </p></pre><BR>
 
@@ -426,12 +427,19 @@ typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; v
    both values are optional, select `0` if unknown. 
 </p></pre><BR>
 
-<pre><b>size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
-                               void* dst, size_t dstCapacity,
-                         const void* src, size_t srcSize,
-                         const void* dict,size_t dictSize,
-                               ZSTD_parameters params);
-</b><p>   Same as ZSTD_compress_usingDict(), with fine-tune control of each compression parameter 
+<pre><b>size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
+                      void* dst, size_t dstCapacity,
+                const void* src, size_t srcSize,
+                const void* dict,size_t dictSize,
+                      ZSTD_parameters params);
+</b><p>   Same as ZSTD_compress_usingDict(), with fine-tune control over each compression parameter 
+</p></pre><BR>
+
+<pre><b>size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+                      void* dst, size_t dstCapacity,
+                const void* src, size_t srcSize,
+                const ZSTD_CDict* cdict, ZSTD_frameParameters fParams);
+</b><p>   Same as ZSTD_compress_usingCDict(), with fine-tune control over frame parameters 
 </p></pre><BR>
 
 <a name="Chapter15"></a><h2>Advanced decompression functions</h2><pre></pre>
@@ -491,20 +499,29 @@ typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; v
     Note : this use case also happens when using a non-conformant dictionary.
   - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
   - This is not a Zstandard frame.
-  When identifying the exact failure cause, it's possible to used ZSTD_getFrameParams(), which will provide a more precise error code. 
+  When identifying the exact failure cause, it's possible to use ZSTD_getFrameParams(), which will provide a more precise error code. 
 </p></pre><BR>
 
 <a name="Chapter16"></a><h2>Advanced streaming functions</h2><pre></pre>
 
 <h3>Advanced Streaming compression functions</h3><pre></pre><b><pre>ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);   </b>/**< size of CStream is variable, depending primarily on compression level */<b>
 size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize);   </b>/**< pledgedSrcSize must be correct, a size of 0 means unknown.  for a frame size of 0 use initCStream_advanced */<b>
 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); </b>/**< note: a dict will not be used if dict == NULL or dictSize < 8 */<b>
 size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize,
                                              ZSTD_parameters params, unsigned long long pledgedSrcSize);  </b>/**< pledgedSrcSize is optional and can be 0 (meaning unknown). note: if the contentSizeFlag is set, pledgedSrcSize == 0 means the source size is actually 0 */<b>
 size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);  </b>/**< note : cdict will just be referenced, and must outlive compression session */<b>
-size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);  </b>/**< re-use compression parameters from previous init; skip dictionary loading stage; zcs must be init at least once before. note: pledgedSrcSize must be correct, a size of 0 means unknown.  for a frame size of 0 use initCStream_advanced */<b>
-size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
+size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, unsigned long long pledgedSrcSize, ZSTD_frameParameters fParams);  </b>/**< same as ZSTD_initCStream_usingCDict(), with control over frame parameters */<b>
 </pre></b><BR>
+<pre><b>size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
+</b><p>  start a new compression job, using same parameters from previous job.
+  This is typically useful to skip dictionary loading stage, since it will re-use it in-place..
+  Note that zcs must be init at least once before using ZSTD_resetCStream().
+  pledgedSrcSize==0 means "srcSize unknown".
+  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
+  @return : 0, or an error code (which can be tested using ZSTD_isError()) 
+</p></pre><BR>
+
 <h3>Advanced Streaming decompression functions</h3><pre></pre><b><pre>typedef enum { DStream_p_maxWindowSize } ZSTD_DStreamParameter_e;
 ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
 size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); </b>/**< note: a dict will not be used if dict == NULL or dictSize < 8 */<b>
@@ -552,10 +569,9 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
 <h3>Buffer-less streaming compression functions</h3><pre></pre><b><pre>size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
 size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
 size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); </b>/**< pledgedSrcSize is optional and can be 0 (meaning unknown). note: if the contentSizeFlag is set, pledgedSrcSize == 0 means the source size is actually 0 */<b>
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); </b>/**< note: fails if cdict==NULL */<b>
+size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize);   </b>/* compression parameters are already set within cdict. pledgedSrcSize=0 means null-size */<b>
 size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); </b>/**<  note: if pledgedSrcSize can be 0, indicating unknown size.  if it is non-zero, it must be accurate.  for 0 size frames, use compressBegin_advanced */<b>
-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, unsigned long long pledgedSrcSize); </b>/**< note: if pledgedSrcSize can be 0, indicating unknown size.  if it is non-zero, it must be accurate.  for 0 size frames, use compressBegin_advanced */<b>
-size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
-size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 </pre></b><BR>
 <a name="Chapter19"></a><h2>Buffer-less streaming decompression (synchronous mode)</h2><pre>
   A ZSTD_DCtx object is required to track streaming operations.
@@ -640,19 +656,20 @@ ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
     - Compressing and decompressing require a context structure
       + Use ZSTD_createCCtx() and ZSTD_createDCtx()
     - It is necessary to init context before starting
-      + compression : ZSTD_compressBegin()
-      + decompression : ZSTD_decompressBegin()
-      + variants _usingDict() are also allowed
-      + copyCCtx() and copyDCtx() work too
-    - Block size is limited, it must be <= ZSTD_getBlockSizeMax()
-      + If you need to compress more, cut data into multiple blocks
-      + Consider using the regular ZSTD_compress() instead, as frame metadata costs become negligible when source size is large.
+      + compression : any ZSTD_compressBegin*() variant, including with dictionary
+      + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
+      + copyCCtx() and copyDCtx() can be used too
+    - Block size is limited, it must be <= ZSTD_getBlockSizeMax() <= ZSTD_BLOCKSIZE_ABSOLUTEMAX
+      + If input is larger than a block size, it's necessary to split input data into multiple blocks
+      + For inputs larger than a single block size, consider using the regular ZSTD_compress() instead.
+        Frame metadata is not that costly, and quickly becomes negligible as source size grows larger.
     - When a block is considered not compressible enough, ZSTD_compressBlock() result will be zero.
       In which case, nothing is produced into `dst`.
       + User must test for such outcome and deal directly with uncompressed data
       + ZSTD_decompressBlock() doesn't accept uncompressed data as input !!!
-      + In case of multiple successive blocks, decoder must be informed of uncompressed block existence to follow proper history.
-        Use ZSTD_insertBlock() in such a case.
+      + In case of multiple successive blocks, should some of them be uncompressed,
+        decoder must be informed of their existence in order to follow proper history.
+        Use ZSTD_insertBlock() for such a case.
 <BR></pre>
 
 <h3>Raw zstd block functions</h3><pre></pre><b><pre>size_t ZSTD_getBlockSizeMax(ZSTD_CCtx* cctx);
diff --git a/examples/simple_compression.c b/examples/simple_compression.c
index 9d448712ec34..ab1131475575 100644
--- a/examples/simple_compression.c
+++ b/examples/simple_compression.c
@@ -116,7 +116,6 @@ static char* createOutFilename_orDie(const char* filename)
 int main(int argc, const char** argv)
 {
     const char* const exeName = argv[0];
-    const char* const inFilename = argv[1];
 
     if (argc!=2) {
         printf("wrong arguments\n");
@@ -125,6 +124,8 @@ int main(int argc, const char** argv)
         return 1;
     }
 
+    const char* const inFilename = argv[1];
+
     char* const outFilename = createOutFilename_orDie(inFilename);
     compress_orDie(inFilename, outFilename);
     free(outFilename);
diff --git a/examples/streaming_compression.c b/examples/streaming_compression.c
index 4c2c1a1d8bc7..24ad15bd614c 100644
--- a/examples/streaming_compression.c
+++ b/examples/streaming_compression.c
@@ -112,7 +112,6 @@ static const char* createOutFilename_orDie(const char* filename)
 int main(int argc, const char** argv)
 {
     const char* const exeName = argv[0];
-    const char* const inFilename = argv[1];
 
     if (argc!=2) {
         printf("wrong arguments\n");
@@ -121,6 +120,8 @@ int main(int argc, const char** argv)
         return 1;
     }
 
+    const char* const inFilename = argv[1];
+
     const char* const outFilename = createOutFilename_orDie(inFilename);
     compressFile_orDie(inFilename, outFilename, 1);
 
diff --git a/examples/streaming_decompression.c b/examples/streaming_decompression.c
index 400aa673d64b..bb2d80987081 100644
--- a/examples/streaming_decompression.c
+++ b/examples/streaming_decompression.c
@@ -99,7 +99,6 @@ static void decompressFile_orDie(const char* fname)
 int main(int argc, const char** argv)
 {
     const char* const exeName = argv[0];
-    const char* const inFilename = argv[1];
 
     if (argc!=2) {
         fprintf(stderr, "wrong arguments\n");
@@ -108,6 +107,8 @@ int main(int argc, const char** argv)
         return 1;
     }
 
+    const char* const inFilename = argv[1];
+
     decompressFile_orDie(inFilename);
     return 0;
 }
diff --git a/lib/Makefile b/lib/Makefile
index 197fdeeea033..d8d8e179d205 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -71,6 +71,9 @@ libzstd.a: $(ZSTD_OBJ)
 	@echo compiling static library
 	@$(AR) $(ARFLAGS) $@ $^
 
+libzstd.a-mt: CPPFLAGS += -DZSTD_MULTHREAD
+libzstd.a-mt: libzstd.a
+
 $(LIBZSTD): LDFLAGS += -shared -fPIC -fvisibility=hidden
 $(LIBZSTD): $(ZSTD_FILES)
 	@echo compiling dynamic library $(LIBVER)
@@ -86,10 +89,17 @@ endif
 
 libzstd : $(LIBZSTD)
 
+libzstd-mt : CPPFLAGS += -DZSTD_MULTITHREAD
+libzstd-mt : libzstd
+
 lib: libzstd.a libzstd
 
-lib-release: DEBUGFLAGS :=
+lib-mt: CPPFLAGS += -DZSTD_MULTITHREAD
+lib-mt: lib
+
+lib-release lib-release-mt: DEBUGFLAGS :=
 lib-release: lib
+lib-release-mt: lib-mt
 
 clean:
 	@$(RM) -r *.dSYM   # Mac OS-X specific
diff --git a/lib/README.md b/lib/README.md
index 3357e3d87096..79b6fd50014d 100644
--- a/lib/README.md
+++ b/lib/README.md
@@ -22,6 +22,14 @@ Some additional API may be useful if you're looking into advanced features :
                           They are not "stable", their definition may change in the future.
                           Only static linking is allowed.
 
+#### ZSTDMT API
+
+To enable multithreaded compression within the library, invoke `make lib-mt` target.
+Prototypes are defined in header file `compress/zstdmt_compress.h`.
+When linking a program that uses ZSTDMT API against libzstd.a on a POSIX system,
+`-pthread` flag must be provided to the compiler and linker.
+Note : ZSTDMT prototypes can still be used with a library built without multithread support,
+but in this case, they will be single threaded only.
 
 #### Modular build
 
diff --git a/lib/common/bitstream.h b/lib/common/bitstream.h
index d3873002ebd1..ca42850df324 100644
--- a/lib/common/bitstream.h
+++ b/lib/common/bitstream.h
@@ -2,7 +2,7 @@
    bitstream
    Part of FSE library
    header file (to include)
-   Copyright (C) 2013-2016, Yann Collet.
+   Copyright (C) 2013-2017, Yann Collet.
 
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 
@@ -53,6 +53,16 @@ extern "C" {
 #include "error_private.h"  /* error codes and messages */
 
 
+/*-*************************************
+*  Debug
+***************************************/
+#if defined(BIT_DEBUG) && (BIT_DEBUG>=1)
+#  include <assert.h>
+#else
+#  define assert(condition) ((void)0)
+#endif
+
+
 /*=========================================
 *  Target specific
 =========================================*/
@@ -74,7 +84,7 @@ extern "C" {
 typedef struct
 {
     size_t bitContainer;
-    int    bitPos;
+    unsigned bitPos;
     char*  startPtr;
     char*  ptr;
     char*  endPtr;
@@ -112,6 +122,7 @@ typedef struct
     unsigned bitsConsumed;
     const char* ptr;
     const char* start;
+    const char* limitPtr;
 } BIT_DStream_t;
 
 typedef enum { BIT_DStream_unfinished = 0,
@@ -163,7 +174,10 @@ MEM_STATIC unsigned BIT_highbit32 (register U32 val)
 #   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
     return 31 - __builtin_clz (val);
 #   else   /* Software version */
-    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    static const unsigned DeBruijnClz[32] = { 0,  9,  1, 10, 13, 21,  2, 29,
+                                             11, 14, 16, 18, 22, 25,  3, 30,
+                                              8, 12, 20, 28, 15, 17, 24,  7,
+                                             19, 27, 23,  6, 26,  5,  4, 31 };
     U32 v = val;
     v |= v >> 1;
     v |= v >> 2;
@@ -175,31 +189,36 @@ MEM_STATIC unsigned BIT_highbit32 (register U32 val)
 }
 
 /*=====    Local Constants   =====*/
-static const unsigned BIT_mask[] = { 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,  0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF };   /* up to 26 bits */
+static const unsigned BIT_mask[] = { 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F,
+                                    0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF,
+                                    0xFFFF, 0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,
+                                    0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF };   /* up to 26 bits */
 
 
 /*-**************************************************************
 *  bitStream encoding
 ****************************************************************/
 /*! BIT_initCStream() :
- *  `dstCapacity` must be > sizeof(void*)
+ *  `dstCapacity` must be > sizeof(size_t)
  *  @return : 0 if success,
               otherwise an error code (can be tested using ERR_isError() ) */
-MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* startPtr, size_t dstCapacity)
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
+                                  void* startPtr, size_t dstCapacity)
 {
     bitC->bitContainer = 0;
     bitC->bitPos = 0;
     bitC->startPtr = (char*)startPtr;
     bitC->ptr = bitC->startPtr;
-    bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->ptr);
-    if (dstCapacity <= sizeof(bitC->ptr)) return ERROR(dstSize_tooSmall);
+    bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer);
+    if (dstCapacity <= sizeof(bitC->bitContainer)) return ERROR(dstSize_tooSmall);
     return 0;
 }
 
 /*! BIT_addBits() :
     can add up to 26 bits into `bitC`.
     Does not check for register overflow ! */
-MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits)
+MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
+                            size_t value, unsigned nbBits)
 {
     bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
     bitC->bitPos += nbBits;
@@ -207,34 +226,42 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits)
 
 /*! BIT_addBitsFast() :
  *  works only if `value` is _clean_, meaning all high bits above nbBits are 0 */
-MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits)
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC,
+                                size_t value, unsigned nbBits)
 {
+    assert((value>>nbBits) == 0);
     bitC->bitContainer |= value << bitC->bitPos;
     bitC->bitPos += nbBits;
 }
 
 /*! BIT_flushBitsFast() :
+ *  assumption : bitContainer has not overflowed
  *  unsafe version; does not check buffer overflow */
 MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC)
 {
     size_t const nbBytes = bitC->bitPos >> 3;
+    assert( bitC->bitPos <= (sizeof(bitC->bitContainer)*8) );
     MEM_writeLEST(bitC->ptr, bitC->bitContainer);
     bitC->ptr += nbBytes;
+    assert(bitC->ptr <= bitC->endPtr);
     bitC->bitPos &= 7;
-    bitC->bitContainer >>= nbBytes*8;   /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */
+    bitC->bitContainer >>= nbBytes*8;
 }
 
 /*! BIT_flushBits() :
+ *  assumption : bitContainer has not overflowed
  *  safe version; check for buffer overflow, and prevents it.
- *  note : does not signal buffer overflow. This will be revealed later on using BIT_closeCStream() */
+ *  note : does not signal buffer overflow.
+ *  overflow will be revealed later on using BIT_closeCStream() */
 MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC)
 {
     size_t const nbBytes = bitC->bitPos >> 3;
+    assert( bitC->bitPos <= (sizeof(bitC->bitContainer)*8) );
     MEM_writeLEST(bitC->ptr, bitC->bitContainer);
     bitC->ptr += nbBytes;
     if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
     bitC->bitPos &= 7;
-    bitC->bitContainer >>= nbBytes*8;   /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */
+    bitC->bitContainer >>= nbBytes*8;
 }
 
 /*! BIT_closeCStream() :
@@ -244,9 +271,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC)
 {
     BIT_addBitsFast(bitC, 1, 1);   /* endMark */
     BIT_flushBits(bitC);
-
-    if (bitC->ptr >= bitC->endPtr) return 0; /* doesn't fit within authorized budget : cancel */
-
+    if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
     return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
 }
 
@@ -264,15 +289,16 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
 {
     if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
 
+    bitD->start = (const char*)srcBuffer;
+    bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer);
+
     if (srcSize >=  sizeof(bitD->bitContainer)) {  /* normal case */
-        bitD->start = (const char*)srcBuffer;
         bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
         bitD->bitContainer = MEM_readLEST(bitD->ptr);
         { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
           bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;  /* ensures bitsConsumed is always set */
           if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
     } else {
-        bitD->start = (const char*)srcBuffer;
         bitD->ptr   = bitD->start;
         bitD->bitContainer = *(const BYTE*)(bitD->start);
         switch(srcSize)
@@ -330,17 +356,18 @@ MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
 #if defined(__BMI__) && defined(__GNUC__)   /* experimental; fails if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8 */
     return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits);
 #else
-    U32 const bitMask = sizeof(bitD->bitContainer)*8 - 1;
-    return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask);
+    U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask);
 #endif
 }
 
 /*! BIT_lookBitsFast() :
-*   unsafe version; only works only if nbBits >= 1 */
+ *  unsafe version; only works if nbBits >= 1 */
 MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
 {
-    U32 const bitMask = sizeof(bitD->bitContainer)*8 - 1;
-    return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask+1)-nbBits) & bitMask);
+    U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+    assert(nbBits >= 1);
+    return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);
 }
 
 MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
@@ -365,6 +392,7 @@ MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits)
 MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
 {
     size_t const value = BIT_lookBitsFast(bitD, nbBits);
+    assert(nbBits >= 1);
     BIT_skipBits(bitD, nbBits);
     return value;
 }
@@ -376,10 +404,10 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
               if status == BIT_DStream_unfinished, internal register is filled with >= (sizeof(bitD->bitContainer)*8 - 7) bits */
 MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
 {
-	if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should not happen => corruption detected */
-		return BIT_DStream_overflow;
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* overflow detected, like end of stream */
+        return BIT_DStream_overflow;
 
-    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) {
+    if (bitD->ptr >= bitD->limitPtr) {
         bitD->ptr -= bitD->bitsConsumed >> 3;
         bitD->bitsConsumed &= 7;
         bitD->bitContainer = MEM_readLEST(bitD->ptr);
@@ -389,6 +417,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
         if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
         return BIT_DStream_completed;
     }
+    /* start < ptr < limitPtr */
     {   U32 nbBytes = bitD->bitsConsumed >> 3;
         BIT_DStream_status result = BIT_DStream_unfinished;
         if (bitD->ptr - nbBytes < bitD->start) {
@@ -397,7 +426,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
         }
         bitD->ptr -= nbBytes;
         bitD->bitsConsumed -= nbBytes*8;
-        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */
         return result;
     }
 }
diff --git a/lib/common/error_private.c b/lib/common/error_private.c
index a0fa1724aee8..b3287245f1ee 100644
--- a/lib/common/error_private.c
+++ b/lib/common/error_private.c
@@ -29,7 +29,7 @@ const char* ERR_getErrorString(ERR_enum code)
     case PREFIX(memory_allocation): return "Allocation error : not enough memory";
     case PREFIX(stage_wrong): return "Operation not authorized at current processing stage";
     case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
-    case PREFIX(srcSize_wrong): return "Src size incorrect";
+    case PREFIX(srcSize_wrong): return "Src size is incorrect";
     case PREFIX(corruption_detected): return "Corrupted block detected";
     case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
     case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
@@ -37,6 +37,7 @@ const char* ERR_getErrorString(ERR_enum code)
     case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
     case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
     case PREFIX(dictionary_wrong): return "Dictionary mismatch";
+    case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
     case PREFIX(maxCode):
     default: return notErrorCode;
     }
diff --git a/lib/common/fse.h b/lib/common/fse.h
index baac39032675..6d5d41def19b 100644
--- a/lib/common/fse.h
+++ b/lib/common/fse.h
@@ -316,6 +316,10 @@ If there is an error, the function will return an error code, which can be teste
 #define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
 #define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
 
+/* or use the size to malloc() space directly. Pay attention to alignment restrictions though */
+#define FSE_CTABLE_SIZE(maxTableLog, maxSymbolValue)   (FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(FSE_CTable))
+#define FSE_DTABLE_SIZE(maxTableLog)                   (FSE_DTABLE_SIZE_U32(maxTableLog) * sizeof(FSE_DTable))
+
 
 /* *****************************************
 *  FSE advanced API
@@ -353,7 +357,7 @@ unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsi
  * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
  * FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
  */
-#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue)   ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + (1<<((maxTableLog>2)?(maxTableLog-2):0)) )
+#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue)   ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
 size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
 
 size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
@@ -550,9 +554,9 @@ MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U3
 
 MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, U32 symbol)
 {
-    const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+    FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
     const U16* const stateTable = (const U16*)(statePtr->stateTable);
-    U32 nbBitsOut  = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+    U32 const nbBitsOut  = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
     BIT_addBits(bitC, statePtr->value, nbBitsOut);
     statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
 }
diff --git a/lib/common/huf.h b/lib/common/huf.h
index e5572760a548..7873ca3d42a5 100644
--- a/lib/common/huf.h
+++ b/lib/common/huf.h
@@ -43,6 +43,21 @@ extern "C" {
 #include <stddef.h>    /* size_t */
 
 
+/* *** library symbols visibility *** */
+/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual,
+ *        HUF symbols remain "private" (internal symbols for library only).
+ *        Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+#  define HUF_PUBLIC_API __attribute__ ((visibility ("default")))
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
+#  define HUF_PUBLIC_API __declspec(dllexport)
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+#  define HUF_PUBLIC_API __declspec(dllimport)  /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */
+#else
+#  define HUF_PUBLIC_API
+#endif
+
+
 /* *** simple functions *** */
 /**
 HUF_compress() :
@@ -55,8 +70,8 @@ HUF_compress() :
                      if return == 1, srcData is a single repeated byte symbol (RLE compression).
                      if HUF_isError(return), compression failed (more details using HUF_getErrorName())
 */
-size_t HUF_compress(void* dst, size_t dstCapacity,
-              const void* src, size_t srcSize);
+HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize);
 
 /**
 HUF_decompress() :
@@ -69,32 +84,42 @@ HUF_decompress() :
     @return : size of regenerated data (== originalSize),
               or an error code, which can be tested using HUF_isError()
 */
-size_t HUF_decompress(void* dst,  size_t originalSize,
-                const void* cSrc, size_t cSrcSize);
+HUF_PUBLIC_API size_t HUF_decompress(void* dst,  size_t originalSize,
+                               const void* cSrc, size_t cSrcSize);
 
 
 /* ***   Tool functions *** */
-#define HUF_BLOCKSIZE_MAX (128 * 1024)       /**< maximum input size for a single block compressed with HUF_compress */
-size_t HUF_compressBound(size_t size);       /**< maximum compressed size (worst case) */
+#define HUF_BLOCKSIZE_MAX (128 * 1024)                  /**< maximum input size for a single block compressed with HUF_compress */
+HUF_PUBLIC_API size_t HUF_compressBound(size_t size);   /**< maximum compressed size (worst case) */
 
 /* Error Management */
-unsigned    HUF_isError(size_t code);        /**< tells if a return value is an error code */
-const char* HUF_getErrorName(size_t code);   /**< provides error code string (useful for debugging) */
+HUF_PUBLIC_API unsigned    HUF_isError(size_t code);       /**< tells if a return value is an error code */
+HUF_PUBLIC_API const char* HUF_getErrorName(size_t code);  /**< provides error code string (useful for debugging) */
 
 
 /* ***   Advanced function   *** */
 
 /** HUF_compress2() :
- *   Same as HUF_compress(), but offers direct control over `maxSymbolValue` and `tableLog` .
- *   `tableLog` must be `<= HUF_TABLELOG_MAX` . */
-size_t HUF_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+ *  Same as HUF_compress(), but offers direct control over `maxSymbolValue` and `tableLog`.
+ *  `tableLog` must be `<= HUF_TABLELOG_MAX` . */
+HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
 
 /** HUF_compress4X_wksp() :
-*   Same as HUF_compress2(), but uses externally allocated `workSpace`, which must be a table of >= 1024 unsigned */
-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);  /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+ *  Same as HUF_compress2(), but uses externally allocated `workSpace`.
+ *  `workspace` must have minimum alignment of 4, and be at least as large as following macro */
+#define HUF_WORKSPACE_SIZE (6 << 10)
+#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
+HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
 
 
 
+/* ******************************************************************
+ *  WARNING !!
+ *  The following section contains advanced and experimental definitions
+ *  which shall never be used in the context of dll
+ *  because they are not guaranteed to remain stable in the future.
+ *  Only consider them in association with static linking.
+ *******************************************************************/
 #ifdef HUF_STATIC_LINKING_ONLY
 
 /* *** Dependencies *** */
@@ -117,12 +142,14 @@ size_t HUF_compress4X_wksp (void* dst, size_t dstSize, const void* src, size_t s
 ******************************************/
 /* HUF buffer bounds */
 #define HUF_CTABLEBOUND 129
-#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true if incompressible pre-filtered with fast heuristic */
+#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true when incompressible is pre-filtered with fast heuristic */
 #define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
 
 /* static allocation of HUF's Compression Table */
+#define HUF_CTABLE_SIZE_U32(maxSymbolValue)   ((maxSymbolValue)+1)   /* Use tables of U32, for proper alignment */
+#define HUF_CTABLE_SIZE(maxSymbolValue)       (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32))
 #define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
-    U32 name##hb[maxSymbolValue+1]; \
+    U32 name##hb[HUF_CTABLE_SIZE_U32(maxSymbolValue)]; \
     void* name##hv = &(name##hb); \
     HUF_CElt* name = (HUF_CElt*)(name##hv)   /* no final ; */
 
@@ -134,10 +161,6 @@ typedef U32 HUF_DTable;
 #define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
         HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) }
 
-/* The workspace must have alignment at least 4 and be at least this large */
-#define HUF_WORKSPACE_SIZE (6 << 10)
-#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
-
 
 /* ****************************************
 *  Advanced decompression functions
diff --git a/lib/common/mem.h b/lib/common/mem.h
index 3cacd216aa02..4773a8b9309e 100644
--- a/lib/common/mem.h
+++ b/lib/common/mem.h
@@ -89,8 +89,7 @@ MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (size
 #ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
 #  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
 #    define MEM_FORCE_MEMORY_ACCESS 2
-#  elif defined(__INTEL_COMPILER) /*|| defined(_MSC_VER)*/ || \
-  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#  elif defined(__INTEL_COMPILER) || defined(__GNUC__)
 #    define MEM_FORCE_MEMORY_ACCESS 1
 #  endif
 #endif
@@ -122,7 +121,7 @@ MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
 /* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
 /* currently only defined for gcc and icc */
 #if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
-	__pragma( pack(push, 1) )
+    __pragma( pack(push, 1) )
     typedef union { U16 u16; U32 u32; U64 u64; size_t st; } unalign;
     __pragma( pack(pop) )
 #else
diff --git a/lib/common/zstd_errors.h b/lib/common/zstd_errors.h
index 949dbd0fffac..3d579d969363 100644
--- a/lib/common/zstd_errors.h
+++ b/lib/common/zstd_errors.h
@@ -57,6 +57,7 @@ typedef enum {
   ZSTD_error_maxSymbolValue_tooSmall,
   ZSTD_error_dictionary_corrupted,
   ZSTD_error_dictionary_wrong,
+  ZSTD_error_dictionaryCreation_failed,
   ZSTD_error_maxCode
 } ZSTD_ErrorCode;
 
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index 5c5b28732975..2533333ba83c 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -16,9 +16,9 @@
 #ifdef _MSC_VER    /* Visual Studio */
 #  define FORCE_INLINE static __forceinline
 #  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4100)        /* disable: C4100: unreferenced formal parameter */
 #  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
 #  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
-#  pragma warning(disable : 4100)        /* disable: C4100: unreferenced formal parameter */
 #else
 #  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
 #    ifdef __GNUC__
@@ -58,6 +58,8 @@
 /*-*************************************
 *  shared macros
 ***************************************/
+#undef MIN
+#undef MAX
 #define MIN(a,b) ((a)<(b) ? (a) : (b))
 #define MAX(a,b) ((a)>(b) ? (a) : (b))
 #define CHECK_F(f) { size_t const errcod = f; if (ERR_isError(errcod)) return errcod; }  /* check and Forward error code */
@@ -104,7 +106,6 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
 #define LONGNBSEQ 0x7F00
 
 #define MINMATCH 3
-#define EQUAL_READ32 4
 
 #define Litbits  8
 #define MaxLit ((1<<Litbits) - 1)
diff --git a/lib/compress/fse_compress.c b/lib/compress/fse_compress.c
index 6708fb9d78ea..26e8052ddcce 100644
--- a/lib/compress/fse_compress.c
+++ b/lib/compress/fse_compress.c
@@ -291,7 +291,7 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
 
 size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
 {
-    if (tableLog > FSE_MAX_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported */
     if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
 
     if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
@@ -476,20 +476,20 @@ void FSE_freeCTable (FSE_CTable* ct) { free(ct); }
 /* provides the minimum logSize to safely represent a distribution */
 static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
 {
-	U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1;
-	U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
-	U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
-	return minBits;
+    U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1;
+    U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
+    U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
+    return minBits;
 }
 
 unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
 {
-	U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
+    U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
     U32 tableLog = maxTableLog;
-	U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
+    U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
     if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
-	if (maxBitsSrc < tableLog) tableLog = maxBitsSrc;   /* Accuracy can be reduced */
-	if (minBits > tableLog) tableLog = minBits;   /* Need a minimum to safely represent all symbol values */
+    if (maxBitsSrc < tableLog) tableLog = maxBitsSrc;   /* Accuracy can be reduced */
+    if (minBits > tableLog) tableLog = minBits;   /* Need a minimum to safely represent all symbol values */
     if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG;
     if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG;
     return tableLog;
@@ -808,7 +808,7 @@ size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t src
     if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
 
     /* Scan input and build symbol stats */
-    {   CHECK_V_F(maxCount, FSE_count(count, &maxSymbolValue, src, srcSize) );
+    {   CHECK_V_F(maxCount, FSE_count_wksp(count, &maxSymbolValue, src, srcSize, (unsigned*)scratchBuffer) );
         if (maxCount == srcSize) return 1;   /* only a single symbol in src : rle */
         if (maxCount == 1) return 0;         /* each symbol present maximum once => not compressible */
         if (maxCount < (srcSize >> 7)) return 0;   /* Heuristic : not compressible enough */
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index 450e5970a4a5..c08b315dab93 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -20,6 +20,26 @@
 #include "zstd_internal.h"  /* includes zstd.h */
 
 
+/*-*************************************
+*  Debug
+***************************************/
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
+#  include <assert.h>
+#else
+#  define assert(condition) ((void)0)
+#endif
+
+#define ZSTD_STATIC_ASSERT(c) { enum { ZSTD_static_assert = 1/(int)(!!(c)) }; }
+
+#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=2)
+#  include <stdio.h>
+   static unsigned g_debugLevel = ZSTD_DEBUG;
+#  define DEBUGLOG(l, ...) if (l<=g_debugLevel) { fprintf(stderr, __FILE__ ": "); fprintf(stderr, __VA_ARGS__); fprintf(stderr, " \n"); }
+#else
+#  define DEBUGLOG(l, ...)      {}    /* disabled */
+#endif
+
+
 /*-*************************************
 *  Constants
 ***************************************/
@@ -27,12 +47,22 @@ static const U32 g_searchStrength = 8;   /* control skip over incompressible dat
 #define HASH_READ_SIZE 8
 typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e;
 
+/* entropy tables always have same size */
+static size_t const hufCTable_size = HUF_CTABLE_SIZE(255);
+static size_t const litlengthCTable_size = FSE_CTABLE_SIZE(LLFSELog, MaxLL);
+static size_t const offcodeCTable_size = FSE_CTABLE_SIZE(OffFSELog, MaxOff);
+static size_t const matchlengthCTable_size = FSE_CTABLE_SIZE(MLFSELog, MaxML);
+static size_t const entropyScratchSpace_size = HUF_WORKSPACE_SIZE;
+
 
 /*-*************************************
 *  Helper functions
 ***************************************/
-#define ZSTD_STATIC_ASSERT(c) { enum { ZSTD_static_assert = 1/(int)(!!(c)) }; }
-size_t ZSTD_compressBound(size_t srcSize) { return FSE_compressBound(srcSize) + 12; }
+size_t ZSTD_compressBound(size_t srcSize) {
+    size_t const lowLimit = 256 KB;
+    size_t const margin = (srcSize < lowLimit) ? (lowLimit-srcSize) >> 12 : 0;  /* from 64 to 0 */
+    return srcSize + (srcSize >> 8) + margin;
+}
 
 
 /*-*************************************
@@ -70,6 +100,7 @@ struct ZSTD_CCtx_s {
     size_t workSpaceSize;
     size_t blockSize;
     U64 frameContentSize;
+    U64 consumedSrcSize;
     XXH64_state_t xxhState;
     ZSTD_customMem customMem;
 
@@ -77,13 +108,13 @@ struct ZSTD_CCtx_s {
     U32* hashTable;
     U32* hashTable3;
     U32* chainTable;
-    HUF_CElt* hufTable;
-    U32 flagStaticTables;
-    HUF_repeat flagStaticHufTable;
-    FSE_CTable offcodeCTable  [FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
-    FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
-    FSE_CTable litlengthCTable  [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
-    unsigned tmpCounters[HUF_WORKSPACE_SIZE_U32];
+    HUF_repeat hufCTable_repeatMode;
+    HUF_CElt* hufCTable;
+    U32 fseCTables_ready;
+    FSE_CTable* offcodeCTable;
+    FSE_CTable* matchlengthCTable;
+    FSE_CTable* litlengthCTable;
+    unsigned* entropyScratchSpace;
 };
 
 ZSTD_CCtx* ZSTD_createCCtx(void)
@@ -150,9 +181,7 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
     CLAMPCHECK(cParams.chainLog, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX);
     CLAMPCHECK(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
     CLAMPCHECK(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
-    { U32 const searchLengthMin = ((cParams.strategy == ZSTD_fast) | (cParams.strategy == ZSTD_greedy)) ? ZSTD_SEARCHLENGTH_MIN+1 : ZSTD_SEARCHLENGTH_MIN;
-      U32 const searchLengthMax = (cParams.strategy == ZSTD_fast) ? ZSTD_SEARCHLENGTH_MAX : ZSTD_SEARCHLENGTH_MAX-1;
-      CLAMPCHECK(cParams.searchLength, searchLengthMin, searchLengthMax); }
+    CLAMPCHECK(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX);
     CLAMPCHECK(cParams.targetLength, ZSTD_TARGETLENGTH_MIN, ZSTD_TARGETLENGTH_MAX);
     if ((U32)(cParams.strategy) > (U32)ZSTD_btopt2) return ERROR(compressionParameter_unsupported);
     return 0;
@@ -206,11 +235,14 @@ size_t ZSTD_estimateCCtxSize(ZSTD_compressionParameters cParams)
     size_t const hSize = ((size_t)1) << cParams.hashLog;
     U32    const hashLog3 = (cParams.searchLength>3) ? 0 : MIN(ZSTD_HASHLOG3_MAX, cParams.windowLog);
     size_t const h3Size = ((size_t)1) << hashLog3;
+    size_t const entropySpace = hufCTable_size + litlengthCTable_size
+                              + offcodeCTable_size + matchlengthCTable_size
+                              + entropyScratchSpace_size;
     size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
 
     size_t const optSpace = ((MaxML+1) + (MaxLL+1) + (MaxOff+1) + (1<<Litbits))*sizeof(U32)
                           + (ZSTD_OPT_NUM+1)*(sizeof(ZSTD_match_t) + sizeof(ZSTD_optimal_t));
-    size_t const neededSpace = tableSpace + (256*sizeof(U32)) /* huffTable */ + tokenSpace
+    size_t const neededSpace = entropySpace + tableSpace + tokenSpace
                              + (((cParams.strategy == ZSTD_btopt) || (cParams.strategy == ZSTD_btopt2)) ? optSpace : 0);
 
     return sizeof(ZSTD_CCtx) + neededSpace;
@@ -232,6 +264,7 @@ static size_t ZSTD_continueCCtx(ZSTD_CCtx* cctx, ZSTD_parameters params, U64 fra
     U32 const end = (U32)(cctx->nextSrc - cctx->base);
     cctx->params = params;
     cctx->frameContentSize = frameContentSize;
+    cctx->consumedSrcSize = 0;
     cctx->lowLimit = end;
     cctx->dictLimit = end;
     cctx->nextToUpdate = end+1;
@@ -246,16 +279,16 @@ static size_t ZSTD_continueCCtx(ZSTD_CCtx* cctx, ZSTD_parameters params, U64 fra
 
 typedef enum { ZSTDcrp_continue, ZSTDcrp_noMemset, ZSTDcrp_fullReset } ZSTD_compResetPolicy_e;
 
-/*! ZSTD_resetCCtx_advanced() :
+/*! ZSTD_resetCCtx_internal() :
     note : `params` must be validated */
-static size_t ZSTD_resetCCtx_advanced (ZSTD_CCtx* zc,
+static size_t ZSTD_resetCCtx_internal (ZSTD_CCtx* zc,
                                        ZSTD_parameters params, U64 frameContentSize,
                                        ZSTD_compResetPolicy_e const crp)
 {
     if (crp == ZSTDcrp_continue)
         if (ZSTD_equivalentParams(params, zc->params)) {
-            zc->flagStaticTables = 0;
-            zc->flagStaticHufTable = HUF_repeat_none;
+            zc->fseCTables_ready = 0;
+            zc->hufCTable_repeatMode = HUF_repeat_none;
             return ZSTD_continueCCtx(zc, params, frameContentSize);
         }
 
@@ -271,41 +304,67 @@ static size_t ZSTD_resetCCtx_advanced (ZSTD_CCtx* zc,
         void* ptr;
 
         /* Check if workSpace is large enough, alloc a new one if needed */
-        {   size_t const optSpace = ((MaxML+1) + (MaxLL+1) + (MaxOff+1) + (1<<Litbits))*sizeof(U32)
-                                  + (ZSTD_OPT_NUM+1)*(sizeof(ZSTD_match_t) + sizeof(ZSTD_optimal_t));
-            size_t const neededSpace = tableSpace + (256*sizeof(U32)) /* huffTable */ + tokenSpace
-                                  + (((params.cParams.strategy == ZSTD_btopt) || (params.cParams.strategy == ZSTD_btopt2)) ? optSpace : 0);
+        {   size_t const entropySpace = hufCTable_size + litlengthCTable_size
+                                  + offcodeCTable_size + matchlengthCTable_size
+                                  + entropyScratchSpace_size;
+            size_t const optPotentialSpace = ((MaxML+1) + (MaxLL+1) + (MaxOff+1) + (1<<Litbits)) * sizeof(U32)
+                                  + (ZSTD_OPT_NUM+1) * (sizeof(ZSTD_match_t)+sizeof(ZSTD_optimal_t));
+            size_t const optSpace = ((params.cParams.strategy == ZSTD_btopt) || (params.cParams.strategy == ZSTD_btopt2)) ? optPotentialSpace : 0;
+            size_t const neededSpace = entropySpace + optSpace + tableSpace + tokenSpace;
             if (zc->workSpaceSize < neededSpace) {
+                zc->workSpaceSize = 0;
                 ZSTD_free(zc->workSpace, zc->customMem);
                 zc->workSpace = ZSTD_malloc(neededSpace, zc->customMem);
                 if (zc->workSpace == NULL) return ERROR(memory_allocation);
                 zc->workSpaceSize = neededSpace;
+                ptr = zc->workSpace;
+
+                /* entropy space */
+                zc->hufCTable = (HUF_CElt*)ptr;
+                ptr = (char*)zc->hufCTable + hufCTable_size;  /* note : HUF_CElt* is incomplete type, size is estimated via macro */
+                zc->offcodeCTable = (FSE_CTable*) ptr;
+                ptr = (char*)ptr + offcodeCTable_size;
+                zc->matchlengthCTable = (FSE_CTable*) ptr;
+                ptr = (char*)ptr + matchlengthCTable_size;
+                zc->litlengthCTable = (FSE_CTable*) ptr;
+                ptr = (char*)ptr + litlengthCTable_size;
+                assert(((size_t)ptr & 3) == 0);   /* ensure correct alignment */
+                zc->entropyScratchSpace = (unsigned*) ptr;
         }   }
 
-        if (crp!=ZSTDcrp_noMemset) memset(zc->workSpace, 0, tableSpace);   /* reset tables only */
-        XXH64_reset(&zc->xxhState, 0);
-        zc->hashLog3 = hashLog3;
-        zc->hashTable = (U32*)(zc->workSpace);
-        zc->chainTable = zc->hashTable + hSize;
-        zc->hashTable3 = zc->chainTable + chainSize;
-        ptr = zc->hashTable3 + h3Size;
-        zc->hufTable = (HUF_CElt*)ptr;
-        zc->flagStaticTables = 0;
-        zc->flagStaticHufTable = HUF_repeat_none;
-        ptr = ((U32*)ptr) + 256;  /* note : HUF_CElt* is incomplete type, size is simulated using U32 */
+        /* init params */
+        zc->params = params;
+        zc->blockSize = blockSize;
+        zc->frameContentSize = frameContentSize;
+        zc->consumedSrcSize = 0;
 
+        XXH64_reset(&zc->xxhState, 0);
+        zc->stage = ZSTDcs_init;
+        zc->dictID = 0;
+        zc->loadedDictEnd = 0;
+        zc->fseCTables_ready = 0;
+        zc->hufCTable_repeatMode = HUF_repeat_none;
         zc->nextToUpdate = 1;
         zc->nextSrc = NULL;
         zc->base = NULL;
         zc->dictBase = NULL;
         zc->dictLimit = 0;
         zc->lowLimit = 0;
-        zc->params = params;
-        zc->blockSize = blockSize;
-        zc->frameContentSize = frameContentSize;
         { int i; for (i=0; i<ZSTD_REP_NUM; i++) zc->rep[i] = repStartValue[i]; }
+        zc->hashLog3 = hashLog3;
+        zc->seqStore.litLengthSum = 0;
+
+        /* ensure entropy tables are close together at the beginning */
+        assert((void*)zc->hufCTable == zc->workSpace);
+        assert((char*)zc->offcodeCTable == (char*)zc->hufCTable + hufCTable_size);
+        assert((char*)zc->matchlengthCTable == (char*)zc->offcodeCTable + offcodeCTable_size);
+        assert((char*)zc->litlengthCTable == (char*)zc->matchlengthCTable + matchlengthCTable_size);
+        assert((char*)zc->entropyScratchSpace == (char*)zc->litlengthCTable + litlengthCTable_size);
+        ptr = (char*)zc->entropyScratchSpace + entropyScratchSpace_size;
 
+        /* opt parser space */
         if ((params.cParams.strategy == ZSTD_btopt) || (params.cParams.strategy == ZSTD_btopt2)) {
+            assert(((size_t)ptr & 3) == 0);  /* ensure ptr is properly aligned */
             zc->seqStore.litFreq = (U32*)ptr;
             zc->seqStore.litLengthFreq = zc->seqStore.litFreq + (1<<Litbits);
             zc->seqStore.matchLengthFreq = zc->seqStore.litLengthFreq + (MaxLL+1);
@@ -315,8 +374,17 @@ static size_t ZSTD_resetCCtx_advanced (ZSTD_CCtx* zc,
             ptr = zc->seqStore.matchTable + ZSTD_OPT_NUM+1;
             zc->seqStore.priceTable = (ZSTD_optimal_t*)ptr;
             ptr = zc->seqStore.priceTable + ZSTD_OPT_NUM+1;
-            zc->seqStore.litLengthSum = 0;
         }
+
+        /* table Space */
+        if (crp!=ZSTDcrp_noMemset) memset(ptr, 0, tableSpace);   /* reset tables only */
+        assert(((size_t)ptr & 3) == 0);  /* ensure ptr is properly aligned */
+        zc->hashTable = (U32*)(ptr);
+        zc->chainTable = zc->hashTable + hSize;
+        zc->hashTable3 = zc->chainTable + chainSize;
+        ptr = zc->hashTable3 + h3Size;
+
+        /* sequences storage */
         zc->seqStore.sequencesStart = (seqDef*)ptr;
         ptr = zc->seqStore.sequencesStart + maxNbSeq;
         zc->seqStore.llCode = (BYTE*) ptr;
@@ -324,10 +392,6 @@ static size_t ZSTD_resetCCtx_advanced (ZSTD_CCtx* zc,
         zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq;
         zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq;
 
-        zc->stage = ZSTDcs_init;
-        zc->dictID = 0;
-        zc->loadedDictEnd = 0;
-
         return 0;
     }
 }
@@ -341,27 +405,32 @@ void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) {
     for (i=0; i<ZSTD_REP_NUM; i++) cctx->rep[i] = 0;
 }
 
-/*! ZSTD_copyCCtx() :
-*   Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
-*   Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
-*   @return : 0, or an error code */
-size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize)
+
+/*! ZSTD_copyCCtx_internal() :
+ *  Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
+ *  Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
+ *  pledgedSrcSize=0 means "empty" if fParams.contentSizeFlag=1
+ *  @return : 0, or an error code */
+size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx,
+                              ZSTD_frameParameters fParams, unsigned long long pledgedSrcSize)
 {
     if (srcCCtx->stage!=ZSTDcs_init) return ERROR(stage_wrong);
 
-
     memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem));
     {   ZSTD_parameters params = srcCCtx->params;
-        params.fParams.contentSizeFlag = (pledgedSrcSize > 0);
-        ZSTD_resetCCtx_advanced(dstCCtx, params, pledgedSrcSize, ZSTDcrp_noMemset);
+        params.fParams = fParams;
+        DEBUGLOG(5, "ZSTD_resetCCtx_internal : dictIDFlag : %u \n", !fParams.noDictIDFlag);
+        ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize, ZSTDcrp_noMemset);
     }
 
     /* copy tables */
     {   size_t const chainSize = (srcCCtx->params.cParams.strategy == ZSTD_fast) ? 0 : (1 << srcCCtx->params.cParams.chainLog);
-        size_t const hSize = ((size_t)1) << srcCCtx->params.cParams.hashLog;
+        size_t const hSize =  (size_t)1 << srcCCtx->params.cParams.hashLog;
         size_t const h3Size = (size_t)1 << srcCCtx->hashLog3;
         size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
-        memcpy(dstCCtx->workSpace, srcCCtx->workSpace, tableSpace);
+        assert((U32*)dstCCtx->chainTable == (U32*)dstCCtx->hashTable + hSize);  /* chainTable must follow hashTable */
+        assert((U32*)dstCCtx->hashTable3 == (U32*)dstCCtx->chainTable + chainSize);
+        memcpy(dstCCtx->hashTable, srcCCtx->hashTable, tableSpace);   /* presumes all tables follow each other */
     }
 
     /* copy dictionary offsets */
@@ -376,23 +445,36 @@ size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long
     dstCCtx->dictID       = srcCCtx->dictID;
 
     /* copy entropy tables */
-    dstCCtx->flagStaticTables = srcCCtx->flagStaticTables;
-    dstCCtx->flagStaticHufTable = srcCCtx->flagStaticHufTable;
-    if (srcCCtx->flagStaticTables) {
-        memcpy(dstCCtx->litlengthCTable, srcCCtx->litlengthCTable, sizeof(dstCCtx->litlengthCTable));
-        memcpy(dstCCtx->matchlengthCTable, srcCCtx->matchlengthCTable, sizeof(dstCCtx->matchlengthCTable));
-        memcpy(dstCCtx->offcodeCTable, srcCCtx->offcodeCTable, sizeof(dstCCtx->offcodeCTable));
+    dstCCtx->fseCTables_ready = srcCCtx->fseCTables_ready;
+    if (srcCCtx->fseCTables_ready) {
+        memcpy(dstCCtx->litlengthCTable, srcCCtx->litlengthCTable, litlengthCTable_size);
+        memcpy(dstCCtx->matchlengthCTable, srcCCtx->matchlengthCTable, matchlengthCTable_size);
+        memcpy(dstCCtx->offcodeCTable, srcCCtx->offcodeCTable, offcodeCTable_size);
     }
-    if (srcCCtx->flagStaticHufTable) {
-        memcpy(dstCCtx->hufTable, srcCCtx->hufTable, 256*4);
+    dstCCtx->hufCTable_repeatMode = srcCCtx->hufCTable_repeatMode;
+    if (srcCCtx->hufCTable_repeatMode) {
+        memcpy(dstCCtx->hufCTable, srcCCtx->hufCTable, hufCTable_size);
     }
 
     return 0;
 }
 
+/*! ZSTD_copyCCtx() :
+ *  Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
+ *  Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
+ *  pledgedSrcSize==0 means "unknown".
+*   @return : 0, or an error code */
+size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize)
+{
+    ZSTD_frameParameters fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+    fParams.contentSizeFlag = pledgedSrcSize>0;
+
+    return ZSTD_copyCCtx_internal(dstCCtx, srcCCtx, fParams, pledgedSrcSize);
+}
+
 
 /*! ZSTD_reduceTable() :
-*   reduce table indexes by `reducerValue` */
+ *  reduce table indexes by `reducerValue` */
 static void ZSTD_reduceTable (U32* const table, U32 const size, U32 const reducerValue)
 {
     U32 u;
@@ -499,26 +581,28 @@ static size_t ZSTD_compressLiterals (ZSTD_CCtx* zc,
 
     /* small ? don't even attempt compression (speed opt) */
 #   define LITERAL_NOENTROPY 63
-    {   size_t const minLitSize = zc->flagStaticHufTable == HUF_repeat_valid ? 6 : LITERAL_NOENTROPY;
+    {   size_t const minLitSize = zc->hufCTable_repeatMode == HUF_repeat_valid ? 6 : LITERAL_NOENTROPY;
         if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
     }
 
     if (dstCapacity < lhSize+1) return ERROR(dstSize_tooSmall);   /* not enough space for compression */
-    {   HUF_repeat repeat = zc->flagStaticHufTable;
+    {   HUF_repeat repeat = zc->hufCTable_repeatMode;
         int const preferRepeat = zc->params.cParams.strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
         if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
-        cLitSize = singleStream ? HUF_compress1X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, zc->tmpCounters, sizeof(zc->tmpCounters), zc->hufTable, &repeat, preferRepeat)
-                                : HUF_compress4X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, zc->tmpCounters, sizeof(zc->tmpCounters), zc->hufTable, &repeat, preferRepeat);
+        cLitSize = singleStream ? HUF_compress1X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11,
+                                      zc->entropyScratchSpace, entropyScratchSpace_size, zc->hufCTable, &repeat, preferRepeat)
+                                : HUF_compress4X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11,
+                                      zc->entropyScratchSpace, entropyScratchSpace_size, zc->hufCTable, &repeat, preferRepeat);
         if (repeat != HUF_repeat_none) { hType = set_repeat; }    /* reused the existing table */
-        else { zc->flagStaticHufTable = HUF_repeat_check; }       /* now have a table to reuse */
+        else { zc->hufCTable_repeatMode = HUF_repeat_check; }       /* now have a table to reuse */
     }
 
     if ((cLitSize==0) | (cLitSize >= srcSize - minGain)) {
-        zc->flagStaticHufTable = HUF_repeat_none;
+        zc->hufCTable_repeatMode = HUF_repeat_none;
         return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
     }
     if (cLitSize==1) {
-        zc->flagStaticHufTable = HUF_repeat_none;
+        zc->hufCTable_repeatMode = HUF_repeat_none;
         return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
     }
 
@@ -637,12 +721,12 @@ MEM_STATIC size_t ZSTD_compressSequences (ZSTD_CCtx* zc,
 
     /* CTable for Literal Lengths */
     {   U32 max = MaxLL;
-        size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, zc->tmpCounters);
+        size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, zc->entropyScratchSpace);
         if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
             *op++ = llCodeTable[0];
             FSE_buildCTable_rle(CTable_LitLength, (BYTE)max);
             LLtype = set_rle;
-        } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
+        } else if ((zc->fseCTables_ready) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
             LLtype = set_repeat;
         } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (LL_defaultNormLog-1)))) {
             FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
@@ -653,7 +737,7 @@ MEM_STATIC size_t ZSTD_compressSequences (ZSTD_CCtx* zc,
             if (count[llCodeTable[nbSeq-1]]>1) { count[llCodeTable[nbSeq-1]]--; nbSeq_1--; }
             FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
             { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
-              if (FSE_isError(NCountSize)) return ERROR(GENERIC);
+              if (FSE_isError(NCountSize)) return NCountSize;
               op += NCountSize; }
             FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
             LLtype = set_compressed;
@@ -661,12 +745,12 @@ MEM_STATIC size_t ZSTD_compressSequences (ZSTD_CCtx* zc,
 
     /* CTable for Offsets */
     {   U32 max = MaxOff;
-        size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, zc->tmpCounters);
+        size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, zc->entropyScratchSpace);
         if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
             *op++ = ofCodeTable[0];
             FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max);
             Offtype = set_rle;
-        } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
+        } else if ((zc->fseCTables_ready) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
             Offtype = set_repeat;
         } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (OF_defaultNormLog-1)))) {
             FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, MaxOff, OF_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
@@ -677,7 +761,7 @@ MEM_STATIC size_t ZSTD_compressSequences (ZSTD_CCtx* zc,
             if (count[ofCodeTable[nbSeq-1]]>1) { count[ofCodeTable[nbSeq-1]]--; nbSeq_1--; }
             FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
             { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
-              if (FSE_isError(NCountSize)) return ERROR(GENERIC);
+              if (FSE_isError(NCountSize)) return NCountSize;
               op += NCountSize; }
             FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
             Offtype = set_compressed;
@@ -685,12 +769,12 @@ MEM_STATIC size_t ZSTD_compressSequences (ZSTD_CCtx* zc,
 
     /* CTable for MatchLengths */
     {   U32 max = MaxML;
-        size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, zc->tmpCounters);
+        size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, zc->entropyScratchSpace);
         if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
             *op++ = *mlCodeTable;
             FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max);
             MLtype = set_rle;
-        } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
+        } else if ((zc->fseCTables_ready) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
             MLtype = set_repeat;
         } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (ML_defaultNormLog-1)))) {
             FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
@@ -701,14 +785,14 @@ MEM_STATIC size_t ZSTD_compressSequences (ZSTD_CCtx* zc,
             if (count[mlCodeTable[nbSeq-1]]>1) { count[mlCodeTable[nbSeq-1]]--; nbSeq_1--; }
             FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
             { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
-              if (FSE_isError(NCountSize)) return ERROR(GENERIC);
+              if (FSE_isError(NCountSize)) return NCountSize;
               op += NCountSize; }
             FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
             MLtype = set_compressed;
     }   }
 
     *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
-    zc->flagStaticTables = 0;
+    zc->fseCTables_ready = 0;
 
     /* Encoding Sequences */
     {   BIT_CStream_t blockStream;
@@ -787,7 +871,7 @@ _check_compressibility:
     {   size_t const minGain = ZSTD_minGain(srcSize);
         size_t const maxCSize = srcSize - minGain;
         if ((size_t)(op-ostart) >= maxCSize) {
-            zc->flagStaticHufTable = HUF_repeat_none;
+            zc->hufCTable_repeatMode = HUF_repeat_none;
             return 0;
     }   }
 
@@ -816,7 +900,7 @@ MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const v
         const U32 pos = (U32)((const BYTE*)literals - g_start);
         if (g_start==NULL) g_start = (const BYTE*)literals;
         if ((pos > 1895000) && (pos < 1895300))
-            fprintf(stderr, "Cpos %6u :%5u literals & match %3u bytes at distance %6u \n",
+            DEBUGLOG(5, "Cpos %6u :%5u literals & match %3u bytes at distance %6u \n",
                    pos, (U32)litLength, (U32)matchCode+MINMATCH, (U32)offsetCode);
     }
 #endif
@@ -825,14 +909,20 @@ MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const v
     seqStorePtr->lit += litLength;
 
     /* literal Length */
-    if (litLength>0xFFFF) { seqStorePtr->longLengthID = 1; seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); }
+    if (litLength>0xFFFF) {
+        seqStorePtr->longLengthID = 1;
+        seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+    }
     seqStorePtr->sequences[0].litLength = (U16)litLength;
 
     /* match offset */
     seqStorePtr->sequences[0].offset = offsetCode + 1;
 
     /* match Length */
-    if (matchCode>0xFFFF) { seqStorePtr->longLengthID = 2; seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); }
+    if (matchCode>0xFFFF) {
+        seqStorePtr->longLengthID = 2;
+        seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+    }
     seqStorePtr->sequences[0].matchLength = (U16)matchCode;
 
     seqStorePtr->sequences++;
@@ -853,7 +943,14 @@ static unsigned ZSTD_NbCommonBytes (register size_t val)
 #       elif defined(__GNUC__) && (__GNUC__ >= 3)
             return (__builtin_ctzll((U64)val) >> 3);
 #       else
-            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
+            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2,
+                                                     0, 3, 1, 3, 1, 4, 2, 7,
+                                                     0, 2, 3, 6, 1, 5, 3, 5,
+                                                     1, 3, 4, 4, 2, 5, 6, 7,
+                                                     7, 0, 1, 2, 3, 3, 4, 6,
+                                                     2, 6, 5, 5, 3, 4, 5, 6,
+                                                     7, 1, 2, 4, 6, 4, 4, 5,
+                                                     7, 2, 6, 5, 7, 6, 7, 7 };
             return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
 #       endif
         } else { /* 32 bits */
@@ -864,7 +961,10 @@ static unsigned ZSTD_NbCommonBytes (register size_t val)
 #       elif defined(__GNUC__) && (__GNUC__ >= 3)
             return (__builtin_ctz((U32)val) >> 3);
 #       else
-            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
+            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0,
+                                                     3, 2, 2, 1, 3, 2, 0, 1,
+                                                     3, 3, 1, 2, 2, 2, 2, 0,
+                                                     3, 1, 2, 0, 1, 0, 1, 1 };
             return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
 #       endif
         }
@@ -936,7 +1036,7 @@ static size_t ZSTD_count_2segments(const BYTE* ip, const BYTE* match, const BYTE
 ***************************************/
 static const U32 prime3bytes = 506832829U;
 static U32    ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes)  >> (32-h) ; }
-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); }   /* only in zstd_opt.h */
+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
 
 static const U32 prime4bytes = 2654435761U;
 static U32    ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
@@ -1085,7 +1185,7 @@ static void ZSTD_compressBlock_fast(ZSTD_CCtx* ctx,
     const U32 mls = ctx->params.cParams.searchLength;
     switch(mls)
     {
-    default:
+    default: /* includes case 3 */
     case 4 :
         ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 4); return;
     case 5 :
@@ -1135,7 +1235,7 @@ static void ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
         if ( (((U32)((dictLimit-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex))
            && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
             const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
-            mLength = ZSTD_count_2segments(ip+1+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repMatchEnd, lowPrefixPtr) + EQUAL_READ32;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, lowPrefixPtr) + 4;
             ip++;
             ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH);
         } else {
@@ -1147,7 +1247,7 @@ static void ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
             {   const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend;
                 const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr;
                 U32 offset;
-                mLength = ZSTD_count_2segments(ip+EQUAL_READ32, match+EQUAL_READ32, iend, matchEnd, lowPrefixPtr) + EQUAL_READ32;
+                mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, lowPrefixPtr) + 4;
                 while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
                 offset = current - matchIndex;
                 offset_2 = offset_1;
@@ -1171,7 +1271,7 @@ static void ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
                 if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex))  /* intentional overflow */
                    && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
                     const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
-                    size_t repLength2 = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch2+EQUAL_READ32, iend, repEnd2, lowPrefixPtr) + EQUAL_READ32;
+                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, lowPrefixPtr) + 4;
                     U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
                     ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2-MINMATCH);
                     hashTable[ZSTD_hashPtr(ip, hBits, mls)] = current2;
@@ -1199,7 +1299,7 @@ static void ZSTD_compressBlock_fast_extDict(ZSTD_CCtx* ctx,
     U32 const mls = ctx->params.cParams.searchLength;
     switch(mls)
     {
-    default:
+    default: /* includes case 3 */
     case 4 :
         ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 4); return;
     case 5 :
@@ -1274,7 +1374,9 @@ void ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx* cctx,
         const BYTE* match = base + matchIndexS;
         hashLong[h2] = hashSmall[h] = current;   /* update hash tables */
 
-        if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { /* note : by construction, offset_1 <= current */
+        assert(offset_1 <= current);   /* supposed guaranteed by construction */
+        if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
+            /* favor repcode */
             mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
             ip++;
             ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH);
@@ -1285,15 +1387,15 @@ void ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx* cctx,
                 offset = (U32)(ip-matchLong);
                 while (((ip>anchor) & (matchLong>lowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
             } else if ( (matchIndexS > lowestIndex) && (MEM_read32(match) == MEM_read32(ip)) ) {
-                size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
-                U32 const matchIndex3 = hashLong[h3];
-                const BYTE* match3 = base + matchIndex3;
-                hashLong[h3] = current + 1;
-                if ( (matchIndex3 > lowestIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) {
-                    mLength = ZSTD_count(ip+9, match3+8, iend) + 8;
+                size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+                U32 const matchIndexL3 = hashLong[hl3];
+                const BYTE* matchL3 = base + matchIndexL3;
+                hashLong[hl3] = current + 1;
+                if ( (matchIndexL3 > lowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1)) ) {
+                    mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8;
                     ip++;
-                    offset = (U32)(ip-match3);
-                    while (((ip>anchor) & (match3>lowest)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */
+                    offset = (U32)(ip-matchL3);
+                    while (((ip>anchor) & (matchL3>lowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
                 } else {
                     mLength = ZSTD_count(ip+4, match+4, iend) + 4;
                     offset = (U32)(ip-match);
@@ -1353,7 +1455,7 @@ static void ZSTD_compressBlock_doubleFast(ZSTD_CCtx* ctx, const void* src, size_
     const U32 mls = ctx->params.cParams.searchLength;
     switch(mls)
     {
-    default:
+    default: /* includes case 3 */
     case 4 :
         ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 4); return;
     case 5 :
@@ -1462,8 +1564,8 @@ static void ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx* ctx,
 
         if (ip <= ilimit) {
             /* Fill Table */
-			hashSmall[ZSTD_hashPtr(base+current+2, hBitsS, mls)] = current+2;
-			hashLong[ZSTD_hashPtr(base+current+2, hBitsL, 8)] = current+2;
+            hashSmall[ZSTD_hashPtr(base+current+2, hBitsS, mls)] = current+2;
+            hashLong[ZSTD_hashPtr(base+current+2, hBitsL, 8)] = current+2;
             hashSmall[ZSTD_hashPtr(ip-2, hBitsS, mls)] = (U32)(ip-2-base);
             hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
             /* check immediate repcode */
@@ -1474,7 +1576,7 @@ static void ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx* ctx,
                 if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex))  /* intentional overflow */
                    && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
                     const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
-                    size_t const repLength2 = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch2+EQUAL_READ32, iend, repEnd2, lowPrefixPtr) + EQUAL_READ32;
+                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, lowPrefixPtr) + 4;
                     U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
                     ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2-MINMATCH);
                     hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
@@ -1503,7 +1605,7 @@ static void ZSTD_compressBlock_doubleFast_extDict(ZSTD_CCtx* ctx,
     U32 const mls = ctx->params.cParams.searchLength;
     switch(mls)
     {
-    default:
+    default: /* includes case 3 */
     case 4 :
         ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 4); return;
     case 5 :
@@ -1588,7 +1690,7 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc, const BYTE* const ip, const U32 mls, co
             match = dictBase + matchIndex;
             matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
             if (matchIndex+matchLength >= dictLimit)
-				match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
         }
 
         if (matchLength > bestLength) {
@@ -1667,7 +1769,7 @@ static size_t ZSTD_insertBtAndFindBestMatch (
             match = dictBase + matchIndex;
             matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
             if (matchIndex+matchLength >= dictLimit)
-				match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
         }
 
         if (matchLength > bestLength) {
@@ -1733,9 +1835,10 @@ static size_t ZSTD_BtFindBestMatch_selectMLS (
 {
     switch(matchLengthSearch)
     {
-    default :
+    default : /* includes case 3 */
     case 4 : return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4);
     case 5 : return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5);
+    case 7 :
     case 6 : return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6);
     }
 }
@@ -1772,9 +1875,10 @@ static size_t ZSTD_BtFindBestMatch_selectMLS_extDict (
 {
     switch(matchLengthSearch)
     {
-    default :
+    default : /* includes case 3 */
     case 4 : return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4);
     case 5 : return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5);
+    case 7 :
     case 6 : return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6);
     }
 }
@@ -1831,7 +1935,7 @@ size_t ZSTD_HcFindBestMatch_generic (
     const U32 current = (U32)(ip-base);
     const U32 minChain = current > chainSize ? current - chainSize : 0;
     int nbAttempts=maxNbAttempts;
-    size_t ml=EQUAL_READ32-1;
+    size_t ml=4-1;
 
     /* HC4 match finder */
     U32 matchIndex = ZSTD_insertAndFindFirstIndex (zc, ip, mls);
@@ -1846,11 +1950,15 @@ size_t ZSTD_HcFindBestMatch_generic (
         } else {
             match = dictBase + matchIndex;
             if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
-                currentMl = ZSTD_count_2segments(ip+EQUAL_READ32, match+EQUAL_READ32, iLimit, dictEnd, prefixStart) + EQUAL_READ32;
+                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
         }
 
         /* save best solution */
-        if (currentMl > ml) { ml = currentMl; *offsetPtr = current - matchIndex + ZSTD_REP_MOVE; if (ip+currentMl == iLimit) break; /* best possible, and avoid read overflow*/ }
+        if (currentMl > ml) {
+            ml = currentMl;
+            *offsetPtr = current - matchIndex + ZSTD_REP_MOVE;
+            if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+        }
 
         if (matchIndex <= minChain) break;
         matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
@@ -1868,9 +1976,10 @@ FORCE_INLINE size_t ZSTD_HcFindBestMatch_selectMLS (
 {
     switch(matchLengthSearch)
     {
-    default :
+    default : /* includes case 3 */
     case 4 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 0);
     case 5 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5, 0);
+    case 7 :
     case 6 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 0);
     }
 }
@@ -1884,9 +1993,10 @@ FORCE_INLINE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
 {
     switch(matchLengthSearch)
     {
-    default :
+    default : /* includes case 3 */
     case 4 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 1);
     case 5 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5, 1);
+    case 7 :
     case 6 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 1);
     }
 }
@@ -1934,7 +2044,7 @@ void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
         /* check repCode */
         if ((offset_1>0) & (MEM_read32(ip+1) == MEM_read32(ip+1 - offset_1))) {
             /* repcode : we take it */
-            matchLength = ZSTD_count(ip+1+EQUAL_READ32, ip+1+EQUAL_READ32-offset_1, iend) + EQUAL_READ32;
+            matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
             if (depth==0) goto _storeSequence;
         }
 
@@ -1945,7 +2055,7 @@ void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
                 matchLength = ml2, start = ip, offset=offsetFound;
         }
 
-        if (matchLength < EQUAL_READ32) {
+        if (matchLength < 4) {
             ip += ((ip-anchor) >> g_searchStrength) + 1;   /* jump faster over incompressible sections */
             continue;
         }
@@ -1955,17 +2065,17 @@ void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
         while (ip<ilimit) {
             ip ++;
             if ((offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
-                size_t const mlRep = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-offset_1, iend) + EQUAL_READ32;
+                size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                 int const gain2 = (int)(mlRep * 3);
                 int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
-                if ((mlRep >= EQUAL_READ32) && (gain2 > gain1))
+                if ((mlRep >= 4) && (gain2 > gain1))
                     matchLength = mlRep, offset = 0, start = ip;
             }
             {   size_t offset2=99999999;
                 size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
                 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
-                if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+                if ((ml2 >= 4) && (gain2 > gain1)) {
                     matchLength = ml2, offset = offset2, start = ip;
                     continue;   /* search a better one */
             }   }
@@ -1974,17 +2084,17 @@ void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
             if ((depth==2) && (ip<ilimit)) {
                 ip ++;
                 if ((offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
-                    size_t const ml2 = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-offset_1, iend) + EQUAL_READ32;
+                    size_t const ml2 = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                     int const gain2 = (int)(ml2 * 4);
                     int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
-                    if ((ml2 >= EQUAL_READ32) && (gain2 > gain1))
+                    if ((ml2 >= 4) && (gain2 > gain1))
                         matchLength = ml2, offset = 0, start = ip;
                 }
                 {   size_t offset2=99999999;
                     size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
                     int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                     int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
-                    if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+                    if ((ml2 >= 4) && (gain2 > gain1)) {
                         matchLength = ml2, offset = offset2, start = ip;
                         continue;
             }   }   }
@@ -1993,7 +2103,9 @@ void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
 
         /* catch up */
         if (offset) {
-            while ((start>anchor) && (start>base+offset-ZSTD_REP_MOVE) && (start[-1] == start[-1-offset+ZSTD_REP_MOVE]))   /* only search for offset within prefix */
+            while ( (start > anchor)
+                 && (start > base+offset-ZSTD_REP_MOVE)
+                 && (start[-1] == start[-1-offset+ZSTD_REP_MOVE]) )  /* only search for offset within prefix */
                 { start--; matchLength++; }
             offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
         }
@@ -2010,7 +2122,7 @@ _storeSequence:
              && ((offset_2>0)
              & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
             /* store sequence */
-            matchLength = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-offset_2, iend) + EQUAL_READ32;
+            matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
             offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
             ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength-MINMATCH);
             ip += matchLength;
@@ -2099,7 +2211,7 @@ void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
             if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
                 /* repcode detected we should take it */
                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
-                matchLength = ZSTD_count_2segments(ip+1+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32;
+                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                 if (depth==0) goto _storeSequence;
         }   }
 
@@ -2110,7 +2222,7 @@ void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
                 matchLength = ml2, start = ip, offset=offsetFound;
         }
 
-         if (matchLength < EQUAL_READ32) {
+         if (matchLength < 4) {
             ip += ((ip-anchor) >> g_searchStrength) + 1;   /* jump faster over incompressible sections */
             continue;
         }
@@ -2129,10 +2241,10 @@ void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
                 if (MEM_read32(ip) == MEM_read32(repMatch)) {
                     /* repcode detected */
                     const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
-                    size_t const repLength = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32;
+                    size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                     int const gain2 = (int)(repLength * 3);
                     int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
-                    if ((repLength >= EQUAL_READ32) && (gain2 > gain1))
+                    if ((repLength >= 4) && (gain2 > gain1))
                         matchLength = repLength, offset = 0, start = ip;
             }   }
 
@@ -2141,7 +2253,7 @@ void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
                 size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
                 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
-                if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+                if ((ml2 >= 4) && (gain2 > gain1)) {
                     matchLength = ml2, offset = offset2, start = ip;
                     continue;   /* search a better one */
             }   }
@@ -2159,10 +2271,10 @@ void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
                     if (MEM_read32(ip) == MEM_read32(repMatch)) {
                         /* repcode detected */
                         const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
-                        size_t repLength = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32;
-                        int gain2 = (int)(repLength * 4);
-                        int gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
-                        if ((repLength >= EQUAL_READ32) && (gain2 > gain1))
+                        size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                        int const gain2 = (int)(repLength * 4);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+                        if ((repLength >= 4) && (gain2 > gain1))
                             matchLength = repLength, offset = 0, start = ip;
                 }   }
 
@@ -2171,7 +2283,7 @@ void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
                     size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
                     int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                     int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
-                    if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+                    if ((ml2 >= 4) && (gain2 > gain1)) {
                         matchLength = ml2, offset = offset2, start = ip;
                         continue;
             }   }   }
@@ -2203,7 +2315,7 @@ _storeSequence:
             if (MEM_read32(ip) == MEM_read32(repMatch)) {
                 /* repcode detected we should take it */
                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
-                matchLength = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32;
+                matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                 offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap offset history */
                 ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength-MINMATCH);
                 ip += matchLength;
@@ -2294,8 +2406,12 @@ typedef void (*ZSTD_blockCompressor) (ZSTD_CCtx* ctx, const void* src, size_t sr
 static ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict)
 {
     static const ZSTD_blockCompressor blockCompressor[2][8] = {
-        { ZSTD_compressBlock_fast, ZSTD_compressBlock_doubleFast, ZSTD_compressBlock_greedy, ZSTD_compressBlock_lazy, ZSTD_compressBlock_lazy2, ZSTD_compressBlock_btlazy2, ZSTD_compressBlock_btopt, ZSTD_compressBlock_btopt2 },
-        { ZSTD_compressBlock_fast_extDict, ZSTD_compressBlock_doubleFast_extDict, ZSTD_compressBlock_greedy_extDict, ZSTD_compressBlock_lazy_extDict,ZSTD_compressBlock_lazy2_extDict, ZSTD_compressBlock_btlazy2_extDict, ZSTD_compressBlock_btopt_extDict, ZSTD_compressBlock_btopt2_extDict }
+        { ZSTD_compressBlock_fast, ZSTD_compressBlock_doubleFast, ZSTD_compressBlock_greedy,
+          ZSTD_compressBlock_lazy, ZSTD_compressBlock_lazy2, ZSTD_compressBlock_btlazy2,
+          ZSTD_compressBlock_btopt, ZSTD_compressBlock_btopt2 },
+        { ZSTD_compressBlock_fast_extDict, ZSTD_compressBlock_doubleFast_extDict, ZSTD_compressBlock_greedy_extDict,
+          ZSTD_compressBlock_lazy_extDict,ZSTD_compressBlock_lazy2_extDict, ZSTD_compressBlock_btlazy2_extDict,
+          ZSTD_compressBlock_btopt_extDict, ZSTD_compressBlock_btopt2_extDict }
     };
 
     return blockCompressor[extDict][(U32)strat];
@@ -2311,7 +2427,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCa
     if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) return 0;   /* don't even attempt compression below a certain srcSize */
     ZSTD_resetSeqStore(&(zc->seqStore));
     if (current > zc->nextToUpdate + 384)
-        zc->nextToUpdate = current - MIN(192, (U32)(current - zc->nextToUpdate - 384));   /* update tree not updated after finding very long rep matches */
+        zc->nextToUpdate = current - MIN(192, (U32)(current - zc->nextToUpdate - 384));   /* limited update after finding a very long match */
     blockCompressor(zc, src, srcSize);
     return ZSTD_compressSequences(zc, dst, dstCapacity, srcSize);
 }
@@ -2343,7 +2459,8 @@ static size_t ZSTD_compress_generic (ZSTD_CCtx* cctx,
         U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
         size_t cSize;
 
-        if (dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE) return ERROR(dstSize_tooSmall);   /* not enough space to store compressed block */
+        if (dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE)
+            return ERROR(dstSize_tooSmall);   /* not enough space to store compressed block */
         if (remaining < blockSize) blockSize = remaining;
 
         /* preemptive overflow correction */
@@ -2398,7 +2515,8 @@ static size_t ZSTD_compress_generic (ZSTD_CCtx* cctx,
 static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity,
                                     ZSTD_parameters params, U64 pledgedSrcSize, U32 dictID)
 {   BYTE* const op = (BYTE*)dst;
-    U32   const dictIDSizeCode = (dictID>0) + (dictID>=256) + (dictID>=65536);   /* 0-3 */
+    U32   const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536);   /* 0-3 */
+    U32   const dictIDSizeCode = params.fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength;   /* 0-3 */
     U32   const checksumFlag = params.fParams.checksumFlag>0;
     U32   const windowSize = 1U << params.cParams.windowLog;
     U32   const singleSegment = params.fParams.contentSizeFlag && (windowSize >= pledgedSrcSize);
@@ -2410,6 +2528,9 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity,
     size_t pos;
 
     if (dstCapacity < ZSTD_frameHeaderSize_max) return ERROR(dstSize_tooSmall);
+    DEBUGLOG(5, "ZSTD_writeFrameHeader : dictIDFlag : %u \n", !params.fParams.noDictIDFlag);
+    DEBUGLOG(5, "ZSTD_writeFrameHeader : dictID : %u \n", dictID);
+    DEBUGLOG(5, "ZSTD_writeFrameHeader : dictIDSizeCode : %u \n", dictIDSizeCode);
 
     MEM_writeLE32(dst, ZSTD_MAGICNUMBER);
     op[4] = frameHeaderDecriptionByte; pos=5;
@@ -2478,6 +2599,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
                              ZSTD_compress_generic (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) :
                              ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize);
         if (ZSTD_isError(cSize)) return cSize;
+        cctx->consumedSrcSize += srcSize;
         return cSize + fhSize;
     } else
         return fhSize;
@@ -2488,7 +2610,7 @@ size_t ZSTD_compressContinue (ZSTD_CCtx* cctx,
                               void* dst, size_t dstCapacity,
                         const void* src, size_t srcSize)
 {
-    return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1, 0);
+    return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */);
 }
 
 
@@ -2501,10 +2623,12 @@ size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const
 {
     size_t const blockSizeMax = ZSTD_getBlockSizeMax(cctx);
     if (srcSize > blockSizeMax) return ERROR(srcSize_wrong);
-    return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0, 0);
+    return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */);
 }
 
-
+/*! ZSTD_loadDictionaryContent() :
+ *  @return : 0, or an error code
+ */
 static size_t ZSTD_loadDictionaryContent(ZSTD_CCtx* zc, const void* src, size_t srcSize)
 {
     const BYTE* const ip = (const BYTE*) src;
@@ -2534,13 +2658,15 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_CCtx* zc, const void* src, size_t
     case ZSTD_greedy:
     case ZSTD_lazy:
     case ZSTD_lazy2:
-        ZSTD_insertAndFindFirstIndex (zc, iend-HASH_READ_SIZE, zc->params.cParams.searchLength);
+        if (srcSize >= HASH_READ_SIZE)
+            ZSTD_insertAndFindFirstIndex(zc, iend-HASH_READ_SIZE, zc->params.cParams.searchLength);
         break;
 
     case ZSTD_btlazy2:
     case ZSTD_btopt:
     case ZSTD_btopt2:
-        ZSTD_updateTree(zc, iend-HASH_READ_SIZE, iend, 1 << zc->params.cParams.searchLog, zc->params.cParams.searchLength);
+        if (srcSize >= HASH_READ_SIZE)
+            ZSTD_updateTree(zc, iend-HASH_READ_SIZE, iend, 1 << zc->params.cParams.searchLog, zc->params.cParams.searchLength);
         break;
 
     default:
@@ -2567,18 +2693,15 @@ static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSym
 
 
 /* Dictionary format :
-    Magic == ZSTD_DICT_MAGIC (4 bytes)
-    HUF_writeCTable(256)
-    FSE_writeNCount(off)
-    FSE_writeNCount(ml)
-    FSE_writeNCount(ll)
-    RepOffsets
-    Dictionary content
-*/
-/*! ZSTD_loadDictEntropyStats() :
-    @return : size read from dictionary
-    note : magic number supposed already checked */
-static size_t ZSTD_loadDictEntropyStats(ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+ * See :
+ * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
+ */
+/*! ZSTD_loadZstdDictionary() :
+ * @return : 0, or an error code
+ *  assumptions : magic number supposed already checked
+ *                dictSize supposed > 8
+ */
+static size_t ZSTD_loadZstdDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
 {
     const BYTE* dictPtr = (const BYTE*)dict;
     const BYTE* const dictEnd = dictPtr + dictSize;
@@ -2586,7 +2709,11 @@ static size_t ZSTD_loadDictEntropyStats(ZSTD_CCtx* cctx, const void* dict, size_
     unsigned offcodeMaxValue = MaxOff;
     BYTE scratchBuffer[1<<MAX(MLFSELog,LLFSELog)];
 
-    {   size_t const hufHeaderSize = HUF_readCTable(cctx->hufTable, 255, dict, dictSize);
+    dictPtr += 4;   /* skip magic number */
+    cctx->dictID = cctx->params.fParams.noDictIDFlag ? 0 :  MEM_readLE32(dictPtr);
+    dictPtr += 4;
+
+    {   size_t const hufHeaderSize = HUF_readCTable(cctx->hufCTable, 255, dictPtr, dictEnd-dictPtr);
         if (HUF_isError(hufHeaderSize)) return ERROR(dictionary_corrupted);
         dictPtr += hufHeaderSize;
     }
@@ -2596,7 +2723,8 @@ static size_t ZSTD_loadDictEntropyStats(ZSTD_CCtx* cctx, const void* dict, size_
         if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
         if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
         /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
-        CHECK_E (FSE_buildCTable_wksp(cctx->offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, scratchBuffer, sizeof(scratchBuffer)), dictionary_corrupted);
+        CHECK_E( FSE_buildCTable_wksp(cctx->offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, scratchBuffer, sizeof(scratchBuffer)),
+                 dictionary_corrupted);
         dictPtr += offcodeHeaderSize;
     }
 
@@ -2606,8 +2734,9 @@ static size_t ZSTD_loadDictEntropyStats(ZSTD_CCtx* cctx, const void* dict, size_
         if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
         if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
         /* Every match length code must have non-zero probability */
-        CHECK_F (ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML));
-        CHECK_E (FSE_buildCTable_wksp(cctx->matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, scratchBuffer, sizeof(scratchBuffer)), dictionary_corrupted);
+        CHECK_F( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML));
+        CHECK_E( FSE_buildCTable_wksp(cctx->matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, scratchBuffer, sizeof(scratchBuffer)),
+                 dictionary_corrupted);
         dictPtr += matchlengthHeaderSize;
     }
 
@@ -2617,49 +2746,51 @@ static size_t ZSTD_loadDictEntropyStats(ZSTD_CCtx* cctx, const void* dict, size_
         if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
         if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
         /* Every literal length code must have non-zero probability */
-        CHECK_F (ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL));
-        CHECK_E(FSE_buildCTable_wksp(cctx->litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, scratchBuffer, sizeof(scratchBuffer)), dictionary_corrupted);
+        CHECK_F( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL));
+        CHECK_E( FSE_buildCTable_wksp(cctx->litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, scratchBuffer, sizeof(scratchBuffer)),
+                 dictionary_corrupted);
         dictPtr += litlengthHeaderSize;
     }
 
     if (dictPtr+12 > dictEnd) return ERROR(dictionary_corrupted);
-    cctx->rep[0] = MEM_readLE32(dictPtr+0); if (cctx->rep[0] == 0 || cctx->rep[0] >= dictSize) return ERROR(dictionary_corrupted);
-    cctx->rep[1] = MEM_readLE32(dictPtr+4); if (cctx->rep[1] == 0 || cctx->rep[1] >= dictSize) return ERROR(dictionary_corrupted);
-    cctx->rep[2] = MEM_readLE32(dictPtr+8); if (cctx->rep[2] == 0 || cctx->rep[2] >= dictSize) return ERROR(dictionary_corrupted);
+    cctx->rep[0] = MEM_readLE32(dictPtr+0);
+    cctx->rep[1] = MEM_readLE32(dictPtr+4);
+    cctx->rep[2] = MEM_readLE32(dictPtr+8);
     dictPtr += 12;
 
-    {   U32 offcodeMax = MaxOff;
-        if ((size_t)(dictEnd - dictPtr) <= ((U32)-1) - 128 KB) {
-            U32 const maxOffset = (U32)(dictEnd - dictPtr) + 128 KB; /* The maximum offset that must be supported */
-            /* Calculate minimum offset code required to represent maxOffset */
-            offcodeMax = ZSTD_highbit32(maxOffset);
+    {   size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
+        U32 offcodeMax = MaxOff;
+        if (dictContentSize <= ((U32)-1) - 128 KB) {
+            U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
+            offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
         }
-        /* Every possible supported offset <= dictContentSize + 128 KB must be representable */
+        /* All offset values <= dictContentSize + 128 KB must be representable */
         CHECK_F (ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)));
-    }
+        /* All repCodes must be <= dictContentSize and != 0*/
+        {   U32 u;
+            for (u=0; u<3; u++) {
+                if (cctx->rep[u] == 0) return ERROR(dictionary_corrupted);
+                if (cctx->rep[u] > dictContentSize) return ERROR(dictionary_corrupted);
+        }   }
 
-    cctx->flagStaticTables = 1;
-    cctx->flagStaticHufTable = HUF_repeat_valid;
-    return dictPtr - (const BYTE*)dict;
+        cctx->fseCTables_ready = 1;
+        cctx->hufCTable_repeatMode = HUF_repeat_valid;
+        return ZSTD_loadDictionaryContent(cctx, dictPtr, dictContentSize);
+    }
 }
 
 /** ZSTD_compress_insertDictionary() :
 *   @return : 0, or an error code */
-static size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* zc, const void* dict, size_t dictSize)
+static size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
 {
     if ((dict==NULL) || (dictSize<=8)) return 0;
 
     /* dict as pure content */
-    if ((MEM_readLE32(dict) != ZSTD_DICT_MAGIC) || (zc->forceRawDict))
-        return ZSTD_loadDictionaryContent(zc, dict, dictSize);
-    zc->dictID = zc->params.fParams.noDictIDFlag ? 0 :  MEM_readLE32((const char*)dict+4);
+    if ((MEM_readLE32(dict) != ZSTD_DICT_MAGIC) || (cctx->forceRawDict))
+        return ZSTD_loadDictionaryContent(cctx, dict, dictSize);
 
-    /* known magic number : dict is parsed for entropy stats and content */
-    {   size_t const loadError = ZSTD_loadDictEntropyStats(zc, (const char*)dict+8 /* skip dictHeader */, dictSize-8);
-        size_t const eSize = loadError + 8;
-        if (ZSTD_isError(loadError)) return loadError;
-        return ZSTD_loadDictionaryContent(zc, (const char*)dict+eSize, dictSize-eSize);
-    }
+    /* dict as zstd dictionary */
+    return ZSTD_loadZstdDictionary(cctx, dict, dictSize);
 }
 
 /*! ZSTD_compressBegin_internal() :
@@ -2669,7 +2800,8 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
                                    ZSTD_parameters params, U64 pledgedSrcSize)
 {
     ZSTD_compResetPolicy_e const crp = dictSize ? ZSTDcrp_fullReset : ZSTDcrp_continue;
-    CHECK_F(ZSTD_resetCCtx_advanced(cctx, params, pledgedSrcSize, crp));
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    CHECK_F(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, crp));
     return ZSTD_compress_insertDictionary(cctx, dict, dictSize);
 }
 
@@ -2745,10 +2877,13 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
                    const void* src, size_t srcSize)
 {
     size_t endResult;
-    size_t const cSize = ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1, 1);
+    size_t const cSize = ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 1 /* last chunk */);
     if (ZSTD_isError(cSize)) return cSize;
     endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize);
     if (ZSTD_isError(endResult)) return endResult;
+    if (cctx->params.fParams.contentSizeFlag) {  /* control src size */
+        if (cctx->frameContentSize != cctx->consumedSrcSize) return ERROR(srcSize_wrong);
+    }
     return cSize + endResult;
 }
 
@@ -2773,7 +2908,8 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
     return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, dict, dictSize, params);
 }
 
-size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict, size_t dictSize, int compressionLevel)
+size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize,
+                               const void* dict, size_t dictSize, int compressionLevel)
 {
     ZSTD_parameters params = ZSTD_getParams(compressionLevel, srcSize, dict ? dictSize : 0);
     params.fParams.contentSizeFlag = 1;
@@ -2812,8 +2948,16 @@ size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict)
     return ZSTD_sizeof_CCtx(cdict->refContext) + (cdict->dictBuffer ? cdict->dictContentSize : 0) + sizeof(*cdict);
 }
 
+static ZSTD_parameters ZSTD_makeParams(ZSTD_compressionParameters cParams, ZSTD_frameParameters fParams)
+{
+    ZSTD_parameters params;
+    params.cParams = cParams;
+    params.fParams = fParams;
+    return params;
+}
+
 ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, unsigned byReference,
-                                      ZSTD_parameters params, ZSTD_customMem customMem)
+                                      ZSTD_compressionParameters cParams, ZSTD_customMem customMem)
 {
     if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem;
     if (!customMem.customAlloc || !customMem.customFree) return NULL;
@@ -2838,7 +2982,9 @@ ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, u
             cdict->dictContent = internalBuffer;
         }
 
-        {   size_t const errorCode = ZSTD_compressBegin_advanced(cctx, cdict->dictContent, dictSize, params, 0);
+        {   ZSTD_frameParameters const fParams = { 0 /* contentSizeFlag */, 0 /* checksumFlag */, 0 /* noDictIDFlag */ };   /* dummy */
+            ZSTD_parameters const params = ZSTD_makeParams(cParams, fParams);
+            size_t const errorCode = ZSTD_compressBegin_advanced(cctx, cdict->dictContent, dictSize, params, 0);
             if (ZSTD_isError(errorCode)) {
                 ZSTD_free(cdict->dictBuffer, customMem);
                 ZSTD_free(cdict, customMem);
@@ -2855,17 +3001,15 @@ ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, u
 ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel)
 {
     ZSTD_customMem const allocator = { NULL, NULL, NULL };
-    ZSTD_parameters params = ZSTD_getParams(compressionLevel, 0, dictSize);
-    params.fParams.contentSizeFlag = 1;
-    return ZSTD_createCDict_advanced(dict, dictSize, 0, params, allocator);
+    ZSTD_compressionParameters cParams = ZSTD_getCParams(compressionLevel, 0, dictSize);
+    return ZSTD_createCDict_advanced(dict, dictSize, 0, cParams, allocator);
 }
 
 ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel)
 {
     ZSTD_customMem const allocator = { NULL, NULL, NULL };
-    ZSTD_parameters params = ZSTD_getParams(compressionLevel, 0, dictSize);
-    params.fParams.contentSizeFlag = 1;
-    return ZSTD_createCDict_advanced(dict, dictSize, 1, params, allocator);
+    ZSTD_compressionParameters cParams = ZSTD_getCParams(compressionLevel, 0, dictSize);
+    return ZSTD_createCDict_advanced(dict, dictSize, 1, cParams, allocator);
 }
 
 size_t ZSTD_freeCDict(ZSTD_CDict* cdict)
@@ -2883,34 +3027,55 @@ static ZSTD_parameters ZSTD_getParamsFromCDict(const ZSTD_CDict* cdict) {
     return ZSTD_getParamsFromCCtx(cdict->refContext);
 }
 
-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, unsigned long long pledgedSrcSize)
+/* ZSTD_compressBegin_usingCDict_advanced() :
+ * cdict must be != NULL */
+size_t ZSTD_compressBegin_usingCDict_advanced(
+    ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict,
+    ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize)
 {
-    if (cdict->dictContentSize) CHECK_F(ZSTD_copyCCtx(cctx, cdict->refContext, pledgedSrcSize))
+    if (cdict==NULL) return ERROR(GENERIC);  /* does not support NULL cdict */
+    DEBUGLOG(5, "ZSTD_compressBegin_usingCDict_advanced : dictIDFlag == %u \n", !fParams.noDictIDFlag);
+    if (cdict->dictContentSize)
+        CHECK_F( ZSTD_copyCCtx_internal(cctx, cdict->refContext, fParams, pledgedSrcSize) )
     else {
         ZSTD_parameters params = cdict->refContext->params;
-        params.fParams.contentSizeFlag = (pledgedSrcSize > 0);
-        CHECK_F(ZSTD_compressBegin_advanced(cctx, NULL, 0, params, pledgedSrcSize));
+        params.fParams = fParams;
+        CHECK_F(ZSTD_compressBegin_internal(cctx, NULL, 0, params, pledgedSrcSize));
     }
     return 0;
 }
 
+/* ZSTD_compressBegin_usingCDict() :
+ * pledgedSrcSize=0 means "unknown"
+ * if pledgedSrcSize>0, it will enable contentSizeFlag */
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+    ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+    DEBUGLOG(5, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u \n", !fParams.noDictIDFlag);
+    return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, 0);
+}
+
+size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+                                void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize,
+                                const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)
+{
+    CHECK_F (ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, srcSize));   /* will check if cdict != NULL */
+    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+}
+
 /*! ZSTD_compress_usingCDict() :
-*   Compression using a digested Dictionary.
-*   Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
-*   Note that compression level is decided during dictionary creation */
+ *  Compression using a digested Dictionary.
+ *  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
+ *  Note that compression parameters are decided at CDict creation time
+ *  while frame parameters are hardcoded */
 size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
                                 void* dst, size_t dstCapacity,
                                 const void* src, size_t srcSize,
                                 const ZSTD_CDict* cdict)
 {
-    CHECK_F(ZSTD_compressBegin_usingCDict(cctx, cdict, srcSize));
-
-    if (cdict->refContext->params.fParams.contentSizeFlag==1) {
-        cctx->params.fParams.contentSizeFlag = 1;
-        cctx->frameContentSize = srcSize;
-    }
-
-    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+    ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+    return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, fParams);
 }
 
 
@@ -2939,7 +3104,6 @@ struct ZSTD_CStream_s {
     U32    checksum;
     U32    frameEnded;
     U64    pledgedSrcSize;
-    U64    inputProcessed;
     ZSTD_parameters params;
     ZSTD_customMem customMem;
 };   /* typedef'd to ZSTD_CStream within "zstd.h" */
@@ -2970,9 +3134,13 @@ size_t ZSTD_freeCStream(ZSTD_CStream* zcs)
     if (zcs==NULL) return 0;   /* support free on NULL */
     {   ZSTD_customMem const cMem = zcs->customMem;
         ZSTD_freeCCtx(zcs->cctx);
+        zcs->cctx = NULL;
         ZSTD_freeCDict(zcs->cdictLocal);
+        zcs->cdictLocal = NULL;
         ZSTD_free(zcs->inBuff, cMem);
+        zcs->inBuff = NULL;
         ZSTD_free(zcs->outBuff, cMem);
+        zcs->outBuff = NULL;
         ZSTD_free(zcs, cMem);
         return 0;
     }
@@ -2982,14 +3150,20 @@ size_t ZSTD_freeCStream(ZSTD_CStream* zcs)
 /*======   Initialization   ======*/
 
 size_t ZSTD_CStreamInSize(void)  { return ZSTD_BLOCKSIZE_ABSOLUTEMAX; }
-size_t ZSTD_CStreamOutSize(void) { return ZSTD_compressBound(ZSTD_BLOCKSIZE_ABSOLUTEMAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; }
+
+size_t ZSTD_CStreamOutSize(void)
+{
+    return ZSTD_compressBound(ZSTD_BLOCKSIZE_ABSOLUTEMAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ;
+}
 
 static size_t ZSTD_resetCStream_internal(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize)
 {
     if (zcs->inBuffSize==0) return ERROR(stage_wrong);   /* zcs has not been init at least once => can't reset */
 
-    if (zcs->cdict) CHECK_F(ZSTD_compressBegin_usingCDict(zcs->cctx, zcs->cdict, pledgedSrcSize))
-    else CHECK_F(ZSTD_compressBegin_advanced(zcs->cctx, NULL, 0, zcs->params, pledgedSrcSize));
+    DEBUGLOG(5, "ZSTD_resetCStream_internal : dictIDFlag == %u \n", !zcs->params.fParams.noDictIDFlag);
+
+    if (zcs->cdict) CHECK_F(ZSTD_compressBegin_usingCDict_advanced(zcs->cctx, zcs->cdict, zcs->params.fParams, pledgedSrcSize))
+    else CHECK_F(ZSTD_compressBegin_internal(zcs->cctx, NULL, 0, zcs->params, pledgedSrcSize));
 
     zcs->inToCompress = 0;
     zcs->inBuffPos = 0;
@@ -2998,7 +3172,6 @@ static size_t ZSTD_resetCStream_internal(ZSTD_CStream* zcs, unsigned long long p
     zcs->stage = zcss_load;
     zcs->frameEnded = 0;
     zcs->pledgedSrcSize = pledgedSrcSize;
-    zcs->inputProcessed = 0;
     return 0;   /* ready to go */
 }
 
@@ -3006,70 +3179,109 @@ size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize)
 {
 
     zcs->params.fParams.contentSizeFlag = (pledgedSrcSize > 0);
-
+    DEBUGLOG(5, "ZSTD_resetCStream : dictIDFlag == %u \n", !zcs->params.fParams.noDictIDFlag);
     return ZSTD_resetCStream_internal(zcs, pledgedSrcSize);
 }
 
-size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
-                                 const void* dict, size_t dictSize,
-                                 ZSTD_parameters params, unsigned long long pledgedSrcSize)
+/* ZSTD_initCStream_internal() :
+ * params are supposed validated at this stage
+ * and zcs->cdict is supposed to be correct */
+static size_t ZSTD_initCStream_stage2(ZSTD_CStream* zcs,
+                                const ZSTD_parameters params,
+                                unsigned long long pledgedSrcSize)
 {
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+
     /* allocate buffers */
     {   size_t const neededInBuffSize = (size_t)1 << params.cParams.windowLog;
         if (zcs->inBuffSize < neededInBuffSize) {
-            zcs->inBuffSize = neededInBuffSize;
+            zcs->inBuffSize = 0;
             ZSTD_free(zcs->inBuff, zcs->customMem);
             zcs->inBuff = (char*) ZSTD_malloc(neededInBuffSize, zcs->customMem);
             if (zcs->inBuff == NULL) return ERROR(memory_allocation);
+            zcs->inBuffSize = neededInBuffSize;
         }
         zcs->blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, neededInBuffSize);
     }
     if (zcs->outBuffSize < ZSTD_compressBound(zcs->blockSize)+1) {
-        zcs->outBuffSize = ZSTD_compressBound(zcs->blockSize)+1;
+        size_t const outBuffSize = ZSTD_compressBound(zcs->blockSize)+1;
+        zcs->outBuffSize = 0;
         ZSTD_free(zcs->outBuff, zcs->customMem);
-        zcs->outBuff = (char*) ZSTD_malloc(zcs->outBuffSize, zcs->customMem);
+        zcs->outBuff = (char*) ZSTD_malloc(outBuffSize, zcs->customMem);
         if (zcs->outBuff == NULL) return ERROR(memory_allocation);
+        zcs->outBuffSize = outBuffSize;
     }
 
-    if (dict && dictSize >= 8) {
-        ZSTD_freeCDict(zcs->cdictLocal);
-        zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, 0, params, zcs->customMem);
-        if (zcs->cdictLocal == NULL) return ERROR(memory_allocation);
-        zcs->cdict = zcs->cdictLocal;
-    } else zcs->cdict = NULL;
-
     zcs->checksum = params.fParams.checksumFlag > 0;
     zcs->params = params;
 
+    DEBUGLOG(5, "ZSTD_initCStream_stage2 : dictIDFlag == %u \n", !params.fParams.noDictIDFlag);
     return ZSTD_resetCStream_internal(zcs, pledgedSrcSize);
 }
 
+/* ZSTD_initCStream_usingCDict_advanced() :
+ * same as ZSTD_initCStream_usingCDict(), with control over frame parameters */
+size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, unsigned long long pledgedSrcSize, ZSTD_frameParameters fParams)
+{
+    if (!cdict) return ERROR(GENERIC);   /* cannot handle NULL cdict (does not know what to do) */
+    {   ZSTD_parameters params = ZSTD_getParamsFromCDict(cdict);
+        params.fParams = fParams;
+        zcs->cdict = cdict;
+        return ZSTD_initCStream_stage2(zcs, params, pledgedSrcSize);
+    }
+}
+
 /* note : cdict must outlive compression session */
 size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict)
 {
-    ZSTD_parameters const params = ZSTD_getParamsFromCDict(cdict);
-    size_t const initError =  ZSTD_initCStream_advanced(zcs, NULL, 0, params, 0);
-    zcs->cdict = cdict;
-    zcs->cctx->dictID = params.fParams.noDictIDFlag ? 0 : cdict->refContext->dictID;
-    return initError;
+    ZSTD_frameParameters const fParams = { 0 /* content */, 0 /* checksum */, 0 /* noDictID */ };
+    return ZSTD_initCStream_usingCDict_advanced(zcs, cdict, 0, fParams);
+}
+
+static size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
+                                const void* dict, size_t dictSize,
+                                ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    zcs->cdict = NULL;
+
+    if (dict && dictSize >= 8) {
+        ZSTD_freeCDict(zcs->cdictLocal);
+        zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, 0 /* copy */, params.cParams, zcs->customMem);
+        if (zcs->cdictLocal == NULL) return ERROR(memory_allocation);
+        zcs->cdict = zcs->cdictLocal;
+    }
+
+    DEBUGLOG(5, "ZSTD_initCStream_internal : dictIDFlag == %u \n", !params.fParams.noDictIDFlag);
+    return ZSTD_initCStream_stage2(zcs, params, pledgedSrcSize);
+}
+
+size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+                                 const void* dict, size_t dictSize,
+                                 ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+    CHECK_F( ZSTD_checkCParams(params.cParams) );
+    DEBUGLOG(5, "ZSTD_initCStream_advanced : dictIDFlag == %u \n", !params.fParams.noDictIDFlag);
+    return ZSTD_initCStream_internal(zcs, dict, dictSize, params, pledgedSrcSize);
 }
 
 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel)
 {
     ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize);
-    return ZSTD_initCStream_advanced(zcs, dict, dictSize, params, 0);
+    return ZSTD_initCStream_internal(zcs, dict, dictSize, params, 0);
 }
 
 size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize)
 {
     ZSTD_parameters params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0);
-    if (pledgedSrcSize) params.fParams.contentSizeFlag = 1;
-    return ZSTD_initCStream_advanced(zcs, NULL, 0, params, pledgedSrcSize);
+    params.fParams.contentSizeFlag = (pledgedSrcSize>0);
+    return ZSTD_initCStream_internal(zcs, NULL, 0, params, pledgedSrcSize);
 }
 
 size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
 {
-    return ZSTD_initCStream_usingDict(zcs, NULL, 0, compressionLevel);
+    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, 0);
+    return ZSTD_initCStream_internal(zcs, NULL, 0, params, 0);
 }
 
 size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs)
@@ -3163,7 +3375,6 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
 
     *srcSizePtr = ip - istart;
     *dstCapacityPtr = op - ostart;
-    zcs->inputProcessed += *srcSizePtr;
     if (zcs->frameEnded) return 0;
     {   size_t hintInSize = zcs->inBuffTarget - zcs->inBuffPos;
         if (hintInSize==0) hintInSize = zcs->blockSize;
@@ -3208,14 +3419,12 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
     BYTE* const oend = (BYTE*)(output->dst) + output->size;
     BYTE* op = ostart;
 
-    if ((zcs->pledgedSrcSize) && (zcs->inputProcessed != zcs->pledgedSrcSize))
-        return ERROR(srcSize_wrong);   /* pledgedSrcSize not respected */
-
     if (zcs->stage != zcss_final) {
         /* flush whatever remains */
         size_t srcSize = 0;
         size_t sizeWritten = output->size - output->pos;
-        size_t const notEnded = ZSTD_compressStream_generic(zcs, ostart, &sizeWritten, &srcSize, &srcSize, zsf_end);  /* use a valid src address instead of NULL */
+        size_t const notEnded = ZSTD_compressStream_generic(zcs, ostart, &sizeWritten,
+                                     &srcSize /* use a valid src address instead of NULL */, &srcSize, zsf_end);
         size_t const remainingToFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize;
         op += sizeWritten;
         if (remainingToFlush) {
@@ -3225,7 +3434,9 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
         /* create epilogue */
         zcs->stage = zcss_final;
         zcs->outBuffContentSize = !notEnded ? 0 :
-            ZSTD_compressEnd(zcs->cctx, zcs->outBuff, zcs->outBuffSize, NULL, 0);  /* write epilogue, including final empty block, into outBuff */
+            /* write epilogue, including final empty block, into outBuff */
+            ZSTD_compressEnd(zcs->cctx, zcs->outBuff, zcs->outBuffSize, NULL, 0);
+        if (ZSTD_isError(zcs->outBuffContentSize)) return zcs->outBuffContentSize;
     }
 
     /* flush epilogue */
@@ -3268,7 +3479,7 @@ static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEV
     { 22, 21, 21,  5,  5, 16, ZSTD_btlazy2 },  /* level 15 */
     { 23, 22, 22,  5,  5, 16, ZSTD_btlazy2 },  /* level 16 */
     { 23, 21, 22,  4,  5, 24, ZSTD_btopt   },  /* level 17 */
-    { 23, 23, 22,  6,  5, 32, ZSTD_btopt   },  /* level 18 */
+    { 23, 22, 22,  5,  4, 32, ZSTD_btopt   },  /* level 18 */
     { 23, 23, 22,  6,  3, 48, ZSTD_btopt   },  /* level 19 */
     { 25, 25, 23,  7,  3, 64, ZSTD_btopt2  },  /* level 20 */
     { 26, 26, 23,  7,  3,256, ZSTD_btopt2  },  /* level 21 */
diff --git a/lib/compress/zstd_opt.h b/lib/compress/zstd_opt.h
index ac418b61c834..54376119121d 100644
--- a/lib/compress/zstd_opt.h
+++ b/lib/compress/zstd_opt.h
@@ -175,10 +175,10 @@ MEM_STATIC void ZSTD_updatePrice(seqStore_t* seqStorePtr, U32 litLength, const B
     }
 
     /* match offset */
-	{   BYTE const offCode = (BYTE)ZSTD_highbit32(offset+1);
-		seqStorePtr->offCodeSum++;
-		seqStorePtr->offCodeFreq[offCode]++;
-	}
+    {   BYTE const offCode = (BYTE)ZSTD_highbit32(offset+1);
+        seqStorePtr->offCodeSum++;
+        seqStorePtr->offCodeFreq[offCode]++;
+    }
 
     /* match Length */
     {   const BYTE ML_deltaCode = 36;
@@ -360,6 +360,7 @@ static U32 ZSTD_BtGetAllMatches_selectMLS (
     default :
     case 4 : return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minMatchLen);
     case 5 : return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minMatchLen);
+    case 7 :
     case 6 : return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minMatchLen);
     }
 }
@@ -387,6 +388,7 @@ static U32 ZSTD_BtGetAllMatches_selectMLS_extDict (
     default :
     case 4 : return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minMatchLen);
     case 5 : return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minMatchLen);
+    case 7 :
     case 6 : return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minMatchLen);
     }
 }
diff --git a/lib/compress/zstdmt_compress.c b/lib/compress/zstdmt_compress.c
index 45514a81a3af..fc7f52a2902d 100644
--- a/lib/compress/zstdmt_compress.c
+++ b/lib/compress/zstdmt_compress.c
@@ -33,7 +33,7 @@
 #  include <stdio.h>
 #  include <unistd.h>
 #  include <sys/times.h>
-   static unsigned g_debugLevel = 3;
+   static unsigned g_debugLevel = 5;
 #  define DEBUGLOGRAW(l, ...) if (l<=g_debugLevel) { fprintf(stderr, __VA_ARGS__); }
 #  define DEBUGLOG(l, ...) if (l<=g_debugLevel) { fprintf(stderr, __FILE__ ": "); fprintf(stderr, __VA_ARGS__); fprintf(stderr, " \n"); }
 
@@ -44,26 +44,26 @@
     DEBUGLOGRAW(l, " \n");       \
 }
 
-static unsigned long long GetCurrentClockTimeMicroseconds()
+static unsigned long long GetCurrentClockTimeMicroseconds(void)
 {
    static clock_t _ticksPerSecond = 0;
    if (_ticksPerSecond <= 0) _ticksPerSecond = sysconf(_SC_CLK_TCK);
 
-   struct tms junk; clock_t newTicks = (clock_t) times(&junk);
-   return ((((unsigned long long)newTicks)*(1000000))/_ticksPerSecond);
+   { struct tms junk; clock_t newTicks = (clock_t) times(&junk);
+     return ((((unsigned long long)newTicks)*(1000000))/_ticksPerSecond); }
 }
 
 #define MUTEX_WAIT_TIME_DLEVEL 5
 #define PTHREAD_MUTEX_LOCK(mutex) \
 if (g_debugLevel>=MUTEX_WAIT_TIME_DLEVEL) { \
-   unsigned long long beforeTime = GetCurrentClockTimeMicroseconds(); \
-   pthread_mutex_lock(mutex); \
-   unsigned long long afterTime = GetCurrentClockTimeMicroseconds(); \
-   unsigned long long elapsedTime = (afterTime-beforeTime); \
-   if (elapsedTime > 1000) {  /* or whatever threshold you like; I'm using 1 millisecond here */ \
-      DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, "Thread took %llu microseconds to acquire mutex %s \n", \
+    unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds(); \
+    pthread_mutex_lock(mutex); \
+    {   unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \
+        unsigned long long const elapsedTime = (afterTime-beforeTime); \
+        if (elapsedTime > 1000) {  /* or whatever threshold you like; I'm using 1 millisecond here */ \
+            DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, "Thread took %llu microseconds to acquire mutex %s \n", \
                elapsedTime, #mutex); \
-  } \
+    }   } \
 } else pthread_mutex_lock(mutex);
 
 #else
@@ -228,17 +228,19 @@ void ZSTDMT_compressChunk(void* jobDescription)
     ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription;
     const void* const src = (const char*)job->srcStart + job->dictSize;
     buffer_t const dstBuff = job->dstBuff;
-    DEBUGLOG(3, "job (first:%u) (last:%u) : dictSize %u, srcSize %u", job->firstChunk, job->lastChunk, (U32)job->dictSize, (U32)job->srcSize);
+    DEBUGLOG(3, "job (first:%u) (last:%u) : dictSize %u, srcSize %u",
+                 job->firstChunk, job->lastChunk, (U32)job->dictSize, (U32)job->srcSize);
     if (job->cdict) {  /* should only happen for first segment */
-        size_t const initError = ZSTD_compressBegin_usingCDict(job->cctx, job->cdict, job->fullFrameSize);
+        size_t const initError = ZSTD_compressBegin_usingCDict_advanced(job->cctx, job->cdict, job->params.fParams, job->fullFrameSize);
         if (job->cdict) DEBUGLOG(3, "using CDict ");
         if (ZSTD_isError(initError)) { job->cSize = initError; goto _endJob; }
     } else {  /* srcStart points at reloaded section */
-        size_t const dictModeError = ZSTD_setCCtxParameter(job->cctx, ZSTD_p_forceRawDict, 1);  /* Force loading dictionary in "content-only" mode (no header analysis) */
-        size_t const initError = ZSTD_compressBegin_advanced(job->cctx, job->srcStart, job->dictSize, job->params, 0);
-        if (ZSTD_isError(initError) || ZSTD_isError(dictModeError)) { job->cSize = initError; goto _endJob; }
-        ZSTD_setCCtxParameter(job->cctx, ZSTD_p_forceWindow, 1);
-    }
+        if (!job->firstChunk) job->params.fParams.contentSizeFlag = 0;  /* ensure no srcSize control */
+        {   size_t const dictModeError = ZSTD_setCCtxParameter(job->cctx, ZSTD_p_forceRawDict, 1);  /* Force loading dictionary in "content-only" mode (no header analysis) */
+            size_t const initError = ZSTD_compressBegin_advanced(job->cctx, job->srcStart, job->dictSize, job->params, job->fullFrameSize);
+            if (ZSTD_isError(initError) || ZSTD_isError(dictModeError)) { job->cSize = initError; goto _endJob; }
+            ZSTD_setCCtxParameter(job->cctx, ZSTD_p_forceWindow, 1);
+    }   }
     if (!job->firstChunk) {  /* flush and overwrite frame header when it's not first segment */
         size_t const hSize = ZSTD_compressContinue(job->cctx, dstBuff.start, dstBuff.size, src, 0);
         if (ZSTD_isError(hSize)) { job->cSize = hSize; goto _endJob; }
@@ -250,7 +252,9 @@ void ZSTDMT_compressChunk(void* jobDescription)
     job->cSize = (job->lastChunk) ?
                  ZSTD_compressEnd     (job->cctx, dstBuff.start, dstBuff.size, src, job->srcSize) :
                  ZSTD_compressContinue(job->cctx, dstBuff.start, dstBuff.size, src, job->srcSize);
-    DEBUGLOG(3, "compressed %u bytes into %u bytes   (first:%u) (last:%u)", (unsigned)job->srcSize, (unsigned)job->cSize, job->firstChunk, job->lastChunk);
+    DEBUGLOG(3, "compressed %u bytes into %u bytes   (first:%u) (last:%u)",
+                (unsigned)job->srcSize, (unsigned)job->cSize, job->firstChunk, job->lastChunk);
+    DEBUGLOG(5, "dstBuff.size : %u ; => %s", (U32)dstBuff.size, ZSTD_getErrorName(job->cSize));
 
 _endJob:
     PTHREAD_MUTEX_LOCK(job->jobCompleted_mutex);
@@ -388,14 +392,17 @@ size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
                            int compressionLevel)
 {
     ZSTD_parameters params = ZSTD_getParams(compressionLevel, srcSize, 0);
+    U32 const overlapLog = (compressionLevel >= ZSTD_maxCLevel()) ? 0 : 3;
+    size_t const overlapSize = (size_t)1 << (params.cParams.windowLog - overlapLog);
     size_t const chunkTargetSize = (size_t)1 << (params.cParams.windowLog + 2);
-    unsigned const nbChunksMax = (unsigned)(srcSize / chunkTargetSize) + (srcSize < chunkTargetSize) /* min 1 */;
+    unsigned const nbChunksMax = (unsigned)(srcSize / chunkTargetSize) + 1;
     unsigned nbChunks = MIN(nbChunksMax, mtctx->nbThreads);
     size_t const proposedChunkSize = (srcSize + (nbChunks-1)) / nbChunks;
     size_t const avgChunkSize = ((proposedChunkSize & 0x1FFFF) < 0xFFFF) ? proposedChunkSize + 0xFFFF : proposedChunkSize;   /* avoid too small last block */
     size_t remainingSrcSize = srcSize;
     const char* const srcStart = (const char*)src;
-    size_t frameStartPos = 0;
+    unsigned const compressWithinDst = (dstCapacity >= ZSTD_compressBound(srcSize)) ? nbChunks : (unsigned)(dstCapacity / ZSTD_compressBound(avgChunkSize));  /* presumes avgChunkSize >= 256 KB, which should be the case */
+    size_t frameStartPos = 0, dstBufferPos = 0;
 
     DEBUGLOG(3, "windowLog : %2u => chunkTargetSize : %u bytes  ", params.cParams.windowLog, (U32)chunkTargetSize);
     DEBUGLOG(2, "nbChunks  : %2u   (chunkSize : %u bytes)   ", nbChunks, (U32)avgChunkSize);
@@ -409,10 +416,11 @@ size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
     {   unsigned u;
         for (u=0; u<nbChunks; u++) {
             size_t const chunkSize = MIN(remainingSrcSize, avgChunkSize);
-            size_t const dstBufferCapacity = u ? ZSTD_compressBound(chunkSize) : dstCapacity;
-            buffer_t const dstAsBuffer = { dst, dstCapacity };
-            buffer_t const dstBuffer = u ? ZSTDMT_getBuffer(mtctx->buffPool, dstBufferCapacity) : dstAsBuffer;
+            size_t const dstBufferCapacity = ZSTD_compressBound(chunkSize);
+            buffer_t const dstAsBuffer = { (char*)dst + dstBufferPos, dstBufferCapacity };
+            buffer_t const dstBuffer = u < compressWithinDst ? dstAsBuffer : ZSTDMT_getBuffer(mtctx->buffPool, dstBufferCapacity);
             ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(mtctx->cctxPool);
+            size_t dictSize = u ? overlapSize : 0;
 
             if ((cctx==NULL) || (dstBuffer.start==NULL)) {
                 mtctx->jobs[u].cSize = ERROR(memory_allocation);   /* job result */
@@ -421,7 +429,8 @@ size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
                 break;   /* let's wait for previous jobs to complete, but don't start new ones */
             }
 
-            mtctx->jobs[u].srcStart = srcStart + frameStartPos;
+            mtctx->jobs[u].srcStart = srcStart + frameStartPos - dictSize;
+            mtctx->jobs[u].dictSize = dictSize;
             mtctx->jobs[u].srcSize = chunkSize;
             mtctx->jobs[u].fullFrameSize = srcSize;
             mtctx->jobs[u].params = params;
@@ -438,6 +447,7 @@ size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
             POOL_add(mtctx->factory, ZSTDMT_compressChunk, &mtctx->jobs[u]);
 
             frameStartPos += chunkSize;
+            dstBufferPos += dstBufferCapacity;
             remainingSrcSize -= chunkSize;
     }   }
     /* note : since nbChunks <= nbThreads, all jobs should be running immediately in parallel */
@@ -461,8 +471,10 @@ size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
                 if (ZSTD_isError(cSize)) error = cSize;
                 if ((!error) && (dstPos + cSize > dstCapacity)) error = ERROR(dstSize_tooSmall);
                 if (chunkID) {   /* note : chunk 0 is already written directly into dst */
-                    if (!error) memcpy((char*)dst + dstPos, mtctx->jobs[chunkID].dstBuff.start, cSize);
-                    ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->jobs[chunkID].dstBuff);
+                    if (!error)
+                        memmove((char*)dst + dstPos, mtctx->jobs[chunkID].dstBuff.start, cSize);  /* may overlap if chunk decompressed within dst */
+                    if (chunkID >= compressWithinDst)   /* otherwise, it decompresses within dst */
+                        ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->jobs[chunkID].dstBuff);
                     mtctx->jobs[chunkID].dstBuff = g_nullBuffer;
                 }
                 dstPos += cSize ;
@@ -509,7 +521,7 @@ static size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs,
     if (updateDict) {
         ZSTD_freeCDict(zcs->cdict); zcs->cdict = NULL;
         if (dict && dictSize) {
-            zcs->cdict = ZSTD_createCDict_advanced(dict, dictSize, 0, params, cmem);
+            zcs->cdict = ZSTD_createCDict_advanced(dict, dictSize, 0, params.cParams, cmem);
             if (zcs->cdict == NULL) return ERROR(memory_allocation);
     }   }
     zcs->frameContentSize = pledgedSrcSize;
diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c
index 2aaa4a3df3c5..910f9ab783c3 100644
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@@ -177,30 +177,6 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
     memcpy(dstDCtx, srcDCtx, sizeof(ZSTD_DCtx) - workSpaceSize);  /* no need to copy workspace */
 }
 
-#if 0
-/* deprecated */
-static void ZSTD_refDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
-{
-    ZSTD_decompressBegin(dstDCtx);  /* init */
-    if (srcDCtx) {   /* support refDCtx on NULL */
-        dstDCtx->dictEnd = srcDCtx->dictEnd;
-        dstDCtx->vBase = srcDCtx->vBase;
-        dstDCtx->base = srcDCtx->base;
-        dstDCtx->previousDstEnd = srcDCtx->previousDstEnd;
-        dstDCtx->dictID = srcDCtx->dictID;
-        dstDCtx->litEntropy = srcDCtx->litEntropy;
-        dstDCtx->fseEntropy = srcDCtx->fseEntropy;
-        dstDCtx->LLTptr = srcDCtx->entropy.LLTable;
-        dstDCtx->MLTptr = srcDCtx->entropy.MLTable;
-        dstDCtx->OFTptr = srcDCtx->entropy.OFTable;
-        dstDCtx->HUFptr = srcDCtx->entropy.hufTable;
-        dstDCtx->entropy.rep[0] = srcDCtx->entropy.rep[0];
-        dstDCtx->entropy.rep[1] = srcDCtx->entropy.rep[1];
-        dstDCtx->entropy.rep[2] = srcDCtx->entropy.rep[2];
-    }
-}
-#endif
-
 static void ZSTD_refDDict(ZSTD_DCtx* dstDCtx, const ZSTD_DDict* ddict);
 
 
@@ -431,7 +407,8 @@ typedef struct
 
 /*! ZSTD_getcBlockSize() :
 *   Provides the size of compressed block from block header `src` */
-size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+                          blockProperties_t* bpPtr)
 {
     if (srcSize < ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
     {   U32 const cBlockHeader = MEM_readLE24(src);
@@ -446,7 +423,8 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bp
 }
 
 
-static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity,
+                          const void* src, size_t srcSize)
 {
     if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall);
     memcpy(dst, src, srcSize);
@@ -454,7 +432,9 @@ static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, const void* src,
 }
 
 
-static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, size_t regenSize)
+static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                               size_t regenSize)
 {
     if (srcSize != 1) return ERROR(srcSize_wrong);
     if (regenSize > dstCapacity) return ERROR(dstSize_tooSmall);
@@ -595,176 +575,70 @@ typedef union {
     U32 alignedBy4;
 } FSE_decode_t4;
 
+/* Default FSE distribution table for Literal Lengths */
 static const FSE_decode_t4 LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {
     { { LL_DEFAULTNORMLOG, 1, 1 } }, /* header : tableLog, fastMode, fastMode */
-    { {  0,  0,  4 } },              /* 0 : base, symbol, bits */
-    { { 16,  0,  4 } },
-    { { 32,  1,  5 } },
-    { {  0,  3,  5 } },
-    { {  0,  4,  5 } },
-    { {  0,  6,  5 } },
-    { {  0,  7,  5 } },
-    { {  0,  9,  5 } },
-    { {  0, 10,  5 } },
-    { {  0, 12,  5 } },
-    { {  0, 14,  6 } },
-    { {  0, 16,  5 } },
-    { {  0, 18,  5 } },
-    { {  0, 19,  5 } },
-    { {  0, 21,  5 } },
-    { {  0, 22,  5 } },
-    { {  0, 24,  5 } },
-    { { 32, 25,  5 } },
-    { {  0, 26,  5 } },
-    { {  0, 27,  6 } },
-    { {  0, 29,  6 } },
-    { {  0, 31,  6 } },
-    { { 32,  0,  4 } },
-    { {  0,  1,  4 } },
-    { {  0,  2,  5 } },
-    { { 32,  4,  5 } },
-    { {  0,  5,  5 } },
-    { { 32,  7,  5 } },
-    { {  0,  8,  5 } },
-    { { 32, 10,  5 } },
-    { {  0, 11,  5 } },
-    { {  0, 13,  6 } },
-    { { 32, 16,  5 } },
-    { {  0, 17,  5 } },
-    { { 32, 19,  5 } },
-    { {  0, 20,  5 } },
-    { { 32, 22,  5 } },
-    { {  0, 23,  5 } },
-    { {  0, 25,  4 } },
-    { { 16, 25,  4 } },
-    { { 32, 26,  5 } },
-    { {  0, 28,  6 } },
-    { {  0, 30,  6 } },
-    { { 48,  0,  4 } },
-    { { 16,  1,  4 } },
-    { { 32,  2,  5 } },
-    { { 32,  3,  5 } },
-    { { 32,  5,  5 } },
-    { { 32,  6,  5 } },
-    { { 32,  8,  5 } },
-    { { 32,  9,  5 } },
-    { { 32, 11,  5 } },
-    { { 32, 12,  5 } },
-    { {  0, 15,  6 } },
-    { { 32, 17,  5 } },
-    { { 32, 18,  5 } },
-    { { 32, 20,  5 } },
-    { { 32, 21,  5 } },
-    { { 32, 23,  5 } },
-    { { 32, 24,  5 } },
-    { {  0, 35,  6 } },
-    { {  0, 34,  6 } },
-    { {  0, 33,  6 } },
-    { {  0, 32,  6 } },
+     /* base, symbol, bits */
+    { {  0,  0,  4 } }, { { 16,  0,  4 } }, { { 32,  1,  5 } }, { {  0,  3,  5 } },
+    { {  0,  4,  5 } }, { {  0,  6,  5 } }, { {  0,  7,  5 } }, { {  0,  9,  5 } },
+    { {  0, 10,  5 } }, { {  0, 12,  5 } }, { {  0, 14,  6 } }, { {  0, 16,  5 } },
+    { {  0, 18,  5 } }, { {  0, 19,  5 } }, { {  0, 21,  5 } }, { {  0, 22,  5 } },
+    { {  0, 24,  5 } }, { { 32, 25,  5 } }, { {  0, 26,  5 } }, { {  0, 27,  6 } },
+    { {  0, 29,  6 } }, { {  0, 31,  6 } }, { { 32,  0,  4 } }, { {  0,  1,  4 } },
+    { {  0,  2,  5 } }, { { 32,  4,  5 } }, { {  0,  5,  5 } }, { { 32,  7,  5 } },
+    { {  0,  8,  5 } }, { { 32, 10,  5 } }, { {  0, 11,  5 } }, { {  0, 13,  6 } },
+    { { 32, 16,  5 } }, { {  0, 17,  5 } }, { { 32, 19,  5 } }, { {  0, 20,  5 } },
+    { { 32, 22,  5 } }, { {  0, 23,  5 } }, { {  0, 25,  4 } }, { { 16, 25,  4 } },
+    { { 32, 26,  5 } }, { {  0, 28,  6 } }, { {  0, 30,  6 } }, { { 48,  0,  4 } },
+    { { 16,  1,  4 } }, { { 32,  2,  5 } }, { { 32,  3,  5 } }, { { 32,  5,  5 } },
+    { { 32,  6,  5 } }, { { 32,  8,  5 } }, { { 32,  9,  5 } }, { { 32, 11,  5 } },
+    { { 32, 12,  5 } }, { {  0, 15,  6 } }, { { 32, 17,  5 } }, { { 32, 18,  5 } },
+    { { 32, 20,  5 } }, { { 32, 21,  5 } }, { { 32, 23,  5 } }, { { 32, 24,  5 } },
+    { {  0, 35,  6 } }, { {  0, 34,  6 } }, { {  0, 33,  6 } }, { {  0, 32,  6 } },
 };   /* LL_defaultDTable */
 
+/* Default FSE distribution table for Match Lengths */
 static const FSE_decode_t4 ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
     { { ML_DEFAULTNORMLOG, 1, 1 } }, /* header : tableLog, fastMode, fastMode */
-    { {  0,  0,  6 } },              /* 0 : base, symbol, bits */
-    { {  0,  1,  4 } },
-    { { 32,  2,  5 } },
-    { {  0,  3,  5 } },
-    { {  0,  5,  5 } },
-    { {  0,  6,  5 } },
-    { {  0,  8,  5 } },
-    { {  0, 10,  6 } },
-    { {  0, 13,  6 } },
-    { {  0, 16,  6 } },
-    { {  0, 19,  6 } },
-    { {  0, 22,  6 } },
-    { {  0, 25,  6 } },
-    { {  0, 28,  6 } },
-    { {  0, 31,  6 } },
-    { {  0, 33,  6 } },
-    { {  0, 35,  6 } },
-    { {  0, 37,  6 } },
-    { {  0, 39,  6 } },
-    { {  0, 41,  6 } },
-    { {  0, 43,  6 } },
-    { {  0, 45,  6 } },
-    { { 16,  1,  4 } },
-    { {  0,  2,  4 } },
-    { { 32,  3,  5 } },
-    { {  0,  4,  5 } },
-    { { 32,  6,  5 } },
-    { {  0,  7,  5 } },
-    { {  0,  9,  6 } },
-    { {  0, 12,  6 } },
-    { {  0, 15,  6 } },
-    { {  0, 18,  6 } },
-    { {  0, 21,  6 } },
-    { {  0, 24,  6 } },
-    { {  0, 27,  6 } },
-    { {  0, 30,  6 } },
-    { {  0, 32,  6 } },
-    { {  0, 34,  6 } },
-    { {  0, 36,  6 } },
-    { {  0, 38,  6 } },
-    { {  0, 40,  6 } },
-    { {  0, 42,  6 } },
-    { {  0, 44,  6 } },
-    { { 32,  1,  4 } },
-    { { 48,  1,  4 } },
-    { { 16,  2,  4 } },
-    { { 32,  4,  5 } },
-    { { 32,  5,  5 } },
-    { { 32,  7,  5 } },
-    { { 32,  8,  5 } },
-    { {  0, 11,  6 } },
-    { {  0, 14,  6 } },
-    { {  0, 17,  6 } },
-    { {  0, 20,  6 } },
-    { {  0, 23,  6 } },
-    { {  0, 26,  6 } },
-    { {  0, 29,  6 } },
-    { {  0, 52,  6 } },
-    { {  0, 51,  6 } },
-    { {  0, 50,  6 } },
-    { {  0, 49,  6 } },
-    { {  0, 48,  6 } },
-    { {  0, 47,  6 } },
-    { {  0, 46,  6 } },
+    /* base, symbol, bits */
+    { {  0,  0,  6 } }, { {  0,  1,  4 } }, { { 32,  2,  5 } }, { {  0,  3,  5 } },
+    { {  0,  5,  5 } }, { {  0,  6,  5 } }, { {  0,  8,  5 } }, { {  0, 10,  6 } },
+    { {  0, 13,  6 } }, { {  0, 16,  6 } }, { {  0, 19,  6 } }, { {  0, 22,  6 } },
+    { {  0, 25,  6 } }, { {  0, 28,  6 } }, { {  0, 31,  6 } }, { {  0, 33,  6 } },
+    { {  0, 35,  6 } }, { {  0, 37,  6 } }, { {  0, 39,  6 } }, { {  0, 41,  6 } },
+    { {  0, 43,  6 } }, { {  0, 45,  6 } }, { { 16,  1,  4 } }, { {  0,  2,  4 } },
+    { { 32,  3,  5 } }, { {  0,  4,  5 } }, { { 32,  6,  5 } }, { {  0,  7,  5 } },
+    { {  0,  9,  6 } }, { {  0, 12,  6 } }, { {  0, 15,  6 } }, { {  0, 18,  6 } },
+    { {  0, 21,  6 } }, { {  0, 24,  6 } }, { {  0, 27,  6 } }, { {  0, 30,  6 } },
+    { {  0, 32,  6 } }, { {  0, 34,  6 } }, { {  0, 36,  6 } }, { {  0, 38,  6 } },
+    { {  0, 40,  6 } }, { {  0, 42,  6 } }, { {  0, 44,  6 } }, { { 32,  1,  4 } },
+    { { 48,  1,  4 } }, { { 16,  2,  4 } }, { { 32,  4,  5 } }, { { 32,  5,  5 } },
+    { { 32,  7,  5 } }, { { 32,  8,  5 } }, { {  0, 11,  6 } }, { {  0, 14,  6 } },
+    { {  0, 17,  6 } }, { {  0, 20,  6 } }, { {  0, 23,  6 } }, { {  0, 26,  6 } },
+    { {  0, 29,  6 } }, { {  0, 52,  6 } }, { {  0, 51,  6 } }, { {  0, 50,  6 } },
+    { {  0, 49,  6 } }, { {  0, 48,  6 } }, { {  0, 47,  6 } }, { {  0, 46,  6 } },
 };   /* ML_defaultDTable */
 
+/* Default FSE distribution table for Offset Codes */
 static const FSE_decode_t4 OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {
     { { OF_DEFAULTNORMLOG, 1, 1 } }, /* header : tableLog, fastMode, fastMode */
-    { {  0,  0,  5 } },              /* 0 : base, symbol, bits */
-    { {  0,  6,  4 } },
-    { {  0,  9,  5 } },
-    { {  0, 15,  5 } },
-    { {  0, 21,  5 } },
-    { {  0,  3,  5 } },
-    { {  0,  7,  4 } },
-    { {  0, 12,  5 } },
-    { {  0, 18,  5 } },
-    { {  0, 23,  5 } },
-    { {  0,  5,  5 } },
-    { {  0,  8,  4 } },
-    { {  0, 14,  5 } },
-    { {  0, 20,  5 } },
-    { {  0,  2,  5 } },
-    { { 16,  7,  4 } },
-    { {  0, 11,  5 } },
-    { {  0, 17,  5 } },
-    { {  0, 22,  5 } },
-    { {  0,  4,  5 } },
-    { { 16,  8,  4 } },
-    { {  0, 13,  5 } },
-    { {  0, 19,  5 } },
-    { {  0,  1,  5 } },
-    { { 16,  6,  4 } },
-    { {  0, 10,  5 } },
-    { {  0, 16,  5 } },
-    { {  0, 28,  5 } },
-    { {  0, 27,  5 } },
-    { {  0, 26,  5 } },
-    { {  0, 25,  5 } },
-    { {  0, 24,  5 } },
+    /* base, symbol, bits */
+    { {  0,  0,  5 } }, { {  0,  6,  4 } },
+    { {  0,  9,  5 } }, { {  0, 15,  5 } },
+    { {  0, 21,  5 } }, { {  0,  3,  5 } },
+    { {  0,  7,  4 } }, { {  0, 12,  5 } },
+    { {  0, 18,  5 } }, { {  0, 23,  5 } },
+    { {  0,  5,  5 } }, { {  0,  8,  4 } },
+    { {  0, 14,  5 } }, { {  0, 20,  5 } },
+    { {  0,  2,  5 } }, { { 16,  7,  4 } },
+    { {  0, 11,  5 } }, { {  0, 17,  5 } },
+    { {  0, 22,  5 } }, { {  0,  4,  5 } },
+    { { 16,  8,  4 } }, { {  0, 13,  5 } },
+    { {  0, 19,  5 } }, { {  0,  1,  5 } },
+    { { 16,  6,  4 } }, { {  0, 10,  5 } },
+    { {  0, 16,  5 } }, { {  0, 28,  5 } },
+    { {  0, 27,  5 } }, { {  0, 26,  5 } },
+    { {  0, 25,  5 } }, { {  0, 24,  5 } },
 };   /* OF_defaultDTable */
 
 /*! ZSTD_buildSeqTable() :
@@ -927,8 +801,6 @@ size_t ZSTD_execSequenceLast7(BYTE* op,
 }
 
 
-
-
 static seq_t ZSTD_decodeSequence(seqState_t* seqState)
 {
     seq_t seq;
@@ -943,21 +815,26 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState)
     U32 const totalBits = llBits+mlBits+ofBits;
 
     static const U32 LL_base[MaxLL+1] = {
-                             0,  1,  2,  3,  4,  5,  6,  7,  8,  9,   10,    11,    12,    13,    14,     15,
-                            16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                             0,    1,    2,     3,     4,     5,     6,      7,
+                             8,    9,   10,    11,    12,    13,    14,     15,
+                            16,   18,   20,    22,    24,    28,    32,     40,
+                            48,   64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
                             0x2000, 0x4000, 0x8000, 0x10000 };
 
     static const U32 ML_base[MaxML+1] = {
-                             3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,   14,    15,    16,    17,    18,
-                            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,   30,    31,    32,    33,    34,
-                            35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+                             3,  4,  5,    6,     7,     8,     9,    10,
+                            11, 12, 13,   14,    15,    16,    17,    18,
+                            19, 20, 21,   22,    23,    24,    25,    26,
+                            27, 28, 29,   30,    31,    32,    33,    34,
+                            35, 37, 39,   41,    43,    47,    51,    59,
+                            67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
                             0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
 
     static const U32 OF_base[MaxOff+1] = {
-                             0,        1,       1,       5,     0xD,     0x1D,     0x3D,     0x7D,
-                             0xFD,   0x1FD,   0x3FD,   0x7FD,   0xFFD,   0x1FFD,   0x3FFD,   0x7FFD,
-                             0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
-                             0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD };
+                     0,        1,       1,       5,     0xD,     0x1D,     0x3D,     0x7D,
+                     0xFD,   0x1FD,   0x3FD,   0x7FD,   0xFFD,   0x1FFD,   0x3FFD,   0x7FFD,
+                     0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+                     0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD };
 
     /* sequence */
     {   size_t offset;
@@ -1031,7 +908,7 @@ size_t ZSTD_execSequence(BYTE* op,
 
     /* copy Match */
     if (sequence.offset > (size_t)(oLitEnd - base)) {
-        /* offset beyond prefix */
+        /* offset beyond prefix -> go into extDict */
         if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected);
         match = dictEnd + (match - base);
         if (match + sequence.matchLength <= dictEnd) {
@@ -1156,21 +1033,26 @@ FORCE_INLINE seq_t ZSTD_decodeSequenceLong_generic(seqState_t* seqState, int con
     U32 const totalBits = llBits+mlBits+ofBits;
 
     static const U32 LL_base[MaxLL+1] = {
-                             0,  1,  2,  3,  4,  5,  6,  7,  8,  9,   10,    11,    12,    13,    14,     15,
-                            16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                             0,  1,    2,     3,     4,     5,     6,      7,
+                             8,  9,   10,    11,    12,    13,    14,     15,
+                            16, 18,   20,    22,    24,    28,    32,     40,
+                            48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
                             0x2000, 0x4000, 0x8000, 0x10000 };
 
     static const U32 ML_base[MaxML+1] = {
-                             3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,   14,    15,    16,    17,    18,
-                            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,   30,    31,    32,    33,    34,
-                            35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+                             3,  4,  5,    6,     7,     8,     9,    10,
+                            11, 12, 13,   14,    15,    16,    17,    18,
+                            19, 20, 21,   22,    23,    24,    25,    26,
+                            27, 28, 29,   30,    31,    32,    33,    34,
+                            35, 37, 39,   41,    43,    47,    51,    59,
+                            67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
                             0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
 
     static const U32 OF_base[MaxOff+1] = {
-                             0,        1,       1,       5,     0xD,     0x1D,     0x3D,     0x7D,
-                             0xFD,   0x1FD,   0x3FD,   0x7FD,   0xFFD,   0x1FFD,   0x3FFD,   0x7FFD,
-                             0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
-                             0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD };
+                     0,        1,       1,       5,     0xD,     0x1D,     0x3D,     0x7D,
+                     0xFD,   0x1FD,   0x3FD,   0x7FD,   0xFFD,   0x1FFD,   0x3FFD,   0x7FFD,
+                     0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+                     0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD };
 
     /* sequence */
     {   size_t offset;
@@ -1476,7 +1358,7 @@ size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
     if (ZSTD_isLegacy(src, srcSize)) return ZSTD_findFrameCompressedSizeLegacy(src, srcSize);
 #endif
     if (srcSize >= ZSTD_skippableHeaderSize &&
-            (MEM_readLE32(src) & 0xFFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+            (MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
         return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + 4);
     } else {
         const BYTE* ip = (const BYTE*)src;
@@ -2115,15 +1997,18 @@ unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
 }
 
 /*! ZSTD_getDictID_fromFrame() :
- *  Provides the dictID required to decompressed the frame stored within `src`.
+ *  Provides the dictID required to decompresse frame stored within `src`.
  *  If @return == 0, the dictID could not be decoded.
  *  This could for one of the following reasons :
- *  - The frame does not require a dictionary to be decoded (most common case).
- *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information.
+ *  - The frame does not require a dictionary (most common case).
+ *  - The frame was built with dictID intentionally removed.
+ *    Needed dictionary is a hidden information.
  *    Note : this use case also happens when using a non-conformant dictionary.
- *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ *  - `srcSize` is too small, and as a result, frame header could not be decoded.
+ *    Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
  *  - This is not a Zstandard frame.
- *  When identifying the exact failure cause, it's possible to used ZSTD_getFrameParams(), which will provide a more precise error code. */
+ *  When identifying the exact failure cause, it's possible to use
+ *  ZSTD_getFrameParams(), which will provide a more precise error code. */
 unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
 {
     ZSTD_frameParams zfp = { 0 , 0 , 0 , 0 };
@@ -2209,9 +2094,13 @@ size_t ZSTD_freeDStream(ZSTD_DStream* zds)
     if (zds==NULL) return 0;   /* support free on null */
     {   ZSTD_customMem const cMem = zds->customMem;
         ZSTD_freeDCtx(zds->dctx);
+        zds->dctx = NULL;
         ZSTD_freeDDict(zds->ddictLocal);
+        zds->ddictLocal = NULL;
         ZSTD_free(zds->inBuff, cMem);
+        zds->inBuff = NULL;
         ZSTD_free(zds->outBuff, cMem);
+        zds->outBuff = NULL;
 #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
         if (zds->legacyContext)
             ZSTD_freeLegacyStreamContext(zds->legacyContext, zds->previousLegacyVersion);
@@ -2247,7 +2136,9 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds)
     return ZSTD_initDStream_usingDict(zds, NULL, 0);
 }
 
-size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict)  /**< note : ddict will just be referenced, and must outlive decompression session */
+/* ZSTD_initDStream_usingDDict() :
+ * ddict will just be referenced, and must outlive decompression session */
+size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict)
 {
     size_t const initResult = ZSTD_initDStream(zds);
     zds->ddict = ddict;
@@ -2277,8 +2168,11 @@ size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds,
 
 size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds)
 {
-    if (zds==NULL) return 0;   /* support sizeof on NULL */
-    return sizeof(*zds) + ZSTD_sizeof_DCtx(zds->dctx) + ZSTD_sizeof_DDict(zds->ddictLocal) + zds->inBuffSize + zds->outBuffSize;
+    if (zds==NULL) return 0;   /* support sizeof NULL */
+    return sizeof(*zds)
+           + ZSTD_sizeof_DCtx(zds->dctx)
+           + ZSTD_sizeof_DDict(zds->ddictLocal)
+           + zds->inBuffSize + zds->outBuffSize;
 }
 
 
@@ -2376,15 +2270,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 zds->blockSize = blockSize;
                 if (zds->inBuffSize < blockSize) {
                     ZSTD_free(zds->inBuff, zds->customMem);
-                    zds->inBuffSize = blockSize;
+                    zds->inBuffSize = 0;
                     zds->inBuff = (char*)ZSTD_malloc(blockSize, zds->customMem);
                     if (zds->inBuff == NULL) return ERROR(memory_allocation);
+                    zds->inBuffSize = blockSize;
                 }
                 if (zds->outBuffSize < neededOutSize) {
                     ZSTD_free(zds->outBuff, zds->customMem);
-                    zds->outBuffSize = neededOutSize;
+                    zds->outBuffSize = 0;
                     zds->outBuff = (char*)ZSTD_malloc(neededOutSize, zds->customMem);
                     if (zds->outBuff == NULL) return ERROR(memory_allocation);
+                    zds->outBuffSize = neededOutSize;
             }   }
             zds->stage = zdss_read;
             /* pass-through */
diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c
index 3a7b9f39f9fd..1863c8f34542 100644
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@@ -59,8 +59,6 @@ static int g_displayLevel = 2;
     if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
       g_time = clock();                                                        \
       DISPLAY(__VA_ARGS__);                                                    \
-      if (displayLevel >= 4)                                                   \
-        fflush(stdout);                                                        \
     }                                                                          \
   }
 #define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
@@ -236,10 +234,22 @@ static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
  * Returns 1 if the dmer at lp is greater than the dmer at rp.
  */
 static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
-  const U32 lhs = *(const U32 *)lp;
-  const U32 rhs = *(const U32 *)rp;
+  U32 const lhs = *(U32 const *)lp;
+  U32 const rhs = *(U32 const *)rp;
   return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
 }
+/**
+ * Faster version for d <= 8.
+ */
+static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
+  U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
+  U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;
+  U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;
+  if (lhs < rhs) {
+    return -1;
+  }
+  return (lhs > rhs);
+}
 
 /**
  * Same as COVER_cmp() except ties are broken by pointer value
@@ -253,6 +263,16 @@ static int COVER_strict_cmp(const void *lp, const void *rp) {
   }
   return result;
 }
+/**
+ * Faster version for d <= 8.
+ */
+static int COVER_strict_cmp8(const void *lp, const void *rp) {
+  int result = COVER_cmp8(g_ctx, lp, rp);
+  if (result == 0) {
+    result = lp < rp ? -1 : 1;
+  }
+  return result;
+}
 
 /**
  * Returns the first pointer in [first, last) whose element does not compare
@@ -508,7 +528,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
   const BYTE *const samples = (const BYTE *)samplesBuffer;
   const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
   /* Checks */
-  if (totalSamplesSize < d ||
+  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
       totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
     DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n",
                  (COVER_MAX_SAMPLES_SIZE >> 20));
@@ -522,7 +542,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
   ctx->samplesSizes = samplesSizes;
   ctx->nbSamples = nbSamples;
   /* Partial suffix array */
-  ctx->suffixSize = totalSamplesSize - d + 1;
+  ctx->suffixSize = totalSamplesSize - MAX(d, sizeof(U64)) + 1;
   ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
   /* Maps index to the dmerID */
   ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
@@ -556,7 +576,8 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
     }
     /* qsort doesn't take an opaque pointer, so pass as a global */
     g_ctx = ctx;
-    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), &COVER_strict_cmp);
+    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
+          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
   }
   DISPLAYLEVEL(2, "Computing frequencies\n");
   /* For each dmer group (group of positions with the same first d bytes):
@@ -566,8 +587,8 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
    * 2. We calculate how many samples the dmer occurs in and save it in
    *    freqs[dmerId].
    */
-  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx, &COVER_cmp,
-                &COVER_group);
+  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
+                (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
   ctx->freqs = ctx->suffix;
   ctx->suffix = NULL;
   return 1;
@@ -918,10 +939,10 @@ ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
   /* constants */
   const unsigned nbThreads = parameters->nbThreads;
   const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
-  const unsigned kMaxD = parameters->d == 0 ? 16 : parameters->d;
-  const unsigned kMinK = parameters->k == 0 ? kMaxD : parameters->k;
-  const unsigned kMaxK = parameters->k == 0 ? 2048 : parameters->k;
-  const unsigned kSteps = parameters->steps == 0 ? 32 : parameters->steps;
+  const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
+  const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
+  const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
+  const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
   const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
   const unsigned kIterations =
       (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c
index 0757dbbbb643..179e02effa4d 100644
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@@ -11,8 +11,9 @@
 /*-**************************************
 *  Tuning parameters
 ****************************************/
+#define MINRATIO 4   /* minimum nb of apparition to be selected in dictionary */
 #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
-#define ZDICT_MIN_SAMPLES_SIZE 512
+#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
 
 
 /*-**************************************
@@ -59,11 +60,8 @@
 
 #define NOISELENGTH 32
 
-#define MINRATIO 4
 static const int g_compressionLevel_default = 6;
 static const U32 g_selectivity_default = 9;
-static const size_t g_provision_entropySize = 200;
-static const size_t g_min_fast_dictContent = 192;
 
 
 /*-*************************************
@@ -308,10 +306,10 @@ static dictItem ZDICT_analyzePos(
         /* look backward */
         length = MINMATCHLENGTH;
         while ((length >= MINMATCHLENGTH) & (start > 0)) {
-        	length = ZDICT_count(b + pos, b + suffix[start - 1]);
-        	if (length >= LLIMIT) length = LLIMIT - 1;
-        	lengthList[length]++;
-        	if (length >= MINMATCHLENGTH) start--;
+            length = ZDICT_count(b + pos, b + suffix[start - 1]);
+            if (length >= LLIMIT) length = LLIMIT - 1;
+            lengthList[length]++;
+            if (length >= MINMATCHLENGTH) start--;
         }
 
         /* largest useful length */
@@ -363,21 +361,35 @@ static dictItem ZDICT_analyzePos(
 }
 
 
+static int isIncluded(const void* in, const void* container, size_t length)
+{
+    const char* const ip = (const char*) in;
+    const char* const into = (const char*) container;
+    size_t u;
+
+    for (u=0; u<length; u++) {  /* works because end of buffer is a noisy guard band */
+        if (ip[u] != into[u]) break;
+    }
+
+    return u==length;
+}
+
 /*! ZDICT_checkMerge
     check if dictItem can be merged, do it if possible
     @return : id of destination elt, 0 if not merged
 */
-static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
+static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
 {
     const U32 tableSize = table->pos;
     const U32 eltEnd = elt.pos + elt.length;
+    const char* const buf = (const char*) buffer;
 
     /* tail overlap */
     U32 u; for (u=1; u<tableSize; u++) {
         if (u==eltNbToSkip) continue;
         if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) {  /* overlap, existing > new */
             /* append */
-            U32 addedLength = table[u].pos - elt.pos;
+            U32 const addedLength = table[u].pos - elt.pos;
             table[u].length += addedLength;
             table[u].pos = elt.pos;
             table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
@@ -393,9 +405,10 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
     /* front overlap */
     for (u=1; u<tableSize; u++) {
         if (u==eltNbToSkip) continue;
+
         if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) {  /* overlap, existing < new */
             /* append */
-            int addedLength = (int)eltEnd - (table[u].pos + table[u].length);
+            int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
             table[u].savings += elt.length / 8;    /* rough approx bonus */
             if (addedLength > 0) {   /* otherwise, elt fully included into existing */
                 table[u].length += addedLength;
@@ -407,7 +420,18 @@ static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
                 table[u] = table[u-1], u--;
             table[u] = elt;
             return u;
-    }   }
+        }
+
+        if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
+            if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
+                size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
+                table[u].pos = elt.pos;
+                table[u].savings += (U32)(elt.savings * addedLength / elt.length);
+                table[u].length = MIN(elt.length, table[u].length + 1);
+                return u;
+            }
+        }
+    }
 
     return 0;
 }
@@ -425,14 +449,14 @@ static void ZDICT_removeDictItem(dictItem* table, U32 id)
 }
 
 
-static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
+static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
 {
     /* merge if possible */
-    U32 mergeId = ZDICT_checkMerge(table, elt, 0);
+    U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
     if (mergeId) {
         U32 newMerge = 1;
         while (newMerge) {
-            newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
+            newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
             if (newMerge) ZDICT_removeDictItem(table, mergeId);
             mergeId = newMerge;
         }
@@ -480,7 +504,7 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
 #   define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
             if (ZDICT_clockSpan(displayClock) > refreshRate)  \
             { displayClock = clock(); DISPLAY(__VA_ARGS__); \
-            if (notificationLevel>=4) fflush(stdout); } }
+            if (notificationLevel>=4) fflush(stderr); } }
 
     /* init */
     DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
@@ -521,7 +545,7 @@ static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
             if (doneMarks[cursor]) { cursor++; continue; }
             solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
             if (solution.length==0) { cursor++; continue; }
-            ZDICT_insertDictItem(dictList, dictListSize, solution);
+            ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
             cursor += solution.length;
             DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
     }   }
@@ -683,19 +707,19 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
         goto _cleanup;
     }
     if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; }   /* too large dictionary */
-    for (u=0; u<256; u++) countLit[u]=1;   /* any character must be described */
-    for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1;
-    for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
-    for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
+    for (u=0; u<256; u++) countLit[u] = 1;   /* any character must be described */
+    for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
+    for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
+    for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
     memset(repOffset, 0, sizeof(repOffset));
     repOffset[1] = repOffset[4] = repOffset[8] = 1;
     memset(bestRepOffset, 0, sizeof(bestRepOffset));
-    if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
+    if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
     params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
     {   size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
-            if (ZSTD_isError(beginResult)) {
+        if (ZSTD_isError(beginResult)) {
+            DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
             eSize = ERROR(GENERIC);
-            DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
             goto _cleanup;
     }   }
 
@@ -812,7 +836,6 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
     MEM_writeLE32(dstPtr+4, repStartValue[1]);
     MEM_writeLE32(dstPtr+8, repStartValue[2]);
 #endif
-    //dstPtr += 12;
     eSize += 12;
 
 _cleanup:
@@ -831,7 +854,7 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
                           ZDICT_params_t params)
 {
     size_t hSize;
-#define HBUFFSIZE 256
+#define HBUFFSIZE 256   /* should prove large enough for all entropy headers */
     BYTE header[HBUFFSIZE];
     int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
     U32 const notificationLevel = params.notificationLevel;
@@ -877,20 +900,11 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
                                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
                                                  ZDICT_params_t params)
 {
-    size_t hSize;
     int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
     U32 const notificationLevel = params.notificationLevel;
+    size_t hSize = 8;
 
-    /* dictionary header */
-    MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
-    {   U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
-        U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
-        U32 const dictID = params.dictID ? params.dictID : compliantID;
-        MEM_writeLE32((char*)dictBuffer+4, dictID);
-    }
-    hSize = 8;
-
-    /* entropy tables */
+    /* calculate entropy tables */
     DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
     DISPLAYLEVEL(2, "statistics ... \n");
     {   size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
@@ -902,6 +916,13 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
         hSize += eSize;
     }
 
+    /* add dictionary header (after entropy tables) */
+    MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
+    {   U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
+        U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
+        U32 const dictID = params.dictID ? params.dictID : compliantID;
+        MEM_writeLE32((char*)dictBuffer+4, dictID);
+    }
 
     if (hSize + dictContentSize < dictBufferCapacity)
         memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
@@ -929,8 +950,8 @@ size_t ZDICT_trainFromBuffer_unsafe(
 
     /* checks */
     if (!dictList) return ERROR(memory_allocation);
-    if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
-    if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; }   /* not enough source to create dictionary */
+    if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); }   /* requested dictionary size is too small */
+    if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* not enough source to create dictionary */
 
     /* init */
     ZDICT_initDictItem(dictList);
@@ -963,14 +984,15 @@ size_t ZDICT_trainFromBuffer_unsafe(
 
     /* create dictionary */
     {   U32 dictContentSize = ZDICT_dictSize(dictList);
-        if (dictContentSize < targetDictSize/3) {
+        if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* dictionary content too small */
+        if (dictContentSize < targetDictSize/4) {
             DISPLAYLEVEL(2, "!  warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
+            if (samplesBuffSize < 10 * targetDictSize)
+                DISPLAYLEVEL(2, "!  consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
             if (minRep > MINRATIO) {
                 DISPLAYLEVEL(2, "!  consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
                 DISPLAYLEVEL(2, "!  note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
             }
-            if (samplesBuffSize < 10 * targetDictSize)
-                DISPLAYLEVEL(2, "!  consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
         }
 
         if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
@@ -978,7 +1000,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
             while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
             DISPLAYLEVEL(2, "!  note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
             DISPLAYLEVEL(2, "!  consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
-            DISPLAYLEVEL(2, "!  always test dictionary efficiency on samples \n");
+            DISPLAYLEVEL(2, "!  always test dictionary efficiency on real samples \n");
         }
 
         /* limit dictionary size */
diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h
index 4ead4474fa9b..9b53de346527 100644
--- a/lib/dictBuilder/zdict.h
+++ b/lib/dictBuilder/zdict.h
@@ -88,7 +88,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dict
 
 /*! COVER_params_t :
     For all values 0 means default.
-    kMin and d are the only required parameters.
+    k and d are the only required parameters.
 */
 typedef struct {
     unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
@@ -147,18 +147,18 @@ ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void* dictBuffer, size_t dictB
     Samples must be stored concatenated in a flat buffer `samplesBuffer`,
     supplied with an array of sizes `samplesSizes`, providing the size of each sample in order.
 
-    dictContentSize must be > ZDICT_CONTENTSIZE_MIN bytes.
-    maxDictSize must be >= dictContentSize, and must be > ZDICT_DICTSIZE_MIN bytes.
+    dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes.
+    maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
 
     @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
               or an error code, which can be tested by ZDICT_isError().
     note : ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
-    note 2 : dictBuffer and customDictContent can overlap
+    note 2 : dictBuffer and dictContent can overlap
 */
-#define ZDICT_CONTENTSIZE_MIN 256
-#define ZDICT_DICTSIZE_MIN    512
+#define ZDICT_CONTENTSIZE_MIN 128
+#define ZDICT_DICTSIZE_MIN    256
 ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
-                                const void* customDictContent, size_t dictContentSize,
+                                const void* dictContent, size_t dictContentSize,
                                 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
                                 ZDICT_params_t parameters);
 
diff --git a/lib/legacy/zstd_v01.c b/lib/legacy/zstd_v01.c
index bcacb8d5d7a2..cf5354d6a9b6 100644
--- a/lib/legacy/zstd_v01.c
+++ b/lib/legacy/zstd_v01.c
@@ -1432,7 +1432,7 @@ typedef struct ZSTD_Cctx_s
 #else
     U32 hashTable[HASH_TABLESIZE];
 #endif
-	BYTE buffer[WORKPLACESIZE];
+    BYTE buffer[WORKPLACESIZE];
 } cctxi_t;
 
 
diff --git a/lib/legacy/zstd_v02.c b/lib/legacy/zstd_v02.c
index 2297b28c8b2e..3cf8f4778250 100644
--- a/lib/legacy/zstd_v02.c
+++ b/lib/legacy/zstd_v02.c
@@ -475,8 +475,8 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
 
 MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
 {
-	if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
-		return BIT_DStream_overflow;
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BIT_DStream_overflow;
 
     if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer))
     {
@@ -1334,8 +1334,8 @@ static size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsi
                 else
                 {
                     bitCount -= (int)(8 * (iend - 4 - ip));
-					ip = iend - 4;
-				}
+                    ip = iend - 4;
+                }
                 bitStream = MEM_readLE32(ip) >> (bitCount & 31);
             }
         }
@@ -2040,7 +2040,7 @@ static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
         rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
     }
 
-	/* Build rankVal */
+    /* Build rankVal */
     {
         const U32 minBits = tableLog+1 - maxW;
         U32 nextRankVal = 0;
@@ -2374,7 +2374,7 @@ static size_t HUF_readDTableX6 (U32* DTable, const void* src, size_t srcSize)
         rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
     }
 
-	/* Build rankVal */
+    /* Build rankVal */
     {
         const U32 minBits = tableLog+1 - maxW;
         U32 nextRankVal = 0;
@@ -2948,14 +2948,14 @@ static size_t ZSTD_decodeLiteralsBlock(void* ctx,
             const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
             if (litSize > srcSize-11)   /* risk of reading too far with wildcopy */
             {
-				if (litSize > srcSize-3) return ERROR(corruption_detected);
-				memcpy(dctx->litBuffer, istart, litSize);
-				dctx->litPtr = dctx->litBuffer;
-				dctx->litSize = litSize;
-				memset(dctx->litBuffer + dctx->litSize, 0, 8);
-				return litSize+3;
-			}
-			/* direct reference into compressed stream */
+                if (litSize > srcSize-3) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart, litSize);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, 8);
+                return litSize+3;
+            }
+            /* direct reference into compressed stream */
             dctx->litPtr = istart+3;
             dctx->litSize = litSize;
             return litSize+3;
@@ -3515,13 +3515,13 @@ static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSi
 
 unsigned ZSTDv02_isError(size_t code)
 {
-	return ZSTD_isError(code);
+    return ZSTD_isError(code);
 }
 
 size_t ZSTDv02_decompress( void* dst, size_t maxOriginalSize,
                      const void* src, size_t compressedSize)
 {
-	return ZSTD_decompress(dst, maxOriginalSize, src, compressedSize);
+    return ZSTD_decompress(dst, maxOriginalSize, src, compressedSize);
 }
 
 size_t ZSTDv02_findFrameCompressedSize(const void *src, size_t compressedSize)
@@ -3531,25 +3531,25 @@ size_t ZSTDv02_findFrameCompressedSize(const void *src, size_t compressedSize)
 
 ZSTDv02_Dctx* ZSTDv02_createDCtx(void)
 {
-	return (ZSTDv02_Dctx*)ZSTD_createDCtx();
+    return (ZSTDv02_Dctx*)ZSTD_createDCtx();
 }
 
 size_t ZSTDv02_freeDCtx(ZSTDv02_Dctx* dctx)
 {
-	return ZSTD_freeDCtx((ZSTD_DCtx*)dctx);
+    return ZSTD_freeDCtx((ZSTD_DCtx*)dctx);
 }
 
 size_t ZSTDv02_resetDCtx(ZSTDv02_Dctx* dctx)
 {
-	return ZSTD_resetDCtx((ZSTD_DCtx*)dctx);
+    return ZSTD_resetDCtx((ZSTD_DCtx*)dctx);
 }
 
 size_t ZSTDv02_nextSrcSizeToDecompress(ZSTDv02_Dctx* dctx)
 {
-	return ZSTD_nextSrcSizeToDecompress((ZSTD_DCtx*)dctx);
+    return ZSTD_nextSrcSizeToDecompress((ZSTD_DCtx*)dctx);
 }
 
 size_t ZSTDv02_decompressContinue(ZSTDv02_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
-	return ZSTD_decompressContinue((ZSTD_DCtx*)dctx, dst, maxDstSize, src, srcSize);
+    return ZSTD_decompressContinue((ZSTD_DCtx*)dctx, dst, maxDstSize, src, srcSize);
 }
diff --git a/lib/legacy/zstd_v03.c b/lib/legacy/zstd_v03.c
index ef654931f528..f438330a4692 100644
--- a/lib/legacy/zstd_v03.c
+++ b/lib/legacy/zstd_v03.c
@@ -477,8 +477,8 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
 
 MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
 {
-	if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
-		return BIT_DStream_overflow;
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BIT_DStream_overflow;
 
     if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer))
     {
@@ -1335,8 +1335,8 @@ static size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsi
                 else
                 {
                     bitCount -= (int)(8 * (iend - 4 - ip));
-					ip = iend - 4;
-				}
+                    ip = iend - 4;
+                }
                 bitStream = MEM_readLE32(ip) >> (bitCount & 31);
             }
         }
@@ -2037,7 +2037,7 @@ static size_t HUF_readDTableX4 (U32* DTable, const void* src, size_t srcSize)
         rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */
     }
 
-	/* Build rankVal */
+    /* Build rankVal */
     {
         const U32 minBits = tableLog+1 - maxW;
         U32 nextRankVal = 0;
@@ -2589,14 +2589,14 @@ static size_t ZSTD_decodeLiteralsBlock(void* ctx,
             const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2;   /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */
             if (litSize > srcSize-11)   /* risk of reading too far with wildcopy */
             {
-				if (litSize > srcSize-3) return ERROR(corruption_detected);
-				memcpy(dctx->litBuffer, istart, litSize);
-				dctx->litPtr = dctx->litBuffer;
-				dctx->litSize = litSize;
-				memset(dctx->litBuffer + dctx->litSize, 0, 8);
-				return litSize+3;
-			}
-			/* direct reference into compressed stream */
+                if (litSize > srcSize-3) return ERROR(corruption_detected);
+                memcpy(dctx->litBuffer, istart, litSize);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                memset(dctx->litBuffer + dctx->litSize, 0, 8);
+                return litSize+3;
+            }
+            /* direct reference into compressed stream */
             dctx->litPtr = istart+3;
             dctx->litSize = litSize;
             return litSize+3;
@@ -3156,13 +3156,13 @@ static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSi
 
 unsigned ZSTDv03_isError(size_t code)
 {
-	return ZSTD_isError(code);
+    return ZSTD_isError(code);
 }
 
 size_t ZSTDv03_decompress( void* dst, size_t maxOriginalSize,
                      const void* src, size_t compressedSize)
 {
-	return ZSTD_decompress(dst, maxOriginalSize, src, compressedSize);
+    return ZSTD_decompress(dst, maxOriginalSize, src, compressedSize);
 }
 
 size_t ZSTDv03_findFrameCompressedSize(const void* src, size_t srcSize)
@@ -3172,25 +3172,25 @@ size_t ZSTDv03_findFrameCompressedSize(const void* src, size_t srcSize)
 
 ZSTDv03_Dctx* ZSTDv03_createDCtx(void)
 {
-	return (ZSTDv03_Dctx*)ZSTD_createDCtx();
+    return (ZSTDv03_Dctx*)ZSTD_createDCtx();
 }
 
 size_t ZSTDv03_freeDCtx(ZSTDv03_Dctx* dctx)
 {
-	return ZSTD_freeDCtx((ZSTD_DCtx*)dctx);
+    return ZSTD_freeDCtx((ZSTD_DCtx*)dctx);
 }
 
 size_t ZSTDv03_resetDCtx(ZSTDv03_Dctx* dctx)
 {
-	return ZSTD_resetDCtx((ZSTD_DCtx*)dctx);
+    return ZSTD_resetDCtx((ZSTD_DCtx*)dctx);
 }
 
 size_t ZSTDv03_nextSrcSizeToDecompress(ZSTDv03_Dctx* dctx)
 {
-	return ZSTD_nextSrcSizeToDecompress((ZSTD_DCtx*)dctx);
+    return ZSTD_nextSrcSizeToDecompress((ZSTD_DCtx*)dctx);
 }
 
 size_t ZSTDv03_decompressContinue(ZSTDv03_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
-	return ZSTD_decompressContinue((ZSTD_DCtx*)dctx, dst, maxDstSize, src, srcSize);
+    return ZSTD_decompressContinue((ZSTD_DCtx*)dctx, dst, maxDstSize, src, srcSize);
 }
diff --git a/lib/legacy/zstd_v04.c b/lib/legacy/zstd_v04.c
index 09040e68ec56..1a29da92d1e8 100644
--- a/lib/legacy/zstd_v04.c
+++ b/lib/legacy/zstd_v04.c
@@ -882,8 +882,8 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
 
 MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
 {
-	if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
-		return BIT_DStream_overflow;
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BIT_DStream_overflow;
 
     if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer))
     {
@@ -1451,8 +1451,8 @@ static size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsi
                 else
                 {
                     bitCount -= (int)(8 * (iend - 4 - ip));
-					ip = iend - 4;
-				}
+                    ip = iend - 4;
+                }
                 bitStream = MEM_readLE32(ip) >> (bitCount & 31);
             }
         }
diff --git a/lib/legacy/zstd_v05.c b/lib/legacy/zstd_v05.c
index a6f5f5dbbd16..674f5b0e4a8e 100644
--- a/lib/legacy/zstd_v05.c
+++ b/lib/legacy/zstd_v05.c
@@ -884,8 +884,8 @@ MEM_STATIC size_t BITv05_readBitsFast(BITv05_DStream_t* bitD, U32 nbBits)
 
 MEM_STATIC BITv05_DStream_status BITv05_reloadDStream(BITv05_DStream_t* bitD)
 {
-	if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
-		return BITv05_DStream_overflow;
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BITv05_DStream_overflow;
 
     if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) {
         bitD->ptr -= bitD->bitsConsumed >> 3;
diff --git a/lib/legacy/zstd_v06.c b/lib/legacy/zstd_v06.c
index a4258b67a618..ad8c4cd3186d 100644
--- a/lib/legacy/zstd_v06.c
+++ b/lib/legacy/zstd_v06.c
@@ -982,8 +982,8 @@ MEM_STATIC size_t BITv06_readBitsFast(BITv06_DStream_t* bitD, U32 nbBits)
               if status == unfinished, internal register is filled with >= (sizeof(bitD->bitContainer)*8 - 7) bits */
 MEM_STATIC BITv06_DStream_status BITv06_reloadDStream(BITv06_DStream_t* bitD)
 {
-	if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
-		return BITv06_DStream_overflow;
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+        return BITv06_DStream_overflow;
 
     if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) {
         bitD->ptr -= bitD->bitsConsumed >> 3;
diff --git a/lib/zstd.h b/lib/zstd.h
index a3237c77eebd..f8050c136104 100644
--- a/lib/zstd.h
+++ b/lib/zstd.h
@@ -55,8 +55,8 @@ extern "C" {
 
 /*------   Version   ------*/
 #define ZSTD_VERSION_MAJOR    1
-#define ZSTD_VERSION_MINOR    1
-#define ZSTD_VERSION_RELEASE  4
+#define ZSTD_VERSION_MINOR    2
+#define ZSTD_VERSION_RELEASE  0
 
 #define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE
 #define ZSTD_QUOTE(str) #str
@@ -71,48 +71,48 @@ ZSTDLIB_API unsigned ZSTD_versionNumber(void);   /**< library version number; to
 *  Simple API
 ***************************************/
 /*! ZSTD_compress() :
-    Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
-    Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
-    @return : compressed size written into `dst` (<= `dstCapacity),
-              or an error code if it fails (which can be tested using ZSTD_isError()). */
+ *  Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
+ *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ *  @return : compressed size written into `dst` (<= `dstCapacity),
+ *            or an error code if it fails (which can be tested using ZSTD_isError()). */
 ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
                             const void* src, size_t srcSize,
                                   int compressionLevel);
 
 /*! ZSTD_decompress() :
-    `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
-    `dstCapacity` is an upper bound of originalSize.
-    If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
-    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
-              or an errorCode if it fails (which can be tested using ZSTD_isError()). */
+ *  `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
+ *  `dstCapacity` is an upper bound of originalSize.
+ *  If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
+ *  @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ *            or an errorCode if it fails (which can be tested using ZSTD_isError()). */
 ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
                               const void* src, size_t compressedSize);
 
 /*! ZSTD_getDecompressedSize() :
-*   NOTE: This function is planned to be obsolete, in favour of ZSTD_getFrameContentSize.
-*   ZSTD_getFrameContentSize functions the same way, returning the decompressed size of a single
-*   frame, but distinguishes empty frames from frames with an unknown size, or errors.
-*
-*   Additionally, ZSTD_findDecompressedSize can be used instead.  It can handle multiple
-*   concatenated frames in one buffer, and so is more general.
-*   As a result however, it requires more computation and entire frames to be passed to it,
-*   as opposed to ZSTD_getFrameContentSize which requires only a single frame's header.
-*
-*   'src' is the start of a zstd compressed frame.
-*   @return : content size to be decompressed, as a 64-bits value _if known_, 0 otherwise.
-*    note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode.
-*             When `return==0`, data to decompress could be any size.
-*             In which case, it's necessary to use streaming mode to decompress data.
-*             Optionally, application can still use ZSTD_decompress() while relying on implied limits.
-*             (For example, data may be necessarily cut into blocks <= 16 KB).
-*    note 2 : decompressed size is always present when compression is done with ZSTD_compress()
-*    note 3 : decompressed size can be very large (64-bits value),
-*             potentially larger than what local system can handle as a single memory segment.
-*             In which case, it's necessary to use streaming mode to decompress data.
-*    note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
-*             Always ensure result fits within application's authorized limits.
-*             Each application can set its own limits.
-*    note 5 : when `return==0`, if precise failure cause is needed, use ZSTD_getFrameParams() to know more. */
+ *  NOTE: This function is planned to be obsolete, in favour of ZSTD_getFrameContentSize.
+ *  ZSTD_getFrameContentSize functions the same way, returning the decompressed size of a single
+ *  frame, but distinguishes empty frames from frames with an unknown size, or errors.
+ *
+ *  Additionally, ZSTD_findDecompressedSize can be used instead.  It can handle multiple
+ *  concatenated frames in one buffer, and so is more general.
+ *  As a result however, it requires more computation and entire frames to be passed to it,
+ *  as opposed to ZSTD_getFrameContentSize which requires only a single frame's header.
+ *
+ *  'src' is the start of a zstd compressed frame.
+ *  @return : content size to be decompressed, as a 64-bits value _if known_, 0 otherwise.
+ *   note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode.
+ *            When `return==0`, data to decompress could be any size.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *            Optionally, application can still use ZSTD_decompress() while relying on implied limits.
+ *            (For example, data may be necessarily cut into blocks <= 16 KB).
+ *   note 2 : decompressed size is always present when compression is done with ZSTD_compress()
+ *   note 3 : decompressed size can be very large (64-bits value),
+ *            potentially larger than what local system can handle as a single memory segment.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ *            Always ensure result fits within application's authorized limits.
+ *            Each application can set its own limits.
+ *   note 5 : when `return==0`, if precise failure cause is needed, use ZSTD_getFrameParams() to know more. */
 ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
 
 
@@ -127,29 +127,29 @@ ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readab
 *  Explicit memory management
 ***************************************/
 /*= Compression context
-*   When compressing many times,
-*   it is recommended to allocate a context just once, and re-use it for each successive compression operation.
-*   This will make workload friendlier for system's memory.
-*   Use one context per thread for parallel execution in multi-threaded environments. */
+ *  When compressing many times,
+ *  it is recommended to allocate a context just once, and re-use it for each successive compression operation.
+ *  This will make workload friendlier for system's memory.
+ *  Use one context per thread for parallel execution in multi-threaded environments. */
 typedef struct ZSTD_CCtx_s ZSTD_CCtx;
 ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void);
 ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);
 
 /*! ZSTD_compressCCtx() :
-    Same as ZSTD_compress(), requires an allocated ZSTD_CCtx (see ZSTD_createCCtx()). */
+ *  Same as ZSTD_compress(), requires an allocated ZSTD_CCtx (see ZSTD_createCCtx()). */
 ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel);
 
 /*= Decompression context
-*   When decompressing many times,
-*   it is recommended to allocate a context just once, and re-use it for each successive compression operation.
-*   This will make workload friendlier for system's memory.
-*   Use one context per thread for parallel execution in multi-threaded environments. */
+ *  When decompressing many times,
+ *  it is recommended to allocate a context just once, and re-use it for each successive compression operation.
+ *  This will make workload friendlier for system's memory.
+ *  Use one context per thread for parallel execution in multi-threaded environments. */
 typedef struct ZSTD_DCtx_s ZSTD_DCtx;
 ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
 ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
 
 /*! ZSTD_decompressDCtx() :
-*   Same as ZSTD_decompress(), requires an allocated ZSTD_DCtx (see ZSTD_createDCtx()). */
+ *  Same as ZSTD_decompress(), requires an allocated ZSTD_DCtx (see ZSTD_createDCtx()). */
 ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
 
@@ -194,9 +194,10 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize
 ZSTDLIB_API size_t      ZSTD_freeCDict(ZSTD_CDict* CDict);
 
 /*! ZSTD_compress_usingCDict() :
-*   Compression using a digested Dictionary.
-*   Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
-*   Note that compression level is decided during dictionary creation. */
+ *  Compression using a digested Dictionary.
+ *  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
+ *  Note that compression level is decided during dictionary creation.
+ *  Frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
 ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
                                             void* dst, size_t dstCapacity,
                                       const void* src, size_t srcSize,
@@ -487,7 +488,7 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, siz
 /*! ZSTD_createCDict_advanced() :
  *  Create a ZSTD_CDict using external alloc and free, and customized compression parameters */
 ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, unsigned byReference,
-                                                  ZSTD_parameters params, ZSTD_customMem customMem);
+                                                  ZSTD_compressionParameters cParams, ZSTD_customMem customMem);
 
 /*! ZSTD_sizeof_CDict() :
  *  Gives the amount of memory used by a given ZSTD_sizeof_CDict */
@@ -513,12 +514,19 @@ ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
 ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
 
 /*! ZSTD_compress_advanced() :
-*   Same as ZSTD_compress_usingDict(), with fine-tune control of each compression parameter */
-ZSTDLIB_API size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
-                                           void* dst, size_t dstCapacity,
-                                     const void* src, size_t srcSize,
-                                     const void* dict,size_t dictSize,
-                                           ZSTD_parameters params);
+*   Same as ZSTD_compress_usingDict(), with fine-tune control over each compression parameter */
+ZSTDLIB_API size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
+                                  void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                            const void* dict,size_t dictSize,
+                                  ZSTD_parameters params);
+
+/*! ZSTD_compress_usingCDict_advanced() :
+*   Same as ZSTD_compress_usingCDict(), with fine-tune control over frame parameters */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+                                  void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                            const ZSTD_CDict* cdict, ZSTD_frameParameters fParams);
 
 
 /*--- Advanced decompression functions ---*/
@@ -578,7 +586,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
  *    Note : this use case also happens when using a non-conformant dictionary.
  *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
  *  - This is not a Zstandard frame.
- *  When identifying the exact failure cause, it's possible to used ZSTD_getFrameParams(), which will provide a more precise error code. */
+ *  When identifying the exact failure cause, it's possible to use ZSTD_getFrameParams(), which will provide a more precise error code. */
 ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
 
 
@@ -588,13 +596,22 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
 
 /*=====   Advanced Streaming compression functions  =====*/
 ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);   /**< size of CStream is variable, depending primarily on compression level */
 ZSTDLIB_API size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize);   /**< pledgedSrcSize must be correct, a size of 0 means unknown.  for a frame size of 0 use initCStream_advanced */
 ZSTDLIB_API size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); /**< note: a dict will not be used if dict == NULL or dictSize < 8 */
 ZSTDLIB_API size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize,
                                              ZSTD_parameters params, unsigned long long pledgedSrcSize);  /**< pledgedSrcSize is optional and can be 0 (meaning unknown). note: if the contentSizeFlag is set, pledgedSrcSize == 0 means the source size is actually 0 */
 ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);  /**< note : cdict will just be referenced, and must outlive compression session */
-ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);  /**< re-use compression parameters from previous init; skip dictionary loading stage; zcs must be init at least once before. note: pledgedSrcSize must be correct, a size of 0 means unknown.  for a frame size of 0 use initCStream_advanced */
-ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
+ZSTDLIB_API size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, unsigned long long pledgedSrcSize, ZSTD_frameParameters fParams);  /**< same as ZSTD_initCStream_usingCDict(), with control over frame parameters */
+
+/*! ZSTD_resetCStream() :
+ *  start a new compression job, using same parameters from previous job.
+ *  This is typically useful to skip dictionary loading stage, since it will re-use it in-place..
+ *  Note that zcs must be init at least once before using ZSTD_resetCStream().
+ *  pledgedSrcSize==0 means "srcSize unknown".
+ *  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
+ *  @return : 0, or an error code (which can be tested using ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
 
 
 /*=====   Advanced Streaming decompression functions  =====*/
@@ -650,8 +667,10 @@ ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
 ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
 ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
 ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize is optional and can be 0 (meaning unknown). note: if the contentSizeFlag is set, pledgedSrcSize == 0 means the source size is actually 0 */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize);   /* compression parameters are already set within cdict. pledgedSrcSize=0 means null-size */
 ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**<  note: if pledgedSrcSize can be 0, indicating unknown size.  if it is non-zero, it must be accurate.  for 0 size frames, use compressBegin_advanced */
-ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize can be 0, indicating unknown size.  if it is non-zero, it must be accurate.  for 0 size frames, use compressBegin_advanced */
+
 ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
@@ -745,19 +764,20 @@ ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
     - Compressing and decompressing require a context structure
       + Use ZSTD_createCCtx() and ZSTD_createDCtx()
     - It is necessary to init context before starting
-      + compression : ZSTD_compressBegin()
-      + decompression : ZSTD_decompressBegin()
-      + variants _usingDict() are also allowed
-      + copyCCtx() and copyDCtx() work too
-    - Block size is limited, it must be <= ZSTD_getBlockSizeMax()
-      + If you need to compress more, cut data into multiple blocks
-      + Consider using the regular ZSTD_compress() instead, as frame metadata costs become negligible when source size is large.
+      + compression : any ZSTD_compressBegin*() variant, including with dictionary
+      + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
+      + copyCCtx() and copyDCtx() can be used too
+    - Block size is limited, it must be <= ZSTD_getBlockSizeMax() <= ZSTD_BLOCKSIZE_ABSOLUTEMAX
+      + If input is larger than a block size, it's necessary to split input data into multiple blocks
+      + For inputs larger than a single block size, consider using the regular ZSTD_compress() instead.
+        Frame metadata is not that costly, and quickly becomes negligible as source size grows larger.
     - When a block is considered not compressible enough, ZSTD_compressBlock() result will be zero.
       In which case, nothing is produced into `dst`.
       + User must test for such outcome and deal directly with uncompressed data
       + ZSTD_decompressBlock() doesn't accept uncompressed data as input !!!
-      + In case of multiple successive blocks, decoder must be informed of uncompressed block existence to follow proper history.
-        Use ZSTD_insertBlock() in such a case.
+      + In case of multiple successive blocks, should some of them be uncompressed,
+        decoder must be informed of their existence in order to follow proper history.
+        Use ZSTD_insertBlock() for such a case.
 */
 
 #define ZSTD_BLOCKSIZE_ABSOLUTEMAX (128 * 1024)   /* define, for static allocation */
diff --git a/programs/Makefile b/programs/Makefile
index 1475cb610916..0c920a87bcbd 100644
--- a/programs/Makefile
+++ b/programs/Makefile
@@ -18,6 +18,19 @@
 
 ZSTDDIR = ../lib
 
+# Version numbers
+LIBVER_SRC := $(ZSTDDIR)/zstd.h
+LIBVER_MAJOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MAJOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
+LIBVER_MINOR_SCRIPT:=`sed -n '/define ZSTD_VERSION_MINOR/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
+LIBVER_PATCH_SCRIPT:=`sed -n '/define ZSTD_VERSION_RELEASE/s/.*[[:blank:]]\([0-9][0-9]*\).*/\1/p' < $(LIBVER_SRC)`
+LIBVER_SCRIPT:= $(LIBVER_MAJOR_SCRIPT).$(LIBVER_MINOR_SCRIPT).$(LIBVER_PATCH_SCRIPT)
+LIBVER_MAJOR := $(shell echo $(LIBVER_MAJOR_SCRIPT))
+LIBVER_MINOR := $(shell echo $(LIBVER_MINOR_SCRIPT))
+LIBVER_PATCH := $(shell echo $(LIBVER_PATCH_SCRIPT))
+LIBVER  := $(shell echo $(LIBVER_SCRIPT))
+
+ZSTD_VERSION=$(LIBVER)
+
 ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version "), 1)
 ALIGN_LOOP = -falign-loops=32
 else
@@ -69,10 +82,23 @@ else
 EXT =
 endif
 
+VOID = /dev/null
+
+# thread detection
+NO_THREAD_MSG := ==> no threads, building without multithreading support
+HAVE_PTHREAD := $(shell printf '\#include <pthread.h>\nint main(void) { return 0; }' | $(CC) $(FLAGS) -o have_pthread$(EXT) -x c - -pthread 2> $(VOID) && rm have_pthread$(EXT) && echo 1 || echo 0)
+HAVE_THREAD := $(shell [ "$(HAVE_PTHREAD)" -eq "1" -o -n "$(filter Windows%,$(OS))" ] && echo 1 || echo 0)
+ifeq ($(HAVE_THREAD), 1)
+THREAD_MSG := ==> building with threading support
+THREAD_CPP := -DZSTD_MULTITHREAD
+THREAD_LD := -pthread
+else
+THREAD_MSG := $(NO_THREAD_MSG)
+endif
+
 # zlib detection
 NO_ZLIB_MSG := ==> no zlib, building zstd without .gz support
-VOID = /dev/null
-HAVE_ZLIB := $(shell printf '\#include <zlib.h>\nint main(){}' | $(CC) -o have_zlib -x c - -lz 2> $(VOID) && rm have_zlib$(EXT) && echo 1 || echo 0)
+HAVE_ZLIB := $(shell printf '\#include <zlib.h>\nint main(void) { return 0; }' | $(CC) $(FLAGS) -o have_zlib$(EXT) -x c - -lz 2> $(VOID) && rm have_zlib$(EXT) && echo 1 || echo 0)
 ifeq ($(HAVE_ZLIB), 1)
 ZLIB_MSG := ==> building zstd with .gz compression support
 ZLIBCPP = -DZSTD_GZCOMPRESS -DZSTD_GZDECOMPRESS
@@ -80,9 +106,10 @@ ZLIBLD = -lz
 else
 ZLIB_MSG := $(NO_ZLIB_MSG)
 endif
+
 # lzma detection
 NO_LZMA_MSG := ==> no liblzma, building zstd without .xz/.lzma support
-HAVE_LZMA := $(shell printf '\#include <lzma.h>\nint main(){}' | $(CC) -o have_lzma -x c - -llzma 2> $(VOID) && rm have_lzma$(EXT) && echo 1 || echo 0)
+HAVE_LZMA := $(shell printf '\#include <lzma.h>\nint main(void) { return 0; }' | $(CC) $(FLAGS) -o have_lzma$(EXT) -x c - -llzma 2> $(VOID) && rm have_lzma$(EXT) && echo 1 || echo 0)
 ifeq ($(HAVE_LZMA), 1)
 LZMA_MSG := ==> building zstd with .xz/.lzma compression support
 LZMACPP = -DZSTD_LZMACOMPRESS -DZSTD_LZMADECOMPRESS
@@ -91,6 +118,16 @@ else
 LZMA_MSG := $(NO_LZMA_MSG)
 endif
 
+# lz4 detection
+NO_LZ4_MSG := ==> no liblz4, building zstd without .lz4 support
+HAVE_LZ4 := $(shell printf '\#include <lz4frame.h>\n\#include <lz4.h>\nint main(void) { return 0; }' | $(CC) $(FLAGS) -o have_lz4$(EXT) -x c - -llz4 2> $(VOID) && rm have_lz4$(EXT) && echo 1 || echo 0)
+ifeq ($(HAVE_LZ4), 1)
+LZ4_MSG := ==> building zstd with .lz4 compression support
+LZ4CPP = -DZSTD_LZ4COMPRESS -DZSTD_LZ4DECOMPRESS
+LZ4LD = -llz4
+else
+LZ4_MSG := $(NO_LZ4_MSG)
+endif
 
 .PHONY: default all clean clean_decomp_o install uninstall generate_res
 
@@ -100,17 +137,20 @@ all: zstd
 
 $(ZSTDDECOMP_O): CFLAGS += $(ALIGN_LOOP)
 
-zstd : CPPFLAGS += $(ZLIBCPP)
-zstd : LDFLAGS += $(ZLIBLD)
-zstd : LZMA_MSG := $(NO_LZMA_MSG)
-zstd-nogz : ZLIB_MSG := $(NO_ZLIB_MSG)
-zstd-nogz : LZMA_MSG := $(NO_LZMA_MSG)
-xzstd : CPPFLAGS += $(ZLIBCPP) $(LZMACPP)
-xzstd : LDFLAGS += $(ZLIBLD) $(LZMALD)
-zstd zstd-nogz xzstd : CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
-zstd zstd-nogz xzstd : $(ZSTDLIB_OBJ) zstdcli.o fileio.o bench.o datagen.o dibio.o
+zstd xzstd zstd4 xzstd4 : CPPFLAGS += $(THREAD_CPP) $(ZLIBCPP)
+zstd xzstd zstd4 xzstd4 : LDFLAGS += $(THREAD_LD) $(ZLIBLD)
+xzstd xzstd4 : CPPFLAGS += $(LZMACPP)
+xzstd xzstd4 : LDFLAGS += $(LZMALD)
+zstd4 xzstd4 : CPPFLAGS += $(LZ4CPP)
+zstd4 xzstd4 : LDFLAGS += $(LZ4LD)
+zstd zstd4 : LZMA_MSG := - xz/lzma support is disabled
+zstd xzstd : LZ4_MSG := - lz4 support is disabled
+zstd xzstd zstd4 xzstd4 : CPPFLAGS += -DZSTD_LEGACY_SUPPORT=$(ZSTD_LEGACY_SUPPORT)
+zstd xzstd zstd4 xzstd4 : $(ZSTDLIB_FILES) zstdcli.o fileio.o bench.o datagen.o dibio.o
+	@echo "$(THREAD_MSG)"
 	@echo "$(ZLIB_MSG)"
 	@echo "$(LZMA_MSG)"
+	@echo "$(LZ4_MSG)"
 ifneq (,$(filter Windows%,$(OS)))
 	windres/generate_res.bat
 endif
@@ -126,10 +166,20 @@ ifneq (,$(filter Windows%,$(OS)))
 endif
 	$(CC) -m32 $(FLAGS) $^ $(RES32_FILE) -o $@$(EXT)
 
-
 zstd-nolegacy : clean_decomp_o
 	$(MAKE) zstd ZSTD_LEGACY_SUPPORT=0
 
+zstd-nomt : THREAD_CPP :=
+zstd-nomt : THREAD_LD :=
+zstd-nomt : THREAD_MSG := - multi-threading disabled
+zstd-nomt : zstd
+
+zstd-nogz : ZLIBCPP :=
+zstd-nogz : ZLIBLD :=
+zstd-nogz : ZLIB_MSG := - gzip support is disabled
+zstd-nogz : zstd
+
+
 zstd-pgo : MOREFLAGS = -fprofile-generate
 zstd-pgo : clean zstd
 	./zstd -b19i1 $(PROFILE_WITH)
@@ -142,22 +192,18 @@ zstd-pgo : clean zstd
 	$(RM) $(ZSTDDECOMP_O)
 	$(MAKE) zstd MOREFLAGS=-fprofile-use
 
-zstd-frugal: $(ZSTD_FILES) zstdcli.c fileio.c
+# minimal target, with only zstd compression and decompression. no bench. no legacy.
+zstd-small: CFLAGS = "-Os -s"
+zstd-frugal zstd-small: $(ZSTD_FILES) zstdcli.c fileio.c
 	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT $^ -o zstd$(EXT)
 
-zstd-small:
-	CFLAGS="-Os -s" $(MAKE) zstd-frugal
-
 zstd-decompress: $(ZSTDCOMMON_FILES) $(ZSTDDECOMP_FILES) zstdcli.c fileio.c
 	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT -DZSTD_NOCOMPRESS $^ -o $@$(EXT)
 
 zstd-compress: $(ZSTDCOMMON_FILES) $(ZSTDCOMP_FILES) zstdcli.c fileio.c
 	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT -DZSTD_NODECOMPRESS $^ -o $@$(EXT)
 
-zstdmt: CPPFLAGS += -DZSTD_MULTITHREAD
-ifeq (,$(filter Windows%,$(OS)))
-zstdmt: LDFLAGS += -lpthread
-endif
+# zstd is now built with Multi-threading by default
 zstdmt: zstd
 
 generate_res:
@@ -174,6 +220,19 @@ clean:
 clean_decomp_o:
 	@$(RM) $(ZSTDDECOMP_O)
 
+MD2ROFF = ronn
+MD2ROFF_FLAGS = --roff --warnings --manual="User Commands" --organization="zstd $(ZSTD_VERSION)"
+
+zstd.1: zstd.1.md
+	cat $^ | $(MD2ROFF) $(MD2ROFF_FLAGS) | sed -n '/^\.\\\".*/!p' > $@
+
+man: zstd.1
+
+clean-man:
+	rm zstd.1
+
+preview-man: clean-man man
+	man ./zstd.1
 
 #-----------------------------------------------------------------------------
 # make install is validated only for Linux, OSX, BSD, Hurd and Solaris targets
@@ -206,6 +265,7 @@ install: zstd
 	@$(INSTALL_PROGRAM) zstd $(DESTDIR)$(BINDIR)/zstd
 	@ln -sf zstd $(DESTDIR)$(BINDIR)/zstdcat
 	@ln -sf zstd $(DESTDIR)$(BINDIR)/unzstd
+	@ln -sf zstd $(DESTDIR)$(BINDIR)/zstdmt
 	@$(INSTALL_SCRIPT) zstdless $(DESTDIR)$(BINDIR)/zstdless
 	@$(INSTALL_SCRIPT) zstdgrep $(DESTDIR)$(BINDIR)/zstdgrep
 	@echo Installing man pages
diff --git a/programs/README.md b/programs/README.md
index 203fd7b49bce..d7922a0969e4 100644
--- a/programs/README.md
+++ b/programs/README.md
@@ -11,8 +11,29 @@ There are however other Makefile targets that create different variations of CLI
 - `zstd-decompress` : decompressor-only version of CLI; without dictionary builder, benchmark, and support for decompression of legacy zstd versions
 
 
+#### Compilation variables
+`zstd` tries to detect and use the following features automatically :
+
+- __HAVE_THREAD__ : multithreading is automatically enabled when `pthread` is detected.
+  It's possible to disable multithread support, by either compiling `zstd-nomt` target or using HAVE_THREAD=0 variable.
+  Example : make zstd HAVE_THREAD=0
+  It's also possible to force compilation with multithread support, using HAVE_THREAD=1.
+  In which case, linking stage will fail if `pthread` library cannot be found.
+  This might be useful to prevent silent feature disabling.
+
+- __HAVE_ZLIB__ : `zstd` can compress and decompress files in `.gz` format.
+  This is done through command `--format=gzip`.
+  Alternatively, symlinks named `gzip` or `gunzip` will mimic intended behavior.
+  .gz support is automatically enabled when `zlib` library is detected at build time.
+  It's possible to disable .gz support, by either compiling `zstd-nogz` target or using HAVE_ZLIB=0 variable.
+  Example : make zstd HAVE_ZLIB=0
+  It's also possible to force compilation with zlib support, using HAVE_ZLIB=1.
+  In which case, linking stage will fail if `zlib` library cannot be found.
+  This might be useful to prevent silent feature disabling.
+
+
 #### Aggregation of parameters
-CLI supports aggregation of parameters i.e. `-b1`, `-e18`, and `-i1` can be joined into `-b1e18i1`. 
+CLI supports aggregation of parameters i.e. `-b1`, `-e18`, and `-i1` can be joined into `-b1e18i1`.
 
 
 #### Dictionary builder in Command Line Interface
@@ -23,7 +44,7 @@ which can be loaded before compression and decompression.
 
 Using a dictionary, the compression ratio achievable on small data improves dramatically.
 These compression gains are achieved while simultaneously providing faster compression and decompression speeds.
-Dictionary work if there is some correlation in a family of small data (there is no universal dictionary). 
+Dictionary work if there is some correlation in a family of small data (there is no universal dictionary).
 Hence, deploying one dictionary per type of data will provide the greater benefits.
 Dictionary gains are mostly effective in the first few KB. Then, the compression algorithm
 will rely more and more on previously decoded content to compress the rest of the file.
@@ -35,7 +56,6 @@ Usage of the dictionary builder and created dictionaries with CLI:
 3. Decompress with the dictionary: `zstd --decompress FILE.zst -D dictionaryName`
 
 
-
 #### Benchmark in Command Line Interface
 CLI includes in-memory compression benchmark module for zstd.
 The benchmark is conducted using given filenames. The files are read into memory and joined together.
@@ -48,7 +68,6 @@ One can select compression levels starting from `-b` and ending with `-e`.
 The `-i` parameter selects minimal time used for each of tested levels.
 
 
-
 #### Usage of Command Line Interface
 The full list of options can be obtained with `-h` or `-H` parameter:
 ```
@@ -62,33 +81,40 @@ Arguments :
  -d     : decompression
  -D file: use `file` as Dictionary
  -o file: result stored into `file` (only if 1 input file)
- -f     : overwrite output without prompting
+ -f     : overwrite output without prompting and (de)compress links
 --rm    : remove source file(s) after successful de/compression
  -k     : preserve source file(s) (default)
  -h/-H  : display help/long help and exit
 
 Advanced arguments :
  -V     : display Version number and exit
- -v     : verbose mode; specify multiple times to increase log level (default:2)
+ -v     : verbose mode; specify multiple times to increase verbosity
  -q     : suppress warnings; specify twice to suppress errors too
  -c     : force write to standard output, even if it is the console
- -r     : operate recursively on directories
 --ultra : enable levels beyond 19, up to 22 (requires more memory)
+ -T#    : use # threads for compression (default:1)
+ -B#    : select size of each job (default:0==automatic)
 --no-dictID : don't write dictID into header (dictionary compression)
 --[no-]check : integrity check (default:enabled)
+ -r     : operate recursively on directories
+--format=gzip : compress files to the .gz format
 --test  : test compressed file integrity
---[no-]sparse : sparse mode (default:enabled on file, disabled on stdout)
+--[no-]sparse : sparse mode (default:disabled)
+ -M#    : Set a memory usage limit for decompression
+--      : All arguments after "--" are treated as files
 
 Dictionary builder :
 --train ## : create a dictionary from a training set of files
+--train-cover[=k=#,d=#,steps=#] : use the cover algorithm with optional args
+--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: 9)
  -o file : `file` is dictionary name (default: dictionary)
---maxdict ## : limit dictionary to specified size (default : 112640)
- -s#    : dictionary selectivity level (default: 9)
---dictID ## : force dictionary ID to specified value (default: random)
+--maxdict=# : limit dictionary to specified size (default : 112640)
+--dictID=# : force dictionary ID to specified value (default: random)
 
 Benchmark arguments :
  -b#    : benchmark file(s), using # compression level (default : 1)
  -e#    : test all compression levels from -bX to # (default: 1)
  -i#    : minimum evaluation time in seconds (default : 3s)
  -B#    : cut file into independent blocks of size # (default: no block)
- ```
\ No newline at end of file
+--priority=rt : set process priority to real-time
+```
diff --git a/programs/bench.c b/programs/bench.c
index 2dd1cfb0fab1..22b871952b8e 100644
--- a/programs/bench.c
+++ b/programs/bench.c
@@ -70,12 +70,12 @@ static U32 g_compressibilityDefault = 50;
 ***************************************/
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
 #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
-static U32 g_displayLevel = 2;   /* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : + progression;   4 : + information */
+static int g_displayLevel = 2;   /* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : + progression;   4 : + information */
 
 #define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
             if ((clock() - g_time > refreshRate) || (g_displayLevel>=4)) \
             { g_time = clock(); DISPLAY(__VA_ARGS__); \
-            if (g_displayLevel>=4) fflush(stdout); } }
+            if (g_displayLevel>=4) fflush(stderr); } }
 static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
 static clock_t g_time = 0;
 
@@ -89,7 +89,7 @@ static clock_t g_time = 0;
 #define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
 #define EXM_THROW(error, ...)                                             \
 {                                                                         \
-    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__); \
     DISPLAYLEVEL(1, "Error %i : ", error);                                \
     DISPLAYLEVEL(1, __VA_ARGS__);                                         \
     DISPLAYLEVEL(1, " \n");                                               \
@@ -146,17 +146,20 @@ typedef struct {
 } blockParam_t;
 
 
-#define MIN(a,b) ((a)<(b) ? (a) : (b))
-#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
+#undef MIN
+#undef MAX
+#define MIN(a,b)    ((a) < (b) ? (a) : (b))
+#define MAX(a,b)    ((a) > (b) ? (a) : (b))
 
 static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
                         const char* displayName, int cLevel,
                         const size_t* fileSizes, U32 nbFiles,
                         const void* dictBuffer, size_t dictBufferSize,
-                        ZSTD_compressionParameters *comprParams)
+                        const ZSTD_compressionParameters* comprParams)
 {
     size_t const blockSize = ((g_blockSize>=32 && !g_decodeOnly) ? g_blockSize : srcSize) + (!srcSize) /* avoid div by 0 */ ;
-    size_t const avgSize = MIN(g_blockSize, (srcSize / nbFiles));
+    size_t const avgSize = MIN(blockSize, (srcSize / nbFiles));
     U32 const maxNbBlocks = (U32) ((srcSize + (blockSize-1)) / blockSize) + nbFiles;
     blockParam_t* const blockTable = (blockParam_t*) malloc(maxNbBlocks * sizeof(blockParam_t));
     size_t const maxCompressedSize = ZSTD_compressBound(srcSize) + (maxNbBlocks * 1024);   /* add some room for safety */
@@ -176,22 +179,21 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
         EXM_THROW(31, "allocation error : not enough memory");
 
     /* init */
-    if (strlen(displayName)>17) displayName += strlen(displayName)-17;   /* can only display 17 characters */
+    if (strlen(displayName)>17) displayName += strlen(displayName)-17;   /* display last 17 characters */
     UTIL_initTimer(&ticksPerSecond);
 
-    if (g_decodeOnly) {
-        const char* srcPtr = (const char*) srcBuffer;
-        U64 dSize64 = 0;
+    if (g_decodeOnly) {  /* benchmark only decompression : source must be already compressed */
+        const char* srcPtr = (const char*)srcBuffer;
+        U64 totalDSize64 = 0;
         U32 fileNb;
         for (fileNb=0; fileNb<nbFiles; fileNb++) {
             U64 const fSize64 = ZSTD_findDecompressedSize(srcPtr, fileSizes[fileNb]);
             if (fSize64==0) EXM_THROW(32, "Impossible to determine original size ");
-            dSize64 += fSize64;
+            totalDSize64 += fSize64;
             srcPtr += fileSizes[fileNb];
         }
-        {   size_t const decodedSize = (size_t)dSize64;
-            if (dSize64 > decodedSize) EXM_THROW(32, "original size is too large");
-            if (decodedSize==0) EXM_THROW(32, "Impossible to determine original size ");
+        {   size_t const decodedSize = (size_t)totalDSize64;
+            if (totalDSize64 > decodedSize) EXM_THROW(32, "original size is too large");   /* size_t overflow */
             free(resultBuffer);
             resultBuffer = malloc(decodedSize);
             if (!resultBuffer) EXM_THROW(33, "not enough memory");
@@ -260,12 +262,11 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
                 UTIL_getTime(&clockStart);
 
                 if (!cCompleted) {   /* still some time to do compression tests */
-                    ZSTD_parameters zparams = ZSTD_getParams(cLevel, avgSize, dictBufferSize);
                     ZSTD_customMem const cmem = { NULL, NULL, NULL };
-                    U64 clockLoop = g_nbSeconds ? TIMELOOP_MICROSEC : 1;
+                    U64 const clockLoop = g_nbSeconds ? TIMELOOP_MICROSEC : 1;
                     U32 nbLoops = 0;
-                    ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, 1, zparams, cmem);
-                    if (cdict==NULL) EXM_THROW(1, "ZSTD_createCDict_advanced() allocation failure");
+                    ZSTD_parameters zparams = ZSTD_getParams(cLevel, avgSize, dictBufferSize);
+                    ZSTD_CDict* cdict;
                     if (comprParams->windowLog) zparams.cParams.windowLog = comprParams->windowLog;
                     if (comprParams->chainLog) zparams.cParams.chainLog = comprParams->chainLog;
                     if (comprParams->hashLog) zparams.cParams.hashLog = comprParams->hashLog;
@@ -273,6 +274,8 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize,
                     if (comprParams->searchLength) zparams.cParams.searchLength = comprParams->searchLength;
                     if (comprParams->targetLength) zparams.cParams.targetLength = comprParams->targetLength;
                     if (comprParams->strategy) zparams.cParams.strategy = (ZSTD_strategy)(comprParams->strategy - 1);
+                    cdict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, 1, zparams.cParams, cmem);
+                    if (cdict==NULL) EXM_THROW(1, "ZSTD_createCDict_advanced() allocation failure");
                     do {
                         U32 blockNb;
                         size_t rSize;
diff --git a/programs/dibio.c b/programs/dibio.c
index 5ef202c8abad..aac36425cf75 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -53,12 +53,12 @@ static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_
 ***************************************/
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
 #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
-static unsigned g_displayLevel = 0;   /* 0 : no display;   1: errors;   2: default;  4: full information */
+static int g_displayLevel = 0;   /* 0 : no display;   1: errors;   2: default;  4: full information */
 
 #define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
             if ((DIB_clockSpan(g_time) > refreshRate) || (g_displayLevel>=4)) \
             { g_time = clock(); DISPLAY(__VA_ARGS__); \
-            if (g_displayLevel>=4) fflush(stdout); } }
+            if (g_displayLevel>=4) fflush(stderr); } }
 static const clock_t refreshRate = CLOCKS_PER_SEC * 2 / 10;
 static clock_t g_time = 0;
 
@@ -89,7 +89,8 @@ unsigned DiB_isError(size_t errorCode) { return ERR_isError(errorCode); }
 
 const char* DiB_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
 
-#define MIN(a,b)   ( (a) < (b) ? (a) : (b) )
+#undef MIN
+#define MIN(a,b)    ((a) < (b) ? (a) : (b))
 
 
 /* ********************************************************
diff --git a/programs/fileio.c b/programs/fileio.c
index e6481f1fa726..e188936b21f5 100644
--- a/programs/fileio.c
+++ b/programs/fileio.c
@@ -31,6 +31,11 @@
 #include <time.h>       /* clock */
 #include <errno.h>      /* errno */
 
+#if defined (_MSC_VER)
+#  include <sys/stat.h>
+#  include <io.h>
+#endif
+
 #include "mem.h"
 #include "fileio.h"
 #define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_magicNumber, ZSTD_frameHeaderSize_max */
@@ -48,6 +53,12 @@
 #  include <lzma.h>
 #endif
 
+#define LZ4_MAGICNUMBER 0x184D2204
+#if defined(ZSTD_LZ4COMPRESS) || defined(ZSTD_LZ4DECOMPRESS)
+#  include <lz4frame.h>
+#  include <lz4.h>
+#endif
+
 
 /*-*************************************
 *  Constants
@@ -71,7 +82,7 @@
 
 #define CACHELINE 64
 
-#define MAX_DICT_SIZE (8 MB)   /* protection against large input (attack scenario) */
+#define DICTSIZE_MAX (32 MB)   /* protection against large input (attack scenario) */
 
 #define FNSPACE 30
 
@@ -81,18 +92,20 @@
 ***************************************/
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
 #define DISPLAYLEVEL(l, ...) { if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } }
-static U32 g_displayLevel = 2;   /* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : + progression;   4 : + information */
+static int g_displayLevel = 2;   /* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : + progression;   4 : + information */
 void FIO_setNotificationLevel(unsigned level) { g_displayLevel=level; }
 
 #define DISPLAYUPDATE(l, ...) { if (g_displayLevel>=l) { \
             if ((clock() - g_time > refreshRate) || (g_displayLevel>=4)) \
             { g_time = clock(); DISPLAY(__VA_ARGS__); \
-            if (g_displayLevel>=4) fflush(stdout); } } }
+            if (g_displayLevel>=4) fflush(stderr); } } }
 static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
 static clock_t g_time = 0;
 
+#undef MIN
 #define MIN(a,b)    ((a) < (b) ? (a) : (b))
 
+
 /* ************************************************************
 * Avoid fseek()'s 2GiB barrier with MSVC, MacOS, *BSD, MinGW
 ***************************************************************/
@@ -188,6 +201,18 @@ void FIO_setOverlapLog(unsigned overlapLog){
 /*-*************************************
 *  Functions
 ***************************************/
+/** FIO_remove() :
+ * @result : Unlink `fileName`, even if it's read-only */
+static int FIO_remove(const char* path)
+{
+#if defined(_WIN32) || defined(WIN32)
+    /* windows doesn't allow remove read-only files, so try to make it
+     * writable first */
+    chmod(path, _S_IWRITE);
+#endif
+    return remove(path);
+}
+
 /** FIO_openSrcFile() :
  * condition : `dstFileName` must be non-NULL.
  * @result : FILE* to `dstFileName`, or NULL if it fails */
@@ -227,23 +252,32 @@ static FILE* FIO_openDstFile(const char* dstFileName)
             DISPLAYLEVEL(4, "Sparse File Support is automatically disabled on stdout ; try --sparse \n");
         }
     } else {
-        if (!g_overwrite && strcmp (dstFileName, nulmark)) {  /* Check if destination file already exists */
+        if (g_sparseFileSupport == 1) {
+            g_sparseFileSupport = ZSTD_SPARSE_DEFAULT;
+        }
+        if (strcmp (dstFileName, nulmark)) {  /* Check if destination file already exists */
             f = fopen( dstFileName, "rb" );
             if (f != 0) {  /* dest file exists, prompt for overwrite authorization */
                 fclose(f);
-                if (g_displayLevel <= 1) {
-                    /* No interaction possible */
-                    DISPLAY("zstd: %s already exists; not overwritten  \n", dstFileName);
-                    return NULL;
-                }
-                DISPLAY("zstd: %s already exists; do you wish to overwrite (y/N) ? ", dstFileName);
-                {   int ch = getchar();
-                    if ((ch!='Y') && (ch!='y')) {
-                        DISPLAY("    not overwritten  \n");
+                if (!g_overwrite) {
+                    if (g_displayLevel <= 1) {
+                        /* No interaction possible */
+                        DISPLAY("zstd: %s already exists; not overwritten  \n", dstFileName);
                         return NULL;
                     }
-                    while ((ch!=EOF) && (ch!='\n')) ch = getchar();  /* flush rest of input line */
-        }   }   }
+                    DISPLAY("zstd: %s already exists; do you wish to overwrite (y/N) ? ", dstFileName);
+                    {   int ch = getchar();
+                        if ((ch!='Y') && (ch!='y')) {
+                            DISPLAY("    not overwritten  \n");
+                            return NULL;
+                        }
+                        while ((ch!=EOF) && (ch!='\n')) ch = getchar();  /* flush rest of input line */
+                    }
+                }
+
+                /* need to unlink */
+                FIO_remove(dstFileName);
+        }   }
         f = fopen( dstFileName, "wb" );
         if (f==NULL) DISPLAYLEVEL(1, "zstd: %s: %s\n", dstFileName, strerror(errno));
     }
@@ -252,13 +286,13 @@ static FILE* FIO_openDstFile(const char* dstFileName)
 }
 
 
-/*! FIO_loadFile() :
-*   creates a buffer, pointed by `*bufferPtr`,
-*   loads `filename` content into it,
-*   up to MAX_DICT_SIZE bytes.
-*   @return : loaded size
-*/
-static size_t FIO_loadFile(void** bufferPtr, const char* fileName)
+/*! FIO_createDictBuffer() :
+ *  creates a buffer, pointed by `*bufferPtr`,
+ *  loads `filename` content into it, up to DICTSIZE_MAX bytes.
+ *  @return : loaded size
+ *  if fileName==NULL, returns 0 and a NULL pointer
+ */
+static size_t FIO_createDictBuffer(void** bufferPtr, const char* fileName)
 {
     FILE* fileHandle;
     U64 fileSize;
@@ -270,14 +304,7 @@ static size_t FIO_loadFile(void** bufferPtr, const char* fileName)
     fileHandle = fopen(fileName, "rb");
     if (fileHandle==0) EXM_THROW(31, "zstd: %s: %s", fileName, strerror(errno));
     fileSize = UTIL_getFileSize(fileName);
-    if (fileSize > MAX_DICT_SIZE) {
-        int seekResult;
-        if (fileSize > 1 GB) EXM_THROW(32, "Dictionary file %s is too large", fileName);   /* avoid extreme cases */
-        DISPLAYLEVEL(2,"Dictionary %s is too large : using last %u bytes only \n", fileName, (U32)MAX_DICT_SIZE);
-        seekResult = fseek(fileHandle, (long int)(fileSize-MAX_DICT_SIZE), SEEK_SET);   /* use end of file */
-        if (seekResult != 0) EXM_THROW(33, "zstd: %s: %s", fileName, strerror(errno));
-        fileSize = MAX_DICT_SIZE;
-    }
+    if (fileSize > DICTSIZE_MAX) EXM_THROW(32, "Dictionary file %s is too large (> %u MB)", fileName, DICTSIZE_MAX >> 20);   /* avoid extreme cases */
     *bufferPtr = malloc((size_t)fileSize);
     if (*bufferPtr==NULL) EXM_THROW(34, "zstd: %s", strerror(errno));
     { size_t const readSize = fread(*bufferPtr, 1, (size_t)fileSize, fileHandle);
@@ -330,7 +357,7 @@ static cRess_t FIO_createCResources(const char* dictFileName, int cLevel,
 
     /* dictionary */
     {   void* dictBuffer;
-        size_t const dictBuffSize = FIO_loadFile(&dictBuffer, dictFileName);
+        size_t const dictBuffSize = FIO_createDictBuffer(&dictBuffer, dictFileName);   /* works with dictFileName==NULL */
         if (dictFileName && (dictBuffer==NULL)) EXM_THROW(32, "zstd: allocation error : can't create dictBuffer");
         {   ZSTD_parameters params = ZSTD_getParams(cLevel, srcSize, dictBuffSize);
             params.fParams.contentSizeFlag = srcRegFile;
@@ -342,7 +369,7 @@ static cRess_t FIO_createCResources(const char* dictFileName, int cLevel,
             if (comprParams->searchLog) params.cParams.searchLog = comprParams->searchLog;
             if (comprParams->searchLength) params.cParams.searchLength = comprParams->searchLength;
             if (comprParams->targetLength) params.cParams.targetLength = comprParams->targetLength;
-            if (comprParams->strategy) params.cParams.strategy = (ZSTD_strategy)(comprParams->strategy - 1);
+            if (comprParams->strategy) params.cParams.strategy = (ZSTD_strategy)(comprParams->strategy - 1);   /* 0 means : do not change */
 #ifdef ZSTD_MULTITHREAD
             {   size_t const errorCode = ZSTDMT_initCStream_advanced(ress.cctx, dictBuffer, dictBuffSize, params, srcSize);
                 if (ZSTD_isError(errorCode)) EXM_THROW(33, "Error initializing CStream : %s", ZSTD_getErrorName(errorCode));
@@ -494,6 +521,84 @@ static unsigned long long FIO_compressLzmaFrame(cRess_t* ress, const char* srcFi
 }
 #endif
 
+#ifdef ZSTD_LZ4COMPRESS
+static int FIO_LZ4_GetBlockSize_FromBlockId (int id) { return (1 << (8 + (2 * id))); }
+static unsigned long long FIO_compressLz4Frame(cRess_t* ress, const char* srcFileName, U64 const srcFileSize, int compressionLevel, U64* readsize)
+{
+    unsigned long long inFileSize = 0, outFileSize = 0;
+
+    LZ4F_preferences_t prefs;
+    LZ4F_compressionContext_t ctx;
+
+    LZ4F_errorCode_t const errorCode = LZ4F_createCompressionContext(&ctx, LZ4F_VERSION);
+    if (LZ4F_isError(errorCode)) EXM_THROW(31, "zstd: failed to create lz4 compression context");
+
+    memset(&prefs, 0, sizeof(prefs));
+
+#if LZ4_VERSION_NUMBER <= 10600
+#define LZ4F_blockIndependent blockIndependent
+#define LZ4F_max4MB max4MB
+#endif
+
+    prefs.autoFlush = 1;
+    prefs.compressionLevel = compressionLevel;
+    prefs.frameInfo.blockMode = LZ4F_blockIndependent; /* stick to defaults for lz4 cli */
+    prefs.frameInfo.blockSizeID = LZ4F_max4MB;
+    prefs.frameInfo.contentChecksumFlag = (contentChecksum_t)g_checksumFlag;
+#if LZ4_VERSION_NUMBER >= 10600
+    prefs.frameInfo.contentSize = srcFileSize;
+#endif
+
+    {
+        size_t blockSize = FIO_LZ4_GetBlockSize_FromBlockId(LZ4F_max4MB);
+        size_t readSize;
+        size_t headerSize = LZ4F_compressBegin(ctx, ress->dstBuffer, ress->dstBufferSize, &prefs);
+        if (LZ4F_isError(headerSize)) EXM_THROW(33, "File header generation failed : %s", LZ4F_getErrorName(headerSize));
+        { size_t const sizeCheck = fwrite(ress->dstBuffer, 1, headerSize, ress->dstFile);
+          if (sizeCheck!=headerSize) EXM_THROW(34, "Write error : cannot write header"); }
+        outFileSize += headerSize;
+
+        /* Read first block */
+        readSize  = fread(ress->srcBuffer, (size_t)1, (size_t)blockSize, ress->srcFile);
+        inFileSize += readSize;
+
+        /* Main Loop */
+        while (readSize>0) {
+            size_t outSize;
+
+            /* Compress Block */
+            outSize = LZ4F_compressUpdate(ctx, ress->dstBuffer, ress->dstBufferSize, ress->srcBuffer, readSize, NULL);
+            if (LZ4F_isError(outSize)) EXM_THROW(35, "zstd: %s: lz4 compression failed : %s", srcFileName, LZ4F_getErrorName(outSize));
+            outFileSize += outSize;
+            if (!srcFileSize) DISPLAYUPDATE(2, "\rRead : %u MB ==> %.2f%%", (U32)(inFileSize>>20), (double)outFileSize/inFileSize*100)
+            else DISPLAYUPDATE(2, "\rRead : %u / %u MB ==> %.2f%%", (U32)(inFileSize>>20), (U32)(srcFileSize>>20), (double)outFileSize/inFileSize*100);
+
+            /* Write Block */
+            { size_t const sizeCheck = fwrite(ress->dstBuffer, 1, outSize, ress->dstFile);
+              if (sizeCheck!=outSize) EXM_THROW(36, "Write error : cannot write compressed block"); }
+
+            /* Read next block */
+            readSize  = fread(ress->srcBuffer, (size_t)1, (size_t)blockSize, ress->srcFile);
+            inFileSize += readSize;
+        }
+        if (ferror(ress->srcFile)) EXM_THROW(37, "Error reading %s ", srcFileName);
+
+        /* End of Stream mark */
+        headerSize = LZ4F_compressEnd(ctx, ress->dstBuffer, ress->dstBufferSize, NULL);
+        if (LZ4F_isError(headerSize)) EXM_THROW(38, "zstd: %s: lz4 end of file generation failed : %s", srcFileName, LZ4F_getErrorName(headerSize));
+
+        { size_t const sizeCheck = fwrite(ress->dstBuffer, 1, headerSize, ress->dstFile);
+          if (sizeCheck!=headerSize) EXM_THROW(39, "Write error : cannot write end of stream"); }
+        outFileSize += headerSize;
+    }
+
+    *readsize = inFileSize;
+    LZ4F_freeCompressionContext(ctx);
+
+    return outFileSize;
+}
+#endif
+
 
 /*! FIO_compressFilename_internal() :
  *  same as FIO_compressFilename_extRess(), with `ress.desFile` already opened.
@@ -512,6 +617,7 @@ static int FIO_compressFilename_internal(cRess_t ress,
     switch (g_compressionType) {
         case FIO_zstdCompression:
             break;
+
         case FIO_gzipCompression:
 #ifdef ZSTD_GZCOMPRESS
             compressedfilesize = FIO_compressGzFrame(&ress, srcFileName, fileSize, compressionLevel, &readsize);
@@ -520,6 +626,7 @@ static int FIO_compressFilename_internal(cRess_t ress,
             EXM_THROW(20, "zstd: %s: file cannot be compressed as gzip (zstd compiled without ZSTD_GZCOMPRESS) -- ignored \n", srcFileName);
 #endif
             goto finish;
+
         case FIO_xzCompression:
         case FIO_lzmaCompression:
 #ifdef ZSTD_LZMACOMPRESS
@@ -527,6 +634,15 @@ static int FIO_compressFilename_internal(cRess_t ress,
 #else
             (void)compressionLevel;
             EXM_THROW(20, "zstd: %s: file cannot be compressed as xz/lzma (zstd compiled without ZSTD_LZMACOMPRESS) -- ignored \n", srcFileName);
+#endif
+            goto finish;
+
+        case FIO_lz4Compression:
+#ifdef ZSTD_LZ4COMPRESS
+            compressedfilesize = FIO_compressLz4Frame(&ress, srcFileName, fileSize, compressionLevel, &readsize);
+#else
+            (void)compressionLevel;
+            EXM_THROW(20, "zstd: %s: file cannot be compressed as lz4 (zstd compiled without ZSTD_LZ4COMPRESS) -- ignored \n", srcFileName);
 #endif
             goto finish;
     }
@@ -548,8 +664,8 @@ static int FIO_compressFilename_internal(cRess_t ress,
         readsize += inSize;
 
         {   ZSTD_inBuffer  inBuff = { ress.srcBuffer, inSize, 0 };
-            while (inBuff.pos != inBuff.size) {   /* note : is there any possibility of endless loop ? for example, if outBuff is not large enough ? */
-                ZSTD_outBuffer outBuff= { ress.dstBuffer, ress.dstBufferSize, 0 };
+            while (inBuff.pos != inBuff.size) {
+                ZSTD_outBuffer outBuff = { ress.dstBuffer, ress.dstBufferSize, 0 };
 #ifdef ZSTD_MULTITHREAD
                 size_t const result = ZSTDMT_compressStream(ress.cctx, &outBuff, &inBuff);
 #else
@@ -563,13 +679,13 @@ static int FIO_compressFilename_internal(cRess_t ress,
                     if (sizeCheck!=outBuff.pos) EXM_THROW(25, "Write error : cannot write compressed block into %s", dstFileName);
                     compressedfilesize += outBuff.pos;
         }   }   }
-#ifdef ZSTD_MULTITHREAD
-        if (!fileSize) DISPLAYUPDATE(2, "\rRead : %u MB", (U32)(readsize>>20))
-        else DISPLAYUPDATE(2, "\rRead : %u / %u MB", (U32)(readsize>>20), (U32)(fileSize>>20));
-#else
-        if (!fileSize) DISPLAYUPDATE(2, "\rRead : %u MB ==> %.2f%%", (U32)(readsize>>20), (double)compressedfilesize/readsize*100)
-        else DISPLAYUPDATE(2, "\rRead : %u / %u MB ==> %.2f%%", (U32)(readsize>>20), (U32)(fileSize>>20), (double)compressedfilesize/readsize*100);
-#endif
+        if (g_nbThreads > 1) {
+            if (!fileSize) DISPLAYUPDATE(2, "\rRead : %u MB", (U32)(readsize>>20))
+            else DISPLAYUPDATE(2, "\rRead : %u / %u MB", (U32)(readsize>>20), (U32)(fileSize>>20));
+        } else {
+            if (!fileSize) DISPLAYUPDATE(2, "\rRead : %u MB ==> %.2f%%", (U32)(readsize>>20), (double)compressedfilesize/readsize*100)
+            else DISPLAYUPDATE(2, "\rRead : %u / %u MB ==> %.2f%%", (U32)(readsize>>20), (U32)(fileSize>>20), (double)compressedfilesize/readsize*100);
+        }
     }
 
     /* End of Frame */
@@ -750,7 +866,7 @@ static dRess_t FIO_createDResources(const char* dictFileName)
 
     /* dictionary */
     {   void* dictBuffer;
-        size_t const dictBufferSize = FIO_loadFile(&dictBuffer, dictFileName);
+        size_t const dictBufferSize = FIO_createDictBuffer(&dictBuffer, dictFileName);
         size_t const initError = ZSTD_initDStream_usingDict(ress.dctx, dictBuffer, dictBufferSize);
         if (ZSTD_isError(initError)) EXM_THROW(61, "ZSTD_initDStream_usingDict error : %s", ZSTD_getErrorName(initError));
         free(dictBuffer);
@@ -1019,6 +1135,66 @@ static unsigned long long FIO_decompressLzmaFrame(dRess_t* ress, FILE* srcFile,
 }
 #endif
 
+#ifdef ZSTD_LZ4DECOMPRESS
+static unsigned long long FIO_decompressLz4Frame(dRess_t* ress, FILE* srcFile, const char* srcFileName)
+{
+    unsigned long long filesize = 0;
+    LZ4F_errorCode_t nextToLoad;
+    LZ4F_decompressionContext_t dCtx;
+    LZ4F_errorCode_t const errorCode = LZ4F_createDecompressionContext(&dCtx, LZ4F_VERSION);
+
+    if (LZ4F_isError(errorCode)) EXM_THROW(61, "zstd: failed to create lz4 decompression context");
+
+    /* Init feed with magic number (already consumed from FILE* sFile) */
+    {   size_t inSize = 4;
+        size_t outSize= 0;
+        MEM_writeLE32(ress->srcBuffer, LZ4_MAGICNUMBER);
+        nextToLoad = LZ4F_decompress(dCtx, ress->dstBuffer, &outSize, ress->srcBuffer, &inSize, NULL);
+        if (LZ4F_isError(nextToLoad)) EXM_THROW(62, "zstd: %s: lz4 header error : %s", srcFileName, LZ4F_getErrorName(nextToLoad));
+    }
+
+    /* Main Loop */
+    for (;nextToLoad;) {
+        size_t readSize;
+        size_t pos = 0;
+        size_t decodedBytes = ress->dstBufferSize;
+
+        /* Read input */
+        if (nextToLoad > ress->srcBufferSize) nextToLoad = ress->srcBufferSize;
+        readSize = fread(ress->srcBuffer, 1, nextToLoad, srcFile);
+        if (!readSize) break;   /* reached end of file or stream */
+
+        while ((pos < readSize) || (decodedBytes == ress->dstBufferSize)) {  /* still to read, or still to flush */
+            /* Decode Input (at least partially) */
+            size_t remaining = readSize - pos;
+            decodedBytes = ress->dstBufferSize;
+            nextToLoad = LZ4F_decompress(dCtx, ress->dstBuffer, &decodedBytes, (char*)(ress->srcBuffer)+pos, &remaining, NULL);
+            if (LZ4F_isError(nextToLoad)) EXM_THROW(66, "zstd: %s: decompression error : %s", srcFileName, LZ4F_getErrorName(nextToLoad));
+            pos += remaining;
+
+            /* Write Block */
+            if (decodedBytes) {
+                if (fwrite(ress->dstBuffer, 1, decodedBytes, ress->dstFile) != decodedBytes) EXM_THROW(63, "Write error : cannot write to output file");
+                filesize += decodedBytes;
+                DISPLAYUPDATE(2, "\rDecompressed : %u MB  ", (unsigned)(filesize>>20));
+            }
+
+            if (!nextToLoad) break;
+        }
+    }
+    /* can be out because readSize == 0, which could be an fread() error */
+    if (ferror(srcFile)) EXM_THROW(67, "zstd: %s: read error", srcFileName);
+
+    if (nextToLoad!=0) EXM_THROW(68, "zstd: %s: unfinished stream", srcFileName);
+
+    LZ4F_freeDecompressionContext(dCtx);
+    ress->srcBufferLoaded = 0; /* LZ4F will go to the frame boundary */
+
+    return filesize;
+}
+#endif
+
+
 
 /** FIO_decompressSrcFile() :
     Decompression `srcFileName` into `ress.dstFile`
@@ -1070,6 +1246,15 @@ static int FIO_decompressSrcFile(dRess_t ress, const char* dstFileName, const ch
 #else
             DISPLAYLEVEL(1, "zstd: %s: xz/lzma file cannot be uncompressed (zstd compiled without ZSTD_LZMADECOMPRESS) -- ignored \n", srcFileName);
             return 1;
+#endif
+        } else if (MEM_readLE32(buf) == LZ4_MAGICNUMBER) {
+#ifdef ZSTD_LZ4DECOMPRESS
+            unsigned long long const result = FIO_decompressLz4Frame(&ress, srcFile, srcFileName);
+            if (result == 0) return 1;
+            filesize += result;
+#else
+            DISPLAYLEVEL(1, "zstd: %s: lz4 file cannot be uncompressed (zstd compiled without ZSTD_LZ4DECOMPRESS) -- ignored \n", srcFileName);
+            return 1;
 #endif
         } else {
             if (!ZSTD_isFrame(ress.srcBuffer, toRead)) {
@@ -1179,7 +1364,7 @@ int FIO_decompressMultipleFilenames(const char** srcNamesTable, unsigned nbFiles
                 dstFileName = (char*)malloc(dfnSize);
                 if (dstFileName==NULL) EXM_THROW(74, "not enough memory for dstFileName");
             }
-            if (sfnSize <= suffixSize || (strcmp(suffixPtr, GZ_EXTENSION) && strcmp(suffixPtr, XZ_EXTENSION) && strcmp(suffixPtr, ZSTD_EXTENSION) && strcmp(suffixPtr, LZMA_EXTENSION))) {
+            if (sfnSize <= suffixSize || (strcmp(suffixPtr, GZ_EXTENSION) && strcmp(suffixPtr, XZ_EXTENSION) && strcmp(suffixPtr, ZSTD_EXTENSION) && strcmp(suffixPtr, LZMA_EXTENSION) && strcmp(suffixPtr, LZ4_EXTENSION))) {
                 DISPLAYLEVEL(1, "zstd: %s: unknown suffix (%s/%s/%s/%s expected) -- ignored \n", srcFileName, GZ_EXTENSION, XZ_EXTENSION, ZSTD_EXTENSION, LZMA_EXTENSION);
                 skippedFiles++;
                 continue;
diff --git a/programs/fileio.h b/programs/fileio.h
index 0dd58d625d44..65da98d7fa88 100644
--- a/programs/fileio.h
+++ b/programs/fileio.h
@@ -33,12 +33,13 @@ extern "C" {
 #define XZ_EXTENSION    ".xz"
 #define GZ_EXTENSION    ".gz"
 #define ZSTD_EXTENSION  ".zst"
+#define LZ4_EXTENSION   ".lz4"
 
 
 /*-*************************************
 *  Types
 ***************************************/
-typedef enum { FIO_zstdCompression, FIO_gzipCompression, FIO_xzCompression, FIO_lzmaCompression } FIO_compressionType_t;
+typedef enum { FIO_zstdCompression, FIO_gzipCompression, FIO_xzCompression, FIO_lzmaCompression, FIO_lz4Compression } FIO_compressionType_t;
 
 
 /*-*************************************
diff --git a/programs/platform.h b/programs/platform.h
index 89a9f6cd42a0..74412cde332e 100644
--- a/programs/platform.h
+++ b/programs/platform.h
@@ -100,9 +100,18 @@ extern "C" {
 #if (defined(__linux__) && (PLATFORM_POSIX_VERSION >= 1)) || (PLATFORM_POSIX_VERSION >= 200112L) || defined(__DJGPP__)
 #  include <unistd.h>   /* isatty */
 #  define IS_CONSOLE(stdStream) isatty(fileno(stdStream))
-#elif defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
+#elif defined(MSDOS) || defined(OS2) || defined(__CYGWIN__)
 #  include <io.h>       /* _isatty */
 #  define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
+#elif defined(WIN32) || defined(_WIN32)
+#  include <io.h>      /* _isatty */
+#  include <windows.h> /* DeviceIoControl, HANDLE, FSCTL_SET_SPARSE */
+#  include <stdio.h>   /* FILE */
+static __inline int IS_CONSOLE(FILE* stdStream)
+{
+    DWORD dummy;
+    return _isatty(_fileno(stdStream)) && GetConsoleMode((HANDLE)_get_osfhandle(_fileno(stdStream)), &dummy);
+}
 #else
 #  define IS_CONSOLE(stdStream) 0
 #endif
@@ -129,6 +138,14 @@ extern "C" {
 #endif
 
 
+#ifndef ZSTD_SPARSE_DEFAULT
+#  if (defined(__APPLE__) && defined(__MACH__))
+#    define ZSTD_SPARSE_DEFAULT 0
+#  else
+#    define ZSTD_SPARSE_DEFAULT 1
+#  endif
+#endif
+
 
 #if defined (__cplusplus)
 }
diff --git a/programs/util.h b/programs/util.h
index 59e19d027ccd..5f437b2b268c 100644
--- a/programs/util.h
+++ b/programs/util.h
@@ -1,6 +1,6 @@
 /**
  * util.h - utility functions
- * 
+ *
  * Copyright (c) 2016-present, Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
@@ -25,6 +25,7 @@ extern "C" {
 #include <stdlib.h>       /* malloc */
 #include <stddef.h>       /* size_t, ptrdiff_t */
 #include <stdio.h>        /* fprintf */
+#include <string.h>       /* strncmp */
 #include <sys/types.h>    /* stat, utime */
 #include <sys/stat.h>     /* stat */
 #if defined(_MSC_VER)
@@ -166,8 +167,8 @@ UTIL_STATIC void UTIL_waitForNextTick(UTIL_freq_t ticksPerSecond)
 *  File functions
 ******************************************/
 #if defined(_MSC_VER)
-	#define chmod _chmod
-	typedef struct __stat64 stat_t;
+    #define chmod _chmod
+    typedef struct __stat64 stat_t;
 #else
     typedef struct stat stat_t;
 #endif
@@ -178,9 +179,9 @@ UTIL_STATIC int UTIL_setFileStat(const char *filename, stat_t *statbuf)
     int res = 0;
     struct utimbuf timebuf;
 
-	timebuf.actime = time(NULL);
-	timebuf.modtime = statbuf->st_mtime;
-	res += utime(filename, &timebuf);  /* set access and modification times */
+    timebuf.actime = time(NULL);
+    timebuf.modtime = statbuf->st_mtime;
+    res += utime(filename, &timebuf);  /* set access and modification times */
 
 #if !defined(_WIN32)
     res += chown(filename, statbuf->st_uid, statbuf->st_gid);  /* Copy ownership */
@@ -228,6 +229,20 @@ UTIL_STATIC U32 UTIL_isDirectory(const char* infilename)
     return 0;
 }
 
+UTIL_STATIC U32 UTIL_isLink(const char* infilename)
+{
+#if defined(_WIN32)
+    /* no symlinks on windows */
+    (void)infilename;
+#else
+    int r;
+    stat_t statbuf;
+    r = lstat(infilename, &statbuf);
+    if (!r && S_ISLNK(statbuf.st_mode)) return 1;
+#endif
+    return 0;
+}
+
 
 UTIL_STATIC U64 UTIL_getFileSize(const char* infilename)
 {
@@ -271,11 +286,14 @@ UTIL_STATIC void *UTIL_realloc(void *ptr, size_t size)
     return NULL;
 }
 
+static int g_utilDisplayLevel;
+#define UTIL_DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define UTIL_DISPLAYLEVEL(l, ...) { if (g_utilDisplayLevel>=l) { UTIL_DISPLAY(__VA_ARGS__); } }
 
 #ifdef _WIN32
 #  define UTIL_HAS_CREATEFILELIST
 
-UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd)
+UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks)
 {
     char* path;
     int dirLength, fnameLength, pathLength, nbFiles = 0;
@@ -311,7 +329,7 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_
             if (strcmp (cFile.cFileName, "..") == 0 ||
                 strcmp (cFile.cFileName, ".") == 0) continue;
 
-            nbFiles += UTIL_prepareFileList(path, bufStart, pos, bufEnd);  /* Recursively call "UTIL_prepareFileList" with the new path. */
+            nbFiles += UTIL_prepareFileList(path, bufStart, pos, bufEnd, followLinks);  /* Recursively call "UTIL_prepareFileList" with the new path. */
             if (*bufStart == NULL) { free(path); FindClose(hFile); return 0; }
         }
         else if ((cFile.dwFileAttributes & FILE_ATTRIBUTE_NORMAL) || (cFile.dwFileAttributes & FILE_ATTRIBUTE_ARCHIVE) || (cFile.dwFileAttributes & FILE_ATTRIBUTE_COMPRESSED)) {
@@ -339,7 +357,7 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_
 #  include <dirent.h>       /* opendir, readdir */
 #  include <string.h>       /* strerror, memcpy */
 
-UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd)
+UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks)
 {
     DIR *dir;
     struct dirent *entry;
@@ -360,13 +378,19 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_
         path = (char*) malloc(dirLength + fnameLength + 2);
         if (!path) { closedir(dir); return 0; }
         memcpy(path, dirName, dirLength);
+
         path[dirLength] = '/';
         memcpy(path+dirLength+1, entry->d_name, fnameLength);
         pathLength = dirLength+1+fnameLength;
         path[pathLength] = 0;
 
+        if (!followLinks && UTIL_isLink(path)) {
+            UTIL_DISPLAYLEVEL(2, "Warning : %s is a symbolic link, ignoring\n", path);
+            continue;
+        }
+
         if (UTIL_isDirectory(path)) {
-            nbFiles += UTIL_prepareFileList(path, bufStart, pos, bufEnd);  /* Recursively call "UTIL_prepareFileList" with the new path. */
+            nbFiles += UTIL_prepareFileList(path, bufStart, pos, bufEnd, followLinks);  /* Recursively call "UTIL_prepareFileList" with the new path. */
             if (*bufStart == NULL) { free(path); closedir(dir); return 0; }
         } else {
             if (*bufStart + *pos + pathLength >= *bufEnd) {
@@ -396,7 +420,7 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_
 
 #else
 
-UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd)
+UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_t* pos, char** bufEnd, int followLinks)
 {
     (void)bufStart; (void)bufEnd; (void)pos;
     fprintf(stderr, "Directory %s ignored (compiled without _WIN32 or _POSIX_C_SOURCE)\n", dirName);
@@ -411,7 +435,7 @@ UTIL_STATIC int UTIL_prepareFileList(const char *dirName, char** bufStart, size_
  * After finishing usage of the list the structures should be freed with UTIL_freeFileList(params: return value, allocatedBuffer)
  * In case of error UTIL_createFileList returns NULL and UTIL_freeFileList should not be called.
  */
-UTIL_STATIC const char** UTIL_createFileList(const char **inputNames, unsigned inputNamesNb, char** allocatedBuffer, unsigned* allocatedNamesNb)
+UTIL_STATIC const char** UTIL_createFileList(const char **inputNames, unsigned inputNamesNb, char** allocatedBuffer, unsigned* allocatedNamesNb, int followLinks)
 {
     size_t pos;
     unsigned i, nbFiles;
@@ -436,7 +460,7 @@ UTIL_STATIC const char** UTIL_createFileList(const char **inputNames, unsigned i
                 nbFiles++;
             }
         } else {
-            nbFiles += UTIL_prepareFileList(inputNames[i], &buf, &pos, &bufend);
+            nbFiles += UTIL_prepareFileList(inputNames[i], &buf, &pos, &bufend, followLinks);
             if (buf == NULL) return NULL;
     }   }
 
@@ -465,6 +489,201 @@ UTIL_STATIC void UTIL_freeFileList(const char** filenameTable, char* allocatedBu
     if (filenameTable) free((void*)filenameTable);
 }
 
+/* count the number of physical cores */
+#if defined(_WIN32) || defined(WIN32)
+
+#include <windows.h>
+
+typedef BOOL(WINAPI* LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
+
+UTIL_STATIC int UTIL_countPhysicalCores(void)
+{
+    static int numPhysicalCores = 0;
+    if (numPhysicalCores != 0) return numPhysicalCores;
+
+    {   LPFN_GLPI glpi;
+        BOOL done = FALSE;
+        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
+        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = NULL;
+        DWORD returnLength = 0;
+        size_t byteOffset = 0;
+
+        glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")),
+                                         "GetLogicalProcessorInformation");
+
+        if (glpi == NULL) {
+            goto failed;
+        }
+
+        while(!done) {
+            DWORD rc = glpi(buffer, &returnLength);
+            if (FALSE == rc) {
+                if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
+                    if (buffer)
+                        free(buffer);
+                    buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(returnLength);
+
+                    if (buffer == NULL) {
+                        perror("zstd");
+                        exit(1);
+                    }
+                } else {
+                    /* some other error */
+                    goto failed;
+                }
+            } else {
+                done = TRUE;
+            }
+        }
+
+        ptr = buffer;
+
+        while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength) {
+
+            if (ptr->Relationship == RelationProcessorCore) {
+                numPhysicalCores++;
+            }
+
+            ptr++;
+            byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+        }
+
+        free(buffer);
+
+        return numPhysicalCores;
+    }
+
+failed:
+    /* try to fall back on GetSystemInfo */
+    {   SYSTEM_INFO sysinfo;
+        GetSystemInfo(&sysinfo);
+        numPhysicalCores = sysinfo.dwNumberOfProcessors;
+        if (numPhysicalCores == 0) numPhysicalCores = 1; /* just in case */
+    }
+    return numPhysicalCores;
+}
+
+#elif defined(__APPLE__)
+
+#include <sys/sysctl.h>
+
+/* Use apple-provided syscall
+ * see: man 3 sysctl */
+UTIL_STATIC int UTIL_countPhysicalCores(void)
+{
+    static S32 numPhysicalCores = 0; /* apple specifies int32_t */
+    if (numPhysicalCores != 0) return numPhysicalCores;
+
+    {   size_t size = sizeof(S32);
+        int const ret = sysctlbyname("hw.physicalcpu", &numPhysicalCores, &size, NULL, 0);
+        if (ret != 0) {
+            if (errno == ENOENT) {
+                /* entry not present, fall back on 1 */
+                numPhysicalCores = 1;
+            } else {
+                perror("zstd: can't get number of physical cpus");
+                exit(1);
+            }
+        }
+
+        return numPhysicalCores;
+    }
+}
+
+#elif defined(__linux__)
+
+/* parse /proc/cpuinfo
+ * siblings / cpu cores should give hyperthreading ratio
+ * otherwise fall back on sysconf */
+UTIL_STATIC int UTIL_countPhysicalCores(void)
+{
+    static int numPhysicalCores = 0;
+
+    if (numPhysicalCores != 0) return numPhysicalCores;
+
+    numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN);
+    if (numPhysicalCores == -1) {
+        /* value not queryable, fall back on 1 */
+        return numPhysicalCores = 1;
+    }
+
+    /* try to determine if there's hyperthreading */
+    {   FILE* const cpuinfo = fopen("/proc/cpuinfo", "r");
+        size_t const BUF_SIZE = 80;
+        char buff[BUF_SIZE];
+
+        int siblings = 0;
+        int cpu_cores = 0;
+        int ratio = 1;
+
+        if (cpuinfo == NULL) {
+            /* fall back on the sysconf value */
+            return numPhysicalCores;
+        }
+
+        /* assume the cpu cores/siblings values will be constant across all
+         * present processors */
+        while (!feof(cpuinfo)) {
+            if (fgets(buff, BUF_SIZE, cpuinfo) != NULL) {
+                if (strncmp(buff, "siblings", 8) == 0) {
+                    const char* const sep = strchr(buff, ':');
+                    if (*sep == '\0') {
+                        /* formatting was broken? */
+                        goto failed;
+                    }
+
+                    siblings = atoi(sep + 1);
+                }
+                if (strncmp(buff, "cpu cores", 9) == 0) {
+                    const char* const sep = strchr(buff, ':');
+                    if (*sep == '\0') {
+                        /* formatting was broken? */
+                        goto failed;
+                    }
+
+                    cpu_cores = atoi(sep + 1);
+                }
+            } else if (ferror(cpuinfo)) {
+                /* fall back on the sysconf value */
+                goto failed;
+            }
+        }
+        if (siblings && cpu_cores) {
+            ratio = siblings / cpu_cores;
+        }
+failed:
+        fclose(cpuinfo);
+        return numPhysicalCores = numPhysicalCores / ratio;
+    }
+}
+
+#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
+
+/* Use apple-provided syscall
+ * see: man 3 sysctl */
+UTIL_STATIC int UTIL_countPhysicalCores(void)
+{
+    static int numPhysicalCores = 0;
+
+    if (numPhysicalCores != 0) return numPhysicalCores;
+
+    numPhysicalCores = (int)sysconf(_SC_NPROCESSORS_ONLN);
+    if (numPhysicalCores == -1) {
+        /* value not queryable, fall back on 1 */
+        return numPhysicalCores = 1;
+    }
+    return numPhysicalCores;
+}
+
+#else
+
+UTIL_STATIC int UTIL_countPhysicalCores(void)
+{
+    /* assume 1 */
+    return 1;
+}
+
+#endif
 
 #if defined (__cplusplus)
 }
diff --git a/programs/zstd.1 b/programs/zstd.1
index 684fb868aa30..6cc5f7e9d2f6 100644
--- a/programs/zstd.1
+++ b/programs/zstd.1
@@ -1,408 +1,334 @@
-\"
-\" zstd.1: This is a manual page for 'zstd' program. This file is part of the
-\" zstd <http://www.zstd.net/> project.
-\" Author: Yann Collet
-\"
-
-\" No hyphenation
-.hy 0
-.nr HY 0
-
-.TH zstd "1" "2015-08-22" "zstd" "User Commands"
-.SH NAME
-\fBzstd, unzstd, zstdcat\fR - Compress or decompress .zst files
-
-.SH SYNOPSIS
-.TP 5
-\fBzstd\fR [\fBOPTIONS\fR] [-|INPUT-FILE] [-o <OUTPUT-FILE>]
-.PP
-.B unzstd
-is equivalent to
-.BR "zstd \-d"
-.br
-.B zstdcat
-is equivalent to
-.BR "zstd \-dcf"
-.br
-
-.SH DESCRIPTION
-.PP
-\fBzstd\fR is a fast lossless compression algorithm
-and data compression tool,
-with command line syntax similar to \fB gzip (1) \fR and \fB xz (1) \fR .
-It is based on the \fBLZ77\fR family, with further FSE & huff0 entropy stages.
-\fBzstd\fR offers highly configurable compression speed,
-with fast modes at > 200 MB/s per core,
-and strong modes nearing lzma compression ratios.
-It also features a very fast decoder, with speeds > 500 MB/s per core.
-
-\fBzstd\fR command line syntax is generally similar to gzip,
-but features the following differences :
- - Source files are preserved by default.
-   It's possible to remove them automatically by using \fB--rm\fR command.
- - When compressing a single file, \fBzstd\fR displays progress notifications and result summary by default.
-   Use \fB-q\fR to turn them off
-
-.PP
-.B zstd
-compresses or decompresses each
-.I file
-according to the selected operation mode.
-If no
-.I files
-are given or
-.I file
-is
-.BR \- ,
-.B zstd
-reads from standard input and writes the processed data
-to standard output.
-.B zstd
-will refuse (display an error and skip the
-.IR file )
-to write compressed data to standard output if it is a terminal.
-Similarly,
-.B zstd
-will refuse to read compressed data
-from standard input if it is a terminal.
-
-.PP
-Unless
-.B \-\-stdout
-or
-.B \-o
-is specified,
-.I files
-are written to a new file whose name is derived from the source
-.I file
-name:
-.IP \(bu 3
-When compressing, the suffix
-.B .zst
-is appended to the source filename to get the target filename.
-.IP \(bu 3
-When decompressing, the
-.B .zst
-suffix is removed from the filename to get the target filename.
-
-.SS "Concatenation with .zst files"
-It is possible to concatenate
-.B .zst
-files as is.
-.B zstd
-will decompress such files as if they were a single
-.B .zst
-file.
-
-
-
-.SH OPTIONS
-
+.
+.TH "ZSTD" "1" "May 2017" "zstd 1.2.0" "User Commands"
+.
+.SH "NAME"
+\fBzstd\fR \- zstd, zstdmt, unzstd, zstdcat \- Compress or decompress \.zst files
+.
+.SH "SYNOPSIS"
+\fBzstd\fR [\fIOPTIONS\fR] [\-|\fIINPUT\-FILE\fR] [\-o \fIOUTPUT\-FILE\fR]
+.
+.P
+\fBzstdmt\fR is equivalent to \fBzstd \-T0\fR
+.
+.P
+\fBunzstd\fR is equivalent to \fBzstd \-d\fR
+.
+.P
+\fBzstdcat\fR is equivalent to \fBzstd \-dcf\fR
+.
+.SH "DESCRIPTION"
+\fBzstd\fR is a fast lossless compression algorithm and data compression tool, with command line syntax similar to \fBgzip (1)\fR and \fBxz (1)\fR\. It is based on the \fBLZ77\fR family, with further FSE & huff0 entropy stages\. \fBzstd\fR offers highly configurable compression speed, with fast modes at > 200 MB/s per code, and strong modes nearing lzma compression ratios\. It also features a very fast decoder, with speeds > 500 MB/s per core\.
+.
+.P
+\fBzstd\fR command line syntax is generally similar to gzip, but features the following differences :
+.
+.IP "\(bu" 4
+Source files are preserved by default\. It\'s possible to remove them automatically by using the \fB\-\-rm\fR command\.
+.
+.IP "\(bu" 4
+When compressing a single file, \fBzstd\fR displays progress notifications and result summary by default\. Use \fB\-q\fR to turn them off\.
+.
+.IP "\(bu" 4
+\fBzstd\fR does not accept input from console, but it properly accepts \fBstdin\fR when it\'s not the console\.
+.
+.IP "\(bu" 4
+\fBzstd\fR displays a short help page when command line is an error\. Use \fB\-q\fR to turn it off\.
+.
+.IP "" 0
+.
+.P
+\fBzstd\fR compresses or decompresses each \fIfile\fR according to the selected operation mode\. If no \fIfiles\fR are given or \fIfile\fR is \fB\-\fR, \fBzstd\fR reads from standard input and writes the processed data to standard output\. \fBzstd\fR will refuse to write compressed data to standard output if it is a terminal : it will display an error message and skip the \fIfile\fR\. Similarly, \fBzstd\fR will refuse to read compressed data from standard input if it is a terminal\.
+.
+.P
+Unless \fB\-\-stdout\fR or \fB\-o\fR is specified, \fIfiles\fR are written to a new file whose name is derived from the source \fIfile\fR name:
+.
+.IP "\(bu" 4
+When compressing, the suffix \fB\.zst\fR is appended to the source filename to get the target filename\.
+.
+.IP "\(bu" 4
+When decompressing, the \fB\.zst\fR suffix is removed from the source filename to get the target filename
+.
+.IP "" 0
+.
+.SS "Concatenation with \.zst files"
+It is possible to concatenate \fB\.zst\fR files as is\. \fBzstd\fR will decompress such files as if they were a single \fB\.zst\fR file\.
+.
+.SH "OPTIONS"
 .
 .SS "Integer suffixes and special values"
-In most places where an integer argument is expected,
-an optional suffix is supported to easily indicate large integers.
-There must be no space between the integer and the suffix.
-.TP
-.B KiB
-Multiply the integer by 1,024 (2^10).
-.BR Ki ,
-.BR K ,
-and
-.B KB
-are accepted as synonyms for
-.BR KiB .
-.TP
-.B MiB
-Multiply the integer by 1,048,576 (2^20).
-.BR Mi ,
-.BR M ,
-and
-.B MB
-are accepted as synonyms for
-.BR MiB .
-
+In most places where an integer argument is expected, an optional suffix is supported to easily indicate large integers\. There must be no space between the integer and the suffix\.
+.
+.TP
+\fBKiB\fR
+Multiply the integer by 1,024 (2^10)\. \fBKi\fR, \fBK\fR, and \fBKB\fR are accepted as synonyms for \fBKiB\fR\.
+.
+.TP
+\fBMiB\fR
+Multiply the integer by 1,048,576 (2^20)\. \fBMi\fR, \fBM\fR, and \fBMB\fR are accepted as synonyms for \fBMiB\fR\.
 .
 .SS "Operation mode"
-If multiple operation mode options are given,
-the last one takes effect.
-.TP
-.BR \-z ", " \-\-compress
-Compress.
-This is the default operation mode when no operation mode option
-is specified and no other operation mode is implied from
-the command name (for example,
-.B unzstd
-implies
-.BR \-\-decompress ).
-.TP
-.BR \-d ", " \-\-decompress ", " \-\-uncompress
-Decompress.
-.TP
-.BR \-t ", " \-\-test
-Test the integrity of compressed
-.IR files .
-This option is equivalent to
-.B "\-\-decompress \-\-stdout"
-except that the decompressed data is discarded instead of being
-written to standard output.
-No files are created or removed.
-.TP
-.B \-b#
- benchmark file(s) using compression level #
-.TP
-.B \--train FILEs
- use FILEs as training set to create a dictionary. The training set should contain a lot of small files (> 100).
-
+If multiple operation mode options are given, the last one takes effect\.
+.
+.TP
+\fB\-z\fR, \fB\-\-compress\fR
+Compress\. This is the default operation mode when no operation mode option is specified and no other operation mode is implied from the command name (for example, \fBunzstd\fR implies \fB\-\-decompress\fR)\.
+.
+.TP
+\fB\-d\fR, \fB\-\-decompress\fR, \fB\-\-uncompress\fR
+Decompress\.
+.
+.TP
+\fB\-t\fR, \fB\-\-test\fR
+Test the integrity of compressed \fIfiles\fR\. This option is equivalent to \fB\-\-decompress \-\-stdout\fR except that the decompressed data is discarded instead of being written to standard output\. No files are created or removed\.
+.
+.TP
+\fB\-b#\fR
+Benchmark file(s) using compression level #
+.
+.TP
+\fB\-\-train FILEs\fR
+Use FILEs as a training set to create a dictionary\. The training set should contain a lot of small files (> 100)\.
 .
 .SS "Operation modifiers"
+.
 .TP
-.B \-#
- # compression level [1-19] (default:3)
-.TP
-.BR \--ultra
- unlocks high compression levels 20+ (maximum 22), using a lot more memory.
-Note that decompression will also require more memory when using these levels.
-.TP
-.B \-D file
- use `file` as Dictionary to compress or decompress FILE(s)
-.TP
-.BR \--no-dictID
- do not store dictionary ID within frame header (dictionary compression).
- The decoder will have to rely on implicit knowledge about which dictionary to use,
-it won't be able to check if it's correct.
-.TP
-.B \-o file
- save result into `file` (only possible with a single INPUT-FILE)
-.TP
-.BR \-f ", " --force
- overwrite output without prompting
-.TP
-.BR \-c ", " --stdout
- force write to standard output, even if it is the console
-.TP
-.BR \--[no-]sparse
- enable / disable sparse FS support, to make files with many zeroes smaller on disk.
- Creating sparse files may save disk space and speed up the decompression
-by reducing the amount of disk I/O.
- default : enabled when output is into a file, and disabled when output is stdout.
- This setting overrides default and can force sparse mode over stdout.
-.TP
-.BR \--rm
- remove source file(s) after successful compression or decompression
-.TP
-.BR \-k ", " --keep
- keep source file(s) after successful compression or decompression.
- This is the default behavior.
-.TP
-.BR \-r
- operate recursively on directories
-.TP
-.BR \-h/\-H ", " --help
- display help/long help and exit
-.TP
-.BR \-V ", " --version
- display Version number and exit
-.TP
-.BR \-v ", " --verbose
- verbose mode
-.TP
-.BR \-q ", " --quiet
- suppress warnings, interactivity and notifications.
- specify twice to suppress errors too.
-.TP
-.BR \-C ", " --[no-]check
- add integrity check computed from uncompressed data (default : enabled)
-.TP
-.BR \-t ", " --test
- Test the integrity of compressed files. This option is equivalent to \fB--decompress --stdout > /dev/null\fR.
- No files are created or removed.
-.TP
-.BR --
- All arguments after -- are treated as files
-
-
-.SH DICTIONARY BUILDER
-.PP
-\fBzstd\fR offers \fIdictionary\fR compression, useful for very small files and messages.
-It's possible to train \fBzstd\fR with some samples, the result of which is saved into a file called `dictionary`.
-Then during compression and decompression, make reference to the same dictionary.
-It will improve compression ratio of small files.
-Typical gains range from ~10% (at 64KB) to x5 better (at <1KB).
-.TP
-.B \--train FILEs
- use FILEs as training set to create a dictionary. The training set should contain a lot of small files (> 100),
-and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)
-.TP
-.B \-o file
- dictionary saved into `file` (default: dictionary)
-.TP
-.B \--maxdict #
- limit dictionary to specified size (default : 112640)
-.TP
-.B \--dictID #
- A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary.
- By default, zstd will create a 4-bytes random number ID.
- It's possible to give a precise number instead.
- Short numbers have an advantage : an ID < 256 will only need 1 byte in the compressed frame header,
- and an ID < 65536 will only need 2 bytes. This compares favorably to 4 bytes default.
- However, it's up to the dictionary manager to not assign twice the same ID to 2 different dictionaries.
-.TP
-.B \-s#
- dictionary selectivity level (default: 9)
- the smaller the value, the denser the dictionary, improving its efficiency but reducing its possible maximum size.
-.TP
-.B \--cover=k=#,d=#
- Use alternate dictionary builder algorithm named cover with parameters \fIk\fR and \fId\fR with \fId\fR <= \fIk\fR.
- Selects segments of size \fIk\fR with the highest score to put in the dictionary.
- The score of a segment is computed by the sum of the frequencies of all the subsegments of of size \fId\fR.
- Generally \fId\fR should be in the range [6, 24].
- Good values for \fIk\fR vary widely based on the input data, but a safe range is [32, 2048].
- Example: \fB--train --cover=k=64,d=8 FILEs\fR.
-.TP
-.B \--optimize-cover[=steps=#,k=#,d=#]
- If \fIsteps\fR is not specified, the default value of 32 is used.
- If \fIk\fR is not specified, \fIsteps\fR values in [16, 2048] are checked for each value of \fId\fR.
- If \fId\fR is not specified, the values checked are [6, 8, ..., 16].
-
- Runs the cover dictionary builder for each parameter set saves the optimal parameters and dictionary.
- Prints the optimal parameters and writes the optimal dictionary to the output file.
- Supports multithreading if \fBzstd\fR is compiled with threading support.
-
- The parameter \fIk\fR is more sensitve than \fId\fR, and is faster to optimize over.
- Suggested use is to run with a \fIsteps\fR <= 32 with neither \fIk\fR nor \fId\fR set.
- Once it completes, use the value of \fId\fR it selects with a higher \fIsteps\fR (in the range [256, 1024]).
- \fBzstd --train --optimize-cover FILEs
- \fBzstd --train --optimize-cover=d=d,steps=512 FILEs
-.TP
-
-.SH BENCHMARK
-.TP
-.B \-b#
- benchmark file(s) using compression level #
-.TP
-.B \-e#
- benchmark file(s) using multiple compression levels, from -b# to -e# (included).
-.TP
-.B \-i#
- minimum evaluation time, in seconds (default : 3s), benchmark mode only
-.TP
-.B \-B#
- cut file into independent blocks of size # (default: no block)
-.B \--priority=rt
- set process priority to real-time
-
-.SH ADVANCED COMPRESSION OPTIONS
-.TP
-.B \--zstd[=\fIoptions\fR]
-.PD
-\fBzstd\fR provides 22 predefined compression levels. The selected or default predefined compression level can be changed with advanced compression options.
-The \fIoptions\fR are provided as a comma-separated list. You may specify only the \fIoptions\fR you want to change and the rest will be taken from the selected or default compression level.
-The list of available \fIoptions\fR:
-.RS
-
-.TP
-.BI strategy= strat
-.PD 0
-.TP
-.BI strat= strat
-.PD
-Specify a strategy used by a match finder.
-.IP ""
-There are 8 strategies numbered from 0 to 7, from faster to stronger:
-0=ZSTD_fast, 1=ZSTD_dfast, 2=ZSTD_greedy, 3=ZSTD_lazy, 4=ZSTD_lazy2, 5=ZSTD_btlazy2, 6=ZSTD_btopt, 7=ZSTD_btopt2.
-.IP ""
-
-.TP
-.BI windowLog= wlog
-.PD 0
-.TP
-.BI wlog= wlog
-.PD
-Specify the maximum number of bits for a match distance.
-.IP ""
-The higher number of bits increases the chance to find a match what usually improves compression ratio.
-It also increases memory requirements for compressor and decompressor.
-.IP ""
-The minimum \fIwlog\fR is 10 (1 KiB) and the maximum is 25 (32 MiB) for 32-bit compilation and 27 (128 MiB) for 64-bit compilation.
-.IP ""
-
-.TP
-.BI hashLog= hlog
-.PD 0
-.TP
-.BI hlog= hlog
-.PD
-Specify the maximum number of bits for a hash table.
-.IP ""
-The bigger hash table causes less collisions what usually make compression faster but requires more memory during compression.
-.IP ""
-The minimum \fIhlog\fR is 6 (64 B) and the maximum is 25 (32 MiB) for 32-bit compilation and 27 (128 MiB) for 64-bit compilation.
-
-.TP
-.BI chainLog= clog
-.PD 0
-.TP
-.BI clog= clog
-.PD
-Specify the maximum number of bits for a hash chain or a binary tree.
-.IP ""
-The higher number of bits increases the chance to find a match what usually improves compression ratio.
-It also slows down compression speed and increases memory requirements for compression.
-This option is ignored for the ZSTD_fast strategy.
-.IP ""
-The minimum \fIclog\fR is 6 (64 B) and the maximum is 26 (64 MiB) for 32-bit compilation and 28 (256 MiB) for 64-bit compilation.
-.IP ""
-
-.TP
-.BI searchLog= slog
-.PD 0
-.TP
-.BI slog= slog
-.PD
-Specify the maximum number of searches in a hash chain or a binary tree using logarithmic scale.
-.IP ""
-The bigger number of searches increases the chance to find a match what usually improves compression ratio but decreases compression speed.
-.IP ""
-The minimum \fIslog\fR is 1 and the maximum is 24 for 32-bit compilation and 26 for 64-bit compilation.
-.IP ""
-
-.TP
-.BI searchLength= slen
-.PD 0
-.TP
-.BI slen= slen
-.PD
-Specify the minimum searched length of a match in a hash table.
-.IP ""
-The bigger search length usually decreases compression ratio but improves decompression speed.
-.IP ""
-The minimum \fIslen\fR is 3 and the maximum is 7.
-.IP ""
-
-.TP
-.BI targetLength= tlen
-.PD 0
-.TP
-.BI tlen= tlen
-.PD
-Specify the minimum match length that causes a match finder to interrupt searching of better matches.
-.IP ""
-The bigger minimum match length usually improves compression ratio but decreases compression speed.
-This option is used only with ZSTD_btopt and ZSTD_btopt2 strategies.
-.IP ""
-The minimum \fItlen\fR is 4 and the maximum is 999.
-.IP ""
-
-.PP
-.B An example
-.br
-The following parameters sets advanced compression options to predefined level 19 for files bigger than 256 KB:
-.IP ""
-\fB--zstd=\fRwindowLog=23,chainLog=23,hashLog=22,searchLog=6,searchLength=3,targetLength=48,strategy=6
-
-.SH BUGS
-Report bugs at:- https://github.com/facebook/zstd/issues
-
-.SH AUTHOR
+\fB\-#\fR
+\fB#\fR compression level [1\-19] (default: 3)
+.
+.TP
+\fB\-\-ultra\fR
+unlocks high compression levels 20+ (maximum 22), using a lot more memory\. Note that decompression will also require more memory when using these levels\.
+.
+.TP
+\fB\-T#\fR, \fB\-\-threads=#\fR
+Compress using \fB#\fR threads (default: 1)\. If \fB#\fR is 0, attempt to detect and use the number of physical CPU cores\. This modifier does nothing if \fBzstd\fR is compiled without multithread support\.
+.
+.TP
+\fB\-D file\fR
+use \fBfile\fR as Dictionary to compress or decompress FILE(s)
+.
+.TP
+\fB\-\-nodictID\fR
+do not store dictionary ID within frame header (dictionary compression)\. The decoder will have to rely on implicit knowledge about which dictionary to use, it won\'t be able to check if it\'s correct\.
+.
+.TP
+\fB\-o file\fR
+save result into \fBfile\fR (only possible with a single \fIINPUT\-FILE\fR)
+.
+.TP
+\fB\-f\fR, \fB\-\-force\fR
+overwrite output without prompting, and (de)compress symbolic links
+.
+.TP
+\fB\-c\fR, \fB\-\-stdout\fR
+force write to standard output, even if it is the console
+.
+.TP
+\fB\-\-[no\-]sparse\fR
+enable / disable sparse FS support, to make files with many zeroes smaller on disk\. Creating sparse files may save disk space and speed up decompression by reducing the amount of disk I/O\. default : enabled when output is into a file, and disabled when output is stdout\. This setting overrides default and can force sparse mode over stdout\.
+.
+.TP
+\fB\-\-rm\fR
+remove source file(s) after successful compression or decompression
+.
+.TP
+\fB\-k\fR, \fB\-\-keep\fR
+keep source file(s) after successful compression or decompression\. This is the default behavior\.
+.
+.TP
+\fB\-r\fR
+operate recursively on dictionaries
+.
+.TP
+\fB\-h\fR/\fB\-H\fR, \fB\-\-help\fR
+display help/long help and exit
+.
+.TP
+\fB\-V\fR, \fB\-\-version\fR
+display version number and exit
+.
+.TP
+\fB\-v\fR
+verbose mode
+.
+.TP
+\fB\-q\fR, \fB\-\-quiet\fR
+suppress warnings, interactivity, and notifications\. specify twice to suppress errors too\.
+.
+.TP
+\fB\-C\fR, \fB\-\-[no\-]check\fR
+add integrity check computed from uncompressed data (default : enabled)
+.
+.TP
+\fB\-\-\fR
+All arguments after \fB\-\-\fR are treated as files
+.
+.SH "DICTIONARY BUILDER"
+\fBzstd\fR offers \fIdictionary\fR compression, useful for very small files and messages\. It\'s possible to train \fBzstd\fR with some samples, the result of which is saved into a file called a \fBdictionary\fR\. Then during compression and decompression, reference the same dictionary\. It will improve compression ratio of small files\. Typical gains range from 10% (at 64KB) to x5 better (at <1KB)\.
+.
+.TP
+\fB\-\-train FILEs\fR
+Use FILEs as training set to create a dictionary\. The training set should contain a lot of small files (> 100), and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)\.
+.
+.IP
+Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-cover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. Equivalent to \fB\-\-train\-cover=d=8,steps=4\fR\.
+.
+.TP
+\fB\-o file\fR
+Dictionary saved into \fBfile\fR (default name: dictionary)\.
+.
+.TP
+\fB\-\-maxdict=#\fR
+Limit dictionary to specified size (default: 112640)\.
+.
+.TP
+\fB\-\-dictID=#\fR
+A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary\. By default, zstd will create a 4\-bytes random number ID\. It\'s possible to give a precise number instead\. Short numbers have an advantage : an ID < 256 will only need 1 byte in the compressed frame header, and an ID < 65536 will only need 2 bytes\. This compares favorably to 4 bytes default\. However, it\'s up to the dictionary manager to not assign twice the same ID to 2 different dictionaries\.
+.
+.TP
+\fB\-\-train\-cover[=k#,d=#,steps=#]\fR
+Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. Requires that \fId\fR <= \fIk\fR\.
+.
+.IP
+Selects segments of size \fIk\fR with highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], occasionally up to 16, but the algorithm will run faster with d <= \fI8\fR\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [2 * \fId\fR, 2000]\. Supports multithreading if \fBzstd\fR is compiled with threading support\.
+.
+.IP
+Examples:
+.
+.IP
+\fBzstd \-\-train\-cover FILEs\fR
+.
+.IP
+\fBzstd \-\-train\-cover=k=50,d=8 FILEs\fR
+.
+.IP
+\fBzstd \-\-train\-cover=d=8,steps=500 FILEs\fR
+.
+.IP
+\fBzstd \-\-train\-cover=k=50 FILEs\fR
+.
+.TP
+\fB\-\-train\-legacy[=selectivity=#]\fR
+Use legacy dictionary builder algorithm with the given dictionary \fIselectivity\fR (default: 9)\. The smaller the \fIselectivity\fR value, the denser the dictionary, improving its efficiency but reducing its possible maximum size\. \fB\-\-train\-legacy=s=#\fR is also accepted\.
+.
+.IP
+Examples:
+.
+.IP
+\fBzstd \-\-train\-legacy FILEs\fR
+.
+.IP
+\fBzstd \-\-train\-legacy=selectivity=8 FILEs\fR
+.
+.SH "BENCHMARK"
+.
+.TP
+\fB\-b#\fR
+benchmark file(s) using compression level #
+.
+.TP
+\fB\-e#\fR
+benchmark file(s) using multiple compression levels, from \fB\-b#\fR to \fB\-e#\fR (inclusive)
+.
+.TP
+\fB\-i#\fR
+minimum evaluation time, in seconds (default : 3s), benchmark mode only
+.
+.TP
+\fB\-B#\fR
+cut file into independent blocks of size # (default: no block)
+.
+.TP
+\fB\-\-priority=rt\fR
+set process priority to real\-time
+.
+.SH "ADVANCED COMPRESSION OPTIONS"
+.
+.SS "\-\-zstd[=options]:"
+\fBzstd\fR provides 22 predefined compression levels\. The selected or default predefined compression level can be changed with advanced compression options\. The \fIoptions\fR are provided as a comma\-separated list\. You may specify only the options you want to change and the rest will be taken from the selected or default compression level\. The list of available \fIoptions\fR:
+.
+.TP
+\fBstrategy\fR=\fIstrat\fR, \fBstrat\fR=\fIstrat\fR
+Specify a strategy used by a match finder\.
+.
+.IP
+There are 8 strategies numbered from 0 to 7, from faster to stronger: 0=ZSTD_fast, 1=ZSTD_dfast, 2=ZSTD_greedy, 3=ZSTD_lazy, 4=ZSTD_lazy2, 5=ZSTD_btlazy2, 6=ZSTD_btopt, 7=ZSTD_btopt2\.
+.
+.TP
+\fBwindowLog\fR=\fIwlog\fR, \fBwlog\fR=\fIwlog\fR
+Specify the maximum number of bits for a match distance\.
+.
+.IP
+The higher number of increases the chance to find a match which usually improves compression ratio\. It also increases memory requirements for the compressor and decompressor\. The minimum \fIwlog\fR is 10 (1 KiB) and the maximum is 27 (128 MiB)\.
+.
+.TP
+\fBhashLog\fR=\fIhlog\fR, \fBhlog\fR=\fIhlog\fR
+Specify the maximum number of bits for a hash table\.
+.
+.IP
+Bigger hash tables cause less collisions which usually makes compression faster, but requires more memory during compression\.
+.
+.IP
+The minimum \fIhlog\fR is 6 (64 B) and the maximum is 26 (128 MiB)\.
+.
+.TP
+\fBchainLog\fR=\fIclog\fR, \fBclog\fR=\fIclog\fR
+Specify the maximum number of bits for a hash chain or a binary tree\.
+.
+.IP
+Higher numbers of bits increases the chance to find a match which usually improves compression ratio\. It also slows down compression speed and increases memory requirements for compression\. This option is ignored for the ZSTD_fast strategy\.
+.
+.IP
+The minimum \fIclog\fR is 6 (64 B) and the maximum is 28 (256 MiB)\.
+.
+.TP
+\fBsearchLog\fR=\fIslog\fR, \fBslog\fR=\fIslog\fR
+Specify the maximum number of searches in a hash chain or a binary tree using logarithmic scale\.
+.
+.IP
+More searches increases the chance to find a match which usually increases compression ratio but decreases compression speed\.
+.
+.IP
+The minimum \fIslog\fR is 1 and the maximum is 26\.
+.
+.TP
+\fBsearchLength\fR=\fIslen\fR, \fBslen\fR=\fIslen\fR
+Specify the minimum searched length of a match in a hash table\.
+.
+.IP
+Larger search lengths usually decrease compression ratio but improve decompression speed\.
+.
+.IP
+The minimum \fIslen\fR is 3 and the maximum is 7\.
+.
+.TP
+\fBtargetLen\fR=\fItlen\fR, \fBtlen\fR=\fItlen\fR
+Specify the minimum match length that causes a match finder to stop searching for better matches\.
+.
+.IP
+A larger minimum match length usually improves compression ratio but decreases compression speed\. This option is only used with strategies ZSTD_btopt and ZSTD_btopt2\.
+.
+.IP
+The minimum \fItlen\fR is 4 and the maximum is 999\.
+.
+.TP
+\fBoverlapLog\fR=\fIovlog\fR, \fBovlog\fR=\fIovlog\fR
+Determine \fBoverlapSize\fR, amount of data reloaded from previous job\. This parameter is only available when multithreading is enabled\. Reloading more data improves compression ratio, but decreases speed\.
+.
+.IP
+The minimum \fIovlog\fR is 0, and the maximum is 9\. 0 means "no overlap", hence completely independent jobs\. 9 means "full overlap", meaning up to \fBwindowSize\fR is reloaded from previous job\. Reducing \fIovlog\fR by 1 reduces the amount of reload by a factor 2\. Default \fIovlog\fR is 6, which means "reload \fBwindowSize / 8\fR"\. Exception : the maximum compression level (22) has a default \fIovlog\fR of 9\.
+.
+.SS "\-B#:"
+Select the size of each compression job\. This parameter is available only when multi\-threading is enabled\. Default value is \fB4 * windowSize\fR, which means it varies depending on compression level\. \fB\-B#\fR makes it possible to select a custom value\. Note that job size must respect a minimum value which is enforced transparently\. This minimum is either 1 MB, or \fBoverlapSize\fR, whichever is largest\.
+.
+.SS "Example"
+The following parameters sets advanced compression options to those of predefined level 19 for files bigger than 256 KB:
+.
+.P
+\fB\-\-zstd\fR=windowLog=23,chainLog=23,hashLog=22,searchLog=6,searchLength=3,targetLength=48,strategy=6
+.
+.SH "BUGS"
+Report bugs at: https://github\.com/facebook/zstd/issues
+.
+.SH "AUTHOR"
 Yann Collet
diff --git a/programs/zstd.1.md b/programs/zstd.1.md
new file mode 100644
index 000000000000..118c9f2f8ee5
--- /dev/null
+++ b/programs/zstd.1.md
@@ -0,0 +1,343 @@
+zstd(1) -- zstd, zstdmt, unzstd, zstdcat - Compress or decompress .zst files
+============================================================================
+
+SYNOPSIS
+--------
+
+`zstd` [*OPTIONS*] [-|_INPUT-FILE_] [-o _OUTPUT-FILE_]
+
+`zstdmt` is equivalent to `zstd -T0`
+
+`unzstd` is equivalent to `zstd -d`
+
+`zstdcat` is equivalent to `zstd -dcf`
+
+
+DESCRIPTION
+-----------
+`zstd` is a fast lossless compression algorithm and data compression tool,
+with command line syntax similar to `gzip (1)` and `xz (1)`.
+It is based on the **LZ77** family, with further FSE & huff0 entropy stages.
+`zstd` offers highly configurable compression speed,
+with fast modes at > 200 MB/s per code,
+and strong modes nearing lzma compression ratios.
+It also features a very fast decoder, with speeds > 500 MB/s per core.
+
+`zstd` command line syntax is generally similar to gzip,
+but features the following differences :
+
+  - Source files are preserved by default.
+    It's possible to remove them automatically by using the `--rm` command.
+  - When compressing a single file, `zstd` displays progress notifications
+    and result summary by default.
+    Use `-q` to turn them off.
+  - `zstd` does not accept input from console,
+    but it properly accepts `stdin` when it's not the console.
+  - `zstd` displays a short help page when command line is an error.
+    Use `-q` to turn it off.
+
+`zstd` compresses or decompresses each _file_ according to the selected
+operation mode.
+If no _files_ are given or _file_ is `-`, `zstd` reads from standard input
+and writes the processed data to standard output.
+`zstd` will refuse to write compressed data to standard output
+if it is a terminal : it will display an error message and skip the _file_.
+Similarly, `zstd` will refuse to read compressed data from standard input
+if it is a terminal.
+
+Unless `--stdout` or `-o` is specified, _files_ are written to a new file
+whose name is derived from the source _file_ name:
+
+* When compressing, the suffix `.zst` is appended to the source filename to
+  get the target filename.
+* When decompressing, the `.zst` suffix is removed from the source filename to
+  get the target filename
+
+### Concatenation with .zst files
+It is possible to concatenate `.zst` files as is.
+`zstd` will decompress such files as if they were a single `.zst` file.
+
+OPTIONS
+-------
+
+### Integer suffixes and special values
+In most places where an integer argument is expected,
+an optional suffix is supported to easily indicate large integers.
+There must be no space between the integer and the suffix.
+
+* `KiB`:
+    Multiply the integer by 1,024 (2\^10).
+    `Ki`, `K`, and `KB` are accepted as synonyms for `KiB`.
+* `MiB`:
+    Multiply the integer by 1,048,576 (2\^20).
+    `Mi`, `M`, and `MB` are accepted as synonyms for `MiB`.
+
+### Operation mode
+If multiple operation mode options are given,
+the last one takes effect.
+
+* `-z`, `--compress`:
+    Compress.
+    This is the default operation mode when no operation mode option is specified
+    and no other operation mode is implied from the command name
+    (for example, `unzstd` implies `--decompress`).
+* `-d`, `--decompress`, `--uncompress`:
+    Decompress.
+* `-t`, `--test`:
+    Test the integrity of compressed _files_.
+    This option is equivalent to `--decompress --stdout` except that the
+    decompressed data is discarded instead of being written to standard output.
+    No files are created or removed.
+* `-b#`:
+    Benchmark file(s) using compression level #
+* `--train FILEs`:
+    Use FILEs as a training set to create a dictionary.
+    The training set should contain a lot of small files (> 100).
+
+### Operation modifiers
+
+* `-#`:
+    `#` compression level \[1-19] (default: 3)
+* `--ultra`:
+    unlocks high compression levels 20+ (maximum 22), using a lot more memory.
+    Note that decompression will also require more memory when using these levels.
+* `-T#`, `--threads=#`:
+    Compress using `#` threads (default: 1).
+    If `#` is 0, attempt to detect and use the number of physical CPU cores.
+    This modifier does nothing if `zstd` is compiled without multithread support.
+* `-D file`:
+    use `file` as Dictionary to compress or decompress FILE(s)
+* `--nodictID`:
+    do not store dictionary ID within frame header (dictionary compression).
+    The decoder will have to rely on implicit knowledge about which dictionary to use,
+    it won't be able to check if it's correct.
+* `-o file`:
+    save result into `file` (only possible with a single _INPUT-FILE_)
+* `-f`, `--force`:
+    overwrite output without prompting, and (de)compress symbolic links
+* `-c`, `--stdout`:
+    force write to standard output, even if it is the console
+* `--[no-]sparse`:
+    enable / disable sparse FS support,
+    to make files with many zeroes smaller on disk.
+    Creating sparse files may save disk space and speed up decompression by
+    reducing the amount of disk I/O.
+    default : enabled when output is into a file,
+    and disabled when output is stdout.
+    This setting overrides default and can force sparse mode over stdout.
+* `--rm`:
+    remove source file(s) after successful compression or decompression
+* `-k`, `--keep`:
+    keep source file(s) after successful compression or decompression.
+    This is the default behavior.
+* `-r`:
+    operate recursively on dictionaries
+* `-h`/`-H`, `--help`:
+    display help/long help and exit
+* `-V`, `--version`:
+    display version number and exit
+* `-v`:
+    verbose mode
+* `-q`, `--quiet`:
+    suppress warnings, interactivity, and notifications.
+    specify twice to suppress errors too.
+* `-C`, `--[no-]check`:
+    add integrity check computed from uncompressed data (default : enabled)
+* `--`:
+    All arguments after `--` are treated as files
+
+
+DICTIONARY BUILDER
+------------------
+`zstd` offers _dictionary_ compression,
+useful for very small files and messages.
+It's possible to train `zstd` with some samples,
+the result of which is saved into a file called a `dictionary`.
+Then during compression and decompression, reference the same dictionary.
+It will improve compression ratio of small files.
+Typical gains range from 10% (at 64KB) to x5 better (at <1KB).
+
+* `--train FILEs`:
+    Use FILEs as training set to create a dictionary.
+    The training set should contain a lot of small files (> 100),
+    and weight typically 100x the target dictionary size
+    (for example, 10 MB for a 100 KB dictionary).
+
+    Supports multithreading if `zstd` is compiled with threading support.
+    Additional parameters can be specified with `--train-cover`.
+    The legacy dictionary builder can be accessed with `--train-legacy`.
+    Equivalent to `--train-cover=d=8,steps=4`.
+* `-o file`:
+    Dictionary saved into `file` (default name: dictionary).
+* `--maxdict=#`:
+    Limit dictionary to specified size (default: 112640).
+* `--dictID=#`:
+    A dictionary ID is a locally unique ID that a decoder can use to verify it is
+    using the right dictionary.
+    By default, zstd will create a 4-bytes random number ID.
+    It's possible to give a precise number instead.
+    Short numbers have an advantage : an ID < 256 will only need 1 byte in the
+    compressed frame header, and an ID < 65536 will only need 2 bytes.
+    This compares favorably to 4 bytes default.
+    However, it's up to the dictionary manager to not assign twice the same ID to
+    2 different dictionaries.
+* `--train-cover[=k#,d=#,steps=#]`:
+    Select parameters for the default dictionary builder algorithm named cover.
+    If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8.
+    If _k_ is not specified, then it tries _steps_ values in the range [50, 2000].
+    If _steps_ is not specified, then the default value of 40 is used.
+    Requires that _d_ <= _k_.
+
+    Selects segments of size _k_ with highest score to put in the dictionary.
+    The score of a segment is computed by the sum of the frequencies of all the
+    subsegments of size _d_.
+    Generally _d_ should be in the range [6, 8], occasionally up to 16, but the
+    algorithm will run faster with d <= _8_.
+    Good values for _k_ vary widely based on the input data, but a safe range is
+    [2 * _d_, 2000].
+    Supports multithreading if `zstd` is compiled with threading support.
+
+    Examples:
+
+    `zstd --train-cover FILEs`
+
+    `zstd --train-cover=k=50,d=8 FILEs`
+
+    `zstd --train-cover=d=8,steps=500 FILEs`
+
+    `zstd --train-cover=k=50 FILEs`
+
+* `--train-legacy[=selectivity=#]`:
+    Use legacy dictionary builder algorithm with the given dictionary
+    _selectivity_ (default: 9).
+    The smaller the _selectivity_ value, the denser the dictionary,
+    improving its efficiency but reducing its possible maximum size.
+    `--train-legacy=s=#` is also accepted.
+
+    Examples:
+
+    `zstd --train-legacy FILEs`
+
+    `zstd --train-legacy=selectivity=8 FILEs`
+
+
+BENCHMARK
+---------
+
+* `-b#`:
+    benchmark file(s) using compression level #
+* `-e#`:
+    benchmark file(s) using multiple compression levels, from `-b#` to `-e#` (inclusive)
+* `-i#`:
+    minimum evaluation time, in seconds (default : 3s), benchmark mode only
+* `-B#`:
+    cut file into independent blocks of size # (default: no block)
+* `--priority=rt`:
+    set process priority to real-time
+
+
+ADVANCED COMPRESSION OPTIONS
+----------------------------
+### --zstd[=options]:
+`zstd` provides 22 predefined compression levels.
+The selected or default predefined compression level can be changed with
+advanced compression options.
+The _options_ are provided as a comma-separated list.
+You may specify only the options you want to change and the rest will be
+taken from the selected or default compression level.
+The list of available _options_:
+
+- `strategy`=_strat_, `strat`=_strat_:
+    Specify a strategy used by a match finder.
+
+    There are 8 strategies numbered from 0 to 7, from faster to stronger:
+    0=ZSTD\_fast, 1=ZSTD\_dfast, 2=ZSTD\_greedy, 3=ZSTD\_lazy,
+    4=ZSTD\_lazy2, 5=ZSTD\_btlazy2, 6=ZSTD\_btopt, 7=ZSTD\_btopt2.
+
+- `windowLog`=_wlog_, `wlog`=_wlog_:
+    Specify the maximum number of bits for a match distance.
+
+    The higher number of increases the chance to find a match which usually
+    improves compression ratio.
+    It also increases memory requirements for the compressor and decompressor.
+    The minimum _wlog_ is 10 (1 KiB) and the maximum is 27 (128 MiB).
+
+- `hashLog`=_hlog_, `hlog`=_hlog_:
+    Specify the maximum number of bits for a hash table.
+
+    Bigger hash tables cause less collisions which usually makes compression
+    faster, but requires more memory during compression.
+
+    The minimum _hlog_ is 6 (64 B) and the maximum is 26 (128 MiB).
+
+- `chainLog`=_clog_, `clog`=_clog_:
+    Specify the maximum number of bits for a hash chain or a binary tree.
+
+    Higher numbers of bits increases the chance to find a match which usually
+    improves compression ratio.
+    It also slows down compression speed and increases memory requirements for
+    compression.
+    This option is ignored for the ZSTD_fast strategy.
+
+    The minimum _clog_ is 6 (64 B) and the maximum is 28 (256 MiB).
+
+- `searchLog`=_slog_, `slog`=_slog_:
+    Specify the maximum number of searches in a hash chain or a binary tree
+    using logarithmic scale.
+
+    More searches increases the chance to find a match which usually increases
+    compression ratio but decreases compression speed.
+
+    The minimum _slog_ is 1 and the maximum is 26.
+
+- `searchLength`=_slen_, `slen`=_slen_:
+    Specify the minimum searched length of a match in a hash table.
+
+    Larger search lengths usually decrease compression ratio but improve
+    decompression speed.
+
+    The minimum _slen_ is 3 and the maximum is 7.
+
+- `targetLen`=_tlen_, `tlen`=_tlen_:
+    Specify the minimum match length that causes a match finder to stop
+    searching for better matches.
+
+    A larger minimum match length usually improves compression ratio but
+    decreases compression speed.
+    This option is only used with strategies ZSTD_btopt and ZSTD_btopt2.
+
+    The minimum _tlen_ is 4 and the maximum is 999.
+
+- `overlapLog`=_ovlog_,  `ovlog`=_ovlog_:
+    Determine `overlapSize`, amount of data reloaded from previous job.
+    This parameter is only available when multithreading is enabled.
+    Reloading more data improves compression ratio, but decreases speed.
+
+    The minimum _ovlog_ is 0, and the maximum is 9.
+    0 means "no overlap", hence completely independent jobs.
+    9 means "full overlap", meaning up to `windowSize` is reloaded from previous job.
+    Reducing _ovlog_ by 1 reduces the amount of reload by a factor 2.
+    Default _ovlog_ is 6, which means "reload `windowSize / 8`".
+    Exception : the maximum compression level (22) has a default _ovlog_ of 9.
+
+### -B#:
+Select the size of each compression job.
+This parameter is available only when multi-threading is enabled.
+Default value is `4 * windowSize`, which means it varies depending on compression level.
+`-B#` makes it possible to select a custom value.
+Note that job size must respect a minimum value which is enforced transparently.
+This minimum is either 1 MB, or `overlapSize`, whichever is largest.
+
+### Example
+The following parameters sets advanced compression options to those of
+predefined level 19 for files bigger than 256 KB:
+
+`--zstd`=windowLog=23,chainLog=23,hashLog=22,searchLog=6,searchLength=3,targetLength=48,strategy=6
+
+BUGS
+----
+Report bugs at: https://github.com/facebook/zstd/issues
+
+AUTHOR
+------
+Yann Collet
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index ae49da7b1e72..32fef99930e9 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -49,6 +49,7 @@
 #define AUTHOR "Yann Collet"
 #define WELCOME_MESSAGE "*** %s %i-bits %s, by %s ***\n", COMPRESSOR_NAME, (int)(sizeof(size_t)*8), ZSTD_VERSION, AUTHOR
 
+#define ZSTD_ZSTDMT "zstdmt"
 #define ZSTD_UNZSTD "unzstd"
 #define ZSTD_CAT "zstdcat"
 #define ZSTD_GZ "gzip"
@@ -74,10 +75,10 @@ static U32 g_overlapLog = OVERLAP_LOG_DEFAULT;
 /*-************************************
 *  Display Macros
 **************************************/
-#define DISPLAY(...)           fprintf(displayOut, __VA_ARGS__)
-#define DISPLAYLEVEL(l, ...)   if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
-static FILE* displayOut;
-static unsigned displayLevel = DEFAULT_DISPLAY_LEVEL;   /* 0 : no display,  1: errors,  2 : + result + interaction + warnings,  3 : + progression,  4 : + information */
+#define DISPLAY(...)         fprintf(g_displayOut, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) { if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } }
+static int g_displayLevel = DEFAULT_DISPLAY_LEVEL;   /* 0 : no display,  1: errors,  2 : + result + interaction + warnings,  3 : + progression,  4 : + information */
+static FILE* g_displayOut;
 
 
 /*-************************************
@@ -99,7 +100,7 @@ static int usage(const char* programName)
 #endif
     DISPLAY( " -D file: use `file` as Dictionary \n");
     DISPLAY( " -o file: result stored into `file` (only if 1 input file) \n");
-    DISPLAY( " -f     : overwrite output without prompting \n");
+    DISPLAY( " -f     : overwrite output without prompting and (de)compress links \n");
     DISPLAY( "--rm    : remove source file(s) after successful de/compression \n");
     DISPLAY( " -k     : preserve source file(s) (default) \n");
     DISPLAY( " -h/-H  : display help/long help and exit\n");
@@ -113,19 +114,20 @@ static int usage_advanced(const char* programName)
     DISPLAY( "\n");
     DISPLAY( "Advanced arguments :\n");
     DISPLAY( " -V     : display Version number and exit\n");
-    DISPLAY( " -v     : verbose mode; specify multiple times to increase log level (default:%d)\n", DEFAULT_DISPLAY_LEVEL);
+    DISPLAY( " -v     : verbose mode; specify multiple times to increase verbosity\n");
     DISPLAY( " -q     : suppress warnings; specify twice to suppress errors too\n");
     DISPLAY( " -c     : force write to standard output, even if it is the console\n");
-#ifdef UTIL_HAS_CREATEFILELIST
-    DISPLAY( " -r     : operate recursively on directories \n");
-#endif
 #ifndef ZSTD_NOCOMPRESS
     DISPLAY( "--ultra : enable levels beyond %i, up to %i (requires more memory)\n", ZSTDCLI_CLEVEL_MAX, ZSTD_maxCLevel());
-    DISPLAY( "--no-dictID : don't write dictID into header (dictionary compression)\n");
-    DISPLAY( "--[no-]check : integrity check (default:enabled) \n");
 #ifdef ZSTD_MULTITHREAD
     DISPLAY( " -T#    : use # threads for compression (default:1) \n");
-    DISPLAY( " -B#    : select size of independent sections (default:0==automatic) \n");
+    DISPLAY( " -B#    : select size of each job (default:0==automatic) \n");
+#endif
+    DISPLAY( "--no-dictID : don't write dictID into header (dictionary compression)\n");
+    DISPLAY( "--[no-]check : integrity check (default:enabled) \n");
+#endif
+#ifdef UTIL_HAS_CREATEFILELIST
+    DISPLAY( " -r     : operate recursively on directories \n");
 #endif
 #ifdef ZSTD_GZCOMPRESS
     DISPLAY( "--format=gzip : compress files to the .gz format \n");
@@ -134,10 +136,16 @@ static int usage_advanced(const char* programName)
     DISPLAY( "--format=xz : compress files to the .xz format \n");
     DISPLAY( "--format=lzma : compress files to the .lzma format \n");
 #endif
+#ifdef ZSTD_LZ4COMPRESS
+    DISPLAY( "--format=lz4 : compress files to the .lz4 format \n");
 #endif
 #ifndef ZSTD_NODECOMPRESS
     DISPLAY( "--test  : test compressed file integrity \n");
+#if ZSTD_SPARSE_DEFAULT
     DISPLAY( "--[no-]sparse : sparse mode (default:enabled on file, disabled on stdout)\n");
+#else
+    DISPLAY( "--[no-]sparse : sparse mode (default:disabled)\n");
+#endif
 #endif
     DISPLAY( " -M#    : Set a memory usage limit for decompression \n");
     DISPLAY( "--      : All arguments after \"--\" are treated as files \n");
@@ -145,12 +153,11 @@ static int usage_advanced(const char* programName)
     DISPLAY( "\n");
     DISPLAY( "Dictionary builder :\n");
     DISPLAY( "--train ## : create a dictionary from a training set of files \n");
-    DISPLAY( "--cover=k=#,d=# : use the cover algorithm with parameters k and d \n");
-    DISPLAY( "--optimize-cover[=steps=#,k=#,d=#] : optimize cover parameters with optional parameters\n");
+    DISPLAY( "--train-cover[=k=#,d=#,steps=#] : use the cover algorithm with optional args\n");
+    DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel);
     DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
-    DISPLAY( "--maxdict ## : limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
-    DISPLAY( " -s#    : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel);
-    DISPLAY( "--dictID ## : force dictionary ID to specified value (default: random)\n");
+    DISPLAY( "--maxdict=# : limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
+    DISPLAY( "--dictID=# : force dictionary ID to specified value (default: random)\n");
 #endif
 #ifndef ZSTD_NOBENCH
     DISPLAY( "\n");
@@ -167,7 +174,7 @@ static int usage_advanced(const char* programName)
 static int badusage(const char* programName)
 {
     DISPLAYLEVEL(1, "Incorrect parameters\n");
-    if (displayLevel >= 1) usage(programName);
+    if (g_displayLevel >= 2) usage(programName);
     return 1;
 }
 
@@ -179,6 +186,23 @@ static void waitEnter(void)
     (void)unused;
 }
 
+static const char* lastNameFromPath(const char* path)
+{
+    const char* name = path;
+    if (strrchr(name, '/')) name = strrchr(name, '/') + 1;
+    if (strrchr(name, '\\')) name = strrchr(name, '\\') + 1; /* windows */
+    return name;
+}
+
+/*! exeNameMatch() :
+    @return : a non-zero value if exeName matches test, excluding the extension
+   */
+static int exeNameMatch(const char* exeName, const char* test)
+{
+    return !strncmp(exeName, test, strlen(test)) &&
+        (exeName[strlen(test)] == '\0' || exeName[strlen(test)] == '.');
+}
+
 /*! readU32FromChar() :
     @return : unsigned integer value read from input in `char` format
     allows and interprets K, KB, KiB, M, MB and MiB suffix.
@@ -216,11 +240,11 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
 #ifndef ZSTD_NODICT
 /**
  * parseCoverParameters() :
- * reads cover parameters from *stringPtr (e.g. "--cover=smoothing=100,kmin=48,kstep=4,kmax=64,d=8") into *params
+ * reads cover parameters from *stringPtr (e.g. "--train-cover=k=48,d=8,steps=32") into *params
  * @return 1 means that cover parameters were correct
  * @return 0 in case of malformed parameters
  */
-static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t *params)
+static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t* params)
 {
     memset(params, 0, sizeof(*params));
     for (; ;) {
@@ -230,9 +254,33 @@ static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t *para
         return 0;
     }
     if (stringPtr[0] != 0) return 0;
-    DISPLAYLEVEL(4, "k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps);
+    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps);
+    return 1;
+}
+
+/**
+ * parseLegacyParameters() :
+ * reads legacy dictioanry builter parameters from *stringPtr (e.g. "--train-legacy=selectivity=8") into *selectivity
+ * @return 1 means that legacy dictionary builder parameters were correct
+ * @return 0 in case of malformed parameters
+ */
+static unsigned parseLegacyParameters(const char* stringPtr, unsigned* selectivity)
+{
+    if (!longCommandWArg(&stringPtr, "s=") && !longCommandWArg(&stringPtr, "selectivity=")) { return 0; }
+    *selectivity = readU32FromChar(&stringPtr);
+    if (stringPtr[0] != 0) return 0;
+    DISPLAYLEVEL(4, "legacy: selectivity=%u\n", *selectivity);
     return 1;
 }
+
+static COVER_params_t defaultCoverParams(void)
+{
+    COVER_params_t params;
+    memset(&params, 0, sizeof(params));
+    params.d = 8;
+    params.steps = 4;
+    return params;
+}
 #endif
 
 
@@ -270,6 +318,7 @@ int main(int argCount, const char* argv[])
 {
     int argNb,
         forceStdout=0,
+        followLinks=0,
         main_pause=0,
         nextEntryIsDictionary=0,
         operationResult=0,
@@ -305,8 +354,8 @@ int main(int argCount, const char* argv[])
     unsigned fileNamesNb;
 #endif
 #ifndef ZSTD_NODICT
-    COVER_params_t coverParams;
-    int cover = 0;
+    COVER_params_t coverParams = defaultCoverParams();
+    int cover = 1;
 #endif
 
     /* init */
@@ -316,21 +365,19 @@ int main(int argCount, const char* argv[])
     (void)memLimit;   /* not used when ZSTD_NODECOMPRESS set */
     if (filenameTable==NULL) { DISPLAY("zstd: %s \n", strerror(errno)); exit(1); }
     filenameTable[0] = stdinmark;
-    displayOut = stderr;
-    /* Pick out program name from path. Don't rely on stdlib because of conflicting behavior */
-    {   size_t pos;
-        for (pos = (int)strlen(programName); pos > 0; pos--) { if (programName[pos] == '/') { pos++; break; } }
-        programName += pos;
-    }
+    g_displayOut = stderr;
+
+    programName = lastNameFromPath(programName);
 
     /* preset behaviors */
-    if (!strcmp(programName, ZSTD_UNZSTD)) operation=zom_decompress;
-    if (!strcmp(programName, ZSTD_CAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; displayLevel=1; }
-    if (!strcmp(programName, ZSTD_GZ)) { suffix = GZ_EXTENSION; FIO_setCompressionType(FIO_gzipCompression); FIO_setRemoveSrcFile(1); }    /* behave like gzip */
-    if (!strcmp(programName, ZSTD_GUNZIP)) { operation=zom_decompress; FIO_setRemoveSrcFile(1); }                                          /* behave like gunzip */
-    if (!strcmp(programName, ZSTD_GZCAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; displayLevel=1; }  /* behave like gzcat */
-    if (!strcmp(programName, ZSTD_LZMA)) { suffix = LZMA_EXTENSION; FIO_setCompressionType(FIO_lzmaCompression); FIO_setRemoveSrcFile(1); }    /* behave like lzma */
-    if (!strcmp(programName, ZSTD_XZ)) { suffix = XZ_EXTENSION; FIO_setCompressionType(FIO_xzCompression); FIO_setRemoveSrcFile(1); }    /* behave like xz */
+    if (exeNameMatch(programName, ZSTD_ZSTDMT)) nbThreads=0;
+    if (exeNameMatch(programName, ZSTD_UNZSTD)) operation=zom_decompress;
+    if (exeNameMatch(programName, ZSTD_CAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; g_displayLevel=1; }
+    if (exeNameMatch(programName, ZSTD_GZ)) { suffix = GZ_EXTENSION; FIO_setCompressionType(FIO_gzipCompression); FIO_setRemoveSrcFile(1); }    /* behave like gzip */
+    if (exeNameMatch(programName, ZSTD_GUNZIP)) { operation=zom_decompress; FIO_setRemoveSrcFile(1); }                                          /* behave like gunzip */
+    if (exeNameMatch(programName, ZSTD_GZCAT)) { operation=zom_decompress; forceStdout=1; FIO_overwriteMode(); outFileName=stdoutmark; g_displayLevel=1; }  /* behave like gzcat */
+    if (exeNameMatch(programName, ZSTD_LZMA)) { suffix = LZMA_EXTENSION; FIO_setCompressionType(FIO_lzmaCompression); FIO_setRemoveSrcFile(1); }    /* behave like lzma */
+    if (exeNameMatch(programName, ZSTD_XZ)) { suffix = XZ_EXTENSION; FIO_setCompressionType(FIO_xzCompression); FIO_setRemoveSrcFile(1); }    /* behave like xz */
     memset(&compressionParams, 0, sizeof(compressionParams));
 
     /* command switches */
@@ -344,7 +391,7 @@ int main(int argCount, const char* argv[])
                 if (!filenameIdx) {
                     filenameIdx=1, filenameTable[0]=stdinmark;
                     outFileName=stdoutmark;
-                    displayLevel-=(displayLevel==2);
+                    g_displayLevel-=(g_displayLevel==2);
                     continue;
             }   }
 
@@ -357,12 +404,12 @@ int main(int argCount, const char* argv[])
                     if (!strcmp(argument, "--compress")) { operation=zom_compress; continue; }
                     if (!strcmp(argument, "--decompress")) { operation=zom_decompress; continue; }
                     if (!strcmp(argument, "--uncompress")) { operation=zom_decompress; continue; }
-                    if (!strcmp(argument, "--force")) { FIO_overwriteMode(); continue; }
-                    if (!strcmp(argument, "--version")) { displayOut=stdout; DISPLAY(WELCOME_MESSAGE); CLEAN_RETURN(0); }
-                    if (!strcmp(argument, "--help")) { displayOut=stdout; CLEAN_RETURN(usage_advanced(programName)); }
-                    if (!strcmp(argument, "--verbose")) { displayLevel++; continue; }
-                    if (!strcmp(argument, "--quiet")) { displayLevel--; continue; }
-                    if (!strcmp(argument, "--stdout")) { forceStdout=1; outFileName=stdoutmark; displayLevel-=(displayLevel==2); continue; }
+                    if (!strcmp(argument, "--force")) { FIO_overwriteMode(); forceStdout=1; followLinks=1; continue; }
+                    if (!strcmp(argument, "--version")) { g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); CLEAN_RETURN(0); }
+                    if (!strcmp(argument, "--help")) { g_displayOut=stdout; CLEAN_RETURN(usage_advanced(programName)); }
+                    if (!strcmp(argument, "--verbose")) { g_displayLevel++; continue; }
+                    if (!strcmp(argument, "--quiet")) { g_displayLevel--; continue; }
+                    if (!strcmp(argument, "--stdout")) { forceStdout=1; outFileName=stdoutmark; g_displayLevel-=(g_displayLevel==2); continue; }
                     if (!strcmp(argument, "--ultra")) { ultra=1; continue; }
                     if (!strcmp(argument, "--check")) { FIO_setChecksumFlag(2); continue; }
                     if (!strcmp(argument, "--no-check")) { FIO_setChecksumFlag(0); continue; }
@@ -370,8 +417,8 @@ int main(int argCount, const char* argv[])
                     if (!strcmp(argument, "--no-sparse")) { FIO_setSparseWrite(0); continue; }
                     if (!strcmp(argument, "--test")) { operation=zom_test; continue; }
                     if (!strcmp(argument, "--train")) { operation=zom_train; outFileName=g_defaultDictName; continue; }
-                    if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; lastCommand=1; continue; }
-                    if (!strcmp(argument, "--dictID")) { nextArgumentIsDictID=1; lastCommand=1; continue; }
+                    if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; lastCommand=1; continue; }  /* kept available for compatibility with old syntax ; will be removed one day */
+                    if (!strcmp(argument, "--dictID")) { nextArgumentIsDictID=1; lastCommand=1; continue; }  /* kept available for compatibility with old syntax ; will be removed one day */
                     if (!strcmp(argument, "--no-dictID")) { FIO_setDictIDFlag(0); continue; }
                     if (!strcmp(argument, "--keep")) { FIO_setRemoveSrcFile(0); continue; }
                     if (!strcmp(argument, "--rm")) { FIO_setRemoveSrcFile(1); continue; }
@@ -383,26 +430,40 @@ int main(int argCount, const char* argv[])
                     if (!strcmp(argument, "--format=lzma")) { suffix = LZMA_EXTENSION; FIO_setCompressionType(FIO_lzmaCompression);  continue; }
                     if (!strcmp(argument, "--format=xz")) { suffix = XZ_EXTENSION; FIO_setCompressionType(FIO_xzCompression);  continue; }
 #endif
+#ifdef ZSTD_LZ4COMPRESS
+                    if (!strcmp(argument, "--format=lz4")) { suffix = LZ4_EXTENSION; FIO_setCompressionType(FIO_lz4Compression);  continue; }
+#endif
 
                     /* long commands with arguments */
 #ifndef ZSTD_NODICT
-                    if (longCommandWArg(&argument, "--cover=")) {
-                      cover=1; if (!parseCoverParameters(argument, &coverParams)) CLEAN_RETURN(badusage(programName));
-                      continue;
-                    }
-                    if (longCommandWArg(&argument, "--optimize-cover")) {
-                      cover=2;
+                    if (longCommandWArg(&argument, "--train-cover")) {
+                      operation = zom_train;
+                      outFileName = g_defaultDictName;
+                      cover = 1;
                       /* Allow optional arguments following an = */
                       if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); }
                       else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
                       else if (!parseCoverParameters(argument, &coverParams)) { CLEAN_RETURN(badusage(programName)); }
                       continue;
                     }
+                    if (longCommandWArg(&argument, "--train-legacy")) {
+                      operation = zom_train;
+                      outFileName = g_defaultDictName;
+                      cover = 0;
+                      /* Allow optional arguments following an = */
+                      if (*argument == 0) { continue; }
+                      else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
+                      else if (!parseLegacyParameters(argument, &dictSelect)) { CLEAN_RETURN(badusage(programName)); }
+                      continue;
+                    }
 #endif
+                    if (longCommandWArg(&argument, "--threads=")) { nbThreads = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--memlimit=")) { memLimit = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--memory=")) { memLimit = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--memlimit-decompress=")) { memLimit = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--block-size=")) { blockSize = readU32FromChar(&argument); continue; }
+                    if (longCommandWArg(&argument, "--maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
+                    if (longCommandWArg(&argument, "--dictID=")) { dictID = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--zstd=")) { if (!parseCompressionParameters(argument, &compressionParams)) CLEAN_RETURN(badusage(programName)); continue; }
                     /* fall-through, will trigger bad_usage() later on */
                 }
@@ -424,9 +485,9 @@ int main(int argCount, const char* argv[])
                     switch(argument[0])
                     {
                         /* Display help */
-                    case 'V': displayOut=stdout; DISPLAY(WELCOME_MESSAGE); CLEAN_RETURN(0);   /* Version Only */
+                    case 'V': g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); CLEAN_RETURN(0);   /* Version Only */
                     case 'H':
-                    case 'h': displayOut=stdout; CLEAN_RETURN(usage_advanced(programName));
+                    case 'h': g_displayOut=stdout; CLEAN_RETURN(usage_advanced(programName));
 
                          /* Compress */
                     case 'z': operation=zom_compress; argument++; break;
@@ -445,19 +506,19 @@ int main(int argCount, const char* argv[])
                     case 'D': nextEntryIsDictionary = 1; lastCommand = 1; argument++; break;
 
                         /* Overwrite */
-                    case 'f': FIO_overwriteMode(); forceStdout=1; argument++; break;
+                    case 'f': FIO_overwriteMode(); forceStdout=1; followLinks=1; argument++; break;
 
                         /* Verbose mode */
-                    case 'v': displayLevel++; argument++; break;
+                    case 'v': g_displayLevel++; argument++; break;
 
                         /* Quiet mode */
-                    case 'q': displayLevel--; argument++; break;
+                    case 'q': g_displayLevel--; argument++; break;
 
-                        /* keep source file (default); for gzip/xz compatibility */
+                        /* keep source file (default) */
                     case 'k': FIO_setRemoveSrcFile(0); argument++; break;
 
                         /* Checksum */
-                    case 'C': argument++; FIO_setChecksumFlag(2); break;
+                    case 'C': FIO_setChecksumFlag(2); argument++; break;
 
                         /* test compressed file */
                     case 't': operation=zom_test; argument++; break;
@@ -532,14 +593,14 @@ int main(int argCount, const char* argv[])
                 continue;
             }   /* if (argument[0]=='-') */
 
-            if (nextArgumentIsMaxDict) {
+            if (nextArgumentIsMaxDict) {  /* kept available for compatibility with old syntax ; will be removed one day */
                 nextArgumentIsMaxDict = 0;
                 lastCommand = 0;
                 maxDictSize = readU32FromChar(&argument);
                 continue;
             }
 
-            if (nextArgumentIsDictID) {
+            if (nextArgumentIsDictID) {  /* kept available for compatibility with old syntax ; will be removed one day */
                 nextArgumentIsDictID = 0;
                 lastCommand = 0;
                 dictID = readU32FromChar(&argument);
@@ -581,9 +642,27 @@ int main(int argCount, const char* argv[])
     DISPLAYLEVEL(4, "PLATFORM_POSIX_VERSION defined: %ldL\n", (long) PLATFORM_POSIX_VERSION);
 #endif
 
+    if (nbThreads == 0) {
+        /* try to guess */
+        nbThreads = UTIL_countPhysicalCores();
+        DISPLAYLEVEL(3, "Note: %d physical core(s) detected\n", nbThreads);
+    }
+
+    g_utilDisplayLevel = g_displayLevel;
+    if (!followLinks) {
+        unsigned u;
+        for (u=0, fileNamesNb=0; u<filenameIdx; u++) {
+            if (UTIL_isLink(filenameTable[u])) {
+                DISPLAYLEVEL(2, "Warning : %s is a symbolic link, ignoring\n", filenameTable[u]);
+            } else {
+                filenameTable[fileNamesNb++] = filenameTable[u];
+            }
+        }
+        filenameIdx = fileNamesNb;
+    }
 #ifdef UTIL_HAS_CREATEFILELIST
     if (recursive) {  /* at this stage, filenameTable is a list of paths, which can contain both files and directories */
-        extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb);
+        extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks);
         if (extendedFileList) {
             unsigned u;
             for (u=0; u<fileNamesNb; u++) DISPLAYLEVEL(4, "%u %s\n", u, extendedFileList[u]);
@@ -597,7 +676,7 @@ int main(int argCount, const char* argv[])
     /* Check if benchmark is selected */
     if (operation==zom_bench) {
 #ifndef ZSTD_NOBENCH
-        BMK_setNotificationLevel(displayLevel);
+        BMK_setNotificationLevel(g_displayLevel);
         BMK_setBlockSize(blockSize);
         BMK_setNbThreads(nbThreads);
         BMK_setNbSeconds(bench_nbSeconds);
@@ -611,19 +690,20 @@ int main(int argCount, const char* argv[])
     if (operation==zom_train) {
 #ifndef ZSTD_NODICT
         if (cover) {
+            int const optimize = !coverParams.k || !coverParams.d;
             coverParams.nbThreads = nbThreads;
             coverParams.compressionLevel = dictCLevel;
-            coverParams.notificationLevel = displayLevel;
+            coverParams.notificationLevel = g_displayLevel;
             coverParams.dictID = dictID;
-            DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, optimize);
         } else {
             ZDICT_params_t dictParams;
             memset(&dictParams, 0, sizeof(dictParams));
             dictParams.compressionLevel = dictCLevel;
             dictParams.selectivityLevel = dictSelect;
-            dictParams.notificationLevel = displayLevel;
+            dictParams.notificationLevel = g_displayLevel;
             dictParams.dictID = dictID;
-            DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
         }
 #endif
         goto _end;
@@ -635,7 +715,7 @@ int main(int argCount, const char* argv[])
 
     /* Check if input/output defined as console; trigger an error in this case */
     if (!strcmp(filenameTable[0], stdinmark) && IS_CONSOLE(stdin) ) CLEAN_RETURN(badusage(programName));
-    if (outFileName && !strcmp(outFileName, stdoutmark) && IS_CONSOLE(stdout) && strcmp(filenameTable[0], stdinmark) && !(forceStdout && (operation==zom_decompress)))
+    if (outFileName && !strcmp(outFileName, stdoutmark) && IS_CONSOLE(stdout) && !strcmp(filenameTable[0], stdinmark) && !forceStdout && operation!=zom_decompress)
         CLEAN_RETURN(badusage(programName));
 
     /* user-selected output filename, only possible with a single file */
@@ -654,11 +734,11 @@ int main(int argCount, const char* argv[])
 #endif
 
     /* No status message in pipe mode (stdin - stdout) or multi-files mode */
-    if (!strcmp(filenameTable[0], stdinmark) && outFileName && !strcmp(outFileName,stdoutmark) && (displayLevel==2)) displayLevel=1;
-    if ((filenameIdx>1) & (displayLevel==2)) displayLevel=1;
+    if (!strcmp(filenameTable[0], stdinmark) && outFileName && !strcmp(outFileName,stdoutmark) && (g_displayLevel==2)) g_displayLevel=1;
+    if ((filenameIdx>1) & (g_displayLevel==2)) g_displayLevel=1;
 
     /* IO Stream/File */
-    FIO_setNotificationLevel(displayLevel);
+    FIO_setNotificationLevel(g_displayLevel);
     if (operation==zom_compress) {
 #ifndef ZSTD_NOCOMPRESS
         FIO_setNbThreads(nbThreads);
diff --git a/tests/Makefile b/tests/Makefile
index 9382fe80ddc2..ea58c0fe5119 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -26,9 +26,12 @@ PYTHON ?= python3
 TESTARTEFACT := versionsTest namespaceTest
 
 
-CPPFLAGS+= -I$(ZSTDDIR) -I$(ZSTDDIR)/common -I$(ZSTDDIR)/compress -I$(ZSTDDIR)/dictBuilder -I$(ZSTDDIR)/deprecated -I$(PRGDIR)
+DEBUGFLAGS=-g -DZSTD_DEBUG=1
+CPPFLAGS+= -I$(ZSTDDIR) -I$(ZSTDDIR)/common -I$(ZSTDDIR)/compress \
+           -I$(ZSTDDIR)/dictBuilder -I$(ZSTDDIR)/deprecated -I$(PRGDIR) \
+           $(DEBUGFLAG)
 CFLAGS  ?= -O3
-CFLAGS  += -g -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
+CFLAGS  += -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
            -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
            -Wstrict-prototypes -Wundef -Wformat-security
 CFLAGS  += $(MOREFLAGS)
@@ -65,14 +68,16 @@ FUZZERTEST ?= -T5mn
 ZSTDRTTEST = --test-large-data
 DECODECORPUS_TESTTIME ?= -T30
 
-.PHONY: default all all32 dll clean test test32 test-all namespaceTest versionsTest
+.PHONY: default all all32 allnothread dll clean test test32 test-all namespaceTest versionsTest
 
 default: fullbench
 
-all: fullbench fuzzer zstreamtest paramgrill datagen zbufftest
+all: fullbench fuzzer zstreamtest paramgrill datagen zbufftest decodecorpus
 
 all32: fullbench32 fuzzer32 zstreamtest32 zbufftest32
 
+allnothread: fullbench fuzzer paramgrill datagen zbufftest decodecorpus
+
 dll: fuzzer-dll zstreamtest-dll zbufftest-dll
 
 zstd:
@@ -152,6 +157,7 @@ zstreamtest-dll : $(ZSTDDIR)/common/xxhash.c $(PRGDIR)/datagen.c zstreamtest.c
 	$(MAKE) -C $(ZSTDDIR) libzstd
 	$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@$(EXT)
 
+paramgrill : DEBUGFLAG =
 paramgrill : $(ZSTD_FILES) $(PRGDIR)/datagen.c paramgrill.c
 	$(CC)      $(FLAGS) $^ -lm -o $@$(EXT)
 
@@ -300,10 +306,10 @@ test-fullbench32: fullbench32 datagen
 	$(QEMU_SYS) ./fullbench32 -i1 -P0
 
 test-fuzzer: fuzzer
-	$(QEMU_SYS) ./fuzzer $(FUZZERTEST)
+	$(QEMU_SYS) ./fuzzer $(FUZZERTEST) $(FUZZER_FLAGS)
 
 test-fuzzer32: fuzzer32
-	$(QEMU_SYS) ./fuzzer32 $(FUZZERTEST)
+	$(QEMU_SYS) ./fuzzer32 $(FUZZERTEST) $(FUZZER_FLAGS)
 
 test-zbuff: zbufftest
 	$(QEMU_SYS) ./zbufftest $(ZSTREAM_TESTTIME)
@@ -312,10 +318,10 @@ test-zbuff32: zbufftest32
 	$(QEMU_SYS) ./zbufftest32 $(ZSTREAM_TESTTIME)
 
 test-zstream: zstreamtest
-	$(QEMU_SYS) ./zstreamtest $(ZSTREAM_TESTTIME)
+	$(QEMU_SYS) ./zstreamtest $(ZSTREAM_TESTTIME) $(FUZZER_FLAGS)
 
 test-zstream32: zstreamtest32
-	$(QEMU_SYS) ./zstreamtest32 $(ZSTREAM_TESTTIME)
+	$(QEMU_SYS) ./zstreamtest32 $(ZSTREAM_TESTTIME) $(FUZZER_FLAGS)
 
 test-longmatch: longmatch
 	$(QEMU_SYS) ./longmatch
diff --git a/tests/decodecorpus.c b/tests/decodecorpus.c
index 183d20f749e6..f7b3c854fdb2 100644
--- a/tests/decodecorpus.c
+++ b/tests/decodecorpus.c
@@ -98,7 +98,7 @@ static void RAND_bufferMaxSymb(U32* seed, void* ptr, size_t size, int maxSymb)
     BYTE* op = ptr;
 
     for (i = 0; i < size; i++) {
-        op[i] = RAND(seed) % (maxSymb + 1);
+        op[i] = (BYTE) (RAND(seed) % (maxSymb + 1));
     }
 }
 
@@ -134,8 +134,8 @@ static void RAND_genDist(U32* seed, BYTE* dist, double weight)
 {
     size_t i = 0;
     size_t statesLeft = DISTSIZE;
-    BYTE symb = RAND(seed) % 256;
-    BYTE step = (RAND(seed) % 256) | 1; /* force it to be odd so it's relatively prime to 256 */
+    BYTE symb = (BYTE) (RAND(seed) % 256);
+    BYTE step = (BYTE) ((RAND(seed) % 256) | 1); /* force it to be odd so it's relatively prime to 256 */
 
     while (i < DISTSIZE) {
         size_t states = ((size_t)(weight * statesLeft)) + 1;
@@ -259,7 +259,7 @@ static void writeFrameHeader(U32* seed, frame_t* frame)
         /* Follow window algorithm from specification */
         int const exponent = RAND(seed) % (MAX_WINDOW_LOG - 10);
         int const mantissa = RAND(seed) % 8;
-        windowByte = (exponent << 3) | mantissa;
+        windowByte = (BYTE) ((exponent << 3) | mantissa);
         fh.windowSize = (1U << (exponent + 10));
         fh.windowSize += fh.windowSize / 8 * mantissa;
     }
@@ -284,7 +284,7 @@ static void writeFrameHeader(U32* seed, frame_t* frame)
 
         if (contentSizeFlag && (fh.contentSize == 0 || !(RAND(seed) & 7))) {
             /* do single segment sometimes */
-            fh.windowSize = fh.contentSize;
+            fh.windowSize = (U32) fh.contentSize;
             singleSegment = 1;
         }
     }
@@ -307,7 +307,7 @@ static void writeFrameHeader(U32* seed, frame_t* frame)
 
     {
         BYTE const frameHeaderDescriptor =
-                (fcsCode << 6) | (singleSegment << 5) | (1 << 2);
+                (BYTE) ((fcsCode << 6) | (singleSegment << 5) | (1 << 2));
         op[pos++] = frameHeaderDescriptor;
     }
 
@@ -318,14 +318,14 @@ static void writeFrameHeader(U32* seed, frame_t* frame)
     if (contentSizeFlag) {
         switch (fcsCode) {
         default: /* Impossible */
-        case 0: op[pos++] = fh.contentSize; break;
-        case 1: MEM_writeLE16(op + pos, fh.contentSize - 256); pos += 2; break;
-        case 2: MEM_writeLE32(op + pos, fh.contentSize); pos += 4; break;
-        case 3: MEM_writeLE64(op + pos, fh.contentSize); pos += 8; break;
+        case 0: op[pos++] = (BYTE) fh.contentSize; break;
+        case 1: MEM_writeLE16(op + pos, (U16) (fh.contentSize - 256)); pos += 2; break;
+        case 2: MEM_writeLE32(op + pos, (U32) fh.contentSize); pos += 4; break;
+        case 3: MEM_writeLE64(op + pos, (U64) fh.contentSize); pos += 8; break;
         }
     }
 
-    DISPLAYLEVEL(2, " frame content size:\t%zu\n", fh.contentSize);
+    DISPLAYLEVEL(2, " frame content size:\t%u\n", (U32)fh.contentSize);
     DISPLAYLEVEL(2, " frame window size:\t%u\n", fh.windowSize);
     DISPLAYLEVEL(2, " content size flag:\t%d\n", contentSizeFlag);
     DISPLAYLEVEL(2, " single segment flag:\t%d\n", singleSegment);
@@ -385,7 +385,7 @@ static size_t writeLiteralsBlockSimple(U32* seed, frame_t* frame, size_t content
         op += litSize;
     } else {
         /* RLE literals */
-        BYTE const symb = RAND(seed) % 256;
+        BYTE const symb = (BYTE) (RAND(seed) % 256);
 
         DISPLAYLEVEL(4, "   rle literals: 0x%02x\n", (U32)symb);
 
@@ -542,8 +542,8 @@ static size_t writeLiteralsBlockCompressed(U32* seed, frame_t* frame, size_t con
         op += compressedSize;
 
         compressedSize += hufHeaderSize;
-        DISPLAYLEVEL(5, "    regenerated size: %zu\n", litSize);
-        DISPLAYLEVEL(5, "    compressed size: %zu\n", compressedSize);
+        DISPLAYLEVEL(5, "    regenerated size: %u\n", (U32)litSize);
+        DISPLAYLEVEL(5, "    compressed size: %u\n", (U32)compressedSize);
         if (compressedSize >= litSize) {
             DISPLAYLEVEL(5, "     trying again\n");
             /* if we have to try again, reset the stats so we don't accidentally
@@ -611,7 +611,7 @@ static U32 generateSequences(U32* seed, frame_t* frame, seqStore_t* seqStore,
     size_t const remainingMatch = contentSize - literalsSize;
     size_t excessMatch = 0;
     U32 numSequences = 0;
-  
+
     U32 i;
 
 
@@ -628,7 +628,7 @@ static U32 generateSequences(U32* seed, frame_t* frame, seqStore_t* seqStore,
         excessMatch = remainingMatch - numSequences * MIN_SEQ_LEN;
     }
 
-    DISPLAYLEVEL(5, "    total match lengths: %zu\n", remainingMatch);
+    DISPLAYLEVEL(5, "    total match lengths: %u\n", (U32)remainingMatch);
 
     for (i = 0; i < numSequences; i++) {
         /* Generate match and literal lengths by exponential distribution to
@@ -647,10 +647,10 @@ static U32 generateSequences(U32* seed, frame_t* frame, seqStore_t* seqStore,
         U32 offset, offsetCode, repIndex;
 
         /* bounds checks */
-        matchLen = MIN(matchLen, excessMatch + MIN_SEQ_LEN);
-        literalLen = MIN(literalLen, literalsSize);
+        matchLen = (U32) MIN(matchLen, excessMatch + MIN_SEQ_LEN);
+        literalLen = MIN(literalLen, (U32) literalsSize);
         if (i == 0 && srcPtr == frame->srcStart && literalLen == 0) literalLen = 1;
-        if (i + 1 == numSequences) matchLen = MIN_SEQ_LEN + excessMatch;
+        if (i + 1 == numSequences) matchLen = MIN_SEQ_LEN + (U32) excessMatch;
 
         memcpy(srcPtr, literals, literalLen);
         srcPtr += literalLen;
@@ -694,8 +694,8 @@ static U32 generateSequences(U32* seed, frame_t* frame, seqStore_t* seqStore,
         }
 
         DISPLAYLEVEL(6, "      LL: %5u OF: %5u ML: %5u", literalLen, offset, matchLen);
-        DISPLAYLEVEL(7, " srcPos: %8tu seqNb: %3u",
-                     (BYTE*)srcPtr - (BYTE*)frame->srcStart, i);
+        DISPLAYLEVEL(7, " srcPos: %8u seqNb: %3u",
+                     (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart), i);
         DISPLAYLEVEL(6, "\n");
         if (offsetCode < 3) {
             DISPLAYLEVEL(7, "        repeat offset: %d\n", repIndex);
@@ -711,8 +711,8 @@ static U32 generateSequences(U32* seed, frame_t* frame, seqStore_t* seqStore,
 
     memcpy(srcPtr, literals, literalsSize);
     srcPtr += literalsSize;
-    DISPLAYLEVEL(6, "      excess literals: %5zu", literalsSize);
-    DISPLAYLEVEL(7, " srcPos: %8tu", (BYTE*)srcPtr - (BYTE*)frame->srcStart);
+    DISPLAYLEVEL(6, "      excess literals: %5u", (U32)literalsSize);
+    DISPLAYLEVEL(7, " srcPos: %8u", (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart));
     DISPLAYLEVEL(6, "\n");
 
     return numSequences;
@@ -957,11 +957,11 @@ static size_t writeCompressedBlock(U32* seed, frame_t* frame, size_t contentSize
 
     literalsSize = writeLiteralsBlock(seed, frame, contentSize);
 
-    DISPLAYLEVEL(4, "   literals size: %zu\n", literalsSize);
+    DISPLAYLEVEL(4, "   literals size: %u\n", (U32)literalsSize);
 
     nbSeq = writeSequencesBlock(seed, frame, contentSize, literalsSize);
 
-    DISPLAYLEVEL(4, "   number of sequences: %zu\n", nbSeq);
+    DISPLAYLEVEL(4, "   number of sequences: %u\n", (U32)nbSeq);
 
     return (BYTE*)frame->data - blockStart;
 }
@@ -977,7 +977,7 @@ static void writeBlock(U32* seed, frame_t* frame, size_t contentSize,
     BYTE *op = header + 3;
 
     DISPLAYLEVEL(3, " block:\n");
-    DISPLAYLEVEL(3, "  block content size: %zu\n", contentSize);
+    DISPLAYLEVEL(3, "  block content size: %u\n", (U32)contentSize);
     DISPLAYLEVEL(3, "  last block: %s\n", lastBlock ? "yes" : "no");
 
     if (blockTypeDesc == 0) {
@@ -1025,10 +1025,10 @@ static void writeBlock(U32* seed, frame_t* frame, size_t contentSize,
     frame->src = (BYTE*)frame->src + contentSize;
 
     DISPLAYLEVEL(3, "  block type: %s\n", BLOCK_TYPES[blockType]);
-    DISPLAYLEVEL(3, "  block size field: %zu\n", blockSize);
+    DISPLAYLEVEL(3, "  block size field: %u\n", (U32)blockSize);
 
-    header[0] = (lastBlock | (blockType << 1) | (blockSize << 3)) & 0xff;
-    MEM_writeLE16(header + 1, blockSize >> 5);
+    header[0] = (BYTE) ((lastBlock | (blockType << 1) | (blockSize << 3)) & 0xff);
+    MEM_writeLE16(header + 1, (U16) (blockSize >> 5));
 
     frame->data = op;
 }
@@ -1300,7 +1300,7 @@ static int generateCorpus(U32 seed, unsigned numFiles, const char* const path,
 *********************************************************/
 static U32 makeSeed(void)
 {
-    U32 t = time(NULL);
+    U32 t = (U32) time(NULL);
     return XXH32(&t, sizeof(t), 0) % 65536;
 }
 
diff --git a/tests/fullbench.c b/tests/fullbench.c
index 940d315a7cd4..38cf22fa7ebc 100644
--- a/tests/fullbench.c
+++ b/tests/fullbench.c
@@ -251,14 +251,14 @@ static size_t benchMem(const void* src, size_t srcSize, U32 benchNb)
     case 13:
         benchFunction = local_ZSTD_decompressContinue; benchName = "ZSTD_decompressContinue";
         break;
-	case 31:
+    case 31:
         benchFunction = local_ZSTD_decodeLiteralsBlock; benchName = "ZSTD_decodeLiteralsBlock";
         break;
     case 32:
         benchFunction = local_ZSTD_decodeSeqHeaders; benchName = "ZSTD_decodeSeqHeaders";
         break;
 #endif
-	case 41:
+    case 41:
         benchFunction = local_ZSTD_compressStream; benchName = "ZSTD_compressStream";
         break;
     case 42:
diff --git a/tests/fuzzer.c b/tests/fuzzer.c
index def7542b5f0b..a9dcf12e0702 100644
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -28,6 +28,7 @@
 #define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_compressContinue, ZSTD_compressBlock */
 #include "zstd.h"         /* ZSTD_VERSION_STRING */
 #include "zstd_errors.h"  /* ZSTD_getErrorCode */
+#include "zstdmt_compress.h"
 #define ZDICT_STATIC_LINKING_ONLY
 #include "zdict.h"        /* ZDICT_trainFromBuffer */
 #include "datagen.h"      /* RDG_genBuffer */
@@ -57,7 +58,7 @@ static U32 g_displayLevel = 2;
 #define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
             if ((FUZ_clockSpan(g_displayClock) > g_refreshRate) || (g_displayLevel>=4)) \
             { g_displayClock = clock(); DISPLAY(__VA_ARGS__); \
-            if (g_displayLevel>=4) fflush(stdout); } }
+            if (g_displayLevel>=4) fflush(stderr); } }
 static const clock_t g_refreshRate = CLOCKS_PER_SEC / 6;
 static clock_t g_displayClock = 0;
 
@@ -133,13 +134,21 @@ static int basicUnitTests(U32 seed, double compressibility)
         DISPLAYLEVEL(4, "OK : %s \n", errorString);
     }
 
+
     DISPLAYLEVEL(4, "test%3i : compress %u bytes : ", testNb++, (U32)CNBuffSize);
     CHECKPLUS(r, ZSTD_compress(compressedBuffer, ZSTD_compressBound(CNBuffSize),
                                CNBuffer, CNBuffSize, 1),
               cSize=r );
     DISPLAYLEVEL(4, "OK (%u bytes : %.2f%%)\n", (U32)cSize, (double)cSize/CNBuffSize*100);
 
-    DISPLAYLEVEL(4, "test%3i : decompressed size test : ", testNb++);
+
+    DISPLAYLEVEL(4, "test%3i : ZSTD_getFrameContentSize test : ", testNb++);
+    {   unsigned long long const rSize = ZSTD_getFrameContentSize(compressedBuffer, cSize);
+        if (rSize != CNBuffSize) goto _output_error;
+    }
+    DISPLAYLEVEL(4, "OK \n");
+
+    DISPLAYLEVEL(4, "test%3i : ZSTD_findDecompressedSize test : ", testNb++);
     {   unsigned long long const rSize = ZSTD_findDecompressedSize(compressedBuffer, cSize);
         if (rSize != CNBuffSize) goto _output_error;
     }
@@ -157,6 +166,7 @@ static int basicUnitTests(U32 seed, double compressibility)
     }   }
     DISPLAYLEVEL(4, "OK \n");
 
+
     DISPLAYLEVEL(4, "test%3i : decompress with null dict : ", testNb++);
     { size_t const r = ZSTD_decompress_usingDict(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize, NULL, 0);
       if (r != CNBuffSize) goto _output_error; }
@@ -179,6 +189,49 @@ static int basicUnitTests(U32 seed, double compressibility)
       if (ZSTD_getErrorCode(r) != ZSTD_error_srcSize_wrong) goto _output_error; }
     DISPLAYLEVEL(4, "OK \n");
 
+
+    /* ZSTDMT simple MT compression test */
+    DISPLAYLEVEL(4, "test%3i : create ZSTDMT CCtx : ", testNb++);
+    {   ZSTDMT_CCtx* mtctx = ZSTDMT_createCCtx(2);
+        if (mtctx==NULL) {
+            DISPLAY("mtctx : mot enough memory, aborting \n");
+            testResult = 1;
+            goto _end;
+        }
+        DISPLAYLEVEL(4, "OK \n");
+
+        DISPLAYLEVEL(4, "test%3i : compress %u bytes with 2 threads : ", testNb++, (U32)CNBuffSize);
+        CHECKPLUS(r, ZSTDMT_compressCCtx(mtctx,
+                                compressedBuffer, ZSTD_compressBound(CNBuffSize),
+                                CNBuffer, CNBuffSize,
+                                1),
+                  cSize=r );
+        DISPLAYLEVEL(4, "OK (%u bytes : %.2f%%)\n", (U32)cSize, (double)cSize/CNBuffSize*100);
+
+        DISPLAYLEVEL(4, "test%3i : decompressed size test : ", testNb++);
+        {   unsigned long long const rSize = ZSTD_getFrameContentSize(compressedBuffer, cSize);
+            if (rSize != CNBuffSize)  {
+                DISPLAY("ZSTD_getFrameContentSize incorrect : %u != %u \n", (U32)rSize, (U32)CNBuffSize);
+                goto _output_error;
+        }   }
+        DISPLAYLEVEL(4, "OK \n");
+
+        DISPLAYLEVEL(4, "test%3i : decompress %u bytes : ", testNb++, (U32)CNBuffSize);
+        { size_t const r = ZSTD_decompress(decodedBuffer, CNBuffSize, compressedBuffer, cSize);
+          if (r != CNBuffSize) goto _output_error; }
+        DISPLAYLEVEL(4, "OK \n");
+
+        DISPLAYLEVEL(4, "test%3i : check decompressed result : ", testNb++);
+        {   size_t u;
+            for (u=0; u<CNBuffSize; u++) {
+                if (((BYTE*)decodedBuffer)[u] != ((BYTE*)CNBuffer)[u]) goto _output_error;;
+        }   }
+        DISPLAYLEVEL(4, "OK \n");
+
+        ZSTDMT_freeCCtx(mtctx);
+    }
+
+
     /* Simple API multiframe test */
     DISPLAYLEVEL(4, "test%3i : compress multiple frames : ", testNb++);
     {   size_t off = 0;
@@ -351,7 +404,58 @@ static int basicUnitTests(U32 seed, double compressibility)
                   if (r != CNBuffSize) goto _output_error);
         DISPLAYLEVEL(4, "OK \n");
 
-        DISPLAYLEVEL(4, "test%3i : compress without dictID : ", testNb++);
+        DISPLAYLEVEL(4, "test%3i : compress with preprocessed dictionary : ", testNb++);
+        {   ZSTD_compressionParameters const cParams = ZSTD_getCParams(1, CNBuffSize, dictSize);
+            ZSTD_customMem customMem = { NULL, NULL, NULL };
+            ZSTD_CDict* cdict = ZSTD_createCDict_advanced(dictBuffer, dictSize, 1, cParams, customMem);
+            cSize = ZSTD_compress_usingCDict(cctx, compressedBuffer, ZSTD_compressBound(CNBuffSize),
+                                                 CNBuffer, CNBuffSize, cdict);
+            ZSTD_freeCDict(cdict);
+            if (ZSTD_isError(cSize)) goto _output_error;
+        }
+        DISPLAYLEVEL(4, "OK (%u bytes : %.2f%%)\n", (U32)cSize, (double)cSize/CNBuffSize*100);
+
+        DISPLAYLEVEL(4, "test%3i : retrieve dictID from frame : ", testNb++);
+        {   U32 const did = ZSTD_getDictID_fromFrame(compressedBuffer, cSize);
+            if (did != dictID) goto _output_error;   /* non-conformant (content-only) dictionary */
+        }
+        DISPLAYLEVEL(4, "OK \n");
+
+        DISPLAYLEVEL(4, "test%3i : frame built with dictionary should be decompressible : ", testNb++);
+        CHECKPLUS(r, ZSTD_decompress_usingDict(dctx,
+                                       decodedBuffer, CNBuffSize,
+                                       compressedBuffer, cSize,
+                                       dictBuffer, dictSize),
+                  if (r != CNBuffSize) goto _output_error);
+        DISPLAYLEVEL(4, "OK \n");
+
+        DISPLAYLEVEL(4, "test%3i : ZSTD_compress_usingCDict_advanced, no contentSize, no dictID : ", testNb++);
+        {   ZSTD_frameParameters const fParams = { 0 /* frameSize */, 1 /* checksum */, 1 /* noDictID*/ };
+            ZSTD_compressionParameters const cParams = ZSTD_getCParams(1, CNBuffSize, dictSize);
+            ZSTD_customMem const customMem = { NULL, NULL, NULL };
+            ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictBuffer, dictSize, 1, cParams, customMem);
+            cSize = ZSTD_compress_usingCDict_advanced(cctx, compressedBuffer, ZSTD_compressBound(CNBuffSize),
+                                                 CNBuffer, CNBuffSize, cdict, fParams);
+            ZSTD_freeCDict(cdict);
+            if (ZSTD_isError(cSize)) goto _output_error;
+        }
+        DISPLAYLEVEL(4, "OK (%u bytes : %.2f%%)\n", (U32)cSize, (double)cSize/CNBuffSize*100);
+
+        DISPLAYLEVEL(4, "test%3i : try retrieving contentSize from frame : ", testNb++);
+        {   U64 const contentSize = ZSTD_getFrameContentSize(compressedBuffer, cSize);
+            if (contentSize != ZSTD_CONTENTSIZE_UNKNOWN) goto _output_error;
+        }
+        DISPLAYLEVEL(4, "OK (unknown)\n");
+
+        DISPLAYLEVEL(4, "test%3i : frame built without dictID should be decompressible : ", testNb++);
+        CHECKPLUS(r, ZSTD_decompress_usingDict(dctx,
+                                       decodedBuffer, CNBuffSize,
+                                       compressedBuffer, cSize,
+                                       dictBuffer, dictSize),
+                  if (r != CNBuffSize) goto _output_error);
+        DISPLAYLEVEL(4, "OK \n");
+
+        DISPLAYLEVEL(4, "test%3i : ZSTD_compress_advanced, no dictID : ", testNb++);
         {   ZSTD_parameters p = ZSTD_getParams(3, CNBuffSize, dictSize);
             p.fParams.noDictIDFlag = 1;
             cSize = ZSTD_compress_advanced(cctx, compressedBuffer, ZSTD_compressBound(CNBuffSize),
@@ -438,9 +542,9 @@ static int basicUnitTests(U32 seed, double compressibility)
 
     /* Decompression defense tests */
     DISPLAYLEVEL(4, "test%3i : Check input length for magic number : ", testNb++);
-    { size_t const r = ZSTD_decompress(decodedBuffer, CNBuffSize, CNBuffer, 3);
+    { size_t const r = ZSTD_decompress(decodedBuffer, CNBuffSize, CNBuffer, 3);   /* too small input */
       if (!ZSTD_isError(r)) goto _output_error;
-      if (r != (size_t)-ZSTD_error_srcSize_wrong) goto _output_error; }
+      if (ZSTD_getErrorCode(r) != ZSTD_error_srcSize_wrong) goto _output_error; }
     DISPLAYLEVEL(4, "OK \n");
 
     DISPLAYLEVEL(4, "test%3i : Check magic Number : ", testNb++);
@@ -449,6 +553,24 @@ static int basicUnitTests(U32 seed, double compressibility)
       if (!ZSTD_isError(r)) goto _output_error; }
     DISPLAYLEVEL(4, "OK \n");
 
+    /* content size verification test */
+    DISPLAYLEVEL(4, "test%3i : Content size verification : ", testNb++);
+    {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+        size_t const srcSize = 5000;
+        size_t const wrongSrcSize = (srcSize + 1000);
+        ZSTD_parameters params = ZSTD_getParams(1, wrongSrcSize, 0);
+        params.fParams.contentSizeFlag = 1;
+        {   size_t const result = ZSTD_compressBegin_advanced(cctx, NULL, 0, params, wrongSrcSize);
+            if (ZSTD_isError(result)) goto _output_error;
+        }
+        {   size_t const result = ZSTD_compressEnd(cctx, decodedBuffer, CNBuffSize, CNBuffer, srcSize);
+            if (!ZSTD_isError(result)) goto _output_error;
+            if (ZSTD_getErrorCode(result) != ZSTD_error_srcSize_wrong) goto _output_error;
+            DISPLAYLEVEL(4, "OK : %s \n", ZSTD_getErrorName(result));
+        }
+        ZSTD_freeCCtx(cctx);
+    }
+
     /* block API tests */
     {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
         static const size_t dictSize = 65 KB;
@@ -600,6 +722,14 @@ static size_t findDiff(const void* buf1, const void* buf2, size_t max)
 }
 
 
+static ZSTD_parameters FUZ_makeParams(ZSTD_compressionParameters cParams, ZSTD_frameParameters fParams)
+{
+    ZSTD_parameters params;
+    params.cParams = cParams;
+    params.fParams = fParams;
+    return params;
+}
+
 static size_t FUZ_rLogLength(U32* seed, U32 logLength)
 {
     size_t const lengthMask = ((size_t)1 << logLength) - 1;
@@ -616,7 +746,7 @@ static size_t FUZ_randomLength(U32* seed, U32 maxLog)
 #define CHECK(cond, ...) if (cond) { DISPLAY("Error => "); DISPLAY(__VA_ARGS__); \
                          DISPLAY(" (seed %u, test nb %u)  \n", seed, testNb); goto _output_error; }
 
-static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxDurationS, double compressibility)
+static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxDurationS, double compressibility, int bigTests)
 {
     static const U32 maxSrcLog = 23;
     static const U32 maxSampleLog = 22;
@@ -636,6 +766,7 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
     U32 coreSeed = seed, lseed = 0;
     clock_t const startClock = clock();
     clock_t const maxClockSpan = maxDurationS * CLOCKS_PER_SEC;
+    int const cLevelLimiter = bigTests ? 3 : 2;
 
     /* allocation */
     cNoiseBuffer[0] = (BYTE*)malloc (srcBufferSize);
@@ -662,7 +793,6 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
     for ( ; (testNb <= nbTests) || (FUZ_clockSpan(startClock) < maxClockSpan); testNb++ ) {
         size_t sampleSize, maxTestSize, totalTestSize;
         size_t cSize, totalCSize, totalGenSize;
-        XXH64_state_t xxhState;
         U64 crcOrig;
         BYTE* sampleBuffer;
         const BYTE* dict;
@@ -701,7 +831,10 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
         crcOrig = XXH64(sampleBuffer, sampleSize, 0);
 
         /* compression tests */
-        {   unsigned const cLevel = (FUZ_rand(&lseed) % (ZSTD_maxCLevel() - (FUZ_highbit32((U32)sampleSize)/3))) + 1;
+        {   unsigned const cLevel =
+                    ( FUZ_rand(&lseed) %
+                     (ZSTD_maxCLevel() - (FUZ_highbit32((U32)sampleSize) / cLevelLimiter)) )
+                     + 1;
             cSize = ZSTD_compressCCtx(ctx, cBuffer, cBufferSize, sampleBuffer, sampleSize, cLevel);
             CHECK(ZSTD_isError(cSize), "ZSTD_compressCCtx failed : %s", ZSTD_getErrorName(cSize));
 
@@ -801,7 +934,10 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
 
         {   U32 const testLog = FUZ_rand(&lseed) % maxSrcLog;
             U32 const dictLog = FUZ_rand(&lseed) % maxSrcLog;
-            int const cLevel = (FUZ_rand(&lseed) % (ZSTD_maxCLevel() - (MAX(testLog, dictLog)/3))) + 1;
+            int const cLevel = (FUZ_rand(&lseed) %
+                                (ZSTD_maxCLevel() -
+                                 (MAX(testLog, dictLog) / cLevelLimiter))) +
+                               1;
             maxTestSize = FUZ_rLogLength(&lseed, testLog);
             if (maxTestSize >= dstBufferSize) maxTestSize = dstBufferSize-1;
 
@@ -813,22 +949,22 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
                 CHECK (ZSTD_isError(errorCode), "ZSTD_compressBegin_usingDict error : %s", ZSTD_getErrorName(errorCode));
             } else {
                 ZSTD_compressionParameters const cPar = ZSTD_getCParams(cLevel, 0, dictSize);
-                ZSTD_frameParameters const fpar = { FUZ_rand(&lseed)&1 /* contentSizeFlag */,
+                ZSTD_frameParameters const fPar = { FUZ_rand(&lseed)&1 /* contentSizeFlag */,
                                                     !(FUZ_rand(&lseed)&3) /* contentChecksumFlag*/,
                                                     0 /*NodictID*/ };   /* note : since dictionary is fake, dictIDflag has no impact */
-                ZSTD_parameters p;
-                size_t errorCode;
-                p.cParams = cPar; p.fParams = fpar;
-                errorCode = ZSTD_compressBegin_advanced(refCtx, dict, dictSize, p, 0);
+                ZSTD_parameters const p = FUZ_makeParams(cPar, fPar);
+                size_t const errorCode = ZSTD_compressBegin_advanced(refCtx, dict, dictSize, p, 0);
                 CHECK (ZSTD_isError(errorCode), "ZSTD_compressBegin_advanced error : %s", ZSTD_getErrorName(errorCode));
             }
             {   size_t const errorCode = ZSTD_copyCCtx(ctx, refCtx, 0);
                 CHECK (ZSTD_isError(errorCode), "ZSTD_copyCCtx error : %s", ZSTD_getErrorName(errorCode));
         }   }
-        XXH64_reset(&xxhState, 0);
         ZSTD_setCCtxParameter(ctx, ZSTD_p_forceWindow, FUZ_rand(&lseed) & 1);
+
         {   U32 const nbChunks = (FUZ_rand(&lseed) & 127) + 2;
             U32 n;
+            XXH64_state_t xxhState;
+            XXH64_reset(&xxhState, 0);
             for (totalTestSize=0, cSize=0, n=0 ; n<nbChunks ; n++) {
                 size_t const segmentSize = FUZ_randomLength(&lseed, maxSampleLog);
                 size_t const segmentStart = FUZ_rand(&lseed) % (srcBufferSize - segmentSize);
@@ -843,13 +979,14 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD
                 XXH64_update(&xxhState, srcBuffer+segmentStart, segmentSize);
                 memcpy(mirrorBuffer + totalTestSize, srcBuffer+segmentStart, segmentSize);
                 totalTestSize += segmentSize;
-        }   }
+            }
 
-        {   size_t const flushResult = ZSTD_compressEnd(ctx, cBuffer+cSize, cBufferSize-cSize, NULL, 0);
-            CHECK (ZSTD_isError(flushResult), "multi-segments epilogue error : %s", ZSTD_getErrorName(flushResult));
-            cSize += flushResult;
+            {   size_t const flushResult = ZSTD_compressEnd(ctx, cBuffer+cSize, cBufferSize-cSize, NULL, 0);
+                CHECK (ZSTD_isError(flushResult), "multi-segments epilogue error : %s", ZSTD_getErrorName(flushResult));
+                cSize += flushResult;
+            }
+            crcOrig = XXH64_digest(&xxhState);
         }
-        crcOrig = XXH64_digest(&xxhState);
 
         /* streaming decompression test */
         if (dictSize<8) dictSize=0, dict=NULL;   /* disable dictionary */
@@ -899,7 +1036,7 @@ _output_error:
 /*_*******************************************************
 *  Command line
 *********************************************************/
-int FUZ_usage(const char* programName)
+static int FUZ_usage(const char* programName)
 {
     DISPLAY( "Usage :\n");
     DISPLAY( "      %s [args]\n", programName);
@@ -915,19 +1052,39 @@ int FUZ_usage(const char* programName)
     return 0;
 }
 
+/*! readU32FromChar() :
+    @return : unsigned integer value read from input in `char` format
+    allows and interprets K, KB, KiB, M, MB and MiB suffix.
+    Will also modify `*stringPtr`, advancing it to position where it stopped reading.
+    Note : function result can overflow if digit string > MAX_UINT */
+static unsigned readU32FromChar(const char** stringPtr)
+{
+    unsigned result = 0;
+    while ((**stringPtr >='0') && (**stringPtr <='9'))
+        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
+    if ((**stringPtr=='K') || (**stringPtr=='M')) {
+        result <<= 10;
+        if (**stringPtr=='M') result <<= 10;
+        (*stringPtr)++ ;
+        if (**stringPtr=='i') (*stringPtr)++;
+        if (**stringPtr=='B') (*stringPtr)++;
+    }
+    return result;
+}
 
 int main(int argc, const char** argv)
 {
-    U32 seed=0;
-    int seedset=0;
+    U32 seed = 0;
+    int seedset = 0;
     int argNb;
     int nbTests = nbTestsDefault;
     int testNb = 0;
     U32 proba = FUZ_compressibility_default;
-    int result=0;
+    int result = 0;
     U32 mainPause = 0;
     U32 maxDuration = 0;
-    const char* programName = argv[0];
+    int bigTests = 1;
+    const char* const programName = argv[0];
 
     /* Check command line */
     for (argNb=1; argNb<argc; argNb++) {
@@ -936,81 +1093,64 @@ int main(int argc, const char** argv)
 
         /* Handle commands. Aggregated commands are allowed */
         if (argument[0]=='-') {
+
+            if (!strcmp(argument, "--no-big-tests")) { bigTests=0; continue; }
+
             argument++;
             while (*argument!=0) {
                 switch(*argument)
                 {
                 case 'h':
                     return FUZ_usage(programName);
+
                 case 'v':
                     argument++;
-                    g_displayLevel=4;
+                    g_displayLevel = 4;
                     break;
+
                 case 'q':
                     argument++;
                     g_displayLevel--;
                     break;
+
                 case 'p': /* pause at the end */
                     argument++;
                     mainPause = 1;
                     break;
 
                 case 'i':
-                    argument++; maxDuration=0;
-                    nbTests=0;
-                    while ((*argument>='0') && (*argument<='9')) {
-                        nbTests *= 10;
-                        nbTests += *argument - '0';
-                        argument++;
-                    }
+                    argument++; maxDuration = 0;
+                    nbTests = readU32FromChar(&argument);
                     break;
 
                 case 'T':
                     argument++;
-                    nbTests=0; maxDuration=0;
-                    while ((*argument>='0') && (*argument<='9')) {
-                        maxDuration *= 10;
-                        maxDuration += *argument - '0';
-                        argument++;
-                    }
-                    if (*argument=='m') maxDuration *=60, argument++;
+                    nbTests = 0;
+                    maxDuration = readU32FromChar(&argument);
+                    if (*argument=='s') argument++;   /* seconds */
+                    if (*argument=='m') maxDuration *= 60, argument++;   /* minutes */
                     if (*argument=='n') argument++;
                     break;
 
                 case 's':
                     argument++;
-                    seed=0;
-                    seedset=1;
-                    while ((*argument>='0') && (*argument<='9')) {
-                        seed *= 10;
-                        seed += *argument - '0';
-                        argument++;
-                    }
+                    seedset = 1;
+                    seed = readU32FromChar(&argument);
                     break;
 
                 case 't':
                     argument++;
-                    testNb=0;
-                    while ((*argument>='0') && (*argument<='9')) {
-                        testNb *= 10;
-                        testNb += *argument - '0';
-                        argument++;
-                    }
+                    testNb = readU32FromChar(&argument);
                     break;
 
                 case 'P':   /* compressibility % */
                     argument++;
-                    proba=0;
-                    while ((*argument>='0') && (*argument<='9')) {
-                        proba *= 10;
-                        proba += *argument - '0';
-                        argument++;
-                    }
-                    if (proba>100) proba=100;
+                    proba = readU32FromChar(&argument);
+                    if (proba>100) proba = 100;
                     break;
 
                 default:
-                    return FUZ_usage(programName);
+                    return (FUZ_usage(programName), 1);
     }   }   }   }   /* for (argNb=1; argNb<argc; argNb++) */
 
     /* Get Seed */
@@ -1030,7 +1170,7 @@ int main(int argc, const char** argv)
     if (testNb==0)
         result = basicUnitTests(0, ((double)proba) / 100);  /* constant seed for predictability */
     if (!result)
-        result = fuzzerTests(seed, nbTests, testNb, maxDuration, ((double)proba) / 100);
+        result = fuzzerTests(seed, nbTests, testNb, maxDuration, ((double)proba) / 100, bigTests);
     if (mainPause) {
         int unused;
         DISPLAY("Press Enter \n");
diff --git a/tests/paramgrill.c b/tests/paramgrill.c
index 5eabcba2b60b..1913b54d0a54 100644
--- a/tests/paramgrill.c
+++ b/tests/paramgrill.c
@@ -58,6 +58,11 @@ static const int g_maxNbVariations = 64;
 **************************************/
 #define DISPLAY(...)  fprintf(stderr, __VA_ARGS__)
 
+#undef MIN
+#undef MAX
+#define MIN(a,b)   ( (a) < (b) ? (a) : (b) )
+#define MAX(a,b)   ( (a) > (b) ? (a) : (b) )
+
 
 /*-************************************
 *  Benchmark Parameters
@@ -106,7 +111,11 @@ static size_t BMK_findMaxMem(U64 requiredMem)
 }
 
 
-#  define FUZ_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+static U32 FUZ_rotl32(U32 x, U32 r)
+{
+    return ((x << r) | (x >> (32 - r)));
+}
+
 U32 FUZ_rand(U32* src)
 {
     const U32 prime1 = 2654435761U;
@@ -125,7 +134,7 @@ U32 FUZ_rand(U32* src)
 *********************************************************/
 typedef struct {
     size_t cSize;
-    double cSpeed;
+    double cSpeed;   /* bytes / sec */
     double dSpeed;
 } BMK_result_t;
 
@@ -141,8 +150,6 @@ typedef struct
 } blockParam_t;
 
 
-#define MIN(a,b)  ( (a) < (b) ? (a) : (b) )
-
 static size_t BMK_benchParam(BMK_result_t* resultPtr,
                              const void* srcBuffer, size_t srcSize,
                              ZSTD_CCtx* ctx,
@@ -165,6 +172,11 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
     char name[30] = { 0 };
     U64 crcOrig;
 
+    /* init result for early exit */
+    resultPtr->cSize = srcSize;
+    resultPtr->cSpeed = 0.;
+    resultPtr->dSpeed = 0.;
+
     /* Memory allocation & restrictions */
     snprintf(name, 30, "Sw%02uc%02uh%02us%02ul%1ut%03uS%1u", Wlog, Clog, Hlog, Slog, Slength, Tlength, strat);
     if (!compressedBuffer || !resultBuffer || !blockTable) {
@@ -206,7 +218,6 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
         size_t cSize = 0;
         double fastestC = 100000000., fastestD = 100000000.;
         double ratio = 0.;
-        U64 crcCheck = 0;
         clock_t const benchStart = clock();
 
         DISPLAY("\r%79s\r", "");
@@ -242,8 +253,8 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
             cSize = 0;
             for (blockNb=0; blockNb<nbBlocks; blockNb++)
                 cSize += blockTable[blockNb].cSize;
-            if ((double)roundClock < fastestC * CLOCKS_PER_SEC * nbLoops) fastestC = ((double)roundClock / CLOCKS_PER_SEC) / nbLoops;
             ratio = (double)srcSize / (double)cSize;
+            if ((double)roundClock < fastestC * CLOCKS_PER_SEC * nbLoops) fastestC = ((double)roundClock / CLOCKS_PER_SEC) / nbLoops;
             DISPLAY("\r");
             DISPLAY("%1u-%s : %9u ->", loopNb, name, (U32)srcSize);
             DISPLAY(" %9u (%4.3f),%7.1f MB/s", (U32)cSize, ratio, (double)srcSize / fastestC / 1000000.);
@@ -273,18 +284,18 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
             resultPtr->dSpeed = (double)srcSize / fastestD;
 
             /* CRC Checking */
-            crcCheck = XXH64(resultBuffer, srcSize, 0);
-            if (crcOrig!=crcCheck) {
-                unsigned u;
-                unsigned eBlockSize = (unsigned)(MIN(65536*2, blockSize));
-                DISPLAY("\n!!! WARNING !!! Invalid Checksum : %x != %x\n", (unsigned)crcOrig, (unsigned)crcCheck);
-                for (u=0; u<srcSize; u++) {
-                    if (((const BYTE*)srcBuffer)[u] != ((BYTE*)resultBuffer)[u]) {
-                        printf("Decoding error at pos %u (block %u, pos %u) \n", u, u / eBlockSize, u % eBlockSize);
-                        break;
-                }   }
-                break;
-            }
+            {   U64 const crcCheck = XXH64(resultBuffer, srcSize, 0);
+                if (crcOrig!=crcCheck) {
+                    unsigned u;
+                    unsigned eBlockSize = (unsigned)(MIN(65536*2, blockSize));
+                    DISPLAY("\n!!! WARNING !!! Invalid Checksum : %x != %x\n", (unsigned)crcOrig, (unsigned)crcCheck);
+                    for (u=0; u<srcSize; u++) {
+                        if (((const BYTE*)srcBuffer)[u] != ((BYTE*)resultBuffer)[u]) {
+                            printf("Decoding error at pos %u (block %u, pos %u) \n", u, u / eBlockSize, u % eBlockSize);
+                            break;
+                    }   }
+                    break;
+            }   }
 #endif
     }   }
 
@@ -505,8 +516,6 @@ static BYTE g_alreadyTested[PARAMTABLESIZE] = {0};   /* init to zero */
     g_alreadyTested[(XXH64(sanitizeParams(p), sizeof(p), 0) >> 3) & PARAMTABLEMASK]
 
 
-#define MAX(a,b)   ( (a) > (b) ? (a) : (b) )
-
 static void playAround(FILE* f, winnerInfo_t* winners,
                        ZSTD_compressionParameters params,
                        const void* srcBuffer, size_t srcSize,
@@ -711,6 +720,14 @@ int benchFiles(const char** fileNamesTable, int nbFiles)
 }
 
 
+static void BMK_translateAdvancedParams(ZSTD_compressionParameters params)
+{
+    DISPLAY("--zstd=windowLog=%u,chainLog=%u,hashLog=%u,searchLog=%u,searchLength=%u,targetLength=%u,strategy=%u \n",
+             params.windowLog, params.chainLog, params.hashLog, params.searchLog, params.searchLength, params.targetLength, (U32)(params.strategy));
+}
+
+/* optimizeForSize():
+ * targetSpeed : expressed in MB/s */
 int optimizeForSize(const char* inFileName, U32 targetSpeed)
 {
     FILE* const inFile = fopen( inFileName, "rb" );
@@ -723,8 +740,11 @@ int optimizeForSize(const char* inFileName, U32 targetSpeed)
 
     /* Memory allocation & restrictions */
     if ((U64)benchedSize > inFileSize) benchedSize = (size_t)inFileSize;
-    if (benchedSize < inFileSize)
-        DISPLAY("Not enough memory for '%s' full size; testing %i MB only...\n", inFileName, (int)(benchedSize>>20));
+    if (benchedSize < inFileSize) {
+        DISPLAY("Not enough memory for '%s' \n", inFileName);
+        fclose(inFile);
+        return 11;
+    }
 
     /* Alloc */
     origBuff = malloc(benchedSize);
@@ -747,10 +767,9 @@ int optimizeForSize(const char* inFileName, U32 targetSpeed)
     /* bench */
     DISPLAY("\r%79s\r", "");
     DISPLAY("optimizing for %s - limit speed %u MB/s \n", inFileName, targetSpeed);
-    targetSpeed *= 1000;
+    targetSpeed *= 1000000;
 
     {   ZSTD_CCtx* const ctx = ZSTD_createCCtx();
-        ZSTD_compressionParameters params;
         winnerInfo_t winner;
         BMK_result_t candidate;
         const size_t blockSize = g_blockSize ? g_blockSize : benchedSize;
@@ -764,26 +783,28 @@ int optimizeForSize(const char* inFileName, U32 targetSpeed)
         {   const int maxSeeds = g_noSeed ? 1 : ZSTD_maxCLevel();
             int i;
             for (i=1; i<=maxSeeds; i++) {
-                params = ZSTD_getCParams(i, blockSize, 0);
-                BMK_benchParam(&candidate, origBuff, benchedSize, ctx, params);
+                ZSTD_compressionParameters const CParams = ZSTD_getCParams(i, blockSize, 0);
+                BMK_benchParam(&candidate, origBuff, benchedSize, ctx, CParams);
                 if (candidate.cSpeed < targetSpeed)
                     break;
                 if ( (candidate.cSize < winner.result.cSize)
                    | ((candidate.cSize == winner.result.cSize) & (candidate.cSpeed > winner.result.cSpeed)) )
                 {
-                    winner.params = params;
+                    winner.params = CParams;
                     winner.result = candidate;
                     BMK_printWinner(stdout, i, winner.result, winner.params, benchedSize);
             }   }
         }
         BMK_printWinner(stdout, 99, winner.result, winner.params, benchedSize);
+        BMK_translateAdvancedParams(winner.params);
 
         /* start tests */
         {   time_t const grillStart = time(NULL);
             do {
-                params = winner.params;
+                ZSTD_compressionParameters params = winner.params;
                 paramVariation(&params);
-                if ((FUZ_rand(&g_rand) & 15) == 3) params = randomParams();
+                if ((FUZ_rand(&g_rand) & 31) == 3) params = randomParams();  /* totally random config to improve search space */
+                params = ZSTD_adjustCParams(params, blockSize, 0);
 
                 /* exclude faster if already played set of params */
                 if (FUZ_rand(&g_rand) & ((1 << NB_TESTS_PLAYED(params))-1)) continue;
@@ -800,6 +821,7 @@ int optimizeForSize(const char* inFileName, U32 targetSpeed)
                     winner.params = params;
                     winner.result = candidate;
                     BMK_printWinner(stdout, 99, winner.result, winner.params, benchedSize);
+                    BMK_translateAdvancedParams(winner.params);
                 }
             } while (BMK_timeSpan(grillStart) < g_grillDuration_s);
         }
@@ -833,7 +855,7 @@ static int usage_advanced(void)
     DISPLAY( " -T#    : set level 1 speed objective \n");
     DISPLAY( " -B#    : cut input into blocks of size # (default : single block) \n");
     DISPLAY( " -i#    : iteration loops [1-9](default : %i) \n", NBLOOPS);
-    DISPLAY( " -O#    : find Optimized parameters for # target speed (default : 0) \n");
+    DISPLAY( " -O#    : find Optimized parameters for # MB/s compression speed (default : 0) \n");
     DISPLAY( " -S     : Single run \n");
     DISPLAY( " -P#    : generated sample compressibility (default : %.1f%%) \n", COMPRESSIBILITY_DEFAULT * 100);
     return 0;
diff --git a/tests/playTests.sh b/tests/playTests.sh
index c493fed55358..021fd59fe2af 100755
--- a/tests/playTests.sh
+++ b/tests/playTests.sh
@@ -11,6 +11,7 @@ roundTripTest() {
         local_p="$2"
     else
         local_c="$2"
+        local_p=""
     fi
 
     rm -f tmp1 tmp2
@@ -20,13 +21,36 @@ roundTripTest() {
     $DIFF -q tmp1 tmp2
 }
 
+fileRoundTripTest() {
+    if [ -n "$3" ]; then
+        local_c="$3"
+        local_p="$2"
+    else
+        local_c="$2"
+        local_p=""
+    fi
+
+    rm -f tmp.zstd tmp.md5.1 tmp.md5.2
+    $ECHO "fileRoundTripTest: ./datagen $1 $local_p > tmp && $ZSTD -v$local_c -c tmp | $ZSTD -d"
+    ./datagen $1 $local_p > tmp
+    cat tmp | $MD5SUM > tmp.md5.1
+    $ZSTD --ultra -v$local_c -c tmp | $ZSTD -d | $MD5SUM > tmp.md5.2
+    $DIFF -q tmp.md5.1 tmp.md5.2
+}
+
+isTerminal=false
+if [ -t 0 ] && [ -t 1 ]
+then
+    isTerminal=true
+fi
+
 isWindows=false
-ECHO="echo"
+ECHO="echo -e"
 INTOVOID="/dev/null"
 case "$OS" in
   Windows*)
     isWindows=true
-    ECHO="echo -e"
+    INTOVOID="NUL"
     ;;
 esac
 
@@ -42,11 +66,17 @@ case "$UNAME" in
   SunOS) DIFF="gdiff" ;;
 esac
 
-
 $ECHO "\nStarting playTests.sh isWindows=$isWindows ZSTD='$ZSTD'"
 
 [ -n "$ZSTD" ] || die "ZSTD variable must be defined!"
 
+if [ -n "$(echo hello | $ZSTD -v -T2 2>&1 > $INTOVOID | grep 'multi-threading is disabled')" ]
+then
+    hasMT=""
+else
+    hasMT="true"
+fi
+
 $ECHO "\n**** simple tests **** "
 
 ./datagen > tmp
@@ -72,6 +102,12 @@ cp tmp tmp2
 $ZSTD tmp2 -fo && die "-o must be followed by filename "
 $ECHO "test : implied stdout when input is stdin"
 $ECHO bob | $ZSTD | $ZSTD -d
+if [ "$isTerminal" = true ]; then
+$ECHO "test : compressed data to terminal"
+$ECHO bob | $ZSTD && die "should have refused : compressed data to terminal"
+$ECHO "test : compressed data from terminal (a hang here is a test fail, zstd is wrongly waiting on data from terminal)"
+$ZSTD -d > $INTOVOID && die "should have refused : compressed data from terminal"
+fi
 $ECHO "test : null-length file roundtrip"
 $ECHO -n '' | $ZSTD - --stdout | $ZSTD -d --stdout
 $ECHO "test : decompress file with wrong suffix (must fail)"
@@ -96,6 +132,14 @@ $ZSTD -q tmp && die "overwrite check failed!"
 $ECHO "test : force overwrite"
 $ZSTD -q -f tmp
 $ZSTD -q --force tmp
+$ECHO "test : overwrite readonly file"
+rm -f tmpro tmpro.zst
+$ECHO foo > tmpro.zst
+$ECHO foo > tmpro
+chmod 400 tmpro.zst
+$ZSTD -q tmpro && die "should have refused to overwrite read-only file"
+$ZSTD -q -f tmpro
+rm -f tmpro tmpro.zst
 $ECHO "test : file removal"
 $ZSTD -f --rm tmp
 ls tmp && die "tmp should no longer be present"
@@ -156,6 +200,19 @@ $ECHO "$ECHO foo | $ZSTD > /dev/full"
 $ECHO foo | $ZSTD > /dev/full && die "write error not detected!"
 $ECHO "$ECHO foo | $ZSTD | $ZSTD -d > /dev/full"
 $ECHO foo | $ZSTD | $ZSTD -d > /dev/full && die "write error not detected!"
+
+$ECHO "\n**** symbolic link test **** "
+
+rm -f hello.tmp world.tmp hello.tmp.zst world.tmp.zst
+$ECHO "hello world" > hello.tmp
+ln -s hello.tmp world.tmp
+$ZSTD world.tmp hello.tmp
+ls hello.tmp.zst || die "regular file should have been compressed!"
+ls world.tmp.zst && die "symbolic link should not have been compressed!"
+$ZSTD world.tmp hello.tmp -f
+ls world.tmp.zst || die "symbol link should have been compressed with --force"
+rm -f hello.tmp world.tmp hello.tmp.zst world.tmp.zst
+
 fi
 
 
@@ -227,12 +284,12 @@ $ECHO "- Create second (different) dictionary "
 $ZSTD --train *.c ../programs/*.c ../programs/*.h -o tmpDictC
 $ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
 $ECHO "- Create dictionary with short dictID"
-$ZSTD --train *.c ../programs/*.c --dictID 1 -o tmpDict1
+$ZSTD --train *.c ../programs/*.c --dictID=1 -o tmpDict1
 cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
 $ECHO "- Create dictionary with wrong dictID parameter order (must fail)"
 $ZSTD --train *.c ../programs/*.c --dictID -o 1 tmpDict1 && die "wrong order : --dictID must be followed by argument "
 $ECHO "- Create dictionary with size limit"
-$ZSTD --train *.c ../programs/*.c -o tmpDict2 --maxdict 4K -v
+$ZSTD --train *.c ../programs/*.c -o tmpDict2 --maxdict=4K -v
 $ECHO "- Create dictionary with wrong parameter order (must fail)"
 $ZSTD --train *.c ../programs/*.c -o tmpDict2 --maxdict -v 4K && die "wrong order : --maxdict must be followed by argument "
 $ECHO "- Compress without dictID"
@@ -240,7 +297,7 @@ $ZSTD -f tmp -D tmpDict1 --no-dictID
 $ZSTD -d tmp.zst -D tmpDict -fo result
 $DIFF $TESTFILE result
 $ECHO "- Compress with wrong argument order (must fail)"
-$ZSTD tmp -Df tmpDict1 -c > /dev/null && die "-D must be followed by dictionary name "
+$ZSTD tmp -Df tmpDict1 -c > $INTOVOID && die "-D must be followed by dictionary name "
 $ECHO "- Compress multiple files with dictionary"
 rm -rf dirTestDict
 mkdir dirTestDict
@@ -255,6 +312,11 @@ case "$UNAME" in
   *) $MD5SUM -c tmph1 ;;
 esac
 rm -rf dirTestDict
+$ECHO "- dictionary builder on bogus input"
+$ECHO "Hello World" > tmp
+$ZSTD --train-legacy -q tmp && die "Dictionary training should fail : not enough input source"
+./datagen -P0 -g10M > tmp
+$ZSTD --train-legacy -q tmp && die "Dictionary training should fail : source is pure noise"
 rm tmp*
 
 
@@ -263,19 +325,39 @@ $ECHO "\n**** cover dictionary tests **** "
 TESTFILE=../programs/zstdcli.c
 ./datagen > tmpDict
 $ECHO "- Create first dictionary"
-$ZSTD --train --cover=k=46,d=8 *.c ../programs/*.c -o tmpDict
+$ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c -o tmpDict
 cp $TESTFILE tmp
 $ZSTD -f tmp -D tmpDict
 $ZSTD -d tmp.zst -D tmpDict -fo result
 $DIFF $TESTFILE result
 $ECHO "- Create second (different) dictionary"
-$ZSTD --train --cover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC
+$ZSTD --train-cover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC
 $ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
 $ECHO "- Create dictionary with short dictID"
-$ZSTD --train --cover=k=46,d=8 *.c ../programs/*.c --dictID 1 -o tmpDict1
+$ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c --dictID=1 -o tmpDict1
 cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
 $ECHO "- Create dictionary with size limit"
-$ZSTD --train --optimize-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict 4K
+$ZSTD --train-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
+rm tmp*
+
+$ECHO "\n**** legacy dictionary tests **** "
+
+TESTFILE=../programs/zstdcli.c
+./datagen > tmpDict
+$ECHO "- Create first dictionary"
+$ZSTD --train-legacy=selectivity=8 *.c ../programs/*.c -o tmpDict
+cp $TESTFILE tmp
+$ZSTD -f tmp -D tmpDict
+$ZSTD -d tmp.zst -D tmpDict -fo result
+$DIFF $TESTFILE result
+$ECHO "- Create second (different) dictionary"
+$ZSTD --train-legacy=s=5 *.c ../programs/*.c ../programs/*.h -o tmpDictC
+$ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
+$ECHO "- Create dictionary with short dictID"
+$ZSTD --train-legacy -s5 *.c ../programs/*.c --dictID=1 -o tmpDict1
+cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
+$ECHO "- Create dictionary with size limit"
+$ZSTD --train-legacy -s9 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
 rm tmp*
 
 
@@ -341,7 +423,7 @@ if [ $GZIPMODE -eq 1 ]; then
     $ZSTD -f --format=gzip tmp
     $ZSTD -f tmp
     cat tmp.gz tmp.zst tmp.gz tmp.zst | $ZSTD -d -f -o tmp
-    head -c -1 tmp.gz | $ZSTD -t && die "incomplete frame not detected !"
+    head -c -1 tmp.gz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
     rm tmp*
 else
     $ECHO "gzip mode not supported"
@@ -383,13 +465,48 @@ if [ $LZMAMODE -eq 1 ]; then
     $ZSTD -f --format=lzma tmp
     $ZSTD -f tmp
     cat tmp.xz tmp.lzma tmp.zst tmp.lzma tmp.xz tmp.zst | $ZSTD -d -f -o tmp
-    head -c -1 tmp.xz | $ZSTD -t && die "incomplete frame not detected !"
-    head -c -1 tmp.lzma | $ZSTD -t && die "incomplete frame not detected !"
+    head -c -1 tmp.xz | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    head -c -1 tmp.lzma | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
     rm tmp*
 else
     $ECHO "xz mode not supported"
 fi
 
+$ECHO "\n**** lz4 compatibility tests **** "
+
+LZ4MODE=1
+$ZSTD --format=lz4 -V || LZ4MODE=0
+if [ $LZ4MODE -eq 1 ]; then
+    $ECHO "lz4 support detected"
+    LZ4EXE=1
+    lz4 -V || LZ4EXE=0
+    if [ $LZ4EXE -eq 1 ]; then
+        ./datagen > tmp
+        $ZSTD --format=lz4 -f tmp
+        lz4 -t -v tmp.lz4
+        lz4 -f tmp
+        $ZSTD -d -f -v tmp.lz4
+        rm tmp*
+    else
+        $ECHO "lz4 binary not detected"
+    fi
+else
+    $ECHO "lz4 mode not supported"
+fi
+
+
+$ECHO "\n**** lz4 frame tests **** "
+
+if [ $LZ4MODE -eq 1 ]; then
+    ./datagen > tmp
+    $ZSTD -f --format=lz4 tmp
+    $ZSTD -f tmp
+    cat tmp.lz4 tmp.zst tmp.lz4 tmp.zst | $ZSTD -d -f -o tmp
+    head -c -1 tmp.lz4 | $ZSTD -t > $INTOVOID && die "incomplete frame not detected !"
+    rm tmp*
+else
+    $ECHO "lz4 mode not supported"
+fi
 
 $ECHO "\n**** zstd round-trip tests **** "
 
@@ -402,6 +519,19 @@ roundTripTest -g519K 6    # greedy, hash chain
 roundTripTest -g517K 16   # btlazy2
 roundTripTest -g516K 19   # btopt
 
+fileRoundTripTest -g500K
+
+if [ -n "$hasMT" ]
+then
+    $ECHO "\n**** zstdmt round-trip tests **** "
+    roundTripTest -g4M "1 -T0"
+    roundTripTest -g8M "3 -T2"
+    roundTripTest -g8000K "2 --threads=2"
+    fileRoundTripTest -g4M "19 -T2 -B1M"
+else
+    $ECHO "\n**** no multithreading, skipping zstdmt tests **** "
+fi
+
 rm tmp*
 
 if [ "$1" != "--test-large-data" ]; then
@@ -437,4 +567,16 @@ roundTripTest -g50000000 -P94 19
 roundTripTest -g99000000 -P99 20
 roundTripTest -g6000000000 -P99 1
 
+fileRoundTripTest -g4193M -P99 1
+
+if [ -n "$hasMT" ]
+then
+    $ECHO "\n**** zstdmt long round-trip tests **** "
+    roundTripTest -g99000000 -P99 "20 -T2"
+    roundTripTest -g6000000000 -P99 "1 -T2"
+    fileRoundTripTest -g4193M -P98 " -T0"
+else
+    $ECHO "\n**** no multithreading, skipping zstdmt tests **** "
+fi
+
 rm tmp*
diff --git a/tests/test-zstd-speed.py b/tests/test-zstd-speed.py
index 23d4f477c7f3..56108a5cae4c 100755
--- a/tests/test-zstd-speed.py
+++ b/tests/test-zstd-speed.py
@@ -14,14 +14,15 @@
 # - dir1/zstd and dir2/zstd will be merged in a single results file
 
 import argparse
-import os
+import os           # getloadavg
 import string
 import subprocess
-import time
+import time         # strftime
 import traceback
 import hashlib
+import platform     # system
 
-script_version = 'v1.1.1 (2016-10-28)'
+script_version = 'v1.1.2 (2017-03-26)'
 default_repo_url = 'https://github.com/facebook/zstd.git'
 working_dir_name = 'speedTest'
 working_path = os.getcwd() + '/' + working_dir_name     # /path/to/zstd/tests/speedTest
@@ -152,10 +153,15 @@ def benchmark_and_compare(branch, commit, last_commit, args, executableName, md5
             % (os.getloadavg()[0], args.maxLoadAvg, sleepTime))
         time.sleep(sleepTime)
     start_load = str(os.getloadavg())
+    osType = platform.system()
+    if osType == 'Linux':
+        cpuSelector = "taskset --cpu-list 0"
+    else:
+        cpuSelector = ""
     if args.dictionary:
-        result = execute('programs/%s -rqi5b1e%s -D %s %s' % (executableName, args.lastCLevel, args.dictionary, testFilePath), print_output=True)
+        result = execute('%s programs/%s -rqi5b1e%s -D %s %s' % (cpuSelector, executableName, args.lastCLevel, args.dictionary, testFilePath), print_output=True)
     else:
-        result = execute('programs/%s -rqi5b1e%s %s' % (executableName, args.lastCLevel, testFilePath), print_output=True)   
+        result = execute('%s programs/%s -rqi5b1e%s %s' % (cpuSelector, executableName, args.lastCLevel, testFilePath), print_output=True)
     end_load = str(os.getloadavg())
     linesExpected = args.lastCLevel + 1
     if len(result) != linesExpected:
@@ -291,7 +297,7 @@ if __name__ == '__main__':
         log("ERROR: e-mail senders 'mail' or 'mutt' not found")
         exit(1)
 
-    clang_version = execute("clang -v 2>&1 | grep 'clang version' | sed -e 's:.*version \\([0-9.]*\\).*:\\1:' -e 's:\\.\\([0-9][0-9]\\):\\1:g'", verbose)[0];
+    clang_version = execute("clang -v 2>&1 | grep ' version ' | sed -e 's:.*version \\([0-9.]*\\).*:\\1:' -e 's:\\.\\([0-9][0-9]\\):\\1:g'", verbose)[0];
     gcc_version = execute("gcc -dumpversion", verbose)[0];
 
     if verbose:
diff --git a/tests/zbufftest.c b/tests/zbufftest.c
index 14b73923311d..601aa808d027 100644
--- a/tests/zbufftest.c
+++ b/tests/zbufftest.c
@@ -60,7 +60,7 @@ static U32 g_displayLevel = 2;
 #define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
             if ((FUZ_GetClockSpan(g_displayClock) > g_refreshRate) || (g_displayLevel>=4)) \
             { g_displayClock = clock(); DISPLAY(__VA_ARGS__); \
-            if (g_displayLevel>=4) fflush(stdout); } }
+            if (g_displayLevel>=4) fflush(stderr); } }
 static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
 static clock_t g_displayClock = 0;
 
diff --git a/tests/zstreamtest.c b/tests/zstreamtest.c
index aa7367bcfd31..0e09e185642b 100644
--- a/tests/zstreamtest.c
+++ b/tests/zstreamtest.c
@@ -59,7 +59,7 @@ static U32 g_displayLevel = 2;
 #define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
             if ((FUZ_GetClockSpan(g_displayClock) > g_refreshRate) || (g_displayLevel>=4)) \
             { g_displayClock = clock(); DISPLAY(__VA_ARGS__); \
-            if (g_displayLevel>=4) fflush(stdout); } }
+            if (g_displayLevel>=4) fflush(stderr); } }
 static const clock_t g_refreshRate = CLOCKS_PER_SEC / 6;
 static clock_t g_displayClock = 0;
 
@@ -131,7 +131,7 @@ static buffer_t FUZ_createDictionary(const void* src, size_t srcSize, size_t blo
     }
     {   size_t const dictSize = ZDICT_trainFromBuffer(dict.start, requestedDictSize, src, blockSizes, (unsigned)nbBlocks);
         free(blockSizes);
-        if (ZDICT_isError(dictSize)) { free(dict.start); return (buffer_t){ NULL, 0, 0 }; }
+        if (ZDICT_isError(dictSize)) { free(dict.start); return g_nullBuffer; }
         dict.size = requestedDictSize;
         dict.filled = dictSize;
         return dict;   /* how to return dictSize ? */
@@ -207,6 +207,16 @@ static int basicUnitTests(U32 seed, double compressibility, ZSTD_customMem custo
       DISPLAYLEVEL(3, "OK (%u bytes) \n", (U32)s);
     }
 
+    /* Attempt bad compression parameters */
+    DISPLAYLEVEL(3, "test%3i : use bad compression parameters : ", testNb++);
+    {   size_t r;
+        ZSTD_parameters params = ZSTD_getParams(1, 0, 0);
+        params.cParams.searchLength = 2;
+        r = ZSTD_initCStream_advanced(zc, NULL, 0, params, 0);
+        if (!ZSTD_isError(r)) goto _output_error;
+        DISPLAYLEVEL(3, "init error : %s \n", ZSTD_getErrorName(r));
+    }
+
     /* skippable frame test */
     DISPLAYLEVEL(3, "test%3i : decompress skippable frame : ", testNb++);
     ZSTD_initDStream_usingDict(zd, CNBuffer, 128 KB);
@@ -438,11 +448,64 @@ static int basicUnitTests(U32 seed, double compressibility, ZSTD_customMem custo
       if (!ZSTD_isError(r)) goto _output_error;  /* must fail : frame requires > 100 bytes */
       DISPLAYLEVEL(3, "OK (%s)\n", ZSTD_getErrorName(r)); }
 
-    /* Unknown srcSize */
+    DISPLAYLEVEL(3, "test%3i : ZSTD_initCStream_usingCDict_advanced with masked dictID : ", testNb++);
+    {   ZSTD_compressionParameters const cParams = ZSTD_getCParams(1, CNBufferSize, dictionary.filled);
+        ZSTD_frameParameters const fParams = { 1 /* contentSize */, 1 /* checksum */, 1 /* noDictID */};
+        ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictionary.start, dictionary.filled, 1 /* byReference */, cParams, customMem);
+        size_t const initError = ZSTD_initCStream_usingCDict_advanced(zc, cdict, CNBufferSize, fParams);
+        if (ZSTD_isError(initError)) goto _output_error;
+        cSize = 0;
+        outBuff.dst = compressedBuffer;
+        outBuff.size = compressedBufferSize;
+        outBuff.pos = 0;
+        inBuff.src = CNBuffer;
+        inBuff.size = CNBufferSize;
+        inBuff.pos = 0;
+        { size_t const r = ZSTD_compressStream(zc, &outBuff, &inBuff);
+          if (ZSTD_isError(r)) goto _output_error; }
+        if (inBuff.pos != inBuff.size) goto _output_error;   /* entire input should be consumed */
+        { size_t const r = ZSTD_endStream(zc, &outBuff);
+          if (r != 0) goto _output_error; }  /* error, or some data not flushed */
+        cSize = outBuff.pos;
+        ZSTD_freeCDict(cdict);
+        DISPLAYLEVEL(3, "OK (%u bytes : %.2f%%)\n", (U32)cSize, (double)cSize/CNBufferSize*100);
+    }
+
+    DISPLAYLEVEL(3, "test%3i : try retrieving dictID from frame : ", testNb++);
+    {   U32 const did = ZSTD_getDictID_fromFrame(compressedBuffer, cSize);
+        if (did != 0) goto _output_error;
+    }
+    DISPLAYLEVEL(3, "OK (not detected) \n");
+
+    DISPLAYLEVEL(3, "test%3i : decompress without dictionary : ", testNb++);
+    {   size_t const r = ZSTD_decompress(decodedBuffer, CNBufferSize, compressedBuffer, cSize);
+        if (!ZSTD_isError(r)) goto _output_error;  /* must fail : dictionary not used */
+        DISPLAYLEVEL(3, "OK (%s)\n", ZSTD_getErrorName(r));
+    }
+
+    /* Empty srcSize */
+    DISPLAYLEVEL(3, "test%3i : ZSTD_initCStream_advanced with pledgedSrcSize=0 and dict : ", testNb++);
+    {   ZSTD_parameters params = ZSTD_getParams(5, 0, 0);
+        params.fParams.contentSizeFlag = 1;
+        ZSTD_initCStream_advanced(zc, dictionary.start, dictionary.filled, params, 0);
+    } /* cstream advanced shall write content size = 0 */
+    inBuff.src = CNBuffer;
+    inBuff.size = 0;
+    inBuff.pos = 0;
+    outBuff.dst = compressedBuffer;
+    outBuff.size = compressedBufferSize;
+    outBuff.pos = 0;
+    if (ZSTD_isError(ZSTD_compressStream(zc, &outBuff, &inBuff))) goto _output_error;
+    if (ZSTD_endStream(zc, &outBuff) != 0) goto _output_error;
+    cSize = outBuff.pos;
+    if (ZSTD_findDecompressedSize(compressedBuffer, cSize) != 0) goto _output_error;
+    DISPLAYLEVEL(3, "OK \n");
+
     DISPLAYLEVEL(3, "test%3i : pledgedSrcSize == 0 behaves properly : ", testNb++);
     {   ZSTD_parameters params = ZSTD_getParams(5, 0, 0);
         params.fParams.contentSizeFlag = 1;
-        ZSTD_initCStream_advanced(zc, NULL, 0, params, 0); } /* cstream advanced should write the 0 size field */
+        ZSTD_initCStream_advanced(zc, NULL, 0, params, 0);
+    } /* cstream advanced shall write content size = 0 */
     inBuff.src = CNBuffer;
     inBuff.size = 0;
     inBuff.pos = 0;
@@ -552,7 +615,7 @@ static size_t FUZ_randomLength(U32* seed, U32 maxLog)
 #define CHECK(cond, ...) if (cond) { DISPLAY("Error => "); DISPLAY(__VA_ARGS__); \
                          DISPLAY(" (seed %u, test nb %u)  \n", seed, testNb); goto _output_error; }
 
-static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibility)
+static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibility, int bigTests)
 {
     static const U32 maxSrcLog = 24;
     static const U32 maxSampleLog = 19;
@@ -574,6 +637,7 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compres
     const BYTE* dict=NULL;   /* can keep same dict on 2 consecutive tests */
     size_t dictSize = 0;
     U32 oldTestLog = 0;
+    int const cLevelLimiter = bigTests ? 3 : 2;
 
     /* allocations */
     cNoiseBuffer[0] = (BYTE*)malloc (srcBufferSize);
@@ -638,7 +702,8 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compres
         if ((FUZ_rand(&lseed)&1) /* at beginning, to keep same nb of rand */
             && oldTestLog /* at least one test happened */ && resetAllowed) {
             maxTestSize = FUZ_randomLength(&lseed, oldTestLog+2);
-            if (maxTestSize >= srcBufferSize) maxTestSize = srcBufferSize-1;
+            if (maxTestSize >= srcBufferSize)
+                maxTestSize = srcBufferSize-1;
             {   U64 const pledgedSrcSize = (FUZ_rand(&lseed) & 3) ? 0 : maxTestSize;
                 size_t const resetError = ZSTD_resetCStream(zc, pledgedSrcSize);
                 CHECK(ZSTD_isError(resetError), "ZSTD_resetCStream error : %s", ZSTD_getErrorName(resetError));
@@ -646,11 +711,14 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compres
         } else {
             U32 const testLog = FUZ_rand(&lseed) % maxSrcLog;
             U32 const dictLog = FUZ_rand(&lseed) % maxSrcLog;
-            U32 const cLevel = (FUZ_rand(&lseed) % (ZSTD_maxCLevel() - (MAX(testLog, dictLog)/3))) + 1;
+            U32 const cLevel = ( FUZ_rand(&lseed) %
+                                (ZSTD_maxCLevel() -
+                                (MAX(testLog, dictLog) / cLevelLimiter)))
+                                 + 1;
             maxTestSize = FUZ_rLogLength(&lseed, testLog);
             oldTestLog = testLog;
             /* random dictionary selection */
-            dictSize  = ((FUZ_rand(&lseed)&63)==1) ? FUZ_rLogLength(&lseed, dictLog) : 0;
+            dictSize  = ((FUZ_rand(&lseed)&1)==1) ? FUZ_rLogLength(&lseed, dictLog) : 0;
             {   size_t const dictStart = FUZ_rand(&lseed) % (srcBufferSize - dictSize);
                 dict = srcBuffer + dictStart;
             }
@@ -785,7 +853,7 @@ _output_error:
 
 
 /* Multi-threading version of fuzzer Tests */
-static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double compressibility)
+static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double compressibility, int bigTests)
 {
     static const U32 maxSrcLog = 24;
     static const U32 maxSampleLog = 19;
@@ -807,6 +875,7 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
     const BYTE* dict=NULL;   /* can keep same dict on 2 consecutive tests */
     size_t dictSize = 0;
     U32 oldTestLog = 0;
+    int const cLevelLimiter = bigTests ? 3 : 2;
 
     /* allocations */
     cNoiseBuffer[0] = (BYTE*)malloc (srcBufferSize);
@@ -851,6 +920,7 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
         /* some issues can only happen when reusing states */
         if ((FUZ_rand(&lseed) & 0xFF) == 131) {
             U32 const nbThreads = (FUZ_rand(&lseed) % 6) + 1;
+            DISPLAYLEVEL(5, "Creating new context with %u threads \n", nbThreads);
             ZSTDMT_freeCCtx(zc);
             zc = ZSTDMT_createCCtx(nbThreads);
             resetAllowed=0;
@@ -888,7 +958,10 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
         } else {
             U32 const testLog = FUZ_rand(&lseed) % maxSrcLog;
             U32 const dictLog = FUZ_rand(&lseed) % maxSrcLog;
-            U32 const cLevel = (FUZ_rand(&lseed) % (ZSTD_maxCLevel() - (MAX(testLog, dictLog)/3))) + 1;
+            U32 const cLevel = (FUZ_rand(&lseed) %
+                                (ZSTD_maxCLevel() -
+                                 (MAX(testLog, dictLog) / cLevelLimiter))) +
+                               1;
             maxTestSize = FUZ_rLogLength(&lseed, testLog);
             oldTestLog = testLog;
             /* random dictionary selection */
@@ -898,9 +971,12 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
             }
             {   U64 const pledgedSrcSize = (FUZ_rand(&lseed) & 3) ? 0 : maxTestSize;
                 ZSTD_parameters params = ZSTD_getParams(cLevel, pledgedSrcSize, dictSize);
-                DISPLAYLEVEL(5, "Init with windowLog = %u \n", params.cParams.windowLog);
+                DISPLAYLEVEL(5, "Init with windowLog = %u and pledgedSrcSize = %u \n",
+                    params.cParams.windowLog, (U32)pledgedSrcSize);
                 params.fParams.checksumFlag = FUZ_rand(&lseed) & 1;
                 params.fParams.noDictIDFlag = FUZ_rand(&lseed) & 1;
+                params.fParams.contentSizeFlag = pledgedSrcSize>0;
+                DISPLAYLEVEL(5, "checksumFlag : %u \n", params.fParams.checksumFlag);
                 { size_t const initError = ZSTDMT_initCStream_advanced(zc, dict, dictSize, params, pledgedSrcSize);
                   CHECK (ZSTD_isError(initError),"ZSTDMT_initCStream_advanced error : %s", ZSTD_getErrorName(initError)); }
                 ZSTDMT_setMTCtxParameter(zc, ZSTDMT_p_overlapSectionLog, FUZ_rand(&lseed) % 12);
@@ -938,7 +1014,7 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
                     outBuff.size = outBuff.pos + adjustedDstSize;
                     DISPLAYLEVEL(5, "Flushing into dst buffer of size %u \n", (U32)adjustedDstSize);
                     {   size_t const flushError = ZSTDMT_flushStream(zc, &outBuff);
-                        CHECK (ZSTD_isError(flushError), "flush error : %s", ZSTD_getErrorName(flushError));
+                        CHECK (ZSTD_isError(flushError), "ZSTDMT_flushStream error : %s", ZSTD_getErrorName(flushError));
             }   }   }
 
             /* final frame epilogue */
@@ -949,12 +1025,12 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
                     outBuff.size = outBuff.pos + adjustedDstSize;
                     DISPLAYLEVEL(5, "Ending into dst buffer of size %u \n", (U32)adjustedDstSize);
                     remainingToFlush = ZSTDMT_endStream(zc, &outBuff);
-                    CHECK (ZSTD_isError(remainingToFlush), "flush error : %s", ZSTD_getErrorName(remainingToFlush));
+                    CHECK (ZSTD_isError(remainingToFlush), "ZSTDMT_endStream error : %s", ZSTD_getErrorName(remainingToFlush));
                     DISPLAYLEVEL(5, "endStream : remainingToFlush : %u \n", (U32)remainingToFlush);
             }   }
-            DISPLAYLEVEL(5, "Frame completed \n");
             crcOrig = XXH64_digest(&xxhState);
             cSize = outBuff.pos;
+            DISPLAYLEVEL(5, "Frame completed : %u bytes \n", (U32)cSize);
         }
 
         /* multi - fragments decompression test */
@@ -972,8 +1048,10 @@ static int fuzzerTests_MT(U32 seed, U32 nbTests, unsigned startTest, double comp
                 size_t const dstBuffSize = MIN(dstBufferSize - totalGenSize, randomDstSize);
                 inBuff.size = inBuff.pos + readCSrcSize;
                 outBuff.size = inBuff.pos + dstBuffSize;
+                DISPLAYLEVEL(5, "ZSTD_decompressStream input %u bytes \n", (U32)readCSrcSize);
                 decompressionResult = ZSTD_decompressStream(zd, &outBuff, &inBuff);
                 CHECK (ZSTD_isError(decompressionResult), "decompression error : %s", ZSTD_getErrorName(decompressionResult));
+                DISPLAYLEVEL(5, "inBuff.pos = %u \n", (U32)readCSrcSize);
             }
             CHECK (outBuff.pos != totalTestSize, "decompressed data : wrong size (%u != %u)", (U32)outBuff.pos, (U32)totalTestSize);
             CHECK (inBuff.pos != cSize, "compressed data should be fully read (%u != %u)", (U32)inBuff.pos, (U32)cSize);
@@ -1063,6 +1141,7 @@ int main(int argc, const char** argv)
     int result=0;
     int mainPause = 0;
     int mtOnly = 0;
+    int bigTests = 1;
     const char* const programName = argv[0];
     ZSTD_customMem const customMem = { allocFunction, freeFunction, NULL };
     ZSTD_customMem const customNULL = { NULL, NULL, NULL };
@@ -1076,6 +1155,7 @@ int main(int argc, const char** argv)
         if (argument[0]=='-') {
 
             if (!strcmp(argument, "--mt")) { mtOnly=1; continue; }
+            if (!strcmp(argument, "--no-big-tests")) { bigTests=0; continue; }
 
             argument++;
             while (*argument!=0) {
@@ -1181,8 +1261,8 @@ int main(int argc, const char** argv)
             result = basicUnitTests(0, ((double)proba) / 100, customMem);  /* use custom memory allocation functions */
     }   }
 
-    if (!result && !mtOnly) result = fuzzerTests(seed, nbTests, testNb, ((double)proba) / 100);
-    if (!result) result = fuzzerTests_MT(seed, nbTests, testNb, ((double)proba) / 100);
+    if (!result && !mtOnly) result = fuzzerTests(seed, nbTests, testNb, ((double)proba) / 100, bigTests);
+    if (!result) result = fuzzerTests_MT(seed, nbTests, testNb, ((double)proba) / 100, bigTests);
 
     if (mainPause) {
         int unused;
diff --git a/zlibWrapper/examples/zwrapbench.c b/zlibWrapper/examples/zwrapbench.c
index 23c3ca4dab8f..a57ed51ec141 100644
--- a/zlibWrapper/examples/zwrapbench.c
+++ b/zlibWrapper/examples/zwrapbench.c
@@ -73,13 +73,13 @@ static U32 g_compressibilityDefault = 50;
 #define DEFAULT_DISPLAY_LEVEL 2
 #define DISPLAY(...)         fprintf(displayOut, __VA_ARGS__)
 #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
-static U32 g_displayLevel = DEFAULT_DISPLAY_LEVEL;   /* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : + progression;   4 : + information */
+static int g_displayLevel = DEFAULT_DISPLAY_LEVEL;   /* 0 : no display;   1: errors;   2 : + result + interaction + warnings;   3 : + progression;   4 : + information */
 static FILE* displayOut;
 
 #define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
             if ((clock() - g_time > refreshRate) || (g_displayLevel>=4)) \
             { g_time = clock(); DISPLAY(__VA_ARGS__); \
-            if (g_displayLevel>=4) fflush(stdout); } }
+            if (g_displayLevel>=4) fflush(displayOut); } }
 static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;
 static clock_t g_time = 0;
 
@@ -128,6 +128,11 @@ void BMK_SetBlockSize(size_t blockSize)
 /* ********************************************************
 *  Bench functions
 **********************************************************/
+#undef MIN
+#undef MAX
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
 typedef struct
 {
     z_const char* srcPtr;
@@ -142,9 +147,6 @@ typedef struct
 typedef enum { BMK_ZSTD, BMK_ZSTD_STREAM, BMK_ZLIB, BMK_ZWRAP_ZLIB, BMK_ZWRAP_ZSTD, BMK_ZLIB_REUSE, BMK_ZWRAP_ZLIB_REUSE, BMK_ZWRAP_ZSTD_REUSE } BMK_compressor;
 
 
-#define MIN(a,b) ((a)<(b) ? (a) : (b))
-#define MAX(a,b) ((a)>(b) ? (a) : (b))
-
 static int BMK_benchMem(z_const void* srcBuffer, size_t srcSize,
                         const char* displayName, int cLevel,
                         const size_t* fileSizes, U32 nbFiles,
@@ -234,7 +236,7 @@ static int BMK_benchMem(z_const void* srcBuffer, size_t srcSize,
                 if (compressor == BMK_ZSTD) {
                     ZSTD_parameters const zparams = ZSTD_getParams(cLevel, avgSize, dictBufferSize);
                     ZSTD_customMem const cmem = { NULL, NULL, NULL };
-                    ZSTD_CDict* cdict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, 1, zparams, cmem);
+                    ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, 1, zparams.cParams, cmem);
                     if (cdict==NULL) EXM_THROW(1, "ZSTD_createCDict_advanced() allocation failure");
 
                     do {
@@ -982,7 +984,7 @@ int main(int argCount, char** argv)
 
 #ifdef UTIL_HAS_CREATEFILELIST
     if (recursive) {
-        fileNamesTable = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb);
+        fileNamesTable = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, 1);
         if (fileNamesTable) {
             unsigned u;
             for (u=0; u<fileNamesNb; u++) DISPLAYLEVEL(4, "%u %s\n", u, fileNamesTable[u]);
-- 
cgit v1.3