Diffstat (limited to 'system/easy-kernel/0504-update-zstd-to-v1_5_5.patch')
-rw-r--r--  system/easy-kernel/0504-update-zstd-to-v1_5_5.patch  13822
1 file changed, 13822 insertions, 0 deletions
diff --git a/system/easy-kernel/0504-update-zstd-to-v1_5_5.patch b/system/easy-kernel/0504-update-zstd-to-v1_5_5.patch
new file mode 100644
index 000000000..d0fc9da32
--- /dev/null
+++ b/system/easy-kernel/0504-update-zstd-to-v1_5_5.patch
@@ -0,0 +1,13822 @@
+From: Piotr Gorski <lucjan.lucjanov@gmail.com>
+Date: Fri, 8 Dec 2023 16:36:05 +0100
+Subject: [PATCH 1/2] zstd-6.6: merge v1.5.5 into kernel tree
+
+Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com>
+---
+ include/linux/zstd.h | 2 +-
+ include/linux/zstd_errors.h | 23 +-
+ include/linux/zstd_lib.h | 697 +++++--
+ lib/zstd/Makefile | 2 +-
+ lib/zstd/common/allocations.h | 56 +
+ lib/zstd/common/bits.h | 149 ++
+ lib/zstd/common/bitstream.h | 53 +-
+ lib/zstd/common/compiler.h | 14 +-
+ lib/zstd/common/cpu.h | 3 +-
+ lib/zstd/common/debug.c | 3 +-
+ lib/zstd/common/debug.h | 3 +-
+ lib/zstd/common/entropy_common.c | 42 +-
+ lib/zstd/common/error_private.c | 12 +-
+ lib/zstd/common/error_private.h | 3 +-
+ lib/zstd/common/fse.h | 89 +-
+ lib/zstd/common/fse_decompress.c | 94 +-
+ lib/zstd/common/huf.h | 222 +--
+ lib/zstd/common/mem.h | 2 +-
+ lib/zstd/common/portability_macros.h | 26 +-
+ lib/zstd/common/zstd_common.c | 38 +-
+ lib/zstd/common/zstd_deps.h | 16 +-
+ lib/zstd/common/zstd_internal.h | 99 +-
+ lib/zstd/compress/clevels.h | 3 +-
+ lib/zstd/compress/fse_compress.c | 59 +-
+ lib/zstd/compress/hist.c | 3 +-
+ lib/zstd/compress/hist.h | 3 +-
+ lib/zstd/compress/huf_compress.c | 372 ++--
+ lib/zstd/compress/zstd_compress.c | 1762 ++++++++++++-----
+ lib/zstd/compress/zstd_compress_internal.h | 333 +++-
+ lib/zstd/compress/zstd_compress_literals.c | 155 +-
+ lib/zstd/compress/zstd_compress_literals.h | 25 +-
+ lib/zstd/compress/zstd_compress_sequences.c | 7 +-
+ lib/zstd/compress/zstd_compress_sequences.h | 3 +-
+ lib/zstd/compress/zstd_compress_superblock.c | 47 +-
+ lib/zstd/compress/zstd_compress_superblock.h | 3 +-
+ lib/zstd/compress/zstd_cwksp.h | 149 +-
+ lib/zstd/compress/zstd_double_fast.c | 129 +-
+ lib/zstd/compress/zstd_double_fast.h | 6 +-
+ lib/zstd/compress/zstd_fast.c | 582 ++++--
+ lib/zstd/compress/zstd_fast.h | 6 +-
+ lib/zstd/compress/zstd_lazy.c | 518 ++---
+ lib/zstd/compress/zstd_lazy.h | 7 +-
+ lib/zstd/compress/zstd_ldm.c | 11 +-
+ lib/zstd/compress/zstd_ldm.h | 3 +-
+ lib/zstd/compress/zstd_ldm_geartab.h | 3 +-
+ lib/zstd/compress/zstd_opt.c | 187 +-
+ lib/zstd/compress/zstd_opt.h | 3 +-
+ lib/zstd/decompress/huf_decompress.c | 770 ++++---
+ lib/zstd/decompress/zstd_ddict.c | 9 +-
+ lib/zstd/decompress/zstd_ddict.h | 3 +-
+ lib/zstd/decompress/zstd_decompress.c | 261 ++-
+ lib/zstd/decompress/zstd_decompress_block.c | 283 ++-
+ lib/zstd/decompress/zstd_decompress_block.h | 8 +-
+ .../decompress/zstd_decompress_internal.h | 7 +-
+ lib/zstd/decompress_sources.h | 2 +-
+ lib/zstd/zstd_common_module.c | 5 +-
+ lib/zstd/zstd_compress_module.c | 2 +-
+ lib/zstd/zstd_decompress_module.c | 4 +-
+ 58 files changed, 4787 insertions(+), 2594 deletions(-)
+ create mode 100644 lib/zstd/common/allocations.h
+ create mode 100644 lib/zstd/common/bits.h
+
+diff --git a/include/linux/zstd.h b/include/linux/zstd.h
+index 113408eef..f109d49f4 100644
+--- a/include/linux/zstd.h
++++ b/include/linux/zstd.h
+@@ -1,6 +1,6 @@
+ /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h
+index 58b6dd45a..6d5cf55f0 100644
+--- a/include/linux/zstd_errors.h
++++ b/include/linux/zstd_errors.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -17,8 +18,17 @@
+
+
+ /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */
+-#define ZSTDERRORLIB_VISIBILITY
+-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
++#define ZSTDERRORLIB_VISIBLE
++
++#ifndef ZSTDERRORLIB_HIDDEN
++# if (__GNUC__ >= 4) && !defined(__MINGW32__)
++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden")))
++# else
++# define ZSTDERRORLIB_HIDDEN
++# endif
++#endif
++
++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE
+
+ /*-*********************************************
+ * Error codes list
+@@ -43,14 +53,17 @@ typedef enum {
+ ZSTD_error_frameParameter_windowTooLarge = 16,
+ ZSTD_error_corruption_detected = 20,
+ ZSTD_error_checksum_wrong = 22,
++ ZSTD_error_literals_headerWrong = 24,
+ ZSTD_error_dictionary_corrupted = 30,
+ ZSTD_error_dictionary_wrong = 32,
+ ZSTD_error_dictionaryCreation_failed = 34,
+ ZSTD_error_parameter_unsupported = 40,
++ ZSTD_error_parameter_combination_unsupported = 41,
+ ZSTD_error_parameter_outOfBound = 42,
+ ZSTD_error_tableLog_tooLarge = 44,
+ ZSTD_error_maxSymbolValue_tooLarge = 46,
+ ZSTD_error_maxSymbolValue_tooSmall = 48,
++ ZSTD_error_stabilityCondition_notRespected = 50,
+ ZSTD_error_stage_wrong = 60,
+ ZSTD_error_init_missing = 62,
+ ZSTD_error_memory_allocation = 64,
+@@ -58,11 +71,15 @@ typedef enum {
+ ZSTD_error_dstSize_tooSmall = 70,
+ ZSTD_error_srcSize_wrong = 72,
+ ZSTD_error_dstBuffer_null = 74,
++ ZSTD_error_noForwardProgress_destFull = 80,
++ ZSTD_error_noForwardProgress_inputEmpty = 82,
+ /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+ ZSTD_error_frameIndex_tooLarge = 100,
+ ZSTD_error_seekableIO = 102,
+ ZSTD_error_dstBuffer_wrong = 104,
+ ZSTD_error_srcBuffer_wrong = 105,
++ ZSTD_error_sequenceProducer_failed = 106,
++ ZSTD_error_externalSequences_invalid = 107,
+ ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+ } ZSTD_ErrorCode;
+
+diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h
+index 79d55465d..8b4ffe649 100644
+--- a/include/linux/zstd_lib.h
++++ b/include/linux/zstd_lib.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -11,23 +12,42 @@
+ #ifndef ZSTD_H_235446
+ #define ZSTD_H_235446
+
+-/* ====== Dependency ======*/
++/* ====== Dependencies ======*/
+ #include <linux/limits.h> /* INT_MAX */
+ #include <linux/types.h> /* size_t */
+
+
+ /* ===== ZSTDLIB_API : control library symbols visibility ===== */
+-#ifndef ZSTDLIB_VISIBLE
++#define ZSTDLIB_VISIBLE
++
++#ifndef ZSTDLIB_HIDDEN
+ # if (__GNUC__ >= 4) && !defined(__MINGW32__)
+-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default")))
+ # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden")))
+ # else
+-# define ZSTDLIB_VISIBLE
+ # define ZSTDLIB_HIDDEN
+ # endif
+ #endif
++
+ #define ZSTDLIB_API ZSTDLIB_VISIBLE
+
++/* Deprecation warnings :
++ * Should these warnings be a problem, it is generally possible to disable them,
++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual.
++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS.
++ */
++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS
++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */
++#else
++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__)
++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message)))
++# elif (__GNUC__ >= 3)
++# define ZSTD_DEPRECATED(message) __attribute__((deprecated))
++# else
++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
++# define ZSTD_DEPRECATED(message)
++# endif
++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */
++
+
+ /* *****************************************************************************
+ Introduction
+@@ -65,7 +85,7 @@
+ /*------ Version ------*/
+ #define ZSTD_VERSION_MAJOR 1
+ #define ZSTD_VERSION_MINOR 5
+-#define ZSTD_VERSION_RELEASE 2
++#define ZSTD_VERSION_RELEASE 5
+ #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+
+ /*! ZSTD_versionNumber() :
+@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void);
+ ***************************************/
+ /*! ZSTD_compress() :
+ * Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
+- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have
++ * enough space to successfully compress the data.
+ * @return : compressed size written into `dst` (<= `dstCapacity),
+ * or an error code if it fails (which can be tested using ZSTD_isError()). */
+ ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
+@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t
+ * "empty", "unknown" and "error" results to the same return value (0),
+ * while ZSTD_getFrameContentSize() gives them separate return values.
+ * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */
+-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize")
++ZSTDLIB_API
++unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+
+ /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+
+ * `src` should point to the start of a ZSTD frame or skippable frame.
+@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize)
+
+
+ /*====== Helper functions ======*/
+-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
++/* ZSTD_compressBound() :
++ * maximum compressed size in worst case single-pass scenario.
++ * When invoking `ZSTD_compress()` or any other one-pass compression function,
++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize)
++ * as it eliminates one potential failure scenario,
++ * aka not enough room in dst buffer to write the compressed frame.
++ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE .
++ * In which case, ZSTD_compressBound() will return an error code
++ * which can be tested using ZSTD_isError().
++ *
++ * ZSTD_COMPRESSBOUND() :
++ * same as ZSTD_compressBound(), but as a macro.
++ * It can be used to produce constants, which can be useful for static allocation,
++ * for example to size a static array on stack.
++ * Will produce constant value 0 if srcSize too large.
++ */
++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U)
++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
++/* ZSTD_isError() :
++ * Most ZSTD_* functions returning a size_t value can be tested for error,
++ * using ZSTD_isError().
++ * @return 1 if error, 0 otherwise
++ */
+ ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
+ ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */
+ ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */
+@@ -412,6 +457,9 @@ typedef enum {
+ * ZSTD_c_validateSequences
+ * ZSTD_c_useBlockSplitter
+ * ZSTD_c_useRowMatchFinder
++ * ZSTD_c_prefetchCDictTables
++ * ZSTD_c_enableSeqProducerFallback
++ * ZSTD_c_maxBlockSize
+ * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+ * note : never ever use experimentalParam? names directly;
+ * also, the enums values themselves are unstable and can still change.
+@@ -430,7 +478,11 @@ typedef enum {
+ ZSTD_c_experimentalParam12=1009,
+ ZSTD_c_experimentalParam13=1010,
+ ZSTD_c_experimentalParam14=1011,
+- ZSTD_c_experimentalParam15=1012
++ ZSTD_c_experimentalParam15=1012,
++ ZSTD_c_experimentalParam16=1013,
++ ZSTD_c_experimentalParam17=1014,
++ ZSTD_c_experimentalParam18=1015,
++ ZSTD_c_experimentalParam19=1016
+ } ZSTD_cParameter;
+
+ typedef struct {
+@@ -493,7 +545,7 @@ typedef enum {
+ * They will be used to compress next frame.
+ * Resetting session never fails.
+ * - The parameters : changes all parameters back to "default".
+- * This removes any reference to any dictionary too.
++ * This also removes any reference to any dictionary or external sequence producer.
+ * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing)
+ * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError())
+ * - Both : similar to resetting the session, followed by resetting parameters.
+@@ -506,7 +558,8 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
+ * Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ * - The function is always blocking, returns when compression is completed.
+- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have
++ * enough space to successfully compress the data, though it is possible it fails for other reasons.
+ * @return : compressed size written into `dst` (<= `dstCapacity),
+ * or an error code if it fails (which can be tested using ZSTD_isError()).
+ */
+@@ -543,13 +596,15 @@ typedef enum {
+ * ZSTD_d_stableOutBuffer
+ * ZSTD_d_forceIgnoreChecksum
+ * ZSTD_d_refMultipleDDicts
++ * ZSTD_d_disableHuffmanAssembly
+ * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+ * note : never ever use experimentalParam? names directly
+ */
+ ZSTD_d_experimentalParam1=1000,
+ ZSTD_d_experimentalParam2=1001,
+ ZSTD_d_experimentalParam3=1002,
+- ZSTD_d_experimentalParam4=1003
++ ZSTD_d_experimentalParam4=1003,
++ ZSTD_d_experimentalParam5=1004
+
+ } ZSTD_dParameter;
+
+@@ -728,8 +783,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output
+ * This following is a legacy streaming API, available since v1.0+ .
+ * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
+ * It is redundant, but remains fully supported.
+- * Streaming in combination with advanced parameters and dictionary compression
+- * can only be used through the new API.
+ ******************************************************************************/
+
+ /*!
+@@ -738,6 +791,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
++ *
++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API
++ * to compress with a dictionary.
+ */
+ ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+ /*!
+@@ -788,13 +844,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer
+
+ /*===== Streaming decompression functions =====*/
+
+-/* This function is redundant with the advanced API and equivalent to:
++/*! ZSTD_initDStream() :
++ * Initialize/reset DStream state for new decompression operation.
++ * Call before new decompression operation using same DStream.
+ *
++ * Note : This function is redundant with the advanced API and equivalent to:
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_refDDict(zds, NULL);
+ */
+ ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+
++/*! ZSTD_decompressStream() :
++ * Streaming decompression function.
++ * Call repetitively to consume full input updating it as necessary.
++ * Function will update both input and output `pos` fields exposing current state via these fields:
++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input
++ * on the next call.
++ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers.
++ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers,
++ * call ZSTD_decompressStream() again to flush remaining data to output.
++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX.
++ *
++ * @return : 0 when a frame is completely decoded and fully flushed,
++ * or an error code, which can be tested using ZSTD_isError(),
++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame.
++ */
+ ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */
+@@ -913,7 +987,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+ * If @return == 0, the dictID could not be decoded.
+ * This could for one of the following reasons :
+ * - The frame does not require a dictionary to be decoded (most common case).
+- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information.
++ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information.
+ * Note : this use case also happens when using a non-conformant dictionary.
+ * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ * - This is not a Zstandard frame.
+@@ -925,9 +999,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+ * Advanced dictionary and prefix API (Requires v1.4.0+)
+ *
+ * This API allows dictionaries to be used with ZSTD_compress2(),
+- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and
+- * only reset with the context is reset with ZSTD_reset_parameters or
+- * ZSTD_reset_session_and_parameters. Prefixes are single-use.
++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx().
++ * Dictionaries are sticky, they remain valid when same context is re-used,
++ * they only reset when the context is reset
++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters.
++ * In contrast, Prefixes are single-use.
+ ******************************************************************************/
+
+
+@@ -937,8 +1013,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
+ * meaning "return to no-dictionary mode".
+- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
+- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames,
++ * until parameters are reset, a new dictionary is loaded, or the dictionary
++ * is explicitly invalidated by loading a NULL dictionary.
+ * Note 2 : Loading a dictionary involves building tables.
+ * It's also a CPU consuming operation, with non-negligible impact on latency.
+ * Tables are dependent on compression parameters, and for this reason,
+@@ -947,11 +1024,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+ * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
+ * In such a case, dictionary buffer must outlive its users.
+ * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+- * to precisely select how dictionary content must be interpreted. */
++ * to precisely select how dictionary content must be interpreted.
++ * Note 5 : This method does not benefit from LDM (long distance mode).
++ * If you want to employ LDM on some large dictionary content,
++ * prefer employing ZSTD_CCtx_refPrefix() described below.
++ */
+ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
+ /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+
+- * Reference a prepared dictionary, to be used for all next compressed frames.
++ * Reference a prepared dictionary, to be used for all future compressed frames.
+ * Note that compression parameters are enforced from within CDict,
+ * and supersede any compression parameter previously set within CCtx.
+ * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
+@@ -970,6 +1051,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+ * Decompression will need same prefix to properly regenerate data.
+ * Compressing with a prefix is similar in outcome as performing a diff and compressing it,
+ * but performs much faster, especially during decompression (compression speed is tunable with compression level).
++ * This method is compatible with LDM (long distance mode).
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
+ * Note 1 : Prefix buffer is referenced. It **must** outlive compression.
+@@ -986,9 +1068,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
+ const void* prefix, size_t prefixSize);
+
+ /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+
+- * Create an internal DDict from dict buffer,
+- * to be used to decompress next frames.
+- * The dictionary remains valid for all future frames, until explicitly invalidated.
++ * Create an internal DDict from dict buffer, to be used to decompress all future frames.
++ * The dictionary remains valid for all future frames, until explicitly invalidated, or
++ * a new dictionary is loaded.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
+ * meaning "return to no-dictionary mode".
+@@ -1012,9 +1094,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s
+ * The memory for the table is allocated on the first call to refDDict, and can be
+ * freed with ZSTD_freeDCtx().
+ *
++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary
++ * will be managed, and referencing a dictionary effectively "discards" any previous one.
++ *
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+- * Note 1 : Currently, only one dictionary can be managed.
+- * Referencing a new dictionary effectively "discards" any previous one.
+ * Special: referencing a NULL DDict means "return to no-dictionary mode".
+ * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
+ */
+@@ -1071,24 +1154,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+ #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE
+ #endif
+
+-/* Deprecation warnings :
+- * Should these warnings be a problem, it is generally possible to disable them,
+- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual.
+- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS.
+- */
+-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS
+-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */
+-#else
+-# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__)
+-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message)))
+-# elif (__GNUC__ >= 3)
+-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated))
+-# else
+-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
+-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API
+-# endif
+-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */
+-
+ /* **************************************************************************************
+ * experimental API (static linking only)
+ ****************************************************************************************
+@@ -1123,6 +1188,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+ #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */
+ #define ZSTD_STRATEGY_MIN ZSTD_fast
+ #define ZSTD_STRATEGY_MAX ZSTD_btultra2
++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */
+
+
+ #define ZSTD_OVERLAPLOG_MIN 0
+@@ -1303,7 +1369,7 @@ typedef enum {
+ } ZSTD_paramSwitch_e;
+
+ /* *************************************
+-* Frame size functions
++* Frame header and size functions
+ ***************************************/
+
+ /*! ZSTD_findDecompressedSize() :
+@@ -1350,29 +1416,109 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size
+ * or an error code (if srcSize is too small) */
+ ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
++typedef struct {
++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */
++ unsigned blockSizeMax;
++ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
++ unsigned headerSize;
++ unsigned dictID;
++ unsigned checksumFlag;
++ unsigned _reserved1;
++ unsigned _reserved2;
++} ZSTD_frameHeader;
++
++/*! ZSTD_getFrameHeader() :
++ * decode Frame Header, or requires larger `srcSize`.
++ * @return : 0, `zfhPtr` is correctly filled,
++ * >0, `srcSize` is too small, value is wanted `srcSize` amount,
++ * or an error code, which can be tested using ZSTD_isError() */
++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */
++/*! ZSTD_getFrameHeader_advanced() :
++ * same as ZSTD_getFrameHeader(),
++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */
++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
++
++/*! ZSTD_decompressionMargin() :
++ * Zstd supports in-place decompression, where the input and output buffers overlap.
++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large,
++ * and the input buffer must be at the end of the output buffer.
++ *
++ * _______________________ Output Buffer ________________________
++ * | |
++ * | ____ Input Buffer ____|
++ * | | |
++ * v v v
++ * |---------------------------------------|-----------|----------|
++ * ^ ^ ^
++ * |___________________ Output_Size ___________________|_ Margin _|
++ *
++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN().
++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or
++ * ZSTD_decompressDCtx().
++ * NOTE: This function supports multi-frame input.
++ *
++ * @param src The compressed frame(s)
++ * @param srcSize The size of the compressed frame(s)
++ * @returns The decompression margin or an error that can be checked with ZSTD_isError().
++ */
++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize);
++
++/*! ZSTD_DECOMPRESS_MARGIN() :
++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from
++ * the compressed frame, compute it from the original size and the blockSizeLog.
++ * See ZSTD_decompressionMargin() for details.
++ *
++ * WARNING: This macro does not support multi-frame input, the input must be a single
++ * zstd frame. If you need that support use the function, or implement it yourself.
++ *
++ * @param originalSize The original uncompressed size of the data.
++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX).
++ * Unless you explicitly set the windowLog smaller than
++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX.
++ */
++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \
++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \
++ 4 /* checksum */ + \
++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \
++ (blockSize) /* One block of margin */ \
++ ))
++
+ typedef enum {
+ ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
+ ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */
+ } ZSTD_sequenceFormat_e;
+
++/*! ZSTD_sequenceBound() :
++ * `srcSize` : size of the input buffer
++ * @return : upper-bound for the number of sequences that can be generated
++ * from a buffer of srcSize bytes
++ *
++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence).
++ */
++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize);
++
+ /*! ZSTD_generateSequences() :
+- * Generate sequences using ZSTD_compress2, given a source buffer.
++ * Generate sequences using ZSTD_compress2(), given a source buffer.
+ *
+ * Each block will end with a dummy sequence
+ * with offset == 0, matchLength == 0, and litLength == length of last literals.
+ * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
+ * simply acts as a block delimiter.
+ *
+- * zc can be used to insert custom compression params.
+- * This function invokes ZSTD_compress2
++ * @zc can be used to insert custom compression params.
++ * This function invokes ZSTD_compress2().
+ *
+ * The output of this function can be fed into ZSTD_compressSequences() with CCtx
+ * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
+ * @return : number of sequences generated
+ */
+
+-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+- size_t outSeqsSize, const void* src, size_t srcSize);
++ZSTDLIB_STATIC_API size_t
++ZSTD_generateSequences( ZSTD_CCtx* zc,
++ ZSTD_Sequence* outSeqs, size_t outSeqsSize,
++ const void* src, size_t srcSize);
+
+ /*! ZSTD_mergeBlockDelimiters() :
+ * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
+@@ -1388,7 +1534,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o
+ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize);
+
+ /*! ZSTD_compressSequences() :
+- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst.
++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst.
++ * @src contains the entire input (not just the literals).
++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals
+ * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.)
+ * The entire source is compressed into a single frame.
+ *
+@@ -1413,11 +1561,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si
+ * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused.
+ * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly,
+ * and cannot emit an RLE block that disagrees with the repcode history
+- * @return : final compressed size or a ZSTD error.
++ * @return : final compressed size, or a ZSTD error code.
+ */
+-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize,
+- const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
+- const void* src, size_t srcSize);
++ZSTDLIB_STATIC_API size_t
++ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize,
++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
++ const void* src, size_t srcSize);
+
+
+ /*! ZSTD_writeSkippableFrame() :
+@@ -1481,8 +1630,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size);
+ * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
+ * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
+ *
+- * Note 2 : only single-threaded compression is supported.
++ * Note : only single-threaded compression is supported.
+ * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
++ *
++ * Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
++ * Size estimates assume that no external sequence producer is registered.
+ */
+ ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
+@@ -1501,7 +1653,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void);
+ * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+ * Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
+ * an internal ?Dict will be created, which additional size is not estimated here.
+- * In this case, get total size by adding ZSTD_estimate?DictSize */
++ * In this case, get total size by adding ZSTD_estimate?DictSize
++ * Note 2 : only single-threaded compression is supported.
++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
++ * Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
++ * Size estimates assume that no external sequence producer is registered.
++ */
+ ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
+ ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+@@ -1649,22 +1806,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
+ * This function never fails (wide contract) */
+ ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
+
++/*! ZSTD_CCtx_setCParams() :
++ * Set all parameters provided within @p cparams into the working @p cctx.
++ * Note : if modifying parameters during compression (MT mode only),
++ * note that changes to the .windowLog parameter will be ignored.
++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
++ * On failure, no parameters are updated.
++ */
++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams);
++
++/*! ZSTD_CCtx_setFParams() :
++ * Set all parameters provided within @p fparams into the working @p cctx.
++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
++ */
++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams);
++
++/*! ZSTD_CCtx_setParams() :
++ * Set all parameters provided within @p params into the working @p cctx.
++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
++ */
++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params);
++
+ /*! ZSTD_compress_advanced() :
+ * Note : this function is now DEPRECATED.
+ * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters.
+ * This prototype will generate compilation warnings. */
+ ZSTD_DEPRECATED("use ZSTD_compress2")
++ZSTDLIB_STATIC_API
+ size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
+- void* dst, size_t dstCapacity,
+- const void* src, size_t srcSize,
+- const void* dict,size_t dictSize,
+- ZSTD_parameters params);
++ void* dst, size_t dstCapacity,
++ const void* src, size_t srcSize,
++ const void* dict,size_t dictSize,
++ ZSTD_parameters params);
+
+ /*! ZSTD_compress_usingCDict_advanced() :
+ * Note : this function is now DEPRECATED.
+ * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters.
+ * This prototype will generate compilation warnings. */
+ ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary")
++ZSTDLIB_STATIC_API
+ size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+@@ -1808,13 +1988,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same
+- * between calls, except for the modifications that zstd makes to pos (the
+- * caller must not modify pos). This is checked by the compressor, and
+- * compression will fail if it ever changes. This means the only flush
+- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end
+- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos)
+- * MUST not be modified during compression or you will get data corruption.
++ * Tells the compressor that input data presented with ZSTD_inBuffer
++ * will ALWAYS be the same between calls.
++ * Technically, the @src pointer must never be changed,
++ * and the @pos field can only be updated by zstd.
++ * However, it's possible to increase the @size field,
++ * allowing scenarios where more data can be appended after compressions starts.
++ * These conditions are checked by the compressor,
++ * and compression will fail if they are not respected.
++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos)
++ * MUST not be modified during compression or it will result in data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an input window buffer,
+ * because the user guarantees it can reference the ZSTD_inBuffer until
+@@ -1822,18 +2005,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
+ * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also
+ * avoid the memcpy() from the input buffer to the input window buffer.
+ *
+- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used.
+- * That means this flag cannot be used with ZSTD_compressStream().
+- *
+ * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+- * memory. However, compression WILL fail if you violate the preconditions.
++ * memory. However, compression WILL fail if conditions are not respected.
+ *
+- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST
+- * not be modified during compression or you will get data corruption. This
+- * is because zstd needs to reference data in the ZSTD_inBuffer to find
++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST
++ * not be modified during compression or it will result in data corruption.
++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find
+ * matches. Normally zstd maintains its own window buffer for this purpose,
+- * but passing this flag tells zstd to use the user provided buffer.
++ * but passing this flag tells zstd to rely on user provided buffer instead.
+ */
+ #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9
+
+@@ -1878,7 +2058,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
+ * Without validation, providing a sequence that does not conform to the zstd spec will cause
+ * undefined behavior, and may produce a corrupted block.
+ *
+- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for
++ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for
+ * specifics regarding offset/matchlength requirements) then the function will bail out and
+ * return an error.
+ *
+@@ -1928,6 +2108,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
+ */
+ #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15
+
++/* ZSTD_c_prefetchCDictTables
++ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto.
++ *
++ * In some situations, zstd uses CDict tables in-place rather than copying them
++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details).
++ * In such situations, compression speed is seriously impacted when CDict tables are
++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables
++ * when they are used in-place.
++ *
++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit.
++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables
++ * into the working context, so there is no need to prefetch. This parameter is
++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be
++ * useful but memcpy() is too expensive. The exact range of input sizes where this
++ * makes sense is best determined by careful experimentation.
++ *
++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable,
++ * but in the future zstd may conditionally enable this feature via an auto-detection
++ * heuristic for cold CDicts.
++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances.
++ */
++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16
++
++/* ZSTD_c_enableSeqProducerFallback
++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0.
++ *
++ * Controls whether zstd will fall back to an internal sequence producer if an
++ * external sequence producer is registered and returns an error code. This fallback
++ * is block-by-block: the internal sequence producer will only be called for blocks
++ * where the external sequence producer returns an error code. Fallback parsing will
++ * follow any other cParam settings, such as compression level, the same as in a
++ * normal (fully-internal) compression operation.
++ *
++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API
++ * documentation (below) before setting this parameter. */
++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17
++
++/* ZSTD_c_maxBlockSize
++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB).
++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default.
++ *
++ * This parameter can be used to set an upper bound on the blocksize
++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper
++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make
++ * compressBound() inaccurate). Only currently meant to be used for testing.
++ *
++ */
++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18
++
++/* ZSTD_c_searchForExternalRepcodes
++ * This parameter affects how zstd parses external sequences, such as sequences
++ * provided through the compressSequences() API or from an external block-level
++ * sequence producer.
++ *
++ * If set to ZSTD_ps_enable, the library will check for repeated offsets in
++ * external sequences, even if those repcodes are not explicitly indicated in
++ * the "rep" field. Note that this is the only way to exploit repcode matches
++ * while using compressSequences() or an external sequence producer, since zstd
++ * currently ignores the "rep" field of external sequences.
++ *
++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in
++ * external sequences, regardless of whether the "rep" field has been set. This
++ * reduces sequence compression overhead by about 25% while sacrificing some
++ * compression ratio.
++ *
++ * The default value is ZSTD_ps_auto, for which the library will enable/disable
++ * based on compression level.
++ *
++ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is
++ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future.
++ */
++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19
++
+ /*! ZSTD_CCtx_getParameter() :
+ * Get the requested compression parameter value, selected by enum ZSTD_cParameter,
+ * and store it into int* value.
+@@ -2084,7 +2337,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
+ * in the range [dst, dst + pos) MUST not be modified during decompression
+ * or you will get data corruption.
+ *
+- * When this flags is enabled zstd won't allocate an output buffer, because
++ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer, but it will still allocate
+ * an input buffer large enough to fit any compressed block. This will also
+ * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
+@@ -2137,6 +2390,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
+ */
+ #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4
+
++/* ZSTD_d_disableHuffmanAssembly
++ * Set to 1 to disable the Huffman assembly implementation.
++ * The default value is 0, which allows zstd to use the Huffman assembly
++ * implementation if available.
++ *
++ * This parameter can be used to disable Huffman assembly at runtime.
++ * If you want to disable it at compile time you can define the macro
++ * ZSTD_DISABLE_ASM.
++ */
++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5
++
+
+ /*! ZSTD_DCtx_setFormat() :
+ * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter().
+@@ -2145,6 +2409,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
+ * such ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
+ ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead")
++ZSTDLIB_STATIC_API
+ size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
+
+ /*! ZSTD_decompressStream_simpleArgs() :
+@@ -2181,6 +2446,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs (
+ * This prototype will generate compilation warnings.
+ */
+ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
++ZSTDLIB_STATIC_API
+ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
+ int compressionLevel,
+ unsigned long long pledgedSrcSize);
+@@ -2198,17 +2464,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
+ * This prototype will generate compilation warnings.
+ */
+ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
++ZSTDLIB_STATIC_API
+ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ int compressionLevel);
+
+ /*! ZSTD_initCStream_advanced() :
+- * This function is DEPRECATED, and is approximately equivalent to:
++ * This function is DEPRECATED, and is equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+- * // Pseudocode: Set each zstd parameter and leave the rest as-is.
+- * for ((param, value) : params) {
+- * ZSTD_CCtx_setParameter(zcs, param, value);
+- * }
++ * ZSTD_CCtx_setParams(zcs, params);
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+@@ -2218,6 +2482,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
+ * This prototype will generate compilation warnings.
+ */
+ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
++ZSTDLIB_STATIC_API
+ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ ZSTD_parameters params,
+@@ -2232,15 +2497,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+ * This prototype will generate compilation warnings.
+ */
+ ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
++ZSTDLIB_STATIC_API
+ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
+
+ /*! ZSTD_initCStream_usingCDict_advanced() :
+- * This function is DEPRECATED, and is approximately equivalent to:
++ * This function is DEPRECATED, and is equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
+- * for ((fParam, value) : fParams) {
+- * ZSTD_CCtx_setParameter(zcs, fParam, value);
+- * }
++ * ZSTD_CCtx_setFParams(zcs, fParams);
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ * ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+@@ -2250,6 +2513,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
+ * This prototype will generate compilation warnings.
+ */
+ ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
++ZSTDLIB_STATIC_API
+ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+ const ZSTD_CDict* cdict,
+ ZSTD_frameParameters fParams,
+@@ -2274,6 +2538,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+ * This prototype will generate compilation warnings.
+ */
+ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
++ZSTDLIB_STATIC_API
+ size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
+
+
+@@ -2319,8 +2584,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
+ * ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
+ *
+ * note: no dictionary will be used if dict == NULL or dictSize < 8
+- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions")
+ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);
+
+ /*!
+@@ -2330,8 +2595,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo
+ * ZSTD_DCtx_refDDict(zds, ddict);
+ *
+ * note : ddict is referenced, it must outlive decompression session
+- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions")
+ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);
+
+ /*!
+@@ -2340,17 +2605,185 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *
+ * re-use decompression parameters from previous init; saves dictionary loading
+- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
++ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions")
+ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+
+
++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API *********************
++ *
++ * *** OVERVIEW ***
++ * The Block-Level Sequence Producer API allows users to provide their own custom
++ * sequence producer which libzstd invokes to process each block. The produced list
++ * of sequences (literals and matches) is then post-processed by libzstd to produce
++ * valid compressed blocks.
++ *
++ * This block-level offload API is a more granular complement of the existing
++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers
++ * an easier migration story for applications already integrated with libzstd: the
++ * user application continues to invoke the same compression functions
++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits
++ * from the specific advantages of the external sequence producer. For example,
++ * the sequence producer could be tuned to take advantage of known characteristics
++ * of the input, to offer better speed / ratio, or could leverage hardware
++ * acceleration not available within libzstd itself.
++ *
++ * See contrib/externalSequenceProducer for an example program employing the
++ * Block-Level Sequence Producer API.
++ *
++ * *** USAGE ***
++ * The user is responsible for implementing a function of type
++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following
++ * arguments to the user-provided function:
++ *
++ * - sequenceProducerState: a pointer to a user-managed state for the sequence
++ * producer.
++ *
++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer.
++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory
++ * backing outSeqs is managed by the CCtx.
++ *
++ * - src, srcSize: an input buffer for the sequence producer to parse.
++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX.
++ *
++ * - dict, dictSize: a history buffer, which may be empty, which the sequence
++ * producer may reference as it parses the src buffer. Currently, zstd will
++ * always pass dictSize == 0 into external sequence producers, but this will
++ * change in the future.
++ *
++ * - compressionLevel: a signed integer representing the zstd compression level
++ * set by the user for the current operation. The sequence producer may choose
++ * to use this information to change its compression strategy and speed/ratio
++ * tradeoff. Note: the compression level does not reflect zstd parameters set
++ * through the advanced API.
++ *
++ * - windowSize: a size_t representing the maximum allowed offset for external
++ * sequences. Note that sequence offsets are sometimes allowed to exceed the
++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md
++ * for details.
++ *
++ * The user-provided function shall return a size_t representing the number of
++ * sequences written to outSeqs. This return value will be treated as an error
++ * code if it is greater than outSeqsCapacity. The return value must be non-zero
++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided
++ * for convenience, but any value greater than outSeqsCapacity will be treated as
++ * an error code.
++ *
++ * If the user-provided function does not return an error code, the sequences
++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may
++ * occur if the parse is not valid. A parse is defined to be valid if the
++ * following conditions hold:
++ * - The sum of matchLengths and literalLengths must equal srcSize.
++ * - All sequences in the parse, except for the final sequence, must have
++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have
++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0.
++ * - All offsets must respect the windowSize parameter as specified in
++ * doc/zstd_compression_format.md.
++ * - If the final sequence has matchLength == 0, it must also have offset == 0.
++ *
++ * zstd will only validate these conditions (and fail compression if they do not
++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence
++ * validation has a performance cost.
++ *
++ * If the user-provided function returns an error, zstd will either fall back
++ * to an internal sequence producer or fail the compression operation. The user can
++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback
++ * cParam. Fallback compression will follow any other cParam settings, such as
++ * compression level, the same as in a normal compression operation.
++ *
++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F
++ * function by calling
++ * ZSTD_registerSequenceProducer(cctx,
++ * sequenceProducerState,
++ * sequenceProducer)
++ * This setting will persist until the next parameter reset of the CCtx.
++ *
++ * The sequenceProducerState must be initialized by the user before calling
++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the
++ * sequenceProducerState.
++ *
++ * *** LIMITATIONS ***
++ * This API is compatible with all zstd compression APIs which respect advanced parameters.
++ * However, there are three limitations:
++ *
++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported.
++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level
++ * external sequence producer.
++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some
++ * cases (see its documentation for details). Users must explicitly set
++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external
++ * sequence producer is registered.
++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default
++ *   whenever the window size (i.e. 1 << ZSTD_c_windowLog) is below 128MB, but that's subject to change. Users should
++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence
++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog).
++ *
++ * Second, history buffers are not currently supported. Concretely, zstd will always pass
++ * dictSize == 0 to the external sequence producer (for now). This has two implications:
++ * - Dictionaries are not currently supported. Compression will *not* fail if the user
++ * references a dictionary, but the dictionary won't have any effect.
++ * - Stream history is not currently supported. All advanced compression APIs, including
++ * streaming APIs, work with external sequence producers, but each block is treated as
++ * an independent chunk without history from previous blocks.
++ *
++ * Third, multi-threading within a single compression is not currently supported. In other words,
++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered.
++ * Multi-threading across compressions is fine: simply create one CCtx per thread.
++ *
++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to
++ * overcoming them. It is purely a question of engineering effort.
++ */
++
++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1))
++
++typedef size_t ZSTD_sequenceProducer_F (
++ void* sequenceProducerState,
++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
++ const void* src, size_t srcSize,
++ const void* dict, size_t dictSize,
++ int compressionLevel,
++ size_t windowSize
++);
++
++/*! ZSTD_registerSequenceProducer() :
++ * Instruct zstd to use a block-level external sequence producer function.
++ *
++ * The sequenceProducerState must be initialized by the caller, and the caller is
++ * responsible for managing its lifetime. This parameter is sticky across
++ * compressions. It will remain set until the user explicitly resets compression
++ * parameters.
++ *
++ * Sequence producer registration is considered to be an "advanced parameter",
++ * part of the "advanced API". This means it will only have an effect on compression
++ * APIs which respect advanced parameters, such as compress2() and compressStream2().
++ * Older compression APIs such as compressCCtx(), which predate the introduction of
++ * "advanced parameters", will ignore any external sequence producer setting.
++ *
++ * The sequence producer can be "cleared" by registering a NULL function pointer. This
++ * removes all limitations described above in the "LIMITATIONS" section of the API docs.
++ *
++ * The user is strongly encouraged to read the full API documentation (above) before
++ * calling this function. */
++ZSTDLIB_STATIC_API void
++ZSTD_registerSequenceProducer(
++ ZSTD_CCtx* cctx,
++ void* sequenceProducerState,
++ ZSTD_sequenceProducer_F* sequenceProducer
++);
++
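/*
 * Minimal sketch of the flow documented above (illustration only, not part of
 * the upstream header). It assumes the ZSTD_Sequence layout and the advanced
 * parameters ZSTD_c_enableSeqProducerFallback, ZSTD_c_enableLongDistanceMatching
 * and ZSTD_ps_disable declared elsewhere in this file. The producer emits one
 * literals-only sequence, which is a valid parse under the rules above
 * (litLength == srcSize, matchLength == 0, offset == 0).
 */
static size_t simpleSequenceProducer(void* sequenceProducerState,
                                     ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                                     const void* src, size_t srcSize,
                                     const void* dict, size_t dictSize,
                                     int compressionLevel,
                                     size_t windowSize)
{
    (void)sequenceProducerState; (void)src; (void)dict; (void)dictSize;
    (void)compressionLevel; (void)windowSize;
    if (outSeqsCapacity < 1) return ZSTD_SEQUENCE_PRODUCER_ERROR;
    outSeqs[0].offset = 0;                     /* final sequence: offset must be 0 when matchLength == 0 */
    outSeqs[0].litLength = (unsigned)srcSize;  /* litLengths + matchLengths must sum to srcSize */
    outSeqs[0].matchLength = 0;
    outSeqs[0].rep = 0;
    return 1;  /* number of sequences written; non-zero whenever srcSize is non-zero */
}

/* Registration sketch; `cctx` is a caller-owned ZSTD_CCtx. */
static void registerSimpleProducer(ZSTD_CCtx* cctx)
{
    ZSTD_registerSequenceProducer(cctx, NULL /* sequenceProducerState */, simpleSequenceProducer);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);                /* fall back on producer error */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableLongDistanceMatching, ZSTD_ps_disable); /* LDM is not supported (see above) */
}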
++
+ /* *******************************************************************
+-* Buffer-less and synchronous inner streaming functions
++* Buffer-less and synchronous inner streaming functions (DEPRECATED)
++*
++* This API is deprecated, and will be removed in a future version.
++* It allows streaming (de)compression with user allocated buffers.
++* However, it is hard to use, and not as well tested as the rest of
++* our API.
+ *
+-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory.
+-* But it's also a complex one, with several restrictions, documented below.
+-* Prefer normal streaming API for an easier experience.
++* Please use the normal streaming API instead: ZSTD_compressStream2,
++* and ZSTD_decompressStream.
++* If there is functionality that you need, but it doesn't provide,
++* please open an issue on our GitHub.
+ ********************************************************************* */
+
+ /*
+@@ -2362,7 +2795,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+
+ Start by initializing a context.
+ Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression.
+- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()
+
+ Then, consume your input using ZSTD_compressContinue().
+ There are some important considerations to keep in mind when using this advanced function :
+@@ -2384,18 +2816,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+ */
+
+ /*===== Buffer-less streaming compression functions =====*/
++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
+ ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
+ ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
+ ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */
+-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+
++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.")
++ZSTDLIB_STATIC_API
++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
++
++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
+ ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
+ ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+ /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */
+ ZSTD_DEPRECATED("use advanced API to access custom parameters")
++ZSTDLIB_STATIC_API
+ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
+ ZSTD_DEPRECATED("use advanced API to access custom parameters")
++ZSTDLIB_STATIC_API
+ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
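/*
 * Condensed sketch of the (deprecated) buffer-less compression flow described
 * above, for two input chunks (illustration only). It assumes `dst` can hold
 * the whole frame, i.e. at least ZSTD_compressBound(size1) +
 * ZSTD_compressBound(size2) bytes, and that both chunks stay valid and
 * unmodified until compression finishes.
 */
static size_t bufferlessCompressTwoChunks(ZSTD_CCtx* cctx,
                                          void* dst, size_t dstCapacity,
                                          const void* chunk1, size_t size1,
                                          const void* chunk2, size_t size2)
{
    size_t total = 0;
    size_t r = ZSTD_compressBegin(cctx, 3);   /* or a *_usingDict()/_usingCDict() variant */
    if (ZSTD_isError(r)) return r;
    r = ZSTD_compressContinue(cctx, dst, dstCapacity, chunk1, size1);
    if (ZSTD_isError(r)) return r;
    total += r;
    /* last chunk: ZSTD_compressEnd() also writes the frame epilogue */
    r = ZSTD_compressEnd(cctx, (char*)dst + total, dstCapacity - total, chunk2, size2);
    if (ZSTD_isError(r)) return r;
    return total + r;   /* size of the compressed frame */
}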
+ /*
+ Buffer-less streaming decompression (synchronous mode)
+@@ -2408,8 +2850,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
+ Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
+ Data fragment must be large enough to ensure successful decoding.
+ `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
+- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt.
+ errorCode, which can be tested using ZSTD_isError().
+
+ It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
+@@ -2428,7 +2870,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
+
+ The most memory efficient way is to use a round buffer of sufficient size.
+ Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
+- which can @return an error code if required value is too large for current system (in 32-bits mode).
++ which can return an error code if required value is too large for current system (in 32-bits mode).
+ In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
+ up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
+ which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`.
+@@ -2448,7 +2890,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
+ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
+
+- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+ It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
+ It can also be an error code, which can be tested with ZSTD_isError().
+
+@@ -2471,27 +2913,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
+ */
+
+ /*===== Buffer-less streaming decompression functions =====*/
+-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
+-typedef struct {
+- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
+- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */
+- unsigned blockSizeMax;
+- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+- unsigned headerSize;
+- unsigned dictID;
+- unsigned checksumFlag;
+-} ZSTD_frameHeader;
+
+-/*! ZSTD_getFrameHeader() :
+- * decode Frame Header, or requires larger `srcSize`.
+- * @return : 0, `zfhPtr` is correctly filled,
+- * >0, `srcSize` is too small, value is wanted `srcSize` amount,
+- * or an error code, which can be tested using ZSTD_isError() */
+-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */
+-/*! ZSTD_getFrameHeader_advanced() :
+- * same as ZSTD_getFrameHeader(),
+- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */
+-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
+ ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
+
+ ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
+@@ -2502,6 +2924,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+ ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+ /* misc */
++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.")
+ ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
+ typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
+ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
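/*
 * Sketch of the (deprecated) decode loop described above, for the simple case
 * where the complete frame is already in `src` and `dst` is large enough for
 * the whole regenerated content, so no round buffer is needed (illustration
 * only; the ZSTD_getFrameHeader()/ZSTD_decodingBufferSize_min() setup is omitted).
 */
static size_t bufferlessDecompressFrame(ZSTD_DCtx* dctx,
                                        void* dst, size_t dstCapacity,
                                        const void* src, size_t srcSize)
{
    const char* ip = (const char*)src;
    const char* const iend = ip + srcSize;
    char* op = (char*)dst;
    char* const oend = op + dstCapacity;
    size_t r = ZSTD_decompressBegin(dctx);
    if (ZSTD_isError(r)) return r;
    while (1) {
        size_t const toRead = ZSTD_nextSrcSizeToDecompress(dctx);
        size_t regenerated;
        if (toRead == 0) break;                    /* frame fully decoded */
        if (toRead > (size_t)(iend - ip)) break;   /* truncated input; real code should report this */
        regenerated = ZSTD_decompressContinue(dctx, op, (size_t)(oend - op), ip, toRead);
        if (ZSTD_isError(regenerated)) return regenerated;
        ip += toRead;        /* ZSTD_decompressContinue() consumed exactly toRead bytes */
        op += regenerated;   /* can be 0 when only a metadata item was decoded */
    }
    return (size_t)(op - (char*)dst);   /* total regenerated size */
}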
+@@ -2509,11 +2932,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
+
+
+
+-/* ============================ */
+-/* Block level API */
+-/* ============================ */
++/* ========================================= */
++/* Block level API (DEPRECATED) */
++/* ========================================= */
+
+ /*!
++
++ This API is deprecated in favor of the regular compression API.
++ You can get the frame header down to 2 bytes by setting:
++ - ZSTD_c_format = ZSTD_f_zstd1_magicless
++ - ZSTD_c_contentSizeFlag = 0
++ - ZSTD_c_checksumFlag = 0
++ - ZSTD_c_dictIDFlag = 0
++
++ This API is not as well tested as our normal API, so we recommend not using it.
++ We will be removing it in a future version. If the normal API doesn't provide
++ the functionality you need, please open a GitHub issue.
++
+ Block functions produce and decode raw zstd blocks, without frame metadata.
+ Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes).
+ But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes.
+@@ -2524,7 +2959,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
+ - It is necessary to init context before starting
+ + compression : any ZSTD_compressBegin*() variant, including with dictionary
+ + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
+- + copyCCtx() and copyDCtx() can be used too
+ - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
+ + If input is larger than a block size, it's necessary to split input data into multiple blocks
+ + For inputs larger than a single block, consider using regular ZSTD_compress() instead.
+@@ -2541,11 +2975,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
+ */
+
+ /*===== Raw zstd block functions =====*/
++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
+ ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx);
++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
+ ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
+ ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
+ ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
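/*
 * Sketch of the replacement suggested above: keep the regular API but shrink
 * the frame header to 2 bytes (illustration only). It assumes the advanced
 * parameters ZSTD_c_format / ZSTD_d_format and ZSTD_f_zstd1_magicless declared
 * elsewhere in this file; the decoder must opt into the magicless format too.
 */
static void useMinimalFrameHeader(ZSTD_CCtx* cctx, ZSTD_DCtx* dctx)
{
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, 0);
    ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);
}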
+
+-
+ #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
+
+diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile
+index 20f08c644..464c410b2 100644
+--- a/lib/zstd/Makefile
++++ b/lib/zstd/Makefile
+@@ -1,6 +1,6 @@
+ # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ # ################################################################
+-# Copyright (c) Facebook, Inc.
++# Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under both the BSD-style license (found in the
+diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h
+new file mode 100644
+index 000000000..05adbbecc
+--- /dev/null
++++ b/lib/zstd/common/allocations.h
+@@ -0,0 +1,56 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
++/*
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
++ * All rights reserved.
++ *
++ * This source code is licensed under both the BSD-style license (found in the
++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
++ * in the COPYING file in the root directory of this source tree).
++ * You may select, at your option, one of the above-listed licenses.
++ */
++
++/* This file provides custom allocation primitives
++ */
++
++#define ZSTD_DEPS_NEED_MALLOC
++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
++
++#include "mem.h" /* MEM_STATIC */
++#define ZSTD_STATIC_LINKING_ONLY
++#include <linux/zstd.h> /* ZSTD_customMem */
++
++#ifndef ZSTD_ALLOCATIONS_H
++#define ZSTD_ALLOCATIONS_H
++
++/* custom memory allocation functions */
++
++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
++{
++ if (customMem.customAlloc)
++ return customMem.customAlloc(customMem.opaque, size);
++ return ZSTD_malloc(size);
++}
++
++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
++{
++ if (customMem.customAlloc) {
++ /* calloc implemented as malloc+memset;
++ * not as efficient as calloc, but next best guess for custom malloc */
++ void* const ptr = customMem.customAlloc(customMem.opaque, size);
++ ZSTD_memset(ptr, 0, size);
++ return ptr;
++ }
++ return ZSTD_calloc(1, size);
++}
++
++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
++{
++ if (ptr!=NULL) {
++ if (customMem.customFree)
++ customMem.customFree(customMem.opaque, ptr);
++ else
++ ZSTD_free(ptr);
++ }
++}
++
++#endif /* ZSTD_ALLOCATIONS_H */
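/*
 * Usage sketch for the helpers above (illustration only; the callback names
 * are hypothetical). A zeroed ZSTD_customMem falls back to the plain
 * ZSTD_malloc()/ZSTD_calloc()/ZSTD_free() from zstd_deps.h; a populated one
 * routes every request through the caller's callbacks, with `opaque` passed
 * back verbatim.
 */
static void* sketch_alloc(void* opaque, size_t size) { (void)opaque; return ZSTD_malloc(size); }
static void  sketch_free (void* opaque, void* addr)  { (void)opaque; ZSTD_free(addr); }

static void allocations_usage_sketch(void)
{
    ZSTD_customMem const cmem = { sketch_alloc, sketch_free, NULL /* opaque */ };
    void* const buf = ZSTD_customCalloc(64, cmem);   /* customAlloc + ZSTD_memset(..., 0, ...) */
    ZSTD_customFree(buf, cmem);                      /* ZSTD_customFree() ignores NULL pointers */
}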
+diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h
+new file mode 100644
+index 000000000..aa3487ec4
+--- /dev/null
++++ b/lib/zstd/common/bits.h
+@@ -0,0 +1,149 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
++/*
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
++ * All rights reserved.
++ *
++ * This source code is licensed under both the BSD-style license (found in the
++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
++ * in the COPYING file in the root directory of this source tree).
++ * You may select, at your option, one of the above-listed licenses.
++ */
++
++#ifndef ZSTD_BITS_H
++#define ZSTD_BITS_H
++
++#include "mem.h"
++
++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val)
++{
++ assert(val != 0);
++ {
++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3,
++ 30, 22, 20, 15, 25, 17, 4, 8,
++ 31, 27, 13, 23, 21, 19, 16, 7,
++ 26, 12, 18, 6, 11, 5, 10, 9};
++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27];
++ }
++}
++
++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val)
++{
++ assert(val != 0);
++# if (__GNUC__ >= 4)
++ return (unsigned)__builtin_ctz(val);
++# else
++ return ZSTD_countTrailingZeros32_fallback(val);
++# endif
++}
++
++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) {
++ assert(val != 0);
++ {
++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29,
++ 11, 14, 16, 18, 22, 25, 3, 30,
++ 8, 12, 20, 28, 15, 17, 24, 7,
++ 19, 27, 23, 6, 26, 5, 4, 31};
++ val |= val >> 1;
++ val |= val >> 2;
++ val |= val >> 4;
++ val |= val >> 8;
++ val |= val >> 16;
++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27];
++ }
++}
++
++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val)
++{
++ assert(val != 0);
++# if (__GNUC__ >= 4)
++ return (unsigned)__builtin_clz(val);
++# else
++ return ZSTD_countLeadingZeros32_fallback(val);
++# endif
++}
++
++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val)
++{
++ assert(val != 0);
++# if (__GNUC__ >= 4) && defined(__LP64__)
++ return (unsigned)__builtin_ctzll(val);
++# else
++ {
++ U32 mostSignificantWord = (U32)(val >> 32);
++ U32 leastSignificantWord = (U32)val;
++ if (leastSignificantWord == 0) {
++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord);
++ } else {
++ return ZSTD_countTrailingZeros32(leastSignificantWord);
++ }
++ }
++# endif
++}
++
++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val)
++{
++ assert(val != 0);
++# if (__GNUC__ >= 4)
++ return (unsigned)(__builtin_clzll(val));
++# else
++ {
++ U32 mostSignificantWord = (U32)(val >> 32);
++ U32 leastSignificantWord = (U32)val;
++ if (mostSignificantWord == 0) {
++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord);
++ } else {
++ return ZSTD_countLeadingZeros32(mostSignificantWord);
++ }
++ }
++# endif
++}
++
++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val)
++{
++ if (MEM_isLittleEndian()) {
++ if (MEM_64bits()) {
++ return ZSTD_countTrailingZeros64((U64)val) >> 3;
++ } else {
++ return ZSTD_countTrailingZeros32((U32)val) >> 3;
++ }
++ } else { /* Big Endian CPU */
++ if (MEM_64bits()) {
++ return ZSTD_countLeadingZeros64((U64)val) >> 3;
++ } else {
++ return ZSTD_countLeadingZeros32((U32)val) >> 3;
++ }
++ }
++}
++
++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */
++{
++ assert(val != 0);
++ return 31 - ZSTD_countLeadingZeros32(val);
++}
++
++/* ZSTD_rotateRight_*():
++ * Rotates a bitfield to the right by "count" bits.
++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
++ */
++MEM_STATIC
++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
++ assert(count < 64);
++ count &= 0x3F; /* for fickle pattern recognition */
++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
++}
++
++MEM_STATIC
++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
++ assert(count < 32);
++ count &= 0x1F; /* for fickle pattern recognition */
++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
++}
++
++MEM_STATIC
++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
++ assert(count < 16);
++ count &= 0x0F; /* for fickle pattern recognition */
++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
++}
++
++#endif /* ZSTD_BITS_H */
+diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h
+index feef3a1b1..444dc4f85 100644
+--- a/lib/zstd/common/bitstream.h
++++ b/lib/zstd/common/bitstream.h
+@@ -1,7 +1,8 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /* ******************************************************************
+ * bitstream
+ * Part of FSE library
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+@@ -27,6 +28,7 @@
+ #include "compiler.h" /* UNLIKELY() */
+ #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */
+ #include "error_private.h" /* error codes and messages */
++#include "bits.h" /* ZSTD_highbit32 */
+
+
+ /*=========================================
+@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
+ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+ /* faster, but works only if nbBits >= 1 */
+
+-
+-
+-/*-**************************************************************
+-* Internal functions
+-****************************************************************/
+-MEM_STATIC unsigned BIT_highbit32 (U32 val)
+-{
+- assert(val != 0);
+- {
+-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */
+- return __builtin_clz (val) ^ 31;
+-# else /* Software version */
+- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29,
+- 11, 14, 16, 18, 22, 25, 3, 30,
+- 8, 12, 20, 28, 15, 17, 24, 7,
+- 19, 27, 23, 6, 26, 5, 4, 31 };
+- U32 v = val;
+- v |= v >> 1;
+- v |= v >> 2;
+- v |= v >> 4;
+- v |= v >> 8;
+- v |= v >> 16;
+- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+-# endif
+- }
+-}
+-
+ /*===== Local Constants =====*/
+ static const unsigned BIT_mask[] = {
+ 0, 1, 3, 7, 0xF, 0x1F,
+@@ -178,6 +153,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
+ return 0;
+ }
+
++MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
++{
++ assert(nbBits < BIT_MASK_SIZE);
++ return bitContainer & BIT_mask[nbBits];
++}
++
+ /*! BIT_addBits() :
+ * can add up to 31 bits into `bitC`.
+ * Note : does not check for register overflow ! */
+@@ -187,7 +168,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
+ DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32);
+ assert(nbBits < BIT_MASK_SIZE);
+ assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos;
+ bitC->bitPos += nbBits;
+ }
+
+@@ -266,7 +247,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
+ bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+ bitD->bitContainer = MEM_readLEST(bitD->ptr);
+ { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */
++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */
+ if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+ } else {
+ bitD->ptr = bitD->start;
+@@ -294,7 +275,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
+ default: break;
+ }
+ { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
+ if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */
+ }
+ bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
+@@ -325,12 +306,6 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c
+ #endif
+ }
+
+-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
+-{
+- assert(nbBits < BIT_MASK_SIZE);
+- return bitContainer & BIT_mask[nbBits];
+-}
+-
+ /*! BIT_lookBits() :
+ * Provides next n bits from local register.
+ * local register is not modified.
+@@ -377,7 +352,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n
+ }
+
+ /*! BIT_readBitsFast() :
+- * unsafe version; only works only if nbBits >= 1 */
++ * unsafe version; only works if nbBits >= 1 */
+ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
+ {
+ size_t const value = BIT_lookBitsFast(bitD, nbBits);
+@@ -408,7 +383,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
+ * This function is safe, it guarantees it will not read beyond src buffer.
+ * @return : status of `BIT_DStream_t` internal register.
+ * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
+-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
++MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+ {
+ if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */
+ return BIT_DStream_overflow;
+diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h
+index c42d39faf..c437e0975 100644
+--- a/lib/zstd/common/compiler.h
++++ b/lib/zstd/common/compiler.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -179,6 +180,17 @@
+ * Sanitizer
+ *****************************************************************/
+
++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an
++ * abundance of caution, disable our custom poisoning on mingw. */
++#ifdef __MINGW32__
++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE
++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1
++#endif
++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE
++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1
++#endif
++#endif
++
+
+
+ #endif /* ZSTD_COMPILER_H */
+diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h
+index 0db7b4240..d8319a2be 100644
+--- a/lib/zstd/common/cpu.h
++++ b/lib/zstd/common/cpu.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c
+index bb863c9ea..e56ff6464 100644
+--- a/lib/zstd/common/debug.c
++++ b/lib/zstd/common/debug.c
+@@ -1,7 +1,8 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /* ******************************************************************
+ * debug
+ * Part of FSE library
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h
+index 6dd88d1fb..da0dbfc61 100644
+--- a/lib/zstd/common/debug.h
++++ b/lib/zstd/common/debug.h
+@@ -1,7 +1,8 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /* ******************************************************************
+ * debug
+ * Part of FSE library
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c
+index fef67056f..6cdd82233 100644
+--- a/lib/zstd/common/entropy_common.c
++++ b/lib/zstd/common/entropy_common.c
+@@ -1,6 +1,7 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /* ******************************************************************
+ * Common functions of New Generation Entropy library
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+@@ -19,8 +20,8 @@
+ #include "error_private.h" /* ERR_*, ERROR */
+ #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */
+ #include "fse.h"
+-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */
+ #include "huf.h"
++#include "bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros32 */
+
+
+ /*=== Version ===*/
+@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
+ /*-**************************************************************
+ * FSE NCount encoding-decoding
+ ****************************************************************/
+-static U32 FSE_ctz(U32 val)
+-{
+- assert(val != 0);
+- {
+-# if (__GNUC__ >= 3) /* GCC Intrinsic */
+- return __builtin_ctz(val);
+-# else /* Software version */
+- U32 count = 0;
+- while ((val & 1) == 0) {
+- val >>= 1;
+- ++count;
+- }
+- return count;
+-# endif
+- }
+-}
+-
+ FORCE_INLINE_TEMPLATE
+ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+ const void* headerBuffer, size_t hbSize)
+@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
+ * repeat.
+ * Avoid UB by setting the high bit to 1.
+ */
+- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1;
++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
+ while (repeats >= 12) {
+ charnum += 3 * 12;
+ if (LIKELY(ip <= iend-7)) {
+@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
+ ip = iend - 4;
+ }
+ bitStream = MEM_readLE32(ip) >> bitCount;
+- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1;
++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
+ }
+ charnum += 3 * repeats;
+ bitStream >>= 2 * repeats;
+@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
+ * know that threshold > 1.
+ */
+ if (remaining <= 1) break;
+- nbBits = BIT_highbit32(remaining) + 1;
++ nbBits = ZSTD_highbit32(remaining) + 1;
+ threshold = 1 << (nbBits - 1);
+ }
+ if (charnum >= maxSV1) break;
+@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+ const void* src, size_t srcSize)
+ {
+ U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
+- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0);
++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0);
+ }
+
+ FORCE_INLINE_TEMPLATE size_t
+@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+ if (weightTotal == 0) return ERROR(corruption_detected);
+
+ /* get last non-null symbol weight (implied, total must be 2^n) */
+- { U32 const tableLog = BIT_highbit32(weightTotal) + 1;
++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1;
+ if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+ *tableLogPtr = tableLog;
+ /* determine last weight */
+ { U32 const total = 1 << tableLog;
+ U32 const rest = total - weightTotal;
+- U32 const verif = 1 << BIT_highbit32(rest);
+- U32 const lastWeight = BIT_highbit32(rest) + 1;
++ U32 const verif = 1 << ZSTD_highbit32(rest);
++ U32 const lastWeight = ZSTD_highbit32(rest) + 1;
+ if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */
+ huffWeight[oSize] = (BYTE)lastWeight;
+ rankStats[lastWeight]++;
+@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+ U32* nbSymbolsPtr, U32* tableLogPtr,
+ const void* src, size_t srcSize,
+ void* workSpace, size_t wkspSize,
+- int bmi2)
++ int flags)
+ {
+ #if DYNAMIC_BMI2
+- if (bmi2) {
++ if (flags & HUF_flags_bmi2) {
+ return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
+ }
+ #endif
+- (void)bmi2;
++ (void)flags;
+ return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
+ }
+diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c
+index 6d1135f8c..a4062d30d 100644
+--- a/lib/zstd/common/error_private.c
++++ b/lib/zstd/common/error_private.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code)
+ case PREFIX(version_unsupported): return "Version not supported";
+ case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
+ case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
+- case PREFIX(corruption_detected): return "Corrupted block detected";
++ case PREFIX(corruption_detected): return "Data corruption detected";
+ case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification";
+ case PREFIX(parameter_unsupported): return "Unsupported parameter";
++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters";
+ case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
+ case PREFIX(init_missing): return "Context should be init first";
+ case PREFIX(memory_allocation): return "Allocation error : not enough memory";
+@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code)
+ case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
+ case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
+ case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected";
+ case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
+ case PREFIX(dictionary_wrong): return "Dictionary mismatch";
+ case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
+ case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
+ case PREFIX(srcSize_wrong): return "Src size is incorrect";
+ case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer";
++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full";
++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty";
+ /* following error codes are not stable and may be removed or changed in a future version */
+ case PREFIX(frameIndex_tooLarge): return "Frame index is too large";
+ case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
+ case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong";
+ case PREFIX(srcBuffer_wrong): return "Source buffer is wrong";
++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code";
++ case PREFIX(externalSequences_invalid): return "External sequences are not valid";
+ case PREFIX(maxCode):
+ default: return notErrorCode;
+ }
+diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h
+index ca5101e54..9a4699a38 100644
+--- a/lib/zstd/common/error_private.h
++++ b/lib/zstd/common/error_private.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h
+index 4507043b2..c4e25a219 100644
+--- a/lib/zstd/common/fse.h
++++ b/lib/zstd/common/fse.h
+@@ -1,7 +1,8 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /* ******************************************************************
+ * FSE : Finite State Entropy codec
+ * Public Prototypes declaration
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+@@ -50,34 +51,6 @@
+ FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */
+
+
+-/*-****************************************
+-* FSE simple functions
+-******************************************/
+-/*! FSE_compress() :
+- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
+- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize).
+- @return : size of compressed data (<= dstCapacity).
+- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead.
+- if FSE_isError(return), compression failed (more details using FSE_getErrorName())
+-*/
+-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity,
+- const void* src, size_t srcSize);
+-
+-/*! FSE_decompress():
+- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+- into already allocated destination buffer 'dst', of size 'dstCapacity'.
+- @return : size of regenerated data (<= maxDstSize),
+- or an error code, which can be tested using FSE_isError() .
+-
+- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!!
+- Why ? : making this distinction requires a header.
+- Header management is intentionally delegated to the user layer, which can better manage special cases.
+-*/
+-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity,
+- const void* cSrc, size_t cSrcSize);
+-
+-
+ /*-*****************************************
+ * Tool functions
+ ******************************************/
+@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return
+ FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */
+
+
+-/*-*****************************************
+-* FSE advanced functions
+-******************************************/
+-/*! FSE_compress2() :
+- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
+- Both parameters can be defined as '0' to mean : use default value
+- @return : size of compressed data
+- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!!
+- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
+- if FSE_isError(return), it's an error code.
+-*/
+-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+-
+-
+ /*-*****************************************
+ * FSE detailed API
+ ******************************************/
+@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+ /*! Constructor and Destructor of FSE_CTable.
+ Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
+ typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */
+-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog);
+-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct);
+
+ /*! FSE_buildCTable():
+ Builds `ct`, which must be already allocated, using FSE_createCTable().
+@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter,
+ unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
+ const void* rBuffer, size_t rBuffSize, int bmi2);
+
+-/*! Constructor and Destructor of FSE_DTable.
+- Note that its size depends on 'tableLog' */
+ typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */
+-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog);
+-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt);
+-
+-/*! FSE_buildDTable():
+- Builds 'dt', which must be already allocated, using FSE_createDTable().
+- return : 0, or an errorCode, which can be tested using FSE_isError() */
+-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+-
+-/*! FSE_decompress_usingDTable():
+- Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
+- into `dst` which must be already allocated.
+- @return : size of regenerated data (necessarily <= `dstCapacity`),
+- or an errorCode, which can be tested using FSE_isError() */
+-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
+
+ /*!
+ Tutorial :
+@@ -317,16 +258,6 @@ If there is an error, the function will return an error code, which can be teste
+ unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
+ /*< same as FSE_optimalTableLog(), which used `minus==2` */
+
+-/* FSE_compress_wksp() :
+- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
+- */
+-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
+-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+-
+-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
+-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
+-
+ size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
+ /*< build a fake FSE_CTable, designed to compress always the same symbolValue */
+
+@@ -344,19 +275,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
+ FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+ /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */
+
+-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
+-
+-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */
+-
+-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1)
++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1)
+ #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned))
+-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize);
+-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */
+-
+ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2);
+-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */
++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`.
++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */
+
+ typedef enum {
+ FSE_repeat_none, /*< Cannot use the previous table */
+@@ -552,7 +475,7 @@ MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePt
+
+ /* FSE_getMaxNbBits() :
+ * Approximate maximum cost of a symbol, in bits.
+- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
++ * Fractional costs get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
+ * note 1 : assume symbolValue is valid (<= maxSymbolValue)
+ * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
+ MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue)
+diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c
+index 8dcb8ca39..99ce8fa54 100644
+--- a/lib/zstd/common/fse_decompress.c
++++ b/lib/zstd/common/fse_decompress.c
+@@ -1,6 +1,7 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /* ******************************************************************
+ * FSE : Finite State Entropy decoder
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+@@ -24,6 +25,7 @@
+ #include "error_private.h"
+ #define ZSTD_DEPS_NEED_MALLOC
+ #include "zstd_deps.h"
++#include "bits.h" /* ZSTD_highbit32 */
+
+
+ /* **************************************************************
+@@ -55,19 +57,6 @@
+ #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+ #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+-
+-/* Function templates */
+-FSE_DTable* FSE_createDTable (unsigned tableLog)
+-{
+- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
+-}
+-
+-void FSE_freeDTable (FSE_DTable* dt)
+-{
+- ZSTD_free(dt);
+-}
+-
+ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+ {
+ void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */
+@@ -127,10 +116,10 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
+ }
+ }
+ /* Now we spread those positions across the table.
+- * The benefit of doing it in two stages is that we avoid the the
++ * The benefit of doing it in two stages is that we avoid the
+ * variable size inner loop, which caused lots of branch misses.
+ * Now we can run through all the positions without any branch misses.
+- * We unroll the loop twice, since that is what emperically worked best.
++ * We unroll the loop twice, since that is what empirically worked best.
+ */
+ {
+ size_t position = 0;
+@@ -166,7 +155,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
+ for (u=0; u<tableSize; u++) {
+ FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+ U32 const nextState = symbolNext[symbol]++;
+- tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
++ tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
+ tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+ } }
+
+@@ -184,49 +173,6 @@ size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsi
+ /*-*******************************************************
+ * Decompression (Byte symbols)
+ *********************************************************/
+-size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+-{
+- void* ptr = dt;
+- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+- void* dPtr = dt + 1;
+- FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
+-
+- DTableH->tableLog = 0;
+- DTableH->fastMode = 0;
+-
+- cell->newState = 0;
+- cell->symbol = symbolValue;
+- cell->nbBits = 0;
+-
+- return 0;
+-}
+-
+-
+-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+-{
+- void* ptr = dt;
+- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+- void* dPtr = dt + 1;
+- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
+- const unsigned tableSize = 1 << nbBits;
+- const unsigned tableMask = tableSize - 1;
+- const unsigned maxSV1 = tableMask+1;
+- unsigned s;
+-
+- /* Sanity checks */
+- if (nbBits < 1) return ERROR(GENERIC); /* min size */
+-
+- /* Build Decoding Table */
+- DTableH->tableLog = (U16)nbBits;
+- DTableH->fastMode = 1;
+- for (s=0; s<maxSV1; s++) {
+- dinfo[s].newState = 0;
+- dinfo[s].symbol = (BYTE)s;
+- dinfo[s].nbBits = (BYTE)nbBits;
+- }
+-
+- return 0;
+-}
+
+ FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
+ void* dst, size_t maxDstSize,
+@@ -290,26 +236,6 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
+ return op-ostart;
+ }
+
+-
+-size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+- const void* cSrc, size_t cSrcSize,
+- const FSE_DTable* dt)
+-{
+- const void* ptr = dt;
+- const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
+- const U32 fastMode = DTableH->fastMode;
+-
+- /* select fast mode (static) */
+- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+-}
+-
+-
+-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
+-{
+- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0);
+-}
+-
+ typedef struct {
+ short ncount[FSE_MAX_SYMBOL_VALUE + 1];
+ FSE_DTable dtable[]; /* Dynamically sized */
+@@ -342,7 +268,8 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
+ }
+
+ if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge);
+- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog);
++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize);
++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
+ wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
+
+ CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );
+@@ -382,9 +309,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc,
+ return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
+ }
+
+-
+-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
+-
+-
+-
+ #endif /* FSE_COMMONDEFS_ONLY */
+diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h
+index 5042ff870..8e7943092 100644
+--- a/lib/zstd/common/huf.h
++++ b/lib/zstd/common/huf.h
+@@ -1,7 +1,8 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /* ******************************************************************
+ * huff0 huffman codec,
+ * part of Finite State Entropy library
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+@@ -18,99 +19,22 @@
+
+ /* *** Dependencies *** */
+ #include "zstd_deps.h" /* size_t */
+-
+-
+-/* *** library symbols visibility *** */
+-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual,
+- * HUF symbols remain "private" (internal symbols for library only).
+- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */
+-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+-# define HUF_PUBLIC_API __attribute__ ((visibility ("default")))
+-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */
+-# define HUF_PUBLIC_API __declspec(dllexport)
+-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */
+-#else
+-# define HUF_PUBLIC_API
+-#endif
+-
+-
+-/* ========================== */
+-/* *** simple functions *** */
+-/* ========================== */
+-
+-/* HUF_compress() :
+- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'.
+- * 'dst' buffer must be already allocated.
+- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize).
+- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB.
+- * @return : size of compressed data (<= `dstCapacity`).
+- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+- * if HUF_isError(return), compression failed (more details using HUF_getErrorName())
+- */
+-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity,
+- const void* src, size_t srcSize);
+-
+-/* HUF_decompress() :
+- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
+- * into already allocated buffer 'dst', of minimum size 'dstSize'.
+- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data.
+- * Note : in contrast with FSE, HUF_decompress can regenerate
+- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+- * because it knows size to regenerate (originalSize).
+- * @return : size of regenerated data (== originalSize),
+- * or an error code, which can be tested using HUF_isError()
+- */
+-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize,
+- const void* cSrc, size_t cSrcSize);
++#include "mem.h" /* U32 */
++#define FSE_STATIC_LINKING_ONLY
++#include "fse.h"
+
+
+ /* *** Tool functions *** */
+-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */
+-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */
++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */
++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */
+
+ /* Error Management */
+-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */
+-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */
++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */
++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */
+
+
+-/* *** Advanced function *** */
+-
+-/* HUF_compress2() :
+- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`.
+- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX .
+- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */
+-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,
+- const void* src, size_t srcSize,
+- unsigned maxSymbolValue, unsigned tableLog);
+-
+-/* HUF_compress4X_wksp() :
+- * Same as HUF_compress2(), but uses externally allocated `workSpace`.
+- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */
+ #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */)
+ #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64))
+-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
+- const void* src, size_t srcSize,
+- unsigned maxSymbolValue, unsigned tableLog,
+- void* workSpace, size_t wkspSize);
+-
+-#endif /* HUF_H_298734234 */
+-
+-/* ******************************************************************
+- * WARNING !!
+- * The following section contains advanced and experimental definitions
+- * which shall never be used in the context of a dynamic library,
+- * because they are not guaranteed to remain stable in the future.
+- * Only consider them in association with static linking.
+- * *****************************************************************/
+-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY)
+-#define HUF_H_HUF_STATIC_LINKING_ONLY
+-
+-/* *** Dependencies *** */
+-#include "mem.h" /* U32 */
+-#define FSE_STATIC_LINKING_ONLY
+-#include "fse.h"
+-
+
+ /* *** Constants *** */
+ #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
+@@ -151,25 +75,49 @@ typedef U32 HUF_DTable;
+ /* ****************************************
+ * Advanced decompression functions
+ ******************************************/
+-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */
+-#ifndef HUF_FORCE_DECOMPRESS_X1
+-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */
+-#endif
+
+-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */
+-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */
+-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */
+-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */
+-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */
+-#ifndef HUF_FORCE_DECOMPRESS_X1
+-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */
+-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */
+-#endif
++/*
++ * Huffman flags bitset.
++ * For all flags, 0 is the default value.
++ */
++typedef enum {
++ /*
++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime.
++ * Otherwise: Ignored.
++ */
++ HUF_flags_bmi2 = (1 << 0),
++ /*
++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size.
++ * If unset: Use heuristic to find the table depth.
++ */
++ HUF_flags_optimalDepth = (1 << 1),
++ /*
++ * If set: If the previous table can encode the input, always reuse the previous table.
++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output.
++ */
++ HUF_flags_preferRepeat = (1 << 2),
++ /*
++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress.
++ * If unset: Always histogram the entire input.
++ */
++ HUF_flags_suspectUncompressible = (1 << 3),
++ /*
++ * If set: Don't use assembly implementations
++ * If unset: Allow using assembly implementations
++ */
++ HUF_flags_disableAsm = (1 << 4),
++ /*
++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop.
++ * If unset: Use the fast decoding loop when possible.
++ */
++ HUF_flags_disableFast = (1 << 5)
++} HUF_flags_e;
+
+
+ /* ****************************************
+ * HUF detailed API
+ * ****************************************/
++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra
+
+ /*! HUF_compress() does the following:
+ * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h")
+@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+ * For example, it's possible to compress several blocks using the same 'CTable',
+ * or to save and regenerate 'CTable' using external methods.
+ */
+-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */
+-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
++unsigned HUF_minTableLog(unsigned symbolCardinality);
++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue);
++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace,
++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */
+ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize);
+-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2);
++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
+ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
+ int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
+
+@@ -196,6 +144,7 @@ typedef enum {
+ HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
+ HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */
+ } HUF_repeat;
++
+ /* HUF_compress4X_repeat() :
+ * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ * If it uses hufTable it does not modify hufTable or repeat.
+@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned tableLog,
+ void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
+
+ /* HUF_buildCTable_wksp() :
+ * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
+ */
+-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1)
++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192)
+ #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
+ size_t HUF_buildCTable_wksp (HUF_CElt* tree,
+ const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
+@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
+ U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
+ const void* src, size_t srcSize,
+ void* workspace, size_t wkspSize,
+- int bmi2);
++ int flags);
+
+ /* HUF_readCTable() :
+ * Loading a CTable saved with HUF_writeCTable() */
+@@ -276,32 +225,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
+ #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9))
+ #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
+-#ifndef HUF_FORCE_DECOMPRESS_X2
+-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize);
+-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
+-#endif
+-#ifndef HUF_FORCE_DECOMPRESS_X1
+-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
+-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
+-#endif
+-
+-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+-#ifndef HUF_FORCE_DECOMPRESS_X2
+-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+-#endif
+-#ifndef HUF_FORCE_DECOMPRESS_X1
+-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+-#endif
+-
+
+ /* ====================== */
+ /* single stream variants */
+ /* ====================== */
+
+-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */
+-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2);
++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
+ /* HUF_compress1X_repeat() :
+ * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ * If it uses hufTable it does not modify hufTable or repeat.
+@@ -312,47 +241,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned tableLog,
+ void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
+-
+-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */
+-#ifndef HUF_FORCE_DECOMPRESS_X1
+-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */
+-#endif
+-
+-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
+-#ifndef HUF_FORCE_DECOMPRESS_X2
+-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */
+-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */
+-#endif
+-#ifndef HUF_FORCE_DECOMPRESS_X1
+-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */
+-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */
+-#endif
++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
+
+-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */
+-#ifndef HUF_FORCE_DECOMPRESS_X2
+-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+-#endif
++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
+ #ifndef HUF_FORCE_DECOMPRESS_X1
+-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */
+ #endif
+
+ /* BMI2 variants.
+ * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
+ */
+-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
+ #ifndef HUF_FORCE_DECOMPRESS_X2
+-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
+ #endif
+-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
+ #ifndef HUF_FORCE_DECOMPRESS_X2
+-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
+ #endif
+ #ifndef HUF_FORCE_DECOMPRESS_X1
+-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
+ #endif
+
+-#endif /* HUF_STATIC_LINKING_ONLY */
++#endif /* HUF_H_298734234 */
+
+diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h
+index 1d9cc0392..a7231822b 100644
+--- a/lib/zstd/common/mem.h
++++ b/lib/zstd/common/mem.h
+@@ -1,6 +1,6 @@
+ /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h
+index 0e3b2c0a5..7ede8cf1f 100644
+--- a/lib/zstd/common/portability_macros.h
++++ b/lib/zstd/common/portability_macros.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -12,7 +13,7 @@
+ #define ZSTD_PORTABILITY_MACROS_H
+
+ /*
+- * This header file contains macro defintions to support portability.
++ * This header file contains macro definitions to support portability.
+ * This header is shared between C and ASM code, so it MUST only
+ * contain macro definitions. It MUST not contain any C code.
+ *
+@@ -65,7 +66,7 @@
+ #endif
+
+ /*
+- * Only enable assembly for GNUC comptabile compilers,
++ * Only enable assembly for GNUC compatible compilers,
+ * because other platforms may not support GAS assembly syntax.
+ *
+ * Only enable assembly for Linux / MacOS, other platforms may
+@@ -90,4 +91,23 @@
+ */
+ #define ZSTD_ENABLE_ASM_X86_64_BMI2 0
+
++/*
++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in
++ * assembly sources when CET is enabled.
++ *
++ * Additionally, any function that may be called indirectly must begin
++ * with ZSTD_CET_ENDBRANCH.
++ */
++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \
++ && defined(__has_include)
++# if __has_include(<cet.h>)
++# include <cet.h>
++# define ZSTD_CET_ENDBRANCH _CET_ENDBR
++# endif
++#endif
++
++#ifndef ZSTD_CET_ENDBRANCH
++# define ZSTD_CET_ENDBRANCH
++#endif
++
+ #endif /* ZSTD_PORTABILITY_MACROS_H */
+diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c
+index 3d7e35b30..44b95b253 100644
+--- a/lib/zstd/common/zstd_common.c
++++ b/lib/zstd/common/zstd_common.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -14,7 +15,6 @@
+ * Dependencies
+ ***************************************/
+ #define ZSTD_DEPS_NEED_MALLOC
+-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
+ #include "error_private.h"
+ #include "zstd_internal.h"
+
+@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
+ /*! ZSTD_getErrorString() :
+ * provides error code string from enum */
+ const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
+-
+-
+-
+-/*=**************************************************************
+-* Custom allocator
+-****************************************************************/
+-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
+-{
+- if (customMem.customAlloc)
+- return customMem.customAlloc(customMem.opaque, size);
+- return ZSTD_malloc(size);
+-}
+-
+-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
+-{
+- if (customMem.customAlloc) {
+- /* calloc implemented as malloc+memset;
+- * not as efficient as calloc, but next best guess for custom malloc */
+- void* const ptr = customMem.customAlloc(customMem.opaque, size);
+- ZSTD_memset(ptr, 0, size);
+- return ptr;
+- }
+- return ZSTD_calloc(1, size);
+-}
+-
+-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
+-{
+- if (ptr!=NULL) {
+- if (customMem.customFree)
+- customMem.customFree(customMem.opaque, ptr);
+- else
+- ZSTD_free(ptr);
+- }
+-}
+diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h
+index 2c34e8a33..f931f7d0e 100644
+--- a/lib/zstd/common/zstd_deps.h
++++ b/lib/zstd/common/zstd_deps.h
+@@ -1,6 +1,6 @@
+ /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) {
+
+ #endif /* ZSTD_DEPS_IO */
+ #endif /* ZSTD_DEPS_NEED_IO */
++
++/*
++ * Only requested when MSAN is enabled.
++ * Need:
++ * intptr_t
++ */
++#ifdef ZSTD_DEPS_NEED_STDINT
++#ifndef ZSTD_DEPS_STDINT
++#define ZSTD_DEPS_STDINT
++
++/* intptr_t already provided by ZSTD_DEPS_COMMON */
++
++#endif /* ZSTD_DEPS_STDINT */
++#endif /* ZSTD_DEPS_NEED_STDINT */
+diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h
+index 93305d9b4..7f023e4d4 100644
+--- a/lib/zstd/common/zstd_internal.h
++++ b/lib/zstd/common/zstd_internal.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -28,7 +29,6 @@
+ #include <linux/zstd.h>
+ #define FSE_STATIC_LINKING_ONLY
+ #include "fse.h"
+-#define HUF_STATIC_LINKING_ONLY
+ #include "huf.h"
+ #include <linux/xxhash.h> /* XXH_reset, update, digest */
+ #define ZSTD_TRACE 0
+@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
+ #define ZSTD_FRAMECHECKSUMSIZE 4
+
+ #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */
++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */
++#define MIN_LITERALS_FOR_4_STREAMS 6
+
+-#define HufLog 12
+ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+
+ #define LONGNBSEQ 0x7F00
+@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
+ #define MINMATCH 3
+
+ #define Litbits 8
++#define LitHufLog 11
+ #define MaxLit ((1<<Litbits) - 1)
+ #define MaxML 52
+ #define MaxLL 35
+@@ -103,6 +104,8 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
+ #define LLFSELog 9
+ #define OffFSELog 8
+ #define MaxFSELog MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
++#define MaxMLBits 16
++#define MaxLLBits 16
+
+ #define ZSTD_MAX_HUF_HEADER_SIZE 128 /* header + <= 127 byte tree description */
+ /* Each table cannot take more than #symbols * FSELog bits */
+@@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
+ * one COPY16() in the first call. Then, do two calls per loop since
+ * at that point it is more likely to have a high trip count.
+ */
+-#ifdef __aarch64__
+- do {
+- COPY16(op, ip);
+- }
+- while (op < oend);
+-#else
+ ZSTD_copy16(op, ip);
+ if (16 >= length) return;
+ op += 16;
+@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
+ COPY16(op, ip);
+ }
+ while (op < oend);
+-#endif
+ }
+ }
+
+@@ -289,11 +285,11 @@ typedef enum {
+ typedef struct {
+ seqDef* sequencesStart;
+ seqDef* sequences; /* ptr to end of sequences */
+- BYTE* litStart;
+- BYTE* lit; /* ptr to end of literals */
+- BYTE* llCode;
+- BYTE* mlCode;
+- BYTE* ofCode;
++ BYTE* litStart;
++ BYTE* lit; /* ptr to end of literals */
++ BYTE* llCode;
++ BYTE* mlCode;
++ BYTE* ofCode;
+ size_t maxNbSeq;
+ size_t maxNbLit;
+
+@@ -301,8 +297,8 @@ typedef struct {
+ * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment
+ * the existing value of the litLength or matchLength by 0x10000.
+ */
+- ZSTD_longLengthType_e longLengthType;
+- U32 longLengthPos; /* Index of the sequence to apply long length modification to */
++ ZSTD_longLengthType_e longLengthType;
++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */
+ } seqStore_t;
+
+ typedef struct {
+@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore
+ seqLen.matchLength = seq->mlBase + MINMATCH;
+ if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) {
+ if (seqStore->longLengthType == ZSTD_llt_literalLength) {
+- seqLen.litLength += 0xFFFF;
++ seqLen.litLength += 0x10000;
+ }
+ if (seqStore->longLengthType == ZSTD_llt_matchLength) {
+- seqLen.matchLength += 0xFFFF;
++ seqLen.matchLength += 0x10000;
+ }
+ }
+ return seqLen;
+@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore
+ * `decompressedBound != ZSTD_CONTENTSIZE_ERROR`
+ */
+ typedef struct {
++ size_t nbBlocks;
+ size_t compressedSize;
+ unsigned long long decompressedBound;
+ } ZSTD_frameSizeInfo; /* decompress & legacy */
+
+ const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */
+-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
+-
+-/* custom memory allocation functions */
+-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem);
+-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem);
+-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem);
+-
+-
+-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */
+-{
+- assert(val != 0);
+- {
+-# if (__GNUC__ >= 3) /* GCC Intrinsic */
+- return __builtin_clz (val) ^ 31;
+-# else /* Software version */
+- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+- U32 v = val;
+- v |= v >> 1;
+- v |= v >> 2;
+- v |= v >> 4;
+- v |= v >> 8;
+- v |= v >> 16;
+- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27];
+-# endif
+- }
+-}
+-
+-/*
+- * Counts the number of trailing zeros of a `size_t`.
+- * Most compilers should support CTZ as a builtin. A backup
+- * implementation is provided if the builtin isn't supported, but
+- * it may not be terribly efficient.
+- */
+-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val)
+-{
+- if (MEM_64bits()) {
+-# if (__GNUC__ >= 4)
+- return __builtin_ctzll((U64)val);
+-# else
+- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19,
+- 4, 25, 14, 28, 9, 34, 20, 56,
+- 5, 17, 26, 54, 15, 41, 29, 43,
+- 10, 31, 38, 35, 21, 45, 49, 57,
+- 63, 6, 12, 18, 24, 27, 33, 55,
+- 16, 53, 40, 42, 30, 37, 44, 48,
+- 62, 11, 23, 32, 52, 39, 36, 47,
+- 61, 22, 51, 46, 60, 50, 59, 58 };
+- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
+-# endif
+- } else { /* 32 bits */
+-# if (__GNUC__ >= 3)
+- return __builtin_ctz((U32)val);
+-# else
+- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3,
+- 30, 22, 20, 15, 25, 17, 4, 8,
+- 31, 27, 13, 23, 21, 19, 16, 7,
+- 26, 12, 18, 6, 11, 5, 10, 9 };
+- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+-# endif
+- }
+-}
++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
+
+
+ /* ZSTD_invalidateRepCodes() :
+diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h
+index d9a76112e..6ab8be653 100644
+--- a/lib/zstd/compress/clevels.h
++++ b/lib/zstd/compress/clevels.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c
+index ec5b1ca6d..e46ca6621 100644
+--- a/lib/zstd/compress/fse_compress.c
++++ b/lib/zstd/compress/fse_compress.c
+@@ -1,6 +1,7 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /* ******************************************************************
+ * FSE : Finite State Entropy encoder
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+@@ -26,6 +27,7 @@
+ #define ZSTD_DEPS_NEED_MALLOC
+ #define ZSTD_DEPS_NEED_MATH64
+ #include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */
++#include "../common/bits.h" /* ZSTD_highbit32 */
+
+
+ /* **************************************************************
+@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
+ assert(tableLog < 16); /* required for threshold strategy to work */
+
+ /* For explanations on how to distribute symbol values over the table :
+- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+ #ifdef __clang_analyzer__
+ ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */
+@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
+ break;
+ default :
+ assert(normalizedCounter[s] > 1);
+- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1);
++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1);
+ U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut;
+ symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+ symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]);
+@@ -342,21 +344,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+ * FSE Compression Code
+ ****************************************************************/
+
+-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
+-{
+- size_t size;
+- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
+- return (FSE_CTable*)ZSTD_malloc(size);
+-}
+-
+-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); }
+-
+ /* provides the minimum logSize to safely represent a distribution */
+ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+ {
+- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1;
+- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1;
++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2;
+ U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
+ assert(srcSize > 1); /* Not supported, RLE should be used instead */
+ return minBits;
+@@ -364,7 +356,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+
+ unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
+ {
+- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus;
+ U32 tableLog = maxTableLog;
+ U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
+ assert(srcSize > 1); /* Not supported, RLE should be used instead */
+@@ -532,40 +524,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
+ return tableLog;
+ }
+
+-
+-/* fake FSE_CTable, for raw (uncompressed) input */
+-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
+-{
+- const unsigned tableSize = 1 << nbBits;
+- const unsigned tableMask = tableSize - 1;
+- const unsigned maxSymbolValue = tableMask;
+- void* const ptr = ct;
+- U16* const tableU16 = ( (U16*) ptr) + 2;
+- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */
+- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+- unsigned s;
+-
+- /* Sanity checks */
+- if (nbBits < 1) return ERROR(GENERIC); /* min size */
+-
+- /* header */
+- tableU16[-2] = (U16) nbBits;
+- tableU16[-1] = (U16) maxSymbolValue;
+-
+- /* Build table */
+- for (s=0; s<tableSize; s++)
+- tableU16[s] = (U16)(tableSize + s);
+-
+- /* Build Symbol Transformation Table */
+- { const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
+- for (s=0; s<=maxSymbolValue; s++) {
+- symbolTT[s].deltaNbBits = deltaNbBits;
+- symbolTT[s].deltaFindState = s-1;
+- } }
+-
+- return 0;
+-}
+-
+ /* fake FSE_CTable, for rle input (always same symbol) */
+ size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
+ {
+@@ -664,5 +622,4 @@ size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
+
+ size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
+
+-
+ #endif /* FSE_COMMONDEFS_ONLY */
+diff --git a/lib/zstd/compress/hist.c b/lib/zstd/compress/hist.c
+index 3ddc6dfb6..0b12587cc 100644
+--- a/lib/zstd/compress/hist.c
++++ b/lib/zstd/compress/hist.c
+@@ -1,7 +1,8 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /* ******************************************************************
+ * hist : Histogram functions
+ * part of Finite State Entropy project
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+diff --git a/lib/zstd/compress/hist.h b/lib/zstd/compress/hist.h
+index fc1830abc..f7687b0fc 100644
+--- a/lib/zstd/compress/hist.h
++++ b/lib/zstd/compress/hist.h
+@@ -1,7 +1,8 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /* ******************************************************************
+ * hist : Histogram functions
+ * part of Finite State Entropy project
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compress.c
+index 74ef0db47..83241abaf 100644
+--- a/lib/zstd/compress/huf_compress.c
++++ b/lib/zstd/compress/huf_compress.c
+@@ -1,6 +1,7 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /* ******************************************************************
+ * Huffman encoder, part of New Generation Entropy library
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+@@ -26,9 +27,9 @@
+ #include "hist.h"
+ #define FSE_STATIC_LINKING_ONLY /* FSE_optimalTableLog_internal */
+ #include "../common/fse.h" /* header compression */
+-#define HUF_STATIC_LINKING_ONLY
+ #include "../common/huf.h"
+ #include "../common/error_private.h"
++#include "../common/bits.h" /* ZSTD_highbit32 */
+
+
+ /* **************************************************************
+@@ -39,13 +40,67 @@
+
+
+ /* **************************************************************
+-* Utils
++* Required declarations
+ ****************************************************************/
+-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
++typedef struct nodeElt_s {
++ U32 count;
++ U16 parent;
++ BYTE byte;
++ BYTE nbBits;
++} nodeElt;
++
++
++/* **************************************************************
++* Debug Traces
++****************************************************************/
++
++#if DEBUGLEVEL >= 2
++
++static size_t showU32(const U32* arr, size_t size)
++{
++ size_t u;
++ for (u=0; u<size; u++) {
++ RAWLOG(6, " %u", arr[u]); (void)arr;
++ }
++ RAWLOG(6, " \n");
++ return size;
++}
++
++static size_t HUF_getNbBits(HUF_CElt elt);
++
++static size_t showCTableBits(const HUF_CElt* ctable, size_t size)
+ {
+- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
++ size_t u;
++ for (u=0; u<size; u++) {
++ RAWLOG(6, " %zu", HUF_getNbBits(ctable[u])); (void)ctable;
++ }
++ RAWLOG(6, " \n");
++ return size;
++
+ }
+
++static size_t showHNodeSymbols(const nodeElt* hnode, size_t size)
++{
++ size_t u;
++ for (u=0; u<size; u++) {
++ RAWLOG(6, " %u", hnode[u].byte); (void)hnode;
++ }
++ RAWLOG(6, " \n");
++ return size;
++}
++
++static size_t showHNodeBits(const nodeElt* hnode, size_t size)
++{
++ size_t u;
++ for (u=0; u<size; u++) {
++ RAWLOG(6, " %u", hnode[u].nbBits); (void)hnode;
++ }
++ RAWLOG(6, " \n");
++ return size;
++}
++
++#endif
++
+
+ /* *******************************************************
+ * HUF : Huffman block compression
+@@ -86,7 +141,10 @@ typedef struct {
+ S16 norm[HUF_TABLELOG_MAX+1];
+ } HUF_CompressWeightsWksp;
+
+-static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightTable, size_t wtSize, void* workspace, size_t workspaceSize)
++static size_t
++HUF_compressWeights(void* dst, size_t dstSize,
++ const void* weightTable, size_t wtSize,
++ void* workspace, size_t workspaceSize)
+ {
+ BYTE* const ostart = (BYTE*) dst;
+ BYTE* op = ostart;
+@@ -137,7 +195,7 @@ static size_t HUF_getNbBitsFast(HUF_CElt elt)
+
+ static size_t HUF_getValue(HUF_CElt elt)
+ {
+- return elt & ~0xFF;
++ return elt & ~(size_t)0xFF;
+ }
+
+ static size_t HUF_getValueFast(HUF_CElt elt)
+@@ -175,6 +233,8 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
+ U32 n;
+ HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
+
++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp));
++
+ /* check conditions */
+ if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
+ if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+@@ -204,16 +264,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
+ return ((maxSymbolValue+1)/2) + 1;
+ }
+
+-/*! HUF_writeCTable() :
+- `CTable` : Huffman tree to save, using huf representation.
+- @return : size of saved CTable */
+-size_t HUF_writeCTable (void* dst, size_t maxDstSize,
+- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog)
+-{
+- HUF_WriteCTableWksp wksp;
+- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp));
+-}
+-
+
+ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights)
+ {
+@@ -269,68 +319,64 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
+
+ U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
+ {
+- const HUF_CElt* ct = CTable + 1;
++ const HUF_CElt* const ct = CTable + 1;
+ assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
+ return (U32)HUF_getNbBits(ct[symbolValue]);
+ }
+
+
+-typedef struct nodeElt_s {
+- U32 count;
+- U16 parent;
+- BYTE byte;
+- BYTE nbBits;
+-} nodeElt;
+-
+ /*
+ * HUF_setMaxHeight():
+- * Enforces maxNbBits on the Huffman tree described in huffNode.
++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode.
+ *
+- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts
+- * the tree to so that it is a valid canonical Huffman tree.
++ * It attempts to convert all nodes with nbBits > @targetNbBits
++ * to employ @targetNbBits instead. Then it adjusts the tree
++ * so that it remains a valid canonical Huffman tree.
+ *
+ * @pre The sum of the ranks of each symbol == 2^largestBits,
+ * where largestBits == huffNode[lastNonNull].nbBits.
+ * @post The sum of the ranks of each symbol == 2^largestBits,
+- * where largestBits is the return value <= maxNbBits.
++ * where largestBits is the return value (expected <= targetNbBits).
+ *
+- * @param huffNode The Huffman tree modified in place to enforce maxNbBits.
++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits.
++ * It's presumed sorted, from most frequent to rarest symbol.
+ * @param lastNonNull The symbol with the lowest count in the Huffman tree.
+- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree
++ * @param targetNbBits The allowed number of bits, which the Huffman tree
+ * may not respect. After this function the Huffman tree will
+- * respect maxNbBits.
+- * @return The maximum number of bits of the Huffman tree after adjustment,
+- * necessarily no more than maxNbBits.
++ * respect targetNbBits.
++ * @return The maximum number of bits of the Huffman tree after adjustment.
+ */
+-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits)
+ {
+ const U32 largestBits = huffNode[lastNonNull].nbBits;
+- /* early exit : no elt > maxNbBits, so the tree is already valid. */
+- if (largestBits <= maxNbBits) return largestBits;
++ /* early exit : no elt > targetNbBits, so the tree is already valid. */
++ if (largestBits <= targetNbBits) return largestBits;
++
++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits);
+
+ /* there are several too large elements (at least >= 2) */
+ { int totalCost = 0;
+- const U32 baseCost = 1 << (largestBits - maxNbBits);
++ const U32 baseCost = 1 << (largestBits - targetNbBits);
+ int n = (int)lastNonNull;
+
+- /* Adjust any ranks > maxNbBits to maxNbBits.
++ /* Adjust any ranks > targetNbBits to targetNbBits.
+ * Compute totalCost, which is how far the sum of the ranks is
+ * we are over 2^largestBits after adjust the offending ranks.
+ */
+- while (huffNode[n].nbBits > maxNbBits) {
++ while (huffNode[n].nbBits > targetNbBits) {
+ totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
+- huffNode[n].nbBits = (BYTE)maxNbBits;
++ huffNode[n].nbBits = (BYTE)targetNbBits;
+ n--;
+ }
+- /* n stops at huffNode[n].nbBits <= maxNbBits */
+- assert(huffNode[n].nbBits <= maxNbBits);
+- /* n end at index of smallest symbol using < maxNbBits */
+- while (huffNode[n].nbBits == maxNbBits) --n;
++ /* n stops at huffNode[n].nbBits <= targetNbBits */
++ assert(huffNode[n].nbBits <= targetNbBits);
++ /* n end at index of smallest symbol using < targetNbBits */
++ while (huffNode[n].nbBits == targetNbBits) --n;
+
+- /* renorm totalCost from 2^largestBits to 2^maxNbBits
++ /* renorm totalCost from 2^largestBits to 2^targetNbBits
+ * note : totalCost is necessarily a multiple of baseCost */
+- assert((totalCost & (baseCost - 1)) == 0);
+- totalCost >>= (largestBits - maxNbBits);
++ assert(((U32)totalCost & (baseCost - 1)) == 0);
++ totalCost >>= (largestBits - targetNbBits);
+ assert(totalCost > 0);
+
+ /* repay normalized cost */
+@@ -339,19 +385,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+
+ /* Get pos of last (smallest = lowest cum. count) symbol per rank */
+ ZSTD_memset(rankLast, 0xF0, sizeof(rankLast));
+- { U32 currentNbBits = maxNbBits;
++ { U32 currentNbBits = targetNbBits;
+ int pos;
+ for (pos=n ; pos >= 0; pos--) {
+ if (huffNode[pos].nbBits >= currentNbBits) continue;
+- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */
+- rankLast[maxNbBits-currentNbBits] = (U32)pos;
++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */
++ rankLast[targetNbBits-currentNbBits] = (U32)pos;
+ } }
+
+ while (totalCost > 0) {
+ /* Try to reduce the next power of 2 above totalCost because we
+ * gain back half the rank.
+ */
+- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1;
++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1;
+ for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
+ U32 const highPos = rankLast[nBitsToDecrease];
+ U32 const lowPos = rankLast[nBitsToDecrease-1];
+@@ -391,7 +437,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+ rankLast[nBitsToDecrease] = noSymbol;
+ else {
+ rankLast[nBitsToDecrease]--;
+- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease)
+ rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */
+ }
+ } /* while (totalCost > 0) */
+@@ -403,11 +449,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+ * TODO.
+ */
+ while (totalCost < 0) { /* Sometimes, cost correction overshoot */
+- /* special case : no rank 1 symbol (using maxNbBits-1);
+- * let's create one from largest rank 0 (using maxNbBits).
++ /* special case : no rank 1 symbol (using targetNbBits-1);
++ * let's create one from largest rank 0 (using targetNbBits).
+ */
+ if (rankLast[1] == noSymbol) {
+- while (huffNode[n].nbBits == maxNbBits) n--;
++ while (huffNode[n].nbBits == targetNbBits) n--;
+ huffNode[n+1].nbBits--;
+ assert(n >= 0);
+ rankLast[1] = (U32)(n+1);
+@@ -421,7 +467,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+ } /* repay normalized cost */
+ } /* there are several too large elements (at least >= 2) */
+
+- return maxNbBits;
++ return targetNbBits;
+ }
+
+ typedef struct {
+@@ -429,7 +475,7 @@ typedef struct {
+ U16 curr;
+ } rankPos;
+
+-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)];
+
+ /* Number of buckets available for HUF_sort() */
+ #define RANK_POSITION_TABLE_SIZE 192
+@@ -448,8 +494,8 @@ typedef struct {
+ * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
+ */
+ #define RANK_POSITION_MAX_COUNT_LOG 32
+-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */
+-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */
++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */)
++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */)
+
+ /* Return the appropriate bucket index for a given count. See definition of
+ * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
+@@ -457,7 +503,7 @@ typedef struct {
+ static U32 HUF_getIndex(U32 const count) {
+ return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
+ ? count
+- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
+ }
+
+ /* Helper swap function for HUF_quickSortPartition() */
+@@ -580,7 +626,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy
+
+ /* Sort each bucket. */
+ for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {
+- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base;
++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base;
+ U32 const bucketStartIdx = rankPosition[n].base;
+ if (bucketSize > 1) {
+ assert(bucketStartIdx < maxSymbolValue1);
+@@ -591,6 +637,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy
+ assert(HUF_isSorted(huffNode, maxSymbolValue1));
+ }
+
++
+ /* HUF_buildCTable_wksp() :
+ * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables).
+@@ -611,6 +658,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
+ int lowS, lowN;
+ int nodeNb = STARTNODE;
+ int n, nodeRoot;
++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1);
+ /* init for parents */
+ nonNullRank = (int)maxSymbolValue;
+ while(huffNode[nonNullRank].count == 0) nonNullRank--;
+@@ -637,6 +685,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
+ for (n=0; n<=nonNullRank; n++)
+ huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+
++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1));
++
+ return nonNullRank;
+ }
+
+@@ -674,28 +724,36 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
+ CTable[0] = maxNbBits;
+ }
+
+-size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
++size_t
++HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
++ void* workSpace, size_t wkspSize)
+ {
+- HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
++ HUF_buildCTable_wksp_tables* const wksp_tables =
++ (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
+ nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
+ nodeElt* const huffNode = huffNode0+1;
+ int nonNullRank;
+
++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables));
++
++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1);
++
+ /* safety checks */
+ if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
+- return ERROR(workSpace_tooSmall);
++ return ERROR(workSpace_tooSmall);
+ if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
+ if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
+- return ERROR(maxSymbolValue_tooLarge);
++ return ERROR(maxSymbolValue_tooLarge);
+ ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable));
+
+ /* sort, decreasing order */
+ HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition);
++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1));
+
+ /* build tree */
+ nonNullRank = HUF_buildTree(huffNode, maxSymbolValue);
+
+- /* enforce maxTableLog */
++ /* determine and enforce maxTableLog */
+ maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
+ if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */
+
+@@ -804,7 +862,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id
+ #if DEBUGLEVEL >= 1
+ {
+ size_t const nbBits = HUF_getNbBits(elt);
+- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1;
++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1;
+ (void)dirtyBits;
+ /* Middle bits are 0. */
+ assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
+@@ -884,7 +942,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC)
+ {
+ size_t const nbBits = bitC->bitPos[0] & 0xFF;
+ if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+- return (bitC->ptr - bitC->startPtr) + (nbBits > 0);
++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0);
+ }
+ }
+
+@@ -1045,9 +1103,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
+ static size_t
+ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+- const HUF_CElt* CTable, const int bmi2)
++ const HUF_CElt* CTable, const int flags)
+ {
+- if (bmi2) {
++ if (flags & HUF_flags_bmi2) {
+ return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
+ }
+ return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
+@@ -1058,28 +1116,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+ static size_t
+ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+- const HUF_CElt* CTable, const int bmi2)
++ const HUF_CElt* CTable, const int flags)
+ {
+- (void)bmi2;
++ (void)flags;
+ return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+ }
+
+ #endif
+
+-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+-{
+- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+-}
+-
+-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
+ {
+- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
+ }
+
+ static size_t
+ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+- const HUF_CElt* CTable, int bmi2)
++ const HUF_CElt* CTable, int flags)
+ {
+ size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */
+ const BYTE* ip = (const BYTE*) src;
+@@ -1093,7 +1146,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+ op += 6; /* jumpTable */
+
+ assert(op <= oend);
+- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
+ if (cSize == 0 || cSize > 65535) return 0;
+ MEM_writeLE16(ostart, (U16)cSize);
+ op += cSize;
+@@ -1101,7 +1154,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+
+ ip += segmentSize;
+ assert(op <= oend);
+- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
+ if (cSize == 0 || cSize > 65535) return 0;
+ MEM_writeLE16(ostart+2, (U16)cSize);
+ op += cSize;
+@@ -1109,7 +1162,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+
+ ip += segmentSize;
+ assert(op <= oend);
+- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
+ if (cSize == 0 || cSize > 65535) return 0;
+ MEM_writeLE16(ostart+4, (U16)cSize);
+ op += cSize;
+@@ -1118,7 +1171,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+ ip += segmentSize;
+ assert(op <= oend);
+ assert(ip <= iend);
+- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) );
++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) );
+ if (cSize == 0 || cSize > 65535) return 0;
+ op += cSize;
+ }
+@@ -1126,14 +1179,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+ return (size_t)(op-ostart);
+ }
+
+-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
+ {
+- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+-}
+-
+-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
+-{
+- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
+ }
+
+ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
+@@ -1141,11 +1189,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
+ static size_t HUF_compressCTable_internal(
+ BYTE* const ostart, BYTE* op, BYTE* const oend,
+ const void* src, size_t srcSize,
+- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2)
++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags)
+ {
+ size_t const cSize = (nbStreams==HUF_singleStream) ?
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) :
+- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2);
++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) :
++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags);
+ if (HUF_isError(cSize)) { return cSize; }
+ if (cSize==0) { return 0; } /* uncompressible */
+ op += cSize;
+@@ -1168,6 +1216,79 @@ typedef struct {
+ #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
+ #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */
+
++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue)
++{
++ unsigned cardinality = 0;
++ unsigned i;
++
++ for (i = 0; i < maxSymbolValue + 1; i++) {
++ if (count[i] != 0) cardinality += 1;
++ }
++
++ return cardinality;
++}
++
++unsigned HUF_minTableLog(unsigned symbolCardinality)
++{
++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1;
++ return minBitsSymbols;
++}
++
++unsigned HUF_optimalTableLog(
++ unsigned maxTableLog,
++ size_t srcSize,
++ unsigned maxSymbolValue,
++ void* workSpace, size_t wkspSize,
++ HUF_CElt* table,
++ const unsigned* count,
++ int flags)
++{
++ assert(srcSize > 1); /* Not supported, RLE should be used instead */
++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables));
++
++ if (!(flags & HUF_flags_optimalDepth)) {
++ /* cheap evaluation, based on FSE */
++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
++ }
++
++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp);
++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp);
++ size_t maxBits, hSize, newSize;
++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue);
++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality);
++ size_t optSize = ((size_t) ~0) - 1;
++ unsigned optLog = maxTableLog, optLogGuess;
++
++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize);
++
++ /* Search until size increases */
++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) {
++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess);
++ maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize);
++ if (ERR_isError(maxBits)) continue;
++
++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break;
++
++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize);
++
++ if (ERR_isError(hSize)) continue;
++
++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize;
++
++ if (newSize > optSize + 1) {
++ break;
++ }
++
++ if (newSize < optSize) {
++ optSize = newSize;
++ optLog = optLogGuess;
++ }
++ }
++ assert(optLog <= HUF_TABLELOG_MAX);
++ return optLog;
++ }
++}
++
+ /* HUF_compress_internal() :
+ * `workSpace_align4` must be aligned on 4-bytes boundaries,
+ * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */
+@@ -1177,14 +1298,14 @@ HUF_compress_internal (void* dst, size_t dstSize,
+ unsigned maxSymbolValue, unsigned huffLog,
+ HUF_nbStreams_e nbStreams,
+ void* workSpace, size_t wkspSize,
+- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
+- const int bmi2, unsigned suspectUncompressible)
++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags)
+ {
+ HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t));
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstSize;
+ BYTE* op = ostart;
+
++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize);
+ HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);
+
+ /* checks & inits */
+@@ -1198,16 +1319,17 @@ HUF_compress_internal (void* dst, size_t dstSize,
+ if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;
+
+ /* Heuristic : If old table is valid, use it for small inputs */
+- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) {
+ return HUF_compressCTable_internal(ostart, op, oend,
+ src, srcSize,
+- nbStreams, oldHufTable, bmi2);
++ nbStreams, oldHufTable, flags);
+ }
+
+ /* If uncompressible data is suspected, do a smaller sampling first */
+ DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2);
+- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
+ size_t largestTotal = 0;
++ DEBUGLOG(5, "input suspected incompressible : sampling to check");
+ { unsigned maxSymbolValueBegin = maxSymbolValue;
+ CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
+ largestTotal += largestBegin;
+@@ -1224,6 +1346,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
+ if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */
+ if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */
+ }
++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1));
+
+ /* Check validity of previous table */
+ if ( repeat
+@@ -1232,19 +1355,20 @@ HUF_compress_internal (void* dst, size_t dstSize,
+ *repeat = HUF_repeat_none;
+ }
+ /* Heuristic : use existing table for small inputs */
+- if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) {
+ return HUF_compressCTable_internal(ostart, op, oend,
+ src, srcSize,
+- nbStreams, oldHufTable, bmi2);
++ nbStreams, oldHufTable, flags);
+ }
+
+ /* Build Huffman Tree */
+- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags);
+ { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count,
+ maxSymbolValue, huffLog,
+ &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
+ CHECK_F(maxBits);
+ huffLog = (U32)maxBits;
++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1));
+ }
+ /* Zero unused symbols in CTable, so we can check it for validity */
+ {
+@@ -1263,7 +1387,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
+ if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
+ return HUF_compressCTable_internal(ostart, op, oend,
+ src, srcSize,
+- nbStreams, oldHufTable, bmi2);
++ nbStreams, oldHufTable, flags);
+ } }
+
+ /* Use the new huffman table */
+@@ -1275,46 +1399,20 @@ HUF_compress_internal (void* dst, size_t dstSize,
+ }
+ return HUF_compressCTable_internal(ostart, op, oend,
+ src, srcSize,
+- nbStreams, table->CTable, bmi2);
+-}
+-
+-
+-size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
+- const void* src, size_t srcSize,
+- unsigned maxSymbolValue, unsigned huffLog,
+- void* workSpace, size_t wkspSize)
+-{
+- return HUF_compress_internal(dst, dstSize, src, srcSize,
+- maxSymbolValue, huffLog, HUF_singleStream,
+- workSpace, wkspSize,
+- NULL, NULL, 0, 0 /*bmi2*/, 0);
++ nbStreams, table->CTable, flags);
+ }
+
+ size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog,
+ void* workSpace, size_t wkspSize,
+- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat,
+- int bmi2, unsigned suspectUncompressible)
++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
+ {
++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize);
+ return HUF_compress_internal(dst, dstSize, src, srcSize,
+ maxSymbolValue, huffLog, HUF_singleStream,
+ workSpace, wkspSize, hufTable,
+- repeat, preferRepeat, bmi2, suspectUncompressible);
+-}
+-
+-/* HUF_compress4X_repeat():
+- * compress input using 4 streams.
+- * provide workspace to generate compression tables */
+-size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
+- const void* src, size_t srcSize,
+- unsigned maxSymbolValue, unsigned huffLog,
+- void* workSpace, size_t wkspSize)
+-{
+- return HUF_compress_internal(dst, dstSize, src, srcSize,
+- maxSymbolValue, huffLog, HUF_fourStreams,
+- workSpace, wkspSize,
+- NULL, NULL, 0, 0 /*bmi2*/, 0);
++ repeat, flags);
+ }
+
+ /* HUF_compress4X_repeat():
+@@ -1325,11 +1423,11 @@ size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog,
+ void* workSpace, size_t wkspSize,
+- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible)
++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
+ {
++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize);
+ return HUF_compress_internal(dst, dstSize, src, srcSize,
+ maxSymbolValue, huffLog, HUF_fourStreams,
+ workSpace, wkspSize,
+- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible);
++ hufTable, repeat, flags);
+ }
+-
+diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c
+index f620cafca..c1c316e9e 100644
+--- a/lib/zstd/compress/zstd_compress.c
++++ b/lib/zstd/compress/zstd_compress.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -11,12 +12,12 @@
+ /*-*************************************
+ * Dependencies
+ ***************************************/
++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
+ #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */
+ #include "../common/mem.h"
+ #include "hist.h" /* HIST_countFast_wksp */
+ #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */
+ #include "../common/fse.h"
+-#define HUF_STATIC_LINKING_ONLY
+ #include "../common/huf.h"
+ #include "zstd_compress_internal.h"
+ #include "zstd_compress_sequences.h"
+@@ -27,6 +28,7 @@
+ #include "zstd_opt.h"
+ #include "zstd_ldm.h"
+ #include "zstd_compress_superblock.h"
++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */
+
+ /* ***************************************************************
+ * Tuning parameters
+@@ -55,14 +57,17 @@
+ * Helper functions
+ ***************************************/
+ /* ZSTD_compressBound()
+- * Note that the result from this function is only compatible with the "normal"
+- * full-block strategy.
+- * When there are a lot of small blocks due to frequent flush in streaming mode
+- * the overhead of headers can make the compressed data to be larger than the
+- * return value of ZSTD_compressBound().
++ * Note that the result from this function is only valid for
++ * the one-pass compression functions.
++ * When employing the streaming mode,
++ * if flushes are frequently altering the size of blocks,
++ * the overhead from block headers can make the compressed data larger
++ * than the return value of ZSTD_compressBound().
+ */
+ size_t ZSTD_compressBound(size_t srcSize) {
+- return ZSTD_COMPRESSBOUND(srcSize);
++ size_t const r = ZSTD_COMPRESSBOUND(srcSize);
++ if (r==0) return ERROR(srcSize_wrong);
++ return r;
+ }
+
+
+@@ -171,12 +176,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
+ if (cctx==NULL) return 0; /* support free on NULL */
+ RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
+ "not compatible with static CCtx");
+- {
+- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);
++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);
+ ZSTD_freeCCtxContent(cctx);
+- if (!cctxInWorkspace) {
+- ZSTD_customFree(cctx, cctx->customMem);
+- }
++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem);
+ }
+ return 0;
+ }
+@@ -257,9 +259,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
+ return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder));
+ }
+
+-/* Returns 1 if compression parameters are such that we should
++/* Returns ZSTD_ps_enable if compression parameters are such that we should
+ * enable long distance matching (wlog >= 27, strategy >= btopt).
+- * Returns 0 otherwise.
++ * Returns ZSTD_ps_disable otherwise.
+ */
+ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
+ const ZSTD_compressionParameters* const cParams) {
+@@ -267,6 +269,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
+ return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable;
+ }
+
++static int ZSTD_resolveExternalSequenceValidation(int mode) {
++ return mode;
++}
++
++/* Resolves maxBlockSize to the default if no value is present. */
++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) {
++ if (maxBlockSize == 0) {
++ return ZSTD_BLOCKSIZE_MAX;
++ } else {
++ return maxBlockSize;
++ }
++}
++
++static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) {
++ if (value != ZSTD_ps_auto) return value;
++ if (cLevel < 10) {
++ return ZSTD_ps_disable;
++ } else {
++ return ZSTD_ps_enable;
++ }
++}
++
++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged.
++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */
++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) {
++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast;
++}
++
+ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
+ ZSTD_compressionParameters cParams)
+ {
+@@ -284,6 +314,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
+ }
+ cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
+ cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences);
++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize);
++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes,
++ cctxParams.compressionLevel);
+ assert(!ZSTD_checkCParams(cParams));
+ return cctxParams;
+ }
+@@ -329,10 +363,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel)
+ #define ZSTD_NO_CLEVEL 0
+
+ /*
+- * Initializes the cctxParams from params and compressionLevel.
++ * Initializes `cctxParams` from `params` and `compressionLevel`.
+ * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL.
+ */
+-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel)
++static void
++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams,
++ const ZSTD_parameters* params,
++ int compressionLevel)
+ {
+ assert(!ZSTD_checkCParams(params->cParams));
+ ZSTD_memset(cctxParams, 0, sizeof(*cctxParams));
+@@ -345,6 +382,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par
+ cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
+ cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
+ cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences);
++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize);
++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel);
+ DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
+ cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
+ }
+@@ -359,7 +399,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete
+
+ /*
+ * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone.
+- * @param param Validated zstd parameters.
++ * @param params Validated zstd parameters.
+ */
+ static void ZSTD_CCtxParams_setZstdParams(
+ ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params)
+@@ -455,8 +495,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
+ return bounds;
+
+ case ZSTD_c_enableLongDistanceMatching:
+- bounds.lowerBound = 0;
+- bounds.upperBound = 1;
++ bounds.lowerBound = (int)ZSTD_ps_auto;
++ bounds.upperBound = (int)ZSTD_ps_disable;
+ return bounds;
+
+ case ZSTD_c_ldmHashLog:
+@@ -549,6 +589,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
+ bounds.upperBound = 1;
+ return bounds;
+
++ case ZSTD_c_prefetchCDictTables:
++ bounds.lowerBound = (int)ZSTD_ps_auto;
++ bounds.upperBound = (int)ZSTD_ps_disable;
++ return bounds;
++
++ case ZSTD_c_enableSeqProducerFallback:
++ bounds.lowerBound = 0;
++ bounds.upperBound = 1;
++ return bounds;
++
++ case ZSTD_c_maxBlockSize:
++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN;
++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX;
++ return bounds;
++
++ case ZSTD_c_searchForExternalRepcodes:
++ bounds.lowerBound = (int)ZSTD_ps_auto;
++ bounds.upperBound = (int)ZSTD_ps_disable;
++ return bounds;
++
+ default:
+ bounds.error = ERROR(parameter_unsupported);
+ return bounds;
+@@ -613,6 +673,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
+ case ZSTD_c_useBlockSplitter:
+ case ZSTD_c_useRowMatchFinder:
+ case ZSTD_c_deterministicRefPrefix:
++ case ZSTD_c_prefetchCDictTables:
++ case ZSTD_c_enableSeqProducerFallback:
++ case ZSTD_c_maxBlockSize:
++ case ZSTD_c_searchForExternalRepcodes:
+ default:
+ return 0;
+ }
+@@ -625,7 +689,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
+ if (ZSTD_isUpdateAuthorized(param)) {
+ cctx->cParamsChanged = 1;
+ } else {
+- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage");
++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage");
+ } }
+
+ switch(param)
+@@ -668,6 +732,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
+ case ZSTD_c_useBlockSplitter:
+ case ZSTD_c_useRowMatchFinder:
+ case ZSTD_c_deterministicRefPrefix:
++ case ZSTD_c_prefetchCDictTables:
++ case ZSTD_c_enableSeqProducerFallback:
++ case ZSTD_c_maxBlockSize:
++ case ZSTD_c_searchForExternalRepcodes:
+ break;
+
+ default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
+@@ -723,12 +791,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+ case ZSTD_c_minMatch :
+ if (value!=0) /* 0 => use default */
+ BOUNDCHECK(ZSTD_c_minMatch, value);
+- CCtxParams->cParams.minMatch = value;
++ CCtxParams->cParams.minMatch = (U32)value;
+ return CCtxParams->cParams.minMatch;
+
+ case ZSTD_c_targetLength :
+ BOUNDCHECK(ZSTD_c_targetLength, value);
+- CCtxParams->cParams.targetLength = value;
++ CCtxParams->cParams.targetLength = (U32)value;
+ return CCtxParams->cParams.targetLength;
+
+ case ZSTD_c_strategy :
+@@ -741,12 +809,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+ /* Content size written in frame header _when known_ (default:1) */
+ DEBUGLOG(4, "set content size flag = %u", (value!=0));
+ CCtxParams->fParams.contentSizeFlag = value != 0;
+- return CCtxParams->fParams.contentSizeFlag;
++ return (size_t)CCtxParams->fParams.contentSizeFlag;
+
+ case ZSTD_c_checksumFlag :
+ /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */
+ CCtxParams->fParams.checksumFlag = value != 0;
+- return CCtxParams->fParams.checksumFlag;
++ return (size_t)CCtxParams->fParams.checksumFlag;
+
+ case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */
+ DEBUGLOG(4, "set dictIDFlag = %u", (value!=0));
+@@ -755,18 +823,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+
+ case ZSTD_c_forceMaxWindow :
+ CCtxParams->forceWindow = (value != 0);
+- return CCtxParams->forceWindow;
++ return (size_t)CCtxParams->forceWindow;
+
+ case ZSTD_c_forceAttachDict : {
+ const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value;
+- BOUNDCHECK(ZSTD_c_forceAttachDict, pref);
++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref);
+ CCtxParams->attachDictPref = pref;
+ return CCtxParams->attachDictPref;
+ }
+
+ case ZSTD_c_literalCompressionMode : {
+ const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value;
+- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm);
++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm);
+ CCtxParams->literalCompressionMode = lcm;
+ return CCtxParams->literalCompressionMode;
+ }
+@@ -789,47 +857,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+
+ case ZSTD_c_enableDedicatedDictSearch :
+ CCtxParams->enableDedicatedDictSearch = (value!=0);
+- return CCtxParams->enableDedicatedDictSearch;
++ return (size_t)CCtxParams->enableDedicatedDictSearch;
+
+ case ZSTD_c_enableLongDistanceMatching :
++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value);
+ CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value;
+ return CCtxParams->ldmParams.enableLdm;
+
+ case ZSTD_c_ldmHashLog :
+ if (value!=0) /* 0 ==> auto */
+ BOUNDCHECK(ZSTD_c_ldmHashLog, value);
+- CCtxParams->ldmParams.hashLog = value;
++ CCtxParams->ldmParams.hashLog = (U32)value;
+ return CCtxParams->ldmParams.hashLog;
+
+ case ZSTD_c_ldmMinMatch :
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_ldmMinMatch, value);
+- CCtxParams->ldmParams.minMatchLength = value;
++ CCtxParams->ldmParams.minMatchLength = (U32)value;
+ return CCtxParams->ldmParams.minMatchLength;
+
+ case ZSTD_c_ldmBucketSizeLog :
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value);
+- CCtxParams->ldmParams.bucketSizeLog = value;
++ CCtxParams->ldmParams.bucketSizeLog = (U32)value;
+ return CCtxParams->ldmParams.bucketSizeLog;
+
+ case ZSTD_c_ldmHashRateLog :
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_ldmHashRateLog, value);
+- CCtxParams->ldmParams.hashRateLog = value;
++ CCtxParams->ldmParams.hashRateLog = (U32)value;
+ return CCtxParams->ldmParams.hashRateLog;
+
+ case ZSTD_c_targetCBlockSize :
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_targetCBlockSize, value);
+- CCtxParams->targetCBlockSize = value;
++ CCtxParams->targetCBlockSize = (U32)value;
+ return CCtxParams->targetCBlockSize;
+
+ case ZSTD_c_srcSizeHint :
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_srcSizeHint, value);
+ CCtxParams->srcSizeHint = value;
+- return CCtxParams->srcSizeHint;
++ return (size_t)CCtxParams->srcSizeHint;
+
+ case ZSTD_c_stableInBuffer:
+ BOUNDCHECK(ZSTD_c_stableInBuffer, value);
+@@ -866,6 +935,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+ CCtxParams->deterministicRefPrefix = !!value;
+ return CCtxParams->deterministicRefPrefix;
+
++ case ZSTD_c_prefetchCDictTables:
++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value);
++ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value;
++ return CCtxParams->prefetchCDictTables;
++
++ case ZSTD_c_enableSeqProducerFallback:
++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value);
++ CCtxParams->enableMatchFinderFallback = value;
++ return CCtxParams->enableMatchFinderFallback;
++
++ case ZSTD_c_maxBlockSize:
++ if (value!=0) /* 0 ==> default */
++ BOUNDCHECK(ZSTD_c_maxBlockSize, value);
++ CCtxParams->maxBlockSize = value;
++ return CCtxParams->maxBlockSize;
++
++ case ZSTD_c_searchForExternalRepcodes:
++ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value);
++ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value;
++ return CCtxParams->searchForExternalRepcodes;
++
+ default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
+ }
+ }
+@@ -980,6 +1070,18 @@ size_t ZSTD_CCtxParams_getParameter(
+ case ZSTD_c_deterministicRefPrefix:
+ *value = (int)CCtxParams->deterministicRefPrefix;
+ break;
++ case ZSTD_c_prefetchCDictTables:
++ *value = (int)CCtxParams->prefetchCDictTables;
++ break;
++ case ZSTD_c_enableSeqProducerFallback:
++ *value = CCtxParams->enableMatchFinderFallback;
++ break;
++ case ZSTD_c_maxBlockSize:
++ *value = (int)CCtxParams->maxBlockSize;
++ break;
++ case ZSTD_c_searchForExternalRepcodes:
++ *value = (int)CCtxParams->searchForExternalRepcodes;
++ break;
+ default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
+ }
+ return 0;
+@@ -1006,9 +1108,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+ return 0;
+ }
+
++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams)
++{
++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */);
++ DEBUGLOG(4, "ZSTD_CCtx_setCParams");
++ /* only update if all parameters are valid */
++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), "");
++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), "");
++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), "");
++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), "");
++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), "");
++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), "");
++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), "");
++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), "");
++ return 0;
++}
++
++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams)
++{
++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */);
++ DEBUGLOG(4, "ZSTD_CCtx_setFParams");
++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), "");
++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), "");
++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), "");
++ return 0;
++}
++
++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params)
++{
++ DEBUGLOG(4, "ZSTD_CCtx_setParams");
++ /* First check cParams, because we want to update all or none. */
++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), "");
++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */
++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), "");
++ /* Finally set cParams, which should succeed. */
++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), "");
++ return 0;
++}
++
+ size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
+ {
+- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize);
++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize);
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "Can't set pledgedSrcSize when not in init stage.");
+ cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+@@ -1024,9 +1164,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams(
+ ZSTD_compressionParameters* cParams);
+
+ /*
+- * Initializes the local dict using the requested parameters.
+- * NOTE: This does not use the pledged src size, because it may be used for more
+- * than one compression.
++ * Initializes the local dictionary using requested parameters.
++ * NOTE: Initialization does not employ the pledged src size,
++ * because the dictionary may be used for multiple compressions.
+ */
+ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
+ {
+@@ -1039,8 +1179,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
+ return 0;
+ }
+ if (dl->cdict != NULL) {
+- assert(cctx->cdict == dl->cdict);
+ /* Local dictionary already initialized. */
++ assert(cctx->cdict == dl->cdict);
+ return 0;
+ }
+ assert(dl->dictSize > 0);
+@@ -1060,26 +1200,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
+ }
+
+ size_t ZSTD_CCtx_loadDictionary_advanced(
+- ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
+- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
++ ZSTD_CCtx* cctx,
++ const void* dict, size_t dictSize,
++ ZSTD_dictLoadMethod_e dictLoadMethod,
++ ZSTD_dictContentType_e dictContentType)
+ {
+- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+- "Can't load a dictionary when ctx is not in init stage.");
+ DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
+- ZSTD_clearAllDicts(cctx); /* in case one already exists */
+- if (dict == NULL || dictSize == 0) /* no dictionary mode */
++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
++ "Can't load a dictionary when cctx is not in init stage.");
++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */
++ if (dict == NULL || dictSize == 0) /* no dictionary */
+ return 0;
+ if (dictLoadMethod == ZSTD_dlm_byRef) {
+ cctx->localDict.dict = dict;
+ } else {
++ /* copy dictionary content inside CCtx to own its lifetime */
+ void* dictBuffer;
+ RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
+- "no malloc for static CCtx");
++ "static CCtx can't allocate for an internal copy of dictionary");
+ dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem);
+- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!");
++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation,
++ "allocation failed for dictionary content");
+ ZSTD_memcpy(dictBuffer, dict, dictSize);
+- cctx->localDict.dictBuffer = dictBuffer;
+- cctx->localDict.dict = dictBuffer;
++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */
++ cctx->localDict.dict = dictBuffer; /* read-only reference */
+ }
+ cctx->localDict.dictSize = dictSize;
+ cctx->localDict.dictContentType = dictContentType;
+@@ -1149,8 +1293,9 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
+ if ( (reset == ZSTD_reset_parameters)
+ || (reset == ZSTD_reset_session_and_parameters) ) {
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+- "Can't reset parameters only when not in init stage.");
++ "Reset parameters is only possible during init stage.");
+ ZSTD_clearAllDicts(cctx);
++ ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx));
+ return ZSTD_CCtxParams_reset(&cctx->requestedParams);
+ }
+ return 0;
+@@ -1247,7 +1392,8 @@ static ZSTD_compressionParameters
+ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
+ unsigned long long srcSize,
+ size_t dictSize,
+- ZSTD_cParamMode_e mode)
++ ZSTD_cParamMode_e mode,
++ ZSTD_paramSwitch_e useRowMatchFinder)
+ {
+ const U64 minSrcSize = 513; /* (1<<9) + 1 */
+ const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
+@@ -1281,8 +1427,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
+ }
+
+ /* resize windowLog if input is small enough, to use less memory */
+- if ( (srcSize < maxWindowResize)
+- && (dictSize < maxWindowResize) ) {
++ if ( (srcSize <= maxWindowResize)
++ && (dictSize <= maxWindowResize) ) {
+ U32 const tSize = (U32)(srcSize + dictSize);
+ static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN;
+ U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN :
+@@ -1300,6 +1446,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
+ if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
+ cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */
+
++ /* We can't use more than 32 bits of hash in total, so that means that we require:
++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32
++ */
++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) {
++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS;
++ if (cPar.hashLog > maxShortCacheHashLog) {
++ cPar.hashLog = maxShortCacheHashLog;
++ }
++ if (cPar.chainLog > maxShortCacheHashLog) {
++ cPar.chainLog = maxShortCacheHashLog;
++ }
++ }
++
++
++ /* At this point, we aren't 100% sure if we are using the row match finder.
++ * Unless it is explicitly disabled, conservatively assume that it is enabled.
++ * In this case it will only be disabled for small sources, so shrinking the
++ * hash log a little bit shouldn't result in any ratio loss.
++ */
++ if (useRowMatchFinder == ZSTD_ps_auto)
++ useRowMatchFinder = ZSTD_ps_enable;
++
++ /* We can't hash more than 32-bits in total. So that means that we require:
++ * (hashLog - rowLog + 8) <= 32
++ */
++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) {
++ /* Switch to 32-entry rows if searchLog is 5 (or more) */
++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6);
++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS;
++ U32 const maxHashLog = maxRowHashLog + rowLog;
++ assert(cPar.hashLog >= rowLog);
++ if (cPar.hashLog > maxHashLog) {
++ cPar.hashLog = maxHashLog;
++ }
++ }
++
+ return cPar;
+ }
+
+@@ -1310,7 +1492,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
+ {
+ cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */
+ if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN;
+- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown);
++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto);
+ }
+
+ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode);
+@@ -1341,7 +1523,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+ ZSTD_overrideCParams(&cParams, &CCtxParams->cParams);
+ assert(!ZSTD_checkCParams(cParams));
+ /* srcSizeHint == 0 means 0 */
+- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode);
++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder);
+ }
+
+ static size_t
+@@ -1370,7 +1552,7 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
+ + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
+ + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+ size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)
+- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16))
++ ? ZSTD_cwksp_aligned_alloc_size(hSize)
+ : 0;
+ size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
+ ? optPotentialSpace
+@@ -1386,6 +1568,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
+ return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
+ }
+
++/* Helper function for calculating memory requirements.
++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */
++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) {
++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4;
++ return blockSize / divider;
++}
++
+ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+ const ZSTD_compressionParameters* cParams,
+ const ldmParams_t* ldmParams,
+@@ -1393,12 +1582,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+ const ZSTD_paramSwitch_e useRowMatchFinder,
+ const size_t buffInSize,
+ const size_t buffOutSize,
+- const U64 pledgedSrcSize)
++ const U64 pledgedSrcSize,
++ int useSequenceProducer,
++ size_t maxBlockSize)
+ {
+ size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize);
+- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+- U32 const divider = (cParams->minMatch==3) ? 3 : 4;
+- size_t const maxNbSeq = blockSize / divider;
++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize);
++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer);
+ size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
+ + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef))
+ + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
+@@ -1417,6 +1607,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+
+ size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;
+
++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
++ size_t const externalSeqSpace = useSequenceProducer
++ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence))
++ : 0;
++
+ size_t const neededSpace =
+ cctxSpace +
+ entropySpace +
+@@ -1425,7 +1620,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+ ldmSeqSpace +
+ matchStateSize +
+ tokenSpace +
+- bufferSpace;
++ bufferSpace +
++ externalSeqSpace;
+
+ DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);
+ return neededSpace;
+@@ -1443,7 +1639,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+ * be needed. However, we still allocate two 0-sized buffers, which can
+ * take space under ASAN. */
+ return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+- &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
++ &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
+ }
+
+ size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
+@@ -1493,7 +1689,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+ RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
+ { ZSTD_compressionParameters const cParams =
+ ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
+- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog);
+ size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered)
+ ? ((size_t)1 << cParams.windowLog) + blockSize
+ : 0;
+@@ -1504,7 +1700,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+
+ return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+ &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
+- ZSTD_CONTENTSIZE_UNKNOWN);
++ ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
+ }
+ }
+
+@@ -1637,6 +1833,19 @@ typedef enum {
+ ZSTD_resetTarget_CCtx
+ } ZSTD_resetTarget_e;
+
++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */
++static U64 ZSTD_bitmix(U64 val, U64 len) {
++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24);
++ val *= 0x9FB21C651E98DF25ULL;
++ val ^= (val >> 35) + len ;
++ val *= 0x9FB21C651E98DF25ULL;
++ return val ^ (val >> 28);
++}
++
++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */
++static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) {
++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4);
++}
+
+ static size_t
+ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+@@ -1664,6 +1873,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ }
+
+ ms->hashLog3 = hashLog3;
++ ms->lazySkipping = 0;
+
+ ZSTD_invalidateMatchState(ms);
+
+@@ -1685,6 +1895,27 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ ZSTD_cwksp_clean_tables(ws);
+ }
+
++ if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
++ /* Row match finder needs an additional table of hashes ("tags") */
++ size_t const tagTableSize = hSize;
++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use
++ * 0 when we reset a Cdict */
++ if(forWho == ZSTD_resetTarget_CCtx) {
++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize);
++ ZSTD_advanceHashSalt(ms);
++ } else {
++ /* When we are not salting we want to always memset the memory */
++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
++ ZSTD_memset(ms->tagTable, 0, tagTableSize);
++ ms->hashSalt = 0;
++ }
++ { /* Switch to 32-entry rows if searchLog is 5 (or more) */
++ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
++ assert(cParams->hashLog >= rowLog);
++ ms->rowHashLog = cParams->hashLog - rowLog;
++ }
++ }
++
+ /* opt parser space */
+ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
+ DEBUGLOG(4, "reserving optimal parser space");
+@@ -1696,19 +1927,6 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+ }
+
+- if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+- { /* Row match finder needs an additional table of hashes ("tags") */
+- size_t const tagTableSize = hSize*sizeof(U16);
+- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
+- }
+- { /* Switch to 32-entry rows if searchLog is 5 (or more) */
+- U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
+- assert(cParams->hashLog >= rowLog);
+- ms->rowHashLog = cParams->hashLog - rowLog;
+- }
+- }
+-
+ ms->cParams = *cParams;
+
+ RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
+@@ -1768,6 +1986,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ assert(params->useRowMatchFinder != ZSTD_ps_auto);
+ assert(params->useBlockSplitter != ZSTD_ps_auto);
+ assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
++ assert(params->maxBlockSize != 0);
+ if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+ /* Adjust long distance matching parameters */
+ ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams);
+@@ -1776,9 +1995,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ }
+
+ { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
+- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4;
+- size_t const maxNbSeq = blockSize / divider;
++ size_t const blockSize = MIN(params->maxBlockSize, windowSize);
++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useSequenceProducer);
+ size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
+ ? ZSTD_compressBound(blockSize) + 1
+ : 0;
+@@ -1795,7 +2013,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ size_t const neededSpace =
+ ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+ &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
+- buffInSize, buffOutSize, pledgedSrcSize);
++ buffInSize, buffOutSize, pledgedSrcSize, params->useSequenceProducer, params->maxBlockSize);
+ int resizeWorkspace;
+
+ FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
+@@ -1838,6 +2056,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+
+ /* init params */
+ zc->blockState.matchState.cParams = params->cParams;
++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable;
+ zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+ zc->consumedSrcSize = 0;
+ zc->producedCSize = 0;
+@@ -1854,13 +2073,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+
+ ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
+
++ FORWARD_IF_ERROR(ZSTD_reset_matchState(
++ &zc->blockState.matchState,
++ ws,
++ &params->cParams,
++ params->useRowMatchFinder,
++ crp,
++ needsIndexReset,
++ ZSTD_resetTarget_CCtx), "");
++
++ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
++
++ /* ldm hash table */
++ if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
++ /* TODO: avoid memset? */
++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
++ zc->maxNbLdmSequences = maxNbLdmSeq;
++
++ ZSTD_window_init(&zc->ldmState.window);
++ zc->ldmState.loadedDictEnd = 0;
++ }
++
++ /* reserve space for block-level external sequences */
++ if (params->useSequenceProducer) {
++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
++ zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq;
++ zc->externalMatchCtx.seqBuffer =
++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence));
++ }
++
++ /* buffers */
++
+ /* ZSTD_wildcopy() is used to copy into the literals buffer,
+ * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
+ */
+ zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH);
+ zc->seqStore.maxNbLit = blockSize;
+
+- /* buffers */
+ zc->bufferedPolicy = zbuff;
+ zc->inBuffSize = buffInSize;
+ zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize);
+@@ -1883,32 +2135,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+ zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+ zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
+-
+- FORWARD_IF_ERROR(ZSTD_reset_matchState(
+- &zc->blockState.matchState,
+- ws,
+- &params->cParams,
+- params->useRowMatchFinder,
+- crp,
+- needsIndexReset,
+- ZSTD_resetTarget_CCtx), "");
+-
+- /* ldm hash table */
+- if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+- /* TODO: avoid memset? */
+- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
+- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
+- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
+- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
+- zc->maxNbLdmSequences = maxNbLdmSeq;
+-
+- ZSTD_window_init(&zc->ldmState.window);
+- zc->ldmState.loadedDictEnd = 0;
+- }
+
+ DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
+- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace));
++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace));
+
+ zc->initialized = 1;
+
+@@ -1980,7 +2209,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
+ }
+
+ params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize,
+- cdict->dictContentSize, ZSTD_cpm_attachDict);
++ cdict->dictContentSize, ZSTD_cpm_attachDict,
++ params.useRowMatchFinder);
+ params.cParams.windowLog = windowLog;
+ params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */
+ FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
+@@ -2019,6 +2249,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
+ return 0;
+ }
+
++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize,
++ ZSTD_compressionParameters const* cParams) {
++ if (ZSTD_CDictIndicesAreTagged(cParams)){
++ /* Remove tags from the CDict table if they are present.
++ * See docs on "short cache" in zstd_compress_internal.h for context. */
++ size_t i;
++ for (i = 0; i < tableSize; i++) {
++ U32 const taggedIndex = src[i];
++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS;
++ dst[i] = index;
++ }
++ } else {
++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32));
++ }
++}
++
+ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
+ const ZSTD_CDict* cdict,
+ ZSTD_CCtx_params params,
+@@ -2054,21 +2300,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
+ : 0;
+ size_t const hSize = (size_t)1 << cdict_cParams->hashLog;
+
+- ZSTD_memcpy(cctx->blockState.matchState.hashTable,
+- cdict->matchState.hashTable,
+- hSize * sizeof(U32));
++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable,
++ cdict->matchState.hashTable,
++ hSize, cdict_cParams);
++
+ /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */
+ if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) {
+- ZSTD_memcpy(cctx->blockState.matchState.chainTable,
+- cdict->matchState.chainTable,
+- chainSize * sizeof(U32));
++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable,
++ cdict->matchState.chainTable,
++ chainSize, cdict_cParams);
+ }
+ /* copy tag table */
+ if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) {
+- size_t const tagTableSize = hSize*sizeof(U16);
++ size_t const tagTableSize = hSize;
+ ZSTD_memcpy(cctx->blockState.matchState.tagTable,
+- cdict->matchState.tagTable,
+- tagTableSize);
++ cdict->matchState.tagTable,
++ tagTableSize);
++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt;
+ }
+ }
+
+@@ -2147,6 +2395,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
+ params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter;
+ params.ldmParams = srcCCtx->appliedParams.ldmParams;
+ params.fParams = fParams;
++ params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize;
+ ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize,
+ /* loadedDictSize */ 0,
+ ZSTDcrp_leaveDirty, zbuff);
+@@ -2294,7 +2543,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par
+
+ /* See doc/zstd_compression_format.md for detailed format description */
+
+-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+ {
+ const seqDef* const sequences = seqStorePtr->sequencesStart;
+ BYTE* const llCodeTable = seqStorePtr->llCode;
+@@ -2302,18 +2551,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+ BYTE* const mlCodeTable = seqStorePtr->mlCode;
+ U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ U32 u;
++ int longOffsets = 0;
+ assert(nbSeq <= seqStorePtr->maxNbSeq);
+ for (u=0; u<nbSeq; u++) {
+ U32 const llv = sequences[u].litLength;
++ U32 const ofCode = ZSTD_highbit32(sequences[u].offBase);
+ U32 const mlv = sequences[u].mlBase;
+ llCodeTable[u] = (BYTE)ZSTD_LLcode(llv);
+- ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offBase);
++ ofCodeTable[u] = (BYTE)ofCode;
+ mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv);
++ assert(!(MEM_64bits() && ofCode >= STREAM_ACCUMULATOR_MIN));
++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN)
++ longOffsets = 1;
+ }
+ if (seqStorePtr->longLengthType==ZSTD_llt_literalLength)
+ llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
+ if (seqStorePtr->longLengthType==ZSTD_llt_matchLength)
+ mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
++ return longOffsets;
+ }
+
+ /* ZSTD_useTargetCBlockSize():
+@@ -2347,6 +2602,7 @@ typedef struct {
+ U32 MLtype;
+ size_t size;
+ size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */
++ int longOffsets;
+ } ZSTD_symbolEncodingTypeStats_t;
+
+ /* ZSTD_buildSequencesStatistics():
+@@ -2357,11 +2613,13 @@ typedef struct {
+ * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32)
+ */
+ static ZSTD_symbolEncodingTypeStats_t
+-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
+- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy,
+- BYTE* dst, const BYTE* const dstEnd,
+- ZSTD_strategy strategy, unsigned* countWorkspace,
+- void* entropyWorkspace, size_t entropyWkspSize) {
++ZSTD_buildSequencesStatistics(
++ const seqStore_t* seqStorePtr, size_t nbSeq,
++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy,
++ BYTE* dst, const BYTE* const dstEnd,
++ ZSTD_strategy strategy, unsigned* countWorkspace,
++ void* entropyWorkspace, size_t entropyWkspSize)
++{
+ BYTE* const ostart = dst;
+ const BYTE* const oend = dstEnd;
+ BYTE* op = ostart;
+@@ -2375,7 +2633,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
+
+ stats.lastCountSize = 0;
+ /* convert length/distances into codes */
+- ZSTD_seqToCodes(seqStorePtr);
++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr);
+ assert(op <= oend);
+ assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */
+ /* build CTable for Literal Lengths */
+@@ -2480,22 +2738,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
+ */
+ #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20
+ MEM_STATIC size_t
+-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
+- const ZSTD_entropyCTables_t* prevEntropy,
+- ZSTD_entropyCTables_t* nextEntropy,
+- const ZSTD_CCtx_params* cctxParams,
+- void* dst, size_t dstCapacity,
+- void* entropyWorkspace, size_t entropyWkspSize,
+- const int bmi2)
++ZSTD_entropyCompressSeqStore_internal(
++ const seqStore_t* seqStorePtr,
++ const ZSTD_entropyCTables_t* prevEntropy,
++ ZSTD_entropyCTables_t* nextEntropy,
++ const ZSTD_CCtx_params* cctxParams,
++ void* dst, size_t dstCapacity,
++ void* entropyWorkspace, size_t entropyWkspSize,
++ const int bmi2)
+ {
+- const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+ ZSTD_strategy const strategy = cctxParams->cParams.strategy;
+ unsigned* count = (unsigned*)entropyWorkspace;
+ FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable;
+ FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable;
+ FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable;
+ const seqDef* const sequences = seqStorePtr->sequencesStart;
+- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ const BYTE* const ofCodeTable = seqStorePtr->ofCode;
+ const BYTE* const llCodeTable = seqStorePtr->llCode;
+ const BYTE* const mlCodeTable = seqStorePtr->mlCode;
+@@ -2503,29 +2761,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
+ BYTE* const oend = ostart + dstCapacity;
+ BYTE* op = ostart;
+ size_t lastCountSize;
++ int longOffsets = 0;
+
+ entropyWorkspace = count + (MaxSeq + 1);
+ entropyWkspSize -= (MaxSeq + 1) * sizeof(*count);
+
+- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq);
++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity);
+ ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+ assert(entropyWkspSize >= HUF_WORKSPACE_SIZE);
+
+ /* Compress literals */
+ { const BYTE* const literals = seqStorePtr->litStart;
+- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart;
++ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
++ size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
+ /* Base suspicion of uncompressibility on ratio of literals to sequences */
+ unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
+ size_t const litSize = (size_t)(seqStorePtr->lit - literals);
++
+ size_t const cSize = ZSTD_compressLiterals(
+- &prevEntropy->huf, &nextEntropy->huf,
+- cctxParams->cParams.strategy,
+- ZSTD_literalsCompressionIsDisabled(cctxParams),
+ op, dstCapacity,
+ literals, litSize,
+ entropyWorkspace, entropyWkspSize,
+- bmi2, suspectUncompressible);
++ &prevEntropy->huf, &nextEntropy->huf,
++ cctxParams->cParams.strategy,
++ ZSTD_literalsCompressionIsDisabled(cctxParams),
++ suspectUncompressible, bmi2);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
+ assert(cSize <= dstCapacity);
+ op += cSize;
+@@ -2551,11 +2811,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
+ ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
+ return (size_t)(op - ostart);
+ }
+- {
+- ZSTD_symbolEncodingTypeStats_t stats;
+- BYTE* seqHead = op++;
++ { BYTE* const seqHead = op++;
+ /* build stats for sequences */
+- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
++ const ZSTD_symbolEncodingTypeStats_t stats =
++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
+ &prevEntropy->fse, &nextEntropy->fse,
+ op, oend,
+ strategy, count,
+@@ -2564,6 +2823,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
+ *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2));
+ lastCountSize = stats.lastCountSize;
+ op += stats.size;
++ longOffsets = stats.longOffsets;
+ }
+
+ { size_t const bitstreamSize = ZSTD_encodeSequences(
+@@ -2598,14 +2858,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
+ }
+
+ MEM_STATIC size_t
+-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
+- const ZSTD_entropyCTables_t* prevEntropy,
+- ZSTD_entropyCTables_t* nextEntropy,
+- const ZSTD_CCtx_params* cctxParams,
+- void* dst, size_t dstCapacity,
+- size_t srcSize,
+- void* entropyWorkspace, size_t entropyWkspSize,
+- int bmi2)
++ZSTD_entropyCompressSeqStore(
++ const seqStore_t* seqStorePtr,
++ const ZSTD_entropyCTables_t* prevEntropy,
++ ZSTD_entropyCTables_t* nextEntropy,
++ const ZSTD_CCtx_params* cctxParams,
++ void* dst, size_t dstCapacity,
++ size_t srcSize,
++ void* entropyWorkspace, size_t entropyWkspSize,
++ int bmi2)
+ {
+ size_t const cSize = ZSTD_entropyCompressSeqStore_internal(
+ seqStorePtr, prevEntropy, nextEntropy, cctxParams,
+@@ -2615,15 +2876,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
+ /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block.
+ * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block.
+ */
+- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity))
++ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) {
++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity);
+ return 0; /* block not compressed */
++ }
+ FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed");
+
+ /* Check compressibility */
+ { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
+ if (cSize >= maxCSize) return 0; /* block not compressed */
+ }
+- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);
++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);
++    /* libzstd decoders before v1.5.4 are not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly.
++     * This restriction is already indirectly fulfilled by respecting the ZSTD_minGain() condition above.
++ */
++ assert(cSize < ZSTD_BLOCKSIZE_MAX);
+ return cSize;
+ }
+
+@@ -2718,6 +2985,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr)
+ ssPtr->longLengthType = ZSTD_llt_none;
+ }
+
++/* ZSTD_postProcessSequenceProducerResult() :
++ * Validates and post-processes sequences obtained through the external matchfinder API:
++ * - Checks whether nbExternalSeqs represents an error condition.
++ * - Appends a block delimiter to outSeqs if one is not already present.
++ * See zstd.h for context regarding block delimiters.
++ * Returns the number of sequences after post-processing, or an error code. */
++static size_t ZSTD_postProcessSequenceProducerResult(
++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize
++) {
++ RETURN_ERROR_IF(
++ nbExternalSeqs > outSeqsCapacity,
++ sequenceProducer_failed,
++ "External sequence producer returned error code %lu",
++ (unsigned long)nbExternalSeqs
++ );
++
++ RETURN_ERROR_IF(
++ nbExternalSeqs == 0 && srcSize > 0,
++ sequenceProducer_failed,
++ "Got zero sequences from external sequence producer for a non-empty src buffer!"
++ );
++
++ if (srcSize == 0) {
++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence));
++ return 1;
++ }
++
++ {
++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1];
++
++ /* We can return early if lastSeq is already a block delimiter. */
++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) {
++ return nbExternalSeqs;
++ }
++
++ /* This error condition is only possible if the external matchfinder
++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */
++ RETURN_ERROR_IF(
++ nbExternalSeqs == outSeqsCapacity,
++ sequenceProducer_failed,
++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!"
++ );
++
++ /* lastSeq is not a block delimiter, so we need to append one. */
++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence));
++ return nbExternalSeqs + 1;
++ }
++}
++
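/* Illustrative aside, not part of the upstream change: the delimiter-append rule
 * documented above, reduced to a standalone helper over a local Sequence type
 * (a stand-in for ZSTD_Sequence, so this compiles on its own). It assumes the
 * caller sized the buffer with srcSize/MINMATCH + 1 entries, which is what
 * guarantees room for the extra all-zero terminator. */
#include <stddef.h>
#include <string.h>

typedef struct { unsigned offset, litLength, matchLength, rep; } Sequence;

static size_t appendBlockDelimiter(Sequence* seqs, size_t nbSeqs, size_t capacity)
{
    if (nbSeqs > 0
        && seqs[nbSeqs-1].offset == 0
        && seqs[nbSeqs-1].matchLength == 0)
        return nbSeqs;                      /* already terminated by a delimiter */
    if (nbSeqs >= capacity)
        return (size_t)-1;                  /* caller under-sized the buffer */
    memset(&seqs[nbSeqs], 0, sizeof(Sequence));  /* all-zero entry == block delimiter */
    return nbSeqs + 1;
}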
++/* ZSTD_fastSequenceLengthSum() :
++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*.
++ * Similar to another function in zstd_compress.c (determine_blockSize),
++ * except it doesn't check for a block delimiter to end summation.
++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P).
++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */
++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) {
++ size_t matchLenSum, litLenSum, i;
++ matchLenSum = 0;
++ litLenSum = 0;
++ for (i = 0; i < seqBufSize; i++) {
++ litLenSum += seqBuf[i].litLength;
++ matchLenSum += seqBuf[i].matchLength;
++ }
++ return litLenSum + matchLenSum;
++}
++
+ typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e;
+
+ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+@@ -2727,7 +3060,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+ assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+ /* Assert that we have correctly flushed the ctx params into the ms's copy */
+ ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams);
+- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
++    /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2, so to compensate we add an
++     * additional 1. We need to revisit and change this logic to be more consistent */
++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) {
+ if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) {
+ ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize);
+ } else {
+@@ -2763,6 +3098,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+ }
+ if (zc->externSeqStore.pos < zc->externSeqStore.size) {
+ assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable);
++
++ /* External matchfinder + LDM is technically possible, just not implemented yet.
++ * We need to revisit soon and implement it. */
++ RETURN_ERROR_IF(
++ zc->appliedParams.useSequenceProducer,
++ parameter_combination_unsupported,
++ "Long-distance matching with external sequence producer enabled is not currently supported."
++ );
++
+ /* Updates ldmSeqStore.pos */
+ lastLLSize =
+ ZSTD_ldm_blockCompress(&zc->externSeqStore,
+@@ -2774,6 +3118,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+ } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {
+ rawSeqStore_t ldmSeqStore = kNullRawSeqStore;
+
++ /* External matchfinder + LDM is technically possible, just not implemented yet.
++ * We need to revisit soon and implement it. */
++ RETURN_ERROR_IF(
++ zc->appliedParams.useSequenceProducer,
++ parameter_combination_unsupported,
++ "Long-distance matching with external sequence producer enabled is not currently supported."
++ );
++
+ ldmSeqStore.seq = zc->ldmSequences;
+ ldmSeqStore.capacity = zc->maxNbLdmSequences;
+ /* Updates ldmSeqStore.size */
+@@ -2788,7 +3140,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+ zc->appliedParams.useRowMatchFinder,
+ src, srcSize);
+ assert(ldmSeqStore.pos == ldmSeqStore.size);
+- } else { /* not long range mode */
++ } else if (zc->appliedParams.useSequenceProducer) {
++ assert(
++ zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize)
++ );
++ assert(zc->externalMatchCtx.mFinder != NULL);
++
++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog;
++
++ size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)(
++ zc->externalMatchCtx.mState,
++ zc->externalMatchCtx.seqBuffer,
++ zc->externalMatchCtx.seqBufferCapacity,
++ src, srcSize,
++ NULL, 0, /* dict and dictSize, currently not supported */
++ zc->appliedParams.compressionLevel,
++ windowSize
++ );
++
++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult(
++ zc->externalMatchCtx.seqBuffer,
++ nbExternalSeqs,
++ zc->externalMatchCtx.seqBufferCapacity,
++ srcSize
++ );
++
++ /* Return early if there is no error, since we don't need to worry about last literals */
++ if (!ZSTD_isError(nbPostProcessedSeqs)) {
++ ZSTD_sequencePosition seqPos = {0,0,0};
++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs);
++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!");
++ FORWARD_IF_ERROR(
++ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(
++ zc, &seqPos,
++ zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs,
++ src, srcSize,
++ zc->appliedParams.searchForExternalRepcodes
++ ),
++ "Failed to copy external sequences to seqStore!"
++ );
++ ms->ldmSeqStore = NULL;
++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs);
++ return ZSTDbss_compress;
++ }
++
++ /* Propagate the error if fallback is disabled */
++ if (!zc->appliedParams.enableMatchFinderFallback) {
++ return nbPostProcessedSeqs;
++ }
++
++ /* Fallback to software matchfinder */
++ { ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
++ zc->appliedParams.useRowMatchFinder,
++ dictMode);
++ ms->ldmSeqStore = NULL;
++ DEBUGLOG(
++ 5,
++ "External sequence producer returned error code %lu. Falling back to internal parser.",
++ (unsigned long)nbExternalSeqs
++ );
++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
++ } }
++ } else { /* not long range mode and no external matchfinder */
+ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
+ zc->appliedParams.useRowMatchFinder,
+ dictMode);
+@@ -2849,7 +3262,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
+ /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
+ so we provide seqStoreSeqs[i].offset - 1 */
+ ZSTD_updateRep(updatedRepcodes.rep,
+- seqStoreSeqs[i].offBase - 1,
++ seqStoreSeqs[i].offBase,
+ seqStoreSeqs[i].litLength == 0);
+ literalsRead += outSeqs[i].litLength;
+ }
+@@ -2865,6 +3278,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
+ zc->seqCollector.seqIndex += seqStoreSeqSize;
+ }
+
++size_t ZSTD_sequenceBound(size_t srcSize) {
++ return (srcSize / ZSTD_MINMATCH_MIN) + 1;
++}
++
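/* Illustrative aside, not part of the upstream change: a userspace sketch of how
 * ZSTD_sequenceBound() is meant to be used, assuming the libzstd v1.5.5
 * experimental API (ZSTD_STATIC_LINKING_ONLY), not the kernel wrappers. */
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>
#include <stdlib.h>

static size_t collect_sequences(ZSTD_CCtx* cctx,
                                const void* src, size_t srcSize,
                                ZSTD_Sequence** outSeqs)
{
    size_t const capacity = ZSTD_sequenceBound(srcSize);   /* srcSize/3 + 1 entries */
    ZSTD_Sequence* const seqs = malloc(capacity * sizeof(ZSTD_Sequence));
    size_t nbSeqs;
    if (seqs == NULL) return 0;
    nbSeqs = ZSTD_generateSequences(cctx, seqs, capacity, src, srcSize);
    if (ZSTD_isError(nbSeqs)) { free(seqs); return 0; }
    *outSeqs = seqs;
    return nbSeqs;
}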
+ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+ size_t outSeqsSize, const void* src, size_t srcSize)
+ {
+@@ -2910,19 +3327,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) {
+ const size_t unrollMask = unrollSize - 1;
+ const size_t prefixLength = length & unrollMask;
+ size_t i;
+- size_t u;
+ if (length == 1) return 1;
+ /* Check if prefix is RLE first before using unrolled loop */
+ if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) {
+ return 0;
+ }
+ for (i = prefixLength; i != length; i += unrollSize) {
++ size_t u;
+ for (u = 0; u < unrollSize; u += sizeof(size_t)) {
+ if (MEM_readST(ip + i + u) != valueST) {
+ return 0;
+- }
+- }
+- }
++ } } }
+ return 1;
+ }
+
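/* Illustrative aside, not part of the upstream change: the same RLE question
 * answered the straightforward way, without the word-at-a-time unrolling the
 * function above uses for speed. */
#include <string.h>
#include <stddef.h>

static int isRLE_simple(const unsigned char* src, size_t length)
{
    /* all bytes equal  <=>  src[i] == src[i+1] for every i */
    return length <= 1 || memcmp(src, src + 1, length - 1) == 0;
}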
+@@ -2938,7 +3353,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore)
+ return nbSeqs < 4 && nbLits < 10;
+ }
+
+-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs)
++static void
++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs)
+ {
+ ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock;
+ bs->prevCBlock = bs->nextCBlock;
+@@ -2946,7 +3362,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c
+ }
+
+ /* Writes the block header */
+-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) {
++static void
++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock)
++{
+ U32 const cBlockHeader = cSize == 1 ?
+ lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) :
+ lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
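/* Illustrative aside, not part of the upstream change: the 24-bit block header
 * layout that writeBlockHeader() emits, per the zstd frame format (RFC 8878):
 * bit 0 = Last_Block, bits 1-2 = Block_Type, bits 3-23 = size. */
#include <stdint.h>

static uint32_t packBlockHeader(uint32_t lastBlock, uint32_t blockType, uint32_t size)
{
    return lastBlock + (blockType << 1) + (size << 3);
}
/* e.g. packBlockHeader(1, 2, 200) == 0x645 : last block, bt_compressed, 200 bytes */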
+@@ -2959,13 +3377,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB
+ * Stores literals block type (raw, rle, compressed, repeat) and
+ * huffman description table to hufMetadata.
+ * Requires ENTROPY_WORKSPACE_SIZE workspace
+- * @return : size of huffman description table or error code */
+-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize,
+- const ZSTD_hufCTables_t* prevHuf,
+- ZSTD_hufCTables_t* nextHuf,
+- ZSTD_hufCTablesMetadata_t* hufMetadata,
+- const int literalsCompressionIsDisabled,
+- void* workspace, size_t wkspSize)
++ * @return : size of huffman description table, or an error code
++ */
++static size_t
++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize,
++ const ZSTD_hufCTables_t* prevHuf,
++ ZSTD_hufCTables_t* nextHuf,
++ ZSTD_hufCTablesMetadata_t* hufMetadata,
++ const int literalsCompressionIsDisabled,
++ void* workspace, size_t wkspSize,
++ int hufFlags)
+ {
+ BYTE* const wkspStart = (BYTE*)workspace;
+ BYTE* const wkspEnd = wkspStart + wkspSize;
+@@ -2973,9 +3394,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
+ unsigned* const countWksp = (unsigned*)workspace;
+ const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned);
+ BYTE* const nodeWksp = countWkspStart + countWkspSize;
+- const size_t nodeWkspSize = wkspEnd-nodeWksp;
++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp);
+ unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
+- unsigned huffLog = HUF_TABLELOG_DEFAULT;
++ unsigned huffLog = LitHufLog;
+ HUF_repeat repeat = prevHuf->repeatMode;
+ DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize);
+
+@@ -2990,73 +3411,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
+
+ /* small ? don't even attempt compression (speed opt) */
+ #ifndef COMPRESS_LITERALS_SIZE_MIN
+-#define COMPRESS_LITERALS_SIZE_MIN 63
++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */
+ #endif
+ { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
+ if (srcSize <= minLitSize) {
+ DEBUGLOG(5, "set_basic - too small");
+ hufMetadata->hType = set_basic;
+ return 0;
+- }
+- }
++ } }
+
+ /* Scan input and build symbol stats */
+- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize);
++ { size_t const largest =
++ HIST_count_wksp (countWksp, &maxSymbolValue,
++ (const BYTE*)src, srcSize,
++ workspace, wkspSize);
+ FORWARD_IF_ERROR(largest, "HIST_count_wksp failed");
+ if (largest == srcSize) {
++ /* only one literal symbol */
+ DEBUGLOG(5, "set_rle");
+ hufMetadata->hType = set_rle;
+ return 0;
+ }
+ if (largest <= (srcSize >> 7)+4) {
++ /* heuristic: likely not compressible */
+ DEBUGLOG(5, "set_basic - no gain");
+ hufMetadata->hType = set_basic;
+ return 0;
+- }
+- }
++ } }
+
+ /* Validate the previous Huffman table */
+- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
++ if (repeat == HUF_repeat_check
++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
+ repeat = HUF_repeat_none;
+ }
+
+ /* Build Huffman Tree */
+ ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable));
+- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags);
++ assert(huffLog <= LitHufLog);
+ { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp,
+ maxSymbolValue, huffLog,
+ nodeWksp, nodeWkspSize);
+ FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp");
+ huffLog = (U32)maxBits;
+- { /* Build and write the CTable */
+- size_t const newCSize = HUF_estimateCompressedSize(
+- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
+- size_t const hSize = HUF_writeCTable_wksp(
+- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
+- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog,
+- nodeWksp, nodeWkspSize);
+- /* Check against repeating the previous CTable */
+- if (repeat != HUF_repeat_none) {
+- size_t const oldCSize = HUF_estimateCompressedSize(
+- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
+- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
+- DEBUGLOG(5, "set_repeat - smaller");
+- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+- hufMetadata->hType = set_repeat;
+- return 0;
+- }
+- }
+- if (newCSize + hSize >= srcSize) {
+- DEBUGLOG(5, "set_basic - no gains");
++ }
++ { /* Build and write the CTable */
++ size_t const newCSize = HUF_estimateCompressedSize(
++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
++ size_t const hSize = HUF_writeCTable_wksp(
++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog,
++ nodeWksp, nodeWkspSize);
++ /* Check against repeating the previous CTable */
++ if (repeat != HUF_repeat_none) {
++ size_t const oldCSize = HUF_estimateCompressedSize(
++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
++ DEBUGLOG(5, "set_repeat - smaller");
+ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+- hufMetadata->hType = set_basic;
++ hufMetadata->hType = set_repeat;
+ return 0;
+- }
+- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
+- hufMetadata->hType = set_compressed;
+- nextHuf->repeatMode = HUF_repeat_check;
+- return hSize;
++ } }
++ if (newCSize + hSize >= srcSize) {
++ DEBUGLOG(5, "set_basic - no gains");
++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
++ hufMetadata->hType = set_basic;
++ return 0;
+ }
++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
++ hufMetadata->hType = set_compressed;
++ nextHuf->repeatMode = HUF_repeat_check;
++ return hSize;
+ }
+ }
+
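/* Illustrative aside, not part of the upstream change: the literal-block type
 * decision above, reduced to its two histogram heuristics over a plain
 * 256-entry count table (set_rle / set_basic stand-ins). */
#include <stddef.h>

typedef enum { LIT_RLE, LIT_BASIC, LIT_TRY_HUFFMAN } lit_mode;

static lit_mode chooseLiteralMode(const size_t count[256], size_t srcSize)
{
    size_t largest = 0, i;
    for (i = 0; i < 256; i++) if (count[i] > largest) largest = count[i];
    if (largest == srcSize) return LIT_RLE;               /* a single symbol repeats */
    if (largest <= (srcSize >> 7) + 4) return LIT_BASIC;  /* near-flat: unlikely to gain */
    return LIT_TRY_HUFFMAN;                               /* worth building a Huffman table */
}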
+@@ -3066,8 +3491,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
+ * and updates nextEntropy to the appropriate repeatMode.
+ */
+ static ZSTD_symbolEncodingTypeStats_t
+-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) {
+- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0};
++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy)
++{
++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0};
+ nextEntropy->litlength_repeatMode = FSE_repeat_none;
+ nextEntropy->offcode_repeatMode = FSE_repeat_none;
+ nextEntropy->matchlength_repeatMode = FSE_repeat_none;
+@@ -3078,16 +3504,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) {
+ * Builds entropy for the sequences.
+ * Stores symbol compression modes and fse table to fseMetadata.
+ * Requires ENTROPY_WORKSPACE_SIZE wksp.
+- * @return : size of fse tables or error code */
+-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr,
+- const ZSTD_fseCTables_t* prevEntropy,
+- ZSTD_fseCTables_t* nextEntropy,
+- const ZSTD_CCtx_params* cctxParams,
+- ZSTD_fseCTablesMetadata_t* fseMetadata,
+- void* workspace, size_t wkspSize)
++ * @return : size of fse tables or error code */
++static size_t
++ZSTD_buildBlockEntropyStats_sequences(
++ const seqStore_t* seqStorePtr,
++ const ZSTD_fseCTables_t* prevEntropy,
++ ZSTD_fseCTables_t* nextEntropy,
++ const ZSTD_CCtx_params* cctxParams,
++ ZSTD_fseCTablesMetadata_t* fseMetadata,
++ void* workspace, size_t wkspSize)
+ {
+ ZSTD_strategy const strategy = cctxParams->cParams.strategy;
+- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ BYTE* const ostart = fseMetadata->fseTablesBuffer;
+ BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer);
+ BYTE* op = ostart;
+@@ -3114,23 +3542,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr,
+ /* ZSTD_buildBlockEntropyStats() :
+ * Builds entropy for the block.
+ * Requires workspace size ENTROPY_WORKSPACE_SIZE
+- *
+- * @return : 0 on success or error code
++ * @return : 0 on success, or an error code
++ * Note : also employed in superblock
+ */
+-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
+- const ZSTD_entropyCTables_t* prevEntropy,
+- ZSTD_entropyCTables_t* nextEntropy,
+- const ZSTD_CCtx_params* cctxParams,
+- ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+- void* workspace, size_t wkspSize)
+-{
+- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart;
++size_t ZSTD_buildBlockEntropyStats(
++ const seqStore_t* seqStorePtr,
++ const ZSTD_entropyCTables_t* prevEntropy,
++ ZSTD_entropyCTables_t* nextEntropy,
++ const ZSTD_CCtx_params* cctxParams,
++ ZSTD_entropyCTablesMetadata_t* entropyMetadata,
++ void* workspace, size_t wkspSize)
++{
++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD);
++ int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0;
++
+ entropyMetadata->hufMetadata.hufDesSize =
+ ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize,
+ &prevEntropy->huf, &nextEntropy->huf,
+ &entropyMetadata->hufMetadata,
+ ZSTD_literalsCompressionIsDisabled(cctxParams),
+- workspace, wkspSize);
++ workspace, wkspSize, hufFlags);
++
+ FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed");
+ entropyMetadata->fseMetadata.fseTablesSize =
+ ZSTD_buildBlockEntropyStats_sequences(seqStorePtr,
+@@ -3143,11 +3576,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
+ }
+
+ /* Returns the size estimate for the literals section (header + content) of a block */
+-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize,
+- const ZSTD_hufCTables_t* huf,
+- const ZSTD_hufCTablesMetadata_t* hufMetadata,
+- void* workspace, size_t wkspSize,
+- int writeEntropy)
++static size_t
++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize,
++ const ZSTD_hufCTables_t* huf,
++ const ZSTD_hufCTablesMetadata_t* hufMetadata,
++ void* workspace, size_t wkspSize,
++ int writeEntropy)
+ {
+ unsigned* const countWksp = (unsigned*)workspace;
+ unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
+@@ -3169,12 +3603,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz
+ }
+
+ /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */
+-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
+- const BYTE* codeTable, size_t nbSeq, unsigned maxCode,
+- const FSE_CTable* fseCTable,
+- const U8* additionalBits,
+- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+- void* workspace, size_t wkspSize)
++static size_t
++ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode,
++ const FSE_CTable* fseCTable,
++ const U8* additionalBits,
++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
++ void* workspace, size_t wkspSize)
+ {
+ unsigned* const countWksp = (unsigned*)workspace;
+ const BYTE* ctp = codeTable;
+@@ -3206,99 +3641,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
+ }
+
+ /* Returns the size estimate for the sequences section (header + content) of a block */
+-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable,
+- const BYTE* llCodeTable,
+- const BYTE* mlCodeTable,
+- size_t nbSeq,
+- const ZSTD_fseCTables_t* fseTables,
+- const ZSTD_fseCTablesMetadata_t* fseMetadata,
+- void* workspace, size_t wkspSize,
+- int writeEntropy)
++static size_t
++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable,
++ const BYTE* llCodeTable,
++ const BYTE* mlCodeTable,
++ size_t nbSeq,
++ const ZSTD_fseCTables_t* fseTables,
++ const ZSTD_fseCTablesMetadata_t* fseMetadata,
++ void* workspace, size_t wkspSize,
++ int writeEntropy)
+ {
+ size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ);
+ size_t cSeqSizeEstimate = 0;
+ cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff,
+- fseTables->offcodeCTable, NULL,
+- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+- workspace, wkspSize);
++ fseTables->offcodeCTable, NULL,
++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
++ workspace, wkspSize);
+ cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL,
+- fseTables->litlengthCTable, LL_bits,
+- LL_defaultNorm, LL_defaultNormLog, MaxLL,
+- workspace, wkspSize);
++ fseTables->litlengthCTable, LL_bits,
++ LL_defaultNorm, LL_defaultNormLog, MaxLL,
++ workspace, wkspSize);
+ cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML,
+- fseTables->matchlengthCTable, ML_bits,
+- ML_defaultNorm, ML_defaultNormLog, MaxML,
+- workspace, wkspSize);
++ fseTables->matchlengthCTable, ML_bits,
++ ML_defaultNorm, ML_defaultNormLog, MaxML,
++ workspace, wkspSize);
+ if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize;
+ return cSeqSizeEstimate + sequencesSectionHeaderSize;
+ }
+
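/* Illustrative aside, not part of the upstream change: the Number_of_Sequences
 * field whose size the estimate above accounts for as
 * 1 + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ), written out explicitly. */
#include <stdint.h>
#include <stddef.h>

#define LONGNBSEQ 0x7F00

static size_t writeNbSeq(uint8_t* op, size_t nbSeq)
{
    if (nbSeq < 128) { op[0] = (uint8_t)nbSeq; return 1; }
    if (nbSeq < LONGNBSEQ) {
        op[0] = (uint8_t)((nbSeq >> 8) + 0x80);
        op[1] = (uint8_t)nbSeq;
        return 2;
    }
    op[0] = 0xFF;                                   /* escape byte */
    op[1] = (uint8_t)((nbSeq - LONGNBSEQ) & 0xFF);  /* little-endian 16-bit remainder */
    op[2] = (uint8_t)((nbSeq - LONGNBSEQ) >> 8);
    return 3;
}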
+ /* Returns the size estimate for a given stream of literals, of, ll, ml */
+-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize,
+- const BYTE* ofCodeTable,
+- const BYTE* llCodeTable,
+- const BYTE* mlCodeTable,
+- size_t nbSeq,
+- const ZSTD_entropyCTables_t* entropy,
+- const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+- void* workspace, size_t wkspSize,
+- int writeLitEntropy, int writeSeqEntropy) {
++static size_t
++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize,
++ const BYTE* ofCodeTable,
++ const BYTE* llCodeTable,
++ const BYTE* mlCodeTable,
++ size_t nbSeq,
++ const ZSTD_entropyCTables_t* entropy,
++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
++ void* workspace, size_t wkspSize,
++ int writeLitEntropy, int writeSeqEntropy)
++{
+ size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize,
+- &entropy->huf, &entropyMetadata->hufMetadata,
+- workspace, wkspSize, writeLitEntropy);
++ &entropy->huf, &entropyMetadata->hufMetadata,
++ workspace, wkspSize, writeLitEntropy);
+ size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
+- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
+- workspace, wkspSize, writeSeqEntropy);
++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
++ workspace, wkspSize, writeSeqEntropy);
+ return seqSize + literalsSize + ZSTD_blockHeaderSize;
+ }
+
+ /* Builds entropy statistics and uses them for blocksize estimation.
+ *
+- * Returns the estimated compressed size of the seqStore, or a zstd error.
++ * @return: estimated compressed size of the seqStore, or a zstd error.
+ */
+-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) {
+- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata;
++static size_t
++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc)
++{
++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata;
+ DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()");
+ FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore,
+ &zc->blockState.prevCBlock->entropy,
+ &zc->blockState.nextCBlock->entropy,
+ &zc->appliedParams,
+ entropyMetadata,
+- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), "");
+- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart),
++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), "");
++ return ZSTD_estimateBlockSize(
++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart),
+ seqStore->ofCode, seqStore->llCode, seqStore->mlCode,
+ (size_t)(seqStore->sequences - seqStore->sequencesStart),
+- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE,
++ &zc->blockState.nextCBlock->entropy,
++ entropyMetadata,
++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE,
+ (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1);
+ }
+
+ /* Returns literals bytes represented in a seqStore */
+-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) {
++static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore)
++{
+ size_t literalsBytes = 0;
+- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart;
++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
+ size_t i;
+ for (i = 0; i < nbSeqs; ++i) {
+- seqDef seq = seqStore->sequencesStart[i];
++ seqDef const seq = seqStore->sequencesStart[i];
+ literalsBytes += seq.litLength;
+ if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) {
+ literalsBytes += 0x10000;
+- }
+- }
++ } }
+ return literalsBytes;
+ }
+
+ /* Returns match bytes represented in a seqStore */
+-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
++static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore)
++{
+ size_t matchBytes = 0;
+- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart;
++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
+ size_t i;
+ for (i = 0; i < nbSeqs; ++i) {
+ seqDef seq = seqStore->sequencesStart[i];
+ matchBytes += seq.mlBase + MINMATCH;
+ if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) {
+ matchBytes += 0x10000;
+- }
+- }
++ } }
+ return matchBytes;
+ }
+
+@@ -3307,15 +3750,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
+ */
+ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
+ const seqStore_t* originalSeqStore,
+- size_t startIdx, size_t endIdx) {
+- BYTE* const litEnd = originalSeqStore->lit;
+- size_t literalsBytes;
+- size_t literalsBytesPreceding = 0;
+-
++ size_t startIdx, size_t endIdx)
++{
+ *resultSeqStore = *originalSeqStore;
+ if (startIdx > 0) {
+ resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx;
+- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
+ }
+
+ /* Move longLengthPos into the correct position if necessary */
+@@ -3328,13 +3768,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
+ }
+ resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx;
+ resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx;
+- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
+- resultSeqStore->litStart += literalsBytesPreceding;
+ if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) {
+ /* This accounts for possible last literals if the derived chunk reaches the end of the block */
+- resultSeqStore->lit = litEnd;
++ assert(resultSeqStore->lit == originalSeqStore->lit);
+ } else {
+- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes;
++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes;
+ }
+ resultSeqStore->llCode += startIdx;
+ resultSeqStore->mlCode += startIdx;
+@@ -3342,20 +3781,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
+ }
+
+ /*
+- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history.
+- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq().
++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history.
++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq().
+ */
+ static U32
+-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0)
+-{
+- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */
+- assert(STORED_IS_REPCODE(offCode));
+- if (adjustedOffCode == ZSTD_REP_NUM) {
+- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */
+- assert(rep[0] > 0);
++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0)
++{
++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */
++ assert(OFFBASE_IS_REPCODE(offBase));
++ if (adjustedRepCode == ZSTD_REP_NUM) {
++ assert(ll0);
++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1
++ * This is only valid if it results in a valid offset value, aka > 0.
++ * Note : it may happen that `rep[0]==1` in exceptional circumstances.
++ * In which case this function will return 0, which is an invalid offset.
++ * It's not an issue though, since this value will be
++ * compared and discarded within ZSTD_seqStore_resolveOffCodes().
++ */
+ return rep[0] - 1;
+ }
+- return rep[adjustedOffCode];
++ return rep[adjustedRepCode];
+ }
+
+ /*
+@@ -3371,30 +3816,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c
+ * 1-3 : repcode 1-3
+ * 4+ : real_offset+3
+ */
+-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
+- seqStore_t* const seqStore, U32 const nbSeq) {
++static void
++ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
++ const seqStore_t* const seqStore, U32 const nbSeq)
++{
+ U32 idx = 0;
++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq;
+ for (; idx < nbSeq; ++idx) {
+ seqDef* const seq = seqStore->sequencesStart + idx;
+- U32 const ll0 = (seq->litLength == 0);
+- U32 const offCode = OFFBASE_TO_STORED(seq->offBase);
+- assert(seq->offBase > 0);
+- if (STORED_IS_REPCODE(offCode)) {
+- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0);
+- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0);
++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx);
++ U32 const offBase = seq->offBase;
++ assert(offBase > 0);
++ if (OFFBASE_IS_REPCODE(offBase)) {
++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0);
++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0);
+ /* Adjust simulated decompression repcode history if we come across a mismatch. Replace
+ * the repcode with the offset it actually references, determined by the compression
+ * repcode history.
+ */
+ if (dRawOffset != cRawOffset) {
+- seq->offBase = cRawOffset + ZSTD_REP_NUM;
++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset);
+ }
+ }
+ /* Compression repcode history is always updated with values directly from the unmodified seqStore.
+ * Decompression repcode history may use modified seq->offset value taken from compression repcode history.
+ */
+- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0);
+- ZSTD_updateRep(cRepcodes->rep, offCode, ll0);
++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0);
++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0);
+ }
+ }
+
+@@ -3404,10 +3852,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
+ * Returns the total size of that block (including header) or a ZSTD error code.
+ */
+ static size_t
+-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore,
++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc,
++ const seqStore_t* const seqStore,
+ repcodes_t* const dRep, repcodes_t* const cRep,
+ void* dst, size_t dstCapacity,
+- const void* src, size_t srcSize,
++ const void* src, size_t srcSize,
+ U32 lastBlock, U32 isPartition)
+ {
+ const U32 rleMaxLength = 25;
+@@ -3481,45 +3930,49 @@ typedef struct {
+
+ /* Helper function to perform the recursive search for block splits.
+ * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half.
+- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then
+- * we do not recurse.
++ * If advantageous to split, then we recurse down the two sub-blocks.
++ * If not, or if an error occurred in estimation, then we do not recurse.
+ *
+- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING.
++ * Note: The recursion depth is capped by a heuristic minimum number of sequences,
++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING.
+ * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING).
+ * In practice, recursion depth usually doesn't go beyond 4.
+ *
+- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS.
++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
+ * maximum of 128 KB, this value is actually impossible to reach.
+ */
+ static void
+ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx,
+ ZSTD_CCtx* zc, const seqStore_t* origSeqStore)
+ {
+- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk;
+- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore;
+- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore;
++ seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk;
++ seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore;
++ seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore;
+ size_t estimatedOriginalSize;
+ size_t estimatedFirstHalfSize;
+ size_t estimatedSecondHalfSize;
+ size_t midIdx = (startIdx + endIdx)/2;
+
++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
++ assert(endIdx >= startIdx);
+ if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) {
+- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences");
++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx);
+ return;
+ }
+- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
+ ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx);
+ ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx);
+ ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx);
+ estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc);
+ estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc);
+ estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc);
+- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
+ estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize);
+ if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) {
+ return;
+ }
+ if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) {
++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx);
+ ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore);
+ splits->splitLocations[splits->idx] = (U32)midIdx;
+ splits->idx++;
+@@ -3527,14 +3980,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end
+ }
+ }
+
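/* Illustrative aside, not part of the upstream change: the cost-driven recursion
 * above, reduced to an abstract cost() callback over an index range, keeping the
 * same two guards (minimum range size, cap on recorded splits). */
#include <stddef.h>

#define MIN_RANGE  300   /* stand-in for MIN_SEQUENCES_BLOCK_SPLITTING */
#define MAX_SPLITS 196   /* stand-in for ZSTD_MAX_NB_BLOCK_SPLITS */

typedef size_t (*cost_fn)(size_t start, size_t end, void* ctx);

static void deriveSplits(size_t start, size_t end, cost_fn cost, void* ctx,
                         size_t* splits, size_t* nbSplits)
{
    size_t const mid = (start + end) / 2;
    if (end - start < MIN_RANGE || *nbSplits >= MAX_SPLITS)
        return;                             /* too few elements, or split table full */
    if (cost(start, mid, ctx) + cost(mid, end, ctx) < cost(start, end, ctx)) {
        deriveSplits(start, mid, cost, ctx, splits, nbSplits);   /* left half first, */
        splits[(*nbSplits)++] = mid;                             /* then record mid, */
        deriveSplits(mid, end, cost, ctx, splits, nbSplits);     /* then right half  */
    }
}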
+-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio.
++/* Base recursive function.
++ * Populates a table with intra-block partition indices that can improve compression ratio.
+ *
+- * Returns the number of splits made (which equals the size of the partition table - 1).
++ * @return: number of splits made (which equals the size of the partition table - 1).
+ */
+-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) {
+- seqStoreSplits splits = {partitions, 0};
++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
++{
++ seqStoreSplits splits;
++ splits.splitLocations = partitions;
++ splits.idx = 0;
+ if (nbSeq <= 4) {
+- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split");
++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq);
+ /* Refuse to try and split anything with less than 4 sequences */
+ return 0;
+ }
+@@ -3550,18 +4007,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
+ * Returns combined size of all blocks (which includes headers), or a ZSTD error code.
+ */
+ static size_t
+-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity,
+- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq)
++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc,
++ void* dst, size_t dstCapacity,
++ const void* src, size_t blockSize,
++ U32 lastBlock, U32 nbSeq)
+ {
+ size_t cSize = 0;
+ const BYTE* ip = (const BYTE*)src;
+ BYTE* op = (BYTE*)dst;
+ size_t i = 0;
+ size_t srcBytesTotal = 0;
+- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
+- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
+- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore;
+- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
++ seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
++ seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore;
++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
+
+ /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history
+ * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two
+@@ -3583,30 +4042,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
+ ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
+ ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t));
+
+- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
+ (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
+ (unsigned)zc->blockState.matchState.nextToUpdate);
+
+ if (numSplits == 0) {
+- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
+- &dRep, &cRep,
+- op, dstCapacity,
+- ip, blockSize,
+- lastBlock, 0 /* isPartition */);
++ size_t cSizeSingleBlock =
++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
++ &dRep, &cRep,
++ op, dstCapacity,
++ ip, blockSize,
++ lastBlock, 0 /* isPartition */);
+ FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!");
+ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits");
+- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
++ assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX);
++ assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize);
+ return cSizeSingleBlock;
+ }
+
+ ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]);
+ for (i = 0; i <= numSplits; ++i) {
+- size_t srcBytes;
+ size_t cSizeChunk;
+ U32 const lastPartition = (i == numSplits);
+ U32 lastBlockEntireSrc = 0;
+
+- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore);
++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore);
+ srcBytesTotal += srcBytes;
+ if (lastPartition) {
+ /* This is the final partition, need to account for possible last literals */
+@@ -3621,7 +4081,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
+ op, dstCapacity,
+ ip, srcBytes,
+ lastBlockEntireSrc, 1 /* isPartition */);
+- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk);
++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size",
++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk);
+ FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!");
+
+ ip += srcBytes;
+@@ -3629,10 +4090,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
+ dstCapacity -= cSizeChunk;
+ cSize += cSizeChunk;
+ *currSeqStore = *nextSeqStore;
+- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
++ assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize);
+ }
+- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes
+- * for the next block.
++ /* cRep and dRep may have diverged during the compression.
++ * If so, we use the dRep repcodes for the next block.
+ */
+ ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t));
+ return cSize;
+@@ -3643,8 +4104,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize, U32 lastBlock)
+ {
+- const BYTE* ip = (const BYTE*)src;
+- BYTE* op = (BYTE*)dst;
+ U32 nbSeq;
+ size_t cSize;
+ DEBUGLOG(4, "ZSTD_compressBlock_splitBlock");
+@@ -3655,7 +4114,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
+ if (bss == ZSTDbss_noCompress) {
+ if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+ zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock);
++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
+ FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
+ DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block");
+ return cSize;
+@@ -3673,9 +4132,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize, U32 frame)
+ {
+- /* This the upper bound for the length of an rle block.
+- * This isn't the actual upper bound. Finding the real threshold
+- * needs further investigation.
++ /* This is an estimated upper bound for the length of an rle block.
++ * This isn't the actual upper bound.
++ * Finding the real threshold needs further investigation.
+ */
+ const U32 rleMaxLength = 25;
+ size_t cSize;
+@@ -3767,10 +4226,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
+ * * cSize >= blockBound(srcSize): We have expanded the block too much so
+ * emit an uncompressed block.
+ */
+- {
+- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
++ { size_t const cSize =
++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
+ if (cSize != ERROR(dstSize_tooSmall)) {
+- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
++ size_t const maxCSize =
++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed");
+ if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) {
+ ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
+@@ -3778,7 +4238,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
+ }
+ }
+ }
+- }
++ } /* if (bss == ZSTDbss_compress)*/
+
+ DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()");
+ /* Superblock compression failed, attempt to emit a single no compress block.
+@@ -3836,7 +4296,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
+ * All blocks will be terminated, all input will be consumed.
+ * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
+ * Frame is supposed already started (header already produced)
+-* @return : compressed size, or an error code
++* @return : compressed size, or an error code
+ */
+ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+@@ -3860,7 +4320,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
+ ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
+ U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
+
+- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE,
++        /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2, so to compensate we add an
++         * additional 1. We need to revisit and change this logic to be more consistent */
++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1,
+ dstSize_tooSmall,
+ "not enough space to store compressed block");
+ if (remaining < blockSize) blockSize = remaining;
+@@ -3899,7 +4361,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
+ MEM_writeLE24(op, cBlockHeader);
+ cSize += ZSTD_blockHeaderSize;
+ }
+- }
++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/
+
+
+ ip += blockSize;
+@@ -4078,31 +4540,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
+ }
+ }
+
+-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx,
+- void* dst, size_t dstCapacity,
+- const void* src, size_t srcSize)
++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx,
++ void* dst, size_t dstCapacity,
++ const void* src, size_t srcSize)
+ {
+ DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize);
+ return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */);
+ }
+
++/* NOTE: Must just wrap ZSTD_compressContinue_public() */
++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx,
++ void* dst, size_t dstCapacity,
++ const void* src, size_t srcSize)
++{
++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize);
++}
+
+-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx)
+ {
+ ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams;
+ assert(!ZSTD_checkCParams(cParams));
+- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog);
++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog);
+ }
+
+-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
++/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */
++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
++{
++ return ZSTD_getBlockSize_deprecated(cctx);
++}
++
++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */
++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+ {
+ DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize);
+- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx);
++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx);
+ RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); }
+
+ return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */);
+ }
+
++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */
++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
++{
++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize);
++}
++
+ /*! ZSTD_loadDictionaryContent() :
+ * @return : 0, or an error code
+ */
+@@ -4111,25 +4593,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
+ ZSTD_cwksp* ws,
+ ZSTD_CCtx_params const* params,
+ const void* src, size_t srcSize,
+- ZSTD_dictTableLoadMethod_e dtlm)
++ ZSTD_dictTableLoadMethod_e dtlm,
++ ZSTD_tableFillPurpose_e tfp)
+ {
+ const BYTE* ip = (const BYTE*) src;
+ const BYTE* const iend = ip + srcSize;
+ int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL;
+
+- /* Assert that we the ms params match the params we're being given */
++ /* Assert that the ms params match the params we're being given */
+ ZSTD_assertEqualCParams(params->cParams, ms->cParams);
+
+- if (srcSize > ZSTD_CHUNKSIZE_MAX) {
++ { /* Ensure large dictionaries can't cause index overflow */
++
+ /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX.
+ * Dictionaries right at the edge will immediately trigger overflow
+ * correction, but I don't want to insert extra constraints here.
+ */
+- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1;
+- /* We must have cleared our windows when our source is this large. */
+- assert(ZSTD_window_isEmpty(ms->window));
+- if (loadLdmDict)
+- assert(ZSTD_window_isEmpty(ls->window));
++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX;
++
++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(&params->cParams);
++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) {
++ /* Some dictionary matchfinders in zstd use "short cache",
++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each
++ * CDict hashtable entry as a tag rather than as part of an index.
++ * When short cache is used, we need to truncate the dictionary
++ * so that its indices don't overlap with the tag. */
++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX;
++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize);
++ assert(!loadLdmDict);
++ }
++
+ /* If the dictionary is too large, only load the suffix of the dictionary. */
+ if (srcSize > maxDictSize) {
+ ip = iend - maxDictSize;
+@@ -4138,30 +4631,46 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
+ }
+ }
+
+- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
++ if (srcSize > ZSTD_CHUNKSIZE_MAX) {
++ /* We must have cleared our windows when our source is this large. */
++ assert(ZSTD_window_isEmpty(ms->window));
++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window));
++ }
+ ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0);
+- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
+- ms->forceNonContiguous = params->deterministicRefPrefix;
+
+- if (loadLdmDict) {
++ DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
++
++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */
+ ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0);
+ ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base);
++ ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams);
++ }
++
++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */
++ if (params->cParams.strategy < ZSTD_btultra) {
++ U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28);
++ if (srcSize > maxDictSize) {
++ ip = iend - maxDictSize;
++ src = ip;
++ srcSize = maxDictSize;
++ }
+ }
+
++ ms->nextToUpdate = (U32)(ip - ms->window.base);
++ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
++ ms->forceNonContiguous = params->deterministicRefPrefix;
++
+ if (srcSize <= HASH_READ_SIZE) return 0;
+
+ ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend);
+
+- if (loadLdmDict)
+- ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams);
+-
+ switch(params->cParams.strategy)
+ {
+ case ZSTD_fast:
+- ZSTD_fillHashTable(ms, iend, dtlm);
++ ZSTD_fillHashTable(ms, iend, dtlm, tfp);
+ break;
+ case ZSTD_dfast:
+- ZSTD_fillDoubleHashTable(ms, iend, dtlm);
++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp);
+ break;
+
+ case ZSTD_greedy:
+@@ -4174,7 +4683,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
+ } else {
+ assert(params->useRowMatchFinder != ZSTD_ps_auto);
+ if (params->useRowMatchFinder == ZSTD_ps_enable) {
+- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16);
++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog);
+ ZSTD_memset(ms->tagTable, 0, tagTableSize);
+ ZSTD_row_update(ms, iend-HASH_READ_SIZE);
+ DEBUGLOG(4, "Using row-based hash table for lazy dict");
+@@ -4327,6 +4836,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
+ ZSTD_CCtx_params const* params,
+ const void* dict, size_t dictSize,
+ ZSTD_dictTableLoadMethod_e dtlm,
++ ZSTD_tableFillPurpose_e tfp,
+ void* workspace)
+ {
+ const BYTE* dictPtr = (const BYTE*)dict;
+@@ -4345,7 +4855,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
+ {
+ size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
+ FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
+- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), "");
++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), "");
+ }
+ return dictID;
+ }
+@@ -4361,6 +4871,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
+ const void* dict, size_t dictSize,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_dictTableLoadMethod_e dtlm,
++ ZSTD_tableFillPurpose_e tfp,
+ void* workspace)
+ {
+ DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
+@@ -4373,13 +4884,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
+
+ /* dict restricted modes */
+ if (dictContentType == ZSTD_dct_rawContent)
+- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm);
++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp);
+
+ if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
+ if (dictContentType == ZSTD_dct_auto) {
+ DEBUGLOG(4, "raw content dictionary detected");
+ return ZSTD_loadDictionaryContent(
+- ms, ls, ws, params, dict, dictSize, dtlm);
++ ms, ls, ws, params, dict, dictSize, dtlm, tfp);
+ }
+ RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
+ assert(0); /* impossible */
+@@ -4387,13 +4898,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
+
+ /* dict as full zstd dictionary */
+ return ZSTD_loadZstdDictionary(
+- bs, ms, ws, params, dict, dictSize, dtlm, workspace);
++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace);
+ }
+
+ #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB)
+ #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL)
+
+ /*! ZSTD_compressBegin_internal() :
++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both
+ * @return : 0, or an error code */
+ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
+ const void* dict, size_t dictSize,
+@@ -4426,11 +4938,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
+ cctx->blockState.prevCBlock, &cctx->blockState.matchState,
+ &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent,
+ cdict->dictContentSize, cdict->dictContentType, dtlm,
+- cctx->entropyWorkspace)
++ ZSTD_tfp_forCCtx, cctx->entropyWorkspace)
+ : ZSTD_compress_insertDictionary(
+ cctx->blockState.prevCBlock, &cctx->blockState.matchState,
+ &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize,
+- dictContentType, dtlm, cctx->entropyWorkspace);
++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace);
+ FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
+ assert(dictID <= UINT_MAX);
+ cctx->dictID = (U32)dictID;
+@@ -4471,11 +4983,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
+ &cctxParams, pledgedSrcSize);
+ }
+
+-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
++static size_t
++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+ {
+ ZSTD_CCtx_params cctxParams;
+- {
+- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict);
++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict);
+ ZSTD_CCtxParams_init_internal(&cctxParams, &params, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel);
+ }
+ DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize);
+@@ -4483,9 +4995,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di
+ &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered);
+ }
+
++size_t
++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
++{
++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel);
++}
++
+ size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel)
+ {
+- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel);
++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel);
+ }
+
+
+@@ -4537,9 +5055,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize)
+ (void)extraCSize;
+ }
+
+-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
+- void* dst, size_t dstCapacity,
+- const void* src, size_t srcSize)
++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx,
++ void* dst, size_t dstCapacity,
++ const void* src, size_t srcSize)
+ {
+ size_t endResult;
+ size_t const cSize = ZSTD_compressContinue_internal(cctx,
+@@ -4563,6 +5081,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
+ return cSize + endResult;
+ }
+
++/* NOTE: Must just wrap ZSTD_compressEnd_public() */
++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx,
++ void* dst, size_t dstCapacity,
++ const void* src, size_t srcSize)
++{
++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
++}
++
+ size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+@@ -4591,7 +5117,7 @@ size_t ZSTD_compress_advanced_internal(
+ FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx,
+ dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
+ params, srcSize, ZSTDb_not_buffered) , "");
+- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
+ }
+
+ size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,
+@@ -4709,7 +5235,7 @@ static size_t ZSTD_initCDict_internal(
+ { size_t const dictID = ZSTD_compress_insertDictionary(
+ &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace,
+ &params, cdict->dictContent, cdict->dictContentSize,
+- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace);
++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace);
+ FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
+ assert(dictID <= (size_t)(U32)-1);
+ cdict->dictID = (U32)dictID;
+@@ -4906,6 +5432,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict(
+ params.cParams = cParams;
+ params.useRowMatchFinder = useRowMatchFinder;
+ cdict->useRowMatchFinder = useRowMatchFinder;
++ cdict->compressionLevel = ZSTD_NO_CLEVEL;
+
+ if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
+ dict, dictSize,
+@@ -4985,12 +5512,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced(
+
+ /* ZSTD_compressBegin_usingCDict() :
+ * cdict must be != NULL */
+-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+ {
+ ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+ return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN);
+ }
+
++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
++{
++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict);
++}
++
+ /*! ZSTD_compress_usingCDict_internal():
+ * Implementation of various ZSTD_compress_usingCDict* functions.
+ */
+@@ -5000,7 +5532,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx,
+ const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)
+ {
+ FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */
+- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
+ }
+
+ /*! ZSTD_compress_usingCDict_advanced():
+@@ -5197,30 +5729,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
+
+ static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx)
+ {
+- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos;
+- if (hintInSize==0) hintInSize = cctx->blockSize;
+- return hintInSize;
++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
++ return cctx->blockSize - cctx->stableIn_notConsumed;
++ }
++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered);
++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos;
++ if (hintInSize==0) hintInSize = cctx->blockSize;
++ return hintInSize;
++ }
+ }
+
+ /* ZSTD_compressStream_generic():
+ * internal function for all *compressStream*() variants
+- * non-static, because can be called from zstdmt_compress.c
+- * @return : hint size for next input */
++ * @return : hint size for next input to complete ongoing block */
+ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+ ZSTD_outBuffer* output,
+ ZSTD_inBuffer* input,
+ ZSTD_EndDirective const flushMode)
+ {
+- const char* const istart = (const char*)input->src;
+- const char* const iend = input->size != 0 ? istart + input->size : istart;
+- const char* ip = input->pos != 0 ? istart + input->pos : istart;
+- char* const ostart = (char*)output->dst;
+- char* const oend = output->size != 0 ? ostart + output->size : ostart;
+- char* op = output->pos != 0 ? ostart + output->pos : ostart;
++ const char* const istart = (assert(input != NULL), (const char*)input->src);
++ const char* const iend = (istart != NULL) ? istart + input->size : istart;
++ const char* ip = (istart != NULL) ? istart + input->pos : istart;
++ char* const ostart = (assert(output != NULL), (char*)output->dst);
++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart;
++ char* op = (ostart != NULL) ? ostart + output->pos : ostart;
+ U32 someMoreWork = 1;
+
+ /* check expectations */
+- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode);
++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos);
++ assert(zcs != NULL);
++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) {
++ assert(input->pos >= zcs->stableIn_notConsumed);
++ input->pos -= zcs->stableIn_notConsumed;
++ ip -= zcs->stableIn_notConsumed;
++ zcs->stableIn_notConsumed = 0;
++ }
+ if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) {
+ assert(zcs->inBuff != NULL);
+ assert(zcs->inBuffSize > 0);
+@@ -5229,8 +5772,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+ assert(zcs->outBuff != NULL);
+ assert(zcs->outBuffSize > 0);
+ }
+- assert(output->pos <= output->size);
++ if (input->src == NULL) assert(input->size == 0);
+ assert(input->pos <= input->size);
++ if (output->dst == NULL) assert(output->size == 0);
++ assert(output->pos <= output->size);
+ assert((U32)flushMode <= (U32)ZSTD_e_end);
+
+ while (someMoreWork) {
+@@ -5245,7 +5790,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+ || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */
+ && (zcs->inBuffPos == 0) ) {
+ /* shortcut to compression pass directly into output buffer */
+- size_t const cSize = ZSTD_compressEnd(zcs,
++ size_t const cSize = ZSTD_compressEnd_public(zcs,
+ op, oend-op, ip, iend-ip);
+ DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed");
+@@ -5262,8 +5807,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+ zcs->inBuff + zcs->inBuffPos, toLoad,
+ ip, iend-ip);
+ zcs->inBuffPos += loaded;
+- if (loaded != 0)
+- ip += loaded;
++ if (ip) ip += loaded;
+ if ( (flushMode == ZSTD_e_continue)
+ && (zcs->inBuffPos < zcs->inBuffTarget) ) {
+ /* not enough input to fill full block : stop here */
+@@ -5274,6 +5818,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+ /* empty */
+ someMoreWork = 0; break;
+ }
++ } else {
++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable);
++ if ( (flushMode == ZSTD_e_continue)
++ && ( (size_t)(iend - ip) < zcs->blockSize) ) {
++ /* can't compress a full block : stop here */
++ zcs->stableIn_notConsumed = (size_t)(iend - ip);
++ ip = iend; /* pretend to have consumed input */
++ someMoreWork = 0; break;
++ }
++ if ( (flushMode == ZSTD_e_flush)
++ && (ip == iend) ) {
++ /* empty */
++ someMoreWork = 0; break;
++ }
+ }
+ /* compress current block (note : this stage cannot be stopped in the middle) */
+ DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode);
+@@ -5281,9 +5839,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+ void* cDst;
+ size_t cSize;
+ size_t oSize = oend-op;
+- size_t const iSize = inputBuffered
+- ? zcs->inBuffPos - zcs->inToCompress
+- : MIN((size_t)(iend - ip), zcs->blockSize);
++ size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress
++ : MIN((size_t)(iend - ip), zcs->blockSize);
+ if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)
+ cDst = op; /* compress into output buffer, to skip flush stage */
+ else
+@@ -5291,9 +5848,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+ if (inputBuffered) {
+ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend);
+ cSize = lastBlock ?
+- ZSTD_compressEnd(zcs, cDst, oSize,
++ ZSTD_compressEnd_public(zcs, cDst, oSize,
+ zcs->inBuff + zcs->inToCompress, iSize) :
+- ZSTD_compressContinue(zcs, cDst, oSize,
++ ZSTD_compressContinue_public(zcs, cDst, oSize,
+ zcs->inBuff + zcs->inToCompress, iSize);
+ FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
+ zcs->frameEnded = lastBlock;
+@@ -5306,19 +5863,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+ if (!lastBlock)
+ assert(zcs->inBuffTarget <= zcs->inBuffSize);
+ zcs->inToCompress = zcs->inBuffPos;
+- } else {
+- unsigned const lastBlock = (ip + iSize == iend);
+- assert(flushMode == ZSTD_e_end /* Already validated */);
++ } else { /* !inputBuffered, hence ZSTD_bm_stable */
++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend);
+ cSize = lastBlock ?
+- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) :
+- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize);
++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) :
++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize);
+ /* Consume the input prior to error checking to mirror buffered mode. */
+- if (iSize > 0)
+- ip += iSize;
++ if (ip) ip += iSize;
+ FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
+ zcs->frameEnded = lastBlock;
+- if (lastBlock)
+- assert(ip == iend);
++ if (lastBlock) assert(ip == iend);
+ }
+ if (cDst == op) { /* no need to flush */
+ op += cSize;
+@@ -5388,8 +5942,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf
+ /* After a compression call set the expected input/output buffer.
+ * This is validated at the start of the next compression call.
+ */
+-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input)
++static void
++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input)
+ {
++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)");
+ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
+ cctx->expectedInBuffer = *input;
+ }
+@@ -5408,22 +5964,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx,
+ {
+ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
+ ZSTD_inBuffer const expect = cctx->expectedInBuffer;
+- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size)
+- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!");
+- if (endOp != ZSTD_e_end)
+- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!");
++ if (expect.src != input->src || expect.pos != input->pos)
++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!");
+ }
++ (void)endOp;
+ if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) {
+ size_t const outBufferSize = output->size - output->pos;
+ if (cctx->expectedOutBufferSize != outBufferSize)
+- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!");
++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!");
+ }
+ return 0;
+ }
+
+ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
+ ZSTD_EndDirective endOp,
+- size_t inSize) {
++ size_t inSize)
++{
+ ZSTD_CCtx_params params = cctx->requestedParams;
+ ZSTD_prefixDict const prefixDict = cctx->prefixDict;
+ FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */
+@@ -5437,9 +5993,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
+ params.compressionLevel = cctx->cdict->compressionLevel;
+ }
+ DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage");
+- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */
+- {
+- size_t const dictSize = prefixDict.dict
++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */
++
++ { size_t const dictSize = prefixDict.dict
+ ? prefixDict.dictSize
+ : (cctx->cdict ? cctx->cdict->dictContentSize : 0);
+ ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, &params, cctx->pledgedSrcSizePlusOne - 1);
+@@ -5451,6 +6007,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
+ params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams);
+ params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams);
+ params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences);
++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize);
++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel);
+
+ { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1;
+ assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+@@ -5477,6 +6036,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
+ return 0;
+ }
+
++/* @return provides a minimum amount of data remaining to be flushed from internal buffers
++ */
+ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+ ZSTD_outBuffer* output,
+ ZSTD_inBuffer* input,
+@@ -5491,8 +6052,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+
+ /* transparent initialization stage */
+ if (cctx->streamStage == zcss_init) {
+- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed");
+- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */
++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */
++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed;
++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */
++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */
++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */
++ if (cctx->stableIn_notConsumed) { /* not the first time */
++ /* check stable source guarantees */
++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer");
++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos");
++ }
++ /* pretend input was consumed, to give a sense forward progress */
++ input->pos = input->size;
++ /* save stable inBuffer, for later control, and flush/end */
++ cctx->expectedInBuffer = *input;
++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */
++ cctx->stableIn_notConsumed += inputSize;
++ /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */
++ return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */
++ }
++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed");
++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */
+ }
+ /* end of transparent initialization stage */
+
+@@ -5510,13 +6090,20 @@ size_t ZSTD_compressStream2_simpleArgs (
+ const void* src, size_t srcSize, size_t* srcPos,
+ ZSTD_EndDirective endOp)
+ {
+- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
+- ZSTD_inBuffer input = { src, srcSize, *srcPos };
++ ZSTD_outBuffer output;
++ ZSTD_inBuffer input;
++ output.dst = dst;
++ output.size = dstCapacity;
++ output.pos = *dstPos;
++ input.src = src;
++ input.size = srcSize;
++ input.pos = *srcPos;
+ /* ZSTD_compressStream2() will check validity of dstPos and srcPos */
+- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
+- *dstPos = output.pos;
+- *srcPos = input.pos;
+- return cErr;
++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
++ *dstPos = output.pos;
++ *srcPos = input.pos;
++ return cErr;
++ }
+ }
+
+ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
+@@ -5539,6 +6126,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
+ /* Reset to the original values. */
+ cctx->requestedParams.inBufferMode = originalInBufferMode;
+ cctx->requestedParams.outBufferMode = originalOutBufferMode;
++
+ FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed");
+ if (result != 0) { /* compression not completed, due to lack of output space */
+ assert(oPos == dstCapacity);
+@@ -5549,64 +6137,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
+ }
+ }
+
+-typedef struct {
+- U32 idx; /* Index in array of ZSTD_Sequence */
+- U32 posInSequence; /* Position within sequence at idx */
+- size_t posInSrc; /* Number of bytes given by sequences provided so far */
+-} ZSTD_sequencePosition;
+-
+ /* ZSTD_validateSequence() :
+ * @offCode : is presumed to follow format required by ZSTD_storeSeq()
+ * @returns a ZSTD error code if sequence is not valid
+ */
+ static size_t
+-ZSTD_validateSequence(U32 offCode, U32 matchLength,
+- size_t posInSrc, U32 windowLog, size_t dictSize)
++ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch,
++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer)
+ {
+- U32 const windowSize = 1 << windowLog;
++ U32 const windowSize = 1u << windowLog;
+ /* posInSrc represents the amount of data the decoder would decode up to this point.
+ * As long as the amount of data decoded is less than or equal to window size, offsets may be
+ * larger than the total length of output decoded in order to reference the dict, even larger than
+ * window size. After output surpasses windowSize, we're limited to windowSize offsets again.
+ */
+ size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize;
+- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!");
+- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small");
++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 3 : 4;
++ RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!");
++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */
++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch");
+ return 0;
+ }
+
+ /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */
+-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
+ {
+- U32 offCode = STORE_OFFSET(rawOffset);
++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset);
+
+ if (!ll0 && rawOffset == rep[0]) {
+- offCode = STORE_REPCODE_1;
++ offBase = REPCODE1_TO_OFFBASE;
+ } else if (rawOffset == rep[1]) {
+- offCode = STORE_REPCODE(2 - ll0);
++ offBase = REPCODE_TO_OFFBASE(2 - ll0);
+ } else if (rawOffset == rep[2]) {
+- offCode = STORE_REPCODE(3 - ll0);
++ offBase = REPCODE_TO_OFFBASE(3 - ll0);
+ } else if (ll0 && rawOffset == rep[0] - 1) {
+- offCode = STORE_REPCODE_3;
++ offBase = REPCODE3_TO_OFFBASE;
+ }
+- return offCode;
++ return offBase;
+ }
+
+-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
+- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
+- */
+-static size_t
++size_t
+ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
+ ZSTD_sequencePosition* seqPos,
+ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+- const void* src, size_t blockSize)
++ const void* src, size_t blockSize,
++ ZSTD_paramSwitch_e externalRepSearch)
+ {
+ U32 idx = seqPos->idx;
++ U32 const startIdx = idx;
+ BYTE const* ip = (BYTE const*)(src);
+ const BYTE* const iend = ip + blockSize;
+ repcodes_t updatedRepcodes;
+ U32 dictSize;
+
++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize);
++
+ if (cctx->cdict) {
+ dictSize = (U32)cctx->cdict->dictContentSize;
+ } else if (cctx->prefixDict.dict) {
+@@ -5615,25 +6200,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
+ dictSize = 0;
+ }
+ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
+- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) {
++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) {
+ U32 const litLength = inSeqs[idx].litLength;
+- U32 const ll0 = (litLength == 0);
+ U32 const matchLength = inSeqs[idx].matchLength;
+- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
+- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
++ U32 offBase;
+
+- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
++ if (externalRepSearch == ZSTD_ps_disable) {
++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset);
++ } else {
++ U32 const ll0 = (litLength == 0);
++ offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
++ }
++
++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
+ if (cctx->appliedParams.validateSequences) {
+ seqPos->posInSrc += litLength + matchLength;
+- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
+- cctx->appliedParams.cParams.windowLog, dictSize),
++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
++ cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
+ "Sequence validation failed");
+ }
+- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
+ "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
+- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
+ ip += matchLength + litLength;
+ }
++
++ /* If we skipped repcode search while parsing, we need to update repcodes now */
++ assert(externalRepSearch != ZSTD_ps_auto);
++ assert(idx >= startIdx);
++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) {
++ U32* const rep = updatedRepcodes.rep;
++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */
++
++ if (lastSeqIdx >= startIdx + 2) {
++ rep[2] = inSeqs[lastSeqIdx - 2].offset;
++ rep[1] = inSeqs[lastSeqIdx - 1].offset;
++ rep[0] = inSeqs[lastSeqIdx].offset;
++ } else if (lastSeqIdx == startIdx + 1) {
++ rep[2] = rep[0];
++ rep[1] = inSeqs[lastSeqIdx - 1].offset;
++ rep[0] = inSeqs[lastSeqIdx].offset;
++ } else {
++ assert(lastSeqIdx == startIdx);
++ rep[2] = rep[1];
++ rep[1] = rep[0];
++ rep[0] = inSeqs[lastSeqIdx].offset;
++ }
++ }
++
+ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t));
+
+ if (inSeqs[idx].litLength) {
+@@ -5642,26 +6257,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
+ ip += inSeqs[idx].litLength;
+ seqPos->posInSrc += inSeqs[idx].litLength;
+ }
+- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!");
++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!");
+ seqPos->idx = idx+1;
+ return 0;
+ }
+
+-/* Returns the number of bytes to move the current read position back by. Only non-zero
+- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something
+- * went wrong.
+- *
+- * This function will attempt to scan through blockSize bytes represented by the sequences
+- * in inSeqs, storing any (partial) sequences.
+- *
+- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
+- * avoid splitting a match, or to avoid splitting a match such that it would produce a match
+- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
+- */
+-static size_t
++size_t
+ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
+ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+- const void* src, size_t blockSize)
++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch)
+ {
+ U32 idx = seqPos->idx;
+ U32 startPosInSequence = seqPos->posInSequence;
+@@ -5673,6 +6277,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
+ U32 bytesAdjustment = 0;
+ U32 finalMatchSplit = 0;
+
++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */
++ (void)externalRepSearch;
++
+ if (cctx->cdict) {
+ dictSize = cctx->cdict->dictContentSize;
+ } else if (cctx->prefixDict.dict) {
+@@ -5680,7 +6287,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
+ } else {
+ dictSize = 0;
+ }
+- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize);
++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize);
+ DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
+ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
+ while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) {
+@@ -5688,7 +6295,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
+ U32 litLength = currSeq.litLength;
+ U32 matchLength = currSeq.matchLength;
+ U32 const rawOffset = currSeq.offset;
+- U32 offCode;
++ U32 offBase;
+
+ /* Modify the sequence depending on where endPosInSequence lies */
+ if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) {
+@@ -5702,7 +6309,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
+ /* Move to the next sequence */
+ endPosInSequence -= currSeq.litLength + currSeq.matchLength;
+ startPosInSequence = 0;
+- idx++;
+ } else {
+ /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence
+ does not reach the end of the match. So, we have to split the sequence */
+@@ -5742,21 +6348,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
+ }
+ /* Check if this offset can be represented with a repcode */
+ { U32 const ll0 = (litLength == 0);
+- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0);
+- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0);
++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
+ }
+
+ if (cctx->appliedParams.validateSequences) {
+ seqPos->posInSrc += litLength + matchLength;
+- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
+- cctx->appliedParams.cParams.windowLog, dictSize),
++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
++ cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
+ "Sequence validation failed");
+ }
+- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
+- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
+ "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
+- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
+ ip += matchLength + litLength;
++ if (!finalMatchSplit)
++ idx++; /* Next Sequence */
+ }
+ DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
+ assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength);
+@@ -5779,7 +6387,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
+
+ typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
+ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+- const void* src, size_t blockSize);
++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
+ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
+ {
+ ZSTD_sequenceCopier sequenceCopier = NULL;
+@@ -5793,6 +6401,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
+ return sequenceCopier;
+ }
+
++/* Discover the size of next block by searching for the delimiter.
++ * Note that a block delimiter **must** exist in this mode,
++ * otherwise it's an input error.
++ * The block size retrieved will be later compared to ensure it remains within bounds */
++static size_t
++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos)
++{
++ int end = 0;
++ size_t blockSize = 0;
++ size_t spos = seqPos.idx;
++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize);
++ assert(spos <= inSeqsSize);
++ while (spos < inSeqsSize) {
++ end = (inSeqs[spos].offset == 0);
++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength;
++ if (end) {
++ if (inSeqs[spos].matchLength != 0)
++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0");
++ break;
++ }
++ spos++;
++ }
++ if (!end)
++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter");
++ return blockSize;
++}
++
++/* More a "target" block size */
++static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining)
++{
++ int const lastBlock = (remaining <= blockSize);
++ return lastBlock ? remaining : blockSize;
++}
++
++static size_t determine_blockSize(ZSTD_sequenceFormat_e mode,
++ size_t blockSize, size_t remaining,
++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos)
++{
++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining);
++ if (mode == ZSTD_sf_noBlockDelimiters)
++ return blockSize_noDelimiter(blockSize, remaining);
++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos);
++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters");
++ if (explicitBlockSize > blockSize)
++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block");
++ if (explicitBlockSize > remaining)
++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source");
++ return explicitBlockSize;
++ }
++}
++
+ /* Compress, block-by-block, all of the sequences given.
+ *
+ * Returns the cumulative size of all compressed blocks (including their headers),
+@@ -5805,9 +6464,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+ const void* src, size_t srcSize)
+ {
+ size_t cSize = 0;
+- U32 lastBlock;
+- size_t blockSize;
+- size_t compressedSeqsSize;
+ size_t remaining = srcSize;
+ ZSTD_sequencePosition seqPos = {0, 0, 0};
+
+@@ -5827,22 +6483,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+ }
+
+ while (remaining) {
++ size_t compressedSeqsSize;
+ size_t cBlockSize;
+ size_t additionalByteAdjustment;
+- lastBlock = remaining <= cctx->blockSize;
+- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize;
++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters,
++ cctx->blockSize, remaining,
++ inSeqs, inSeqsSize, seqPos);
++ U32 const lastBlock = (blockSize == remaining);
++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size");
++ assert(blockSize <= remaining);
+ ZSTD_resetSeqStore(&cctx->seqStore);
+- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize);
++ DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize);
+
+- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize);
++ additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes);
+ FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy");
+ blockSize -= additionalByteAdjustment;
+
+ /* If blocks are too small, emit as a nocompress block */
+- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
++ * additional 1. We need to revisit and change this logic to be more consistent */
++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) {
+ cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
+ FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed");
+- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize);
++ DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize);
+ cSize += cBlockSize;
+ ip += blockSize;
+ op += cBlockSize;
+@@ -5851,6 +6514,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+ continue;
+ }
+
++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block");
+ compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore,
+ &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy,
+ &cctx->appliedParams,
+@@ -5859,11 +6523,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+ cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
+ cctx->bmi2);
+ FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed");
+- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize);
++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize);
+
+ if (!cctx->isFirstBlock &&
+ ZSTD_maybeRLE(&cctx->seqStore) &&
+- ZSTD_isRLE((BYTE const*)src, srcSize)) {
++ ZSTD_isRLE(ip, blockSize)) {
+ /* We don't want to emit our first block as a RLE even if it qualifies because
+ * doing so will cause the decoder (cli only) to throw a "should consume all input error."
+ * This is only an issue for zstd <= v1.4.3
+@@ -5874,12 +6538,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+ if (compressedSeqsSize == 0) {
+ /* ZSTD_noCompressBlock writes the block header as well */
+ cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
+- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed");
+- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize);
++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed");
++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize);
+ } else if (compressedSeqsSize == 1) {
+ cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock);
+- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed");
+- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize);
++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed");
++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize);
+ } else {
+ U32 cBlockHeader;
+ /* Error checking and repcodes update */
+@@ -5891,11 +6555,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+ cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3);
+ MEM_writeLE24(op, cBlockHeader);
+ cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize;
+- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize);
++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize);
+ }
+
+ cSize += cBlockSize;
+- DEBUGLOG(4, "cSize running total: %zu", cSize);
+
+ if (lastBlock) {
+ break;
+@@ -5906,12 +6569,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+ dstCapacity -= cBlockSize;
+ cctx->isFirstBlock = 0;
+ }
++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity);
+ }
+
++ DEBUGLOG(4, "cSize final total: %zu", cSize);
+ return cSize;
+ }
+
+-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity,
++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx,
++ void* dst, size_t dstCapacity,
+ const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
+ const void* src, size_t srcSize)
+ {
+@@ -5921,7 +6587,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
+ size_t frameHeaderSize = 0;
+
+ /* Transparent initialization stage, same as compressStream2() */
+- DEBUGLOG(3, "ZSTD_compressSequences()");
++ DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity);
+ assert(cctx != NULL);
+ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed");
+ /* Begin writing output, starting with frame header */
+@@ -5949,26 +6615,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
+ cSize += 4;
+ }
+
+- DEBUGLOG(3, "Final compressed size: %zu", cSize);
++ DEBUGLOG(4, "Final compressed size: %zu", cSize);
+ return cSize;
+ }
+
+ /*====== Finalize ======*/
+
++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs)
++{
++ const ZSTD_inBuffer nullInput = { NULL, 0, 0 };
++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable);
++ return stableInput ? zcs->expectedInBuffer : nullInput;
++}
++
+ /*! ZSTD_flushStream() :
+ * @return : amount of data remaining to flush */
+ size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+ {
+- ZSTD_inBuffer input = { NULL, 0, 0 };
++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs);
++ input.size = input.pos; /* do not ingest more input during flush */
+ return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush);
+ }
+
+
+ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+ {
+- ZSTD_inBuffer input = { NULL, 0, 0 };
++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs);
+ size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end);
+- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed");
++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed");
+ if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */
+ /* single thread mode : attempt to calculate remaining to flush more precisely */
+ { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE;
+@@ -6090,7 +6764,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel,
+ cp.targetLength = (unsigned)(-clampedCompressionLevel);
+ }
+ /* refine parameters based on srcSize & dictSize */
+- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode);
++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto);
+ }
+ }
+
+@@ -6125,3 +6799,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH
+ if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
+ return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown);
+ }
++
++void ZSTD_registerSequenceProducer(
++ ZSTD_CCtx* zc, void* mState,
++ ZSTD_sequenceProducer_F* mFinder
++) {
++ if (mFinder != NULL) {
++ ZSTD_externalMatchCtx emctx;
++ emctx.mState = mState;
++ emctx.mFinder = mFinder;
++ emctx.seqBuffer = NULL;
++ emctx.seqBufferCapacity = 0;
++ zc->externalMatchCtx = emctx;
++ zc->requestedParams.useSequenceProducer = 1;
++ } else {
++ ZSTD_memset(&zc->externalMatchCtx, 0, sizeof(zc->externalMatchCtx));
++ zc->requestedParams.useSequenceProducer = 0;
++ }
++}
+diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h
+index 71697a11a..899f5e2de 100644
+--- a/lib/zstd/compress/zstd_compress_internal.h
++++ b/lib/zstd/compress/zstd_compress_internal.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -20,6 +21,7 @@
+ ***************************************/
+ #include "../common/zstd_internal.h"
+ #include "zstd_cwksp.h"
++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */
+
+
+ /*-*************************************
+@@ -111,12 +113,13 @@ typedef struct {
+ /* ZSTD_buildBlockEntropyStats() :
+ * Builds entropy for the block.
+ * @return : 0 on success or error code */
+-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
+- const ZSTD_entropyCTables_t* prevEntropy,
+- ZSTD_entropyCTables_t* nextEntropy,
+- const ZSTD_CCtx_params* cctxParams,
+- ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+- void* workspace, size_t wkspSize);
++size_t ZSTD_buildBlockEntropyStats(
++ const seqStore_t* seqStorePtr,
++ const ZSTD_entropyCTables_t* prevEntropy,
++ ZSTD_entropyCTables_t* nextEntropy,
++ const ZSTD_CCtx_params* cctxParams,
++ ZSTD_entropyCTablesMetadata_t* entropyMetadata,
++ void* workspace, size_t wkspSize);
+
+ /* *******************************
+ * Compression internals structs *
+@@ -142,6 +145,12 @@ typedef struct {
+ size_t capacity; /* The capacity starting from `seq` pointer */
+ } rawSeqStore_t;
+
++typedef struct {
++ U32 idx; /* Index in array of ZSTD_Sequence */
++ U32 posInSequence; /* Position within sequence at idx */
++ size_t posInSrc; /* Number of bytes given by sequences provided so far */
++} ZSTD_sequencePosition;
++
+ UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0};
+
+ typedef struct {
+@@ -212,8 +221,10 @@ struct ZSTD_matchState_t {
+ U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */
+
+ U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/
+- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */
++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */
+ U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */
++ U64 hashSalt; /* For row-based matchFinder: salts the hash for re-use of tag table */
++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */
+
+ U32* hashTable;
+ U32* hashTable3;
+@@ -228,6 +239,18 @@ struct ZSTD_matchState_t {
+ const ZSTD_matchState_t* dictMatchState;
+ ZSTD_compressionParameters cParams;
+ const rawSeqStore_t* ldmSeqStore;
++
++ /* Controls prefetching in some dictMatchState matchfinders.
++ * This behavior is controlled from the cctx ms.
++ * This parameter has no effect in the cdict ms. */
++ int prefetchCDictTables;
++
++ /* When == 0, lazy match finders insert every position.
++ * When != 0, lazy match finders only insert positions they search.
++ * This allows them to skip much faster over incompressible data,
++ * at a small cost to compression ratio.
++ */
++ int lazySkipping;
+ };
+
+ typedef struct {
+@@ -324,6 +347,24 @@ struct ZSTD_CCtx_params_s {
+
+ /* Internal use, for createCCtxParams() and freeCCtxParams() only */
+ ZSTD_customMem customMem;
++
++ /* Controls prefetching in some dictMatchState matchfinders */
++ ZSTD_paramSwitch_e prefetchCDictTables;
++
++ /* Controls whether zstd will fall back to an internal matchfinder
++ * if the external matchfinder returns an error code. */
++ int enableMatchFinderFallback;
++
++ /* Indicates whether an external matchfinder has been referenced.
++ * Users can't set this externally.
++ * It is set internally in ZSTD_registerSequenceProducer(). */
++ int useSequenceProducer;
++
++ /* Adjust the max block size*/
++ size_t maxBlockSize;
++
++ /* Controls repcode search in external sequence parsing */
++ ZSTD_paramSwitch_e searchForExternalRepcodes;
+ }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
+
+ #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2))
+@@ -355,6 +396,14 @@ typedef struct {
+ ZSTD_entropyCTablesMetadata_t entropyMetadata;
+ } ZSTD_blockSplitCtx;
+
++/* Context for block-level external matchfinder API */
++typedef struct {
++ void* mState;
++ ZSTD_sequenceProducer_F* mFinder;
++ ZSTD_Sequence* seqBuffer;
++ size_t seqBufferCapacity;
++} ZSTD_externalMatchCtx;
++
+ struct ZSTD_CCtx_s {
+ ZSTD_compressionStage_e stage;
+ int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
+@@ -404,6 +453,7 @@ struct ZSTD_CCtx_s {
+
+ /* Stable in/out buffer verification */
+ ZSTD_inBuffer expectedInBuffer;
++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */
+ size_t expectedOutBufferSize;
+
+ /* Dictionary */
+@@ -417,9 +467,13 @@ struct ZSTD_CCtx_s {
+
+ /* Workspace for block splitter */
+ ZSTD_blockSplitCtx blockSplitCtx;
++
++ /* Workspace for external matchfinder */
++ ZSTD_externalMatchCtx externalMatchCtx;
+ };
+
+ typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e;
+
+ typedef enum {
+ ZSTD_noDict = 0,
+@@ -441,7 +495,7 @@ typedef enum {
+ * In this mode we take both the source size and the dictionary size
+ * into account when selecting and adjusting the parameters.
+ */
+- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams.
++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams.
+ * We don't know what these parameters are for. We default to the legacy
+ * behavior of taking both the source size and the dict size into account
+ * when selecting and adjusting parameters.
+@@ -500,9 +554,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value)
+ /* ZSTD_noCompressBlock() :
+ * Writes uncompressed block to dst buffer from given src.
+ * Returns the size of the block */
+-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
++MEM_STATIC size_t
++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
+ {
+ U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3);
++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity);
+ RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity,
+ dstSize_tooSmall, "dst buf too small for uncompressed block");
+ MEM_writeLE24(dst, cBlockHeader24);
+@@ -510,7 +566,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi
+ return ZSTD_blockHeaderSize + srcSize;
+ }
+
+-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
++MEM_STATIC size_t
++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
+ {
+ BYTE* const op = (BYTE*)dst;
+ U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3);
+@@ -529,7 +586,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
+ {
+ U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6;
+ ZSTD_STATIC_ASSERT(ZSTD_btultra == 8);
+- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat));
++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat));
+ return (srcSize >> minlog) + 2;
+ }
+
+@@ -565,29 +622,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con
+ while (ip < iend) *op++ = *ip++;
+ }
+
+-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1)
+-#define STORE_REPCODE_1 STORE_REPCODE(1)
+-#define STORE_REPCODE_2 STORE_REPCODE(2)
+-#define STORE_REPCODE_3 STORE_REPCODE(3)
+-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1)
+-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE)
+-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE)
+-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE)
+-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE)
+-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */
+-#define STORED_TO_OFFBASE(o) ((o)+1)
+-#define OFFBASE_TO_STORED(o) ((o)-1)
++
++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1)
++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2)
++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3)
++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */
++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM)
++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM)
++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM)
++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM)
++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */
+
+ /*! ZSTD_storeSeq() :
+- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t.
+- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET().
++ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t.
++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE().
+ * @matchLength : must be >= MINMATCH
+- * Allowed to overread literals up to litLimit.
++ * Allowed to over-read literals up to litLimit.
+ */
+ HINT_INLINE UNUSED_ATTR void
+ ZSTD_storeSeq(seqStore_t* seqStorePtr,
+ size_t litLength, const BYTE* literals, const BYTE* litLimit,
+- U32 offBase_minus1,
++ U32 offBase,
+ size_t matchLength)
+ {
+ BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH;
+@@ -596,8 +651,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
+ static const BYTE* g_start = NULL;
+ if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */
+ { U32 const pos = (U32)((const BYTE*)literals - g_start);
+- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u",
+- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1);
++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u",
++ pos, (U32)litLength, (U32)matchLength, (U32)offBase);
+ }
+ #endif
+ assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
+@@ -607,9 +662,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
+ assert(literals + litLength <= litLimit);
+ if (litEnd <= litLimit_w) {
+ /* Common case we can use wildcopy.
+- * First copy 16 bytes, because literals are likely short.
+- */
+- assert(WILDCOPY_OVERLENGTH >= 16);
++ * First copy 16 bytes, because literals are likely short.
++ */
++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16);
+ ZSTD_copy16(seqStorePtr->lit, literals);
+ if (litLength > 16) {
+ ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap);
+@@ -628,7 +683,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
+ seqStorePtr->sequences[0].litLength = (U16)litLength;
+
+ /* match offset */
+- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1);
++ seqStorePtr->sequences[0].offBase = offBase;
+
+ /* match Length */
+ assert(matchLength >= MINMATCH);
+@@ -646,17 +701,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
+
+ /* ZSTD_updateRep() :
+ * updates in-place @rep (array of repeat offsets)
+- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq()
++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq()
+ */
+ MEM_STATIC void
+-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
+ {
+- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */
++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */
+ rep[2] = rep[1];
+ rep[1] = rep[0];
+- rep[0] = STORED_OFFSET(offBase_minus1);
++ rep[0] = OFFBASE_TO_OFFSET(offBase);
+ } else { /* repcode */
+- U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0;
++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0;
+ if (repCode > 0) { /* note : if repCode==0, no change */
+ U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+ rep[2] = (repCode >= 2) ? rep[1] : rep[2];
+@@ -673,11 +728,11 @@ typedef struct repcodes_s {
+ } repcodes_t;
+
+ MEM_STATIC repcodes_t
+-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
+ {
+ repcodes_t newReps;
+ ZSTD_memcpy(&newReps, rep, sizeof(newReps));
+- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0);
++ ZSTD_updateRep(newReps.rep, offBase, ll0);
+ return newReps;
+ }
+
+@@ -685,59 +740,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0
+ /*-*************************************
+ * Match length counter
+ ***************************************/
+-static unsigned ZSTD_NbCommonBytes (size_t val)
+-{
+- if (MEM_isLittleEndian()) {
+- if (MEM_64bits()) {
+-# if (__GNUC__ >= 4)
+- return (__builtin_ctzll((U64)val) >> 3);
+-# else
+- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2,
+- 0, 3, 1, 3, 1, 4, 2, 7,
+- 0, 2, 3, 6, 1, 5, 3, 5,
+- 1, 3, 4, 4, 2, 5, 6, 7,
+- 7, 0, 1, 2, 3, 3, 4, 6,
+- 2, 6, 5, 5, 3, 4, 5, 6,
+- 7, 1, 2, 4, 6, 4, 4, 5,
+- 7, 2, 6, 5, 7, 6, 7, 7 };
+- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
+-# endif
+- } else { /* 32 bits */
+-# if (__GNUC__ >= 3)
+- return (__builtin_ctz((U32)val) >> 3);
+-# else
+- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0,
+- 3, 2, 2, 1, 3, 2, 0, 1,
+- 3, 3, 1, 2, 2, 2, 2, 0,
+- 3, 1, 2, 0, 1, 0, 1, 1 };
+- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+-# endif
+- }
+- } else { /* Big Endian CPU */
+- if (MEM_64bits()) {
+-# if (__GNUC__ >= 4)
+- return (__builtin_clzll(val) >> 3);
+-# else
+- unsigned r;
+- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
+- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
+- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+- r += (!val);
+- return r;
+-# endif
+- } else { /* 32 bits */
+-# if (__GNUC__ >= 3)
+- return (__builtin_clz((U32)val) >> 3);
+-# else
+- unsigned r;
+- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+- r += (!val);
+- return r;
+-# endif
+- } }
+-}
+-
+-
+ MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)
+ {
+ const BYTE* const pStart = pIn;
+@@ -783,32 +785,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
+ * Hashes
+ ***************************************/
+ static const U32 prime3bytes = 506832829U;
+-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; }
+-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; }
++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */
++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); }
+
+ static const U32 prime4bytes = 2654435761U;
+-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
+-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }
++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; }
++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); }
++static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); }
+
+ static const U64 prime5bytes = 889523592379ULL;
+-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; }
+-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; }
++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); }
++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); }
+
+ static const U64 prime6bytes = 227718039650203ULL;
+-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; }
+-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; }
++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); }
++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); }
+
+ static const U64 prime7bytes = 58295818150454627ULL;
+-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; }
+-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; }
++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); }
++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); }
+
+ static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
+-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; }
++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); }
++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); }
++
+
+ MEM_STATIC FORCE_INLINE_ATTR
+ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
+ {
++ /* Although some of these hashes do support hBits up to 64, some do not.
++ * To be on the safe side, always avoid hBits > 32. */
++ assert(hBits <= 32);
++
+ switch(mls)
+ {
+ default:
+@@ -820,6 +833,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
+ }
+ }
+
++MEM_STATIC FORCE_INLINE_ATTR
++size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) {
++ /* Although some of these hashes do support hBits up to 64, some do not.
++ * To be on the safe side, always avoid hBits > 32. */
++ assert(hBits <= 32);
++
++ switch(mls)
++ {
++ default:
++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt);
++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt);
++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt);
++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt);
++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt);
++ }
++}
++
++
+ /* ZSTD_ipow() :
+ * Return base^exponent.
+ */
+@@ -1167,10 +1198,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window,
+ (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
+ assert(blockEndIdx >= loadedDictEnd);
+
+- if (blockEndIdx > loadedDictEnd + maxDist) {
++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) {
+ /* On reaching window size, dictionaries are invalidated.
+ * For simplification, if window size is reached anywhere within next block,
+ * the dictionary is invalidated for the full block.
++ *
++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected
++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit.
++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use
++ * dictMatchState, so setting it to NULL is not a problem.
+ */
+ DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)");
+ *loadedDictEndPtr = 0;
+@@ -1302,6 +1338,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
+
+ #endif
+
++/* Short Cache */
++
++/* Normally, zstd matchfinders follow this flow:
++ * 1. Compute hash at ip
++ * 2. Load index from hashTable[hash]
++ * 3. Check if *ip == *(base + index)
++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss.
++ *
++ * Short cache is an optimization which allows us to avoid step 3 most of the time
++ * when the data doesn't actually match. With short cache, the flow becomes:
++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip.
++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works.
++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue.
++ *
++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to
++ * dictMatchState matchfinders.
++ */
++#define ZSTD_SHORT_CACHE_TAG_BITS 8
++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1)
++
++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable.
++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */
++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) {
++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK);
++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0);
++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag;
++}
++
++/* Helper function for short cache matchfinders.
++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */
++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) {
++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK;
++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK;
++ return tag1 == tag2;
++}
+
+
+ /* ===============================================================
+@@ -1396,4 +1468,51 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
+ */
+ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize);
+
++/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
++ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
++ * Note that the block delimiter must include the last literals of the block.
++ */
++size_t
++ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
++ ZSTD_sequencePosition* seqPos,
++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
++
++/* Returns the number of bytes to move the current read position back by.
++ * Only non-zero if we ended up splitting a sequence.
++ * Otherwise, it may return a ZSTD error if something went wrong.
++ *
++ * This function will attempt to scan through blockSize bytes
++ * represented by the sequences in @inSeqs,
++ * storing any (partial) sequences.
++ *
++ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
++ * avoid splitting a match, or to avoid splitting a match such that it would produce a match
++ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
++ */
++size_t
++ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
++
++
++/* ===============================================================
++ * Deprecated definitions that are still used internally to avoid
++ * deprecation warnings. These functions are exactly equivalent to
++ * their public variants, but avoid the deprecation warnings.
++ * =============================================================== */
++
++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
++
++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx,
++ void* dst, size_t dstCapacity,
++ const void* src, size_t srcSize);
++
++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx,
++ void* dst, size_t dstCapacity,
++ const void* src, size_t srcSize);
++
++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
++
++
+ #endif /* ZSTD_COMPRESS_H */
+diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c
+index 52b0a8059..3e9ea46a6 100644
+--- a/lib/zstd/compress/zstd_compress_literals.c
++++ b/lib/zstd/compress/zstd_compress_literals.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -13,11 +14,36 @@
+ ***************************************/
+ #include "zstd_compress_literals.h"
+
++
++/* **************************************************************
++* Debug Traces
++****************************************************************/
++#if DEBUGLEVEL >= 2
++
++static size_t showHexa(const void* src, size_t srcSize)
++{
++ const BYTE* const ip = (const BYTE*)src;
++ size_t u;
++ for (u=0; u<srcSize; u++) {
++ RAWLOG(5, " %02X", ip[u]); (void)ip;
++ }
++ RAWLOG(5, " \n");
++ return srcSize;
++}
++
++#endif
++
++
++/* **************************************************************
++* Literals compression - special cases
++****************************************************************/
+ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+ {
+ BYTE* const ostart = (BYTE*)dst;
+ U32 const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity);
++
+ RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, "");
+
+ switch(flSize)
+@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src,
+ }
+
+ ZSTD_memcpy(ostart + flSize, src, srcSize);
+- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
+ return srcSize + flSize;
+ }
+
++static int allBytesIdentical(const void* src, size_t srcSize)
++{
++ assert(srcSize >= 1);
++ assert(src != NULL);
++ { const BYTE b = ((const BYTE*)src)[0];
++ size_t p;
++ for (p=1; p<srcSize; p++) {
++ if (((const BYTE*)src)[p] != b) return 0;
++ }
++ return 1;
++ }
++}
++
+ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+ {
+ BYTE* const ostart = (BYTE*)dst;
+ U32 const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */
++ assert(dstCapacity >= 4); (void)dstCapacity;
++ assert(allBytesIdentical(src, srcSize));
+
+ switch(flSize)
+ {
+@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void*
+ }
+
+ ostart[flSize] = *(const BYTE*)src;
+- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1);
++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1);
+ return flSize+1;
+ }
+
+-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
+- ZSTD_hufCTables_t* nextHuf,
+- ZSTD_strategy strategy, int disableLiteralCompression,
+- void* dst, size_t dstCapacity,
+- const void* src, size_t srcSize,
+- void* entropyWorkspace, size_t entropyWorkspaceSize,
+- const int bmi2,
+- unsigned suspectUncompressible)
++/* ZSTD_minLiteralsToCompress() :
++ * returns minimal amount of literals
++ * for literal compression to even be attempted.
++ * Minimum is made tighter as compression strategy increases.
++ */
++static size_t
++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat)
++{
++ assert((int)strategy >= 0);
++ assert((int)strategy <= 9);
++ /* btultra2 : min 8 bytes;
++ * then 2x larger for each successive compression strategy
++ * max threshold 64 bytes */
++ { int const shift = MIN(9-(int)strategy, 3);
++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift;
++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc);
++ return mintc;
++ }
++}
++
++size_t ZSTD_compressLiterals (
++ void* dst, size_t dstCapacity,
++ const void* src, size_t srcSize,
++ void* entropyWorkspace, size_t entropyWorkspaceSize,
++ const ZSTD_hufCTables_t* prevHuf,
++ ZSTD_hufCTables_t* nextHuf,
++ ZSTD_strategy strategy,
++ int disableLiteralCompression,
++ int suspectUncompressible,
++ int bmi2)
+ {
+- size_t const minGain = ZSTD_minGain(srcSize, strategy);
+ size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
+ BYTE* const ostart = (BYTE*)dst;
+ U32 singleStream = srcSize < 256;
+ symbolEncodingType_e hType = set_compressed;
+ size_t cLitSize;
+
+- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)",
+- disableLiteralCompression, (U32)srcSize);
++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)",
++ disableLiteralCompression, (U32)srcSize, dstCapacity);
++
++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize));
+
+ /* Prepare nextEntropy assuming reusing the existing table */
+ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
+ if (disableLiteralCompression)
+ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+
+- /* small ? don't even attempt compression (speed opt) */
+-# define COMPRESS_LITERALS_SIZE_MIN 63
+- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
+- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+- }
++ /* if too small, don't even attempt compression (speed opt) */
++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode))
++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+
+ RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression");
+ { HUF_repeat repeat = prevHuf->repeatMode;
+- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
++ int const flags = 0
++ | (bmi2 ? HUF_flags_bmi2 : 0)
++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0)
++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0)
++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0);
++
++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int);
++ huf_compress_f huf_compress;
+ if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
+- cLitSize = singleStream ?
+- HUF_compress1X_repeat(
+- ostart+lhSize, dstCapacity-lhSize, src, srcSize,
+- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
+- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) :
+- HUF_compress4X_repeat(
+- ostart+lhSize, dstCapacity-lhSize, src, srcSize,
+- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
+- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible);
++ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat;
++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize,
++ src, srcSize,
++ HUF_SYMBOLVALUE_MAX, LitHufLog,
++ entropyWorkspace, entropyWorkspaceSize,
++ (HUF_CElt*)nextHuf->CTable,
++ &repeat, flags);
++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize);
+ if (repeat != HUF_repeat_none) {
+ /* reused the existing table */
+- DEBUGLOG(5, "Reusing previous huffman table");
++ DEBUGLOG(5, "reusing statistics from previous huffman block");
+ hType = set_repeat;
+ }
+ }
+
+- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
+- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+- }
++ { size_t const minGain = ZSTD_minGain(srcSize, strategy);
++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
++ } }
+ if (cLitSize==1) {
+- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
+- }
++ /* A return value of 1 signals that the alphabet consists of a single symbol.
++ * However, in some rare circumstances, it could be the compressed size (a single byte).
++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`.
++ * (it's also necessary to not generate statistics).
++ * Therefore, in such a case, actively check that all bytes are identical. */
++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) {
++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
++ } }
+
+ if (hType == set_compressed) {
+ /* using a newly constructed table */
+@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
+ switch(lhSize)
+ {
+ case 3: /* 2 - 2 - 10 - 10 */
+- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
+ MEM_writeLE24(ostart, lhc);
+ break;
+ }
+ case 4: /* 2 - 2 - 14 - 14 */
++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
+ { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18);
+ MEM_writeLE32(ostart, lhc);
+ break;
+ }
+ case 5: /* 2 - 2 - 18 - 18 */
++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
+ { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22);
+ MEM_writeLE32(ostart, lhc);
+ ostart[4] = (BYTE)(cLitSize >> 10);
+diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h
+index 9775fb97c..a2a85d6b6 100644
+--- a/lib/zstd/compress/zstd_compress_literals.h
++++ b/lib/zstd/compress/zstd_compress_literals.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -16,16 +17,24 @@
+
+ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
++/* ZSTD_compressRleLiteralsBlock() :
++ * Conditions :
++ * - All bytes in @src are identical
++ * - dstCapacity >= 4 */
+ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
+-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
+- ZSTD_hufCTables_t* nextHuf,
+- ZSTD_strategy strategy, int disableLiteralCompression,
+- void* dst, size_t dstCapacity,
++/* ZSTD_compressLiterals():
++ * @entropyWorkspace: must be aligned on 4-byte boundaries
++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE
++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding
++ */
++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ void* entropyWorkspace, size_t entropyWorkspaceSize,
+- const int bmi2,
+- unsigned suspectUncompressible);
++ const ZSTD_hufCTables_t* prevHuf,
++ ZSTD_hufCTables_t* nextHuf,
++ ZSTD_strategy strategy, int disableLiteralCompression,
++ int suspectUncompressible,
++ int bmi2);
+
+ #endif /* ZSTD_COMPRESS_LITERALS_H */
+diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c
+index 21ddc1b37..5c028c78d 100644
+--- a/lib/zstd/compress/zstd_compress_sequences.c
++++ b/lib/zstd/compress/zstd_compress_sequences.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq)
+ {
+ /* Heuristic: This should cover most blocks <= 16K and
+ * start to fade out after 16K to about 32K depending on
+- * comprssibility.
++ * compressibility.
+ */
+ return nbSeq >= 2048;
+ }
+@@ -166,7 +167,7 @@ ZSTD_selectEncodingType(
+ if (mostFrequent == nbSeq) {
+ *repeatMode = FSE_repeat_none;
+ if (isDefaultAllowed && nbSeq <= 2) {
+- /* Prefer set_basic over set_rle when there are 2 or less symbols,
++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols,
+ * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
+ * If basic encoding isn't possible, always choose RLE.
+ */
+diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h
+index 7991364c2..7fe6f4ff5 100644
+--- a/lib/zstd/compress/zstd_compress_sequences.h
++++ b/lib/zstd/compress/zstd_compress_sequences.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c
+index 17d836cc8..dbacbaf72 100644
+--- a/lib/zstd/compress/zstd_compress_superblock.c
++++ b/lib/zstd/compress/zstd_compress_superblock.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -36,13 +37,14 @@
+ * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block
+ * and the following sub-blocks' literals sections will be Treeless_Literals_Block.
+ * @return : compressed size of literals section of a sub-block
+- * Or 0 if it unable to compress.
++ * Or 0 if unable to compress.
+ * Or error code */
+-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+- const ZSTD_hufCTablesMetadata_t* hufMetadata,
+- const BYTE* literals, size_t litSize,
+- void* dst, size_t dstSize,
+- const int bmi2, int writeEntropy, int* entropyWritten)
++static size_t
++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
++ const ZSTD_hufCTablesMetadata_t* hufMetadata,
++ const BYTE* literals, size_t litSize,
++ void* dst, size_t dstSize,
++ const int bmi2, int writeEntropy, int* entropyWritten)
+ {
+ size_t const header = writeEntropy ? 200 : 0;
+ size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header));
+@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+ symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat;
+ size_t cLitSize = 0;
+
+- (void)bmi2; /* TODO bmi2... */
+-
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy);
+
+ *entropyWritten = 0;
+@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize);
+ }
+
+- /* TODO bmi2 */
+- { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable)
+- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable);
++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0;
++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable, flags)
++ : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable, flags);
+ op += cSize;
+ cLitSize += cSize;
+ if (cSize == 0 || ERR_isError(cSize)) {
+@@ -126,7 +126,11 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+ return op-ostart;
+ }
+
+-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) {
++static size_t
++ZSTD_seqDecompressedSize(seqStore_t const* seqStore,
++ const seqDef* sequences, size_t nbSeq,
++ size_t litSize, int lastSequence)
++{
+ const seqDef* const sstart = sequences;
+ const seqDef* const send = sequences + nbSeq;
+ const seqDef* sp = sstart;
+@@ -156,13 +160,14 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef*
+ * @return : compressed size of sequences section of a sub-block
+ * Or 0 if it is unable to compress
+ * Or error code. */
+-static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
+- const ZSTD_fseCTablesMetadata_t* fseMetadata,
+- const seqDef* sequences, size_t nbSeq,
+- const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+- const ZSTD_CCtx_params* cctxParams,
+- void* dst, size_t dstCapacity,
+- const int bmi2, int writeEntropy, int* entropyWritten)
++static size_t
++ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
++ const ZSTD_fseCTablesMetadata_t* fseMetadata,
++ const seqDef* sequences, size_t nbSeq,
++ const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
++ const ZSTD_CCtx_params* cctxParams,
++ void* dst, size_t dstCapacity,
++ const int bmi2, int writeEntropy, int* entropyWritten)
+ {
+ const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+ BYTE* const ostart = (BYTE*)dst;
+@@ -539,7 +544,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+ repcodes_t rep;
+ ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));
+ for (seq = sstart; seq < sp; ++seq) {
+- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
+ }
+ ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));
+ }
+diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h
+index 224ece795..826bbc9e0 100644
+--- a/lib/zstd/compress/zstd_compress_superblock.h
++++ b/lib/zstd/compress/zstd_compress_superblock.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h
+index 349fc923c..65ea53b62 100644
+--- a/lib/zstd/compress/zstd_cwksp.h
++++ b/lib/zstd/compress/zstd_cwksp.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -14,7 +15,9 @@
+ /*-*************************************
+ * Dependencies
+ ***************************************/
++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */
+ #include "../common/zstd_internal.h"
++#include "../common/portability_macros.h"
+
+
+ /*-*************************************
+@@ -41,8 +44,9 @@
+ ***************************************/
+ typedef enum {
+ ZSTD_cwksp_alloc_objects,
+- ZSTD_cwksp_alloc_buffers,
+- ZSTD_cwksp_alloc_aligned
++ ZSTD_cwksp_alloc_aligned_init_once,
++ ZSTD_cwksp_alloc_aligned,
++ ZSTD_cwksp_alloc_buffers
+ } ZSTD_cwksp_alloc_phase_e;
+
+ /*
+@@ -95,8 +99,8 @@ typedef enum {
+ *
+ * Workspace Layout:
+ *
+- * [ ... workspace ... ]
+- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers]
++ * [ ... workspace ... ]
++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once]
+ *
+ * The various objects that live in the workspace are divided into the
+ * following categories, and are allocated separately:
+@@ -120,9 +124,18 @@ typedef enum {
+ * uint32_t arrays, all of whose values are between 0 and (nextSrc - base).
+ * Their sizes depend on the cparams. These tables are 64-byte aligned.
+ *
+- * - Aligned: these buffers are used for various purposes that require 4 byte
+- * alignment, but don't require any initialization before they're used. These
+- * buffers are each aligned to 64 bytes.
++ * - Init once: these buffers need to be initialized at least once before
++ * use. They should be used when we want to skip memory initialization
++ * while not triggering memory checkers (like Valgrind) when reading
++ * from this memory without writing to it first.
++ * These buffers should be used carefully as they might contain data
++ * from previous compressions.
++ * Buffers are aligned to 64 bytes.
++ *
++ * - Aligned: these buffers don't require any initialization before they're
++ * used. The user of the buffer should make sure they write into a buffer
++ * location before reading from it.
++ * Buffers are aligned to 64 bytes.
+ *
+ * - Buffers: these buffers are used for various purposes that don't require
+ * any alignment or initialization before they're used. This means they can
+@@ -134,8 +147,9 @@ typedef enum {
+ * correctly packed into the workspace buffer. That order is:
+ *
+ * 1. Objects
+- * 2. Buffers
+- * 3. Aligned/Tables
++ * 2. Init once / Tables
++ * 3. Aligned / Tables
++ * 4. Buffers / Tables
+ *
+ * Attempts to reserve objects of different types out of order will fail.
+ */
+@@ -147,6 +161,7 @@ typedef struct {
+ void* tableEnd;
+ void* tableValidEnd;
+ void* allocStart;
++ void* initOnceStart;
+
+ BYTE allocFailed;
+ int workspaceOversizedDuration;
+@@ -159,6 +174,7 @@ typedef struct {
+ ***************************************/
+
+ MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws);
++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws);
+
+ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
+ (void)ws;
+@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
+ assert(ws->tableEnd <= ws->allocStart);
+ assert(ws->tableValidEnd <= ws->allocStart);
+ assert(ws->allocStart <= ws->workspaceEnd);
++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws));
++ assert(ws->workspace <= ws->initOnceStart);
+ }
+
+ /*
+@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) {
+ * for internal purposes (currently only alignment).
+ */
+ MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) {
+- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes
+- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes
+- * to align the beginning of the aligned section.
+- *
+- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and
+- * aligneds being sized in multiples of 64 bytes.
++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES
++ * bytes to align the beginning of the tables section and the end of buffers.
+ */
+- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES;
++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2;
+ return slackSpace;
+ }
+
+@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt
+ size_t const alignBytesMask = alignBytes - 1;
+ size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask;
+ assert((alignBytes & alignBytesMask) == 0);
+- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES);
++ assert(bytes < alignBytes);
+ return bytes;
+ }
+
++/*
++ * Returns the initial value for allocStart which is used to determine the position from
++ * which we can allocate from the end of the workspace.
++ */
++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) {
++ return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1));
++}
++
+ /*
+ * Internal function. Do not use directly.
+ * Reserves the given number of bytes within the aligned/buffer segment of the wksp,
+@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
+ {
+ assert(phase >= ws->phase);
+ if (phase > ws->phase) {
+- /* Going from allocating objects to allocating buffers */
+- if (ws->phase < ZSTD_cwksp_alloc_buffers &&
+- phase >= ZSTD_cwksp_alloc_buffers) {
++ /* Going from allocating objects to allocating initOnce / tables */
++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once &&
++ phase >= ZSTD_cwksp_alloc_aligned_init_once) {
+ ws->tableValidEnd = ws->objectEnd;
+- }
++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);
+
+- /* Going from allocating buffers to allocating aligneds/tables */
+- if (ws->phase < ZSTD_cwksp_alloc_aligned &&
+- phase >= ZSTD_cwksp_alloc_aligned) {
+- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */
+- size_t const bytesToAlign =
+- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES);
+- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign);
+- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */
+- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign),
+- memory_allocation, "aligned phase - alignment initial allocation failed!");
+- }
+ { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */
+- void* const alloc = ws->objectEnd;
++ void *const alloc = ws->objectEnd;
+ size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES);
+- void* const objectEnd = (BYTE*)alloc + bytesToAlign;
++ void *const objectEnd = (BYTE *) alloc + bytesToAlign;
+ DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign);
+ RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation,
+ "table phase - alignment initial allocation failed!");
+@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
+ ws->tableEnd = objectEnd; /* table area starts being empty */
+ if (ws->tableValidEnd < ws->tableEnd) {
+ ws->tableValidEnd = ws->tableEnd;
+- } } }
++ }
++ }
++ }
+ ws->phase = phase;
+ ZSTD_cwksp_assert_internal_consistency(ws);
+ }
+@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
+ */
+ MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr)
+ {
+- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd);
++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd);
+ }
+
+ /*
+@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes)
+ return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers);
+ }
+
++/*
++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
++ * This memory has been initialized at least once in the past.
++ * This doesn't mean it has been initialized this time, and it might contain data from previous
++ * operations.
++ * The main usage is for algorithms that might need read access into uninitialized memory.
++ * The algorithm must maintain safety under these conditions and must make sure it doesn't
++ * leak any of the past data (directly or in side channels).
++ */
++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes)
++{
++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES);
++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once);
++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
++ if(ptr && ptr < ws->initOnceStart) {
++ /* We assume the memory following the current allocation is either:
++ * 1. Not usable as initOnce memory (end of workspace)
++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset)
++ * 3. An ASAN redzone, in which case we don't want to write on it
++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart.
++ * Note that we assume here that MSAN and ASAN cannot run at the same time. */
++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes));
++ ws->initOnceStart = ptr;
++ }
++ return ptr;
++}
++
+ /*
+ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
+ */
+@@ -361,13 +401,17 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes)
+ */
+ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes)
+ {
+- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned;
++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once;
+ void* alloc;
+ void* end;
+ void* top;
+
+- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
+- return NULL;
++ /* We can only start allocating tables after we are done reserving space for objects at the
++ * start of the workspace */
++ if(ws->phase < phase) {
++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
++ return NULL;
++ }
+ }
+ alloc = ws->tableEnd;
+ end = (BYTE *)alloc + bytes;
+@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) {
+ assert(ws->tableValidEnd >= ws->objectEnd);
+ assert(ws->tableValidEnd <= ws->allocStart);
+ if (ws->tableValidEnd < ws->tableEnd) {
+- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd);
++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd));
+ }
+ ZSTD_cwksp_mark_tables_clean(ws);
+ }
+@@ -478,10 +522,10 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {
+
+
+ ws->tableEnd = ws->objectEnd;
+- ws->allocStart = ws->workspaceEnd;
++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws);
+ ws->allocFailed = 0;
+- if (ws->phase > ZSTD_cwksp_alloc_buffers) {
+- ws->phase = ZSTD_cwksp_alloc_buffers;
++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) {
++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once;
+ }
+ ZSTD_cwksp_assert_internal_consistency(ws);
+ }
+@@ -498,6 +542,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c
+ ws->workspaceEnd = (BYTE*)start + size;
+ ws->objectEnd = ws->workspace;
+ ws->tableValidEnd = ws->objectEnd;
++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);
+ ws->phase = ZSTD_cwksp_alloc_objects;
+ ws->isStatic = isStatic;
+ ZSTD_cwksp_clear(ws);
+@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
+ * Returns if the estimated space needed for a wksp is within an acceptable limit of the
+ * actual amount of space used.
+ */
+-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws,
+- size_t const estimatedSpace, int resizedWorkspace) {
+- if (resizedWorkspace) {
+- /* Resized/newly allocated wksp should have exact bounds */
+- return ZSTD_cwksp_used(ws) == estimatedSpace;
+- } else {
+- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes
+- * than estimatedSpace. See the comments in zstd_cwksp.h for details.
+- */
+- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63);
+- }
++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) {
++ /* We have an alignment space between objects and tables, and between tables and buffers, so we can have up to twice
++ * the alignment bytes difference between estimation and actual usage */
++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) &&
++ ZSTD_cwksp_used(ws) <= estimatedSpace;
+ }
+
+
+diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c
+index 76933dea2..ab9440a99 100644
+--- a/lib/zstd/compress/zstd_double_fast.c
++++ b/lib/zstd/compress/zstd_double_fast.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -11,8 +12,43 @@
+ #include "zstd_compress_internal.h"
+ #include "zstd_double_fast.h"
+
++static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,
++ void const* end, ZSTD_dictTableLoadMethod_e dtlm)
++{
++ const ZSTD_compressionParameters* const cParams = &ms->cParams;
++ U32* const hashLarge = ms->hashTable;
++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
++ U32 const mls = cParams->minMatch;
++ U32* const hashSmall = ms->chainTable;
++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
++ const BYTE* const base = ms->window.base;
++ const BYTE* ip = base + ms->nextToUpdate;
++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
++ const U32 fastHashFillStep = 3;
+
+-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
++ /* Always insert every fastHashFillStep position into the hash tables.
++ * Insert the other positions into the large hash table if their entry
++ * is empty.
++ */
++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
++ U32 const curr = (U32)(ip - base);
++ U32 i;
++ for (i = 0; i < fastHashFillStep; ++i) {
++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls);
++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8);
++ if (i == 0) {
++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i);
++ }
++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {
++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i);
++ }
++ /* Only load extra positions for ZSTD_dtlm_full */
++ if (dtlm == ZSTD_dtlm_fast)
++ break;
++ } }
++}
++
++static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms,
+ void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+ {
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+@@ -43,7 +79,19 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+ /* Only load extra positions for ZSTD_dtlm_full */
+ if (dtlm == ZSTD_dtlm_fast)
+ break;
+- } }
++ } }
++}
++
++void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
++ const void* const end,
++ ZSTD_dictTableLoadMethod_e dtlm,
++ ZSTD_tableFillPurpose_e tfp)
++{
++ if (tfp == ZSTD_tfp_forCDict) {
++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm);
++ } else {
++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm);
++ }
+ }
+
+
+@@ -67,7 +115,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - HASH_READ_SIZE;
+ U32 offset_1=rep[0], offset_2=rep[1];
+- U32 offsetSaved = 0;
++ U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+ size_t mLength;
+ U32 offset;
+@@ -100,8 +148,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+ U32 const current = (U32)(ip - base);
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
+ U32 const maxRep = current - windowLow;
+- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
+- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
+ }
+
+ /* Outer Loop: one iteration per match found and stored */
+@@ -131,7 +179,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+ if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
+ mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+ ip++;
+- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+ goto _match_stored;
+ }
+
+@@ -175,9 +223,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+ } while (ip1 <= ilimit);
+
+ _cleanup:
++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
++
+ /* save reps for next block */
+- rep[0] = offset_1 ? offset_1 : offsetSaved;
+- rep[1] = offset_2 ? offset_2 : offsetSaved;
++ rep[0] = offset_1 ? offset_1 : offsetSaved1;
++ rep[1] = offset_2 ? offset_2 : offsetSaved2;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+@@ -217,7 +269,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+ hashLong[hl1] = (U32)(ip1 - base);
+ }
+
+- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+
+ _match_stored:
+ /* match found */
+@@ -243,7 +295,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+ U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
+ hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
+- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength);
++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
+ ip += rLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+@@ -275,7 +327,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - HASH_READ_SIZE;
+ U32 offset_1=rep[0], offset_2=rep[1];
+- U32 offsetSaved = 0;
+
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
+@@ -286,8 +337,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+ const BYTE* const dictStart = dictBase + dictStartIndex;
+ const BYTE* const dictEnd = dms->window.nextSrc;
+ const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase);
+- const U32 dictHBitsL = dictCParams->hashLog;
+- const U32 dictHBitsS = dictCParams->chainLog;
++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
+ const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart));
+
+ DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic");
+@@ -295,6 +346,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+ /* if a dictionary is attached, it must be within window range */
+ assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);
+
++ if (ms->prefetchCDictTables) {
++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32);
++ PREFETCH_AREA(dictHashLong, hashTableBytes)
++ PREFETCH_AREA(dictHashSmall, chainTableBytes)
++ }
++
+ /* init */
+ ip += (dictAndPrefixLength == 0);
+
+@@ -309,8 +367,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+ U32 offset;
+ size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
+ size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
+- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8);
+- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls);
++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8);
++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls);
++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS];
++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS];
++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL);
++ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS);
+ U32 const curr = (U32)(ip-base);
+ U32 const matchIndexL = hashLong[h2];
+ U32 matchIndexS = hashSmall[h];
+@@ -328,7 +390,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+ mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+ ip++;
+- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+ goto _match_stored;
+ }
+
+@@ -340,9 +402,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+ while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
+ goto _match_found;
+ }
+- } else {
++ } else if (dictTagsMatchL) {
+ /* check dictMatchState long match */
+- U32 const dictMatchIndexL = dictHashLong[dictHL];
++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS;
+ const BYTE* dictMatchL = dictBase + dictMatchIndexL;
+ assert(dictMatchL < dictEnd);
+
+@@ -358,9 +420,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+ if (MEM_read32(match) == MEM_read32(ip)) {
+ goto _search_next_long;
+ }
+- } else {
++ } else if (dictTagsMatchS) {
+ /* check dictMatchState short match */
+- U32 const dictMatchIndexS = dictHashSmall[dictHS];
++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS;
+ match = dictBase + dictMatchIndexS;
+ matchIndexS = dictMatchIndexS + dictIndexDelta;
+
+@@ -375,10 +437,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+ continue;
+
+ _search_next_long:
+-
+ { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
+ U32 const matchIndexL3 = hashLong[hl3];
++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS];
++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3);
+ const BYTE* matchL3 = base + matchIndexL3;
+ hashLong[hl3] = curr + 1;
+
+@@ -391,9 +454,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+ while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
+ goto _match_found;
+ }
+- } else {
++ } else if (dictTagsMatchL3) {
+ /* check dict long +1 match */
+- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext];
++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS;
+ const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
+ assert(dictMatchL3 < dictEnd);
+ if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
+@@ -419,7 +482,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+ offset_2 = offset_1;
+ offset_1 = offset;
+
+- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+
+ _match_stored:
+ /* match found */
+@@ -448,7 +511,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+ const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
+ size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
+ U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+ hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+ ip += repLength2;
+@@ -461,8 +524,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+ } /* while (ip < ilimit) */
+
+ /* save reps for next block */
+- rep[0] = offset_1 ? offset_1 : offsetSaved;
+- rep[1] = offset_2 ? offset_2 : offsetSaved;
++ rep[0] = offset_1;
++ rep[1] = offset_2;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+@@ -585,7 +648,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+ const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+ mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
+ ip++;
+- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+ } else {
+ if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
+ const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
+@@ -596,7 +659,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+ while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
+ offset_2 = offset_1;
+ offset_1 = offset;
+- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+
+ } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {
+ size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+@@ -621,7 +684,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+ }
+ offset_2 = offset_1;
+ offset_1 = offset;
+- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+
+ } else {
+ ip += ((ip-anchor) >> kSearchStrength) + 1;
+@@ -653,7 +716,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+ size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+ U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+ hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+ ip += repLength2;
+diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h
+index 6822bde65..0204f12e4 100644
+--- a/lib/zstd/compress/zstd_double_fast.h
++++ b/lib/zstd/compress/zstd_double_fast.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -16,7 +17,8 @@
+ #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */
+
+ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+- void const* end, ZSTD_dictTableLoadMethod_e dtlm);
++ void const* end, ZSTD_dictTableLoadMethod_e dtlm,
++ ZSTD_tableFillPurpose_e tfp);
+ size_t ZSTD_compressBlock_doubleFast(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c
+index a752e6bea..3399b39c5 100644
+--- a/lib/zstd/compress/zstd_fast.c
++++ b/lib/zstd/compress/zstd_fast.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -11,8 +12,42 @@
+ #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
+ #include "zstd_fast.h"
+
++static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
++ const void* const end,
++ ZSTD_dictTableLoadMethod_e dtlm)
++{
++ const ZSTD_compressionParameters* const cParams = &ms->cParams;
++ U32* const hashTable = ms->hashTable;
++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
++ U32 const mls = cParams->minMatch;
++ const BYTE* const base = ms->window.base;
++ const BYTE* ip = base + ms->nextToUpdate;
++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
++ const U32 fastHashFillStep = 3;
+
+-void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
++ * Feel free to remove this assert if there's a good reason! */
++ assert(dtlm == ZSTD_dtlm_full);
++
++ /* Always insert every fastHashFillStep position into the hash table.
++ * Insert the other positions if their hash entry is empty.
++ */
++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
++ U32 const curr = (U32)(ip - base);
++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); }
++
++ if (dtlm == ZSTD_dtlm_fast) continue;
++ /* Only load extra positions for ZSTD_dtlm_full */
++ { U32 p;
++ for (p = 1; p < fastHashFillStep; ++p) {
++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */
++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
++ } } } }
++}
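++/* Editorial sketch (not part of upstream zstd): with this "short cache" scheme,
++ * each CDict hash slot is assumed to pack an index plus a small hash tag,
++ * roughly (matchIndex << ZSTD_SHORT_CACHE_TAG_BITS) | (hashAndTag & tagMask),
++ * so that ZSTD_comparePackedTags() can reject most candidates by comparing only
++ * the low tag bits before any cold dictionary bytes are read. The helpers
++ * ZSTD_writeTaggedIndex()/ZSTD_comparePackedTags() are defined elsewhere in this
++ * patch; the exact layout above is assumed from the v1.5.5 sources. */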
++
++static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
+ const void* const end,
+ ZSTD_dictTableLoadMethod_e dtlm)
+ {
+@@ -25,6 +60,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+ const U32 fastHashFillStep = 3;
+
++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
++ * Feel free to remove this assert if there's a good reason! */
++ assert(dtlm == ZSTD_dtlm_fast);
++
+ /* Always insert every fastHashFillStep position into the hash table.
+ * Insert the other positions if their hash entry is empty.
+ */
+@@ -42,6 +81,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+ } } } }
+ }
+
++void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
++ const void* const end,
++ ZSTD_dictTableLoadMethod_e dtlm,
++ ZSTD_tableFillPurpose_e tfp)
++{
++ if (tfp == ZSTD_tfp_forCDict) {
++ ZSTD_fillHashTableForCDict(ms, end, dtlm);
++ } else {
++ ZSTD_fillHashTableForCCtx(ms, end, dtlm);
++ }
++}
++
+
+ /*
+ * If you squint hard enough (and ignore repcodes), the search operation at any
+@@ -117,7 +168,7 @@ ZSTD_compressBlock_fast_noDict_generic(
+
+ U32 rep_offset1 = rep[0];
+ U32 rep_offset2 = rep[1];
+- U32 offsetSaved = 0;
++ U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+ size_t hash0; /* hash for ip0 */
+ size_t hash1; /* hash for ip1 */
+@@ -141,8 +192,8 @@ ZSTD_compressBlock_fast_noDict_generic(
+ { U32 const curr = (U32)(ip0 - base);
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
+ U32 const maxRep = curr - windowLow;
+- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0;
+- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0;
++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0;
++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0;
+ }
+
+ /* start each op */
+@@ -180,8 +231,14 @@ ZSTD_compressBlock_fast_noDict_generic(
+ mLength = ip0[-1] == match0[-1];
+ ip0 -= mLength;
+ match0 -= mLength;
+- offcode = STORE_REPCODE_1;
++ offcode = REPCODE1_TO_OFFBASE;
+ mLength += 4;
++
++ /* First write next hash table entry; we've already calculated it.
++ * This write is known to be safe because the ip1 is before the
++ * repcode (ip2). */
++ hashTable[hash1] = (U32)(ip1 - base);
++
+ goto _match;
+ }
+
+@@ -195,6 +252,12 @@ ZSTD_compressBlock_fast_noDict_generic(
+ /* check match at ip[0] */
+ if (MEM_read32(ip0) == mval) {
+ /* found a match! */
++
++ /* First write next hash table entry; we've already calculated it.
++ * This write is known to be safe because the ip1 == ip0 + 1, so
++ * we know we will resume searching after ip1 */
++ hashTable[hash1] = (U32)(ip1 - base);
++
+ goto _offset;
+ }
+
+@@ -224,6 +287,21 @@ ZSTD_compressBlock_fast_noDict_generic(
+ /* check match at ip[0] */
+ if (MEM_read32(ip0) == mval) {
+ /* found a match! */
++
++ /* first write next hash table entry; we've already calculated it */
++ if (step <= 4) {
++ /* We need to avoid writing an index into the hash table >= the
++ * position at which we will pick up our searching after we've
++ * taken this match.
++ *
++ * The minimum possible match has length 4, so the earliest ip0
++ * can be after we take this match will be the current ip0 + 4.
++ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely
++ * write this position.
++ */
++ hashTable[hash1] = (U32)(ip1 - base);
++ }
++
+ goto _offset;
+ }
+
+@@ -254,9 +332,24 @@ ZSTD_compressBlock_fast_noDict_generic(
+ * However, it seems to be a meaningful performance hit to try to search
+ * them. So let's not. */
+
++ /* When the repcodes are outside of the prefix, we set them to zero before the loop.
++ * When the offsets are still zero, we need to restore them after the block to have a correct
++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
++ * offsets were invalid. We need to figure out which offset to refill with.
++ * - If both offsets are zero they are in the same order.
++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
++ * - If only one is zero, we need to decide which offset to restore.
++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
++ * - It is impossible for rep_offset2 to be non-zero.
++ *
++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
++ */
++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;
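++    /* Worked example (editorial, not from upstream): say rep[] entered the block
++     * as {7, 11} and both values exceeded maxRep, so offsetSaved1=7, offsetSaved2=11
++     * and both rep_offsets were zeroed above.
++     *  - no match found:          rep[] is restored to {7, 11} below;
++     *  - one match with offset 5: rep_offset1=5, rep_offset2=0, so rep[] becomes
++     *    {5, 7}, i.e. the old rep1 rotates into the second slot via offsetSaved2. */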
++
+ /* save reps for next block */
+- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved;
+- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved;
++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+@@ -267,7 +360,7 @@ ZSTD_compressBlock_fast_noDict_generic(
+ match0 = base + idx;
+ rep_offset2 = rep_offset1;
+ rep_offset1 = (U32)(ip0-match0);
+- offcode = STORE_OFFSET(rep_offset1);
++ offcode = OFFSET_TO_OFFBASE(rep_offset1);
+ mLength = 4;
+
+ /* Count the backwards match length. */
+@@ -287,11 +380,6 @@ ZSTD_compressBlock_fast_noDict_generic(
+ ip0 += mLength;
+ anchor = ip0;
+
+- /* write next hash table entry */
+- if (ip1 < ip0) {
+- hashTable[hash1] = (U32)(ip1 - base);
+- }
+-
+ /* Fill table and check for immediate repcode. */
+ if (ip0 <= ilimit) {
+ /* Fill Table */
+@@ -306,7 +394,7 @@ ZSTD_compressBlock_fast_noDict_generic(
+ { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
+ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
+ ip0 += rLength;
+- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength);
++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
+ anchor = ip0;
+ continue; /* faster when present (confirmed on gcc-8) ... (?) */
+ } } }
+@@ -380,14 +468,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
+ U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
+ const BYTE* const base = ms->window.base;
+ const BYTE* const istart = (const BYTE*)src;
+- const BYTE* ip = istart;
++ const BYTE* ip0 = istart;
++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
+ const BYTE* anchor = istart;
+ const U32 prefixStartIndex = ms->window.dictLimit;
+ const BYTE* const prefixStart = base + prefixStartIndex;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - HASH_READ_SIZE;
+ U32 offset_1=rep[0], offset_2=rep[1];
+- U32 offsetSaved = 0;
+
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
+@@ -397,13 +485,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
+ const BYTE* const dictStart = dictBase + dictStartIndex;
+ const BYTE* const dictEnd = dms->window.nextSrc;
+ const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase);
+- const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart);
+- const U32 dictHLog = dictCParams->hashLog;
++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart);
++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+
+ /* if a dictionary is still attached, it necessarily means that
+ * it is within window size. So we just check it. */
+ const U32 maxDistance = 1U << cParams->windowLog;
+- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize);
++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+ assert(endIndex - prefixStartIndex <= maxDistance);
+ (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */
+
+@@ -413,106 +501,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
+ * when translating a dict index into a local index */
+ assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
+
++ if (ms->prefetchCDictTables) {
++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
++ PREFETCH_AREA(dictHashTable, hashTableBytes)
++ }
++
+ /* init */
+ DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
+- ip += (dictAndPrefixLength == 0);
++ ip0 += (dictAndPrefixLength == 0);
+ /* dictMatchState repCode checks don't currently handle repCode == 0
+ * disabling. */
+ assert(offset_1 <= dictAndPrefixLength);
+ assert(offset_2 <= dictAndPrefixLength);
+
+- /* Main Search Loop */
+- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
++ /* Outer search loop */
++ assert(stepSize >= 1);
++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
+ size_t mLength;
+- size_t const h = ZSTD_hashPtr(ip, hlog, mls);
+- U32 const curr = (U32)(ip-base);
+- U32 const matchIndex = hashTable[h];
+- const BYTE* match = base + matchIndex;
+- const U32 repIndex = curr + 1 - offset_1;
+- const BYTE* repMatch = (repIndex < prefixStartIndex) ?
+- dictBase + (repIndex - dictIndexDelta) :
+- base + repIndex;
+- hashTable[h] = curr; /* update hash table */
+-
+- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
+- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
+- ip++;
+- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+- } else if ( (matchIndex <= prefixStartIndex) ) {
+- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls);
+- U32 const dictMatchIndex = dictHashTable[dictHash];
+- const BYTE* dictMatch = dictBase + dictMatchIndex;
+- if (dictMatchIndex <= dictStartIndex ||
+- MEM_read32(dictMatch) != MEM_read32(ip)) {
+- assert(stepSize >= 1);
+- ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+- continue;
+- } else {
+- /* found a dict match */
+- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta);
+- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4;
+- while (((ip>anchor) & (dictMatch>dictStart))
+- && (ip[-1] == dictMatch[-1])) {
+- ip--; dictMatch--; mLength++;
++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
++
++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);
++
++ U32 matchIndex = hashTable[hash0];
++ U32 curr = (U32)(ip0 - base);
++ size_t step = stepSize;
++ const size_t kStepIncr = 1 << kSearchStrength;
++ const BYTE* nextStep = ip0 + kStepIncr;
++
++ /* Inner search loop */
++ while (1) {
++ const BYTE* match = base + matchIndex;
++ const U32 repIndex = curr + 1 - offset_1;
++ const BYTE* repMatch = (repIndex < prefixStartIndex) ?
++ dictBase + (repIndex - dictIndexDelta) :
++ base + repIndex;
++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
++ hashTable[hash0] = curr; /* update hash table */
++
++ if (((U32) ((prefixStartIndex - 1) - repIndex) >=
++ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
++ ip0++;
++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
++ break;
++ }
++
++ if (dictTagsMatch) {
++ /* Found a possible dict match */
++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
++ const BYTE* dictMatch = dictBase + dictMatchIndex;
++ if (dictMatchIndex > dictStartIndex &&
++ MEM_read32(dictMatch) == MEM_read32(ip0)) {
++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
++ if (matchIndex <= prefixStartIndex) {
++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
++ while (((ip0 > anchor) & (dictMatch > dictStart))
++ && (ip0[-1] == dictMatch[-1])) {
++ ip0--;
++ dictMatch--;
++ mLength++;
++ } /* catch up */
++ offset_2 = offset_1;
++ offset_1 = offset;
++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
++ break;
++ }
++ }
++ }
++
++ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) {
++ /* found a regular match */
++ U32 const offset = (U32) (ip0 - match);
++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
++ while (((ip0 > anchor) & (match > prefixStart))
++ && (ip0[-1] == match[-1])) {
++ ip0--;
++ match--;
++ mLength++;
+ } /* catch up */
+ offset_2 = offset_1;
+ offset_1 = offset;
+- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
++ break;
+ }
+- } else if (MEM_read32(match) != MEM_read32(ip)) {
+- /* it's not a match, and we're not going to check the dictionary */
+- assert(stepSize >= 1);
+- ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+- continue;
+- } else {
+- /* found a regular match */
+- U32 const offset = (U32)(ip-match);
+- mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+- while (((ip>anchor) & (match>prefixStart))
+- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+- offset_2 = offset_1;
+- offset_1 = offset;
+- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+- }
++
++ /* Prepare for next iteration */
++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
++ matchIndex = hashTable[hash1];
++
++ if (ip1 >= nextStep) {
++ step++;
++ nextStep += kStepIncr;
++ }
++ ip0 = ip1;
++ ip1 = ip1 + step;
++ if (ip1 > ilimit) goto _cleanup;
++
++ curr = (U32)(ip0 - base);
++ hash0 = hash1;
++ } /* end inner search loop */
+
+ /* match found */
+- ip += mLength;
+- anchor = ip;
++ assert(mLength);
++ ip0 += mLength;
++ anchor = ip0;
+
+- if (ip <= ilimit) {
++ if (ip0 <= ilimit) {
+ /* Fill Table */
+ assert(base+curr+2 > istart); /* check base overflow */
+ hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */
+- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+ /* check immediate repcode */
+- while (ip <= ilimit) {
+- U32 const current2 = (U32)(ip-base);
++ while (ip0 <= ilimit) {
++ U32 const current2 = (U32)(ip0-base);
+ U32 const repIndex2 = current2 - offset_2;
+ const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
+ dictBase - dictIndexDelta + repIndex2 :
+ base + repIndex2;
+ if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
+- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
+ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+ U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
+- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
+- ip += repLength2;
+- anchor = ip;
++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
++ ip0 += repLength2;
++ anchor = ip0;
+ continue;
+ }
+ break;
+ }
+ }
++
++ /* Prepare for next iteration */
++ assert(ip0 == anchor);
++ ip1 = ip0 + stepSize;
+ }
+
++_cleanup:
+ /* save reps for next block */
+- rep[0] = offset_1 ? offset_1 : offsetSaved;
+- rep[1] = offset_2 ? offset_2 : offsetSaved;
++ rep[0] = offset_1;
++ rep[1] = offset_2;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+@@ -553,11 +690,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
+ U32* const hashTable = ms->hashTable;
+ U32 const hlog = cParams->hashLog;
+ /* support stepSize of 0 */
+- U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const BYTE* const istart = (const BYTE*)src;
+- const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+ const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
+@@ -570,6 +706,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - 8;
+ U32 offset_1=rep[0], offset_2=rep[1];
++ U32 offsetSaved1 = 0, offsetSaved2 = 0;
++
++ const BYTE* ip0 = istart;
++ const BYTE* ip1;
++ const BYTE* ip2;
++ const BYTE* ip3;
++ U32 current0;
++
++
++ size_t hash0; /* hash for ip0 */
++ size_t hash1; /* hash for ip1 */
++ U32 idx; /* match idx for ip0 */
++ const BYTE* idxBase; /* base pointer for idx */
++
++ U32 offcode;
++ const BYTE* match0;
++ size_t mLength;
++ const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */
++
++ size_t step;
++ const BYTE* nextStep;
++ const size_t kStepIncr = (1 << (kSearchStrength - 1));
+
+ (void)hasStep; /* not currently specialized on whether it's accelerated */
+
+@@ -579,75 +737,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
+ if (prefixStartIndex == dictStartIndex)
+ return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
+
+- /* Search Loop */
+- while (ip < ilimit) { /* < instead of <=, because (ip+1) */
+- const size_t h = ZSTD_hashPtr(ip, hlog, mls);
+- const U32 matchIndex = hashTable[h];
+- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
+- const BYTE* match = matchBase + matchIndex;
+- const U32 curr = (U32)(ip-base);
+- const U32 repIndex = curr + 1 - offset_1;
+- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+- const BYTE* const repMatch = repBase + repIndex;
+- hashTable[h] = curr; /* update hash table */
+- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr);
+-
+- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */
+- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */
+- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4;
+- ip++;
+- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength);
+- ip += rLength;
+- anchor = ip;
+- } else {
+- if ( (matchIndex < dictStartIndex) ||
+- (MEM_read32(match) != MEM_read32(ip)) ) {
+- assert(stepSize >= 1);
+- ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+- continue;
++ { U32 const curr = (U32)(ip0 - base);
++ U32 const maxRep = curr - dictStartIndex;
++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0;
++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0;
++ }
++
++ /* start each op */
++_start: /* Requires: ip0 */
++
++ step = stepSize;
++ nextStep = ip0 + kStepIncr;
++
++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */
++ ip1 = ip0 + 1;
++ ip2 = ip0 + step;
++ ip3 = ip2 + 1;
++
++ if (ip3 >= ilimit) {
++ goto _cleanup;
++ }
++
++ hash0 = ZSTD_hashPtr(ip0, hlog, mls);
++ hash1 = ZSTD_hashPtr(ip1, hlog, mls);
++
++ idx = hashTable[hash0];
++ idxBase = idx < prefixStartIndex ? dictBase : base;
++
++ do {
++ { /* load repcode match for ip[2] */
++ U32 const current2 = (U32)(ip2 - base);
++ U32 const repIndex = current2 - offset_1;
++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
++ U32 rval;
++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
++ & (offset_1 > 0) ) {
++ rval = MEM_read32(repBase + repIndex);
++ } else {
++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
+ }
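++            /* Editorial note: `x ^ 1` never equals `x`, so when offset_1 is unusable
++             * the else-branch manufactures an rval that is guaranteed to fail the
++             * `MEM_read32(ip2) == rval` repcode check below, avoiding an extra
++             * branch on repcode validity. */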
+- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
+- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
+- U32 const offset = curr - matchIndex;
+- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
+- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+- offset_2 = offset_1; offset_1 = offset; /* update offset history */
+- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+- ip += mLength;
+- anchor = ip;
++
++ /* write back hash table entry */
++ current0 = (U32)(ip0 - base);
++ hashTable[hash0] = current0;
++
++ /* check repcode at ip[2] */
++ if (MEM_read32(ip2) == rval) {
++ ip0 = ip2;
++ match0 = repBase + repIndex;
++ matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
++ assert((match0 != prefixStart) & (match0 != dictStart));
++ mLength = ip0[-1] == match0[-1];
++ ip0 -= mLength;
++ match0 -= mLength;
++ offcode = REPCODE1_TO_OFFBASE;
++ mLength += 4;
++ goto _match;
+ } }
+
+- if (ip <= ilimit) {
+- /* Fill Table */
+- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;
+- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+- /* check immediate repcode */
+- while (ip <= ilimit) {
+- U32 const current2 = (U32)(ip-base);
+- U32 const repIndex2 = current2 - offset_2;
+- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */
+- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */
+- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2);
+- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
+- ip += repLength2;
+- anchor = ip;
+- continue;
+- }
+- break;
+- } } }
++ { /* load match for ip[0] */
++ U32 const mval = idx >= dictStartIndex ?
++ MEM_read32(idxBase + idx) :
++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */
++
++ /* check match at ip[0] */
++ if (MEM_read32(ip0) == mval) {
++ /* found a match! */
++ goto _offset;
++ } }
++
++ /* lookup ip[1] */
++ idx = hashTable[hash1];
++ idxBase = idx < prefixStartIndex ? dictBase : base;
++
++ /* hash ip[2] */
++ hash0 = hash1;
++ hash1 = ZSTD_hashPtr(ip2, hlog, mls);
++
++ /* advance to next positions */
++ ip0 = ip1;
++ ip1 = ip2;
++ ip2 = ip3;
++
++ /* write back hash table entry */
++ current0 = (U32)(ip0 - base);
++ hashTable[hash0] = current0;
++
++ { /* load match for ip[0] */
++ U32 const mval = idx >= dictStartIndex ?
++ MEM_read32(idxBase + idx) :
++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */
++
++ /* check match at ip[0] */
++ if (MEM_read32(ip0) == mval) {
++ /* found a match! */
++ goto _offset;
++ } }
++
++ /* lookup ip[1] */
++ idx = hashTable[hash1];
++ idxBase = idx < prefixStartIndex ? dictBase : base;
++
++ /* hash ip[2] */
++ hash0 = hash1;
++ hash1 = ZSTD_hashPtr(ip2, hlog, mls);
++
++ /* advance to next positions */
++ ip0 = ip1;
++ ip1 = ip2;
++ ip2 = ip0 + step;
++ ip3 = ip1 + step;
++
++ /* calculate step */
++ if (ip2 >= nextStep) {
++ step++;
++ PREFETCH_L1(ip1 + 64);
++ PREFETCH_L1(ip1 + 128);
++ nextStep += kStepIncr;
++ }
++ } while (ip3 < ilimit);
++
++_cleanup:
++ /* Note that there are probably still a couple positions we could search.
++ * However, it seems to be a meaningful performance hit to try to search
++ * them. So let's not. */
++
++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+ /* save reps for next block */
+- rep[0] = offset_1;
+- rep[1] = offset_2;
++ rep[0] = offset_1 ? offset_1 : offsetSaved1;
++ rep[1] = offset_2 ? offset_2 : offsetSaved2;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
++
++_offset: /* Requires: ip0, idx, idxBase */
++
++ /* Compute the offset code. */
++ { U32 const offset = current0 - idx;
++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
++ matchEnd = idx < prefixStartIndex ? dictEnd : iend;
++ match0 = idxBase + idx;
++ offset_2 = offset_1;
++ offset_1 = offset;
++ offcode = OFFSET_TO_OFFBASE(offset);
++ mLength = 4;
++
++ /* Count the backwards match length. */
++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) {
++ ip0--;
++ match0--;
++ mLength++;
++ } }
++
++_match: /* Requires: ip0, match0, offcode, matchEnd */
++
++ /* Count the forward length. */
++ assert(matchEnd != 0);
++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);
++
++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
++
++ ip0 += mLength;
++ anchor = ip0;
++
++ /* write next hash table entry */
++ if (ip1 < ip0) {
++ hashTable[hash1] = (U32)(ip1 - base);
++ }
++
++ /* Fill table and check for immediate repcode. */
++ if (ip0 <= ilimit) {
++ /* Fill Table */
++ assert(base+current0+2 > istart); /* check base overflow */
++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */
++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
++
++ while (ip0 <= ilimit) {
++ U32 const repIndex2 = (U32)(ip0-base) - offset_2;
++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
++ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */
++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */
++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
++ ip0 += repLength2;
++ anchor = ip0;
++ continue;
++ }
++ break;
++ } }
++
++ goto _start;
+ }
+
+ ZSTD_GEN_FAST_FN(extDict, 4, 0)
+@@ -660,6 +945,7 @@ size_t ZSTD_compressBlock_fast_extDict(
+ void const* src, size_t srcSize)
+ {
+ U32 const mls = ms->cParams.minMatch;
++ assert(ms->dictMatchState == NULL);
+ switch(mls)
+ {
+ default: /* includes case 3 */
+diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h
+index fddc2f532..e64d9e1b2 100644
+--- a/lib/zstd/compress/zstd_fast.h
++++ b/lib/zstd/compress/zstd_fast.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -16,7 +17,8 @@
+ #include "zstd_compress_internal.h"
+
+ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+- void const* end, ZSTD_dictTableLoadMethod_e dtlm);
++ void const* end, ZSTD_dictTableLoadMethod_e dtlm,
++ ZSTD_tableFillPurpose_e tfp);
+ size_t ZSTD_compressBlock_fast(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c
+index 0298a01a7..f6b4978ce 100644
+--- a/lib/zstd/compress/zstd_lazy.c
++++ b/lib/zstd/compress/zstd_lazy.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -10,6 +11,9 @@
+
+ #include "zstd_compress_internal.h"
+ #include "zstd_lazy.h"
++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
++
++#define kLazySkippingStep 8
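++/* Editorial note (assumption from the v1.5.5 sources): the lazy parsers set
++ * ms->lazySkipping once their skip step grows past kLazySkippingStep, which in
++ * turn makes ZSTD_insertAndFindFirstIndex_internal() stop inserting every
++ * position (see the `if (lazySkipping) break;` further below). */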
+
+
+ /*-*************************************
+@@ -197,8 +201,8 @@ ZSTD_DUBT_findBetterDictMatch (
+ U32 matchIndex = dictMatchIndex + dictIndexDelta;
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
+ DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
+- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex);
+- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
+ }
+ if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
+ break; /* drop, to guarantee consistency (miss a little bit of compression) */
+@@ -218,7 +222,7 @@ ZSTD_DUBT_findBetterDictMatch (
+ }
+
+ if (bestLength >= MINMATCH) {
+- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
+ DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+ curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
+ }
+@@ -230,7 +234,7 @@ ZSTD_DUBT_findBetterDictMatch (
+ static size_t
+ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iend,
+- size_t* offsetPtr,
++ size_t* offBasePtr,
+ U32 const mls,
+ const ZSTD_dictMode_e dictMode)
+ {
+@@ -327,8 +331,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+ if (matchLength > bestLength) {
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
+- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
+ if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
+ if (dictMode == ZSTD_dictMatchState) {
+ nbCompares = 0; /* in addition to avoiding checking any
+@@ -361,16 +365,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+ if (dictMode == ZSTD_dictMatchState && nbCompares) {
+ bestLength = ZSTD_DUBT_findBetterDictMatch(
+ ms, ip, iend,
+- offsetPtr, bestLength, nbCompares,
++ offBasePtr, bestLength, nbCompares,
+ mls, dictMode);
+ }
+
+ assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
+ ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
+ if (bestLength >= MINMATCH) {
+- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
+ DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+- curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
+ }
+ return bestLength;
+ }
+@@ -381,14 +385,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+ FORCE_INLINE_TEMPLATE size_t
+ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iLimit,
+- size_t* offsetPtr,
++ size_t* offBasePtr,
+ const U32 mls /* template */,
+ const ZSTD_dictMode_e dictMode)
+ {
+ DEBUGLOG(7, "ZSTD_BtFindBestMatch");
+ if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
+ ZSTD_updateDUBT(ms, ip, iLimit, mls);
+- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
+ }
+
+ /* *********************************
+@@ -561,7 +565,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
+ if (ip+currentMl == iLimit) {
+ /* best possible, avoids read overflow on next attempt */
+ return ml;
+@@ -598,7 +602,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+ }
+@@ -617,7 +621,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
+ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
+ ZSTD_matchState_t* ms,
+ const ZSTD_compressionParameters* const cParams,
+- const BYTE* ip, U32 const mls)
++ const BYTE* ip, U32 const mls, U32 const lazySkipping)
+ {
+ U32* const hashTable = ms->hashTable;
+ const U32 hashLog = cParams->hashLog;
+@@ -632,6 +636,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
+ NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
+ hashTable[h] = idx;
+ idx++;
++ /* Stop inserting every position when in the lazy skipping mode. */
++ if (lazySkipping)
++ break;
+ }
+
+ ms->nextToUpdate = target;
+@@ -640,7 +647,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
+
+ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
+ }
+
+ /* inlining is important to hardwire a hot branch (template emulation) */
+@@ -684,14 +691,15 @@ size_t ZSTD_HcFindBestMatch(
+ }
+
+ /* HC4 match finder */
+- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);
+
+ for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
+ size_t currentMl=0;
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+ const BYTE* const match = base + matchIndex;
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
+- if (match[ml] == ip[ml]) /* potentially better */
++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
+ currentMl = ZSTD_count(ip, match, iLimit);
+ } else {
+ const BYTE* const match = dictBase + matchIndex;
+@@ -703,7 +711,7 @@ size_t ZSTD_HcFindBestMatch(
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+- *offsetPtr = STORE_OFFSET(curr - matchIndex);
++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+
+@@ -739,7 +747,7 @@ size_t ZSTD_HcFindBestMatch(
+ if (currentMl > ml) {
+ ml = currentMl;
+ assert(curr > matchIndex + dmsIndexDelta);
+- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+
+@@ -756,8 +764,6 @@ size_t ZSTD_HcFindBestMatch(
+ * (SIMD) Row-based matchfinder
+ ***********************************/
+ /* Constants for row-based hash */
+-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
+-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
+ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
+ #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
+
+@@ -769,64 +775,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr
+ * Starting from the LSB, returns the idx of the next non-zero bit.
+ * Basically counting the nb of trailing zeroes.
+ */
+-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
+- assert(val != 0);
+-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
+- if (sizeof(size_t) == 4) {
+- U32 mostSignificantWord = (U32)(val >> 32);
+- U32 leastSignificantWord = (U32)val;
+- if (leastSignificantWord == 0) {
+- return 32 + (U32)__builtin_ctz(mostSignificantWord);
+- } else {
+- return (U32)__builtin_ctz(leastSignificantWord);
+- }
+- } else {
+- return (U32)__builtin_ctzll(val);
+- }
+-# else
+- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
+- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
+- */
+- val = ~val & (val - 1ULL); /* Lowest set bit mask */
+- val = val - ((val >> 1) & 0x5555555555555555);
+- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
+- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
+-# endif
+-}
+-
+-/* ZSTD_rotateRight_*():
+- * Rotates a bitfield to the right by "count" bits.
+- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
+- */
+-FORCE_INLINE_TEMPLATE
+-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
+- assert(count < 64);
+- count &= 0x3F; /* for fickle pattern recognition */
+- return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
+-}
+-
+-FORCE_INLINE_TEMPLATE
+-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
+- assert(count < 32);
+- count &= 0x1F; /* for fickle pattern recognition */
+- return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
+-}
+-
+-FORCE_INLINE_TEMPLATE
+-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
+- assert(count < 16);
+- count &= 0x0F; /* for fickle pattern recognition */
+- return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
++ return ZSTD_countTrailingZeros64(val);
+ }
+
+ /* ZSTD_row_nextIndex():
+ * Returns the next index to insert at within a tagTable row, and updates the "head"
+- * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
+ */
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
+- U32 const next = (*tagRow - 1) & rowMask;
+- *tagRow = (BYTE)next;
+- return next;
++ U32 next = (*tagRow-1) & rowMask;
++ next += (next == 0) ? rowMask : 0; /* skip first position */
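++    /* Editorial example: with rowMask == 15 and *tagRow starting at 0, successive
++     * calls return 15, 14, ..., 1, 15, ... -- slot 0 is never handed out because
++     * the first byte of each tag row doubles as the row's head counter. */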
++ *tagRow = (BYTE)next;
++ return next;
+ }
+
+ /* ZSTD_isAligned():
+@@ -840,7 +801,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
+ /* ZSTD_row_prefetch():
+ * Performs prefetching for the hashTable and tagTable at a given row.
+ */
+-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
+ PREFETCH_L1(hashTable + relRow);
+ if (rowLog >= 5) {
+ PREFETCH_L1(hashTable + relRow + 16);
+@@ -864,13 +825,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
+ U32 idx, const BYTE* const iLimit)
+ {
+ U32 const* const hashTable = ms->hashTable;
+- U16 const* const tagTable = ms->tagTable;
++ BYTE const* const tagTable = ms->tagTable;
+ U32 const hashLog = ms->rowHashLog;
+ U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
+ U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
+
+ for (; idx < lim; ++idx) {
+- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
+ U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
+ ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
+@@ -886,11 +847,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
+ * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
+ */
+ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
+- U16 const* tagTable, BYTE const* base,
++ BYTE const* tagTable, BYTE const* base,
+ U32 idx, U32 const hashLog,
+- U32 const rowLog, U32 const mls)
++ U32 const rowLog, U32 const mls,
++ U64 const hashSalt)
+ {
+- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
+ U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
+ { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
+@@ -908,22 +870,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
+ U32 const rowMask, U32 const useCache)
+ {
+ U32* const hashTable = ms->hashTable;
+- U16* const tagTable = ms->tagTable;
++ BYTE* const tagTable = ms->tagTable;
+ U32 const hashLog = ms->rowHashLog;
+ const BYTE* const base = ms->window.base;
+
+ DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
+ for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
+- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
+- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
++ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ U32* const row = hashTable + relRow;
+- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
+- Explicit cast allows us to get exact desired position within each row */
++ BYTE* tagRow = tagTable + relRow;
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+
+- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
+- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
+ row[pos] = updateStartIdx;
+ }
+ }
+@@ -971,7 +932,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
+ const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
+
+ DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
+- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
++}
++
++/* Returns the mask width of bits group of which will be set to 1. Given not all
++ * architectures have easy movemask instruction, this helps to iterate over
++ * groups of bits easier and faster.
++ */
++FORCE_INLINE_TEMPLATE U32
++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
++{
++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
++ (void)rowEntries;
++#if defined(ZSTD_ARCH_ARM_NEON)
++ /* NEON path only works for little endian */
++ if (!MEM_isLittleEndian()) {
++ return 1;
++ }
++ if (rowEntries == 16) {
++ return 4;
++ }
++ if (rowEntries == 32) {
++ return 2;
++ }
++ if (rowEntries == 64) {
++ return 1;
++ }
++#endif
++ return 1;
+ }
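++/* Editorial note: on the little-endian NEON paths each row entry occupies a group
++ * of 4, 2 or 1 bits of the resulting ZSTD_VecMask (e.g. 16 entries x 4 bits = 64
++ * mask bits), which is why the match-mask code below rotates by a "headGrouped"
++ * amount rather than by a plain head index. */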
+
+ #if defined(ZSTD_ARCH_X86_SSE2)
+@@ -994,71 +983,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
+ }
+ #endif
+
+-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
+- * the hash at the nth position in a row of the tagTable.
+- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
+- * to match up with the actual layout of the entries within the hashTable */
++#if defined(ZSTD_ARCH_ARM_NEON)
++FORCE_INLINE_TEMPLATE ZSTD_VecMask
++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
++{
++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
++ if (rowEntries == 16) {
++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
++ * After that groups of 4 bits represent the equalMask. We lower
++ * all bits except the highest in these groups by doing AND with
++ * 0x88 = 0b10001000.
++ */
++ const uint8x16_t chunk = vld1q_u8(src);
++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
++ const uint8x8_t res = vshrn_n_u16(equalMask, 4);
++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
++ } else if (rowEntries == 32) {
++ /* Same idea as with rowEntries == 16 but doing AND with
++ * 0x55 = 0b01010101.
++ */
++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
++ const uint8x16_t dup = vdupq_n_u8(tag);
++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
++ const uint8x8_t res = vsli_n_u8(t0, t1, 4);
++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
++ } else { /* rowEntries == 64 */
++ const uint8x16x4_t chunk = vld4q_u8(src);
++ const uint8x16_t dup = vdupq_n_u8(tag);
++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
++
++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
++ return ZSTD_rotateRight_U64(matches, headGrouped);
++ }
++}
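++/* Editorial note: NEON lacks a direct movemask instruction, so the routine above
++ * emulates one with narrowing shifts (vshrn/vsri/vsli), producing the grouped
++ * bit layout described by ZSTD_row_matchMaskGroupWidth(). */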
++#endif
++
++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
++ * matches the hash at the nth position in a row of the tagTable.
++ * Each row is a circular buffer beginning at the value of "headGrouped". So we
++ * must rotate the "matches" bitfield to match up with the actual layout of the
++ * entries within the hashTable */
+ FORCE_INLINE_TEMPLATE ZSTD_VecMask
+-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
+ {
+- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
++ const BYTE* const src = tagRow;
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);
+
+ #if defined(ZSTD_ARCH_X86_SSE2)
+
+- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
+
+ #else /* SW or NEON-LE */
+
+ # if defined(ZSTD_ARCH_ARM_NEON)
+ /* This NEON path only works for little endian - otherwise use SWAR below */
+ if (MEM_isLittleEndian()) {
+- if (rowEntries == 16) {
+- const uint8x16_t chunk = vld1q_u8(src);
+- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
+- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
+- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
+- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
+- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
+- const U16 hi = (U16)vgetq_lane_u8(t3, 8);
+- const U16 lo = (U16)vgetq_lane_u8(t3, 0);
+- return ZSTD_rotateRight_U16((hi << 8) | lo, head);
+- } else if (rowEntries == 32) {
+- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
+- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
+- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
+- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
+- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
+- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
+- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
+- const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
+- const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
+- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
+- const uint8x8x2_t t3 = vuzp_u8(t2, t0);
+- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
+- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
+- return ZSTD_rotateRight_U32(matches, head);
+- } else { /* rowEntries == 64 */
+- const uint8x16x4_t chunk = vld4q_u8(src);
+- const uint8x16_t dup = vdupq_n_u8(tag);
+- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
+- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
+- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
+- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
+-
+- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
+- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
+- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
+- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
+- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
+- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
+- return ZSTD_rotateRight_U64(matches, head);
+- }
++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
+ }
+ # endif /* ZSTD_ARCH_ARM_NEON */
+ /* SWAR */
+- { const size_t chunkSize = sizeof(size_t);
++ { const int chunkSize = sizeof(size_t);
+ const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
+ const size_t xFF = ~((size_t)0);
+ const size_t x01 = xFF / 0xFF;
+@@ -1091,11 +1091,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
+ }
+ matches = ~matches;
+ if (rowEntries == 16) {
+- return ZSTD_rotateRight_U16((U16)matches, head);
++ return ZSTD_rotateRight_U16((U16)matches, headGrouped);
+ } else if (rowEntries == 32) {
+- return ZSTD_rotateRight_U32((U32)matches, head);
++ return ZSTD_rotateRight_U32((U32)matches, headGrouped);
+ } else {
+- return ZSTD_rotateRight_U64((U64)matches, head);
++ return ZSTD_rotateRight_U64((U64)matches, headGrouped);
+ }
+ }
+ #endif
+@@ -1125,7 +1125,7 @@ size_t ZSTD_RowFindBestMatch(
+ const U32 rowLog)
+ {
+ U32* const hashTable = ms->hashTable;
+- U16* const tagTable = ms->tagTable;
++ BYTE* const tagTable = ms->tagTable;
+ U32* const hashCache = ms->hashCache;
+ const U32 hashLog = ms->rowHashLog;
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+@@ -1143,8 +1143,11 @@ size_t ZSTD_RowFindBestMatch(
+ const U32 rowEntries = (1U << rowLog);
+ const U32 rowMask = rowEntries - 1;
+ const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
++ const U64 hashSalt = ms->hashSalt;
+ U32 nbAttempts = 1U << cappedSearchLog;
+ size_t ml=4-1;
++ U32 hash;
+
+ /* DMS/DDS variables that may be referenced later */
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+@@ -1168,7 +1171,7 @@ size_t ZSTD_RowFindBestMatch(
+ if (dictMode == ZSTD_dictMatchState) {
+ /* Prefetch DMS rows */
+ U32* const dmsHashTable = dms->hashTable;
+- U16* const dmsTagTable = dms->tagTable;
++ BYTE* const dmsTagTable = dms->tagTable;
+ U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
+@@ -1178,23 +1181,34 @@ size_t ZSTD_RowFindBestMatch(
+ }
+
+ /* Update the hashTable and tagTable up to (but not including) ip */
+- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
++ if (!ms->lazySkipping) {
++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
++ } else {
++ /* Stop inserting every position when in the lazy skipping mode.
++ * The hash cache is also not kept up to date in this mode.
++ */
++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
++ ms->nextToUpdate = curr;
++ }
++ ms->hashSaltEntropy += hash; /* collect salt entropy */
++
+ { /* Get the hash for ip, compute the appropriate row */
+- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
+ U32* const row = hashTable + relRow;
+ BYTE* tagRow = (BYTE*)(tagTable + relRow);
+- U32 const head = *tagRow & rowMask;
++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
+ size_t numMatches = 0;
+ size_t currMatch = 0;
+- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
+
+ /* Cycle through the matches and prefetch */
+- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
+- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
+ U32 const matchIndex = row[matchPos];
++ if(matchPos == 0) continue;
+ assert(numMatches < rowEntries);
+ if (matchIndex < lowLimit)
+ break;
+@@ -1204,13 +1218,14 @@ size_t ZSTD_RowFindBestMatch(
+ PREFETCH_L1(dictBase + matchIndex);
+ }
+ matchBuffer[numMatches++] = matchIndex;
++ --nbAttempts;
+ }
+
+ /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
+ in ZSTD_row_update_internal() at the next search. */
+ {
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
++ tagRow[pos] = (BYTE)tag;
+ row[pos] = ms->nextToUpdate++;
+ }
+
+@@ -1224,7 +1239,8 @@ size_t ZSTD_RowFindBestMatch(
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+ const BYTE* const match = base + matchIndex;
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
+- if (match[ml] == ip[ml]) /* potentially better */
++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
+ currentMl = ZSTD_count(ip, match, iLimit);
+ } else {
+ const BYTE* const match = dictBase + matchIndex;
+@@ -1236,7 +1252,7 @@ size_t ZSTD_RowFindBestMatch(
+ /* Save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+- *offsetPtr = STORE_OFFSET(curr - matchIndex);
++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+ }
+@@ -1254,19 +1270,21 @@ size_t ZSTD_RowFindBestMatch(
+ const U32 dmsSize = (U32)(dmsEnd - dmsBase);
+ const U32 dmsIndexDelta = dictLimit - dmsSize;
+
+- { U32 const head = *dmsTagRow & rowMask;
++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
+ size_t numMatches = 0;
+ size_t currMatch = 0;
+- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
+
+- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
+- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
+ U32 const matchIndex = dmsRow[matchPos];
++ if(matchPos == 0) continue;
+ if (matchIndex < dmsLowestIndex)
+ break;
+ PREFETCH_L1(dmsBase + matchIndex);
+ matchBuffer[numMatches++] = matchIndex;
++ --nbAttempts;
+ }
+
+ /* Return the longest match */
+@@ -1285,7 +1303,7 @@ size_t ZSTD_RowFindBestMatch(
+ if (currentMl > ml) {
+ ml = currentMl;
+ assert(curr > matchIndex + dmsIndexDelta);
+- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
+ if (ip+currentMl == iLimit) break;
+ }
+ }
+@@ -1491,7 +1509,8 @@ ZSTD_compressBlock_lazy_generic(
+ const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
+
+- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
++ U32 offset_1 = rep[0], offset_2 = rep[1];
++ U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+ const int isDMS = dictMode == ZSTD_dictMatchState;
+ const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
+@@ -1512,8 +1531,8 @@ ZSTD_compressBlock_lazy_generic(
+ U32 const curr = (U32)(ip - base);
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
+ U32 const maxRep = curr - windowLow;
+- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
+- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
+ }
+ if (isDxS) {
+ /* dictMatchState repCode checks don't currently handle repCode == 0
+@@ -1522,10 +1541,11 @@ ZSTD_compressBlock_lazy_generic(
+ assert(offset_2 <= dictAndPrefixLength);
+ }
+
++ /* Reset the lazy skipping state */
++ ms->lazySkipping = 0;
++
+ if (searchMethod == search_rowHash) {
+- ZSTD_row_fillHashCache(ms, base, rowLog,
+- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+- ms->nextToUpdate, ilimit);
++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+ }
+
+ /* Match Loop */
+@@ -1537,7 +1557,7 @@ ZSTD_compressBlock_lazy_generic(
+ #endif
+ while (ip < ilimit) {
+ size_t matchLength=0;
+- size_t offcode=STORE_REPCODE_1;
++ size_t offBase = REPCODE1_TO_OFFBASE;
+ const BYTE* start=ip+1;
+ DEBUGLOG(7, "search baseline (depth 0)");
+
+@@ -1562,14 +1582,23 @@ ZSTD_compressBlock_lazy_generic(
+ }
+
+ /* first search (depth 0) */
+- { size_t offsetFound = 999999999;
+- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode);
++ { size_t offbaseFound = 999999999;
++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
+ if (ml2 > matchLength)
+- matchLength = ml2, start = ip, offcode=offsetFound;
++ matchLength = ml2, start = ip, offBase = offbaseFound;
+ }
+
+ if (matchLength < 4) {
+- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
++ ip += step;
++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
++ * In this mode we stop inserting every position into our tables, and only insert
++ * positions that we search, which is one in step positions.
++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
++ * triggered once we've gone 2KB without finding any matches.
++ */
++ ms->lazySkipping = step > kLazySkippingStep;
+ continue;
+ }
+
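The lazy-skipping comment in the hunk above boils down to a simple stride rule. A minimal standalone sketch, assuming kSearchStrength = 8 and kLazySkippingStep = 8 (values assumed here, not quoted from the tree):

#include <stddef.h>

#define kSearchStrength   8   /* assumed: stride grows by one every 256 unmatched bytes */
#define kLazySkippingStep 8   /* assumed: 8 << kSearchStrength is roughly 2KB */

/* Returns the next position to search and flags lazy-skipping mode,
 * mirroring the "ip += step" logic in the hunk above. */
static size_t nextSearchPos(size_t ip, size_t anchor, int *lazySkipping)
{
    size_t const step = ((ip - anchor) >> kSearchStrength) + 1;
    *lazySkipping = step > kLazySkippingStep;
    return ip + step;
}

With these constants, skipping turns on once step exceeds 8, i.e. after about 2KB of input without a match, which is what the comment describes.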
+@@ -1579,12 +1608,12 @@ ZSTD_compressBlock_lazy_generic(
+ DEBUGLOG(7, "search depth 1");
+ ip ++;
+ if ( (dictMode == ZSTD_noDict)
+- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+ size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+ int const gain2 = (int)(mlRep * 3);
+- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
+ if ((mlRep >= 4) && (gain2 > gain1))
+- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
+ }
+ if (isDxS) {
+ const U32 repIndex = (U32)(ip - base) - offset_1;
+@@ -1596,17 +1625,17 @@ ZSTD_compressBlock_lazy_generic(
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+ size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+ int const gain2 = (int)(mlRep * 3);
+- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
+ if ((mlRep >= 4) && (gain2 > gain1))
+- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
+ }
+ }
+- { size_t offset2=999999999;
+- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode);
+- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
+- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
++ { size_t ofbCandidate=999999999;
++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
+ if ((ml2 >= 4) && (gain2 > gain1)) {
+- matchLength = ml2, offcode = offset2, start = ip;
++ matchLength = ml2, offBase = ofbCandidate, start = ip;
+ continue; /* search a better one */
+ } }
+
+@@ -1615,12 +1644,12 @@ ZSTD_compressBlock_lazy_generic(
+ DEBUGLOG(7, "search depth 2");
+ ip ++;
+ if ( (dictMode == ZSTD_noDict)
+- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+ size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+ int const gain2 = (int)(mlRep * 4);
+- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
+ if ((mlRep >= 4) && (gain2 > gain1))
+- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
+ }
+ if (isDxS) {
+ const U32 repIndex = (U32)(ip - base) - offset_1;
+@@ -1632,17 +1661,17 @@ ZSTD_compressBlock_lazy_generic(
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+ size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+ int const gain2 = (int)(mlRep * 4);
+- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
+ if ((mlRep >= 4) && (gain2 > gain1))
+- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
+ }
+ }
+- { size_t offset2=999999999;
+- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode);
+- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
+- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
++ { size_t ofbCandidate=999999999;
++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
+ if ((ml2 >= 4) && (gain2 > gain1)) {
+- matchLength = ml2, offcode = offset2, start = ip;
++ matchLength = ml2, offBase = ofbCandidate, start = ip;
+ continue;
+ } } }
+ break; /* nothing found : store previous solution */
+@@ -1653,26 +1682,33 @@ ZSTD_compressBlock_lazy_generic(
+ * notably if `value` is unsigned, resulting in a large positive `-value`.
+ */
+ /* catch up */
+- if (STORED_IS_OFFSET(offcode)) {
++ if (OFFBASE_IS_OFFSET(offBase)) {
+ if (dictMode == ZSTD_noDict) {
+- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
+- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */
++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */
+ { start--; matchLength++; }
+ }
+ if (isDxS) {
+- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
+ const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
+ const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
+ while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
+ }
+- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
+ }
+ /* store sequence */
+ _storeSequence:
+ { size_t const litLength = (size_t)(start - anchor);
+- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
+ anchor = ip = start + matchLength;
+ }
++ if (ms->lazySkipping) {
++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
++ if (searchMethod == search_rowHash) {
++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
++ }
++ ms->lazySkipping = 0;
++ }
+
+ /* check immediate repcode */
+ if (isDxS) {
+@@ -1686,8 +1722,8 @@ ZSTD_compressBlock_lazy_generic(
+ && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+ const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
+ matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
+- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */
+- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */
++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
+ ip += matchLength;
+ anchor = ip;
+ continue;
+@@ -1701,16 +1737,20 @@ ZSTD_compressBlock_lazy_generic(
+ && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
+ /* store sequence */
+ matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */
+- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
+ ip += matchLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+ } } }
+
+- /* Save reps for next block */
+- rep[0] = offset_1 ? offset_1 : savedOffset;
+- rep[1] = offset_2 ? offset_2 : savedOffset;
++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
++
++ /* save reps for next block */
++ rep[0] = offset_1 ? offset_1 : offsetSaved1;
++ rep[1] = offset_2 ? offset_2 : offsetSaved2;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+@@ -1886,12 +1926,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
+
++ /* Reset the lazy skipping state */
++ ms->lazySkipping = 0;
++
+ /* init */
+ ip += (ip == prefixStart);
+ if (searchMethod == search_rowHash) {
+- ZSTD_row_fillHashCache(ms, base, rowLog,
+- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+- ms->nextToUpdate, ilimit);
++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+ }
+
+ /* Match Loop */
+@@ -1903,7 +1944,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+ #endif
+ while (ip < ilimit) {
+ size_t matchLength=0;
+- size_t offcode=STORE_REPCODE_1;
++ size_t offBase = REPCODE1_TO_OFFBASE;
+ const BYTE* start=ip+1;
+ U32 curr = (U32)(ip-base);
+
+@@ -1922,14 +1963,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+ } }
+
+ /* first search (depth 0) */
+- { size_t offsetFound = 999999999;
+- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict);
++ { size_t ofbCandidate = 999999999;
++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+ if (ml2 > matchLength)
+- matchLength = ml2, start = ip, offcode=offsetFound;
++ matchLength = ml2, start = ip, offBase = ofbCandidate;
+ }
+
+ if (matchLength < 4) {
+- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
++ ip += step + 1; /* jump faster over incompressible sections */
++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
++ * In this mode we stop inserting every position into our tables, and only insert
++ * positions that we search, which is one in step positions.
++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
++ * triggered once we've gone 2KB without finding any matches.
++ */
++ ms->lazySkipping = step > kLazySkippingStep;
+ continue;
+ }
+
+@@ -1939,7 +1989,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+ ip ++;
+ curr++;
+ /* check repCode */
+- if (offcode) {
++ if (offBase) {
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
+ const U32 repIndex = (U32)(curr - offset_1);
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+@@ -1951,18 +2001,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+ int const gain2 = (int)(repLength * 3);
+- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
+ if ((repLength >= 4) && (gain2 > gain1))
+- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
+ } }
+
+ /* search match, depth 1 */
+- { size_t offset2=999999999;
+- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict);
+- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
+- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
++ { size_t ofbCandidate = 999999999;
++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
+ if ((ml2 >= 4) && (gain2 > gain1)) {
+- matchLength = ml2, offcode = offset2, start = ip;
++ matchLength = ml2, offBase = ofbCandidate, start = ip;
+ continue; /* search a better one */
+ } }
+
+@@ -1971,7 +2021,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+ ip ++;
+ curr++;
+ /* check repCode */
+- if (offcode) {
++ if (offBase) {
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
+ const U32 repIndex = (U32)(curr - offset_1);
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+@@ -1983,38 +2033,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+ int const gain2 = (int)(repLength * 4);
+- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
+ if ((repLength >= 4) && (gain2 > gain1))
+- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
+ } }
+
+ /* search match, depth 2 */
+- { size_t offset2=999999999;
+- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict);
+- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
+- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
++ { size_t ofbCandidate = 999999999;
++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
+ if ((ml2 >= 4) && (gain2 > gain1)) {
+- matchLength = ml2, offcode = offset2, start = ip;
++ matchLength = ml2, offBase = ofbCandidate, start = ip;
+ continue;
+ } } }
+ break; /* nothing found : store previous solution */
+ }
+
+ /* catch up */
+- if (STORED_IS_OFFSET(offcode)) {
+- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
++ if (OFFBASE_IS_OFFSET(offBase)) {
++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
+ const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
+ const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
+ while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
+- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
+ }
+
+ /* store sequence */
+ _storeSequence:
+ { size_t const litLength = (size_t)(start - anchor);
+- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
+ anchor = ip = start + matchLength;
+ }
++ if (ms->lazySkipping) {
++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
++ if (searchMethod == search_rowHash) {
++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
++ }
++ ms->lazySkipping = 0;
++ }
+
+ /* check immediate repcode */
+ while (ip <= ilimit) {
+@@ -2029,8 +2086,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+ /* repcode detected we should take it */
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */
+- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */
++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
+ ip += matchLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+@@ -2096,7 +2153,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row(
+ size_t ZSTD_compressBlock_lazy2_extDict_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+-
+ {
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
+ }
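The grouped match-mask machinery above (headGrouped, groupWidth, and the rotate at the end of each SIMD path) is easier to follow against a scalar reference. A minimal sketch for the simplest case only, 16 entries with groupWidth == 1; it is an illustration, not the kernel code:

#include <stdint.h>

/* One bit per row entry, set when the stored tag equals the probe tag,
 * then rotated right by `head` so that bit 0 lines up with the row's
 * circular head position. The SSE2/NEON/SWAR paths compute the same
 * thing branch-free for 16/32/64 entries. */
static uint16_t rowMatchMask16(const uint8_t tags[16], uint8_t tag, unsigned head)
{
    uint16_t matches = 0;
    unsigned i;
    for (i = 0; i < 16; i++)
        matches |= (uint16_t)(tags[i] == tag) << i;
    head &= 15;
    return head ? (uint16_t)((matches >> head) | (matches << (16 - head))) : matches;
}

A lowest-set-bit scan of the returned mask then gives the first candidate slot counted from the head, which is how the search loop walks the row.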
+diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h
+index e5bdf4df8..9505bed93 100644
+--- a/lib/zstd/compress/zstd_lazy.h
++++ b/lib/zstd/compress/zstd_lazy.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -22,6 +23,8 @@
+ */
+ #define ZSTD_LAZY_DDSS_BUCKET_LOG 2
+
++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
++
+ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
+ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip);
+
+@@ -113,7 +116,7 @@ size_t ZSTD_compressBlock_lazy2_extDict_row(
+ size_t ZSTD_compressBlock_btlazy2_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+-
++
+
+
+ #endif /* ZSTD_LAZY_H */
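The offcode to offBase renames running through this patch refer to zstd's sumtype encoding, where one value carries either a repcode id or a biased real offset. A minimal paraphrase of that convention, assuming a bias of ZSTD_REP_NUM = 3 (treat the exact constants as illustrative):

#include <assert.h>

#define REP_NUM 3  /* assumed: number of repcodes tracked per block */

/* offBase sumtype: 1..REP_NUM encode repcodes, anything above encodes
 * a real offset biased by REP_NUM, so the two ranges never collide. */
static unsigned offsetToOffBase(unsigned offset)  { assert(offset > 0); return offset + REP_NUM; }
static unsigned repcodeToOffBase(unsigned repId)  { assert(repId >= 1 && repId <= REP_NUM); return repId; }
static int      offBaseIsOffset(unsigned offBase) { return offBase > REP_NUM; }
static unsigned offBaseToOffset(unsigned offBase) { assert(offBase > REP_NUM); return offBase - REP_NUM; }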
+diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c
+index dd86fc83e..b7da76b0d 100644
+--- a/lib/zstd/compress/zstd_ldm.c
++++ b/lib/zstd/compress/zstd_ldm.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -242,11 +243,11 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
+ switch(ms->cParams.strategy)
+ {
+ case ZSTD_fast:
+- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast);
++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
+ break;
+
+ case ZSTD_dfast:
+- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast);
++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
+ break;
+
+ case ZSTD_greedy:
+@@ -549,7 +550,7 @@ size_t ZSTD_ldm_generateSequences(
+ * the window through early invalidation.
+ * TODO: * Test the chunk size.
+ * * Try invalidation after the sequence generation and test the
+- * the offset against maxDist directly.
++ * offset against maxDist directly.
+ *
+ * NOTE: Because of dictionaries + sequence splitting we MUST make sure
+ * that any offset used is valid at the END of the sequence, since it may
+@@ -711,7 +712,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+ rep[0] = sequence.offset;
+ /* Store the sequence */
+ ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend,
+- STORE_OFFSET(sequence.offset),
++ OFFSET_TO_OFFBASE(sequence.offset),
+ sequence.matchLength);
+ ip += sequence.matchLength;
+ }
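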
+diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h
+index fbc6a5e88..c540731ab 100644
+--- a/lib/zstd/compress/zstd_ldm.h
++++ b/lib/zstd/compress/zstd_ldm.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h
+index 647f865be..cfccfc46f 100644
+--- a/lib/zstd/compress/zstd_ldm_geartab.h
++++ b/lib/zstd/compress/zstd_ldm_geartab.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c
+index fd82acfda..1e41cb04f 100644
+--- a/lib/zstd/compress/zstd_opt.c
++++ b/lib/zstd/compress/zstd_opt.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -16,7 +17,7 @@
+ #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
+ #define ZSTD_MAX_PRICE (1<<30)
+
+-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
+
+
+ /*-*************************************
+@@ -26,27 +27,35 @@
+ #if 0 /* approximation at bit level (for tests) */
+ # define BITCOST_ACCURACY 0
+ # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat))
++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat))
+ #elif 0 /* fractional bit accuracy (for tests) */
+ # define BITCOST_ACCURACY 8
+ # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat))
+ #else /* opt==approx, ultra==accurate */
+ # define BITCOST_ACCURACY 8
+ # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
++# define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
+ #endif
+
++/* ZSTD_bitWeight() :
++ * provide estimated "cost" of a stat in full bits only */
+ MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
+ {
+ return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER);
+ }
+
++/* ZSTD_fracWeight() :
++ * provide fractional-bit "cost" of a stat,
++ * using linear interpolation approximation */
+ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
+ {
+ U32 const stat = rawStat + 1;
+ U32 const hb = ZSTD_highbit32(stat);
+ U32 const BWeight = hb * BITCOST_MULTIPLIER;
++ /* Fweight was meant for "Fractional weight"
++ * but it's effectively a value between 1 and 2
++ * using fixed point arithmetic */
+ U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
+ U32 const weight = BWeight + FWeight;
+ assert(hb + BITCOST_ACCURACY < 31);
+@@ -57,7 +66,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
+ /* debugging function,
+ * @return price in bytes as fractional value
+ * for debug messages only */
+-MEM_STATIC double ZSTD_fCost(U32 price)
++MEM_STATIC double ZSTD_fCost(int price)
+ {
+ return (double)price / (BITCOST_MULTIPLIER*8);
+ }
+@@ -88,20 +97,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts)
+ return total;
+ }
+
+-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift)
++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e;
++
++static U32
++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1)
+ {
+ U32 s, sum=0;
+- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift);
++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)",
++ (unsigned)lastEltIndex+1, (unsigned)shift );
+ assert(shift < 30);
+ for (s=0; s<lastEltIndex+1; s++) {
+- table[s] = 1 + (table[s] >> shift);
+- sum += table[s];
++ unsigned const base = base1 ? 1 : (table[s]>0);
++ unsigned const newStat = base + (table[s] >> shift);
++ sum += newStat;
++ table[s] = newStat;
+ }
+ return sum;
+ }
+
+ /* ZSTD_scaleStats() :
+- * reduce all elements in table is sum too large
++ * reduce all elt frequencies in table if sum too large
+ * return the resulting sum of elements */
+ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
+ {
+@@ -110,7 +125,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
+ DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget);
+ assert(logTarget < 30);
+ if (factor <= 1) return prevsum;
+- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor));
++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed);
+ }
+
+ /* ZSTD_rescaleFreqs() :
+@@ -129,18 +144,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
+ DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize);
+ optPtr->priceType = zop_dynamic;
+
+- if (optPtr->litLengthSum == 0) { /* first block : init */
+- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */
+- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef");
++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */
++
++ /* heuristic: use pre-defined stats for too small inputs */
++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) {
++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD);
+ optPtr->priceType = zop_predef;
+ }
+
+ assert(optPtr->symbolCosts != NULL);
+ if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) {
+- /* huffman table presumed generated by dictionary */
++
++ /* huffman stats covering the full value set : table presumed generated by dictionary */
+ optPtr->priceType = zop_dynamic;
+
+ if (compressedLiterals) {
++ /* generate literals statistics from huffman table */
+ unsigned lit;
+ assert(optPtr->litFreq != NULL);
+ optPtr->litSum = 0;
+@@ -188,13 +207,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
+ optPtr->offCodeSum += optPtr->offCodeFreq[of];
+ } }
+
+- } else { /* not a dictionary */
++ } else { /* first block, no dictionary */
+
+ assert(optPtr->litFreq != NULL);
+ if (compressedLiterals) {
++ /* base initial cost of literals on direct frequency within src */
+ unsigned lit = MaxLit;
+ HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */
+- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8);
++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible);
+ }
+
+ { unsigned const baseLLfreqs[MaxLL+1] = {
+@@ -224,10 +244,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
+ optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1);
+ }
+
+-
+ }
+
+- } else { /* new block : re-use previous statistics, scaled down */
++ } else { /* new block : scale down accumulated statistics */
+
+ if (compressedLiterals)
+ optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12);
+@@ -255,11 +274,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
+ return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */
+
+ /* dynamic statistics */
+- { U32 price = litLength * optPtr->litSumBasePrice;
++ { U32 price = optPtr->litSumBasePrice * litLength;
++ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER;
+ U32 u;
++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER);
+ for (u=0; u < litLength; u++) {
+- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */
+- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel);
++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel);
++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax;
++ price -= litPrice;
+ }
+ return price;
+ }
+@@ -272,10 +294,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
+ assert(litLength <= ZSTD_BLOCKSIZE_MAX);
+ if (optPtr->priceType == zop_predef)
+ return WEIGHT(litLength, optLevel);
+- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
+- * because it isn't representable in the zstd format. So instead just
+- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block
+- * would be all literals.
++
++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
++ * because it isn't representable in the zstd format.
++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1.
++ * In such a case, the block would be all literals.
+ */
+ if (litLength == ZSTD_BLOCKSIZE_MAX)
+ return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel);
+@@ -289,24 +312,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
+ }
+
+ /* ZSTD_getMatchPrice() :
+- * Provides the cost of the match part (offset + matchLength) of a sequence
++ * Provides the cost of the match part (offset + matchLength) of a sequence.
+ * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence.
+- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2
++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq()
+ * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency)
+ */
+ FORCE_INLINE_TEMPLATE U32
+-ZSTD_getMatchPrice(U32 const offcode,
++ZSTD_getMatchPrice(U32 const offBase,
+ U32 const matchLength,
+ const optState_t* const optPtr,
+ int const optLevel)
+ {
+ U32 price;
+- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode));
++ U32 const offCode = ZSTD_highbit32(offBase);
+ U32 const mlBase = matchLength - MINMATCH;
+ assert(matchLength >= MINMATCH);
+
+- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */
+- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER);
++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */
++ return WEIGHT(mlBase, optLevel)
++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */
+
+ /* dynamic statistics */
+ price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel));
+@@ -325,10 +349,10 @@ ZSTD_getMatchPrice(U32 const offcode,
+ }
+
+ /* ZSTD_updateStats() :
+- * assumption : literals + litLengtn <= iend */
++ * assumption : literals + litLength <= iend */
+ static void ZSTD_updateStats(optState_t* const optPtr,
+ U32 litLength, const BYTE* literals,
+- U32 offsetCode, U32 matchLength)
++ U32 offBase, U32 matchLength)
+ {
+ /* literals */
+ if (ZSTD_compressedLiterals(optPtr)) {
+@@ -344,8 +368,8 @@ static void ZSTD_updateStats(optState_t* const optPtr,
+ optPtr->litLengthSum++;
+ }
+
+- /* offset code : expected to follow storeSeq() numeric representation */
+- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode));
++ /* offset code : follows storeSeq() numeric representation */
++ { U32 const offCode = ZSTD_highbit32(offBase);
+ assert(offCode <= MaxOff);
+ optPtr->offCodeFreq[offCode]++;
+ optPtr->offCodeSum++;
+@@ -552,16 +576,17 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
+ ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict);
+ }
+
+-FORCE_INLINE_TEMPLATE
+-U32 ZSTD_insertBtAndGetAllMatches (
+- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */
+- ZSTD_matchState_t* ms,
+- U32* nextToUpdate3,
+- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode,
+- const U32 rep[ZSTD_REP_NUM],
+- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
+- const U32 lengthToBeat,
+- U32 const mls /* template */)
++FORCE_INLINE_TEMPLATE U32
++ZSTD_insertBtAndGetAllMatches (
++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */
++ ZSTD_matchState_t* ms,
++ U32* nextToUpdate3,
++ const BYTE* const ip, const BYTE* const iLimit,
++ const ZSTD_dictMode_e dictMode,
++ const U32 rep[ZSTD_REP_NUM],
++ const U32 ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
++ const U32 lengthToBeat,
++ const U32 mls /* template */)
+ {
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
+@@ -644,7 +669,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
+ DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
+ repCode, ll0, repOffset, repLen);
+ bestLength = repLen;
+- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */
++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */
+ matches[mnum].len = (U32)repLen;
+ mnum++;
+ if ( (repLen > sufficient_len)
+@@ -673,7 +698,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
+ bestLength = mlen;
+ assert(curr > matchIndex3);
+ assert(mnum==0); /* no prior solution */
+- matches[0].off = STORE_OFFSET(curr - matchIndex3);
++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3);
+ matches[0].len = (U32)mlen;
+ mnum = 1;
+ if ( (mlen > sufficient_len) |
+@@ -706,13 +731,13 @@ U32 ZSTD_insertBtAndGetAllMatches (
+ }
+
+ if (matchLength > bestLength) {
+- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)",
+- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)",
++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
+ assert(matchEndIdx > matchIndex);
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+ bestLength = matchLength;
+- matches[mnum].off = STORE_OFFSET(curr - matchIndex);
++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
+ matches[mnum].len = (U32)matchLength;
+ mnum++;
+ if ( (matchLength > ZSTD_OPT_NUM)
+@@ -754,12 +779,12 @@ U32 ZSTD_insertBtAndGetAllMatches (
+
+ if (matchLength > bestLength) {
+ matchIndex = dictMatchIndex + dmsIndexDelta;
+- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)",
+- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)",
++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+ bestLength = matchLength;
+- matches[mnum].off = STORE_OFFSET(curr - matchIndex);
++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
+ matches[mnum].len = (U32)matchLength;
+ mnum++;
+ if ( (matchLength > ZSTD_OPT_NUM)
+@@ -960,7 +985,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
+ const ZSTD_optLdm_t* optLdm, U32 currPosInBlock)
+ {
+ U32 const posDiff = currPosInBlock - optLdm->startPosInBlock;
+- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */
++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */
+ U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff;
+
+ /* Ensure that current block position is not outside of the match */
+@@ -971,11 +996,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
+ }
+
+ if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) {
+- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset);
+- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u",
+- candidateOffCode, candidateMatchLength, currPosInBlock);
++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset);
++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u",
++ candidateOffBase, candidateMatchLength, currPosInBlock);
+ matches[*nbMatches].len = candidateMatchLength;
+- matches[*nbMatches].off = candidateOffCode;
++ matches[*nbMatches].off = candidateOffBase;
+ (*nbMatches)++;
+ }
+ }
+@@ -1062,6 +1087,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+ ZSTD_optimal_t lastSequence;
+ ZSTD_optLdm_t optLdm;
+
++ ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t));
++
+ optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore;
+ optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0;
+ ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip));
+@@ -1098,14 +1125,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+
+ /* large match -> immediate encoding */
+ { U32 const maxML = matches[nbMatches-1].len;
+- U32 const maxOffcode = matches[nbMatches-1].off;
+- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series",
+- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart));
++ U32 const maxOffBase = matches[nbMatches-1].off;
++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series",
++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart));
+
+ if (maxML > sufficient_len) {
+ lastSequence.litlen = litlen;
+ lastSequence.mlen = maxML;
+- lastSequence.off = maxOffcode;
++ lastSequence.off = maxOffBase;
+ DEBUGLOG(6, "large match (%u>%u), immediate encoding",
+ maxML, sufficient_len);
+ cur = 0;
+@@ -1122,15 +1149,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+ opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */
+ }
+ for (matchNb = 0; matchNb < nbMatches; matchNb++) {
+- U32 const offcode = matches[matchNb].off;
++ U32 const offBase = matches[matchNb].off;
+ U32 const end = matches[matchNb].len;
+ for ( ; pos <= end ; pos++ ) {
+- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel);
++ U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel);
+ U32 const sequencePrice = literalsPrice + matchPrice;
+ DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
+- pos, ZSTD_fCost(sequencePrice));
++ pos, ZSTD_fCost((int)sequencePrice));
+ opt[pos].mlen = pos;
+- opt[pos].off = offcode;
++ opt[pos].off = offBase;
+ opt[pos].litlen = litlen;
+ opt[pos].price = (int)sequencePrice;
+ } }
+@@ -1230,7 +1257,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+ U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
+ U32 mlen;
+
+- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u",
++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u",
+ matchNb, matches[matchNb].off, lastML, litlen);
+
+ for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */
+@@ -1296,7 +1323,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+ for (storePos=storeStart; storePos <= storeEnd; storePos++) {
+ U32 const llen = opt[storePos].litlen;
+ U32 const mlen = opt[storePos].mlen;
+- U32 const offCode = opt[storePos].off;
++ U32 const offBase = opt[storePos].off;
+ U32 const advance = llen + mlen;
+ DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u",
+ anchor - istart, (unsigned)llen, (unsigned)mlen);
+@@ -1308,8 +1335,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+ }
+
+ assert(anchor + llen <= iend);
+- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen);
+- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen);
++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen);
++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen);
+ anchor += advance;
+ ip = anchor;
+ } }
+@@ -1349,7 +1376,7 @@ size_t ZSTD_compressBlock_btopt(
+ /* ZSTD_initStats_ultra():
+ * make a first compression pass, just to seed stats with more accurate starting values.
+ * only works on first block, with no dictionary and no ldm.
+- * this function cannot error, hence its contract must be respected.
++ * this function cannot error out, its narrow contract must be respected.
+ */
+ static void
+ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
+@@ -1368,7 +1395,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
+
+ ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/
+
+- /* invalidate first scan from history */
++ /* invalidate first scan from history, only keep entropy stats */
+ ZSTD_resetSeqStore(seqStore);
+ ms->window.base -= srcSize;
+ ms->window.dictLimit += (U32)srcSize;
+@@ -1392,20 +1419,20 @@ size_t ZSTD_compressBlock_btultra2(
+ U32 const curr = (U32)((const BYTE*)src - ms->window.base);
+ DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize);
+
+- /* 2-pass strategy:
++ /* 2-passes strategy:
+ * this strategy makes a first pass over first block to collect statistics
+- * and seed next round's statistics with it.
+- * After 1st pass, function forgets everything, and starts a new block.
++ * in order to seed next round's statistics with it.
++ * After 1st pass, function forgets history, and starts a new block.
+ * Consequently, this can only work if no data has been previously loaded in tables,
+ * aka, no dictionary, no prefix, no ldm preprocessing.
+ * The compression ratio gain is generally small (~0.5% on first block),
+- * the cost is 2x cpu time on first block. */
++ * the cost is 2x cpu time on first block. */
+ assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+ if ( (ms->opt.litLengthSum==0) /* first block */
+ && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */
+ && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */
+- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */
+- && (srcSize > ZSTD_PREDEF_THRESHOLD)
++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */
++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */
+ ) {
+ ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
+ }
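The new ZSTD_fracWeight() comments above describe a fixed-point log2 with linear interpolation between powers of two. A minimal standalone check of that reading; highbit32 here is a naive stand-in for ZSTD_highbit32 and nothing below is kernel code:

#include <math.h>
#include <stdio.h>

#define BITCOST_ACCURACY   8
#define BITCOST_MULTIPLIER (1u << BITCOST_ACCURACY)

static unsigned highbit32(unsigned v) { unsigned r = 0; while (v >>= 1) r++; return r; }

/* Integer part of log2 from the leading bit, fractional part from a linear
 * interpolation of the bits below it; the constant +1 cancels whenever the
 * optimizer takes a difference of two weights. */
static unsigned fracWeight(unsigned rawStat)
{
    unsigned const stat = rawStat + 1;
    unsigned const hb = highbit32(stat);
    unsigned const intBits  = hb * BITCOST_MULTIPLIER;
    unsigned const fracBits = (stat << BITCOST_ACCURACY) >> hb;   /* in [256, 512) */
    return intBits + fracBits;   /* ~ BITCOST_MULTIPLIER * (log2(stat) + 1) */
}

int main(void)
{
    unsigned s;
    for (s = 1; s < 1000000; s *= 7)   /* stays well below the 1 << (31 - 8) overflow bound */
        printf("stat=%7u approx=%8.3f exact=%8.3f\n",
               s, fracWeight(s) / 256.0 - 1.0, log2((double)s + 1.0));
    return 0;
}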
+diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h
+index 22b862858..faa73ff4b 100644
+--- a/lib/zstd/compress/zstd_opt.h
++++ b/lib/zstd/compress/zstd_opt.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c
+index 60958afeb..db670d71f 100644
+--- a/lib/zstd/decompress/huf_decompress.c
++++ b/lib/zstd/decompress/huf_decompress.c
+@@ -1,7 +1,8 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /* ******************************************************************
+ * huff0 huffman decoder,
+ * part of Finite State Entropy library
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * You can contact the author at :
+ * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+@@ -19,10 +20,10 @@
+ #include "../common/compiler.h"
+ #include "../common/bitstream.h" /* BIT_* */
+ #include "../common/fse.h" /* to compress headers */
+-#define HUF_STATIC_LINKING_ONLY
+ #include "../common/huf.h"
+ #include "../common/error_private.h"
+ #include "../common/zstd_internal.h"
++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
+
+ /* **************************************************************
+ * Constants
+@@ -34,6 +35,12 @@
+ * Macros
+ ****************************************************************/
+
++#ifdef HUF_DISABLE_FAST_DECODE
++# define HUF_ENABLE_FAST_DECODE 0
++#else
++# define HUF_ENABLE_FAST_DECODE 1
++#endif
++
+ /* These two optional macros force the use one way or another of the two
+ * Huffman decompression implementations. You can't force in both directions
+ * at the same time.
+@@ -43,27 +50,25 @@
+ #error "Cannot force the use of the X1 and X2 decoders at the same time!"
+ #endif
+
+-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2
+-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
++ * supported at runtime, so we can add the BMI2 target attribute.
++ * When it is disabled, we will still get BMI2 if it is enabled statically.
++ */
++#if DYNAMIC_BMI2
++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
+ #else
+-# define HUF_ASM_X86_64_BMI2_ATTRS
++# define HUF_FAST_BMI2_ATTRS
+ #endif
+
+ #define HUF_EXTERN_C
+ #define HUF_ASM_DECL HUF_EXTERN_C
+
+-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
++#if DYNAMIC_BMI2
+ # define HUF_NEED_BMI2_FUNCTION 1
+ #else
+ # define HUF_NEED_BMI2_FUNCTION 0
+ #endif
+
+-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
+-# define HUF_NEED_DEFAULT_FUNCTION 1
+-#else
+-# define HUF_NEED_DEFAULT_FUNCTION 0
+-#endif
+-
+ /* **************************************************************
+ * Error Management
+ ****************************************************************/
+@@ -80,6 +85,11 @@
+ /* **************************************************************
+ * BMI2 Variant Wrappers
+ ****************************************************************/
++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
++ const void *cSrc,
++ size_t cSrcSize,
++ const HUF_DTable *DTable);
++
+ #if DYNAMIC_BMI2
+
+ #define HUF_DGEN(fn) \
+@@ -101,9 +111,9 @@
+ } \
+ \
+ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
+- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
+ { \
+- if (bmi2) { \
++ if (flags & HUF_flags_bmi2) { \
+ return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
+ } \
+ return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
+@@ -113,9 +123,9 @@
+
+ #define HUF_DGEN(fn) \
+ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
+- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
+ { \
+- (void)bmi2; \
++ (void)flags; \
+ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
+ }
+
+@@ -134,15 +144,28 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
+ return dtd;
+ }
+
+-#if ZSTD_ENABLE_ASM_X86_64_BMI2
+-
+-static size_t HUF_initDStream(BYTE const* ip) {
++static size_t HUF_initFastDStream(BYTE const* ip) {
+ BYTE const lastByte = ip[7];
+- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
+ size_t const value = MEM_readLEST(ip) | 1;
+ assert(bitsConsumed <= 8);
++ assert(sizeof(size_t) == 8);
+ return value << bitsConsumed;
+ }
++
++
++/*
++ * The input/output arguments to the Huffman fast decoding loop:
++ *
++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
++ * op [in/out] - The output pointers, must be updated to reflect what is written.
++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
++ * dt [in] - The decoding table.
++ * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
++ * oend [in] - The end of the output stream. op[3] must not cross oend.
++ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
++ * as long as it is above ilimit, but that indicates corruption.
++ */
+ typedef struct {
+ BYTE const* ip[4];
+ BYTE* op[4];
+@@ -151,15 +174,17 @@ typedef struct {
+ BYTE const* ilimit;
+ BYTE* oend;
+ BYTE const* iend[4];
+-} HUF_DecompressAsmArgs;
++} HUF_DecompressFastArgs;
++
++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
+
+ /*
+- * Initializes args for the asm decoding loop.
+- * @returns 0 on success
+- * 1 if the fallback implementation should be used.
++ * Initializes args for the fast decoding loop.
++ * @returns 1 on success
++ * 0 if the fallback implementation should be used.
+ * Or an error code on failure.
+ */
+-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
+ {
+ void const* dt = DTable + 1;
+ U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
+@@ -168,9 +193,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
+
+ BYTE* const oend = (BYTE*)dst + dstSize;
+
+- /* The following condition is false on x32 platform,
+- * but HUF_asm is not compatible with this ABI */
+- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1;
++ /* The fast decoding loop assumes 64-bit little-endian.
++ * This condition is false on x32.
++ */
++ if (!MEM_isLittleEndian() || MEM_32bits())
++ return 0;
+
+ /* strict minimum : jump table + 1 byte per stream */
+ if (srcSize < 10)
+@@ -181,7 +208,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
+ * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
+ */
+ if (dtLog != HUF_DECODER_FAST_TABLELOG)
+- return 1;
++ return 0;
+
+ /* Read the jump table. */
+ {
+@@ -195,13 +222,13 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
+ args->iend[2] = args->iend[1] + length2;
+ args->iend[3] = args->iend[2] + length3;
+
+- /* HUF_initDStream() requires this, and this small of an input
++ /* HUF_initFastDStream() requires this, and this small of an input
+ * won't benefit from the ASM loop anyways.
+ * length1 must be >= 16 so that ip[0] >= ilimit before the loop
+ * starts.
+ */
+ if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
+- return 1;
++ return 0;
+ if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
+ }
+ /* ip[] contains the position that is currently loaded into bits[]. */
+@@ -218,7 +245,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
+
+ /* No point to call the ASM loop for tiny outputs. */
+ if (args->op[3] >= oend)
+- return 1;
++ return 0;
+
+ /* bits[] is the bit container.
+ * It is read from the MSB down to the LSB.
+@@ -227,10 +254,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
+ * set, so that CountTrailingZeros(bits[]) can be used
+ * to count how many bits we've consumed.
+ */
+- args->bits[0] = HUF_initDStream(args->ip[0]);
+- args->bits[1] = HUF_initDStream(args->ip[1]);
+- args->bits[2] = HUF_initDStream(args->ip[2]);
+- args->bits[3] = HUF_initDStream(args->ip[3]);
++ args->bits[0] = HUF_initFastDStream(args->ip[0]);
++ args->bits[1] = HUF_initFastDStream(args->ip[1]);
++ args->bits[2] = HUF_initFastDStream(args->ip[2]);
++ args->bits[3] = HUF_initFastDStream(args->ip[3]);
+
+ /* If ip[] >= ilimit, it is guaranteed to be safe to
+ * reload bits[]. It may be beyond its section, but is
+@@ -241,10 +268,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
+ args->oend = oend;
+ args->dt = dt;
+
+- return 0;
++ return 1;
+ }
+
+-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd)
++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
+ {
+ /* Validate that we haven't overwritten. */
+ if (args->op[stream] > segmentEnd)
+@@ -258,15 +285,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs
+ return ERROR(corruption_detected);
+
+ /* Construct the BIT_DStream_t. */
+- bit->bitContainer = MEM_readLE64(args->ip[stream]);
+- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
++ assert(sizeof(size_t) == 8);
++ bit->bitContainer = MEM_readLEST(args->ip[stream]);
++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
+ bit->start = (const char*)args->iend[0];
+ bit->limitPtr = bit->start + sizeof(size_t);
+ bit->ptr = (const char*)args->ip[stream];
+
+ return 0;
+ }
+-#endif
++
++/* Calls X(N) for each stream 0, 1, 2, 3. */
++#define HUF_4X_FOR_EACH_STREAM(X) \
++ { \
++ X(0) \
++ X(1) \
++ X(2) \
++ X(3) \
++ }
++
++/* Calls X(N, var) for each stream 0, 1, 2, 3. */
++#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \
++ { \
++ X(0, (var)) \
++ X(1, (var)) \
++ X(2, (var)) \
++ X(3, (var)) \
++ }
+
+
+ #ifndef HUF_FORCE_DECOMPRESS_X2
+@@ -283,10 +328,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi
+ static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
+ U64 D4;
+ if (MEM_isLittleEndian()) {
+- D4 = (symbol << 8) + nbBits;
++ D4 = (U64)((symbol << 8) + nbBits);
+ } else {
+- D4 = symbol + (nbBits << 8);
++ D4 = (U64)(symbol + (nbBits << 8));
+ }
++ assert(D4 < (1U << 16));
+ D4 *= 0x0001000100010001ULL;
+ return D4;
+ }
+@@ -329,13 +375,7 @@ typedef struct {
+ BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
+ } HUF_ReadDTableX1_Workspace;
+
+-
+-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
+-{
+- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
+-}
+-
+-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2)
++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
+ {
+ U32 tableLog = 0;
+ U32 nbSymbols = 0;
+@@ -350,7 +390,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
+ DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
+ /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
+
+- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
+ if (HUF_isError(iSize)) return iSize;
+
+
+@@ -377,9 +417,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
+ * rankStart[0] is not filled because there are no entries in the table for
+ * weight 0.
+ */
+- {
+- int n;
+- int nextRankStart = 0;
++ { int n;
++ U32 nextRankStart = 0;
+ int const unroll = 4;
+ int const nLimit = (int)nbSymbols - unroll + 1;
+ for (n=0; n<(int)tableLog+1; n++) {
+@@ -406,10 +445,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
+ * We can switch based on the length to a different inner loop which is
+ * optimized for that particular case.
+ */
+- {
+- U32 w;
+- int symbol=wksp->rankVal[0];
+- int rankStart=0;
++ { U32 w;
++ int symbol = wksp->rankVal[0];
++ int rankStart = 0;
+ for (w=1; w<tableLog+1; ++w) {
+ int const symbolCount = wksp->rankVal[w];
+ int const length = (1 << w) >> 1;
+@@ -519,7 +557,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
+ while (p < pEnd)
+ HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+
+- return pEnd-pStart;
++ return (size_t)(pEnd-pStart);
+ }
+
+ FORCE_INLINE_TEMPLATE size_t
+@@ -545,6 +583,10 @@ HUF_decompress1X1_usingDTable_internal_body(
+ return dstSize;
+ }
+
++/* HUF_decompress4X1_usingDTable_internal_body():
++ * Conditions :
++ * @dstSize >= 6
++ */
+ FORCE_INLINE_TEMPLATE size_t
+ HUF_decompress4X1_usingDTable_internal_body(
+ void* dst, size_t dstSize,
+@@ -588,6 +630,7 @@ HUF_decompress4X1_usingDTable_internal_body(
+
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
+ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+ CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+ CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+@@ -650,38 +693,156 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
+ }
+ #endif
+
+-#if HUF_NEED_DEFAULT_FUNCTION
+ static
+ size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
+ size_t cSrcSize, HUF_DTable const* DTable) {
+ return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
+ }
+-#endif
+
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
+
+-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
++HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
++
++#endif
++
++static HUF_FAST_BMI2_ATTRS
++void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
++{
++ U64 bits[4];
++ BYTE const* ip[4];
++ BYTE* op[4];
++ U16 const* const dtable = (U16 const*)args->dt;
++ BYTE* const oend = args->oend;
++ BYTE const* const ilimit = args->ilimit;
++
++ /* Copy the arguments to local variables */
++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
++ ZSTD_memcpy(&op, &args->op, sizeof(op));
++
++ assert(MEM_isLittleEndian());
++ assert(!MEM_32bits());
++
++ for (;;) {
++ BYTE* olimit;
++ int stream;
++
++ /* Assert loop preconditions */
++#ifndef NDEBUG
++ for (stream = 0; stream < 4; ++stream) {
++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
++ assert(ip[stream] >= ilimit);
++ }
++#endif
++ /* Compute olimit */
++ {
++ /* Each iteration produces 5 output symbols per stream */
++ size_t const oiters = (size_t)(oend - op[3]) / 5;
++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
++ * per stream.
++ */
++ size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
++ /* We can safely run iters iterations before running bounds checks */
++ size_t const iters = MIN(oiters, iiters);
++ size_t const symbols = iters * 5;
++
++ /* We can simply check that op[3] < olimit, instead of checking all
++ * of our bounds, since we can't hit the other bounds until we've run
++ * iters iterations, which only happens when op[3] == olimit.
++ */
++ olimit = op[3] + symbols;
++
++ /* Exit fast decoding loop once we get close to the end. */
++ if (op[3] + 20 > olimit)
++ break;
++
++ /* Exit the decoding loop if any input pointer has crossed the
++ * previous one. This indicates corruption, and a precondition
++ * to our loop is that ip[i] >= ip[0].
++ */
++ for (stream = 1; stream < 4; ++stream) {
++ if (ip[stream] < ip[stream - 1])
++ goto _out;
++ }
++ }
++
++#ifndef NDEBUG
++ for (stream = 1; stream < 4; ++stream) {
++ assert(ip[stream] >= ip[stream - 1]);
++ }
++#endif
++
++#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \
++ { \
++ int const index = (int)(bits[(_stream)] >> 53); \
++ int const entry = (int)dtable[index]; \
++ bits[(_stream)] <<= (entry & 0x3F); \
++ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
++ }
++
++#define HUF_4X1_RELOAD_STREAM(_stream) \
++ { \
++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
++ int const nbBits = ctz & 7; \
++ int const nbBytes = ctz >> 3; \
++ op[(_stream)] += 5; \
++ ip[(_stream)] -= nbBytes; \
++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
++ bits[(_stream)] <<= nbBits; \
++ }
++
++ /* Manually unroll the loop because compilers don't consistently
++ * unroll the inner loops, which destroys performance.
++ */
++ do {
++ /* Decode 5 symbols in each of the 4 streams */
++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0)
++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1)
++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2)
++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3)
++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4)
++
++ /* Reload each of the 4 bitstreams */ \
++ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM)
++ } while (op[3] < olimit);
++
++#undef HUF_4X1_DECODE_SYMBOL
++#undef HUF_4X1_RELOAD_STREAM
++ }
++
++_out:
+
+-static HUF_ASM_X86_64_BMI2_ATTRS
++ /* Save the final values of each of the state variables back to args. */
++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
++ ZSTD_memcpy(&args->op, &op, sizeof(op));
++}
++
++/*
++ * @returns @p dstSize on success (>= 6)
++ * 0 if the fallback implementation should be used
++ * An error if an error occurred
++ */
++static HUF_FAST_BMI2_ATTRS
+ size_t
+-HUF_decompress4X1_usingDTable_internal_bmi2_asm(
++HUF_decompress4X1_usingDTable_internal_fast(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+- const HUF_DTable* DTable)
++ const HUF_DTable* DTable,
++ HUF_DecompressFastLoopFn loopFn)
+ {
+ void const* dt = DTable + 1;
+ const BYTE* const iend = (const BYTE*)cSrc + 6;
+ BYTE* const oend = (BYTE*)dst + dstSize;
+- HUF_DecompressAsmArgs args;
+- {
+- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+- FORWARD_IF_ERROR(ret, "Failed to init asm args");
+- if (ret != 0)
+- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
++ HUF_DecompressFastArgs args;
++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
++ if (ret == 0)
++ return 0;
+ }
+
+ assert(args.ip[0] >= args.ilimit);
+- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
++ loopFn(&args);
+
+ /* Our loop guarantees that ip[] >= ilimit and that we haven't
+ * overwritten any op[].
+@@ -694,8 +855,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
+ (void)iend;
+
+ /* finish bit streams one by one. */
+- {
+- size_t const segmentSize = (dstSize+3) / 4;
++ { size_t const segmentSize = (dstSize+3) / 4;
+ BYTE* segmentEnd = (BYTE*)dst;
+ int i;
+ for (i = 0; i < 4; ++i) {
+@@ -712,97 +872,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
+ }
+
+ /* decoded size */
++ assert(dstSize != 0);
+ return dstSize;
+ }
+-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
+-
+-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
+- const void *cSrc,
+- size_t cSrcSize,
+- const HUF_DTable *DTable);
+
+ HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
+
+ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
+- size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
++ size_t cSrcSize, HUF_DTable const* DTable, int flags)
+ {
++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
++
+ #if DYNAMIC_BMI2
+- if (bmi2) {
++ if (flags & HUF_flags_bmi2) {
++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
+- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
+-# else
+- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
++ if (!(flags & HUF_flags_disableAsm)) {
++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
++ }
+ # endif
++ } else {
++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
+ }
+-#else
+- (void)bmi2;
+ #endif
+
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
+- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
+-#else
+- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
++ if (!(flags & HUF_flags_disableAsm)) {
++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
++ }
+ #endif
+-}
+-
+-
+-size_t HUF_decompress1X1_usingDTable(
+- void* dst, size_t dstSize,
+- const void* cSrc, size_t cSrcSize,
+- const HUF_DTable* DTable)
+-{
+- DTableDesc dtd = HUF_getDTableDesc(DTable);
+- if (dtd.tableType != 0) return ERROR(GENERIC);
+- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+-}
+-
+-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+- const void* cSrc, size_t cSrcSize,
+- void* workSpace, size_t wkspSize)
+-{
+- const BYTE* ip = (const BYTE*) cSrc;
+
+- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
+- if (HUF_isError(hSize)) return hSize;
+- if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+- ip += hSize; cSrcSize -= hSize;
+-
+- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
+-}
+-
+-
+-size_t HUF_decompress4X1_usingDTable(
+- void* dst, size_t dstSize,
+- const void* cSrc, size_t cSrcSize,
+- const HUF_DTable* DTable)
+-{
+- DTableDesc dtd = HUF_getDTableDesc(DTable);
+- if (dtd.tableType != 0) return ERROR(GENERIC);
+- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
++ if (ret != 0)
++ return ret;
++ }
++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
+ }
+
+-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
++static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+- void* workSpace, size_t wkspSize, int bmi2)
++ void* workSpace, size_t wkspSize, int flags)
+ {
+ const BYTE* ip = (const BYTE*) cSrc;
+
+- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+-}
+-
+-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+- const void* cSrc, size_t cSrcSize,
+- void* workSpace, size_t wkspSize)
+-{
+- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
+ }
+
+-
+ #endif /* HUF_FORCE_DECOMPRESS_X2 */
+
+
+@@ -985,7 +1107,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32
+
+ static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
+ const sortedSymbol_t* sortedList,
+- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight,
++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
+ const U32 nbBitsBaseline)
+ {
+ U32* const rankVal = rankValOrigin[0];
+@@ -1040,14 +1162,7 @@ typedef struct {
+
+ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
+ const void* src, size_t srcSize,
+- void* workSpace, size_t wkspSize)
+-{
+- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
+-}
+-
+-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
+- const void* src, size_t srcSize,
+- void* workSpace, size_t wkspSize, int bmi2)
++ void* workSpace, size_t wkspSize, int flags)
+ {
+ U32 tableLog, maxW, nbSymbols;
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+@@ -1069,7 +1184,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
+ if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+ /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
+
+- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
+ if (HUF_isError(iSize)) return iSize;
+
+ /* check result */
+@@ -1240,6 +1355,11 @@ HUF_decompress1X2_usingDTable_internal_body(
+ /* decoded size */
+ return dstSize;
+ }
++
++/* HUF_decompress4X2_usingDTable_internal_body():
++ * Conditions:
++ * @dstSize >= 6
++ */
+ FORCE_INLINE_TEMPLATE size_t
+ HUF_decompress4X2_usingDTable_internal_body(
+ void* dst, size_t dstSize,
+@@ -1280,8 +1400,9 @@ HUF_decompress4X2_usingDTable_internal_body(
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ U32 const dtLog = dtd.tableLog;
+
+- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
+- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
+ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+ CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+ CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+@@ -1366,36 +1487,178 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
+ }
+ #endif
+
+-#if HUF_NEED_DEFAULT_FUNCTION
+ static
+ size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
+ size_t cSrcSize, HUF_DTable const* DTable) {
+ return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
+ }
+-#endif
+
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
+
+-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
++
++#endif
++
++static HUF_FAST_BMI2_ATTRS
++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
++{
++ U64 bits[4];
++ BYTE const* ip[4];
++ BYTE* op[4];
++ BYTE* oend[4];
++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
++ BYTE const* const ilimit = args->ilimit;
++
++ /* Copy the arguments to local registers. */
++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
++ ZSTD_memcpy(&op, &args->op, sizeof(op));
++
++ oend[0] = op[1];
++ oend[1] = op[2];
++ oend[2] = op[3];
++ oend[3] = args->oend;
++
++ assert(MEM_isLittleEndian());
++ assert(!MEM_32bits());
++
++ for (;;) {
++ BYTE* olimit;
++ int stream;
++
++ /* Assert loop preconditions */
++#ifndef NDEBUG
++ for (stream = 0; stream < 4; ++stream) {
++ assert(op[stream] <= oend[stream]);
++ assert(ip[stream] >= ilimit);
++ }
++#endif
++ /* Compute olimit */
++ {
++ /* Each loop does 5 table lookups for each of the 4 streams.
++ * Each table lookup consumes up to 11 bits of input, and produces
++ * up to 2 bytes of output.
++ */
++ /* We can consume up to 7 bytes of input per iteration per stream.
++ * We also know that each input pointer is >= ip[0]. So we can run
++ * iters loops before running out of input.
++ */
++ size_t iters = (size_t)(ip[0] - ilimit) / 7;
++ /* Each iteration can produce up to 10 bytes of output per stream.
++ * Each output stream may advance at different rates. So take the
++ * minimum number of safe iterations among all the output streams.
++ */
++ for (stream = 0; stream < 4; ++stream) {
++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
++ iters = MIN(iters, oiters);
++ }
++
++ /* Each iteration produces at least 5 output symbols. So until
++ * op[3] crosses olimit, we know we haven't executed iters
++ * iterations yet. This saves us maintaining an iters counter,
++ * at the expense of computing the remaining # of iterations
++ * more frequently.
++ */
++ olimit = op[3] + (iters * 5);
++
++ /* Exit the fast decoding loop if we are too close to the end. */
++ if (op[3] + 10 > olimit)
++ break;
++
++ /* Exit the decoding loop if any input pointer has crossed the
++ * previous one. This indicates corruption, and a precondition
++ * to our loop is that ip[i] >= ip[0].
++ */
++ for (stream = 1; stream < 4; ++stream) {
++ if (ip[stream] < ip[stream - 1])
++ goto _out;
++ }
++ }
++
++#ifndef NDEBUG
++ for (stream = 1; stream < 4; ++stream) {
++ assert(ip[stream] >= ip[stream - 1]);
++ }
++#endif
++
++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \
++ if ((_decode3) || (_stream) != 3) { \
++ int const index = (int)(bits[(_stream)] >> 53); \
++ HUF_DEltX2 const entry = dtable[index]; \
++ MEM_write16(op[(_stream)], entry.sequence); \
++ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \
++ op[(_stream)] += (entry.length); \
++ }
++
++#define HUF_4X2_RELOAD_STREAM(_stream) \
++ { \
++ HUF_4X2_DECODE_SYMBOL(3, 1) \
++ { \
++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
++ int const nbBits = ctz & 7; \
++ int const nbBytes = ctz >> 3; \
++ ip[(_stream)] -= nbBytes; \
++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
++ bits[(_stream)] <<= nbBits; \
++ } \
++ }
++
++ /* Manually unroll the loop because compilers don't consistently
++ * unroll the inner loops, which destroys performance.
++ */
++ do {
++ /* Decode 5 symbols from each of the first 3 streams.
++ * The final stream will be decoded during the reload phase
++ * to reduce register pressure.
++ */
++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0)
++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0)
++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0)
++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0)
++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0)
++
++ /* Decode one symbol from the final stream */
++ HUF_4X2_DECODE_SYMBOL(3, 1)
++
++ /* Decode 4 symbols from the final stream & reload bitstreams.
++ * The final stream is reloaded last, meaning that all 5 symbols
++ * are decoded from the final stream before it is reloaded.
++ */
++ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM)
++ } while (op[3] < olimit);
++ }
+
+-static HUF_ASM_X86_64_BMI2_ATTRS size_t
+-HUF_decompress4X2_usingDTable_internal_bmi2_asm(
++#undef HUF_4X2_DECODE_SYMBOL
++#undef HUF_4X2_RELOAD_STREAM
++
++_out:
++
++ /* Save the final values of each of the state variables back to args. */
++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
++ ZSTD_memcpy(&args->op, &op, sizeof(op));
++}
++
++
++static HUF_FAST_BMI2_ATTRS size_t
++HUF_decompress4X2_usingDTable_internal_fast(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+- const HUF_DTable* DTable) {
++ const HUF_DTable* DTable,
++ HUF_DecompressFastLoopFn loopFn) {
+ void const* dt = DTable + 1;
+ const BYTE* const iend = (const BYTE*)cSrc + 6;
+ BYTE* const oend = (BYTE*)dst + dstSize;
+- HUF_DecompressAsmArgs args;
++ HUF_DecompressFastArgs args;
+ {
+- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+ FORWARD_IF_ERROR(ret, "Failed to init asm args");
+- if (ret != 0)
+- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
++ if (ret == 0)
++ return 0;
+ }
+
+ assert(args.ip[0] >= args.ilimit);
+- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args);
++ loopFn(&args);
+
+ /* note : op4 already verified within main loop */
+ assert(args.ip[0] >= iend);
+@@ -1426,91 +1689,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm(
+ /* decoded size */
+ return dstSize;
+ }
+-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
+
+ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
+- size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
++ size_t cSrcSize, HUF_DTable const* DTable, int flags)
+ {
++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
++
+ #if DYNAMIC_BMI2
+- if (bmi2) {
++ if (flags & HUF_flags_bmi2) {
++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
+ # if ZSTD_ENABLE_ASM_X86_64_BMI2
+- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
+-# else
+- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
++ if (!(flags & HUF_flags_disableAsm)) {
++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
++ }
+ # endif
++ } else {
++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
+ }
+-#else
+- (void)bmi2;
+ #endif
+
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
+- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
+-#else
+- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
++ if (!(flags & HUF_flags_disableAsm)) {
++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
++ }
+ #endif
++
++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
++ if (ret != 0)
++ return ret;
++ }
++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
+ }
+
+ HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
+
+-size_t HUF_decompress1X2_usingDTable(
+- void* dst, size_t dstSize,
+- const void* cSrc, size_t cSrcSize,
+- const HUF_DTable* DTable)
+-{
+- DTableDesc dtd = HUF_getDTableDesc(DTable);
+- if (dtd.tableType != 1) return ERROR(GENERIC);
+- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+-}
+-
+ size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+- void* workSpace, size_t wkspSize)
++ void* workSpace, size_t wkspSize, int flags)
+ {
+ const BYTE* ip = (const BYTE*) cSrc;
+
+ size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
+- workSpace, wkspSize);
++ workSpace, wkspSize, flags);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
+ }
+
+-
+-size_t HUF_decompress4X2_usingDTable(
+- void* dst, size_t dstSize,
+- const void* cSrc, size_t cSrcSize,
+- const HUF_DTable* DTable)
+-{
+- DTableDesc dtd = HUF_getDTableDesc(DTable);
+- if (dtd.tableType != 1) return ERROR(GENERIC);
+- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+-}
+-
+-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+- void* workSpace, size_t wkspSize, int bmi2)
++ void* workSpace, size_t wkspSize, int flags)
+ {
+ const BYTE* ip = (const BYTE*) cSrc;
+
+ size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
+- workSpace, wkspSize);
++ workSpace, wkspSize, flags);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
+ }
+
+-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+- const void* cSrc, size_t cSrcSize,
+- void* workSpace, size_t wkspSize)
+-{
+- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
+-}
+-
+-
+ #endif /* HUF_FORCE_DECOMPRESS_X1 */
+
+
+@@ -1518,44 +1762,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+ /* Universal decompression selectors */
+ /* ***********************************/
+
+-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
+- const void* cSrc, size_t cSrcSize,
+- const HUF_DTable* DTable)
+-{
+- DTableDesc const dtd = HUF_getDTableDesc(DTable);
+-#if defined(HUF_FORCE_DECOMPRESS_X1)
+- (void)dtd;
+- assert(dtd.tableType == 0);
+- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+-#elif defined(HUF_FORCE_DECOMPRESS_X2)
+- (void)dtd;
+- assert(dtd.tableType == 1);
+- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+-#else
+- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+-#endif
+-}
+-
+-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
+- const void* cSrc, size_t cSrcSize,
+- const HUF_DTable* DTable)
+-{
+- DTableDesc const dtd = HUF_getDTableDesc(DTable);
+-#if defined(HUF_FORCE_DECOMPRESS_X1)
+- (void)dtd;
+- assert(dtd.tableType == 0);
+- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+-#elif defined(HUF_FORCE_DECOMPRESS_X2)
+- (void)dtd;
+- assert(dtd.tableType == 1);
+- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+-#else
+- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+-#endif
+-}
+-
+
+ #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
+ typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+@@ -1610,36 +1816,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
+ #endif
+ }
+
+-
+-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
+- size_t dstSize, const void* cSrc,
+- size_t cSrcSize, void* workSpace,
+- size_t wkspSize)
+-{
+- /* validation checks */
+- if (dstSize == 0) return ERROR(dstSize_tooSmall);
+- if (cSrcSize == 0) return ERROR(corruption_detected);
+-
+- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+-#if defined(HUF_FORCE_DECOMPRESS_X1)
+- (void)algoNb;
+- assert(algoNb == 0);
+- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+-#elif defined(HUF_FORCE_DECOMPRESS_X2)
+- (void)algoNb;
+- assert(algoNb == 1);
+- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+-#else
+- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+- cSrcSize, workSpace, wkspSize):
+- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+-#endif
+- }
+-}
+-
+ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+- void* workSpace, size_t wkspSize)
++ void* workSpace, size_t wkspSize, int flags)
+ {
+ /* validation checks */
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
+@@ -1652,71 +1831,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+ (void)algoNb;
+ assert(algoNb == 0);
+ return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
+- cSrcSize, workSpace, wkspSize);
++ cSrcSize, workSpace, wkspSize, flags);
+ #elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)algoNb;
+ assert(algoNb == 1);
+ return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+- cSrcSize, workSpace, wkspSize);
++ cSrcSize, workSpace, wkspSize, flags);
+ #else
+ return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+- cSrcSize, workSpace, wkspSize):
++ cSrcSize, workSpace, wkspSize, flags):
+ HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
+- cSrcSize, workSpace, wkspSize);
++ cSrcSize, workSpace, wkspSize, flags);
+ #endif
+ }
+ }
+
+
+-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
+ {
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ #if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)dtd;
+ assert(dtd.tableType == 0);
+- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
+ #elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)dtd;
+ assert(dtd.tableType == 1);
+- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
+ #else
+- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
+ #endif
+ }
+
+ #ifndef HUF_FORCE_DECOMPRESS_X2
+-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
+ {
+ const BYTE* ip = (const BYTE*) cSrc;
+
+- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
+ }
+ #endif
+
+-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
+ {
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ #if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)dtd;
+ assert(dtd.tableType == 0);
+- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
+ #elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)dtd;
+ assert(dtd.tableType == 1);
+- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
+ #else
+- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
+ #endif
+ }
+
+-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
+ {
+ /* validation checks */
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
+@@ -1726,15 +1905,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
+ #if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)algoNb;
+ assert(algoNb == 0);
+- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
+ #elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)algoNb;
+ assert(algoNb == 1);
+- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
+ #else
+- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
+- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
+ #endif
+ }
+ }
+-
+diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c
+index dbbc7919d..30ef65e1a 100644
+--- a/lib/zstd/decompress/zstd_ddict.c
++++ b/lib/zstd/decompress/zstd_ddict.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -14,12 +15,12 @@
+ /*-*******************************************************
+ * Dependencies
+ *********************************************************/
++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */
+ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
+ #include "../common/cpu.h" /* bmi2 */
+ #include "../common/mem.h" /* low level memory routines */
+ #define FSE_STATIC_LINKING_ONLY
+ #include "../common/fse.h"
+-#define HUF_STATIC_LINKING_ONLY
+ #include "../common/huf.h"
+ #include "zstd_decompress_internal.h"
+ #include "zstd_ddict.h"
+@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
+ ZSTD_memcpy(internalBuffer, dict, dictSize);
+ }
+ ddict->dictSize = dictSize;
+- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */
++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */
+
+ /* parse dictionary content */
+ FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , "");
+@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
+ unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
+ {
+ if (ddict==NULL) return 0;
+- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
++ return ddict->dictID;
+ }
+diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h
+index 8c1a79d66..de459a0da 100644
+--- a/lib/zstd/decompress/zstd_ddict.h
++++ b/lib/zstd/decompress/zstd_ddict.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c
+index 6b3177c94..03dbdf391 100644
+--- a/lib/zstd/decompress/zstd_decompress.c
++++ b/lib/zstd/decompress/zstd_decompress.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -52,17 +53,18 @@
+ /*-*******************************************************
+ * Dependencies
+ *********************************************************/
++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
+ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
+ #include "../common/mem.h" /* low level memory routines */
+ #define FSE_STATIC_LINKING_ONLY
+ #include "../common/fse.h"
+-#define HUF_STATIC_LINKING_ONLY
+ #include "../common/huf.h"
+ #include <linux/xxhash.h> /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */
+ #include "../common/zstd_internal.h" /* blockProperties_t */
+ #include "zstd_decompress_internal.h" /* ZSTD_DCtx */
+ #include "zstd_ddict.h" /* ZSTD_DDictDictContent */
+ #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */
++#include "../common/bits.h" /* ZSTD_highbit32 */
+
+
+
+@@ -72,11 +74,11 @@
+ *************************************/
+
+ #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4
+-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float.
+- * Currently, that means a 0.75 load factor.
+- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded
+- * the load factor of the ddict hash set.
+- */
++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float.
++ * Currently, that means a 0.75 load factor.
++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded
++ * the load factor of the ddict hash set.
++ */
+
+ #define DDICT_HASHSET_TABLE_BASE_SIZE 64
+ #define DDICT_HASHSET_RESIZE_FACTOR 2
+@@ -237,6 +239,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx)
+ dctx->outBufferMode = ZSTD_bm_buffered;
+ dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum;
+ dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict;
++ dctx->disableHufAsm = 0;
+ }
+
+ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
+@@ -421,16 +424,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
+ * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
+ * @return : 0, `zfhPtr` is correctly filled,
+ * >0, `srcSize` is too small, value is wanted `srcSize` amount,
+- * or an error code, which can be tested using ZSTD_isError() */
++** or an error code, which can be tested using ZSTD_isError() */
+ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format)
+ {
+ const BYTE* ip = (const BYTE*)src;
+ size_t const minInputSize = ZSTD_startingInputLength(format);
+
+- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */
+- if (srcSize < minInputSize) return minInputSize;
+- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter");
++ DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize);
++
++ if (srcSize > 0) {
++ /* note : technically could be considered an assert(), since it's an invalid entry */
++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0");
++ }
++ if (srcSize < minInputSize) {
++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) {
++ /* when receiving less than @minInputSize bytes,
++ * check that these bytes at least correspond to a supported magic number
++ * in order to error out early if they don't.
++ **/
++ size_t const toCopy = MIN(4, srcSize);
++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER);
++ assert(src != NULL);
++ ZSTD_memcpy(hbuf, src, toCopy);
++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) {
++ /* not a zstd frame : let's check if it's a skippable frame */
++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START);
++ ZSTD_memcpy(hbuf, src, toCopy);
++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) {
++ RETURN_ERROR(prefix_unknown,
++ "first bytes don't correspond to any supported magic number");
++ } } }
++ return minInputSize;
++ }
+
++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */
+ if ( (format != ZSTD_f_zstd1_magicless)
+ && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
+ if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+@@ -540,49 +567,52 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize)
+ sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE);
+ RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32,
+ frameParameter_unsupported, "");
+- {
+- size_t const skippableSize = skippableHeaderSize + sizeU32;
++ { size_t const skippableSize = skippableHeaderSize + sizeU32;
+ RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, "");
+ return skippableSize;
+ }
+ }
+
+ /*! ZSTD_readSkippableFrame() :
+- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer.
++ * Retrieves content of a skippable frame, and writes it to dst buffer.
+ *
+ * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written,
+ * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested
+ * in the magicVariant.
+ *
+- * Returns an error if destination buffer is not large enough, or if the frame is not skippable.
++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame.
+ *
+ * @return : number of bytes written or a ZSTD error.
+ */
+-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant,
+- const void* src, size_t srcSize)
++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity,
++ unsigned* magicVariant, /* optional, can be NULL */
++ const void* src, size_t srcSize)
+ {
+- U32 const magicNumber = MEM_readLE32(src);
+- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize);
+- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE;
+-
+- /* check input validity */
+- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, "");
+- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, "");
+- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, "");
++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, "");
+
+- /* deliver payload */
+- if (skippableContentSize > 0 && dst != NULL)
+- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize);
+- if (magicVariant != NULL)
+- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START;
+- return skippableContentSize;
++ { U32 const magicNumber = MEM_readLE32(src);
++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize);
++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE;
++
++ /* check input validity */
++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, "");
++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, "");
++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, "");
++
++ /* deliver payload */
++ if (skippableContentSize > 0 && dst != NULL)
++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize);
++ if (magicVariant != NULL)
++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START;
++ return skippableContentSize;
++ }
+ }
+
+ /* ZSTD_findDecompressedSize() :
+- * compatible with legacy mode
+ * `srcSize` must be the exact length of some number of ZSTD compressed and/or
+ * skippable frames
+- * @return : decompressed size of the frames contained */
++ * note: compatible with legacy mode
++ * @return : decompressed size of the frames contained */
+ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
+ {
+ unsigned long long totalDstSize = 0;
+@@ -592,9 +622,7 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
+
+ if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+ size_t const skippableSize = readSkippableFrameSize(src, srcSize);
+- if (ZSTD_isError(skippableSize)) {
+- return ZSTD_CONTENTSIZE_ERROR;
+- }
++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR;
+ assert(skippableSize <= srcSize);
+
+ src = (const BYTE *)src + skippableSize;
+@@ -602,17 +630,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
+ continue;
+ }
+
+- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
+- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret;
++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize);
++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs;
+
+- /* check for overflow */
+- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR;
+- totalDstSize += ret;
++ if (totalDstSize + fcs < totalDstSize)
++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */
++ totalDstSize += fcs;
+ }
++ /* skip to next frame */
+ { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize);
+- if (ZSTD_isError(frameSrcSize)) {
+- return ZSTD_CONTENTSIZE_ERROR;
+- }
++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR;
++ assert(frameSrcSize <= srcSize);
+
+ src = (const BYTE *)src + frameSrcSize;
+ srcSize -= frameSrcSize;
+@@ -730,10 +758,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize
+ ip += 4;
+ }
+
++ frameSizeInfo.nbBlocks = nbBlocks;
+ frameSizeInfo.compressedSize = (size_t)(ip - ipstart);
+ frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN)
+ ? zfh.frameContentSize
+- : nbBlocks * zfh.blockSizeMax;
++ : (unsigned long long)nbBlocks * zfh.blockSizeMax;
+ return frameSizeInfo;
+ }
+ }
+@@ -773,6 +802,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
+ return bound;
+ }
+
++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize)
++{
++ size_t margin = 0;
++ unsigned maxBlockSize = 0;
++
++ /* Iterate over each frame */
++ while (srcSize > 0) {
++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
++ size_t const compressedSize = frameSizeInfo.compressedSize;
++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
++ ZSTD_frameHeader zfh;
++
++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), "");
++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
++ return ERROR(corruption_detected);
++
++ if (zfh.frameType == ZSTD_frame) {
++ /* Add the frame header to our margin */
++ margin += zfh.headerSize;
++ /* Add the checksum to our margin */
++ margin += zfh.checksumFlag ? 4 : 0;
++ /* Add 3 bytes per block */
++ margin += 3 * frameSizeInfo.nbBlocks;
++
++ /* Compute the max block size */
++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax);
++ } else {
++ assert(zfh.frameType == ZSTD_skippableFrame);
++ /* Add the entire skippable frame size to our margin. */
++ margin += compressedSize;
++ }
++
++ assert(srcSize >= compressedSize);
++ src = (const BYTE*)src + compressedSize;
++ srcSize -= compressedSize;
++ }
++
++ /* Add the max block size back to the margin. */
++ margin += maxBlockSize;
++
++ return margin;
++}
+
+ /*-*************************************************************
+ * Frame decoding
+@@ -930,6 +1001,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
+ }
+ ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0);
+ /* Allow caller to get size read */
++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr);
+ *srcPtr = ip;
+ *srcSizePtr = remainingSrcSize;
+ return (size_t)(op-ostart);
+@@ -955,17 +1027,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
+ while (srcSize >= ZSTD_startingInputLength(dctx->format)) {
+
+
+- { U32 const magicNumber = MEM_readLE32(src);
+- DEBUGLOG(4, "reading magic number %08X (expecting %08X)",
+- (unsigned)magicNumber, ZSTD_MAGICNUMBER);
++ if (srcSize >= 4) {
++ U32 const magicNumber = MEM_readLE32(src);
++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber);
+ if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
++ /* skippable frame detected : skip it */
+ size_t const skippableSize = readSkippableFrameSize(src, srcSize);
+- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed");
++ FORWARD_IF_ERROR(skippableSize, "invalid skippable frame");
+ assert(skippableSize <= srcSize);
+
+ src = (const BYTE *)src + skippableSize;
+ srcSize -= skippableSize;
+- continue;
++ continue; /* check next frame */
+ } }
+
+ if (ddict) {
+@@ -1061,8 +1134,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr
+ size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; }
+
+ /*
+- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed,
+- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can
++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we
++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can
+ * be streamed.
+ *
+ * For blocks that can be streamed, this allows us to reduce the latency until we produce
+@@ -1262,7 +1335,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
+
+ default:
+ assert(0); /* impossible */
+- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */
++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */
+ }
+ }
+
+@@ -1303,11 +1376,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
+ /* in minimal huffman, we always use X1 variants */
+ size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable,
+ dictPtr, dictEnd - dictPtr,
+- workspace, workspaceSize);
++ workspace, workspaceSize, /* flags */ 0);
+ #else
+ size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
+ dictPtr, (size_t)(dictEnd - dictPtr),
+- workspace, workspaceSize);
++ workspace, workspaceSize, /* flags */ 0);
+ #endif
+ RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, "");
+ dictPtr += hSize;
+@@ -1403,7 +1476,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
+ dctx->prefixStart = NULL;
+ dctx->virtualStart = NULL;
+ dctx->dictEnd = NULL;
+- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */
++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */
+ dctx->litEntropy = dctx->fseEntropy = 0;
+ dctx->dictID = 0;
+ dctx->bType = bt_reserved;
+@@ -1465,7 +1538,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
+ * This could for one of the following reasons :
+ * - The frame does not require a dictionary (most common case).
+ * - The frame was built with dictID intentionally removed.
+- * Needed dictionary is a hidden information.
++ * Needed dictionary is a hidden piece of information.
+ * Note : this use case also happens when using a non-conformant dictionary.
+ * - `srcSize` is too small, and as a result, frame header could not be decoded.
+ * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
+@@ -1474,7 +1547,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
+ * ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
+ {
+- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 };
++ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 };
+ size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize);
+ if (ZSTD_isError(hError)) return 0;
+ return zfp.dictID;
+@@ -1581,7 +1654,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di
+ size_t ZSTD_initDStream(ZSTD_DStream* zds)
+ {
+ DEBUGLOG(4, "ZSTD_initDStream");
+- return ZSTD_initDStream_usingDDict(zds, NULL);
++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), "");
++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), "");
++ return ZSTD_startingInputLength(zds->format);
+ }
+
+ /* ZSTD_initDStream_usingDDict() :
+@@ -1589,6 +1664,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds)
+ * this function cannot fail */
+ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
+ {
++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict");
+ FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , "");
+ return ZSTD_startingInputLength(dctx->format);
+@@ -1599,6 +1675,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
+ * this function cannot fail */
+ size_t ZSTD_resetDStream(ZSTD_DStream* dctx)
+ {
++ DEBUGLOG(4, "ZSTD_resetDStream");
+ FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), "");
+ return ZSTD_startingInputLength(dctx->format);
+ }
+@@ -1670,6 +1747,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam)
+ bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict;
+ bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts;
+ return bounds;
++ case ZSTD_d_disableHuffmanAssembly:
++ bounds.lowerBound = 0;
++ bounds.upperBound = 1;
++ return bounds;
++
+ default:;
+ }
+ bounds.error = ERROR(parameter_unsupported);
+@@ -1710,6 +1792,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value
+ case ZSTD_d_refMultipleDDicts:
+ *value = (int)dctx->refMultipleDDicts;
+ return 0;
++ case ZSTD_d_disableHuffmanAssembly:
++ *value = (int)dctx->disableHufAsm;
++ return 0;
+ default:;
+ }
+ RETURN_ERROR(parameter_unsupported, "");
+@@ -1743,6 +1828,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value
+ }
+ dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value;
+ return 0;
++ case ZSTD_d_disableHuffmanAssembly:
++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value);
++ dctx->disableHufAsm = value != 0;
++ return 0;
+ default:;
+ }
+ RETURN_ERROR(parameter_unsupported, "");
+@@ -1918,7 +2007,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+ if (zds->refMultipleDDicts && zds->ddictSet) {
+ ZSTD_DCtx_selectFrameDDict(zds);
+ }
+- DEBUGLOG(5, "header size : %u", (U32)hSize);
+ if (ZSTD_isError(hSize)) {
+ return hSize; /* error */
+ }
+@@ -1932,6 +2020,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+ zds->lhSize += remainingInput;
+ }
+ input->pos = input->size;
++ /* check first few bytes */
++ FORWARD_IF_ERROR(
++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format),
++ "First few bytes detected incorrect" );
++ /* return hint input size */
+ return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */
+ }
+ assert(ip != NULL);
+@@ -1949,8 +2042,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+ size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds));
+ if (ZSTD_isError(decompressedSize)) return decompressedSize;
+ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()")
++ assert(istart != NULL);
+ ip = istart + cSize;
+- op += decompressedSize;
++ op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */
+ zds->expected = 0;
+ zds->streamStage = zdss_init;
+ someMoreWork = 0;
+@@ -2034,6 +2128,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+ }
+ if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */
+ FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), "");
++ assert(ip != NULL);
+ ip += neededInSize;
+ /* Function modifies the stage so we must break */
+ break;
+@@ -2048,7 +2143,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+ int const isSkipFrame = ZSTD_isSkipFrame(zds);
+ size_t loadedSize;
+ /* At this point we shouldn't be decompressing a block that we can stream. */
+- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip));
++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip)));
+ if (isSkipFrame) {
+ loadedSize = MIN(toLoad, (size_t)(iend-ip));
+ } else {
+@@ -2057,8 +2152,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+ "should never happen");
+ loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip));
+ }
+- ip += loadedSize;
+- zds->inPos += loadedSize;
++ if (loadedSize != 0) {
++ /* ip may be NULL */
++ ip += loadedSize;
++ zds->inPos += loadedSize;
++ }
+ if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */
+
+ /* decode loaded input */
+@@ -2068,14 +2166,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+ break;
+ }
+ case zdss_flush:
+- { size_t const toFlushSize = zds->outEnd - zds->outStart;
++ {
++ size_t const toFlushSize = zds->outEnd - zds->outStart;
+ size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize);
+- op += flushedSize;
++
++ op = op ? op + flushedSize : op;
++
+ zds->outStart += flushedSize;
+ if (flushedSize == toFlushSize) { /* flush completed */
+ zds->streamStage = zdss_read;
+ if ( (zds->outBuffSize < zds->fParams.frameContentSize)
+- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
+ DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)",
+ (int)(zds->outBuffSize - zds->outStart),
+ (U32)zds->fParams.blockSizeMax);
+@@ -2089,7 +2190,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+
+ default:
+ assert(0); /* impossible */
+- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */
++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */
+ } }
+
+ /* result */
+@@ -2102,8 +2203,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+ if ((ip==istart) && (op==ostart)) { /* no forward progress */
+ zds->noForwardProgress ++;
+ if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {
+- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, "");
+- RETURN_ERROR_IF(ip==iend, srcSize_wrong, "");
++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, "");
++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, "");
+ assert(0);
+ }
+ } else {
+@@ -2140,11 +2241,17 @@ size_t ZSTD_decompressStream_simpleArgs (
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos)
+ {
+- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
+- ZSTD_inBuffer input = { src, srcSize, *srcPos };
+- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */
+- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
+- *dstPos = output.pos;
+- *srcPos = input.pos;
+- return cErr;
++ ZSTD_outBuffer output;
++ ZSTD_inBuffer input;
++ output.dst = dst;
++ output.size = dstCapacity;
++ output.pos = *dstPos;
++ input.src = src;
++ input.size = srcSize;
++ input.pos = *srcPos;
++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
++ *dstPos = output.pos;
++ *srcPos = input.pos;
++ return cErr;
++ }
+ }
+diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c
+index c1913b8e7..9f5577e5b 100644
+--- a/lib/zstd/decompress/zstd_decompress_block.c
++++ b/lib/zstd/decompress/zstd_decompress_block.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -20,12 +21,12 @@
+ #include "../common/mem.h" /* low level memory routines */
+ #define FSE_STATIC_LINKING_ONLY
+ #include "../common/fse.h"
+-#define HUF_STATIC_LINKING_ONLY
+ #include "../common/huf.h"
+ #include "../common/zstd_internal.h"
+ #include "zstd_decompress_internal.h" /* ZSTD_DCtx */
+ #include "zstd_ddict.h" /* ZSTD_DDictDictContent */
+ #include "zstd_decompress_block.h"
++#include "../common/bits.h" /* ZSTD_highbit32 */
+
+ /*_*******************************************************
+ * Macros
+@@ -89,7 +90,7 @@ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const
+ dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
+ }
+ else {
+- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */
++ /* initially this will be stored entirely in dst during huffman decoding; it will be partially shifted to litExtraBuffer afterwards */
+ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
+ dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
+ }
+@@ -134,13 +135,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+ ZSTD_FALLTHROUGH;
+
+ case set_compressed:
+- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
+ { size_t lhSize, litSize, litCSize;
+ U32 singleStream=0;
+ U32 const lhlCode = (istart[0] >> 2) & 3;
+ U32 const lhc = MEM_readLE32(istart);
+ size_t hufSuccess;
+ size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
++ int const flags = 0
++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
++ | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
+ switch(lhlCode)
+ {
+ case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */
+@@ -165,6 +169,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+ }
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+ RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
++ if (!singleStream)
++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
++ "Not enough literals (%zu) for the 4-streams mode (min %u)",
++ litSize, MIN_LITERALS_FOR_4_STREAMS);
+ RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
+ RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
+ ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);
+@@ -176,13 +184,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+
+ if (litEncType==set_repeat) {
+ if (singleStream) {
+- hufSuccess = HUF_decompress1X_usingDTable_bmi2(
++ hufSuccess = HUF_decompress1X_usingDTable(
+ dctx->litBuffer, litSize, istart+lhSize, litCSize,
+- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
++ dctx->HUFptr, flags);
+ } else {
+- hufSuccess = HUF_decompress4X_usingDTable_bmi2(
++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
++ hufSuccess = HUF_decompress4X_usingDTable(
+ dctx->litBuffer, litSize, istart+lhSize, litCSize,
+- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
++ dctx->HUFptr, flags);
+ }
+ } else {
+ if (singleStream) {
+@@ -190,18 +199,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+ hufSuccess = HUF_decompress1X_DCtx_wksp(
+ dctx->entropy.hufTable, dctx->litBuffer, litSize,
+ istart+lhSize, litCSize, dctx->workspace,
+- sizeof(dctx->workspace));
++ sizeof(dctx->workspace), flags);
+ #else
+- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
++ hufSuccess = HUF_decompress1X1_DCtx_wksp(
+ dctx->entropy.hufTable, dctx->litBuffer, litSize,
+ istart+lhSize, litCSize, dctx->workspace,
+- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
++ sizeof(dctx->workspace), flags);
+ #endif
+ } else {
+- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
++ hufSuccess = HUF_decompress4X_hufOnly_wksp(
+ dctx->entropy.hufTable, dctx->litBuffer, litSize,
+ istart+lhSize, litCSize, dctx->workspace,
+- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
++ sizeof(dctx->workspace), flags);
+ }
+ }
+ if (dctx->litBufferLocation == ZSTD_split)
+@@ -237,6 +246,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+ break;
+ case 3:
+ lhSize = 3;
++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
+ litSize = MEM_readLE24(istart) >> 4;
+ break;
+ }
+@@ -279,12 +289,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+ break;
+ case 1:
+ lhSize = 2;
++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
+ litSize = MEM_readLE16(istart) >> 4;
+ break;
+ case 3:
+ lhSize = 3;
++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
+ litSize = MEM_readLE24(istart) >> 4;
+- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
+ break;
+ }
+ RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+@@ -506,14 +517,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
+ for (i = 8; i < n; i += 8) {
+ MEM_write64(spread + pos + i, sv);
+ }
+- pos += n;
++ assert(n>=0);
++ pos += (size_t)n;
+ }
+ }
+ /* Now we spread those positions across the table.
+- * The benefit of doing it in two stages is that we avoid the the
++ * The benefit of doing it in two stages is that we avoid the
+ * variable size inner loop, which caused lots of branch misses.
+ * Now we can run through all the positions without any branch misses.
+- * We unroll the loop twice, since that is what emperically worked best.
++ * We unroll the loop twice, since that is what empirically worked best.
+ */
+ {
+ size_t position = 0;
+@@ -540,7 +552,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
+ for (i=0; i<n; i++) {
+ tableDecode[position].baseValue = s;
+ position = (position + step) & tableMask;
+- while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */
+ } }
+ assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+ }
+@@ -551,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
+ for (u=0; u<tableSize; u++) {
+ U32 const symbol = tableDecode[u].baseValue;
+ U32 const nextState = symbolNext[symbol]++;
+- tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
++ tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
+ tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+ assert(nbAdditionalBits[symbol] < 255);
+ tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
+@@ -964,6 +976,11 @@ size_t ZSTD_execSequence(BYTE* op,
+
+ assert(op != NULL /* Precondition */);
+ assert(oend_w < oend /* No underflow */);
++
++#if defined(__aarch64__)
++ /* prefetch sequence starting from match that will be used for copy later */
++ PREFETCH_L1(match);
++#endif
+ /* Handle edge cases in a slow path:
+ * - Read beyond end of literals
+ * - Match end is within WILDCOPY_OVERLIMIT of oend
+@@ -1154,7 +1171,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16
+ }
+
+ /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
+- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
++ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
+ * bits before reloading. This value is the maximum number of bytes we read
+ * after reloading when we are decoding long offsets.
+ */
+@@ -1169,9 +1186,27 @@ FORCE_INLINE_TEMPLATE seq_t
+ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+ {
+ seq_t seq;
++ /*
++ * ZSTD_seqSymbol is a structure that is 64 bits wide in total, so it can be
++ * loaded in one operation and its fields extracted by simply shifting or
++ * bit-extracting on aarch64.
++ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
++ * operations that cause performance drop. This can be avoided by using this
++ * ZSTD_memcpy hack.
++ */
++#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
++ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
++ ZSTD_seqSymbol* const llDInfo = &llDInfoS;
++ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
++ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
++ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
++#else
+ const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
+ const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
+ const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
++#endif
+ seq.matchLength = mlDInfo->baseValue;
+ seq.litLength = llDInfo->baseValue;
+ { U32 const ofBase = ofDInfo->baseValue;
+@@ -1186,28 +1221,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+ U32 const llnbBits = llDInfo->nbBits;
+ U32 const mlnbBits = mlDInfo->nbBits;
+ U32 const ofnbBits = ofDInfo->nbBits;
++
++ assert(llBits <= MaxLLBits);
++ assert(mlBits <= MaxMLBits);
++ assert(ofBits <= MaxOff);
+ /*
+ * As gcc has better branch and block analyzers, sometimes it is only
+- * valuable to mark likelyness for clang, it gives around 3-4% of
++ * valuable to mark likeliness for clang, it gives around 3-4% of
+ * performance.
+ */
+
+ /* sequence */
+ { size_t offset;
+- #if defined(__clang__)
+- if (LIKELY(ofBits > 1)) {
+- #else
+ if (ofBits > 1) {
+- #endif
+ ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
+ ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
+- assert(ofBits <= MaxOff);
++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
+ if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
+- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
++ /* Always read extra bits, this keeps the logic simple,
++ * avoids branches, and avoids accidentally reading 0 bits.
++ */
++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
+ offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+ BIT_reloadDStream(&seqState->DStream);
+- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
++ offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+ } else {
+ offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+@@ -1232,11 +1270,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+ seq.offset = offset;
+ }
+
+- #if defined(__clang__)
+- if (UNLIKELY(mlBits > 0))
+- #else
+ if (mlBits > 0)
+- #endif
+ seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
+
+ if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+@@ -1246,11 +1280,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+ /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
+ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+- #if defined(__clang__)
+- if (UNLIKELY(llBits > 0))
+- #else
+ if (llBits > 0)
+- #endif
+ seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
+
+ if (MEM_32bits())
+@@ -1552,7 +1582,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
+ const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
+ const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
+ const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
+- DEBUGLOG(5, "ZSTD_decompressSequences_body");
++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
+ (void)frame;
+
+ /* Regen sequences */
+@@ -1945,34 +1975,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
+ #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
+
+
++/*
++ * @returns The total size of the history referenceable by zstd, including
++ * both the prefix and the extDict. At @p op any offset larger than this
++ * is invalid.
++ */
++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
++{
++ return (size_t)(op - virtualStart);
++}
++
++typedef struct {
++ unsigned longOffsetShare;
++ unsigned maxNbAdditionalBits;
++} ZSTD_OffsetInfo;
+
+-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+-/* ZSTD_getLongOffsetsShare() :
++/* ZSTD_getOffsetInfo() :
+ * condition : offTable must be valid
+ * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
+- * compared to maximum possible of (1<<OffFSELog) */
+-static unsigned
+-ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
++ * compared to maximum possible of (1<<OffFSELog),
++ * as well as the maximum number of additional bits required.
++ */
++static ZSTD_OffsetInfo
++ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
+ {
+- const void* ptr = offTable;
+- U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
+- const ZSTD_seqSymbol* table = offTable + 1;
+- U32 const max = 1 << tableLog;
+- U32 u, total = 0;
+- DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
+-
+- assert(max <= (1 << OffFSELog)); /* max not too large */
+- for (u=0; u<max; u++) {
+- if (table[u].nbAdditionalBits > 22) total += 1;
++ ZSTD_OffsetInfo info = {0, 0};
++ /* If nbSeq == 0, then the offTable is uninitialized, but we have
++ * no sequences, so both values should be 0.
++ */
++ if (nbSeq != 0) {
++ const void* ptr = offTable;
++ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
++ const ZSTD_seqSymbol* table = offTable + 1;
++ U32 const max = 1 << tableLog;
++ U32 u;
++ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
++
++ assert(max <= (1 << OffFSELog)); /* max not too large */
++ for (u=0; u<max; u++) {
++ info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
++ if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
++ }
++
++ assert(tableLog <= OffFSELog);
++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */
+ }
+
+- assert(tableLog <= OffFSELog);
+- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */
++ return info;
++}
+
+- return total;
++/*
++ * @returns The maximum offset we can decode in one read of our bitstream, without
++ * reloading more bits in the middle of the offset bits read. Any offsets larger
++ * than this must use the long offset decoder.
++ */
++static size_t ZSTD_maxShortOffset(void)
++{
++ if (MEM_64bits()) {
++ /* We can decode any offset without reloading bits.
++ * This might change if the max window size grows.
++ */
++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
++ return (size_t)-1;
++ } else {
++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
++ */
++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
++ return maxOffset;
++ }
+ }
+-#endif
+
+ size_t
+ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+@@ -1980,20 +2055,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+ const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
+ { /* blockType == blockCompressed */
+ const BYTE* ip = (const BYTE*)src;
+- /* isLongOffset must be true if there are long offsets.
+- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
+- * We don't expect that to be the case in 64-bit mode.
+- * In block mode, window size is not known, so we have to be conservative.
+- * (note: but it could be evaluated from current-lowLimit)
+- */
+- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
+ DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
+
+- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
++ /* Note : the wording of the specification
++ * allows a compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX.
++ * This generally does not happen, as it makes little sense,
++ * since an uncompressed block would feature the same size and have no decompression cost.
++ * Also, note that decoders from reference libzstd prior to v1.5.4
++ * would consider this edge case as an error.
++ * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX
++ * for broader compatibility with the deployed ecosystem of zstd decoders */
++ RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
+
+ /* Decode literals section */
+ { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
+- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
+ if (ZSTD_isError(litCSize)) return litCSize;
+ ip += litCSize;
+ srcSize -= litCSize;
+@@ -2001,6 +2077,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+
+ /* Build Decoding Tables */
+ {
++ /* Compute the maximum block size, which must also work when !frame and fParams are unset.
++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
++ */
++ size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX));
++ size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart);
++ /* isLongOffset must be true if there are long offsets.
++ * Offsets are long if they are larger than ZSTD_maxShortOffset().
++ * We don't expect that to be the case in 64-bit mode.
++ *
++ * We check here to see if our history is large enough to allow long offsets.
++ * If it isn't, then we can't possibly have (valid) long offsets. If the offset
++ * is invalid, then it is okay to read it incorrectly.
++ *
++ * If isLongOffset is true, then we will later check our decoding table to see
++ * if it is even possible to generate long offsets.
++ */
++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
+ /* These macros control at build-time which decompressor implementation
+ * we use. If neither is defined, we do some inspection and dispatch at
+ * runtime.
+@@ -2008,6 +2101,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+ #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+ int usePrefetchDecoder = dctx->ddictIsCold;
++#else
++ /* Set to 1 to avoid computing offset info if we don't need to.
++ * Otherwise this value is ignored.
++ */
++ int usePrefetchDecoder = 1;
+ #endif
+ int nbSeq;
+ size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
+@@ -2015,28 +2113,42 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+ ip += seqHSize;
+ srcSize -= seqHSize;
+
+- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall,
++ "invalid dst");
+
+-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+- if ( !usePrefetchDecoder
+- && (!frame || (dctx->fParams.windowSize > (1<<24)))
+- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */
+- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
+- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+- usePrefetchDecoder = (shareLongOffsets >= minShare);
++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
++ * compute information about the share of long offsets, and the maximum nbAdditionalBits.
++ * NOTE: could probably use a larger nbSeq limit
++ */
++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
++ * enough, then we know it is impossible to have too long an offset in this block, so we can
++ * use the regular offset decoder.
++ */
++ isLongOffset = ZSTD_lo_isRegularOffset;
++ }
++ if (!usePrefetchDecoder) {
++ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
++ usePrefetchDecoder = (info.longOffsetShare >= minShare);
++ }
+ }
+-#endif
+
+ dctx->ddictIsCold = 0;
+
+ #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+- if (usePrefetchDecoder)
++ if (usePrefetchDecoder) {
++#else
++ (void)usePrefetchDecoder;
++ {
+ #endif
+ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+ #endif
++ }
+
+ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+ /* else */
+@@ -2060,9 +2172,9 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
+ }
+
+
+-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+- void* dst, size_t dstCapacity,
+- const void* src, size_t srcSize)
++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
++ void* dst, size_t dstCapacity,
++ const void* src, size_t srcSize)
+ {
+ size_t dSize;
+ ZSTD_checkContinuity(dctx, dst, dstCapacity);
+@@ -2070,3 +2182,12 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+ dctx->previousDstEnd = (char*)dst + dSize;
+ return dSize;
+ }
++
++
++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */
++size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
++ void* dst, size_t dstCapacity,
++ const void* src, size_t srcSize)
++{
++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize);
++}
+diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h
+index 3d2d57a5d..5888e6cc7 100644
+--- a/lib/zstd/decompress/zstd_decompress_block.h
++++ b/lib/zstd/decompress/zstd_decompress_block.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+ unsigned tableLog, void* wksp, size_t wkspSize,
+ int bmi2);
+
++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */
++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
++ void* dst, size_t dstCapacity,
++ const void* src, size_t srcSize);
++
+
+ #endif /* ZSTD_DEC_BLOCK_H */
+diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h
+index 98102edb6..32f79fb28 100644
+--- a/lib/zstd/decompress/zstd_decompress_internal.h
++++ b/lib/zstd/decompress/zstd_decompress_internal.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = {
+
+ #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
+ #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32))
++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12
+
+ typedef struct {
+ ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */
+ ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */
+ ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
+- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */
+ U32 rep[ZSTD_REP_NUM];
+ U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32];
+ } ZSTD_entropyDTables_t;
+@@ -164,6 +166,7 @@ struct ZSTD_DCtx_s
+ ZSTD_dictUses_e dictUses;
+ ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */
+ ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
++ int disableHufAsm;
+
+ /* streaming */
+ ZSTD_dStreamStage streamStage;
+diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h
+index a06ca187a..8a47eb2a4 100644
+--- a/lib/zstd/decompress_sources.h
++++ b/lib/zstd/decompress_sources.h
+@@ -1,6 +1,6 @@
+ /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c
+index 22686e367..466828e35 100644
+--- a/lib/zstd/zstd_common_module.c
++++ b/lib/zstd/zstd_common_module.c
+@@ -1,6 +1,6 @@
+ // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp);
+ EXPORT_SYMBOL_GPL(ZSTD_isError);
+ EXPORT_SYMBOL_GPL(ZSTD_getErrorName);
+ EXPORT_SYMBOL_GPL(ZSTD_getErrorCode);
+-EXPORT_SYMBOL_GPL(ZSTD_customMalloc);
+-EXPORT_SYMBOL_GPL(ZSTD_customCalloc);
+-EXPORT_SYMBOL_GPL(ZSTD_customFree);
+
+ MODULE_LICENSE("Dual BSD/GPL");
+ MODULE_DESCRIPTION("Zstd Common");
+diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c
+index 04e1b5c01..8ecf43226 100644
+--- a/lib/zstd/zstd_compress_module.c
++++ b/lib/zstd/zstd_compress_module.c
+@@ -1,6 +1,6 @@
+ // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c
+index f4ed952ed..7d31518e9 100644
+--- a/lib/zstd/zstd_decompress_module.c
++++ b/lib/zstd/zstd_decompress_module.c
+@@ -1,6 +1,6 @@
+ // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream);
+
+ size_t zstd_reset_dstream(zstd_dstream *dstream)
+ {
+- return ZSTD_resetDStream(dstream);
++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only);
+ }
+ EXPORT_SYMBOL(zstd_reset_dstream);
+