summaryrefslogtreecommitdiff
path: root/lib/zstd/compress/zstd_opt.c
diff options
context:
space:
mode:
authorNick Terrell <terrelln@meta.com>2025-03-08 12:09:33 -0800
committerNick Terrell <terrelln@meta.com>2025-03-13 13:25:58 -0700
commit65d1f5507ed2c78c64fce40e44e5574a9419eb09 (patch)
tree4a1b819db2ea7f2a322e9bcc7582d946d0a4ea29 /lib/zstd/compress/zstd_opt.c
parent7eb172143d5508b4da468ed59ee857c6e5e01da6 (diff)
zstd: Import upstream v1.5.7
In addition to keeping the kernel's copy of zstd up to date, this update was requested by Intel to expose upstream's APIs that allow QAT to accelerate the LZ match finding stage of Zstd. This patch is imported from the upstream tag v1.5.7-kernel [0], which is signed with upstream's signing key EF8FE99528B52FFD [1]. It was imported from upstream using this command: export ZSTD=/path/to/repo/zstd/ export LINUX=/path/to/repo/linux/ cd "$ZSTD/contrib/linux-kernel" git checkout v1.5.7-kernel make import LINUX="$LINUX" This patch has been tested on x86-64, and has been boot tested with a zstd compressed kernel & initramfs on i386 and aarch64. I benchmarked the patch on x86-64 with gcc-14.2.1 on an Intel i9-9900K by measruing the performance of compressed filesystem reads and writes. Component, Level, Size delta, C. time delta, D. time delta Btrfs , 1, +0.00%, -6.1%, +1.4% Btrfs , 3, +0.00%, -9.8%, +3.0% Btrfs , 5, +0.00%, +1.7%, +1.4% Btrfs , 7, +0.00%, -1.9%, +2.7% Btrfs , 9, +0.00%, -3.4%, +3.7% Btrfs , 15, +0.00%, -0.3%, +3.6% SquashFS , 1, +0.00%, N/A, +1.9% The major changes that impact the kernel use cases for each version are: v1.5.7: https://github.com/facebook/zstd/releases/tag/v1.5.7 * Add zstd_compress_sequences_and_literals() for use by Intel's QAT driver to implement Zstd compression acceleration in the kernel. * Fix an underflow bug in 32-bit builds that can cause data corruption when processing more than 4GB of data with a single `ZSTD_CCtx` object, when an input crosses the 4GB boundry. I don't believe this impacts any current kernel use cases, because the `ZSTD_CCtx` is typically reconstructed between compressions. * Levels 1-4 see 5-10% compression speed improvements for inputs smaller than 128KB. v1.5.6: https://github.com/facebook/zstd/releases/tag/v1.5.6 * Improved compression ratio for the highest compression levels. I don't expect these see much use however, due to their slow speeds. v1.5.5: https://github.com/facebook/zstd/releases/tag/v1.5.5 * Fix a rare corruption bug that can trigger on levels 13 and above. * Improve compression speed of levels 5-11 on incompressible data. v1.5.4: https://github.com/facebook/zstd/releases/tag/v1.5.4 * Improve copmression speed of levels 5-11 on ARM. * Improve dictionary compression speed. Signed-off-by: Nick Terrell <terrelln@fb.com>
Diffstat (limited to 'lib/zstd/compress/zstd_opt.c')
-rw-r--r--lib/zstd/compress/zstd_opt.c571
1 files changed, 353 insertions, 218 deletions
diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c
index fd82acfda62f..b62fd1b0d83e 100644
--- a/lib/zstd/compress/zstd_opt.c
+++ b/lib/zstd/compress/zstd_opt.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/*
- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -12,11 +13,14 @@
#include "hist.h"
#include "zstd_opt.h"
+#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
#define ZSTD_MAX_PRICE (1<<30)
-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
+#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
/*-*************************************
@@ -26,27 +30,35 @@
#if 0 /* approximation at bit level (for tests) */
# define BITCOST_ACCURACY 0
# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat))
+# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat))
#elif 0 /* fractional bit accuracy (for tests) */
# define BITCOST_ACCURACY 8
# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
+# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat))
#else /* opt==approx, ultra==accurate */
# define BITCOST_ACCURACY 8
# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
+# define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
#endif
+/* ZSTD_bitWeight() :
+ * provide estimated "cost" of a stat in full bits only */
MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
{
return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER);
}
+/* ZSTD_fracWeight() :
+ * provide fractional-bit "cost" of a stat,
+ * using linear interpolation approximation */
MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
{
U32 const stat = rawStat + 1;
U32 const hb = ZSTD_highbit32(stat);
U32 const BWeight = hb * BITCOST_MULTIPLIER;
+ /* Fweight was meant for "Fractional weight"
+ * but it's effectively a value between 1 and 2
+ * using fixed point arithmetic */
U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
U32 const weight = BWeight + FWeight;
assert(hb + BITCOST_ACCURACY < 31);
@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
/* debugging function,
* @return price in bytes as fractional value
* for debug messages only */
-MEM_STATIC double ZSTD_fCost(U32 price)
+MEM_STATIC double ZSTD_fCost(int price)
{
return (double)price / (BITCOST_MULTIPLIER*8);
}
@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts)
return total;
}
-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift)
+typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e;
+
+static U32
+ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1)
{
U32 s, sum=0;
- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift);
+ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)",
+ (unsigned)lastEltIndex+1, (unsigned)shift );
assert(shift < 30);
for (s=0; s<lastEltIndex+1; s++) {
- table[s] = 1 + (table[s] >> shift);
- sum += table[s];
+ unsigned const base = base1 ? 1 : (table[s]>0);
+ unsigned const newStat = base + (table[s] >> shift);
+ sum += newStat;
+ table[s] = newStat;
}
return sum;
}
/* ZSTD_scaleStats() :
- * reduce all elements in table is sum too large
+ * reduce all elt frequencies in table if sum too large
* return the resulting sum of elements */
static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
{
@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget);
assert(logTarget < 30);
if (factor <= 1) return prevsum;
- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor));
+ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed);
}
/* ZSTD_rescaleFreqs() :
@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize);
optPtr->priceType = zop_dynamic;
- if (optPtr->litLengthSum == 0) { /* first block : init */
- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */
- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef");
+ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */
+
+ /* heuristic: use pre-defined stats for too small inputs */
+ if (srcSize <= ZSTD_PREDEF_THRESHOLD) {
+ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD);
optPtr->priceType = zop_predef;
}
assert(optPtr->symbolCosts != NULL);
if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) {
- /* huffman table presumed generated by dictionary */
+
+ /* huffman stats covering the full value set : table presumed generated by dictionary */
optPtr->priceType = zop_dynamic;
if (compressedLiterals) {
+ /* generate literals statistics from huffman table */
unsigned lit;
assert(optPtr->litFreq != NULL);
optPtr->litSum = 0;
@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
optPtr->offCodeSum += optPtr->offCodeFreq[of];
} }
- } else { /* not a dictionary */
+ } else { /* first block, no dictionary */
assert(optPtr->litFreq != NULL);
if (compressedLiterals) {
+ /* base initial cost of literals on direct frequency within src */
unsigned lit = MaxLit;
HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */
- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8);
+ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible);
}
{ unsigned const baseLLfreqs[MaxLL+1] = {
@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1);
}
-
}
- } else { /* new block : re-use previous statistics, scaled down */
+ } else { /* new block : scale down accumulated statistics */
if (compressedLiterals)
optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12);
@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
const optState_t* const optPtr,
int optLevel)
{
+ DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength);
if (litLength == 0) return 0;
if (!ZSTD_compressedLiterals(optPtr))
@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */
/* dynamic statistics */
- { U32 price = litLength * optPtr->litSumBasePrice;
+ { U32 price = optPtr->litSumBasePrice * litLength;
+ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER;
U32 u;
+ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER);
for (u=0; u < litLength; u++) {
- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */
- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel);
+ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel);
+ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax;
+ price -= litPrice;
}
return price;
}
@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
assert(litLength <= ZSTD_BLOCKSIZE_MAX);
if (optPtr->priceType == zop_predef)
return WEIGHT(litLength, optLevel);
- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
- * because it isn't representable in the zstd format. So instead just
- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block
- * would be all literals.
+
+ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
+ * because it isn't representable in the zstd format.
+ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1.
+ * In such a case, the block would be all literals.
*/
if (litLength == ZSTD_BLOCKSIZE_MAX)
return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel);
@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
}
/* ZSTD_getMatchPrice() :
- * Provides the cost of the match part (offset + matchLength) of a sequence
+ * Provides the cost of the match part (offset + matchLength) of a sequence.
* Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence.
- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2
+ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq()
* @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency)
*/
FORCE_INLINE_TEMPLATE U32
-ZSTD_getMatchPrice(U32 const offcode,
+ZSTD_getMatchPrice(U32 const offBase,
U32 const matchLength,
const optState_t* const optPtr,
int const optLevel)
{
U32 price;
- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode));
+ U32 const offCode = ZSTD_highbit32(offBase);
U32 const mlBase = matchLength - MINMATCH;
assert(matchLength >= MINMATCH);
- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */
- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER);
+ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */
+ return WEIGHT(mlBase, optLevel)
+ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */
/* dynamic statistics */
price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel));
@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode,
}
/* ZSTD_updateStats() :
- * assumption : literals + litLengtn <= iend */
+ * assumption : literals + litLength <= iend */
static void ZSTD_updateStats(optState_t* const optPtr,
U32 litLength, const BYTE* literals,
- U32 offsetCode, U32 matchLength)
+ U32 offBase, U32 matchLength)
{
/* literals */
if (ZSTD_compressedLiterals(optPtr)) {
@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* const optPtr,
optPtr->litLengthSum++;
}
- /* offset code : expected to follow storeSeq() numeric representation */
- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode));
+ /* offset code : follows storeSeq() numeric representation */
+ { U32 const offCode = ZSTD_highbit32(offBase);
assert(offCode <= MaxOff);
optPtr->offCodeFreq[offCode]++;
optPtr->offCodeSum++;
@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length)
/* Update hashTable3 up to ip (excluded)
Assumption : always within prefix (i.e. not within extDict) */
-static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,
- U32* nextToUpdate3,
- const BYTE* const ip)
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_MatchState_t* ms,
+ U32* nextToUpdate3,
+ const BYTE* const ip)
{
U32* const hashTable3 = ms->hashTable3;
U32 const hashLog3 = ms->hashLog3;
@@ -408,8 +438,10 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,
* @param ip assumed <= iend-8 .
* @param target The target of ZSTD_updateTree_internal() - we are filling to this position
* @return : nb of positions added */
-static U32 ZSTD_insertBt1(
- const ZSTD_matchState_t* ms,
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_insertBt1(
+ const ZSTD_MatchState_t* ms,
const BYTE* const ip, const BYTE* const iend,
U32 const target,
U32 const mls, const int extDict)
@@ -527,15 +559,16 @@ static U32 ZSTD_insertBt1(
}
FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_updateTree_internal(
- ZSTD_matchState_t* ms,
+ ZSTD_MatchState_t* ms,
const BYTE* const ip, const BYTE* const iend,
const U32 mls, const ZSTD_dictMode_e dictMode)
{
const BYTE* const base = ms->window.base;
U32 const target = (U32)(ip - base);
U32 idx = ms->nextToUpdate;
- DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)",
+ DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)",
idx, target, dictMode);
while(idx < target) {
@@ -548,20 +581,23 @@ void ZSTD_updateTree_internal(
ms->nextToUpdate = target;
}
-void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
+void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend) {
ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict);
}
FORCE_INLINE_TEMPLATE
-U32 ZSTD_insertBtAndGetAllMatches (
- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */
- ZSTD_matchState_t* ms,
- U32* nextToUpdate3,
- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode,
- const U32 rep[ZSTD_REP_NUM],
- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
- const U32 lengthToBeat,
- U32 const mls /* template */)
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32
+ZSTD_insertBtAndGetAllMatches (
+ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */
+ ZSTD_MatchState_t* ms,
+ U32* nextToUpdate3,
+ const BYTE* const ip, const BYTE* const iLimit,
+ const ZSTD_dictMode_e dictMode,
+ const U32 rep[ZSTD_REP_NUM],
+ const U32 ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
+ const U32 lengthToBeat,
+ const U32 mls /* template */)
{
const ZSTD_compressionParameters* const cParams = &ms->cParams;
U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
@@ -590,7 +626,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
U32 mnum = 0;
U32 nbCompares = 1U << cParams->searchLog;
- const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL;
+ const ZSTD_MatchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL;
const ZSTD_compressionParameters* const dmsCParams =
dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL;
const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? dms->window.base : NULL;
@@ -629,13 +665,13 @@ U32 ZSTD_insertBtAndGetAllMatches (
assert(curr >= windowLow);
if ( dictMode == ZSTD_extDict
&& ( ((repOffset-1) /*intentional overflow*/ < curr - windowLow) /* equivalent to `curr > repIndex >= windowLow` */
- & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */)
+ & (ZSTD_index_overlap_check(dictLimit, repIndex)) )
&& (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) {
repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch;
}
if (dictMode == ZSTD_dictMatchState
&& ( ((repOffset-1) /*intentional overflow*/ < curr - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `curr > repIndex >= dmsLowLimit` */
- & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */
+ & (ZSTD_index_overlap_check(dictLimit, repIndex)) )
&& (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) {
repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch;
} }
@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
repCode, ll0, repOffset, repLen);
bestLength = repLen;
- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */
+ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */
matches[mnum].len = (U32)repLen;
mnum++;
if ( (repLen > sufficient_len)
@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
bestLength = mlen;
assert(curr > matchIndex3);
assert(mnum==0); /* no prior solution */
- matches[0].off = STORE_OFFSET(curr - matchIndex3);
+ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3);
matches[0].len = (U32)mlen;
mnum = 1;
if ( (mlen > sufficient_len) |
@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches (
}
if (matchLength > bestLength) {
- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)",
- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
+ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)",
+ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
assert(matchEndIdx > matchIndex);
if (matchLength > matchEndIdx - matchIndex)
matchEndIdx = matchIndex + (U32)matchLength;
bestLength = matchLength;
- matches[mnum].off = STORE_OFFSET(curr - matchIndex);
+ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
matches[mnum].len = (U32)matchLength;
mnum++;
if ( (matchLength > ZSTD_OPT_NUM)
@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches (
if (matchLength > bestLength) {
matchIndex = dictMatchIndex + dmsIndexDelta;
- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)",
- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
+ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)",
+ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
if (matchLength > matchEndIdx - matchIndex)
matchEndIdx = matchIndex + (U32)matchLength;
bestLength = matchLength;
- matches[mnum].off = STORE_OFFSET(curr - matchIndex);
+ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
matches[mnum].len = (U32)matchLength;
mnum++;
if ( (matchLength > ZSTD_OPT_NUM)
@@ -784,7 +820,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
typedef U32 (*ZSTD_getAllMatchesFn)(
ZSTD_match_t*,
- ZSTD_matchState_t*,
+ ZSTD_MatchState_t*,
U32*,
const BYTE*,
const BYTE*,
@@ -792,9 +828,11 @@ typedef U32 (*ZSTD_getAllMatchesFn)(
U32 const ll0,
U32 const lengthToBeat);
-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal(
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_btGetAllMatches_internal(
ZSTD_match_t* matches,
- ZSTD_matchState_t* ms,
+ ZSTD_MatchState_t* ms,
U32* nextToUpdate3,
const BYTE* ip,
const BYTE* const iHighLimit,
@@ -817,7 +855,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal(
#define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \
static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \
ZSTD_match_t* matches, \
- ZSTD_matchState_t* ms, \
+ ZSTD_MatchState_t* ms, \
U32* nextToUpdate3, \
const BYTE* ip, \
const BYTE* const iHighLimit, \
@@ -849,7 +887,7 @@ GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState)
}
static ZSTD_getAllMatchesFn
-ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const dictMode)
+ZSTD_selectBtGetAllMatches(ZSTD_MatchState_t const* ms, ZSTD_dictMode_e const dictMode)
{
ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = {
ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict),
@@ -868,7 +906,7 @@ ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const di
/* Struct containing info needed to make decision about ldm inclusion */
typedef struct {
- rawSeqStore_t seqStore; /* External match candidates store for this block */
+ RawSeqStore_t seqStore; /* External match candidates store for this block */
U32 startPosInBlock; /* Start position of the current match candidate */
U32 endPosInBlock; /* End position of the current match candidate */
U32 offset; /* Offset of the match candidate */
@@ -878,7 +916,7 @@ typedef struct {
* Moves forward in @rawSeqStore by @nbBytes,
* which will update the fields 'pos' and 'posInSequence'.
*/
-static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes)
+static void ZSTD_optLdm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes)
{
U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes);
while (currPos && rawSeqStore->pos < rawSeqStore->size) {
@@ -935,7 +973,7 @@ ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock
return;
}
- /* Matches may be < MINMATCH by this process. In that case, we will reject them
+ /* Matches may be < minMatch by this process. In that case, we will reject them
when we are deciding whether or not to add the ldm */
optLdm->startPosInBlock = currPosInBlock + literalsBytesRemaining;
optLdm->endPosInBlock = optLdm->startPosInBlock + matchBytesRemaining;
@@ -957,25 +995,26 @@ ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock
* into 'matches'. Maintains the correct ordering of 'matches'.
*/
static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
- const ZSTD_optLdm_t* optLdm, U32 currPosInBlock)
+ const ZSTD_optLdm_t* optLdm, U32 currPosInBlock,
+ U32 minMatch)
{
U32 const posDiff = currPosInBlock - optLdm->startPosInBlock;
- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */
+ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */
U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff;
/* Ensure that current block position is not outside of the match */
if (currPosInBlock < optLdm->startPosInBlock
|| currPosInBlock >= optLdm->endPosInBlock
- || candidateMatchLength < MINMATCH) {
+ || candidateMatchLength < minMatch) {
return;
}
if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) {
- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset);
- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u",
- candidateOffCode, candidateMatchLength, currPosInBlock);
+ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset);
+ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u",
+ candidateOffBase, candidateMatchLength, currPosInBlock);
matches[*nbMatches].len = candidateMatchLength;
- matches[*nbMatches].off = candidateOffCode;
+ matches[*nbMatches].off = candidateOffBase;
(*nbMatches)++;
}
}
@@ -986,7 +1025,8 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
static void
ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm,
ZSTD_match_t* matches, U32* nbMatches,
- U32 currPosInBlock, U32 remainingBytes)
+ U32 currPosInBlock, U32 remainingBytes,
+ U32 minMatch)
{
if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) {
return;
@@ -1003,7 +1043,7 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm,
}
ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes);
}
- ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock);
+ ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock, minMatch);
}
@@ -1011,11 +1051,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm,
* Optimal parser
*********************************/
-static U32 ZSTD_totalLen(ZSTD_optimal_t sol)
-{
- return sol.litlen + sol.mlen;
-}
-
#if 0 /* debug */
static void
@@ -1033,9 +1068,15 @@ listStats(const U32* table, int lastEltID)
#endif
-FORCE_INLINE_TEMPLATE size_t
-ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
- seqStore_t* seqStore,
+#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel)
+#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel)
+#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1))
+
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t
+ZSTD_compressBlock_opt_generic(ZSTD_MatchState_t* ms,
+ SeqStore_t* seqStore,
U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize,
const int optLevel,
@@ -1059,9 +1100,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
ZSTD_optimal_t* const opt = optStatePtr->priceTable;
ZSTD_match_t* const matches = optStatePtr->matchTable;
- ZSTD_optimal_t lastSequence;
+ ZSTD_optimal_t lastStretch;
ZSTD_optLdm_t optLdm;
+ ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t));
+
optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore;
optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0;
ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip));
@@ -1082,103 +1125,140 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
U32 const ll0 = !litlen;
U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch);
ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
- (U32)(ip-istart), (U32)(iend - ip));
- if (!nbMatches) { ip++; continue; }
+ (U32)(ip-istart), (U32)(iend-ip),
+ minMatch);
+ if (!nbMatches) {
+ DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart));
+ ip++;
+ continue;
+ }
+
+ /* Match found: let's store this solution, and eventually find more candidates.
+ * During this forward pass, @opt is used to store stretches,
+ * defined as "a match followed by N literals".
+ * Note how this is different from a Sequence, which is "N literals followed by a match".
+ * Storing stretches allows us to store different match predecessors
+ * for each literal position part of a literals run. */
/* initialize opt[0] */
- { U32 i ; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; }
- opt[0].mlen = 0; /* means is_a_literal */
+ opt[0].mlen = 0; /* there are only literals so far */
opt[0].litlen = litlen;
- /* We don't need to include the actual price of the literals because
- * it is static for the duration of the forward pass, and is included
- * in every price. We include the literal length to avoid negative
- * prices when we subtract the previous literal length.
+ /* No need to include the actual price of the literals before the first match
+ * because it is static for the duration of the forward pass, and is included
+ * in every subsequent price. But, we include the literal length because
+ * the cost variation of litlen depends on the value of litlen.
*/
- opt[0].price = (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel);
+ opt[0].price = LL_PRICE(litlen);
+ ZSTD_STATIC_ASSERT(sizeof(opt[0].rep[0]) == sizeof(rep[0]));
+ ZSTD_memcpy(&opt[0].rep, rep, sizeof(opt[0].rep));
/* large match -> immediate encoding */
{ U32 const maxML = matches[nbMatches-1].len;
- U32 const maxOffcode = matches[nbMatches-1].off;
- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series",
- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart));
+ U32 const maxOffBase = matches[nbMatches-1].off;
+ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series",
+ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart));
if (maxML > sufficient_len) {
- lastSequence.litlen = litlen;
- lastSequence.mlen = maxML;
- lastSequence.off = maxOffcode;
- DEBUGLOG(6, "large match (%u>%u), immediate encoding",
+ lastStretch.litlen = 0;
+ lastStretch.mlen = maxML;
+ lastStretch.off = maxOffBase;
+ DEBUGLOG(6, "large match (%u>%u) => immediate encoding",
maxML, sufficient_len);
cur = 0;
- last_pos = ZSTD_totalLen(lastSequence);
+ last_pos = maxML;
goto _shortestPath;
} }
/* set prices for first matches starting position == 0 */
assert(opt[0].price >= 0);
- { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
- U32 pos;
+ { U32 pos;
U32 matchNb;
for (pos = 1; pos < minMatch; pos++) {
- opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */
+ opt[pos].price = ZSTD_MAX_PRICE;
+ opt[pos].mlen = 0;
+ opt[pos].litlen = litlen + pos;
}
for (matchNb = 0; matchNb < nbMatches; matchNb++) {
- U32 const offcode = matches[matchNb].off;
+ U32 const offBase = matches[matchNb].off;
U32 const end = matches[matchNb].len;
for ( ; pos <= end ; pos++ ) {
- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel);
- U32 const sequencePrice = literalsPrice + matchPrice;
+ int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel);
+ int const sequencePrice = opt[0].price + matchPrice;
DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
pos, ZSTD_fCost(sequencePrice));
opt[pos].mlen = pos;
- opt[pos].off = offcode;
- opt[pos].litlen = litlen;
- opt[pos].price = (int)sequencePrice;
- } }
+ opt[pos].off = offBase;
+ opt[pos].litlen = 0; /* end of match */
+ opt[pos].price = sequencePrice + LL_PRICE(0);
+ }
+ }
last_pos = pos-1;
+ opt[pos].price = ZSTD_MAX_PRICE;
}
}
/* check further positions */
for (cur = 1; cur <= last_pos; cur++) {
const BYTE* const inr = ip + cur;
- assert(cur < ZSTD_OPT_NUM);
- DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur)
+ assert(cur <= ZSTD_OPT_NUM);
+ DEBUGLOG(7, "cPos:%i==rPos:%u", (int)(inr-istart), cur);
/* Fix current position with one literal if cheaper */
- { U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1;
+ { U32 const litlen = opt[cur-1].litlen + 1;
int const price = opt[cur-1].price
- + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel)
- + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel)
- - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel);
+ + LIT_PRICE(ip+cur-1)
+ + LL_INCPRICE(litlen);
assert(price < 1000000000); /* overflow check */
if (price <= opt[cur].price) {
- DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)",
- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen,
+ ZSTD_optimal_t const prevMatch = opt[cur];
+ DEBUGLOG(7, "cPos:%i==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)",
+ (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen,
opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]);
- opt[cur].mlen = 0;
- opt[cur].off = 0;
+ opt[cur] = opt[cur-1];
opt[cur].litlen = litlen;
opt[cur].price = price;
+ if ( (optLevel >= 1) /* additional check only for higher modes */
+ && (prevMatch.litlen == 0) /* replace a match */
+ && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */
+ && LIKELY(ip + cur < iend)
+ ) {
+ /* check next position, in case it would be cheaper */
+ int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1);
+ int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1);
+ DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f",
+ cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals));
+ if ( (with1literal < withMoreLiterals)
+ && (with1literal < opt[cur+1].price) ) {
+ /* update offset history - before it disappears */
+ U32 const prev = cur - prevMatch.mlen;
+ Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0);
+ assert(cur >= prevMatch.mlen);
+ DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !",
+ ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals),
+ newReps.rep[0], newReps.rep[1], newReps.rep[2] );
+ opt[cur+1] = prevMatch; /* mlen & offbase */
+ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(Repcodes_t));
+ opt[cur+1].litlen = 1;
+ opt[cur+1].price = with1literal;
+ if (last_pos < cur+1) last_pos = cur+1;
+ }
+ }
} else {
- DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)",
- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price),
- opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]);
+ DEBUGLOG(7, "cPos:%i==rPos:%u : literal would cost more (%.2f>%.2f)",
+ (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price));
}
}
- /* Set the repcodes of the current position. We must do it here
- * because we rely on the repcodes of the 2nd to last sequence being
- * correct to set the next chunks repcodes during the backward
- * traversal.
+ /* Offset history is not updated during match comparison.
+ * Do it here, now that the match is selected and confirmed.
*/
- ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t));
+ ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(Repcodes_t));
assert(cur >= opt[cur].mlen);
- if (opt[cur].mlen != 0) {
+ if (opt[cur].litlen == 0) {
+ /* just finished a match => alter offset history */
U32 const prev = cur - opt[cur].mlen;
- repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0);
- ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t));
- } else {
- ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t));
+ Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0);
+ ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(Repcodes_t));
}
/* last match must start at a minimum distance of 8 from oend */
@@ -1188,38 +1268,37 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
if ( (optLevel==0) /*static_test*/
&& (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) {
- DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1);
+ DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1);
continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
}
assert(opt[cur].price >= 0);
- { U32 const ll0 = (opt[cur].mlen != 0);
- U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
- U32 const previousPrice = (U32)opt[cur].price;
- U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
+ { U32 const ll0 = (opt[cur].litlen == 0);
+ int const previousPrice = opt[cur].price;
+ int const basePrice = previousPrice + LL_PRICE(0);
U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch);
U32 matchNb;
ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
- (U32)(inr-istart), (U32)(iend-inr));
+ (U32)(inr-istart), (U32)(iend-inr),
+ minMatch);
if (!nbMatches) {
DEBUGLOG(7, "rPos:%u : no match found", cur);
continue;
}
- { U32 const maxML = matches[nbMatches-1].len;
- DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u",
- inr-istart, cur, nbMatches, maxML);
-
- if ( (maxML > sufficient_len)
- || (cur + maxML >= ZSTD_OPT_NUM) ) {
- lastSequence.mlen = maxML;
- lastSequence.off = matches[nbMatches-1].off;
- lastSequence.litlen = litlen;
- cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */
- last_pos = cur + ZSTD_totalLen(lastSequence);
- if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */
+ { U32 const longestML = matches[nbMatches-1].len;
+ DEBUGLOG(7, "cPos:%i==rPos:%u, found %u matches, of longest ML=%u",
+ (int)(inr-istart), cur, nbMatches, longestML);
+
+ if ( (longestML > sufficient_len)
+ || (cur + longestML >= ZSTD_OPT_NUM)
+ || (ip + cur + longestML >= iend) ) {
+ lastStretch.mlen = longestML;
+ lastStretch.off = matches[nbMatches-1].off;
+ lastStretch.litlen = 0;
+ last_pos = cur + longestML;
goto _shortestPath;
} }
@@ -1230,20 +1309,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
U32 mlen;
- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u",
- matchNb, matches[matchNb].off, lastML, litlen);
+ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u",
+ matchNb, matches[matchNb].off, lastML, opt[cur].litlen);
for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */
U32 const pos = cur + mlen;
- int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
+ int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
if ((pos > last_pos) || (price < opt[pos].price)) {
DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)",
pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
- while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */
+ while (last_pos < pos) {
+ /* fill empty positions, for future comparisons */
+ last_pos++;
+ opt[last_pos].price = ZSTD_MAX_PRICE;
+ opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */
+ }
opt[pos].mlen = mlen;
opt[pos].off = offset;
- opt[pos].litlen = litlen;
+ opt[pos].litlen = 0;
opt[pos].price = price;
} else {
DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)",
@@ -1251,55 +1335,89 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */
}
} } }
+ opt[last_pos+1].price = ZSTD_MAX_PRICE;
} /* for (cur = 1; cur <= last_pos; cur++) */
- lastSequence = opt[last_pos];
- cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */
- assert(cur < ZSTD_OPT_NUM); /* control overflow*/
+ lastStretch = opt[last_pos];
+ assert(cur >= lastStretch.mlen);
+ cur = last_pos - lastStretch.mlen;
_shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */
assert(opt[0].mlen == 0);
+ assert(last_pos >= lastStretch.mlen);
+ assert(cur == last_pos - lastStretch.mlen);
- /* Set the next chunk's repcodes based on the repcodes of the beginning
- * of the last match, and the last sequence. This avoids us having to
- * update them while traversing the sequences.
- */
- if (lastSequence.mlen != 0) {
- repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0);
- ZSTD_memcpy(rep, &reps, sizeof(reps));
+ if (lastStretch.mlen==0) {
+ /* no solution : all matches have been converted into literals */
+ assert(lastStretch.litlen == (ip - anchor) + last_pos);
+ ip += last_pos;
+ continue;
+ }
+ assert(lastStretch.off > 0);
+
+ /* Update offset history */
+ if (lastStretch.litlen == 0) {
+ /* finishing on a match : update offset history */
+ Repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0);
+ ZSTD_memcpy(rep, &reps, sizeof(Repcodes_t));
} else {
- ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t));
+ ZSTD_memcpy(rep, lastStretch.rep, sizeof(Repcodes_t));
+ assert(cur >= lastStretch.litlen);
+ cur -= lastStretch.litlen;
}
- { U32 const storeEnd = cur + 1;
+ /* Let's write the shortest path solution.
+ * It is stored in @opt in reverse order,
+ * starting from @storeEnd (==cur+2),
+ * effectively partially @opt overwriting.
+ * Content is changed too:
+ * - So far, @opt stored stretches, aka a match followed by literals
+ * - Now, it will store sequences, aka literals followed by a match
+ */
+ { U32 const storeEnd = cur + 2;
U32 storeStart = storeEnd;
- U32 seqPos = cur;
+ U32 stretchPos = cur;
DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)",
last_pos, cur); (void)last_pos;
- assert(storeEnd < ZSTD_OPT_NUM);
- DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
- storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off);
- opt[storeEnd] = lastSequence;
- while (seqPos > 0) {
- U32 const backDist = ZSTD_totalLen(opt[seqPos]);
+ assert(storeEnd < ZSTD_OPT_SIZE);
+ DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
+ storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off);
+ if (lastStretch.litlen > 0) {
+ /* last "sequence" is unfinished: just a bunch of literals */
+ opt[storeEnd].litlen = lastStretch.litlen;
+ opt[storeEnd].mlen = 0;
+ storeStart = storeEnd-1;
+ opt[storeStart] = lastStretch;
+ } {
+ opt[storeEnd] = lastStretch; /* note: litlen will be fixed */
+ storeStart = storeEnd;
+ }
+ while (1) {
+ ZSTD_optimal_t nextStretch = opt[stretchPos];
+ opt[storeStart].litlen = nextStretch.litlen;
+ DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)",
+ opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off);
+ if (nextStretch.mlen == 0) {
+ /* reaching beginning of segment */
+ break;
+ }
storeStart--;
- DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
- seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off);
- opt[storeStart] = opt[seqPos];
- seqPos = (seqPos > backDist) ? seqPos - backDist : 0;
+ opt[storeStart] = nextStretch; /* note: litlen will be fixed */
+ assert(nextStretch.litlen + nextStretch.mlen <= stretchPos);
+ stretchPos -= nextStretch.litlen + nextStretch.mlen;
}
/* save sequences */
- DEBUGLOG(6, "sending selected sequences into seqStore")
+ DEBUGLOG(6, "sending selected sequences into seqStore");
{ U32 storePos;
for (storePos=storeStart; storePos <= storeEnd; storePos++) {
U32 const llen = opt[storePos].litlen;
U32 const mlen = opt[storePos].mlen;
- U32 const offCode = opt[storePos].off;
+ U32 const offBase = opt[storePos].off;
U32 const advance = llen + mlen;
- DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u",
- anchor - istart, (unsigned)llen, (unsigned)mlen);
+ DEBUGLOG(6, "considering seq starting at %i, llen=%u, mlen=%u",
+ (int)(anchor - istart), (unsigned)llen, (unsigned)mlen);
if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */
assert(storePos == storeEnd); /* must be last sequence */
@@ -1308,11 +1426,14 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */
}
assert(anchor + llen <= iend);
- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen);
- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen);
+ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen);
+ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen);
anchor += advance;
ip = anchor;
} }
+ DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]);
+
+ /* update all costs */
ZSTD_setBasePrices(optStatePtr, optLevel);
}
} /* while (ip < ilimit) */
@@ -1320,42 +1441,51 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */
/* Return the last literals size */
return (size_t)(iend - anchor);
}
+#endif /* build exclusions */
+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
static size_t ZSTD_compressBlock_opt0(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
{
return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode);
}
+#endif
+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
static size_t ZSTD_compressBlock_opt2(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
{
return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode);
}
+#endif
+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_btopt(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize)
{
DEBUGLOG(5, "ZSTD_compressBlock_btopt");
return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
}
+#endif
+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
/* ZSTD_initStats_ultra():
* make a first compression pass, just to seed stats with more accurate starting values.
* only works on first block, with no dictionary and no ldm.
- * this function cannot error, hence its contract must be respected.
+ * this function cannot error out, its narrow contract must be respected.
*/
-static void
-ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
- seqStore_t* seqStore,
- U32 rep[ZSTD_REP_NUM],
- const void* src, size_t srcSize)
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_initStats_ultra(ZSTD_MatchState_t* ms,
+ SeqStore_t* seqStore,
+ U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
{
U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */
ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep));
@@ -1368,7 +1498,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/
- /* invalidate first scan from history */
+ /* invalidate first scan from history, only keep entropy stats */
ZSTD_resetSeqStore(seqStore);
ms->window.base -= srcSize;
ms->window.dictLimit += (U32)srcSize;
@@ -1378,7 +1508,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
}
size_t ZSTD_compressBlock_btultra(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize)
{
DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize);
@@ -1386,16 +1516,16 @@ size_t ZSTD_compressBlock_btultra(
}
size_t ZSTD_compressBlock_btultra2(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize)
{
U32 const curr = (U32)((const BYTE*)src - ms->window.base);
DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize);
- /* 2-pass strategy:
+ /* 2-passes strategy:
* this strategy makes a first pass over first block to collect statistics
- * and seed next round's statistics with it.
- * After 1st pass, function forgets everything, and starts a new block.
+ * in order to seed next round's statistics with it.
+ * After 1st pass, function forgets history, and starts a new block.
* Consequently, this can only work if no data has been previously loaded in tables,
* aka, no dictionary, no prefix, no ldm preprocessing.
* The compression ratio gain is generally small (~0.5% on first block),
@@ -1404,42 +1534,47 @@ size_t ZSTD_compressBlock_btultra2(
if ( (ms->opt.litLengthSum==0) /* first block */
&& (seqStore->sequences == seqStore->sequencesStart) /* no ldm */
&& (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */
- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */
- && (srcSize > ZSTD_PREDEF_THRESHOLD)
+ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */
+ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */
) {
ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
}
return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
}
+#endif
+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
size_t ZSTD_compressBlock_btopt_dictMatchState(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize)
{
return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
}
-size_t ZSTD_compressBlock_btultra_dictMatchState(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+size_t ZSTD_compressBlock_btopt_extDict(
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize)
{
- return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
+ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
}
+#endif
-size_t ZSTD_compressBlock_btopt_extDict(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btultra_dictMatchState(
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize)
{
- return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
+ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
}
size_t ZSTD_compressBlock_btultra_extDict(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize)
{
return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
}
+#endif
/* note : no btultra2 variant for extDict nor dictMatchState,
* because btultra2 is not meant to work with dictionaries