Diffstat (limited to 'lib/zstd/compress')
-rw-r--r--  lib/zstd/compress/clevels.h                   132
-rw-r--r--  lib/zstd/compress/fse_compress.c               83
-rw-r--r--  lib/zstd/compress/huf_compress.c              644
-rw-r--r--  lib/zstd/compress/zstd_compress.c            2000
-rw-r--r--  lib/zstd/compress/zstd_compress_internal.h    375
-rw-r--r--  lib/zstd/compress/zstd_compress_literals.c      9
-rw-r--r--  lib/zstd/compress/zstd_compress_literals.h      4
-rw-r--r--  lib/zstd/compress/zstd_compress_sequences.c    31
-rw-r--r--  lib/zstd/compress/zstd_compress_superblock.c  295
-rw-r--r--  lib/zstd/compress/zstd_cwksp.h                225
-rw-r--r--  lib/zstd/compress/zstd_double_fast.c          413
-rw-r--r--  lib/zstd/compress/zstd_fast.c                 441
-rw-r--r--  lib/zstd/compress/zstd_lazy.c                1352
-rw-r--r--  lib/zstd/compress/zstd_lazy.h                  38
-rw-r--r--  lib/zstd/compress/zstd_ldm.c                   76
-rw-r--r--  lib/zstd/compress/zstd_ldm.h                    1
-rw-r--r--  lib/zstd/compress/zstd_ldm_geartab.h            5
-rw-r--r--  lib/zstd/compress/zstd_opt.c                  402
18 files changed, 4706 insertions, 1820 deletions
diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h
new file mode 100644
index 000000000000..d9a76112ec3a
--- /dev/null
+++ b/lib/zstd/compress/clevels.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CLEVELS_H
+#define ZSTD_CLEVELS_H
+
+#define ZSTD_STATIC_LINKING_ONLY  /* ZSTD_compressionParameters  */
+#include <linux/zstd.h>
+
+/*-=====  Pre-defined compression levels  =====-*/
+
+#define ZSTD_MAX_CLEVEL     22
+
+__attribute__((__unused__))
+
+static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {
+{   /* "default" - for any srcSize > 256 KB */
+    /* W,  C,  H,  S,  L, TL, strat */
+    { 19, 12, 13,  1,  6,  1, ZSTD_fast    },  /* base for negative levels */
+    { 19, 13, 14,  1,  7,  0, ZSTD_fast    },  /* level  1 */
+    { 20, 15, 16,  1,  6,  0, ZSTD_fast    },  /* level  2 */
+    { 21, 16, 17,  1,  5,  0, ZSTD_dfast   },  /* level  3 */
+    { 21, 18, 18,  1,  5,  0, ZSTD_dfast   },  /* level  4 */
+    { 21, 18, 19,  3,  5,  2, ZSTD_greedy  },  /* level  5 */
+    { 21, 18, 19,  3,  5,  4, ZSTD_lazy    },  /* level  6 */
+    { 21, 19, 20,  4,  5,  8, ZSTD_lazy    },  /* level  7 */
+    { 21, 19, 20,  4,  5, 16, ZSTD_lazy2   },  /* level  8 */
+    { 22, 20, 21,  4,  5, 16, ZSTD_lazy2   },  /* level  9 */
+    { 22, 21, 22,  5,  5, 16, ZSTD_lazy2   },  /* level 10 */
+    { 22, 21, 22,  6,  5, 16, ZSTD_lazy2   },  /* level 11 */
+    { 22, 22, 23,  6,  5, 32, ZSTD_lazy2   },  /* level 12 */
+    { 22, 22, 22,  4,  5, 32, ZSTD_btlazy2 },  /* level 13 */
+    { 22, 22, 23,  5,  5, 32, ZSTD_btlazy2 },  /* level 14 */
+    { 22, 23, 23,  6,  5, 32, ZSTD_btlazy2 },  /* level 15 */
+    { 22, 22, 22,  5,  5, 48, ZSTD_btopt   },  /* level 16 */
+    { 23, 23, 22,  5,  4, 64, ZSTD_btopt   },  /* level 17 */
+    { 23, 23, 22,  6,  3, 64, ZSTD_btultra },  /* level 18 */
+    { 23, 24, 22,  7,  3,256, ZSTD_btultra2},  /* level 19 */
+    { 25, 25, 23,  7,  3,256, ZSTD_btultra2},  /* level 20 */
+    { 26, 26, 24,  7,  3,512, ZSTD_btultra2},  /* level 21 */
+    { 27, 27, 25,  9,  3,999, ZSTD_btultra2},  /* level 22 */
+},
+{   /* for srcSize <= 256 KB */
+    /* W,  C,  H,  S,  L,  T, strat */
+    { 18, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 18, 13, 14,  1,  6,  0, ZSTD_fast    },  /* level  1 */
+    { 18, 14, 14,  1,  5,  0, ZSTD_dfast   },  /* level  2 */
+    { 18, 16, 16,  1,  4,  0, ZSTD_dfast   },  /* level  3 */
+    { 18, 16, 17,  3,  5,  2, ZSTD_greedy  },  /* level  4.*/
+    { 18, 17, 18,  5,  5,  2, ZSTD_greedy  },  /* level  5.*/
+    { 18, 18, 19,  3,  5,  4, ZSTD_lazy    },  /* level  6.*/
+    { 18, 18, 19,  4,  4,  4, ZSTD_lazy    },  /* level  7 */
+    { 18, 18, 19,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
+    { 18, 18, 19,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
+    { 18, 18, 19,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
+    { 18, 18, 19,  5,  4, 12, ZSTD_btlazy2 },  /* level 11.*/
+    { 18, 19, 19,  7,  4, 12, ZSTD_btlazy2 },  /* level 12.*/
+    { 18, 18, 19,  4,  4, 16, ZSTD_btopt   },  /* level 13 */
+    { 18, 18, 19,  4,  3, 32, ZSTD_btopt   },  /* level 14.*/
+    { 18, 18, 19,  6,  3,128, ZSTD_btopt   },  /* level 15.*/
+    { 18, 19, 19,  6,  3,128, ZSTD_btultra },  /* level 16.*/
+    { 18, 19, 19,  8,  3,256, ZSTD_btultra },  /* level 17.*/
+    { 18, 19, 19,  6,  3,128, ZSTD_btultra2},  /* level 18.*/
+    { 18, 19, 19,  8,  3,256, ZSTD_btultra2},  /* level 19.*/
+    { 18, 19, 19, 10,  3,512, ZSTD_btultra2},  /* level 20.*/
+    { 18, 19, 19, 12,  3,512, ZSTD_btultra2},  /* level 21.*/
+    { 18, 19, 19, 13,  3,999, ZSTD_btultra2},  /* level 22.*/
+},
+{   /* for srcSize <= 128 KB */
+    /* W,  C,  H,  S,  L,  T, strat */
+    { 17, 12, 12,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 17, 12, 13,  1,  6,  0, ZSTD_fast    },  /* level  1 */
+    { 17, 13, 15,  1,  5,  0, ZSTD_fast    },  /* level  2 */
+    { 17, 15, 16,  2,  5,  0, ZSTD_dfast   },  /* level  3 */
+    { 17, 17, 17,  2,  4,  0, ZSTD_dfast   },  /* level  4 */
+    { 17, 16, 17,  3,  4,  2, ZSTD_greedy  },  /* level  5 */
+    { 17, 16, 17,  3,  4,  4, ZSTD_lazy    },  /* level  6 */
+    { 17, 16, 17,  3,  4,  8, ZSTD_lazy2   },  /* level  7 */
+    { 17, 16, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
+    { 17, 16, 17,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
+    { 17, 16, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
+    { 17, 17, 17,  5,  4,  8, ZSTD_btlazy2 },  /* level 11 */
+    { 17, 18, 17,  7,  4, 12, ZSTD_btlazy2 },  /* level 12 */
+    { 17, 18, 17,  3,  4, 12, ZSTD_btopt   },  /* level 13.*/
+    { 17, 18, 17,  4,  3, 32, ZSTD_btopt   },  /* level 14.*/
+    { 17, 18, 17,  6,  3,256, ZSTD_btopt   },  /* level 15.*/
+    { 17, 18, 17,  6,  3,128, ZSTD_btultra },  /* level 16.*/
+    { 17, 18, 17,  8,  3,256, ZSTD_btultra },  /* level 17.*/
+    { 17, 18, 17, 10,  3,512, ZSTD_btultra },  /* level 18.*/
+    { 17, 18, 17,  5,  3,256, ZSTD_btultra2},  /* level 19.*/
+    { 17, 18, 17,  7,  3,512, ZSTD_btultra2},  /* level 20.*/
+    { 17, 18, 17,  9,  3,512, ZSTD_btultra2},  /* level 21.*/
+    { 17, 18, 17, 11,  3,999, ZSTD_btultra2},  /* level 22.*/
+},
+{   /* for srcSize <= 16 KB */
+    /* W,  C,  H,  S,  L,  T, strat */
+    { 14, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */
+    { 14, 14, 15,  1,  5,  0, ZSTD_fast    },  /* level  1 */
+    { 14, 14, 15,  1,  4,  0, ZSTD_fast    },  /* level  2 */
+    { 14, 14, 15,  2,  4,  0, ZSTD_dfast   },  /* level  3 */
+    { 14, 14, 14,  4,  4,  2, ZSTD_greedy  },  /* level  4 */
+    { 14, 14, 14,  3,  4,  4, ZSTD_lazy    },  /* level  5.*/
+    { 14, 14, 14,  4,  4,  8, ZSTD_lazy2   },  /* level  6 */
+    { 14, 14, 14,  6,  4,  8, ZSTD_lazy2   },  /* level  7 */
+    { 14, 14, 14,  8,  4,  8, ZSTD_lazy2   },  /* level  8.*/
+    { 14, 15, 14,  5,  4,  8, ZSTD_btlazy2 },  /* level  9.*/
+    { 14, 15, 14,  9,  4,  8, ZSTD_btlazy2 },  /* level 10.*/
+    { 14, 15, 14,  3,  4, 12, ZSTD_btopt   },  /* level 11.*/
+    { 14, 15, 14,  4,  3, 24, ZSTD_btopt   },  /* level 12.*/
+    { 14, 15, 14,  5,  3, 32, ZSTD_btultra },  /* level 13.*/
+    { 14, 15, 15,  6,  3, 64, ZSTD_btultra },  /* level 14.*/
+    { 14, 15, 15,  7,  3,256, ZSTD_btultra },  /* level 15.*/
+    { 14, 15, 15,  5,  3, 48, ZSTD_btultra2},  /* level 16.*/
+    { 14, 15, 15,  6,  3,128, ZSTD_btultra2},  /* level 17.*/
+    { 14, 15, 15,  7,  3,256, ZSTD_btultra2},  /* level 18.*/
+    { 14, 15, 15,  8,  3,256, ZSTD_btultra2},  /* level 19.*/
+    { 14, 15, 15,  8,  3,512, ZSTD_btultra2},  /* level 20.*/
+    { 14, 15, 15,  9,  3,512, ZSTD_btultra2},  /* level 21.*/
+    { 14, 15, 15, 10,  3,999, ZSTD_btultra2},  /* level 22.*/
+},
+};
+
+
+
+#endif  /* ZSTD_CLEVELS_H */
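Each row above is (windowLog, chainLog, hashLog, searchLog, minMatch, targetLength, strategy), and the outer dimension selects a source-size class. As a rough sketch of how a caller ends up on a row (the real selection lives in ZSTD_getCParams(), which also clamps levels and refines parameters; srcSizeClass() below is a hypothetical stand-in, not a kernel function):

    /* Illustrative only: map a known srcSize to one of the four row groups. */
    static int srcSizeClass(unsigned long long srcSize)
    {
        if (srcSize > 256 * 1024) return 0;   /* "default" row group */
        if (srcSize > 128 * 1024) return 1;   /* <= 256 KB */
        if (srcSize >  16 * 1024) return 2;   /* <= 128 KB */
        return 3;                             /* <= 16 KB */
    }
    /* e.g. level 3 on a 100 KB input -> ZSTD_defaultCParameters[2][3]
     *      == { 17, 15, 16, 2, 5, 0, ZSTD_dfast } */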
diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c
index 436985b620e5..ec5b1ca6d71a 100644
--- a/lib/zstd/compress/fse_compress.c
+++ b/lib/zstd/compress/fse_compress.c
@@ -75,13 +75,14 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
     void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
     FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
     U32 const step = FSE_TABLESTEP(tableSize);
+    U32 const maxSV1 = maxSymbolValue+1;
 
-    U32* cumul = (U32*)workSpace;
-    FSE_FUNCTION_TYPE* tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSymbolValue + 2));
+    U16* cumul = (U16*)workSpace;   /* size = maxSV1 */
+    FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSV1+1));  /* size = tableSize */
 
     U32 highThreshold = tableSize-1;
 
-    if ((size_t)workSpace & 3) return ERROR(GENERIC); /* Must be 4 byte aligned */
+    assert(((size_t)workSpace & 1) == 0);  /* Must be 2 bytes-aligned */
     if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSize) return ERROR(tableLog_tooLarge);
     /* CTable header */
     tableU16[-2] = (U16) tableLog;
@@ -98,20 +99,61 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
     /* symbol start positions */
     {   U32 u;
         cumul[0] = 0;
-        for (u=1; u <= maxSymbolValue+1; u++) {
+        for (u=1; u <= maxSV1; u++) {
             if (normalizedCounter[u-1]==-1) {  /* Low proba symbol */
                 cumul[u] = cumul[u-1] + 1;
                 tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
             } else {
-                cumul[u] = cumul[u-1] + normalizedCounter[u-1];
+                assert(normalizedCounter[u-1] >= 0);
+                cumul[u] = cumul[u-1] + (U16)normalizedCounter[u-1];
+                assert(cumul[u] >= cumul[u-1]);  /* no overflow */
         }   }
-        cumul[maxSymbolValue+1] = tableSize+1;
+        cumul[maxSV1] = (U16)(tableSize+1);
     }
 
     /* Spread symbols */
-    {   U32 position = 0;
+    if (highThreshold == tableSize - 1) {
+        /* Case for no low prob count symbols. Lay down 8 bytes at a time
+         * to reduce branch misses since we are operating on a small block
+         */
+        BYTE* const spread = tableSymbol + tableSize; /* size = tableSize + 8 (may write beyond tableSize) */
+        {   U64 const add = 0x0101010101010101ull;
+            size_t pos = 0;
+            U64 sv = 0;
+            U32 s;
+            for (s=0; s<maxSV1; ++s, sv += add) {
+                int i;
+                int const n = normalizedCounter[s];
+                MEM_write64(spread + pos, sv);
+                for (i = 8; i < n; i += 8) {
+                    MEM_write64(spread + pos + i, sv);
+                }
+                assert(n>=0);
+                pos += (size_t)n;
+            }
+        }
+        /* Spread symbols across the table. Lack of lowprob symbols means that
+         * we don't need variable sized inner loop, so we can unroll the loop and
+         * reduce branch misses.
+         */
+        {   size_t position = 0;
+            size_t s;
+            size_t const unroll = 2; /* Experimentally determined optimal unroll */
+            assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+            for (s = 0; s < (size_t)tableSize; s += unroll) {
+                size_t u;
+                for (u = 0; u < unroll; ++u) {
+                    size_t const uPosition = (position + (u * step)) & tableMask;
+                    tableSymbol[uPosition] = spread[s + u];
+                }
+                position = (position + (unroll * step)) & tableMask;
+            }
+            assert(position == 0);   /* Must have initialized all positions */
+        }
+    } else {
+        U32 position = 0;
         U32 symbol;
-        for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+        for (symbol=0; symbol<maxSV1; symbol++) {
             int nbOccurrences;
             int const freq = normalizedCounter[symbol];
             for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) {
@@ -120,7 +162,6 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
                 while (position > highThreshold)
                     position = (position + step) & tableMask;   /* Low proba area */
         }   }
-
         assert(position==0);  /* Must have initialized all positions */
     }
 
@@ -144,16 +185,17 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
             case -1:
             case  1:
                 symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
-                symbolTT[s].deltaFindState = total - 1;
+                assert(total <= INT_MAX);
+                symbolTT[s].deltaFindState = (int)(total - 1);
                 total ++;
                 break;
             default :
-                {
-                    U32 const maxBitsOut = tableLog - BIT_highbit32 (normalizedCounter[s]-1);
-                    U32 const minStatePlus = normalizedCounter[s] << maxBitsOut;
+                assert(normalizedCounter[s] > 1);
+                {   U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1);
+                    U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut;
                     symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
-                    symbolTT[s].deltaFindState = total - normalizedCounter[s];
-                    total +=  normalizedCounter[s];
+                    symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]);
+                    total +=  (unsigned)normalizedCounter[s];
     }   }   }   }
 
 #if 0  /* debug : symbol costs */
@@ -164,8 +206,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
                 symbol, normalizedCounter[symbol],
                 FSE_getMaxNbBits(symbolTT, symbol),
                 (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256);
-        }
-    }
+    }   }
 #endif
 
     return 0;
@@ -173,16 +214,18 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
 
-
 #ifndef FSE_COMMONDEFS_ONLY
 
-
 /*-**************************************************************
 *  FSE NCount encoding
 ****************************************************************/
 size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
 {
-    size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 3;
+    size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog
+                                   + 4 /* bitCount initialized at 4 */
+                                   + 2 /* first two symbols may use one additional bit each */) / 8)
+                                    + 1 /* round up to whole nb bytes */
+                                    + 2 /* additional two bytes for bitstream flush */;
     return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ? use default */
 }
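The new no-low-probability spread path is easier to see in isolation: each symbol's run is laid down 8 bytes at a time by broadcasting the symbol value across a 64-bit word, and writing past a run's end is harmless because the next run (or the 8 bytes of slack after the table) overwrites it. A self-contained sketch of the same idea, with memcpy standing in for MEM_write64 and assuming no -1 counts, as in the fast path:

    #include <stdint.h>
    #include <string.h>

    /* Toy version of the run fill above. 'spread' must have room for
     * total + 8 bytes, since each write emits a full 8-byte word. */
    static void fillRuns(uint8_t *spread, const int16_t *normCount, unsigned maxSV1)
    {
        const uint64_t add = 0x0101010101010101ull;
        uint64_t sv = 0;        /* 8 copies of the current symbol value */
        size_t pos = 0;
        unsigned s;
        for (s = 0; s < maxSV1; ++s, sv += add) {
            int i;
            int const n = normCount[s];
            memcpy(spread + pos, &sv, 8);
            for (i = 8; i < n; i += 8)
                memcpy(spread + pos + i, &sv, 8);
            pos += (size_t)n;   /* only advance by the true run length */
        }
    }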
diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compress.c
index f76a526bfa54..74ef0db47621 100644
--- a/lib/zstd/compress/huf_compress.c
+++ b/lib/zstd/compress/huf_compress.c
@@ -50,6 +50,28 @@ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxS
 /* *******************************************************
 *  HUF : Huffman block compression
 *********************************************************/
+#define HUF_WORKSPACE_MAX_ALIGNMENT 8
+
+static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePtr, size_t align)
+{
+    size_t const mask = align - 1;
+    size_t const rem = (size_t)workspace & mask;
+    size_t const add = (align - rem) & mask;
+    BYTE* const aligned = (BYTE*)workspace + add;
+    assert((align & (align - 1)) == 0); /* pow 2 */
+    assert(align <= HUF_WORKSPACE_MAX_ALIGNMENT);
+    if (*workspaceSizePtr >= add) {
+        assert(add < align);
+        assert(((size_t)aligned & mask) == 0);
+        *workspaceSizePtr -= add;
+        return aligned;
+    } else {
+        *workspaceSizePtr = 0;
+        return NULL;
+    }
+}
+
+
 /* HUF_compressWeights() :
  * Same as FSE_compress(), but dedicated to huff0's weights compression.
  * The use case needs much less stack memory.
@@ -72,7 +94,7 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT
     unsigned maxSymbolValue = HUF_TABLELOG_MAX;
     U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
 
-    HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)workspace;
+    HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
 
     if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENERIC);
 
@@ -103,6 +125,40 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT
     return (size_t)(op-ostart);
 }
 
+static size_t HUF_getNbBits(HUF_CElt elt)
+{
+    return elt & 0xFF;
+}
+
+static size_t HUF_getNbBitsFast(HUF_CElt elt)
+{
+    return elt;
+}
+
+static size_t HUF_getValue(HUF_CElt elt)
+{
+    return elt & ~0xFF;
+}
+
+static size_t HUF_getValueFast(HUF_CElt elt)
+{
+    return elt;
+}
+
+static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits)
+{
+    assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX);
+    *elt = nbBits;
+}
+
+static void HUF_setValue(HUF_CElt* elt, size_t value)
+{
+    size_t const nbBits = HUF_getNbBits(*elt);
+    if (nbBits > 0) {
+        assert((value >> nbBits) == 0);
+        *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits);
+    }
+}
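These accessors replace the old two-field struct: a HUF_CElt is now a single size_t with nbBits in the low byte and the code value left-aligned in the top nbBits bits, which is what lets HUF_addBits() consume it with one shift and one OR. A minimal round-trip of that layout (assuming a 64-bit size_t):

    #include <assert.h>
    #include <stddef.h>

    int main(void)
    {
        size_t elt = 0;                 /* stands in for a HUF_CElt */
        size_t const nbBits = 5;
        size_t const value  = 0x13;     /* must fit in nbBits */
        elt = nbBits;                                   /* like HUF_setNbBits */
        elt |= value << (sizeof(elt) * 8 - nbBits);     /* like HUF_setValue  */
        assert((elt & 0xFF) == nbBits);                 /* like HUF_getNbBits */
        assert((elt >> (sizeof(elt) * 8 - nbBits)) == value);
        return 0;
    }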
 
 typedef struct {
     HUF_CompressWeightsWksp wksp;
@@ -114,9 +170,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
                             const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog,
                             void* workspace, size_t workspaceSize)
 {
+    HUF_CElt const* const ct = CTable + 1;
     BYTE* op = (BYTE*)dst;
     U32 n;
 
-    HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)workspace;
+    HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
 
     /* check conditions */
     if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
@@ -127,9 +184,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
     for (n=1; n<huffLog+1; n++)
         wksp->bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
     for (n=0; n<maxSymbolValue; n++)
-        wksp->huffWeight[n] = wksp->bitsToWeight[CTable[n].nbBits];
+        wksp->huffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])];
 
     /* attempt weights compression by FSE */
+    if (maxDstSize < 1) return ERROR(dstSize_tooSmall);
     {   CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) );
         if ((hSize>1) & (hSize < maxSymbolValue/2)) {   /* FSE compressed */
             op[0] = (BYTE)hSize;
@@ -163,6 +221,7 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
     U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];   /* large enough for values from 0 to 16 */
     U32 tableLog = 0;
     U32 nbSymbols = 0;
+    HUF_CElt* const ct = CTable + 1;
 
     /* get symbol weights */
     CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
@@ -172,6 +231,8 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
     if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
     if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
 
+    CTable[0] = tableLog;
+
     /* Prepare base value per rank */
     {   U32 n, nextRankStart = 0;
         for (n=1; n<=tableLog; n++) {
@@ -183,13 +244,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
     /* fill nbBits */
     {   U32 n; for (n=0; n<nbSymbols; n++) {
             const U32 w = huffWeight[n];
-            CTable[n].nbBits = (BYTE)(tableLog + 1 - w) & -(w != 0);
+            HUF_setNbBits(ct + n, (BYTE)(tableLog + 1 - w) & -(w != 0));
     }   }
 
     /* fill val */
     {   U16 nbPerRank[HUF_TABLELOG_MAX+2]  = {0};  /* support w=0=>n=tableLog+1 */
         U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
-        { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; }
+        { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[HUF_getNbBits(ct[n])]++; }
         /* determine stating value per rank */
         valPerRank[tableLog+1] = 0;   /* for w==0 */
         {   U16 min = 0;
@@ -199,18 +260,18 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
                 min >>= 1;
         }   }
         /* assign value within rank, symbol order */
-        { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; }
+        { U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); }
     }
 
     *maxSymbolValuePtr = nbSymbols - 1;
     return readSize;
 }
 
-U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue)
+U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
 {
-    const HUF_CElt* table = (const HUF_CElt*)symbolTable;
+    const HUF_CElt* ct = CTable + 1;
     assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
-    return table[symbolValue].nbBits;
+    return (U32)HUF_getNbBits(ct[symbolValue]);
 }
 
@@ -364,22 +425,118 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
 }
 
 typedef struct {
-    U32 base;
-    U32 curr;
+    U16 base;
+    U16 curr;
 } rankPos;
 
 typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
 
-#define RANK_POSITION_TABLE_SIZE 32
+/* Number of buckets available for HUF_sort() */
+#define RANK_POSITION_TABLE_SIZE 192
 
 typedef struct {
   huffNodeTable huffNodeTbl;
   rankPos rankPosition[RANK_POSITION_TABLE_SIZE];
 } HUF_buildCTable_wksp_tables;
 
+/* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing.
+ * Strategy is to use as many buckets as possible for representing distinct
+ * counts while using the remainder to represent all "large" counts.
+ *
+ * To satisfy this requirement for 192 buckets, we can do the following:
+ * Let buckets 0-166 represent distinct counts of [0, 166]
+ * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
+ */
+#define RANK_POSITION_MAX_COUNT_LOG 32
+#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */
+#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */
+
+/* Return the appropriate bucket index for a given count. See definition of
+ * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
+ */
+static U32 HUF_getIndex(U32 const count) {
+    return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
+        ? count
+        : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
+}
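The bucketing is easier to see in a toy mirror of HUF_getIndex(): counts below the cutoff each get their own bucket, while larger counts share log2 buckets, so the mapping stays monotone and bounded by the table size. Names below are local stand-ins, not the kernel macros:

    #include <stdio.h>

    static unsigned highbit(unsigned v) { unsigned n = 0; while (v >>= 1) n++; return n; }

    #define TABLE_SIZE        192
    #define MAX_COUNT_LOG      32
    #define LOG_BUCKETS_BEGIN ((TABLE_SIZE - 1) - MAX_COUNT_LOG - 1)
    #define CUTOFF            (LOG_BUCKETS_BEGIN + highbit(LOG_BUCKETS_BEGIN))

    static unsigned getIndex(unsigned count)
    {
        return count < CUTOFF ? count : highbit(count) + LOG_BUCKETS_BEGIN;
    }

    int main(void)
    {
        /* small counts: distinct buckets; large counts: shared log2 buckets */
        printf("%u %u %u\n", getIndex(5), getIndex(1000), getIndex(1u << 20));
        return 0;
    }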
+
+/* Helper swap function for HUF_quickSortPartition() */
+static void HUF_swapNodes(nodeElt* a, nodeElt* b) {
+	nodeElt tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
+/* Returns 0 if the huffNode array is not sorted by descending count */
+MEM_STATIC int HUF_isSorted(nodeElt huffNode[], U32 const maxSymbolValue1) {
+    U32 i;
+    for (i = 1; i < maxSymbolValue1; ++i) {
+        if (huffNode[i].count > huffNode[i-1].count) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+/* Insertion sort by descending order */
+HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) {
+    int i;
+    int const size = high-low+1;
+    huffNode += low;
+    for (i = 1; i < size; ++i) {
+        nodeElt const key = huffNode[i];
+        int j = i - 1;
+        while (j >= 0 && huffNode[j].count < key.count) {
+            huffNode[j + 1] = huffNode[j];
+            j--;
+        }
+        huffNode[j + 1] = key;
+    }
+}
+
+/* Pivot helper function for quicksort. */
+static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) {
+    /* Simply select rightmost element as pivot. "Better" selectors like
+     * median-of-three don't experimentally appear to have any benefit.
+     */
+    U32 const pivot = arr[high].count;
+    int i = low - 1;
+    int j = low;
+    for ( ; j < high; j++) {
+        if (arr[j].count > pivot) {
+            i++;
+            HUF_swapNodes(&arr[i], &arr[j]);
+        }
+    }
+    HUF_swapNodes(&arr[i + 1], &arr[high]);
+    return i + 1;
+}
+
+/* Classic quicksort by descending with partially iterative calls
+ * to reduce worst case callstack size.
+ */
+static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) {
+    int const kInsertionSortThreshold = 8;
+    if (high - low < kInsertionSortThreshold) {
+        HUF_insertionSort(arr, low, high);
+        return;
+    }
+    while (low < high) {
+        int const idx = HUF_quickSortPartition(arr, low, high);
+        if (idx - low < high - idx) {
+            HUF_simpleQuickSort(arr, low, idx - 1);
+            low = idx + 1;
+        } else {
+            HUF_simpleQuickSort(arr, idx + 1, high);
+            high = idx - 1;
+        }
+    }
+}
+
 /*
  * HUF_sort():
  * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order.
+ * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket.
  *
  * @param[out] huffNode       Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled.
  *                            Must have (maxSymbolValue + 1) entries.
@@ -387,44 +544,52 @@ typedef struct {
  * @param[in]  maxSymbolValue Maximum symbol value.
  * @param      rankPosition   This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries.
  */
-static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition)
-{
-    int n;
-    int const maxSymbolValue1 = (int)maxSymbolValue + 1;
+static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) {
+    U32 n;
+    U32 const maxSymbolValue1 = maxSymbolValue+1;
 
     /* Compute base and set curr to base.
-     * For symbol s let lowerRank = BIT_highbit32(count[n]+1) and rank = lowerRank + 1.
-     * Then 2^lowerRank <= count[n]+1 <= 2^rank.
+     * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1.
+     * See HUF_getIndex to see bucketing strategy.
      * We attribute each symbol to lowerRank's base value, because we want to know where
     * each rank begins in the output, so for rank R we want to count ranks R+1 and above.
     */
    ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE);
    for (n = 0; n < maxSymbolValue1; ++n) {
-        U32 lowerRank = BIT_highbit32(count[n] + 1);
+        U32 lowerRank = HUF_getIndex(count[n]);
+        assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1);
        rankPosition[lowerRank].base++;
    }
+
    assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0);
+    /* Set up the rankPosition table */
    for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) {
        rankPosition[n-1].base += rankPosition[n].base;
        rankPosition[n-1].curr = rankPosition[n-1].base;
    }
-    /* Sort */
+
+    /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */
    for (n = 0; n < maxSymbolValue1; ++n) {
        U32 const c = count[n];
-        U32 const r = BIT_highbit32(c+1) + 1;
-        U32 pos = rankPosition[r].curr++;
-        /* Insert into the correct position in the rank.
-         * We have at most 256 symbols, so this insertion should be fine.
-         */
-        while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) {
-            huffNode[pos] = huffNode[pos-1];
-            pos--;
-        }
+        U32 const r = HUF_getIndex(c) + 1;
+        U32 const pos = rankPosition[r].curr++;
+        assert(pos < maxSymbolValue1);
        huffNode[pos].count = c;
        huffNode[pos].byte  = (BYTE)n;
    }
-}
+
+    /* Sort each bucket. */
+    for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {
+        U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base;
+        U32 const bucketStartIdx = rankPosition[n].base;
+        if (bucketSize > 1) {
+            assert(bucketStartIdx < maxSymbolValue1);
+            HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1);
+        }
+    }
+
+    assert(HUF_isSorted(huffNode, maxSymbolValue1));
+}
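HUF_simpleQuickSort() uses the classic stack-depth trick named in its comment: recurse only into the smaller partition and loop on the larger one, so call depth stays O(log n) even on adversarial input. The same pattern on plain ints, for illustration:

    /* Toy descending quicksort with the same bounded-recursion shape. */
    static int partitionDesc(int a[], int lo, int hi)
    {
        int const pivot = a[hi];
        int i = lo - 1, j;
        for (j = lo; j < hi; j++)
            if (a[j] > pivot) { int t = a[++i]; a[i] = a[j]; a[j] = t; }
        { int t = a[i + 1]; a[i + 1] = a[hi]; a[hi] = t; }
        return i + 1;
    }

    static void quickSortDesc(int a[], int lo, int hi)
    {
        while (lo < hi) {
            int const p = partitionDesc(a, lo, hi);
            if (p - lo < hi - p) { quickSortDesc(a, lo, p - 1); lo = p + 1; }
            else                 { quickSortDesc(a, p + 1, hi); hi = p - 1; }
        }
    }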
 
 /* HUF_buildCTable_wksp() :
  *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
@@ -487,6 +652,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
  */
 static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits)
 {
+    HUF_CElt* const ct = CTable + 1;
     /* fill result into ctable (val, nbBits) */
     int n;
     U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
@@ -502,20 +668,20 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
             min >>= 1;
     }   }
     for (n=0; n<alphabetSize; n++)
-        CTable[huffNode[n].byte].nbBits = huffNode[n].nbBits;   /* push nbBits per symbol, symbol order */
+        HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits);   /* push nbBits per symbol, symbol order */
     for (n=0; n<alphabetSize; n++)
-        CTable[n].val = valPerRank[CTable[n].nbBits]++;   /* assign value within rank, symbol order */
+        HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++);   /* assign value within rank, symbol order */
+    CTable[0] = maxNbBits;
 }
 
-size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
 {
-    HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)workSpace;
+    HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
     nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
     nodeElt* const huffNode = huffNode0+1;
     int nonNullRank;
 
     /* safety checks */
-    if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
     if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
       return ERROR(workSpace_tooSmall);
     if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
@@ -533,99 +699,334 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbo
     maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
     if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC);   /* check fit into table */
 
-    HUF_buildCTableFromTree(tree, huffNode, nonNullRank, maxSymbolValue, maxNbBits);
+    HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits);
 
     return maxNbBits;
 }
 
 size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
 {
+    HUF_CElt const* ct = CTable + 1;
     size_t nbBits = 0;
     int s;
     for (s = 0; s <= (int)maxSymbolValue; ++s) {
-        nbBits += CTable[s].nbBits * count[s];
+        nbBits += HUF_getNbBits(ct[s]) * count[s];
     }
     return nbBits >> 3;
 }
 
 int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+  HUF_CElt const* ct = CTable + 1;
   int bad = 0;
   int s;
   for (s = 0; s <= (int)maxSymbolValue; ++s) {
-    bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
+    bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0);
   }
   return !bad;
 }
 
 size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
 
+/* HUF_CStream_t:
+ * Huffman uses its own BIT_CStream_t implementation.
+ * There are three major differences from BIT_CStream_t:
+ *   1. HUF_addBits() takes a HUF_CElt (size_t) which is
+ *      the pair (nbBits, value) in the format:
+ *      format:
+ *        - Bits [0, 4)            = nbBits
+ *        - Bits [4, 64 - nbBits)  = 0
+ *        - Bits [64 - nbBits, 64) = value
+ *   2. The bitContainer is built from the upper bits and
+ *      right shifted. E.g. to add a new value of N bits
+ *      you right shift the bitContainer by N, then or in
+ *      the new value into the N upper bits.
+ *   3. The bitstream has two bit containers. You can add
+ *      bits to the second container and merge them into
+ *      the first container.
+ */
+
+#define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8)
+
+typedef struct {
+    size_t bitContainer[2];
+    size_t bitPos[2];
+
+    BYTE* startPtr;
+    BYTE* ptr;
+    BYTE* endPtr;
+} HUF_CStream_t;
+
+/*! HUF_initCStream():
+ * Initializes the bitstream.
+ * @returns 0 or an error code.
+ */
+static size_t HUF_initCStream(HUF_CStream_t* bitC,
+                                  void* startPtr, size_t dstCapacity)
+{
+    ZSTD_memset(bitC, 0, sizeof(*bitC));
+    bitC->startPtr = (BYTE*)startPtr;
+    bitC->ptr = bitC->startPtr;
+    bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]);
+    if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall);
+    return 0;
+}
+
+/*! HUF_addBits():
+ * Adds the symbol stored in HUF_CElt elt to the bitstream.
+ *
+ * @param elt   The element we're adding. This is a (nbBits, value) pair.
+ *              See the HUF_CStream_t docs for the format.
+ * @param idx   Insert into the bitstream at this idx.
+ * @param kFast This is a template parameter. If the bitstream is guaranteed
+ *              to have at least 4 unused bits after this call it may be 1,
+ *              otherwise it must be 0. HUF_addBits() is faster when fast is set.
+ */
+FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast)
+{
+    assert(idx <= 1);
+    assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX);
+    /* This is efficient on x86-64 with BMI2 because shrx
+     * only reads the low 6 bits of the register. The compiler
+     * knows this and elides the mask. When fast is set,
+     * every operation can use the same value loaded from elt.
+     */
+    bitC->bitContainer[idx] >>= HUF_getNbBits(elt);
+    bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt);
+    /* We only read the low 8 bits of bitC->bitPos[idx] so it
+     * doesn't matter that the high bits have noise from the value.
+     */
+    bitC->bitPos[idx] += HUF_getNbBitsFast(elt);
+    assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+    /* The last 4-bits of elt are dirty if fast is set,
+     * so we must not be overwriting bits that have already been
+     * inserted into the bit container.
+     */
+#if DEBUGLEVEL >= 1
+    {
+        size_t const nbBits = HUF_getNbBits(elt);
+        size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1;
+        (void)dirtyBits;
+        /* Middle bits are 0. */
+        assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
+        /* We didn't overwrite any bits in the bit container. */
+        assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+        (void)dirtyBits;
+    }
+#endif
+}
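Point 2 of the HUF_CStream_t comment is the core trick: make room by shifting the container down, OR the new value into the freed top bits, and flush whole bytes off the top. A toy model of that container (little-endian 64-bit host assumed, no overflow checks; like HUF_flushBits(), flushTop() requires at least one pending bit, and the output buffer needs 8 bytes of slack since every flush writes a full word):

    #include <stdint.h>
    #include <string.h>

    typedef struct { uint64_t bits; unsigned nbBits; uint8_t *out; } TopBitC;

    /* Requires 0 < nbBits <= 8 per call in this toy. */
    static void addBitsTop(TopBitC *c, uint64_t value, unsigned nbBits)
    {
        c->bits >>= nbBits;                 /* make room at the top */
        c->bits |= value << (64 - nbBits);  /* value lands in the top bits */
        c->nbBits += nbBits;
    }

    static void flushTop(TopBitC *c)
    {
        unsigned const nbBytes = c->nbBits >> 3;
        uint64_t const window = c->bits >> (64 - c->nbBits);  /* top nbBits bits */
        memcpy(c->out, &window, 8);   /* writes 8 bytes; only nbBytes are consumed */
        c->out += nbBytes;
        c->nbBits &= 7;   /* leftover (newest) bits are already the top bits */
    }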
+
+FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC)
+{
+    bitC->bitContainer[1] = 0;
+    bitC->bitPos[1] = 0;
+}
+
+/*! HUF_mergeIndex1() :
+ * Merges the bit container @ index 1 into the bit container @ index 0
+ * and zeros the bit container @ index 1.
+ */
+FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC)
+{
+    assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER);
+    bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF);
+    bitC->bitContainer[0] |= bitC->bitContainer[1];
+    bitC->bitPos[0] += bitC->bitPos[1];
+    assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+}
+
+/*! HUF_flushBits() :
+* Flushes the bits in the bit container @ index 0.
+*
+* @post bitPos will be < 8.
+* @param kFast If kFast is set then we must know a-priori that
+*              the bit container will not overflow.
+*/
+FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast)
+{
+    /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */
+    size_t const nbBits = bitC->bitPos[0] & 0xFF;
+    size_t const nbBytes = nbBits >> 3;
+    /* The top nbBits bits of bitContainer are the ones we need. */
+    size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits);
+    /* Mask bitPos to account for the bytes we consumed. */
+    bitC->bitPos[0] &= 7;
+    assert(nbBits > 0);
+    assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8);
+    assert(bitC->ptr <= bitC->endPtr);
+    MEM_writeLEST(bitC->ptr, bitContainer);
+    bitC->ptr += nbBytes;
+    assert(!kFast || bitC->ptr <= bitC->endPtr);
+    if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
+    /* bitContainer doesn't need to be modified because the leftover
+     * bits are already the top bitPos bits. And we don't care about
+     * noise in the lower values.
+     */
+}
+
+/*! HUF_endMark()
+ * @returns The Huffman stream end mark: A 1-bit value = 1.
+ */
+static HUF_CElt HUF_endMark(void)
+{
+    HUF_CElt endMark;
+    HUF_setNbBits(&endMark, 1);
+    HUF_setValue(&endMark, 1);
+    return endMark;
+}
+
+/*! HUF_closeCStream() :
+ *  @return Size of CStream, in bytes,
+ *          or 0 if it could not fit into dstBuffer */
+static size_t HUF_closeCStream(HUF_CStream_t* bitC)
+{
+    HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0);
+    HUF_flushBits(bitC, /* kFast */ 0);
+    {
+        size_t const nbBits = bitC->bitPos[0] & 0xFF;
+        if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+        return (bitC->ptr - bitC->startPtr) + (nbBits > 0);
+    }
+}
+
 FORCE_INLINE_TEMPLATE void
-HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
+HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast)
 {
-    BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
+    HUF_addBits(bitCPtr, CTable[symbol], idx, fast);
 }
 
-#define HUF_FLUSHBITS(s)  BIT_flushBits(s)
+FORCE_INLINE_TEMPLATE void
+HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC,
+                                   const BYTE* ip, size_t srcSize,
+                                   const HUF_CElt* ct,
+                                   int kUnroll, int kFastFlush, int kLastFast)
+{
+    /* Join to kUnroll */
+    int n = (int)srcSize;
+    int rem = n % kUnroll;
+    if (rem > 0) {
+        for (; rem > 0; --rem) {
+            HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0);
+        }
+        HUF_flushBits(bitC, kFastFlush);
+    }
+    assert(n % kUnroll == 0);
+
+    /* Join to 2 * kUnroll */
+    if (n % (2 * kUnroll)) {
+        int u;
+        for (u = 1; u < kUnroll; ++u) {
+            HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1);
+        }
+        HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast);
+        HUF_flushBits(bitC, kFastFlush);
+        n -= kUnroll;
+    }
+    assert(n % (2 * kUnroll) == 0);
+
+    for (; n>0; n-= 2 * kUnroll) {
+        /* Encode kUnroll symbols into the bitstream @ index 0. */
+        int u;
+        for (u = 1; u < kUnroll; ++u) {
+            HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1);
+        }
+        HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast);
+        HUF_flushBits(bitC, kFastFlush);
+        /* Encode kUnroll symbols into the bitstream @ index 1.
+         * This allows us to start filling the bit container
+         * without any data dependencies.
+         */
+        HUF_zeroIndex1(bitC);
+        for (u = 1; u < kUnroll; ++u) {
+            HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1);
+        }
+        HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast);
+        /* Merge bitstream @ index 1 into the bitstream @ index 0 */
+        HUF_mergeIndex1(bitC);
+        HUF_flushBits(bitC, kFastFlush);
+    }
+    assert(n == 0);
+
+}
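The second container from point 3 is what this unrolled loop exploits: while container 0's flush is still in flight, container 1 is filled independently and then merged with one shift-and-OR, breaking the serial dependency chain. A toy analogue of HUF_mergeIndex1() in terms of the TopBitC sketch above:

    /* Shift container 0 down by the bits accumulated in container 1,
     * then OR container 1 in; its bits are already top-aligned.
     * Requires c1->nbBits < 64. */
    static void mergeTop(TopBitC *c0, const TopBitC *c1)
    {
        c0->bits >>= c1->nbBits;
        c0->bits |= c1->bits;
        c0->nbBits += c1->nbBits;
    }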
 
-#define HUF_FLUSHBITS_1(stream) \
-    if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream)
+/*
+ * Returns a tight upper bound on the output space needed by Huffman
+ * with 8 bytes buffer to handle over-writes. If the output is at least
+ * this large we don't need to do bounds checks during Huffman encoding.
+ */
+static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog)
+{
+    return ((srcSize * tableLog) >> 3) + 8;
+}
 
-#define HUF_FLUSHBITS_2(stream) \
-    if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream)
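As a worked instance of this bound: each symbol costs at most tableLog bits, so the output is at most srcSize * tableLog / 8 bytes, plus 8 bytes of slack for the over-writing flushes.

    /* A full 128 KB block at the largest fast-path tableLog of 11: */
    size_t const bound = ((size_t)131072 * 11 >> 3) + 8;   /* == 180232 */
    /* Any dstSize >= bound lets every flush take the unchecked kFast path. */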
 
 FORCE_INLINE_TEMPLATE size_t
 HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
                                    const void* src, size_t srcSize,
                                    const HUF_CElt* CTable)
 {
+    U32 const tableLog = (U32)CTable[0];
+    HUF_CElt const* ct = CTable + 1;
     const BYTE* ip = (const BYTE*) src;
     BYTE* const ostart = (BYTE*)dst;
     BYTE* const oend = ostart + dstSize;
     BYTE* op = ostart;
-    size_t n;
-    BIT_CStream_t bitC;
+    HUF_CStream_t bitC;
 
     /* init */
     if (dstSize < 8) return 0;   /* not enough space to compress */
-    { size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op));
+    { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op));
       if (HUF_isError(initErr)) return 0; }
 
-    n = srcSize & ~3;  /* join to mod 4 */
-    switch (srcSize & 3)
-    {
-        case 3:
-            HUF_encodeSymbol(&bitC, ip[n+ 2], CTable);
-            HUF_FLUSHBITS_2(&bitC);
-            ZSTD_FALLTHROUGH;
-        case 2:
-            HUF_encodeSymbol(&bitC, ip[n+ 1], CTable);
-            HUF_FLUSHBITS_1(&bitC);
-            ZSTD_FALLTHROUGH;
-        case 1:
-            HUF_encodeSymbol(&bitC, ip[n+ 0], CTable);
-            HUF_FLUSHBITS(&bitC);
-            ZSTD_FALLTHROUGH;
-        case 0: ZSTD_FALLTHROUGH;
-        default: break;
-    }
-
-    for (; n>0; n-=4) {  /* note : n&3==0 at this stage */
-        HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
-        HUF_FLUSHBITS_1(&bitC);
-        HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
-        HUF_FLUSHBITS_2(&bitC);
-        HUF_encodeSymbol(&bitC, ip[n- 3], CTable);
-        HUF_FLUSHBITS_1(&bitC);
-        HUF_encodeSymbol(&bitC, ip[n- 4], CTable);
-        HUF_FLUSHBITS(&bitC);
-    }
-
-    return BIT_closeCStream(&bitC);
+    if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11)
+        HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 2 : 4, /* kFast */ 0, /* kLastFast */ 0);
+    else {
+        if (MEM_32bits()) {
+            switch (tableLog) {
+            case 11:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0);
+                break;
+            case 10: ZSTD_FALLTHROUGH;
+            case 9: ZSTD_FALLTHROUGH;
+            case 8:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1);
+                break;
+            case 7: ZSTD_FALLTHROUGH;
+            default:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1);
+                break;
+            }
+        } else {
+            switch (tableLog) {
+            case 11:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0);
+                break;
+            case 10:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1);
+                break;
+            case 9:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0);
+                break;
+            case 8:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0);
+                break;
+            case 7:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0);
+                break;
+            case 6: ZSTD_FALLTHROUGH;
+            default:
+                HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1);
+                break;
+            }
+        }
+    }
+    assert(bitC.ptr <= bitC.endPtr);
+
+    return HUF_closeCStream(&bitC);
 }
 
 #if DYNAMIC_BMI2
 
-static TARGET_ATTRIBUTE("bmi2") size_t
+static BMI2_TARGET_ATTRIBUTE size_t
 HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize,
                                    const void* src, size_t srcSize,
                                    const HUF_CElt* CTable)
@@ -667,9 +1068,13 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
 
 size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
 {
-    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+    return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
 }
 
+size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
+{
+    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+}
 
 static size_t
 HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
@@ -689,8 +1094,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
     assert(op <= oend);
     {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
-        if (cSize==0) return 0;
-        assert(cSize <= 65535);
+        if (cSize == 0 || cSize > 65535) return 0;
         MEM_writeLE16(ostart, (U16)cSize);
         op += cSize;
     }
 
@@ -698,8 +1102,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
     ip += segmentSize;
     assert(op <= oend);
     {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
-        if (cSize==0) return 0;
-        assert(cSize <= 65535);
+        if (cSize == 0 || cSize > 65535) return 0;
         MEM_writeLE16(ostart+2, (U16)cSize);
         op += cSize;
     }
 
@@ -707,8 +1110,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
     ip += segmentSize;
     assert(op <= oend);
     {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
-        if (cSize==0) return 0;
-        assert(cSize <= 65535);
+        if (cSize == 0 || cSize > 65535) return 0;
         MEM_writeLE16(ostart+4, (U16)cSize);
         op += cSize;
     }
 
@@ -717,7 +1119,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
     ip += segmentSize;
     assert(op <= oend);
     assert(ip <= iend);
     {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) );
-        if (cSize==0) return 0;
+        if (cSize == 0 || cSize > 65535) return 0;
         op += cSize;
     }
 
@@ -726,7 +1128,12 @@
 size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
 {
-    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+    return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+}
+
+size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
+{
+    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
 }
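The three MEM_writeLE16() calls form the 6-byte jump table of the 4-stream format, which is why the first three compressed segment sizes must each fit in 16 bits; the new `cSize > 65535` check turns what was an assert into a graceful fallback to "incompressible". A hedged reader-side sketch of that layout (MEM_readLE16/BYTE as in zstd's mem.h):

    /* Sketch: locating the four streams inside a 4-stream block.
     * Layout: [cSize1:LE16][cSize2:LE16][cSize3:LE16][s1][s2][s3][s4],
     * with stream 4's size implied by the block's total size. */
    static void HUF_locate4Streams_sketch(const void* src)
    {
        size_t const cSize1 = MEM_readLE16(src);
        size_t const cSize2 = MEM_readLE16((const BYTE*)src + 2);
        size_t const cSize3 = MEM_readLE16((const BYTE*)src + 4);
        const BYTE* const istart1 = (const BYTE*)src + 6;
        const BYTE* const istart2 = istart1 + cSize1;
        const BYTE* const istart3 = istart2 + cSize2;
        const BYTE* const istart4 = istart3 + cSize3;  /* runs to end of block */
        (void)istart4;
    }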
 
 typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
 
@@ -750,35 +1157,38 @@ static size_t HUF_compressCTable_internal(
 
 typedef struct {
     unsigned count[HUF_SYMBOLVALUE_MAX + 1];
-    HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1];
+    HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)];
     union {
         HUF_buildCTable_wksp_tables buildCTable_wksp;
         HUF_WriteCTableWksp writeCTable_wksp;
+        U32 hist_wksp[HIST_WKSP_SIZE_U32];
     } wksps;
 } HUF_compress_tables_t;
 
+#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
+#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10  /* Must be >= 2 */
+
 /* HUF_compress_internal() :
  * `workSpace_align4` must be aligned on 4-bytes boundaries,
- * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U32 unsigned */
+ * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */
 static size_t
 HUF_compress_internal (void* dst, size_t dstSize,
                  const void* src, size_t srcSize,
                        unsigned maxSymbolValue, unsigned huffLog,
                        HUF_nbStreams_e nbStreams,
-                       void* workSpace_align4, size_t wkspSize,
+                       void* workSpace, size_t wkspSize,
                        HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
-                 const int bmi2)
+                 const int bmi2, unsigned suspectUncompressible)
 {
-    HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace_align4;
+    HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t));
     BYTE* const ostart = (BYTE*)dst;
     BYTE* const oend = ostart + dstSize;
     BYTE* op = ostart;
 
-    HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE);
-    assert(((size_t)workSpace_align4 & 3) == 0);   /* must be aligned on 4-bytes boundaries */
+    HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);
 
     /* checks & inits */
-    if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall);
+    if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall);
     if (!srcSize) return 0;  /* Uncompressed */
     if (!dstSize) return 0;  /* cannot fit anything within dst budget */
     if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);   /* current block size limit */
@@ -794,8 +1204,23 @@ HUF_compress_internal (void* dst, size_t dstSize,
                                            nbStreams, oldHufTable, bmi2);
     }
 
+    /* If uncompressible data is suspected, do a smaller sampling first */
+    DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2);
+    if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
+        size_t largestTotal = 0;
+        {   unsigned maxSymbolValueBegin = maxSymbolValue;
+            CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
+            largestTotal += largestBegin;
+        }
+        {   unsigned maxSymbolValueEnd = maxSymbolValue;
+            CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
+            largestTotal += largestEnd;
+        }
+        if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0;   /* heuristic : probably not compressible enough */
+    }
+
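The sampling gate above can be restated as a standalone toy: histogram 4 KB at each end of the input and bail out if both look flat. maxByteCount() here is a stand-in for HIST_count_simple(), and the threshold has the same (size >> 7) + 4 shape as the full-input check further down:

    #include <stddef.h>

    #define SAMPLE_SIZE  4096
    #define SAMPLE_RATIO 10

    /* Largest single-byte count in buf[0..len): a small maximum means a
     * flat histogram, which suggests the data is close to random. */
    static size_t maxByteCount(const unsigned char *buf, size_t len)
    {
        size_t cnt[256] = {0}, best = 0, i;
        for (i = 0; i < len; i++) cnt[buf[i]]++;
        for (i = 0; i < 256; i++) if (cnt[i] > best) best = cnt[i];
        return best;
    }

    /* Returns 1 if the input is probably incompressible (toy mirror). */
    static int looksIncompressible(const unsigned char *src, size_t srcSize)
    {
        size_t total;
        if (srcSize < (size_t)SAMPLE_SIZE * SAMPLE_RATIO) return 0;  /* too small to sample */
        total = maxByteCount(src, SAMPLE_SIZE)
              + maxByteCount(src + srcSize - SAMPLE_SIZE, SAMPLE_SIZE);
        return total <= ((2 * SAMPLE_SIZE) >> 7) + 4;
    }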
     /* Scan input and build symbol stats */
-    {   CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace_align4, wkspSize) );
+    {   CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) );
         if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; }   /* single symbol, rle */
         if (largest <= (srcSize >> 7)+4) return 0;   /* heuristic : probably not compressible enough */
     }
 
@@ -820,9 +1245,12 @@ HUF_compress_internal (void* dst, size_t dstSize,
                                             &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
         CHECK_F(maxBits);
         huffLog = (U32)maxBits;
-        /* Zero unused symbols in CTable, so we can check it for validity */
-        ZSTD_memset(table->CTable + (maxSymbolValue + 1), 0,
-               sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt)));
+    }
+    /* Zero unused symbols in CTable, so we can check it for validity */
+    {
+        size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue);
+        size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt);
+        ZSTD_memset(table->CTable + ctableSize, 0, unusedSize);
     }
 
     /* Write table description header */
@@ -859,19 +1287,20 @@ size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
     return HUF_compress_internal(dst, dstSize, src, srcSize,
                                  maxSymbolValue, huffLog, HUF_singleStream,
                                  workSpace, wkspSize,
-                                 NULL, NULL, 0, 0 /*bmi2*/);
+                                 NULL, NULL, 0, 0 /*bmi2*/, 0);
 }
 
 size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
                       const void* src, size_t srcSize,
                       unsigned maxSymbolValue, unsigned huffLog,
                       void* workSpace, size_t wkspSize,
-                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat,
+                      int bmi2, unsigned suspectUncompressible)
 {
     return HUF_compress_internal(dst, dstSize, src, srcSize,
                                  maxSymbolValue, huffLog, HUF_singleStream,
                                  workSpace, wkspSize, hufTable,
-                                 repeat, preferRepeat, bmi2);
+                                 repeat, preferRepeat, bmi2, suspectUncompressible);
 }
 
 /* HUF_compress4X_repeat():
@@ -885,21 +1314,22 @@ size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
     return HUF_compress_internal(dst, dstSize, src, srcSize,
                                  maxSymbolValue, huffLog, HUF_fourStreams,
                                  workSpace, wkspSize,
-                                 NULL, NULL, 0, 0 /*bmi2*/);
+                                 NULL, NULL, 0, 0 /*bmi2*/, 0);
 }
 
 /* HUF_compress4X_repeat():
  * compress input using 4 streams.
+ * consider skipping quickly
  * re-use an existing huffman compression table */
 size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
                       const void* src, size_t srcSize,
                       unsigned maxSymbolValue, unsigned huffLog,
                       void* workSpace, size_t wkspSize,
-                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible)
 {
     return HUF_compress_internal(dst, dstSize, src, srcSize,
                                  maxSymbolValue, huffLog, HUF_fourStreams,
                                  workSpace, wkspSize,
-                                 hufTable, repeat, preferRepeat, bmi2);
+                                 hufTable, repeat, preferRepeat, bmi2, suspectUncompressible);
 }
+ */ +#ifndef ZSTD_HASHLOG3_MAX +#  define ZSTD_HASHLOG3_MAX 17 +#endif  /*-*************************************  *  Helper functions @@ -69,6 +80,10 @@ struct ZSTD_CDict_s {      ZSTD_customMem customMem;      U32 dictID;      int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ +    ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use +                                           * row-based matchfinder. Unless the cdict is reloaded, we will use +                                           * the same greedy/lazy matchfinder at compression time. +                                           */  };  /* typedef'd to ZSTD_CDict within "zstd.h" */  ZSTD_CCtx* ZSTD_createCCtx(void) @@ -81,7 +96,7 @@ static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager)      assert(cctx != NULL);      ZSTD_memset(cctx, 0, sizeof(*cctx));      cctx->customMem = memManager; -    cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); +    cctx->bmi2 = ZSTD_cpuSupportsBmi2();      {   size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters);          assert(!ZSTD_isError(err));          (void)err; @@ -192,12 +207,64 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs)  /* private API call, for dictBuilder only */  const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } +/* Returns true if the strategy supports using a row based matchfinder */ +static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { +    return (strategy >= ZSTD_greedy && strategy <= ZSTD_lazy2); +} + +/* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder + * for this compression. + */ +static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_paramSwitch_e mode) { +    assert(mode != ZSTD_ps_auto); +    return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable); +} + +/* Returns row matchfinder usage given an initial mode and cParams */ +static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_e mode, +                                                         const ZSTD_compressionParameters* const cParams) { +#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON) +    int const kHasSIMD128 = 1; +#else +    int const kHasSIMD128 = 0; +#endif +    if (mode != ZSTD_ps_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */ +    mode = ZSTD_ps_disable; +    if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; +    if (kHasSIMD128) { +        if (cParams->windowLog > 14) mode = ZSTD_ps_enable; +    } else { +        if (cParams->windowLog > 17) mode = ZSTD_ps_enable; +    } +    return mode; +} + +/* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */ +static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode, +                                                        const ZSTD_compressionParameters* const cParams) { +    if (mode != ZSTD_ps_auto) return mode; +    return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? 
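The resolved values follow a small decision table. A minimal restatement of the row-matchfinder rule above, with worked values in the comments (the helper is illustrative, not part of the source; the ZSTD_ps_* ordering auto < enable < disable comes from this patch):

    /* Illustrative restatement of ZSTD_resolveRowMatchFinderMode():
     * an explicit setting wins; ZSTD_ps_auto enables the row matchfinder
     * only for greedy..lazy2 strategies and sufficiently large windows. */
    static int auto_enables_row_mf(int strategySupported, unsigned windowLog, int hasSIMD128)
    {
        if (!strategySupported)
            return 0;                       /* fast, dfast, btlazy2 and bt* never qualify */
        return windowLog > (hasSIMD128 ? 14u : 17u);
    }
    /* e.g. default level 6 (ZSTD_lazy, windowLog 21) with NEON: 21 > 14 -> enabled;
     * the same level without 128-bit SIMD: 21 > 17 -> still enabled;
     * a 16 KiB window (windowLog 14) never auto-enables it. */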
+
+/* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */
+static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
+                                   const ZSTD_paramSwitch_e useRowMatchFinder,
+                                   const U32 forDDSDict) {
+    assert(useRowMatchFinder != ZSTD_ps_auto);
+    /* We should always allocate a chaintable if we are allocating a matchstate for a DDS dictionary.
+     * We do not allocate a chaintable if we are using ZSTD_fast, or are using the row-based matchfinder.
+     */
+    return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder));
+}
+
 /* Returns 1 if compression parameters are such that we should
  * enable long distance matching (wlog >= 27, strategy >= btopt).
  * Returns 0 otherwise.
  */
-static U32 ZSTD_CParams_shouldEnableLdm(const ZSTD_compressionParameters* const cParams) {
-    return cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27;
+static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
+                                 const ZSTD_compressionParameters* const cParams) {
+    if (mode != ZSTD_ps_auto) return mode;
+    return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable;
 }
 
 static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
@@ -208,15 +275,15 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
     ZSTD_CCtxParams_init(&cctxParams, ZSTD_CLEVEL_DEFAULT);
     cctxParams.cParams = cParams;
 
-    if (ZSTD_CParams_shouldEnableLdm(&cParams)) {
-        DEBUGLOG(4, "ZSTD_makeCCtxParamsFromCParams(): Including LDM into cctx params");
-        cctxParams.ldmParams.enableLdm = 1;
-        /* LDM is enabled by default for optimal parser and window size >= 128MB */
+    /* Adjust advanced params according to cParams */
+    cctxParams.ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams.ldmParams.enableLdm, &cParams);
+    if (cctxParams.ldmParams.enableLdm == ZSTD_ps_enable) {
         ZSTD_ldm_adjustParameters(&cctxParams.ldmParams, &cParams);
         assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog);
         assert(cctxParams.ldmParams.hashRateLog < 32);
     }
-
+    cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
+    cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
     assert(!ZSTD_checkCParams(cParams));
     return cctxParams;
 }
@@ -275,6 +342,11 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par
      * But, set it for tracing anyway.
      */
     cctxParams->compressionLevel = compressionLevel;
+    cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
+    cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
+    cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
+    DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
+                cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
 }
 
 size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
@@ -431,9 +503,9 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
         return bounds;
 
     case ZSTD_c_literalCompressionMode:
-        ZSTD_STATIC_ASSERT(ZSTD_lcm_auto < ZSTD_lcm_huffman && ZSTD_lcm_huffman < ZSTD_lcm_uncompressed);
-        bounds.lowerBound = ZSTD_lcm_auto;
-        bounds.upperBound = ZSTD_lcm_uncompressed;
+        ZSTD_STATIC_ASSERT(ZSTD_ps_auto < ZSTD_ps_enable && ZSTD_ps_enable < ZSTD_ps_disable);
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
         return bounds;
 
     case ZSTD_c_targetCBlockSize:
@@ -462,6 +534,21 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
         bounds.upperBound = 1;
         return bounds;
 
+    case ZSTD_c_useBlockSplitter:
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
+    case ZSTD_c_useRowMatchFinder:
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
+    case ZSTD_c_deterministicRefPrefix:
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
     default:
         bounds.error = ERROR(parameter_unsupported);
         return bounds;
@@ -523,6 +610,9 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
     case ZSTD_c_stableOutBuffer:
     case ZSTD_c_blockDelimiters:
     case ZSTD_c_validateSequences:
+    case ZSTD_c_useBlockSplitter:
+    case ZSTD_c_useRowMatchFinder:
+    case ZSTD_c_deterministicRefPrefix:
     default:
         return 0;
     }
@@ -575,6 +665,9 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
     case ZSTD_c_stableOutBuffer:
     case ZSTD_c_blockDelimiters:
     case ZSTD_c_validateSequences:
+    case ZSTD_c_useBlockSplitter:
+    case ZSTD_c_useRowMatchFinder:
+    case ZSTD_c_deterministicRefPrefix:
         break;
 
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
@@ -672,7 +765,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
     }
 
     case ZSTD_c_literalCompressionMode : {
-        const ZSTD_literalCompressionMode_e lcm = (ZSTD_literalCompressionMode_e)value;
+        const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value;
         BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm);
         CCtxParams->literalCompressionMode = lcm;
         return CCtxParams->literalCompressionMode;
@@ -699,7 +792,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
         return CCtxParams->enableDedicatedDictSearch;
 
     case ZSTD_c_enableLongDistanceMatching :
-        CCtxParams->ldmParams.enableLdm = (value!=0);
+        CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value;
         return CCtxParams->ldmParams.enableLdm;
 
     case ZSTD_c_ldmHashLog :
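The two new matchfinder/splitter parameters share the tri-state ZSTD_paramSwitch_e bounds declared above, while ZSTD_c_deterministicRefPrefix stays boolean. A usage sketch against the advanced API (ZSTD_CCtx_setParameter() and these parameter names appear in this patch; whether a particular kernel consumer can reach them through linux/zstd.h is an assumption):

    /* Sketch: pin the new tri-state parameters instead of leaving ZSTD_ps_auto. */
    static size_t pin_param_switches(ZSTD_CCtx *cctx)
    {
        size_t err;
        err = ZSTD_CCtx_setParameter(cctx, ZSTD_c_useRowMatchFinder, ZSTD_ps_enable);
        if (ZSTD_isError(err))
            return err;
        err = ZSTD_CCtx_setParameter(cctx, ZSTD_c_useBlockSplitter, ZSTD_ps_disable);
        if (ZSTD_isError(err))
            return err;
        /* Leaving either at ZSTD_ps_auto defers to the resolve helpers above. */
        return 0;
    }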
@@ -758,6 +851,21 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
         CCtxParams->validateSequences = value;
         return CCtxParams->validateSequences;
 
+    case ZSTD_c_useBlockSplitter:
+        BOUNDCHECK(ZSTD_c_useBlockSplitter, value);
+        CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value;
+        return CCtxParams->useBlockSplitter;
+
+    case ZSTD_c_useRowMatchFinder:
+        BOUNDCHECK(ZSTD_c_useRowMatchFinder, value);
+        CCtxParams->useRowMatchFinder = (ZSTD_paramSwitch_e)value;
+        return CCtxParams->useRowMatchFinder;
+
+    case ZSTD_c_deterministicRefPrefix:
+        BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value);
+        CCtxParams->deterministicRefPrefix = !!value;
+        return CCtxParams->deterministicRefPrefix;
+
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
     }
 }
@@ -863,6 +971,15 @@ size_t ZSTD_CCtxParams_getParameter(
     case ZSTD_c_validateSequences :
         *value = (int)CCtxParams->validateSequences;
         break;
+    case ZSTD_c_useBlockSplitter :
+        *value = (int)CCtxParams->useBlockSplitter;
+        break;
+    case ZSTD_c_useRowMatchFinder :
+        *value = (int)CCtxParams->useRowMatchFinder;
+        break;
+    case ZSTD_c_deterministicRefPrefix:
+        *value = (int)CCtxParams->deterministicRefPrefix;
+        break;
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
     }
     return 0;
@@ -889,7 +1006,7 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams(
     return 0;
 }
 
-ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
+size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
 {
     DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize);
     RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
@@ -969,14 +1086,14 @@ size_t ZSTD_CCtx_loadDictionary_advanced(
     return 0;
 }
 
-ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(
+size_t ZSTD_CCtx_loadDictionary_byReference(
       ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
 {
     return ZSTD_CCtx_loadDictionary_advanced(
             cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto);
 }
 
-ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
 {
     return ZSTD_CCtx_loadDictionary_advanced(
             cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto);
@@ -1146,7 +1263,7 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
         break;
     case ZSTD_cpm_createCDict:
         /* Assume a small source size when creating a dictionary
-         * with an unkown source size.
+         * with an unknown source size.
         */
         if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN)
             srcSize = minSrcSize;
@@ -1220,7 +1337,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
       srcSizeHint = CCtxParams->srcSizeHint;
     }
     cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode);
-    if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
+    if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
     ZSTD_overrideCParams(&cParams, &CCtxParams->cParams);
     assert(!ZSTD_checkCParams(cParams));
     /* srcSizeHint == 0 means 0 */
@@ -1229,9 +1346,14 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
 
 static size_t
 ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
+                       const ZSTD_paramSwitch_e useRowMatchFinder,
+                       const U32 enableDedicatedDictSearch,
                        const U32 forCCtx)
 {
-    size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog);
+    /* chain table size should be 0 for fast or row-hash strategies */
+    size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, enableDedicatedDictSearch && !forCCtx)
+                                ? ((size_t)1 << cParams->chainLog)
+                                : 0;
     size_t const hSize = ((size_t)1) << cParams->hashLog;
     U32    const hashLog3 = (forCCtx && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
     size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;
@@ -1241,43 +1363,53 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
                             + hSize * sizeof(U32)
                             + h3Size * sizeof(U32);
     size_t const optPotentialSpace =
-        ZSTD_cwksp_alloc_size((MaxML+1) * sizeof(U32))
-      + ZSTD_cwksp_alloc_size((MaxLL+1) * sizeof(U32))
-      + ZSTD_cwksp_alloc_size((MaxOff+1) * sizeof(U32))
-      + ZSTD_cwksp_alloc_size((1<<Litbits) * sizeof(U32))
-      + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
-      + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+        ZSTD_cwksp_aligned_alloc_size((MaxML+1) * sizeof(U32))
+      + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32))
+      + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32))
+      + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32))
+      + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
+      + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+    size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)
+                                            ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16))
+                                            : 0;
     size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
                                 ? optPotentialSpace
                                 : 0;
+    size_t const slackSpace = ZSTD_cwksp_slack_space_required();
+
+    /* tables are guaranteed to be sized in multiples of 64 bytes (or 16 uint32_t) */
+    ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >= 4 && ZSTD_WINDOWLOG_MIN >= 4 && ZSTD_CHAINLOG_MIN >= 4);
+    assert(useRowMatchFinder != ZSTD_ps_auto);
+
     DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u",
                 (U32)chainSize, (U32)hSize, (U32)h3Size);
-    return tableSpace + optSpace;
+    return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
 }
 
 static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
         const ZSTD_compressionParameters* cParams,
         const ldmParams_t* ldmParams,
        const int isStatic,
+        const ZSTD_paramSwitch_e useRowMatchFinder,
         const size_t buffInSize,
         const size_t buffOutSize,
         const U64 pledgedSrcSize)
 {
-    size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << cParams->windowLog), pledgedSrcSize));
+    size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize);
     size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
     U32    const divider = (cParams->minMatch==3) ? 3 : 4;
     size_t const maxNbSeq = blockSize / divider;
     size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
-                            + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef))
+                            + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef))
                             + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
     size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE);
     size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t));
-    size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, /* forCCtx */ 1);
+    size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1);
     size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams);
     size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize);
-    size_t const ldmSeqSpace = ldmParams->enableLdm ?
-        ZSTD_cwksp_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0;
+    size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ?
+        ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0;
 
 
     size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize)
@@ -1303,19 +1435,32 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
 {
     ZSTD_compressionParameters const cParams =
                 ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
+    ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder,
+                                                                               &cParams);
 
     RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
     /* estimateCCtxSize is for one-shot compression. So no buffers should
     * be needed. However, we still allocate two 0-sized buffers, which can
     * take space under ASAN.
     */
     return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
-        &cParams, &params->ldmParams, 1, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
+        &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
 }
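Because ZSTD_ps_auto can later resolve either way for a context built from cParams alone, the cParams-based estimators below return the maximum of the two table layouts, so a caller sizing a static workspace needs no extra slack for the matchfinder choice. A usage sketch (ZSTD_getCParams() and ZSTD_initStaticCCtx() are upstream static-linking APIs; their availability to a given caller is an assumption):

    /* Sketch: size a caller-provided buffer for a static CCtx. */
    static ZSTD_CCtx *make_static_cctx(void *buffer, size_t bufferSize, int level)
    {
        ZSTD_compressionParameters const cParams =
            ZSTD_getCParams(level, 0 /* unknown srcSize */, 0 /* no dict */);
        if (bufferSize < ZSTD_estimateCCtxSize_usingCParams(cParams))
            return NULL;    /* workspace too small for either table layout */
        return ZSTD_initStaticCCtx(buffer, bufferSize);
    }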
 
 size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
 {
-    ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams);
-    return ZSTD_estimateCCtxSize_usingCCtxParams(&params);
+    ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams);
+    if (ZSTD_rowMatchFinderSupported(cParams.strategy)) {
+        /* Pick the larger of the estimates with and without the row-based matchfinder (greedy/lazy strategies) */
+        size_t noRowCCtxSize;
+        size_t rowCCtxSize;
+        initialParams.useRowMatchFinder = ZSTD_ps_disable;
+        noRowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
+        initialParams.useRowMatchFinder = ZSTD_ps_enable;
+        rowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
+        return MAX(noRowCCtxSize, rowCCtxSize);
+    } else {
+        return ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
+    }
 }
 
 static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel)
@@ -1355,17 +1500,29 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
         size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered)
                 ? ZSTD_compressBound(blockSize) + 1
                 : 0;
+        ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams);
 
         return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
-            &cParams, &params->ldmParams, 1, inBuffSize, outBuffSize,
+            &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
             ZSTD_CONTENTSIZE_UNKNOWN);
     }
 }
 
 size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams)
 {
-    ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams);
-    return ZSTD_estimateCStreamSize_usingCCtxParams(&params);
+    ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams);
+    if (ZSTD_rowMatchFinderSupported(cParams.strategy)) {
+        /* Pick the larger of the estimates with and without the row-based matchfinder (greedy/lazy strategies) */
+        size_t noRowCCtxSize;
+        size_t rowCCtxSize;
+        initialParams.useRowMatchFinder = ZSTD_ps_disable;
+        noRowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
+        initialParams.useRowMatchFinder = ZSTD_ps_enable;
+        rowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
+        return MAX(noRowCCtxSize, rowCCtxSize);
+    } else {
+        return ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
+    }
 }
 
 static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel)
@@ -1480,20 +1637,27 @@ typedef enum {
     ZSTD_resetTarget_CCtx
 } ZSTD_resetTarget_e;
 
+
 static size_t
 ZSTD_reset_matchState(ZSTD_matchState_t* ms,
                       ZSTD_cwksp* ws,
                 const ZSTD_compressionParameters* cParams,
+                const ZSTD_paramSwitch_e useRowMatchFinder,
                 const ZSTD_compResetPolicy_e crp,
                 const ZSTD_indexResetPolicy_e forceResetIndex,
                 const ZSTD_resetTarget_e forWho)
 {
-    size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog);
+    /* disable chain table allocation for fast or row-based strategies */
+    size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder,
+                                                     ms->dedicatedDictSearch && (forWho == ZSTD_resetTarget_CDict))
+                                ? ((size_t)1 << cParams->chainLog)
+                                : 0;
     size_t const hSize = ((size_t)1) << cParams->hashLog;
     U32    const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
     size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;
 
     DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset);
+    assert(useRowMatchFinder != ZSTD_ps_auto);
     if (forceResetIndex == ZSTDirp_reset) {
         ZSTD_window_init(&ms->window);
         ZSTD_cwksp_mark_tables_dirty(ws);
@@ -1532,11 +1696,23 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
         ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
     }
 
+    if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+        {   /* Row match finder needs an additional table of hashes ("tags") */
+            size_t const tagTableSize = hSize*sizeof(U16);
+            ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+            if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
+        }
+        {   /* Switch to 32-entry rows if searchLog is 5 (or more) */
+            U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
+            assert(cParams->hashLog >= rowLog);
+            ms->rowHashLog = cParams->hashLog - rowLog;
+        }
+    }
+
     ms->cParams = *cParams;
 
     RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
                     "failed a workspace allocation in ZSTD_reset_matchState");
-
     return 0;
 }
 
@@ -1553,61 +1729,87 @@ static int ZSTD_indexTooCloseToMax(ZSTD_window_t w)
     return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN);
 }
 
+/* ZSTD_dictTooBig():
+ * When dictionaries are larger than ZSTD_CHUNKSIZE_MAX they can't be loaded in
+ * one go generically. So we ensure that in that case we reset the tables to zero,
+ * so that we can load as much of the dictionary as possible.
+ */
+static int ZSTD_dictTooBig(size_t const loadedDictSize)
+{
+    return loadedDictSize > ZSTD_CHUNKSIZE_MAX;
+}
+
 /*! ZSTD_resetCCtx_internal() :
-    note : `params` are assumed fully validated at this stage */
+ * @param loadedDictSize The size of the dictionary to be loaded
+ * into the context, if any. If no dictionary is used, or the
+ * dictionary is being attached / copied, then pass 0.
+ * note : `params` are assumed fully validated at this stage.
+ */
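Taken together with ZSTD_indexTooCloseToMax() and ZSTD_dictTooBig() above, this gives the full index-reset policy used inside ZSTD_resetCCtx_internal() below; condensed for reference (a restatement, not new behaviour):

    /* Tables are rebuilt from scratch whenever continuing would be unsafe
     * (indices near overflow), pointless (context never initialized), or
     * lossy (dictionary larger than one ZSTD_CHUNKSIZE_MAX load). */
    static int needs_index_reset(int indexTooClose, int dictTooBig, int initialized)
    {
        return indexTooClose || dictTooBig || !initialized;
    }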
 static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
-                                      ZSTD_CCtx_params params,
+                                      ZSTD_CCtx_params const* params,
                                       U64 const pledgedSrcSize,
+                                      size_t const loadedDictSize,
                                       ZSTD_compResetPolicy_e const crp,
                                       ZSTD_buffered_policy_e const zbuff)
 {
     ZSTD_cwksp* const ws = &zc->workspace;
-    DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u",
-                (U32)pledgedSrcSize, params.cParams.windowLog);
-    assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+    DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d",
+                (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter);
+    assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
 
     zc->isFirstBlock = 1;
 
-    if (params.ldmParams.enableLdm) {
+    /* Set applied params early so we can modify them for LDM,
+     * and point params at the applied params.
+     */
+    zc->appliedParams = *params;
+    params = &zc->appliedParams;
+
+    assert(params->useRowMatchFinder != ZSTD_ps_auto);
+    assert(params->useBlockSplitter != ZSTD_ps_auto);
+    assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
+    if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
         /* Adjust long distance matching parameters */
-        ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
-        assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
-        assert(params.ldmParams.hashRateLog < 32);
+        ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams);
+        assert(params->ldmParams.hashLog >= params->ldmParams.bucketSizeLog);
+        assert(params->ldmParams.hashRateLog < 32);
     }
 
-    {   size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize));
+    {   size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
         size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
-        U32    const divider = (params.cParams.minMatch==3) ? 3 : 4;
+        U32    const divider = (params->cParams.minMatch==3) ? 3 : 4;
         size_t const maxNbSeq = blockSize / divider;
-        size_t const buffOutSize = (zbuff == ZSTDb_buffered && params.outBufferMode == ZSTD_bm_buffered)
+        size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
                 ? ZSTD_compressBound(blockSize) + 1
                 : 0;
-        size_t const buffInSize = (zbuff == ZSTDb_buffered && params.inBufferMode == ZSTD_bm_buffered)
+        size_t const buffInSize = (zbuff == ZSTDb_buffered && params->inBufferMode == ZSTD_bm_buffered)
                 ? windowSize + blockSize
                 : 0;
-        size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize);
+        size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize);
 
         int const indexTooClose = ZSTD_indexTooCloseToMax(zc->blockState.matchState.window);
+        int const dictTooBig = ZSTD_dictTooBig(loadedDictSize);
         ZSTD_indexResetPolicy_e needsIndexReset =
-            (!indexTooClose && zc->initialized) ? ZSTDirp_continue : ZSTDirp_reset;
+            (indexTooClose || dictTooBig || !zc->initialized) ? ZSTDirp_reset : ZSTDirp_continue;
 
         size_t const neededSpace =
             ZSTD_estimateCCtxSize_usingCCtxParams_internal(
-                &params.cParams, &params.ldmParams, zc->staticSize != 0,
+                &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
                 buffInSize, buffOutSize, pledgedSrcSize);
+        int resizeWorkspace;
+
         FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
 
         if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0);
 
-        /* Check if workspace is large enough, alloc a new one if needed */
-        {
+        {   /* Check if workspace is large enough, alloc a new one if needed */
             int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace;
             int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace);
-
+            resizeWorkspace = workspaceTooSmall || workspaceWasteful;
             DEBUGLOG(4, "Need %zu B workspace", neededSpace);
             DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize);
 
-            if (workspaceTooSmall || workspaceWasteful) {
+            if (resizeWorkspace) {
                 DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB",
                             ZSTD_cwksp_sizeof(ws) >> 10,
                             neededSpace >> 10);
@@ -1629,14 +1831,13 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
                 zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
                 RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock");
                 zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, ENTROPY_WORKSPACE_SIZE);
-                RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate entropyWorkspace");
+                RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace");
         }   }
 
         ZSTD_cwksp_clear(ws);
 
         /* init params */
-        zc->appliedParams = params;
-        zc->blockState.matchState.cParams = params.cParams;
+        zc->blockState.matchState.cParams = params->cParams;
         zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
         zc->consumedSrcSize = 0;
         zc->producedCSize = 0;
@@ -1667,11 +1868,11 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
         zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize);
 
         /* ldm bucketOffsets table */
-        if (params.ldmParams.enableLdm) {
+        if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
             /* TODO: avoid memset? */
             size_t const numBuckets =
-                  ((size_t)1) << (params.ldmParams.hashLog -
-                                  params.ldmParams.bucketSizeLog);
+                  ((size_t)1) << (params->ldmParams.hashLog -
+                                  params->ldmParams.bucketSizeLog);
             zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets);
             ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets);
         }
@@ -1687,32 +1888,28 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
         FORWARD_IF_ERROR(ZSTD_reset_matchState(
             &zc->blockState.matchState,
             ws,
-            &params.cParams,
+            &params->cParams,
+            params->useRowMatchFinder,
             crp,
             needsIndexReset,
             ZSTD_resetTarget_CCtx), "");
 
         /* ldm hash table */
-        if (params.ldmParams.enableLdm) {
+        if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
             /* TODO: avoid memset? */
-            size_t const ldmHSize = ((size_t)1) << params.ldmParams.hashLog;
+            size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
             zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
             ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
             zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
             zc->maxNbLdmSequences = maxNbLdmSeq;
 
             ZSTD_window_init(&zc->ldmState.window);
-            ZSTD_window_clear(&zc->ldmState.window);
             zc->ldmState.loadedDictEnd = 0;
         }
 
-        /* Due to alignment, when reusing a workspace, we can actually consume
-         * up to 3 extra bytes for alignment. See the comments in zstd_cwksp.h
-         */
-        assert(ZSTD_cwksp_used(ws) >= neededSpace &&
-               ZSTD_cwksp_used(ws) <= neededSpace + 3);
-
         DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
+        assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace));
+
         zc->initialized = 1;
 
         return 0;
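For scale, the row-matchfinder structures reserved by ZSTD_reset_matchState() above work out as follows; a worked example (hashLog=18 and searchLog=5 are illustrative sample values, not defaults mandated by this patch):

    #include <stdio.h>

    int main(void)
    {
        unsigned const hashLog    = 18;               /* 256 Ki hash cells */
        unsigned const searchLog  = 5;
        /* rowLog = BOUNDED(4, searchLog, 6), as in ZSTD_reset_matchState() */
        unsigned const rowLog     = searchLog < 4 ? 4 : (searchLog > 6 ? 6 : searchLog);
        unsigned const rowHashLog = hashLog - rowLog; /* 13 */
        size_t   const hSize      = (size_t)1 << hashLog;

        printf("%u rows of %u entries, tag table %zu bytes (one U16 per entry)\n",
               1u << rowHashLog, 1u << rowLog, hSize * sizeof(unsigned short));
        /* prints: 8192 rows of 32 entries, tag table 524288 bytes ... */
        return 0;
    }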
@@ -1768,6 +1965,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
                         U64 pledgedSrcSize,
                         ZSTD_buffered_policy_e zbuff)
 {
+    DEBUGLOG(4, "ZSTD_resetCCtx_byAttachingCDict() pledgedSrcSize=%llu",
+                (unsigned long long)pledgedSrcSize);
     {
         ZSTD_compressionParameters adjusted_cdict_cParams = cdict->matchState.cParams;
         unsigned const windowLog = params.cParams.windowLog;
@@ -1783,7 +1982,9 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
         params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize,
                                                      cdict->dictContentSize, ZSTD_cpm_attachDict);
         params.cParams.windowLog = windowLog;
-        FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
+        params.useRowMatchFinder = cdict->useRowMatchFinder;    /* cdict overrides */
+        FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
+                                                 /* loadedDictSize */ 0,
                                                  ZSTDcrp_makeClean, zbuff), "");
         assert(cctx->appliedParams.cParams.strategy == adjusted_cdict_cParams.strategy);
     }
@@ -1827,15 +2028,17 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
     const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams;
 
     assert(!cdict->matchState.dedicatedDictSearch);
-
-    DEBUGLOG(4, "copying dictionary into context");
+    DEBUGLOG(4, "ZSTD_resetCCtx_byCopyingCDict() pledgedSrcSize=%llu",
+                (unsigned long long)pledgedSrcSize);
 
     {   unsigned const windowLog = params.cParams.windowLog;
         assert(windowLog != 0);
         /* Copy only compression parameters related to tables. */
         params.cParams = *cdict_cParams;
         params.cParams.windowLog = windowLog;
-        FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
+        params.useRowMatchFinder = cdict->useRowMatchFinder;
+        FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
+                                                 /* loadedDictSize */ 0,
                                                  ZSTDcrp_leaveDirty, zbuff), "");
         assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy);
         assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog);
@@ -1843,17 +2046,30 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
     }
 
     ZSTD_cwksp_mark_tables_dirty(&cctx->workspace);
+    assert(params.useRowMatchFinder != ZSTD_ps_auto);
 
     /* copy tables */
-    {   size_t const chainSize = (cdict_cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict_cParams->chainLog);
+    {   size_t const chainSize = ZSTD_allocateChainTable(cdict_cParams->strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */)
+                                                            ? ((size_t)1 << cdict_cParams->chainLog)
+                                                            : 0;
         size_t const hSize =  (size_t)1 << cdict_cParams->hashLog;
 
         ZSTD_memcpy(cctx->blockState.matchState.hashTable,
                cdict->matchState.hashTable,
                hSize * sizeof(U32));
-        ZSTD_memcpy(cctx->blockState.matchState.chainTable,
+        /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */
+        if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) {
+            ZSTD_memcpy(cctx->blockState.matchState.chainTable,
                cdict->matchState.chainTable,
                chainSize * sizeof(U32));
+        }
+        /* copy tag table */
+        if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) {
+            size_t const tagTableSize = hSize*sizeof(U16);
+            ZSTD_memcpy(cctx->blockState.matchState.tagTable,
+                cdict->matchState.tagTable,
+                tagTableSize);
+        }
     }
 
     /* Zero the hashTable3, since the cdict never fills it */
@@ -1917,16 +2133,22 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
                             U64 pledgedSrcSize,
                             ZSTD_buffered_policy_e zbuff)
 {
-    DEBUGLOG(5, "ZSTD_copyCCtx_internal");
     RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong,
                     "Can't copy a ctx that's not in init stage.");
-
+    DEBUGLOG(5, "ZSTD_copyCCtx_internal");
     ZSTD_memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem));
     {   ZSTD_CCtx_params params = dstCCtx->requestedParams;
         /* Copy only compression parameters related to tables. */
         params.cParams = srcCCtx->appliedParams.cParams;
+        assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto);
+        assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto);
+        assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto);
+        params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder;
+        params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter;
+        params.ldmParams = srcCCtx->appliedParams.ldmParams;
         params.fParams = fParams;
-        ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize,
+        ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize,
+                                /* loadedDictSize */ 0,
                                 ZSTDcrp_leaveDirty, zbuff);
         assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog);
         assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy);
@@ -1938,7 +2160,11 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
     ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace);
 
     /* copy tables */
-    {   size_t const chainSize = (srcCCtx->appliedParams.cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog);
+    {   size_t const chainSize = ZSTD_allocateChainTable(srcCCtx->appliedParams.cParams.strategy,
+                                                         srcCCtx->appliedParams.useRowMatchFinder,
+                                                         0 /* forDDSDict */)
+                                    ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog)
+                                    : 0;
         size_t const hSize =  (size_t)1 << srcCCtx->appliedParams.cParams.hashLog;
         int const h3log = srcCCtx->blockState.matchState.hashLog3;
         size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0;
@@ -2005,6 +2231,8 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa
     int const nbRows = (int)size / ZSTD_ROWSIZE;
     int cellNb = 0;
     int rowNb;
+    /* Protect special index values < ZSTD_WINDOW_START_INDEX. */
+    U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX;
     assert((size & (ZSTD_ROWSIZE-1)) == 0);  /* multiple of ZSTD_ROWSIZE */
     assert(size < (1U<<31));   /* can be cast to int */
 
@@ -2012,12 +2240,17 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa
     for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
         int column;
         for (column=0; column<ZSTD_ROWSIZE; column++) {
-            if (preserveMark) {
-                U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0;
-                table[cellNb] += adder;
+            U32 newVal;
+            if (preserveMark && table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) {
+                /* This write is pointless, but is required(?) for the compiler
+                 * to auto-vectorize the loop. */
+                newVal = ZSTD_DUBT_UNSORTED_MARK;
+            } else if (table[cellNb] < reducerThreshold) {
+                newVal = 0;
+            } else {
+                newVal = table[cellNb] - reducerValue;
             }
-            if (table[cellNb] < reducerValue) table[cellNb] = 0;
-            else table[cellNb] -= reducerValue;
+            table[cellNb] = newVal;
             cellNb++;
     }   }
 }
@@ -2040,7 +2273,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par
         ZSTD_reduceTable(ms->hashTable, hSize, reducerValue);
     }
 
-    if (params->cParams.strategy != ZSTD_fast) {
+    if (ZSTD_allocateChainTable(params->cParams.strategy, params->useRowMatchFinder, (U32)ms->dedicatedDictSearch)) {
         U32 const chainSize = (U32)1 << params->cParams.chainLog;
         if (params->cParams.strategy == ZSTD_btlazy2)
             ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue);
@@ -2072,14 +2305,14 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
     assert(nbSeq <= seqStorePtr->maxNbSeq);
     for (u=0; u<nbSeq; u++) {
         U32 const llv = sequences[u].litLength;
-        U32 const mlv = sequences[u].matchLength;
+        U32 const mlv = sequences[u].mlBase;
         llCodeTable[u] = (BYTE)ZSTD_LLcode(llv);
-        ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset);
+        ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offBase);
         mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv);
     }
-    if (seqStorePtr->longLengthID==1)
+    if (seqStorePtr->longLengthType==ZSTD_llt_literalLength)
         llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
-    if (seqStorePtr->longLengthID==2)
+    if (seqStorePtr->longLengthType==ZSTD_llt_matchLength)
         mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
 }
 
@@ -2093,10 +2326,161 @@ static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams)
     return (cctxParams->targetCBlockSize != 0);
 }
 
-/* ZSTD_entropyCompressSequences_internal():
- * actually compresses both literals and sequences */
+/* ZSTD_blockSplitterEnabled():
+ * Returns 1 if the block splitting parameter is enabled, 0 otherwise.
+ * If enabled, compression makes a best effort to split blocks in order to improve the compression ratio.
+ * At the time this function is called, the parameter must be finalized. */
+static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams)
+{
+    DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter);
+    assert(cctxParams->useBlockSplitter != ZSTD_ps_auto);
+    return (cctxParams->useBlockSplitter == ZSTD_ps_enable);
+}
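The index-reduction rewrite above replaces two dependent updates with a single select, clamping every index below reducerValue + ZSTD_WINDOW_START_INDEX to zero so that the reserved low indices can never be fabricated by the subtraction. A sketch with a worked example:

    /* Saturating index reduction, as rewritten above. */
    static unsigned reduce_index(unsigned v, unsigned reducerValue, unsigned windowStartIndex)
    {
        unsigned const threshold = reducerValue + windowStartIndex;
        return (v < threshold) ? 0 : v - reducerValue;
    }
    /* e.g. reducerValue = 1000, windowStartIndex = 2:
     *   v = 900  -> 0 (stale entry, clamped)
     *   v = 1001 -> 0 (would land in the reserved range, clamped)
     *   v = 1002 -> 2 (smallest surviving index) */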
 
+/* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types
+ * and size of the sequences statistics
+ */
+typedef struct {
+    U32 LLtype;
+    U32 Offtype;
+    U32 MLtype;
+    size_t size;
+    size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */
+} ZSTD_symbolEncodingTypeStats_t;
+
+/* ZSTD_buildSequencesStatistics():
+ * Returns a ZSTD_symbolEncodingTypeStats_t, or a zstd error code in the `size` field.
+ * Modifies `nextEntropy` to have the appropriate values as a side effect.
+ * nbSeq must be greater than 0.
+ *
+ * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32)
+ */
+static ZSTD_symbolEncodingTypeStats_t
+ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
+                        const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy,
+                              BYTE* dst, const BYTE* const dstEnd,
+                              ZSTD_strategy strategy, unsigned* countWorkspace,
+                              void* entropyWorkspace, size_t entropyWkspSize) {
+    BYTE* const ostart = dst;
+    const BYTE* const oend = dstEnd;
+    BYTE* op = ostart;
+    FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable;
+    FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable;
+    FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable;
+    const BYTE* const ofCodeTable = seqStorePtr->ofCode;
+    const BYTE* const llCodeTable = seqStorePtr->llCode;
+    const BYTE* const mlCodeTable = seqStorePtr->mlCode;
+    ZSTD_symbolEncodingTypeStats_t stats;
+
+    stats.lastCountSize = 0;
+    /* convert length/distances into codes */
+    ZSTD_seqToCodes(seqStorePtr);
+    assert(op <= oend);
+    assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */
+    /* build CTable for Literal Lengths */
+    {   unsigned max = MaxLL;
+        size_t const mostFrequent = HIST_countFast_wksp(countWorkspace, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize);   /* can't fail */
+        DEBUGLOG(5, "Building LL table");
+        nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode;
+        stats.LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode,
+                                        countWorkspace, max, mostFrequent, nbSeq,
+                                        LLFSELog, prevEntropy->litlengthCTable,
+                                        LL_defaultNorm, LL_defaultNormLog,
+                                        ZSTD_defaultAllowed, strategy);
+        assert(set_basic < set_compressed && set_rle < set_compressed);
+        assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+        {   size_t const countSize = ZSTD_buildCTable(
+                op, (size_t)(oend - op),
+                CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LLtype,
+                countWorkspace, max, llCodeTable, nbSeq,
+                LL_defaultNorm, LL_defaultNormLog, MaxLL,
+                prevEntropy->litlengthCTable,
+                sizeof(prevEntropy->litlengthCTable),
+                entropyWorkspace, entropyWkspSize);
+            if (ZSTD_isError(countSize)) {
+                DEBUGLOG(3, "ZSTD_buildCTable for LitLens failed");
+                stats.size = countSize;
+                return stats;
+            }
+            if (stats.LLtype == set_compressed)
+                stats.lastCountSize = countSize;
+            op += countSize;
+            assert(op <= oend);
+    }   }
+    /* build CTable for Offsets */
+    {   unsigned max = MaxOff;
+        size_t const mostFrequent = HIST_countFast_wksp(
+            countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize);  /* can't fail */
+        /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
+        ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed;
+        DEBUGLOG(5, "Building OF table");
+        nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode;
+        stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode,
+                                        countWorkspace, max, mostFrequent, nbSeq,
+                                        OffFSELog, prevEntropy->offcodeCTable,
+                                        OF_defaultNorm, OF_defaultNormLog,
+                                        defaultPolicy, strategy);
+        assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+        {   size_t const countSize = ZSTD_buildCTable(
+                op, (size_t)(oend - op),
+                CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.Offtype,
+                countWorkspace, max, ofCodeTable, nbSeq,
+                OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+                prevEntropy->offcodeCTable,
+                sizeof(prevEntropy->offcodeCTable),
+                entropyWorkspace, entropyWkspSize);
+            if (ZSTD_isError(countSize)) {
+                DEBUGLOG(3, "ZSTD_buildCTable for Offsets failed");
+                stats.size = countSize;
+                return stats;
+            }
+            if (stats.Offtype == set_compressed)
+                stats.lastCountSize = countSize;
+            op += countSize;
+            assert(op <= oend);
+    }   }
+    /* build CTable for MatchLengths */
+    {   unsigned max = MaxML;
+        size_t const mostFrequent = HIST_countFast_wksp(
+            countWorkspace, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize);   /* can't fail */
+        DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op));
+        nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode;
+        stats.MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode,
+                                        countWorkspace, max, mostFrequent, nbSeq,
+                                        MLFSELog, prevEntropy->matchlengthCTable,
+                                        ML_defaultNorm, ML_defaultNormLog,
+                                        ZSTD_defaultAllowed, strategy);
+        assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+        {   size_t const countSize = ZSTD_buildCTable(
+                op, (size_t)(oend - op),
+                CTable_MatchLength, MLFSELog, (symbolEncodingType_e)stats.MLtype,
+                countWorkspace, max, mlCodeTable, nbSeq,
+                ML_defaultNorm, ML_defaultNormLog, MaxML,
+                prevEntropy->matchlengthCTable,
+                sizeof(prevEntropy->matchlengthCTable),
+                entropyWorkspace, entropyWkspSize);
+            if (ZSTD_isError(countSize)) {
+                DEBUGLOG(3, "ZSTD_buildCTable for MatchLengths failed");
+                stats.size = countSize;
+                return stats;
+            }
+            if (stats.MLtype == set_compressed)
+                stats.lastCountSize = countSize;
+            op += countSize;
+            assert(op <= oend);
+    }   }
+    stats.size = (size_t)(op-ostart);
+    return stats;
+}
+
+/* ZSTD_entropyCompressSeqStore_internal():
+ * compresses both literals and sequences
+ * Returns compressed size of block, or a zstd error.
+ */
+#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20
 MEM_STATIC size_t
-ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr,
+ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
                           const ZSTD_entropyCTables_t* prevEntropy,
                                 ZSTD_entropyCTables_t* nextEntropy,
                           const ZSTD_CCtx_params* cctxParams,
@@ -2110,36 +2494,38 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
     FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable;
     FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable;
     FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable;
-    U32 LLtype, Offtype, MLtype;   /* compressed, raw or rle */
     const seqDef* const sequences = seqStorePtr->sequencesStart;
+    const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
     const BYTE* const ofCodeTable = seqStorePtr->ofCode;
     const BYTE* const llCodeTable = seqStorePtr->llCode;
     const BYTE* const mlCodeTable = seqStorePtr->mlCode;
     BYTE* const ostart = (BYTE*)dst;
     BYTE* const oend = ostart + dstCapacity;
     BYTE* op = ostart;
-    size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
-    BYTE* seqHead;
-    BYTE* lastNCount = NULL;
+    size_t lastCountSize;
 
     entropyWorkspace = count + (MaxSeq + 1);
     entropyWkspSize -= (MaxSeq + 1) * sizeof(*count);
 
-    DEBUGLOG(4, "ZSTD_entropyCompressSequences_internal (nbSeq=%zu)", nbSeq);
+    DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq);
     ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
     assert(entropyWkspSize >= HUF_WORKSPACE_SIZE);
 
     /* Compress literals */
     {   const BYTE* const literals = seqStorePtr->litStart;
+        size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+        size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart;
+        /* Base suspicion of uncompressibility on ratio of literals to sequences */
+        unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
         size_t const litSize = (size_t)(seqStorePtr->lit - literals);
         size_t const cSize = ZSTD_compressLiterals(
                                     &prevEntropy->huf, &nextEntropy->huf,
                                     cctxParams->cParams.strategy,
-                                    ZSTD_disableLiteralsCompression(cctxParams),
+                                    ZSTD_literalsCompressionIsDisabled(cctxParams),
                                     op, dstCapacity,
                                     literals, litSize,
                                     entropyWorkspace, entropyWkspSize,
-                                    bmi2);
+                                    bmi2, suspectUncompressible);
         FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
         assert(cSize <= dstCapacity);
         op += cSize;
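SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO ties the two heuristics together: blocks where matches are rare get flagged, and only flagged blocks pay for the sampling probe added in HUF_compress_internal(). Restated as a sketch (the value 20 is taken from the #define above):

    /* A block is "suspect" when it averages at least 20 literal bytes per
     * sequence, or produced no sequences at all. The flag merely arms the
     * cheap sampling probe; it never skips compression by itself. */
    static unsigned literals_look_suspect(size_t numLiterals, size_t numSequences)
    {
        return (numSequences == 0) || (numLiterals / numSequences >= 20);
    }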
@@ -2165,95 +2551,20 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
         ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
         return (size_t)(op - ostart);
     }
-
-    /* seqHead : flags for FSE encoding type */
-    seqHead = op++;
-    assert(op <= oend);
-
-    /* convert length/distances into codes */
-    ZSTD_seqToCodes(seqStorePtr);
-    /* build CTable for Literal Lengths */
-    {   unsigned max = MaxLL;
-        size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize);   /* can't fail */
-        DEBUGLOG(5, "Building LL table");
-        nextEntropy->fse.litlength_repeatMode = prevEntropy->fse.litlength_repeatMode;
-        LLtype = ZSTD_selectEncodingType(&nextEntropy->fse.litlength_repeatMode,
-                                        count, max, mostFrequent, nbSeq,
-                                        LLFSELog, prevEntropy->fse.litlengthCTable,
-                                        LL_defaultNorm, LL_defaultNormLog,
-                                        ZSTD_defaultAllowed, strategy);
-        assert(set_basic < set_compressed && set_rle < set_compressed);
-        assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
-        {   size_t const countSize = ZSTD_buildCTable(
-                op, (size_t)(oend - op),
-                CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype,
-                count, max, llCodeTable, nbSeq,
-                LL_defaultNorm, LL_defaultNormLog, MaxLL,
-                prevEntropy->fse.litlengthCTable,
-                sizeof(prevEntropy->fse.litlengthCTable),
-                entropyWorkspace, entropyWkspSize);
-            FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed");
-            if (LLtype == set_compressed)
-                lastNCount = op;
-            op += countSize;
-            assert(op <= oend);
-    }   }
-    /* build CTable for Offsets */
-    {   unsigned max = MaxOff;
-        size_t const mostFrequent = HIST_countFast_wksp(
-            count, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize);  /* can't fail */
-        /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
-        ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed;
-        DEBUGLOG(5, "Building OF table");
-        nextEntropy->fse.offcode_repeatMode = prevEntropy->fse.offcode_repeatMode;
-        Offtype = ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repeatMode,
-                                        count, max, mostFrequent, nbSeq,
-                                        OffFSELog, prevEntropy->fse.offcodeCTable,
-                                        OF_defaultNorm, OF_defaultNormLog,
-                                        defaultPolicy, strategy);
-        assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */
-        {   size_t const countSize = ZSTD_buildCTable(
-                op, (size_t)(oend - op),
-                CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype,
-                count, max, ofCodeTable, nbSeq,
-                OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
-                prevEntropy->fse.offcodeCTable,
-                sizeof(prevEntropy->fse.offcodeCTable),
-                entropyWorkspace, entropyWkspSize);
-            FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed");
-            if (Offtype == set_compressed)
-                lastNCount = op;
-            op += countSize;
-            assert(op <= oend);
-    }   }
-    /* build CTable for MatchLengths */
-    {   unsigned max = MaxML;
-        size_t const mostFrequent = HIST_countFast_wksp(
-            count, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize);   /* can't fail */
-        DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op));
-        nextEntropy->fse.matchlength_repeatMode = prevEntropy->fse.matchlength_repeatMode;
-        MLtype = ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_repeatMode,
-                                        count, max, mostFrequent, nbSeq,
-                                        MLFSELog, prevEntropy->fse.matchlengthCTable,
-                                        ML_defaultNorm, ML_defaultNormLog,
-                                        ZSTD_defaultAllowed, strategy);
-        assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
-        {   size_t const countSize = ZSTD_buildCTable(
-                op, (size_t)(oend - op),
-                CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype,
-                count, max, mlCodeTable, nbSeq,
-                ML_defaultNorm, ML_defaultNormLog, MaxML,
-                prevEntropy->fse.matchlengthCTable,
-                sizeof(prevEntropy->fse.matchlengthCTable),
-                entropyWorkspace, entropyWkspSize);
-            FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed");
-            if (MLtype == set_compressed)
-                lastNCount = op;
-            op += countSize;
-            assert(op <= oend);
-    }   }
-
-    *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
+    {
+        ZSTD_symbolEncodingTypeStats_t stats;
+        BYTE* seqHead = op++;
+        /* build stats for sequences */
+        stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
+                                             &prevEntropy->fse, &nextEntropy->fse,
+                                              op, oend,
+                                              strategy, count,
+                                              entropyWorkspace, entropyWkspSize);
+        FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!");
+        *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2));
+        lastCountSize = stats.lastCountSize;
+        op += stats.size;
+    }
FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); +        *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); +        lastCountSize = stats.lastCountSize; +        op += stats.size; +    }      {   size_t const bitstreamSize = ZSTD_encodeSequences(                                          op, (size_t)(oend - op), @@ -2273,9 +2584,9 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr,           * In this exceedingly rare case, we will simply emit an uncompressed           * block, since it isn't worth optimizing.           */ -        if (lastNCount && (op - lastNCount) < 4) { -            /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ -            assert(op - lastNCount == 3); +        if (lastCountSize && (lastCountSize + bitstreamSize) < 4) { +            /* lastCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ +            assert(lastCountSize + bitstreamSize == 3);              DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "                          "emitting an uncompressed block.");              return 0; @@ -2287,7 +2598,7 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr,  }  MEM_STATIC size_t -ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr, +ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,                         const ZSTD_entropyCTables_t* prevEntropy,                               ZSTD_entropyCTables_t* nextEntropy,                         const ZSTD_CCtx_params* cctxParams, @@ -2296,7 +2607,7 @@ ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr,                               void* entropyWorkspace, size_t entropyWkspSize,                               int bmi2)  { -    size_t const cSize = ZSTD_entropyCompressSequences_internal( +    size_t const cSize = ZSTD_entropyCompressSeqStore_internal(                              seqStorePtr, prevEntropy, nextEntropy, cctxParams,                              dst, dstCapacity,                              entropyWorkspace, entropyWkspSize, bmi2); @@ -2306,20 +2617,20 @@ ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr,       */      if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity))          return 0;  /* block not compressed */ -    FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSequences_internal failed"); +    FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed");      /* Check compressibility */      {   size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);          if (cSize >= maxCSize) return 0;  /* block not compressed */      } -    DEBUGLOG(4, "ZSTD_entropyCompressSequences() cSize: %zu\n", cSize); +    DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);      return cSize;  }  /* ZSTD_selectBlockCompressor() :   * Not static, but internal use only (used by long distance matcher)   * assumption : strat is a valid strategy */ -ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode) +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode)  {      static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = {          { ZSTD_compressBlock_fast  /* default for 0 */, @@ -2367,7 +2678,28 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMo      ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1);      assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, 
strat)); -    selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; +    DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); +    if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { +        static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { +            { ZSTD_compressBlock_greedy_row, +            ZSTD_compressBlock_lazy_row, +            ZSTD_compressBlock_lazy2_row }, +            { ZSTD_compressBlock_greedy_extDict_row, +            ZSTD_compressBlock_lazy_extDict_row, +            ZSTD_compressBlock_lazy2_extDict_row }, +            { ZSTD_compressBlock_greedy_dictMatchState_row, +            ZSTD_compressBlock_lazy_dictMatchState_row, +            ZSTD_compressBlock_lazy2_dictMatchState_row }, +            { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, +            ZSTD_compressBlock_lazy_dedicatedDictSearch_row, +            ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } +        }; +        DEBUGLOG(4, "Selecting a row-based matchfinder"); +        assert(useRowMatchFinder != ZSTD_ps_auto); +        selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy]; +    } else { +        selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; +    }      assert(selectedCompressor != NULL);      return selectedCompressor;  } @@ -2383,7 +2715,7 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr)  {      ssPtr->lit = ssPtr->litStart;      ssPtr->sequences = ssPtr->sequencesStart; -    ssPtr->longLengthID = 0; +    ssPtr->longLengthType = ZSTD_llt_none;  }  typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; @@ -2430,15 +2762,16 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)                  zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i];          }          if (zc->externSeqStore.pos < zc->externSeqStore.size) { -            assert(!zc->appliedParams.ldmParams.enableLdm); +            assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable);              /* Updates ldmSeqStore.pos */              lastLLSize =                  ZSTD_ldm_blockCompress(&zc->externSeqStore,                                         ms, &zc->seqStore,                                         zc->blockState.nextCBlock->rep, +                                       zc->appliedParams.useRowMatchFinder,                                         src, srcSize);              assert(zc->externSeqStore.pos <= zc->externSeqStore.size); -        } else if (zc->appliedParams.ldmParams.enableLdm) { +        } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {              rawSeqStore_t ldmSeqStore = kNullRawSeqStore;              ldmSeqStore.seq = zc->ldmSequences; @@ -2452,10 +2785,13 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)                  ZSTD_ldm_blockCompress(&ldmSeqStore,                                         ms, &zc->seqStore,                                         zc->blockState.nextCBlock->rep, +                                       zc->appliedParams.useRowMatchFinder,                                         src, srcSize);              assert(ldmSeqStore.pos == ldmSeqStore.size);          } else {   /* not long range mode */ -            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode); +            ZSTD_blockCompressor const blockCompressor = 
ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, +                                                                                    zc->appliedParams.useRowMatchFinder, +                                                                                    dictMode);              ms->ldmSeqStore = NULL;              lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);          } @@ -2483,22 +2819,22 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)      assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1);      ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));      for (i = 0; i < seqStoreSeqSize; ++i) { -        U32 rawOffset = seqStoreSeqs[i].offset - ZSTD_REP_NUM; +        U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM;          outSeqs[i].litLength = seqStoreSeqs[i].litLength; -        outSeqs[i].matchLength = seqStoreSeqs[i].matchLength + MINMATCH; +        outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH;          outSeqs[i].rep = 0;          if (i == seqStore->longLengthPos) { -            if (seqStore->longLengthID == 1) { +            if (seqStore->longLengthType == ZSTD_llt_literalLength) {                  outSeqs[i].litLength += 0x10000; -            } else if (seqStore->longLengthID == 2) { +            } else if (seqStore->longLengthType == ZSTD_llt_matchLength) {                  outSeqs[i].matchLength += 0x10000;              }          } -        if (seqStoreSeqs[i].offset <= ZSTD_REP_NUM) { +        if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) {              /* Derive the correct offset corresponding to a repcode */ -            outSeqs[i].rep = seqStoreSeqs[i].offset; +            outSeqs[i].rep = seqStoreSeqs[i].offBase;              if (outSeqs[i].litLength != 0) {                  rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1];              } else { @@ -2512,9 +2848,9 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)          outSeqs[i].offset = rawOffset;          /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode             so we provide seqStoreSeqs[i].offset - 1 */ -        updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, -                                         seqStoreSeqs[i].offset - 1, -                                         seqStoreSeqs[i].litLength == 0); +        ZSTD_updateRep(updatedRepcodes.rep, +                       seqStoreSeqs[i].offBase - 1, +                       seqStoreSeqs[i].litLength == 0);          literalsRead += outSeqs[i].litLength;      }      /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. @@ -2602,16 +2938,740 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore)      return nbSeqs < 4 && nbLits < 10;  } -static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc) +static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) +{ +    ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; +    bs->prevCBlock = bs->nextCBlock; +    bs->nextCBlock = tmp; +} + +/* Writes the block header */ +static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { +    U32 const cBlockHeader = cSize == 1 ? 
+                        lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : +                        lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); +    MEM_writeLE24(op, cBlockHeader); +    DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); +} + +/* ZSTD_buildBlockEntropyStats_literals() : + *  Builds entropy for the literals. + *  Stores literals block type (raw, rle, compressed, repeat) and + *  huffman description table to hufMetadata. + *  Requires ENTROPY_WORKSPACE_SIZE workspace + *  @return : size of huffman description table or error code */ +static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +                                            const ZSTD_hufCTables_t* prevHuf, +                                                  ZSTD_hufCTables_t* nextHuf, +                                                  ZSTD_hufCTablesMetadata_t* hufMetadata, +                                                  const int literalsCompressionIsDisabled, +                                                  void* workspace, size_t wkspSize) +{ +    BYTE* const wkspStart = (BYTE*)workspace; +    BYTE* const wkspEnd = wkspStart + wkspSize; +    BYTE* const countWkspStart = wkspStart; +    unsigned* const countWksp = (unsigned*)workspace; +    const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); +    BYTE* const nodeWksp = countWkspStart + countWkspSize; +    const size_t nodeWkspSize = wkspEnd-nodeWksp; +    unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +    unsigned huffLog = HUF_TABLELOG_DEFAULT; +    HUF_repeat repeat = prevHuf->repeatMode; +    DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +    /* Prepare nextEntropy assuming reusing the existing table */ +    ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + +    if (literalsCompressionIsDisabled) { +        DEBUGLOG(5, "set_basic - disabled"); +        hufMetadata->hType = set_basic; +        return 0; +    } + +    /* small ? don't even attempt compression (speed opt) */ +#ifndef COMPRESS_LITERALS_SIZE_MIN +#define COMPRESS_LITERALS_SIZE_MIN 63 +#endif +    {   size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; +        if (srcSize <= minLitSize) { +            DEBUGLOG(5, "set_basic - too small"); +            hufMetadata->hType = set_basic; +            return 0; +        } +    } + +    /* Scan input and build symbol stats */ +    {   size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); +        FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); +        if (largest == srcSize) { +            DEBUGLOG(5, "set_rle"); +            hufMetadata->hType = set_rle; +            return 0; +        } +        if (largest <= (srcSize >> 7)+4) { +            DEBUGLOG(5, "set_basic - no gain"); +            hufMetadata->hType = set_basic; +            return 0; +        } +    } + +    /* Validate the previous Huffman table */ +    if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { +        repeat = HUF_repeat_none; +    } + +    /* Build Huffman Tree */ +    ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); +    {   size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, +                                                    maxSymbolValue, huffLog, +                                                    nodeWksp, nodeWkspSize); +        FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); +        huffLog = (U32)maxBits; +        {   /* Build and write the CTable */ +            size_t const newCSize = HUF_estimateCompressedSize( +                    (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +            size_t const hSize = HUF_writeCTable_wksp( +                    hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +                    (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +                    nodeWksp, nodeWkspSize); +            /* Check against repeating the previous CTable */ +            if (repeat != HUF_repeat_none) { +                size_t const oldCSize = HUF_estimateCompressedSize( +                        (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +                if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +                    DEBUGLOG(5, "set_repeat - smaller"); +                    ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +                    hufMetadata->hType = set_repeat; +                    return 0; +                } +            } +            if (newCSize + hSize >= srcSize) { +                DEBUGLOG(5, "set_basic - no gains"); +                ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +                hufMetadata->hType = set_basic; +                return 0; +            } +            DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +            hufMetadata->hType = set_compressed; +            nextHuf->repeatMode = HUF_repeat_check; +            return hSize; +        } +    } +} + + +/* ZSTD_buildDummySequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t with all encoding types as set_basic, + * and updates nextEntropy to the appropriate repeatMode. 
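Taken together with the SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO check earlier in this hunk, the literals path applies two cheap screens before any Huffman table is built: emit set_rle when one byte value fills the block, and fall back to set_basic (raw literals) when the histogram is too flat for Huffman to pay for its table. A minimal stand-alone sketch of that decision rule; the enum and helper below are illustrative stand-ins, not zstd API:

#include <stddef.h>

/* Sketch of the literals-mode decision described above. */
typedef enum { LIT_MODE_BASIC, LIT_MODE_RLE, LIT_MODE_TRY_COMPRESS } litMode_e;

static litMode_e chooseLiteralsMode(const unsigned char* lit, size_t litSize)
{
    size_t count[256] = {0};
    size_t largest = 0;
    size_t i;
    if (litSize == 0) return LIT_MODE_BASIC;
    for (i = 0; i < litSize; ++i) count[lit[i]]++;
    for (i = 0; i < 256; ++i) if (count[i] > largest) largest = count[i];
    if (largest == litSize)
        return LIT_MODE_RLE;             /* one repeated byte : set_rle */
    if (largest <= (litSize >> 7) + 4)
        return LIT_MODE_BASIC;           /* near-flat histogram : set_basic */
    return LIT_MODE_TRY_COMPRESS;        /* worth building a Huffman table */
}

The (litSize >> 7) + 4 bound mirrors the kernel code above: if even the most frequent byte barely exceeds 1/128 of the block, entropy coding cannot recover the table-header cost.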
+ */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +    ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; +    nextEntropy->litlength_repeatMode = FSE_repeat_none; +    nextEntropy->offcode_repeatMode = FSE_repeat_none; +    nextEntropy->matchlength_repeatMode = FSE_repeat_none; +    return stats; +} + +/* ZSTD_buildBlockEntropyStats_sequences() : + *  Builds entropy for the sequences. + *  Stores symbol compression modes and fse table to fseMetadata. + *  Requires ENTROPY_WORKSPACE_SIZE wksp. + *  @return : size of fse tables or error code */ +static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +                                              const ZSTD_fseCTables_t* prevEntropy, +                                                    ZSTD_fseCTables_t* nextEntropy, +                                              const ZSTD_CCtx_params* cctxParams, +                                                    ZSTD_fseCTablesMetadata_t* fseMetadata, +                                                    void* workspace, size_t wkspSize) +{ +    ZSTD_strategy const strategy = cctxParams->cParams.strategy; +    size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; +    BYTE* const ostart = fseMetadata->fseTablesBuffer; +    BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); +    BYTE* op = ostart; +    unsigned* countWorkspace = (unsigned*)workspace; +    unsigned* entropyWorkspace = countWorkspace + (MaxSeq + 1); +    size_t entropyWorkspaceSize = wkspSize - (MaxSeq + 1) * sizeof(*countWorkspace); +    ZSTD_symbolEncodingTypeStats_t stats; + +    DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_sequences (nbSeq=%zu)", nbSeq); +    stats = nbSeq != 0 ? ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, +                                          prevEntropy, nextEntropy, op, oend, +                                          strategy, countWorkspace, +                                          entropyWorkspace, entropyWorkspaceSize) +                       : ZSTD_buildDummySequencesStatistics(nextEntropy); +    FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); +    fseMetadata->llType = (symbolEncodingType_e) stats.LLtype; +    fseMetadata->ofType = (symbolEncodingType_e) stats.Offtype; +    fseMetadata->mlType = (symbolEncodingType_e) stats.MLtype; +    fseMetadata->lastCountSize = stats.lastCountSize; +    return stats.size; +} + + +/* ZSTD_buildBlockEntropyStats() : + *  Builds entropy for the block. 
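The llType/ofType/mlType values recorded in fseMetadata are the 2-bit symbolEncodingType_e codes (set_basic = 0, set_rle = 1, set_compressed = 2, set_repeat = 3) that ultimately land in the sequences-section header byte built in ZSTD_entropyCompressSeqStore_internal() above. A small sketch of that byte's layout per RFC 8878; the helper names are illustrative:

/* Sketch: pack/unpack the Sequences_Section mode byte.
 * Bits 7-6: literal-lengths mode, 5-4: offsets mode, 3-2: match-lengths mode,
 * bits 1-0: reserved, must be zero. */
static unsigned char packSeqHead(unsigned llType, unsigned ofType, unsigned mlType)
{
    return (unsigned char)((llType << 6) | (ofType << 4) | (mlType << 2));
}

static void unpackSeqHead(unsigned char seqHead,
                          unsigned* llType, unsigned* ofType, unsigned* mlType)
{
    *llType = (seqHead >> 6) & 3;
    *ofType = (seqHead >> 4) & 3;
    *mlType = (seqHead >> 2) & 3;
}

Caching the modes in fseMetadata lets ZSTD_estimateBlockSize_symbolType() below reuse the selection instead of re-running ZSTD_selectEncodingType().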
+ *  Requires workspace size ENTROPY_WORKSPACE_SIZE + * + *  @return : 0 on success or error code + */ +size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +                             const ZSTD_entropyCTables_t* prevEntropy, +                                   ZSTD_entropyCTables_t* nextEntropy, +                             const ZSTD_CCtx_params* cctxParams, +                                   ZSTD_entropyCTablesMetadata_t* entropyMetadata, +                                   void* workspace, size_t wkspSize) +{ +    size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; +    entropyMetadata->hufMetadata.hufDesSize = +        ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, +                                            &prevEntropy->huf, &nextEntropy->huf, +                                            &entropyMetadata->hufMetadata, +                                            ZSTD_literalsCompressionIsDisabled(cctxParams), +                                            workspace, wkspSize); +    FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); +    entropyMetadata->fseMetadata.fseTablesSize = +        ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +                                              &prevEntropy->fse, &nextEntropy->fse, +                                              cctxParams, +                                              &entropyMetadata->fseMetadata, +                                              workspace, wkspSize); +    FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildBlockEntropyStats_sequences failed"); +    return 0; +} + +/* Returns the size estimate for the literals section (header + content) of a block */ +static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +                                                const ZSTD_hufCTables_t* huf, +                                                const ZSTD_hufCTablesMetadata_t* hufMetadata, +                                                void* workspace, size_t wkspSize, +                                                int writeEntropy) +{ +    unsigned* const countWksp = (unsigned*)workspace; +    unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +    size_t literalSectionHeaderSize = 3 + (litSize >= 1 KB) + (litSize >= 16 KB); +    U32 singleStream = litSize < 256; + +    if (hufMetadata->hType == set_basic) return litSize; +    else if (hufMetadata->hType == set_rle) return 1; +    else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) { +        size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize); +        if (ZSTD_isError(largest)) return litSize; +        {   size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); +            if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize; +            if (!singleStream) cLitSizeEstimate += 6; /* multi-stream huffman uses 6-byte jump table */ +            return cLitSizeEstimate + literalSectionHeaderSize; +    }   } +    assert(0); /* impossible */ +    return 0; +} + +/* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +                        const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +                        const FSE_CTable* fseCTable, +              
          const U8* additionalBits, +                        short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +                        void* workspace, size_t wkspSize) +{ +    unsigned* const countWksp = (unsigned*)workspace; +    const BYTE* ctp = codeTable; +    const BYTE* const ctStart = ctp; +    const BYTE* const ctEnd = ctStart + nbSeq; +    size_t cSymbolTypeSizeEstimateInBits = 0; +    unsigned max = maxCode; + +    HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize);  /* can't fail */ +    if (type == set_basic) { +        /* We selected this encoding type, so it must be valid. */ +        assert(max <= defaultMax); +        (void)defaultMax; +        cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max); +    } else if (type == set_rle) { +        cSymbolTypeSizeEstimateInBits = 0; +    } else if (type == set_compressed || type == set_repeat) { +        cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max); +    } +    if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) { +        return nbSeq * 10; +    } +    while (ctp < ctEnd) { +        if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp]; +        else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */ +        ctp++; +    } +    return cSymbolTypeSizeEstimateInBits >> 3; +} + +/* Returns the size estimate for the sequences section (header + content) of a block */ +static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +                                                  const BYTE* llCodeTable, +                                                  const BYTE* mlCodeTable, +                                                  size_t nbSeq, +                                                  const ZSTD_fseCTables_t* fseTables, +                                                  const ZSTD_fseCTablesMetadata_t* fseMetadata, +                                                  void* workspace, size_t wkspSize, +                                                  int writeEntropy) +{ +    size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); +    size_t cSeqSizeEstimate = 0; +    cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +                                         fseTables->offcodeCTable, NULL, +                                         OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +                                         workspace, wkspSize); +    cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +                                         fseTables->litlengthCTable, LL_bits, +                                         LL_defaultNorm, LL_defaultNormLog, MaxLL, +                                         workspace, wkspSize); +    cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +                                         fseTables->matchlengthCTable, ML_bits, +                                         ML_defaultNorm, ML_defaultNormLog, MaxML, +                                         workspace, wkspSize); +    if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; +    return cSeqSizeEstimate + sequencesSectionHeaderSize; +} + +/* Returns the size estimate for a given stream of literals, of, ll, ml */ +static size_t 
ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +                                     const BYTE* ofCodeTable, +                                     const BYTE* llCodeTable, +                                     const BYTE* mlCodeTable, +                                     size_t nbSeq, +                                     const ZSTD_entropyCTables_t* entropy, +                                     const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +                                     void* workspace, size_t wkspSize, +                                     int writeLitEntropy, int writeSeqEntropy) { +    size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +                                                         &entropy->huf, &entropyMetadata->hufMetadata, +                                                         workspace, wkspSize, writeLitEntropy); +    size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +                                                         nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +                                                         workspace, wkspSize, writeSeqEntropy); +    return seqSize + literalsSize + ZSTD_blockHeaderSize; +} + +/* Builds entropy statistics and uses them for blocksize estimation. + * + * Returns the estimated compressed size of the seqStore, or a zstd error. + */ +static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +    ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; +    DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); +    FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, +                    &zc->blockState.prevCBlock->entropy, +                    &zc->blockState.nextCBlock->entropy, +                    &zc->appliedParams, +                    entropyMetadata, +                    zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +    return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), +                    seqStore->ofCode, seqStore->llCode, seqStore->mlCode, +                    (size_t)(seqStore->sequences - seqStore->sequencesStart), +                    &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, +                    (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); +} + +/* Returns literals bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { +    size_t literalsBytes = 0; +    size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; +    size_t i; +    for (i = 0; i < nbSeqs; ++i) { +        seqDef seq = seqStore->sequencesStart[i]; +        literalsBytes += seq.litLength; +        if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { +            literalsBytes += 0x10000; +        } +    } +    return literalsBytes; +} + +/* Returns match bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { +    size_t matchBytes = 0; +    size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; +    size_t i; +    for (i = 0; i < nbSeqs; ++i) { +        seqDef seq = seqStore->sequencesStart[i]; +        matchBytes += seq.mlBase + MINMATCH; +        if (i == seqStore->longLengthPos && 
seqStore->longLengthType == ZSTD_llt_matchLength) { +            matchBytes += 0x10000; +        } +    } +    return matchBytes; +} + +/* Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx). + * Stores the result in resultSeqStore. + */ +static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, +                               const seqStore_t* originalSeqStore, +                                     size_t startIdx, size_t endIdx) { +    BYTE* const litEnd = originalSeqStore->lit; +    size_t literalsBytes; +    size_t literalsBytesPreceding = 0; + +    *resultSeqStore = *originalSeqStore; +    if (startIdx > 0) { +        resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +        literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +    } + +    /* Move longLengthPos into the correct position if necessary */ +    if (originalSeqStore->longLengthType != ZSTD_llt_none) { +        if (originalSeqStore->longLengthPos < startIdx || originalSeqStore->longLengthPos > endIdx) { +            resultSeqStore->longLengthType = ZSTD_llt_none; +        } else { +            resultSeqStore->longLengthPos -= (U32)startIdx; +        } +    } +    resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; +    resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +    literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +    resultSeqStore->litStart += literalsBytesPreceding; +    if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { +        /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +        resultSeqStore->lit = litEnd; +    } else { +        resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; +    } +    resultSeqStore->llCode += startIdx; +    resultSeqStore->mlCode += startIdx; +    resultSeqStore->ofCode += startIdx; +} + +/* + * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. + * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ +static U32 +ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +{ +    U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0;  /* [ 0 - 3 ] */ +    assert(STORED_IS_REPCODE(offCode)); +    if (adjustedOffCode == ZSTD_REP_NUM) { +        /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +        assert(rep[0] > 0); +        return rep[0] - 1; +    } +    return rep[adjustedOffCode]; +} + +/* + * ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in offset history that may arise + * due to emission of RLE/raw blocks that disturb the offset history, + * and replaces any repcodes within the seqStore that may be invalid. + * + * dRepcodes are updated as would be on the decompression side. + * cRepcodes are updated exactly in accordance with the seqStore. 
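ZSTD_resolveRepcodeToRawOffset() above encodes the format's repcode lookup: stored repcodes 1-3 index the three-slot history, a zero literal length shifts the index by one, and shifting past the last slot selects the first repcode minus one. The same mapping with the STORED_* macros stripped away, as a self-contained sketch:

#define REP_COUNT 3
/* rep[] holds the three most recent offsets, most recent first. */
static unsigned resolveRepcode(const unsigned rep[REP_COUNT],
                               unsigned repIdx /* 1..3 */, int litLenIsZero)
{
    unsigned const adjusted = repIdx - 1 + (litLenIsZero ? 1 : 0);  /* 0..3 */
    if (adjusted == REP_COUNT)
        return rep[0] - 1;   /* ll == 0 with repcode 3 selects rep[0] - 1 */
    return rep[adjusted];
}

For example, with rep = {8, 16, 24}, repcode 1 resolves to 8 when the sequence carries literals, but to 16 when litLength == 0.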
+ * + * Note : this function assumes seq->offBase respects the following numbering scheme : + *        0 : invalid + *        1-3 : repcode 1-3 + *        4+ : real_offset+3 + */ +static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +                                          seqStore_t* const seqStore, U32 const nbSeq) { +    U32 idx = 0; +    for (; idx < nbSeq; ++idx) { +        seqDef* const seq = seqStore->sequencesStart + idx; +        U32 const ll0 = (seq->litLength == 0); +        U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +        assert(seq->offBase > 0); +        if (STORED_IS_REPCODE(offCode)) { +            U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +            U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); +            /* Adjust simulated decompression repcode history if we come across a mismatch. Replace +             * the repcode with the offset it actually references, determined by the compression +             * repcode history. +             */ +            if (dRawOffset != cRawOffset) { +                seq->offBase = cRawOffset + ZSTD_REP_NUM; +            } +        } +        /* Compression repcode history is always updated with values directly from the unmodified seqStore. +         * Decompression repcode history may use modified seq->offset value taken from compression repcode history. +         */ +        ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +        ZSTD_updateRep(cRepcodes->rep, offCode, ll0); +    } +} + +/* ZSTD_compressSeqStore_singleBlock(): + * Compresses a seqStore into a block with a block header, into the buffer dst. + * + * Returns the total size of that block (including header) or a ZSTD error code. 
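Both histories are advanced through ZSTD_updateRep(); conceptually it keeps the three most recently used offsets in most-recent-first order. A simplified value-based sketch of that rotation (the real code operates on stored repcode indices rather than raw values, so treat this as an approximation):

/* Push a raw offset into a 3-slot repcode history (most recent first).
 * Reusing rep[1] swaps it to the front; anything else shifts the history,
 * which also yields the {rep3, rep1, rep2} rotation the format specifies. */
static void pushRepcode(unsigned rep[3], unsigned rawOffset)
{
    if (rawOffset == rep[0]) return;     /* already at the front */
    if (rawOffset == rep[1]) {
        unsigned const tmp = rep[0];     /* swap slots 0 and 1 */
        rep[0] = rep[1];
        rep[1] = tmp;
    } else {
        rep[2] = rep[1];                 /* shift; front takes the new offset */
        rep[1] = rep[0];
        rep[0] = rawOffset;
    }
}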
+ */
+static size_t
+ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore,
+                                  repcodes_t* const dRep, repcodes_t* const cRep,
+                                  void* dst, size_t dstCapacity,
+                                  const void* src, size_t srcSize,
+                                  U32 lastBlock, U32 isPartition)
 {
-    ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
-    zc->blockState.prevCBlock = zc->blockState.nextCBlock;
-    zc->blockState.nextCBlock = tmp;
+    const U32 rleMaxLength = 25;
+    BYTE* op = (BYTE*)dst;
+    const BYTE* ip = (const BYTE*)src;
+    size_t cSize;
+    size_t cSeqsSize;
+
+    /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */
+    repcodes_t const dRepOriginal = *dRep;
+    DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock");
+    if (isPartition)
+        ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart));
+
+    RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "Block header doesn't fit");
+    cSeqsSize = ZSTD_entropyCompressSeqStore(seqStore,
+                &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
+                &zc->appliedParams,
+                op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize,
+                srcSize,
+                zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
+                zc->bmi2);
+    FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!");
+
+    if (!zc->isFirstBlock &&
+        cSeqsSize < rleMaxLength &&
+        ZSTD_isRLE((BYTE const*)src, srcSize)) {
+        /* We don't want to emit our first block as an RLE even if it qualifies because
+        * doing so will cause the decoder (cli only) to throw a "should consume all input" error.
+        * This is only an issue for zstd <= v1.4.3
+        */
+        cSeqsSize = 1;
+    }
+
+    if (zc->seqCollector.collectSequences) {
+        ZSTD_copyBlockSequences(zc);
+        ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
+        return 0;
+    }
+
+    if (cSeqsSize == 0) {
+        cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock);
+        FORWARD_IF_ERROR(cSize, "Nocompress block failed");
+        DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize);
+        *dRep = dRepOriginal; /* reset simulated decompression repcode history */
+    } else if (cSeqsSize == 1) {
+        cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock);
+        FORWARD_IF_ERROR(cSize, "RLE compress block failed");
+        DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize);
+        *dRep = dRepOriginal; /* reset simulated decompression repcode history */
+    } else {
+        ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
+        writeBlockHeader(op, cSeqsSize, srcSize, lastBlock);
+        cSize = ZSTD_blockHeaderSize + cSeqsSize;
+        DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize);
+    }
+
+    if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+        zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
+    return cSize;
+}
+
+/* Struct to keep track of where we are in our recursive calls.
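Each partition thus leaves ZSTD_compressSeqStore_singleBlock() in one of three shapes: raw (cSeqsSize == 0), RLE (cSeqsSize == 1), or entropy-compressed, and every shape is fronted by the same 3-byte header that writeBlockHeader() emits. A sketch of that header layout per RFC 8878; the helper name is illustrative:

#include <stdint.h>

/* Sketch: zstd block header, 24 bits little-endian.
 * bit 0: Last_Block, bits 1-2: Block_Type (0 raw, 1 RLE, 2 compressed),
 * bits 3-23: Block_Size. For RLE blocks the size field carries the
 * regenerated size while the payload is a single byte. */
static void putBlockHeader(uint8_t dst[3], uint32_t lastBlock,
                           uint32_t blockType, uint32_t size)
{
    uint32_t const header = lastBlock | (blockType << 1) | (size << 3);
    dst[0] = (uint8_t)(header);
    dst[1] = (uint8_t)(header >> 8);
    dst[2] = (uint8_t)(header >> 16);
}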
*/ +typedef struct { +    U32* splitLocations;    /* Array of split indices */ +    size_t idx;             /* The current index within splitLocations being worked on */ +} seqStoreSplits; + +#define MIN_SEQUENCES_BLOCK_SPLITTING 300 + +/* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. + * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then + * we do not recurse. + * + * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * + * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ +static void +ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, +                             ZSTD_CCtx* zc, const seqStore_t* origSeqStore) +{ +    seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +    seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +    seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; +    size_t estimatedOriginalSize; +    size_t estimatedFirstHalfSize; +    size_t estimatedSecondHalfSize; +    size_t midIdx = (startIdx + endIdx)/2; + +    if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +        DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); +        return; +    } +    DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); +    ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); +    ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); +    ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); +    estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); +    estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); +    estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +    DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", +             estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); +    if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { +        return; +    } +    if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { +        ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); +        splits->splitLocations[splits->idx] = (U32)midIdx; +        splits->idx++; +        ZSTD_deriveBlockSplitsHelper(splits, midIdx, endIdx, zc, origSeqStore); +    } +} + +/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. + * + * Returns the number of splits made (which equals the size of the partition table - 1). 
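The helper above is plain recursive bisection: score the whole range, score both halves, and only commit (and recurse into) a split when the two halves together estimate strictly smaller. The control flow in isolation, with an abstract cost callback standing in for ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize():

#include <stddef.h>

/* Sketch of the bisection in ZSTD_deriveBlockSplitsHelper(); cost() stands in
 * for the entropy-based size estimate and ctx for the compression context.
 * splits[] receives chosen midpoints in left-to-right order, as in the real helper. */
typedef size_t (*costFn)(void* ctx, size_t begin, size_t end);

static void deriveSplits(size_t* splits, size_t* nbSplits, size_t maxSplits,
                         size_t begin, size_t end, size_t minRange,
                         costFn cost, void* ctx)
{
    if (end - begin < minRange || *nbSplits >= maxSplits)
        return;  /* too few sequences, or the split table is full */
    {   size_t const mid = (begin + end) / 2;
        if (cost(ctx, begin, mid) + cost(ctx, mid, end) < cost(ctx, begin, end)) {
            deriveSplits(splits, nbSplits, maxSplits, begin, mid, minRange, cost, ctx);
            splits[(*nbSplits)++] = mid;
            deriveSplits(splits, nbSplits, maxSplits, mid, end, minRange, cost, ctx);
        }
    }
}

Because ranges shorter than the minimum stop the recursion, depth is bounded by log2(nbSeq / minRange), matching the MIN_SEQUENCES_BLOCK_SPLITTING note above.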
+ */
+static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) {
+    seqStoreSplits splits = {partitions, 0};
+    if (nbSeq <= 4) {
+        DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split");
+        /* Refuse to split anything with 4 or fewer sequences */
+        return 0;
+    }
+    ZSTD_deriveBlockSplitsHelper(&splits, 0, nbSeq, zc, &zc->seqStore);
+    splits.splitLocations[splits.idx] = nbSeq;
+    DEBUGLOG(5, "ZSTD_deriveBlockSplits: final nb partitions: %zu", splits.idx+1);
+    return splits.idx;
+}
+
+/* ZSTD_compressBlock_splitBlock():
+ * Attempts to split a given block into multiple blocks to improve compression ratio.
+ *
+ * Returns combined size of all blocks (which includes headers), or a ZSTD error code.
+ */
+static size_t
+ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity,
+                                       const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq)
+{
+    size_t cSize = 0;
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    size_t i = 0;
+    size_t srcBytesTotal = 0;
+    U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
+    seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
+    seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore;
+    size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
+
+    /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history
+     * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two
+     * separate repcode histories that simulate repcode history on the compression and decompression sides,
+     * and use the histories to determine whether we must replace a particular repcode with its raw offset.
+     *
+     * 1) cRep gets updated for each partition, regardless of whether the block was emitted as uncompressed
+     *    or RLE. This allows us to retrieve the offset value that an invalid repcode references within
+     *    a nocompress/RLE block.
+     * 2) dRep gets updated only for compressed partitions, and when a repcode gets replaced, will use
+     *    the replacement offset value rather than the original repcode to update the repcode history.
+     *    dRep will also be the final repcode history sent to the next block.
+     *
+     * See ZSTD_seqStore_resolveOffCodes() for more details.
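The cRep/dRep divergence described above can be seen in a toy trace: the compressor's history advances for every partition, but the decoder only parses sequences from compressed partitions, so after a raw or RLE partition the two disagree until the offending repcode is rewritten as a raw offset. A toy illustration under those assumptions, not the kernel's actual data flow:

/* Toy model: after a partition is emitted raw, the decoder-side history
 * (dRep) stands still while the compressor-side view (cRep) has advanced. */
typedef struct { unsigned rep[3]; } hist_t;

static void histPush(hist_t* h, unsigned off)
{
    h->rep[2] = h->rep[1]; h->rep[1] = h->rep[0]; h->rep[0] = off;
}

static int historiesDiverged(void)
{
    hist_t cRep = { {1, 4, 8} };  /* compression-side view */
    hist_t dRep = { {1, 4, 8} };  /* simulated decompression-side view */
    histPush(&cRep, 32);          /* partition emitted raw: only cRep advances */
    /* A later "repcode 1" now means 32 to the compressor but 1 to the decoder,
     * so the sequence is rewritten with raw offset 32; only a compressed
     * partition would then advance dRep with that replacement value. */
    return cRep.rep[0] != dRep.rep[0];  /* 1: histories differ until reconciled */
}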
+     */ +    repcodes_t dRep; +    repcodes_t cRep; +    ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +    ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +    ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + +    DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", +                (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, +                (unsigned)zc->blockState.matchState.nextToUpdate); + +    if (numSplits == 0) { +        size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +                                                                   &dRep, &cRep, +                                                                    op, dstCapacity, +                                                                    ip, blockSize, +                                                                    lastBlock, 0 /* isPartition */); +        FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); +        DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +        assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); +        return cSizeSingleBlock; +    } + +    ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); +    for (i = 0; i <= numSplits; ++i) { +        size_t srcBytes; +        size_t cSizeChunk; +        U32 const lastPartition = (i == numSplits); +        U32 lastBlockEntireSrc = 0; + +        srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); +        srcBytesTotal += srcBytes; +        if (lastPartition) { +            /* This is the final partition, need to account for possible last literals */ +            srcBytes += blockSize - srcBytesTotal; +            lastBlockEntireSrc = lastBlock; +        } else { +            ZSTD_deriveSeqStoreChunk(nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]); +        } + +        cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, currSeqStore, +                                                      &dRep, &cRep, +                                                       op, dstCapacity, +                                                       ip, srcBytes, +                                                       lastBlockEntireSrc, 1 /* isPartition */); +        DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); +        FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + +        ip += srcBytes; +        op += cSizeChunk; +        dstCapacity -= cSizeChunk; +        cSize += cSizeChunk; +        *currSeqStore = *nextSeqStore; +        assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); +    } +    /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +     * for the next block. 
+     */ +    ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); +    return cSize; +} + +static size_t +ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, +                              void* dst, size_t dstCapacity, +                              const void* src, size_t srcSize, U32 lastBlock) +{ +    const BYTE* ip = (const BYTE*)src; +    BYTE* op = (BYTE*)dst; +    U32 nbSeq; +    size_t cSize; +    DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +    assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable); + +    {   const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); +        FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); +        if (bss == ZSTDbss_noCompress) { +            if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) +                zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +            cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); +            FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); +            DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); +            return cSize; +        } +        nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart); +    } + +    cSize = ZSTD_compressBlock_splitBlock_internal(zc, dst, dstCapacity, src, srcSize, lastBlock, nbSeq); +    FORWARD_IF_ERROR(cSize, "Splitting blocks failed!"); +    return cSize;  } -static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, -                                        void* dst, size_t dstCapacity, -                                        const void* src, size_t srcSize, U32 frame) +static size_t +ZSTD_compressBlock_internal(ZSTD_CCtx* zc, +                            void* dst, size_t dstCapacity, +                            const void* src, size_t srcSize, U32 frame)  {      /* This the upper bound for the length of an rle block.       * This isn't the actual upper bound. Finding the real threshold @@ -2632,12 +3692,12 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,      if (zc->seqCollector.collectSequences) {          ZSTD_copyBlockSequences(zc); -        ZSTD_confirmRepcodesAndEntropyTables(zc); +        ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);          return 0;      }      /* encode sequences and literals */ -    cSize = ZSTD_entropyCompressSequences(&zc->seqStore, +    cSize = ZSTD_entropyCompressSeqStore(&zc->seqStore,              &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,              &zc->appliedParams,              dst, dstCapacity, @@ -2645,12 +3705,6 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,              zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,              zc->bmi2); -    if (zc->seqCollector.collectSequences) { -        ZSTD_copyBlockSequences(zc); -        return 0; -    } - -      if (frame &&          /* We don't want to emit our first block as a RLE even if it qualifies because           * doing so will cause the decoder (cli only) to throw a "should consume all input error." @@ -2666,7 +3720,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,  out:      if (!ZSTD_isError(cSize) && cSize > 1) { -        ZSTD_confirmRepcodesAndEntropyTables(zc); +        ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);      }      /* We check that dictionaries have offset codes available for the first       * block. 
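Both the partition path and the frame path shown here gate the RLE downgrade on ZSTD_isRLE(), which only has to decide whether the whole source is one repeated byte. A naive equivalent for illustration (the in-tree routine compares a machine word at a time, but the semantics match):

#include <stddef.h>

/* Naive equivalent of ZSTD_isRLE(): true iff the block is one repeated byte. */
static int isRLE_naive(const unsigned char* src, size_t length)
{
    size_t i;
    if (length == 0) return 0;
    for (i = 1; i < length; ++i)
        if (src[i] != src[0]) return 0;
    return 1;
}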
After the first block, the offcode table might not have large @@ -2719,7 +3773,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,                  size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);                  FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed");                  if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { -                    ZSTD_confirmRepcodesAndEntropyTables(zc); +                    ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);                      return cSize;                  }              } @@ -2759,9 +3813,9 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,                                           void const* ip,                                           void const* iend)  { -    if (ZSTD_window_needOverflowCorrection(ms->window, iend)) { -        U32 const maxDist = (U32)1 << params->cParams.windowLog; -        U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); +    U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); +    U32 const maxDist = (U32)1 << params->cParams.windowLog; +    if (ZSTD_window_needOverflowCorrection(ms->window, cycleLog, maxDist, ms->loadedDictEnd, ip, iend)) {          U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip);          ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);          ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30); @@ -2784,7 +3838,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,  *   Frame is supposed already started (header already produced)  *   @return : compressed size, or an error code  */ -static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, +static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,                                       void* dst, size_t dstCapacity,                                 const void* src, size_t srcSize,                                       U32 lastFrameChunk) @@ -2814,6 +3868,7 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx,          ZSTD_overflowCorrectIfNeeded(              ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize);          ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); +        ZSTD_window_enforceMaxDist(&ms->window, ip, maxDist, &ms->loadedDictEnd, &ms->dictMatchState);          /* Ensure hash/chain table insertion resumes no sooner than lowlimit */          if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit; @@ -2824,6 +3879,10 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx,                  FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed");                  assert(cSize > 0);                  assert(cSize <= blockSize + ZSTD_blockHeaderSize); +            } else if (ZSTD_blockSplitterEnabled(&cctx->appliedParams)) { +                cSize = ZSTD_compressBlock_splitBlock(cctx, op, dstCapacity, ip, blockSize, lastBlock); +                FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_splitBlock failed"); +                assert(cSize > 0 || cctx->seqCollector.collectSequences == 1);              } else {                  cSize = ZSTD_compressBlock_internal(cctx,                                          op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, @@ -2946,7 +4005,7 @@ size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSe  {      
RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong,                      "wrong cctx stage"); -    RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm, +    RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable,                      parameter_unsupported,                      "incompatible with ldm");      cctx->externSeqStore.seq = seq; @@ -2983,11 +4042,12 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,      if (!srcSize) return fhSize;  /* do not generate an empty block if no input */ -    if (!ZSTD_window_update(&ms->window, src, srcSize)) { +    if (!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContiguous)) { +        ms->forceNonContiguous = 0;          ms->nextToUpdate = ms->window.dictLimit;      } -    if (cctx->appliedParams.ldmParams.enableLdm) { -        ZSTD_window_update(&cctx->ldmState.window, src, srcSize); +    if (cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { +        ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceNonContiguous */ 0);      }      if (!frame) { @@ -3055,63 +4115,86 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,  {      const BYTE* ip = (const BYTE*) src;      const BYTE* const iend = ip + srcSize; +    int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; -    ZSTD_window_update(&ms->window, src, srcSize); +    /* Assert that the ms params match the params we're being given */ +    ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +    if (srcSize > ZSTD_CHUNKSIZE_MAX) { +        /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. +         * Dictionaries right at the edge will immediately trigger overflow +         * correction, but I don't want to insert extra constraints here. +         */ +        U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +        /* We must have cleared our windows when our source is this large. */ +        assert(ZSTD_window_isEmpty(ms->window)); +        if (loadLdmDict) +            assert(ZSTD_window_isEmpty(ls->window)); +        /* If the dictionary is too large, only load the suffix of the dictionary. */ +        if (srcSize > maxDictSize) { +            ip = iend - maxDictSize; +            src = ip; +            srcSize = maxDictSize; +        } +    } + +    DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); +    ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0);      ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); +    ms->forceNonContiguous = params->deterministicRefPrefix; -    if (params->ldmParams.enableLdm && ls != NULL) { -        ZSTD_window_update(&ls->window, src, srcSize); +    if (loadLdmDict) { +        ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0);          ls->loadedDictEnd = params->forceWindow ? 
0 : (U32)(iend - ls->window.base);      } -    /* Assert that we the ms params match the params we're being given */ -    ZSTD_assertEqualCParams(params->cParams, ms->cParams); -      if (srcSize <= HASH_READ_SIZE) return 0; -    while (iend - ip > HASH_READ_SIZE) { -        size_t const remaining = (size_t)(iend - ip); -        size_t const chunk = MIN(remaining, ZSTD_CHUNKSIZE_MAX); -        const BYTE* const ichunk = ip + chunk; - -        ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, ichunk); +    ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); -        if (params->ldmParams.enableLdm && ls != NULL) -            ZSTD_ldm_fillHashTable(ls, (const BYTE*)src, (const BYTE*)src + srcSize, &params->ldmParams); +    if (loadLdmDict) +        ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams); -        switch(params->cParams.strategy) -        { -        case ZSTD_fast: -            ZSTD_fillHashTable(ms, ichunk, dtlm); -            break; -        case ZSTD_dfast: -            ZSTD_fillDoubleHashTable(ms, ichunk, dtlm); -            break; +    switch(params->cParams.strategy) +    { +    case ZSTD_fast: +        ZSTD_fillHashTable(ms, iend, dtlm); +        break; +    case ZSTD_dfast: +        ZSTD_fillDoubleHashTable(ms, iend, dtlm); +        break; -        case ZSTD_greedy: -        case ZSTD_lazy: -        case ZSTD_lazy2: -            if (chunk >= HASH_READ_SIZE && ms->dedicatedDictSearch) { -                assert(chunk == remaining); /* must load everything in one go */ -                ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, ichunk-HASH_READ_SIZE); -            } else if (chunk >= HASH_READ_SIZE) { -                ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE); +    case ZSTD_greedy: +    case ZSTD_lazy: +    case ZSTD_lazy2: +        assert(srcSize >= HASH_READ_SIZE); +        if (ms->dedicatedDictSearch) { +            assert(ms->chainTable != NULL); +            ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, iend-HASH_READ_SIZE); +        } else { +            assert(params->useRowMatchFinder != ZSTD_ps_auto); +            if (params->useRowMatchFinder == ZSTD_ps_enable) { +                size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); +                ZSTD_memset(ms->tagTable, 0, tagTableSize); +                ZSTD_row_update(ms, iend-HASH_READ_SIZE); +                DEBUGLOG(4, "Using row-based hash table for lazy dict"); +            } else { +                ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE); +                DEBUGLOG(4, "Using chain-based hash table for lazy dict");              } -            break; - -        case ZSTD_btlazy2:   /* we want the dictionary table fully sorted */ -        case ZSTD_btopt: -        case ZSTD_btultra: -        case ZSTD_btultra2: -            if (chunk >= HASH_READ_SIZE) -                ZSTD_updateTree(ms, ichunk-HASH_READ_SIZE, ichunk); -            break; - -        default: -            assert(0);  /* not possible : not a valid strategy id */          } +        break; + +    case ZSTD_btlazy2:   /* we want the dictionary table fully sorted */ +    case ZSTD_btopt: +    case ZSTD_btultra: +    case ZSTD_btultra2: +        assert(srcSize >= HASH_READ_SIZE); +        ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); +        break; -        ip = ichunk; +    default: +        assert(0);  /* not possible : not a valid strategy id */      }      ms->nextToUpdate = (U32)(iend - ms->window.base); @@ -3250,7 +4333,6 @@ static size_t 
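
With the chunked loading loop gone, a single pass over the (possibly clamped) dictionary primes whichever search structure the strategy needs, and the lazy strategies now pick between the row-based tag table and the classic hash chain. A standalone sketch of that mapping; the labels are descriptive only, and useRowMatchFinder is assumed already resolved away from auto, as the assert above requires.

#include <stdio.h>

typedef enum { S_FAST, S_DFAST, S_GREEDY, S_LAZY, S_LAZY2,
               S_BTLAZY2, S_BTOPT, S_BTULTRA, S_BTULTRA2 } toy_strategy;

static const char *dict_table_for(toy_strategy s, int useRowMatchFinder,
                                  int dedicatedDictSearch)
{
    switch (s) {
    case S_FAST:   return "single hash table";
    case S_DFAST:  return "double hash table";
    case S_GREEDY:
    case S_LAZY:
    case S_LAZY2:
        if (dedicatedDictSearch) return "dedicated dict-search structure";
        return useRowMatchFinder ? "row-based tag table" : "hash chain table";
    default:       return "binary tree, fully sorted";   /* the bt* strategies */
    }
}

int main(void)
{
    printf("%s\n", dict_table_for(S_LAZY2, 1, 0));   /* row-based tag table */
    printf("%s\n", dict_table_for(S_BTOPT, 0, 0));   /* binary tree, fully sorted */
    return 0;
}
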
ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,      const BYTE* const dictEnd = dictPtr + dictSize;      size_t dictID;      size_t eSize; -      ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));      assert(dictSize >= 8);      assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY); @@ -3321,6 +4403,7 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,                                      const ZSTD_CCtx_params* params, U64 pledgedSrcSize,                                      ZSTD_buffered_policy_e zbuff)  { +    size_t const dictContentSize = cdict ? cdict->dictContentSize : dictSize;      DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog);      /* params are supposed to be fully validated at this point */      assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); @@ -3335,7 +4418,8 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,          return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff);      } -    FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, *params, pledgedSrcSize, +    FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, +                                     dictContentSize,                                       ZSTDcrp_makeClean, zbuff) , "");      {   size_t const dictID = cdict ?                  ZSTD_compress_insertDictionary( @@ -3350,7 +4434,7 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,          FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");          assert(dictID <= UINT_MAX);          cctx->dictID = (U32)dictID; -        cctx->dictContentSize = cdict ? cdict->dictContentSize : dictSize; +        cctx->dictContentSize = dictContentSize;      }      return 0;  } @@ -3485,15 +4569,14 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,                           const void* dict,size_t dictSize,                                 ZSTD_parameters params)  { -    ZSTD_CCtx_params cctxParams;      DEBUGLOG(4, "ZSTD_compress_advanced");      FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); -    ZSTD_CCtxParams_init_internal(&cctxParams, &params, ZSTD_NO_CLEVEL); +    ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, &params, ZSTD_NO_CLEVEL);      return ZSTD_compress_advanced_internal(cctx,                                             dst, dstCapacity,                                             src, srcSize,                                             dict, dictSize, -                                           &cctxParams); +                                           &cctx->simpleApiParams);  }  /* Internal */ @@ -3517,14 +4600,13 @@ size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,                           const void* dict, size_t dictSize,                                 int compressionLevel)  { -    ZSTD_CCtx_params cctxParams;      {          ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0, ZSTD_cpm_noAttachDict);          assert(params.fParams.contentSizeFlag == 1); -        ZSTD_CCtxParams_init_internal(&cctxParams, &params, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); +        ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, &params, (compressionLevel == 0) ? 
ZSTD_CLEVEL_DEFAULT: compressionLevel);      }      DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize); -    return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctxParams); +    return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctx->simpleApiParams);  }  size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, @@ -3561,7 +4643,10 @@ size_t ZSTD_estimateCDictSize_advanced(      DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict));      return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict))           + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) -         + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) +         /* enableDedicatedDictSearch == 1 ensures that CDict estimation will not be too small +          * in case we are using DDS with row-hash. */ +         + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams), +                                  /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0)           + (dictLoadMethod == ZSTD_dlm_byRef ? 0              : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *))));  } @@ -3592,9 +4677,6 @@ static size_t ZSTD_initCDict_internal(      assert(!ZSTD_checkCParams(params.cParams));      cdict->matchState.cParams = params.cParams;      cdict->matchState.dedicatedDictSearch = params.enableDedicatedDictSearch; -    if (cdict->matchState.dedicatedDictSearch && dictSize > ZSTD_CHUNKSIZE_MAX) { -        cdict->matchState.dedicatedDictSearch = 0; -    }      if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) {          cdict->dictContent = dictBuffer;      } else { @@ -3615,6 +4697,7 @@ static size_t ZSTD_initCDict_internal(          &cdict->matchState,          &cdict->workspace,          &params.cParams, +        params.useRowMatchFinder,          ZSTDcrp_makeClean,          ZSTDirp_reset,          ZSTD_resetTarget_CDict), ""); @@ -3638,14 +4721,17 @@ static size_t ZSTD_initCDict_internal(  static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize,                                        ZSTD_dictLoadMethod_e dictLoadMethod, -                                      ZSTD_compressionParameters cParams, ZSTD_customMem customMem) +                                      ZSTD_compressionParameters cParams, +                                      ZSTD_paramSwitch_e useRowMatchFinder, +                                      U32 enableDedicatedDictSearch, +                                      ZSTD_customMem customMem)  {      if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL;      {   size_t const workspaceSize =              ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) +              ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + -            ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + +            ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) +              (dictLoadMethod == ZSTD_dlm_byRef ? 
0               : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*))));          void* const workspace = ZSTD_customMalloc(workspaceSize, customMem); @@ -3664,7 +4750,7 @@ static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize,          ZSTD_cwksp_move(&cdict->workspace, &ws);          cdict->customMem = customMem;          cdict->compressionLevel = ZSTD_NO_CLEVEL; /* signals advanced API usage */ - +        cdict->useRowMatchFinder = useRowMatchFinder;          return cdict;      }  } @@ -3686,7 +4772,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize,          &cctxParams, customMem);  } -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( +ZSTD_CDict* ZSTD_createCDict_advanced2(          const void* dict, size_t dictSize,          ZSTD_dictLoadMethod_e dictLoadMethod,          ZSTD_dictContentType_e dictContentType, @@ -3716,10 +4802,13 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2(              &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict);      } +    DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDedicatedDictSearch);      cctxParams.cParams = cParams; +    cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);      cdict = ZSTD_createCDict_advanced_internal(dictSize,                          dictLoadMethod, cctxParams.cParams, +                        cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch,                          customMem);      if (ZSTD_isError( ZSTD_initCDict_internal(cdict, @@ -3788,7 +4877,9 @@ const ZSTD_CDict* ZSTD_initStaticCDict(                                   ZSTD_dictContentType_e dictContentType,                                   ZSTD_compressionParameters cParams)  { -    size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0); +    ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); +    /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */ +    size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0);      size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict))                              + (dictLoadMethod == ZSTD_dlm_byRef ? 0                                 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))) @@ -3813,6 +4904,8 @@ const ZSTD_CDict* ZSTD_initStaticCDict(      ZSTD_CCtxParams_init(&params, 0);      params.cParams = cParams; +    params.useRowMatchFinder = useRowMatchFinder; +    cdict->useRowMatchFinder = useRowMatchFinder;      if (ZSTD_isError( ZSTD_initCDict_internal(cdict,                                                dict, dictSize, @@ -3839,15 +4932,15 @@ unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict)      return cdict->dictID;  } - -/* ZSTD_compressBegin_usingCDict_advanced() : - * cdict must be != NULL */ -size_t ZSTD_compressBegin_usingCDict_advanced( +/* ZSTD_compressBegin_usingCDict_internal() : + * Implementation of various ZSTD_compressBegin_usingCDict* functions. 
+ */ +static size_t ZSTD_compressBegin_usingCDict_internal(      ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict,      ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize)  {      ZSTD_CCtx_params cctxParams; -    DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced"); +    DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_internal");      RETURN_ERROR_IF(cdict==NULL, dictionary_wrong, "NULL pointer!");      /* Initialize the cctxParams from the cdict */      { @@ -3879,25 +4972,48 @@ size_t ZSTD_compressBegin_usingCDict_advanced(                                          ZSTDb_not_buffered);  } + +/* ZSTD_compressBegin_usingCDict_advanced() : + * This function is DEPRECATED. + * cdict must be != NULL */ +size_t ZSTD_compressBegin_usingCDict_advanced( +    ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, +    ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) +{ +    return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, pledgedSrcSize); +} +  /* ZSTD_compressBegin_usingCDict() : - * pledgedSrcSize=0 means "unknown" - * if pledgedSrcSize>0, it will enable contentSizeFlag */ + * cdict must be != NULL */  size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)  {      ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; -    DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag); -    return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); +    return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN);  } -size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, +/*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx,                                  void* dst, size_t dstCapacity,                                  const void* src, size_t srcSize,                                  const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)  { -    FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, srcSize), "");   /* will check if cdict != NULL */ +    FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */      return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);  } +/*! ZSTD_compress_usingCDict_advanced(): + * This function is DEPRECATED. + */ +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, +                                void* dst, size_t dstCapacity, +                                const void* src, size_t srcSize, +                                const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) +{ +    return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); +} +  /*! ZSTD_compress_usingCDict() :   *  Compression using a digested Dictionary.   *  Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. 
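
Both deprecated *_advanced entry points now funnel into a single *_internal implementation. For orientation, the usual caller-side flow through the public one-shot CDict API looks like this; upstream libzstd names are shown (in-kernel code goes through the linux/zstd.h wrappers instead), and compress_with_cdict itself is just an illustrative helper.

#include <zstd.h>

static size_t compress_with_cdict(void *dst, size_t dstCap,
                                  const void *src, size_t srcSize,
                                  const void *dict, size_t dictSize, int level)
{
    size_t cSize = (size_t)-1;           /* reads as an error via ZSTD_isError() */
    ZSTD_CDict *cdict = ZSTD_createCDict(dict, dictSize, level);
    ZSTD_CCtx  *cctx  = ZSTD_createCCtx();
    if (cdict && cctx)
        cSize = ZSTD_compress_usingCDict(cctx, dst, dstCap, src, srcSize, cdict);
    ZSTD_freeCCtx(cctx);                 /* both free functions accept NULL */
    ZSTD_freeCDict(cdict);
    return cSize;
}

Reusing the CDict across many calls is the whole point: the dictionary is digested once at ZSTD_createCDict() time instead of on every compression.
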
@@ -3909,7 +5025,7 @@ size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,                                  const ZSTD_CDict* cdict)  {      ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; -    return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); +    return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams);  } @@ -4313,8 +5429,13 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,      FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */      ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict));   /* single usage */      assert(prefixDict.dict==NULL || cctx->cdict==NULL);    /* only one can be set */ -    if (cctx->cdict) -        params.compressionLevel = cctx->cdict->compressionLevel; /* let cdict take priority in terms of compression level */ +    if (cctx->cdict && !cctx->localDict.cdict) { +        /* Let the cdict's compression level take priority over the requested params. +         * But do not take the cdict's compression level if the "cdict" is actually a localDict +         * generated from ZSTD_initLocalDict(). +         */ +        params.compressionLevel = cctx->cdict->compressionLevel; +    }      DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage");      if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1;  /* auto-fix pledgedSrcSize */      { @@ -4327,11 +5448,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,                  dictSize, mode);      } -    if (ZSTD_CParams_shouldEnableLdm(&params.cParams)) { -        /* Enable LDM by default for optimal parser and window size >= 128MB */ -        DEBUGLOG(4, "LDM enabled by default (window size >= 128MB, strategy >= btopt)"); -        params.ldmParams.enableLdm = 1; -    } +    params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams); +    params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams); +    params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);      {   U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1;          assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); @@ -4436,39 +5555,39 @@ typedef struct {      size_t posInSrc;        /* Number of bytes given by sequences provided so far */  } ZSTD_sequencePosition; -/* Returns a ZSTD error code if sequence is not valid */ -static size_t ZSTD_validateSequence(U32 offCode, U32 matchLength, -                                    size_t posInSrc, U32 windowLog, size_t dictSize, U32 minMatch) { -    size_t offsetBound; -    U32 windowSize = 1 << windowLog; -    /* posInSrc represents the amount of data the the decoder would decode up to this point. +/* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ +static size_t +ZSTD_validateSequence(U32 offCode, U32 matchLength, +                      size_t posInSrc, U32 windowLog, size_t dictSize) +{ +    U32 const windowSize = 1 << windowLog; +    /* posInSrc represents the amount of data the decoder would decode up to this point.       * As long as the amount of data decoded is less than or equal to window size, offsets may be       * larger than the total length of output decoded in order to reference the dict, even larger than       * window size. 
After output surpasses windowSize, we're limited to windowSize offsets again.       */ -    offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; -    RETURN_ERROR_IF(offCode > offsetBound + ZSTD_REP_MOVE, corruption_detected, "Offset too large!"); -    RETURN_ERROR_IF(matchLength < minMatch, corruption_detected, "Matchlength too small"); +    size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +    RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +    RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small");      return 0;  }  /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ -static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) { -    U32 offCode = rawOffset + ZSTD_REP_MOVE; -    U32 repCode = 0; +static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) +{ +    U32 offCode = STORE_OFFSET(rawOffset);      if (!ll0 && rawOffset == rep[0]) { -        repCode = 1; +        offCode = STORE_REPCODE_1;      } else if (rawOffset == rep[1]) { -        repCode = 2 - ll0; +        offCode = STORE_REPCODE(2 - ll0);      } else if (rawOffset == rep[2]) { -        repCode = 3 - ll0; +        offCode = STORE_REPCODE(3 - ll0);      } else if (ll0 && rawOffset == rep[0] - 1) { -        repCode = 3; -    } -    if (repCode) { -        /* ZSTD_storeSeq expects a number in the range [0, 2] to represent a repcode */ -        offCode = repCode - 1; +        offCode = STORE_REPCODE_3;      }      return offCode;  } @@ -4476,18 +5595,17 @@ static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32  /* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of   * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.   
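
The rewritten ZSTD_finalizeOffCode() maps a raw offset onto the sum type consumed by ZSTD_storeSeq(): stored values 0..2 name repcodes 1..3, and anything larger carries offset + ZSTD_REP_MOVE. A self-contained replica with the macros inlined, plus a few worked cases (toy code, same numeric convention as the function above):

#include <assert.h>

#define REP_NUM  3
#define REP_MOVE (REP_NUM - 1)
#define STORE_OFFSET(o)  ((o) + REP_MOVE)
#define STORE_REPCODE(r) ((r) - 1)

static unsigned finalize_offcode(unsigned rawOffset,
                                 const unsigned rep[REP_NUM], unsigned ll0)
{
    unsigned offCode = STORE_OFFSET(rawOffset);
    if (!ll0 && rawOffset == rep[0])         offCode = STORE_REPCODE(1);
    else if (rawOffset == rep[1])            offCode = STORE_REPCODE(2 - ll0);
    else if (rawOffset == rep[2])            offCode = STORE_REPCODE(3 - ll0);
    else if (ll0 && rawOffset == rep[0] - 1) offCode = STORE_REPCODE(3);
    return offCode;
}

int main(void)
{
    unsigned const rep[REP_NUM] = { 8, 4, 1 };
    assert(finalize_offcode(8, rep, 0)   == 0);    /* rep[0] -> repcode 1 */
    assert(finalize_offcode(4, rep, 1)   == 0);    /* litLength==0 shifts: rep[1] -> repcode 1 */
    assert(finalize_offcode(7, rep, 1)   == 2);    /* rep[0]-1 with litLength==0 -> repcode 3 */
    assert(finalize_offcode(100, rep, 0) == 102);  /* real offset -> offset + REP_MOVE */
    return 0;
}
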
*/ -static size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, -                                                             const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, -                                                             const void* src, size_t blockSize) { +static size_t +ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, +                                              ZSTD_sequencePosition* seqPos, +                                        const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +                                        const void* src, size_t blockSize) +{      U32 idx = seqPos->idx;      BYTE const* ip = (BYTE const*)(src);      const BYTE* const iend = ip + blockSize;      repcodes_t updatedRepcodes;      U32 dictSize; -    U32 litLength; -    U32 matchLength; -    U32 ll0; -    U32 offCode;      if (cctx->cdict) {          dictSize = (U32)cctx->cdict->dictContentSize; @@ -4498,23 +5616,22 @@ static size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZS      }      ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));      for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { -        litLength = inSeqs[idx].litLength; -        matchLength = inSeqs[idx].matchLength; -        ll0 = litLength == 0; -        offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); -        updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); +        U32 const litLength = inSeqs[idx].litLength; +        U32 const ll0 = (litLength == 0); +        U32 const matchLength = inSeqs[idx].matchLength; +        U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +        ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);          DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);          if (cctx->appliedParams.validateSequences) {              seqPos->posInSrc += litLength + matchLength;              FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, -                                                cctx->appliedParams.cParams.windowLog, dictSize, -                                                cctx->appliedParams.cParams.minMatch), +                                                cctx->appliedParams.cParams.windowLog, dictSize),                                                  "Sequence validation failed");          }          RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,                          "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); -        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength - MINMATCH); +        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);          ip += matchLength + litLength;      }      ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); @@ -4541,9 +5658,11 @@ static size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZS   * avoid splitting a match, or to avoid splitting a match such that it would produce a match   * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.   
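
In ZSTD_sf_explicitBlockDelimiters mode the input array is self-delimiting: an entry with offset == 0 and matchLength == 0 ends the block, and its litLength covers any trailing literals. A toy block using the public ZSTD_Sequence field layout, with the same termination test as the loop above (the data is purely illustrative):

#include <stdio.h>

typedef struct { unsigned offset, litLength, matchLength, rep; } toy_seq;

int main(void)
{
    toy_seq const block[] = {
        { 32, 5, 11, 0 },   /* 5 literals, then an 11-byte match at distance 32 */
        {  8, 0,  7, 0 },   /* back-to-back match at distance 8 */
        {  0, 3,  0, 0 },   /* delimiter: 3 trailing literals, end of block */
    };
    size_t idx;
    for (idx = 0; block[idx].offset != 0 || block[idx].matchLength != 0; ++idx)
        printf("seq %zu: ll=%u ml=%u of=%u\n", idx,
               block[idx].litLength, block[idx].matchLength, block[idx].offset);
    printf("delimiter at %zu, trailing literals: %u\n", idx, block[idx].litLength);
    return 0;
}
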
*/ -static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, -                                                       const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, -                                                       const void* src, size_t blockSize) { +static size_t +ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, +                                   const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +                                   const void* src, size_t blockSize) +{      U32 idx = seqPos->idx;      U32 startPosInSequence = seqPos->posInSequence;      U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize; @@ -4553,10 +5672,6 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_seq      repcodes_t updatedRepcodes;      U32 bytesAdjustment = 0;      U32 finalMatchSplit = 0; -    U32 litLength; -    U32 matchLength; -    U32 rawOffset; -    U32 offCode;      if (cctx->cdict) {          dictSize = cctx->cdict->dictContentSize; @@ -4570,9 +5685,10 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_seq      ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));      while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) {          const ZSTD_Sequence currSeq = inSeqs[idx]; -        litLength = currSeq.litLength; -        matchLength = currSeq.matchLength; -        rawOffset = currSeq.offset; +        U32 litLength = currSeq.litLength; +        U32 matchLength = currSeq.matchLength; +        U32 const rawOffset = currSeq.offset; +        U32 offCode;          /* Modify the sequence depending on where endPosInSequence lies */          if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { @@ -4625,22 +5741,21 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_seq              }          }          /* Check if this offset can be represented with a repcode */ -        {   U32 ll0 = (litLength == 0); +        {   U32 const ll0 = (litLength == 0);              offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); -            updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); +            ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);          }          if (cctx->appliedParams.validateSequences) {              seqPos->posInSrc += litLength + matchLength;              FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, -                                                   cctx->appliedParams.cParams.windowLog, dictSize, -                                                   cctx->appliedParams.cParams.minMatch), +                                                   cctx->appliedParams.cParams.windowLog, dictSize),                                                     "Sequence validation failed");          }          DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);          RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,                          "Not enough memory allocated. 
Try adjusting ZSTD_c_minMatch."); -        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength - MINMATCH); +        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);          ip += matchLength + litLength;      }      DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); @@ -4665,7 +5780,8 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_seq  typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,                                         const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,                                         const void* src, size_t blockSize); -static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) { +static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) +{      ZSTD_sequenceCopier sequenceCopier = NULL;      assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode));      if (mode == ZSTD_sf_explicitBlockDelimiters) { @@ -4679,12 +5795,15 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)  /* Compress, block-by-block, all of the sequences given.   * - * Returns the cumulative size of all compressed blocks (including their headers), otherwise a ZSTD error. + * Returns the cumulative size of all compressed blocks (including their headers), + * otherwise a ZSTD error.   */ -static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, -                                              void* dst, size_t dstCapacity, -                                              const ZSTD_Sequence* inSeqs, size_t inSeqsSize, -                                              const void* src, size_t srcSize) { +static size_t +ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, +                                void* dst, size_t dstCapacity, +                          const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +                          const void* src, size_t srcSize) +{      size_t cSize = 0;      U32 lastBlock;      size_t blockSize; @@ -4694,7 +5813,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,      BYTE const* ip = (BYTE const*)src;      BYTE* op = (BYTE*)dst; -    ZSTD_sequenceCopier sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); +    ZSTD_sequenceCopier const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters);      DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize);      /* Special case: empty frame */ @@ -4732,7 +5851,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,              continue;          } -        compressedSeqsSize = ZSTD_entropyCompressSequences(&cctx->seqStore, +        compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore,                                  &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy,                                  &cctx->appliedParams,                                  op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, @@ -4764,7 +5883,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,          } else {              U32 cBlockHeader;              /* Error checking and repcodes update */ -            ZSTD_confirmRepcodesAndEntropyTables(cctx); +            ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState);         
     if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)                  cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; @@ -4794,7 +5913,8 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,  size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity,                                const ZSTD_Sequence* inSeqs, size_t inSeqsSize, -                              const void* src, size_t srcSize) { +                              const void* src, size_t srcSize) +{      BYTE* op = (BYTE*)dst;      size_t cSize = 0;      size_t compressedBlocksSize = 0; @@ -4861,117 +5981,11 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)  /*-=====  Pre-defined compression levels  =====-*/ +#include "clevels.h" -#define ZSTD_MAX_CLEVEL     22  int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }  int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; } - -static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = { -{   /* "default" - for any srcSize > 256 KB */ -    /* W,  C,  H,  S,  L, TL, strat */ -    { 19, 12, 13,  1,  6,  1, ZSTD_fast    },  /* base for negative levels */ -    { 19, 13, 14,  1,  7,  0, ZSTD_fast    },  /* level  1 */ -    { 20, 15, 16,  1,  6,  0, ZSTD_fast    },  /* level  2 */ -    { 21, 16, 17,  1,  5,  0, ZSTD_dfast   },  /* level  3 */ -    { 21, 18, 18,  1,  5,  0, ZSTD_dfast   },  /* level  4 */ -    { 21, 18, 19,  2,  5,  2, ZSTD_greedy  },  /* level  5 */ -    { 21, 19, 19,  3,  5,  4, ZSTD_greedy  },  /* level  6 */ -    { 21, 19, 19,  3,  5,  8, ZSTD_lazy    },  /* level  7 */ -    { 21, 19, 19,  3,  5, 16, ZSTD_lazy2   },  /* level  8 */ -    { 21, 19, 20,  4,  5, 16, ZSTD_lazy2   },  /* level  9 */ -    { 22, 20, 21,  4,  5, 16, ZSTD_lazy2   },  /* level 10 */ -    { 22, 21, 22,  4,  5, 16, ZSTD_lazy2   },  /* level 11 */ -    { 22, 21, 22,  5,  5, 16, ZSTD_lazy2   },  /* level 12 */ -    { 22, 21, 22,  5,  5, 32, ZSTD_btlazy2 },  /* level 13 */ -    { 22, 22, 23,  5,  5, 32, ZSTD_btlazy2 },  /* level 14 */ -    { 22, 23, 23,  6,  5, 32, ZSTD_btlazy2 },  /* level 15 */ -    { 22, 22, 22,  5,  5, 48, ZSTD_btopt   },  /* level 16 */ -    { 23, 23, 22,  5,  4, 64, ZSTD_btopt   },  /* level 17 */ -    { 23, 23, 22,  6,  3, 64, ZSTD_btultra },  /* level 18 */ -    { 23, 24, 22,  7,  3,256, ZSTD_btultra2},  /* level 19 */ -    { 25, 25, 23,  7,  3,256, ZSTD_btultra2},  /* level 20 */ -    { 26, 26, 24,  7,  3,512, ZSTD_btultra2},  /* level 21 */ -    { 27, 27, 25,  9,  3,999, ZSTD_btultra2},  /* level 22 */ -}, -{   /* for srcSize <= 256 KB */ -    /* W,  C,  H,  S,  L,  T, strat */ -    { 18, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */ -    { 18, 13, 14,  1,  6,  0, ZSTD_fast    },  /* level  1 */ -    { 18, 14, 14,  1,  5,  0, ZSTD_dfast   },  /* level  2 */ -    { 18, 16, 16,  1,  4,  0, ZSTD_dfast   },  /* level  3 */ -    { 18, 16, 17,  2,  5,  2, ZSTD_greedy  },  /* level  4.*/ -    { 18, 18, 18,  3,  5,  2, ZSTD_greedy  },  /* level  5.*/ -    { 18, 18, 19,  3,  5,  4, ZSTD_lazy    },  /* level  6.*/ -    { 18, 18, 19,  4,  4,  4, ZSTD_lazy    },  /* level  7 */ -    { 18, 18, 19,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */ -    { 18, 18, 19,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */ -    { 18, 18, 19,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */ -    { 18, 18, 19,  5,  4, 12, ZSTD_btlazy2 },  /* level 11.*/ -    { 18, 19, 19,  7,  4, 12, ZSTD_btlazy2 },  /* level 12.*/ -    { 18, 18, 19,  4, 
 4, 16, ZSTD_btopt   },  /* level 13 */ -    { 18, 18, 19,  4,  3, 32, ZSTD_btopt   },  /* level 14.*/ -    { 18, 18, 19,  6,  3,128, ZSTD_btopt   },  /* level 15.*/ -    { 18, 19, 19,  6,  3,128, ZSTD_btultra },  /* level 16.*/ -    { 18, 19, 19,  8,  3,256, ZSTD_btultra },  /* level 17.*/ -    { 18, 19, 19,  6,  3,128, ZSTD_btultra2},  /* level 18.*/ -    { 18, 19, 19,  8,  3,256, ZSTD_btultra2},  /* level 19.*/ -    { 18, 19, 19, 10,  3,512, ZSTD_btultra2},  /* level 20.*/ -    { 18, 19, 19, 12,  3,512, ZSTD_btultra2},  /* level 21.*/ -    { 18, 19, 19, 13,  3,999, ZSTD_btultra2},  /* level 22.*/ -}, -{   /* for srcSize <= 128 KB */ -    /* W,  C,  H,  S,  L,  T, strat */ -    { 17, 12, 12,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */ -    { 17, 12, 13,  1,  6,  0, ZSTD_fast    },  /* level  1 */ -    { 17, 13, 15,  1,  5,  0, ZSTD_fast    },  /* level  2 */ -    { 17, 15, 16,  2,  5,  0, ZSTD_dfast   },  /* level  3 */ -    { 17, 17, 17,  2,  4,  0, ZSTD_dfast   },  /* level  4 */ -    { 17, 16, 17,  3,  4,  2, ZSTD_greedy  },  /* level  5 */ -    { 17, 17, 17,  3,  4,  4, ZSTD_lazy    },  /* level  6 */ -    { 17, 17, 17,  3,  4,  8, ZSTD_lazy2   },  /* level  7 */ -    { 17, 17, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */ -    { 17, 17, 17,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */ -    { 17, 17, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */ -    { 17, 17, 17,  5,  4,  8, ZSTD_btlazy2 },  /* level 11 */ -    { 17, 18, 17,  7,  4, 12, ZSTD_btlazy2 },  /* level 12 */ -    { 17, 18, 17,  3,  4, 12, ZSTD_btopt   },  /* level 13.*/ -    { 17, 18, 17,  4,  3, 32, ZSTD_btopt   },  /* level 14.*/ -    { 17, 18, 17,  6,  3,256, ZSTD_btopt   },  /* level 15.*/ -    { 17, 18, 17,  6,  3,128, ZSTD_btultra },  /* level 16.*/ -    { 17, 18, 17,  8,  3,256, ZSTD_btultra },  /* level 17.*/ -    { 17, 18, 17, 10,  3,512, ZSTD_btultra },  /* level 18.*/ -    { 17, 18, 17,  5,  3,256, ZSTD_btultra2},  /* level 19.*/ -    { 17, 18, 17,  7,  3,512, ZSTD_btultra2},  /* level 20.*/ -    { 17, 18, 17,  9,  3,512, ZSTD_btultra2},  /* level 21.*/ -    { 17, 18, 17, 11,  3,999, ZSTD_btultra2},  /* level 22.*/ -}, -{   /* for srcSize <= 16 KB */ -    /* W,  C,  H,  S,  L,  T, strat */ -    { 14, 12, 13,  1,  5,  1, ZSTD_fast    },  /* base for negative levels */ -    { 14, 14, 15,  1,  5,  0, ZSTD_fast    },  /* level  1 */ -    { 14, 14, 15,  1,  4,  0, ZSTD_fast    },  /* level  2 */ -    { 14, 14, 15,  2,  4,  0, ZSTD_dfast   },  /* level  3 */ -    { 14, 14, 14,  4,  4,  2, ZSTD_greedy  },  /* level  4 */ -    { 14, 14, 14,  3,  4,  4, ZSTD_lazy    },  /* level  5.*/ -    { 14, 14, 14,  4,  4,  8, ZSTD_lazy2   },  /* level  6 */ -    { 14, 14, 14,  6,  4,  8, ZSTD_lazy2   },  /* level  7 */ -    { 14, 14, 14,  8,  4,  8, ZSTD_lazy2   },  /* level  8.*/ -    { 14, 15, 14,  5,  4,  8, ZSTD_btlazy2 },  /* level  9.*/ -    { 14, 15, 14,  9,  4,  8, ZSTD_btlazy2 },  /* level 10.*/ -    { 14, 15, 14,  3,  4, 12, ZSTD_btopt   },  /* level 11.*/ -    { 14, 15, 14,  4,  3, 24, ZSTD_btopt   },  /* level 12.*/ -    { 14, 15, 14,  5,  3, 32, ZSTD_btultra },  /* level 13.*/ -    { 14, 15, 15,  6,  3, 64, ZSTD_btultra },  /* level 14.*/ -    { 14, 15, 15,  7,  3,256, ZSTD_btultra },  /* level 15.*/ -    { 14, 15, 15,  5,  3, 48, ZSTD_btultra2},  /* level 16.*/ -    { 14, 15, 15,  6,  3,128, ZSTD_btultra2},  /* level 17.*/ -    { 14, 15, 15,  7,  3,256, ZSTD_btultra2},  /* level 18.*/ -    { 14, 15, 15,  8,  3,256, ZSTD_btultra2},  /* level 19.*/ -    { 14, 15, 15,  8,  3,512, ZSTD_btultra2},  /* 
level 20.*/ -    { 14, 15, 15,  9,  3,512, ZSTD_btultra2},  /* level 21.*/ -    { 14, 15, 15, 10,  3,999, ZSTD_btultra2},  /* level 22.*/ -}, -}; +int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; }  static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const compressionLevel, size_t const dictSize)  { @@ -4999,7 +6013,7 @@ static int ZSTD_dedicatedDictSearch_isSupported(  {      return (cParams->strategy >= ZSTD_greedy)          && (cParams->strategy <= ZSTD_lazy2) -        && (cParams->hashLog >= cParams->chainLog) +        && (cParams->hashLog > cParams->chainLog)          && (cParams->chainLog <= 24);  } @@ -5018,6 +6032,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams(          case ZSTD_lazy:          case ZSTD_lazy2:              cParams->hashLog -= ZSTD_LAZY_DDSS_BUCKET_LOG; +            if (cParams->hashLog < ZSTD_HASHLOG_MIN) { +                cParams->hashLog = ZSTD_HASHLOG_MIN; +            }              break;          case ZSTD_btlazy2:          case ZSTD_btopt: @@ -5066,6 +6083,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel,      else row = compressionLevel;      {   ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row]; +        DEBUGLOG(5, "ZSTD_getCParams_internal selected tableID: %u row: %u strat: %u", tableID, row, (U32)cp.strategy);          /* acceleration factor */          if (compressionLevel < 0) {              int const clampedCompressionLevel = MAX(ZSTD_minCLevel(), compressionLevel); diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h index 685d2f996cc2..71697a11ae30 100644 --- a/lib/zstd/compress/zstd_compress_internal.h +++ b/lib/zstd/compress/zstd_compress_internal.h @@ -57,7 +57,7 @@ typedef struct {  } ZSTD_localDict;  typedef struct { -    HUF_CElt CTable[HUF_CTABLE_SIZE_U32(255)]; +    HUF_CElt CTable[HUF_CTABLE_SIZE_ST(255)];      HUF_repeat repeatMode;  } ZSTD_hufCTables_t; @@ -75,8 +75,55 @@ typedef struct {      ZSTD_fseCTables_t fse;  } ZSTD_entropyCTables_t; +/* ********************************************* +*  Entropy buffer statistics structs and funcs * +***********************************************/ +/* ZSTD_hufCTablesMetadata_t : + *  Stores Literals Block Type for a super-block in hType, and + *  huffman tree description in hufDesBuffer. + *  hufDesSize refers to the size of huffman tree description in bytes. + *  This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */  typedef struct { -    U32 off;            /* Offset code (offset + ZSTD_REP_MOVE) for the match */ +    symbolEncodingType_e hType; +    BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; +    size_t hufDesSize; +} ZSTD_hufCTablesMetadata_t; + +/* ZSTD_fseCTablesMetadata_t : + *  Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and + *  fse tables in fseTablesBuffer. + *  fseTablesSize refers to the size of fse tables in bytes. + *  This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() */ +typedef struct { +    symbolEncodingType_e llType; +    symbolEncodingType_e ofType; +    symbolEncodingType_e mlType; +    BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; +    size_t fseTablesSize; +    size_t lastCountSize; /* This is to account for bug in 1.3.4. 
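
Together with ZSTD_minCLevel() and ZSTD_maxCLevel(), the new ZSTD_defaultCLevel() completes the level-range queries. A sketch of the usual caller-side clamping against that range; clamp_clevel() is a hypothetical helper, not part of the library (upstream names shown, kernel wrappers differ):

#include <zstd.h>

static int clamp_clevel(int requested)
{
    if (requested == 0) return ZSTD_defaultCLevel();   /* level 0 selects the default */
    if (requested < ZSTD_minCLevel()) return ZSTD_minCLevel();
    if (requested > ZSTD_maxCLevel()) return ZSTD_maxCLevel();
    return requested;
}
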
More detail in ZSTD_entropyCompressSeqStore_internal() */ +} ZSTD_fseCTablesMetadata_t; + +typedef struct { +    ZSTD_hufCTablesMetadata_t hufMetadata; +    ZSTD_fseCTablesMetadata_t fseMetadata; +} ZSTD_entropyCTablesMetadata_t; + +/* ZSTD_buildBlockEntropyStats() : + *  Builds entropy for the block. + *  @return : 0 on success or error code */ +size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +                             const ZSTD_entropyCTables_t* prevEntropy, +                                   ZSTD_entropyCTables_t* nextEntropy, +                             const ZSTD_CCtx_params* cctxParams, +                                   ZSTD_entropyCTablesMetadata_t* entropyMetadata, +                                   void* workspace, size_t wkspSize); + +/* ******************************* +*  Compression internals structs * +*********************************/ + +typedef struct { +    U32 off;            /* Offset sumtype code for the match, using ZSTD_storeSeq() format */      U32 len;            /* Raw length of match */  } ZSTD_match_t; @@ -126,7 +173,7 @@ typedef struct {      U32  offCodeSumBasePrice;    /* to compare to log2(offreq)  */      ZSTD_OptPrice_e priceType;   /* prices can be determined dynamically, or follow a pre-defined cost structure */      const ZSTD_entropyCTables_t* symbolCosts;  /* pre-calculated dictionary statistics */ -    ZSTD_literalCompressionMode_e literalCompressionMode; +    ZSTD_paramSwitch_e literalCompressionMode;  } optState_t;  typedef struct { @@ -135,14 +182,23 @@ typedef struct {  } ZSTD_compressedBlockState_t;  typedef struct { -    BYTE const* nextSrc;    /* next block here to continue on current prefix */ -    BYTE const* base;       /* All regular indexes relative to this position */ -    BYTE const* dictBase;   /* extDict indexes relative to this position */ -    U32 dictLimit;          /* below that point, need extDict */ -    U32 lowLimit;           /* below that point, no more valid data */ +    BYTE const* nextSrc;       /* next block here to continue on current prefix */ +    BYTE const* base;          /* All regular indexes relative to this position */ +    BYTE const* dictBase;      /* extDict indexes relative to this position */ +    U32 dictLimit;             /* below that point, need extDict */ +    U32 lowLimit;              /* below that point, no more valid data */ +    U32 nbOverflowCorrections; /* Number of times overflow correction has run since +                                * ZSTD_window_init(). Useful for debugging coredumps +                                * and for ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY. +                                */  } ZSTD_window_t; +#define ZSTD_WINDOW_START_INDEX 2 +  typedef struct ZSTD_matchState_t ZSTD_matchState_t; + +#define ZSTD_ROW_HASH_CACHE_SIZE 8       /* Size of prefetching hash cache for row-based matchfinder */ +  struct ZSTD_matchState_t {      ZSTD_window_t window;   /* State for window round buffer management */      U32 loadedDictEnd;      /* index of end of dictionary, within context's referential. 
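
ZSTD_buildBlockEntropyStats() fills the metadata declared above so that each entropy table description is serialized once, its size recorded, and the bytes reused for every partition of a split block rather than rebuilt. The shape of that reuse, reduced to toy types and a placeholder emitter; nothing below is a kernel symbol:

#include <stdio.h>
#include <string.h>

typedef struct {
    unsigned char desc[128];   /* serialized table header, cf. hufDesBuffer / fseTablesBuffer */
    size_t descSize;           /* cf. hufDesSize / fseTablesSize */
} toy_table_meta;

static size_t toy_emit_partition(unsigned char *dst, size_t cap,
                                 const toy_table_meta *meta, int writeTableDesc)
{
    size_t pos = 0;
    if (writeTableDesc) {      /* description written once, repeat stats reused after */
        if (meta->descSize > cap) return (size_t)-1;
        memcpy(dst, meta->desc, meta->descSize);
        pos = meta->descSize;
    }
    /* ... entropy-coded payload for this partition would follow ... */
    return pos;
}

int main(void)
{
    toy_table_meta const meta = { { 0x12, 0x34 }, 2 };
    unsigned char out[16];
    printf("first partition header bytes: %zu\n", toy_emit_partition(out, sizeof out, &meta, 1));
    printf("later partition header bytes: %zu\n", toy_emit_partition(out, sizeof out, &meta, 0));
    return 0;
}
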
@@ -154,9 +210,17 @@ struct ZSTD_matchState_t {                               */      U32 nextToUpdate;       /* index from which to continue table update */      U32 hashLog3;           /* dispatch table for matches of len==3 : larger == faster, more memory */ + +    U32 rowHashLog;                          /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +    U16* tagTable;                           /* For row-based matchFinder: A row-based table containing the hashes and head index. */ +    U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ +      U32* hashTable;      U32* hashTable3;      U32* chainTable; + +    U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ +      int dedicatedDictSearch;  /* Indicates whether this matchState is using the                                 * dedicated dictionary search structure.                                 */ @@ -196,7 +260,7 @@ typedef struct {  } ldmState_t;  typedef struct { -    U32 enableLdm;          /* 1 if enable long distance matching */ +    ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. ZSTD_ps_auto by default */      U32 hashLog;            /* Log size of hashTable */      U32 bucketSizeLog;      /* Log bucket size for collision resolution, at most 8 */      U32 minMatchLength;     /* Minimum match length */ @@ -227,7 +291,7 @@ struct ZSTD_CCtx_params_s {                                  * There is no guarantee that hint is close to actual source size */      ZSTD_dictAttachPref_e attachDictPref; -    ZSTD_literalCompressionMode_e literalCompressionMode; +    ZSTD_paramSwitch_e literalCompressionMode;      /* Multithreading: used to pass parameters to mtctx */      int nbWorkers; @@ -249,6 +313,15 @@ struct ZSTD_CCtx_params_s {      ZSTD_sequenceFormat_e blockDelimiters;      int validateSequences; +    /* Block splitting */ +    ZSTD_paramSwitch_e useBlockSplitter; + +    /* Param for deciding whether to use row-based matchfinder */ +    ZSTD_paramSwitch_e useRowMatchFinder; + +    /* Always load a dictionary in ext-dict mode (not prefix mode)? */ +    int deterministicRefPrefix; +      /* Internal use, for createCCtxParams() and freeCCtxParams() only */      ZSTD_customMem customMem;  };  /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ @@ -266,12 +339,29 @@ typedef enum {      ZSTDb_buffered  } ZSTD_buffered_policy_e; +/* + * Struct that contains all elements of block splitter that should be allocated + * in a wksp. + */ +#define ZSTD_MAX_NB_BLOCK_SPLITS 196 +typedef struct { +    seqStore_t fullSeqStoreChunk; +    seqStore_t firstHalfSeqStore; +    seqStore_t secondHalfSeqStore; +    seqStore_t currSeqStore; +    seqStore_t nextSeqStore; + +    U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS]; +    ZSTD_entropyCTablesMetadata_t entropyMetadata; +} ZSTD_blockSplitCtx; +  struct ZSTD_CCtx_s {      ZSTD_compressionStage_e stage;      int cParamsChanged;                  /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */      int bmi2;                            /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */      ZSTD_CCtx_params requestedParams;      ZSTD_CCtx_params appliedParams; +    ZSTD_CCtx_params simpleApiParams;    /* Param storage used by the simple API - not sticky. 
Must only be used in top-level simple API functions for storage. */      U32   dictID;      size_t dictContentSize; @@ -296,7 +386,7 @@ struct ZSTD_CCtx_s {      ZSTD_blockState_t blockState;      U32* entropyWorkspace;  /* entropy workspace of ENTROPY_WORKSPACE_SIZE bytes */ -    /* Wether we are streaming or not */ +    /* Whether we are streaming or not */      ZSTD_buffered_policy_e bufferedPolicy;      /* streaming */ @@ -324,6 +414,9 @@ struct ZSTD_CCtx_s {      /* Multi-threading */      /* Tracing */ + +    /* Workspace for block splitter */ +    ZSTD_blockSplitCtx blockSplitCtx;  };  typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; @@ -358,7 +451,7 @@ typedef enum {  typedef size_t (*ZSTD_blockCompressor) (          ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],          void const* src, size_t srcSize); -ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode); +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode);  MEM_STATIC U32 ZSTD_LLcode(U32 litLength) @@ -392,31 +485,6 @@ MEM_STATIC U32 ZSTD_MLcode(U32 mlBase)      return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Code[mlBase];  } -typedef struct repcodes_s { -    U32 rep[3]; -} repcodes_t; - -MEM_STATIC repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0) -{ -    repcodes_t newReps; -    if (offset >= ZSTD_REP_NUM) {  /* full offset */ -        newReps.rep[2] = rep[1]; -        newReps.rep[1] = rep[0]; -        newReps.rep[0] = offset - ZSTD_REP_MOVE; -    } else {   /* repcode */ -        U32 const repCode = offset + ll0; -        if (repCode > 0) {  /* note : if repCode==0, no change */ -            U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; -            newReps.rep[2] = (repCode >= 2) ? rep[1] : rep[2]; -            newReps.rep[1] = rep[0]; -            newReps.rep[0] = currentOffset; -        } else {   /* repCode == 0 */ -            ZSTD_memcpy(&newReps, rep, sizeof(newReps)); -        } -    } -    return newReps; -} -  /* ZSTD_cParam_withinBounds:   * @return 1 if value is within cParam bounds,   * 0 otherwise */ @@ -465,17 +533,17 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)      return (srcSize >> minlog) + 2;  } -MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParams) +MEM_STATIC int ZSTD_literalsCompressionIsDisabled(const ZSTD_CCtx_params* cctxParams)  {      switch (cctxParams->literalCompressionMode) { -    case ZSTD_lcm_huffman: +    case ZSTD_ps_enable:          return 0; -    case ZSTD_lcm_uncompressed: +    case ZSTD_ps_disable:          return 1;      default:          assert(0 /* impossible: pre-validated */);          ZSTD_FALLTHROUGH; -    case ZSTD_lcm_auto: +    case ZSTD_ps_auto:          return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0);      }  } @@ -485,7 +553,9 @@ MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParam   *  Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single   *  large copies.   
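
The renamed ZSTD_literalsCompressionIsDisabled() above is a typical consumer of the new ZSTD_paramSwitch_e tri-state: explicit enable/disable always win, and ZSTD_ps_auto defers to a heuristic (here, the fast-strategy negative-level path that skips literal compression). A standalone mirror of the pattern with a toy enum:

typedef enum { PS_AUTO = 0, PS_ENABLE = 1, PS_DISABLE = 2 } toy_param_switch;

static int literals_compression_is_disabled(toy_param_switch mode,
                                            int strategyIsFast,
                                            unsigned targetLength)
{
    switch (mode) {
    case PS_ENABLE:  return 0;    /* user forced literals compression on */
    case PS_DISABLE: return 1;    /* user forced it off */
    case PS_AUTO:
    default:
        /* heuristic mirrors the ZSTD_ps_auto branch above */
        return strategyIsFast && targetLength > 0;
    }
}
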
 */ -static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) { +static void +ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) +{      assert(iend > ilimit_w);      if (ip <= ilimit_w) {          ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap); @@ -495,14 +565,30 @@ static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const ie      while (ip < iend) *op++ = *ip++;  } +#define ZSTD_REP_MOVE     (ZSTD_REP_NUM-1) +#define STORE_REPCODE_1 STORE_REPCODE(1) +#define STORE_REPCODE_2 STORE_REPCODE(2) +#define STORE_REPCODE_3 STORE_REPCODE(3) +#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +#define STORE_OFFSET(o)  (assert((o)>0), o + ZSTD_REP_MOVE) +#define STORED_IS_OFFSET(o)  ((o) > ZSTD_REP_MOVE) +#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +#define STORED_OFFSET(o)  (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1)  /* returns ID 1,2,3 */ +#define STORED_TO_OFFBASE(o) ((o)+1) +#define OFFBASE_TO_STORED(o) ((o)-1) +  /*! ZSTD_storeSeq() : - *  Store a sequence (litlen, litPtr, offCode and mlBase) into seqStore_t. - *  `offCode` : distance to match + ZSTD_REP_MOVE (values <= ZSTD_REP_MOVE are repCodes). - *  `mlBase` : matchLength - MINMATCH + *  Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. + *  @offBase_minus1 : Users should employ macros STORE_REPCODE_X and STORE_OFFSET(). + *  @matchLength : must be >= MINMATCH   *  Allowed to overread literals up to litLimit.  */ -HINT_INLINE UNUSED_ATTR -void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* literals, const BYTE* litLimit, U32 offCode, size_t mlBase) +HINT_INLINE UNUSED_ATTR void +ZSTD_storeSeq(seqStore_t* seqStorePtr, +              size_t litLength, const BYTE* literals, const BYTE* litLimit, +              U32 offBase_minus1, +              size_t matchLength)  {      BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH;      BYTE const* const litEnd = literals + litLength; @@ -511,7 +597,7 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* litera      if (g_start==NULL) g_start = (const BYTE*)literals;  /* note : index only works for compression within a single segment */      {   U32 const pos = (U32)((const BYTE*)literals - g_start);          DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", -               pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offCode); +               pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1);      }  #endif      assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); @@ -535,26 +621,66 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* litera      /* literal Length */      if (litLength>0xFFFF) { -        assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ -        seqStorePtr->longLengthID = 1; +        assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ +        seqStorePtr->longLengthType = ZSTD_llt_literalLength;          seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);      }      seqStorePtr->sequences[0].litLength = (U16)litLength;      /* match offset */ -    seqStorePtr->sequences[0].offset = offCode + 1; +    seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1);      /* 
match Length */ -    if (mlBase>0xFFFF) { -        assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ -        seqStorePtr->longLengthID = 2; -        seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); +    assert(matchLength >= MINMATCH); +    {   size_t const mlBase = matchLength - MINMATCH; +        if (mlBase>0xFFFF) { +            assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ +            seqStorePtr->longLengthType = ZSTD_llt_matchLength; +            seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); +        } +        seqStorePtr->sequences[0].mlBase = (U16)mlBase;      } -    seqStorePtr->sequences[0].matchLength = (U16)mlBase;      seqStorePtr->sequences++;  } +/* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) + * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() + */ +MEM_STATIC void +ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) +{ +    if (STORED_IS_OFFSET(offBase_minus1)) {  /* full offset */ +        rep[2] = rep[1]; +        rep[1] = rep[0]; +        rep[0] = STORED_OFFSET(offBase_minus1); +    } else {   /* repcode */ +        U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; +        if (repCode > 0) {  /* note : if repCode==0, no change */ +            U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; +            rep[2] = (repCode >= 2) ? rep[1] : rep[2]; +            rep[1] = rep[0]; +            rep[0] = currentOffset; +        } else {   /* repCode == 0 */ +            /* nothing to do */ +        } +    } +} + +typedef struct repcodes_s { +    U32 rep[3]; +} repcodes_t; + +MEM_STATIC repcodes_t +ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) +{ +    repcodes_t newReps; +    ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +    ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); +    return newReps; +} +  /*-*************************************  *  Match length counter @@ -778,6 +904,13 @@ MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window)      window->dictLimit = end;  } +MEM_STATIC U32 ZSTD_window_isEmpty(ZSTD_window_t const window) +{ +    return window.dictLimit == ZSTD_WINDOW_START_INDEX && +           window.lowLimit == ZSTD_WINDOW_START_INDEX && +           (window.nextSrc - window.base) == ZSTD_WINDOW_START_INDEX; +} +  /*   * ZSTD_window_hasExtDict():   * Returns non-zero if the window has a non-empty extDict. @@ -801,15 +934,71 @@ MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms)              ZSTD_noDict;  } +/* Defining this macro to non-zero tells zstd to run the overflow correction + * code much more frequently. This is very inefficient, and should only be + * used for tests and fuzzers. + */ +#ifndef ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY +#  ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +#    define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 1 +#  else +#    define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 0 +#  endif +#endif + +/* + * ZSTD_window_canOverflowCorrect(): + * Returns non-zero if the indices are large enough for overflow correction + * to work correctly without impacting compression ratio. 
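 * For intuition, under hypothetical parameters cycleLog == 15 (cycleSize ==
 * 32768), maxDist == 1<<17 and ZSTD_WINDOW_START_INDEX == 2 :
 *     minIndexToOverflowCorrect = 32768 + 131072 + 2 = 163842,
 * and after k earlier corrections the trigger index grows to roughly
 * (k+1) * minIndexToOverflowCorrect, backing off the correction frequency.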
+ */ +MEM_STATIC U32 ZSTD_window_canOverflowCorrect(ZSTD_window_t const window, +                                              U32 cycleLog, +                                              U32 maxDist, +                                              U32 loadedDictEnd, +                                              void const* src) +{ +    U32 const cycleSize = 1u << cycleLog; +    U32 const curr = (U32)((BYTE const*)src - window.base); +    U32 const minIndexToOverflowCorrect = cycleSize +                                        + MAX(maxDist, cycleSize) +                                        + ZSTD_WINDOW_START_INDEX; + +    /* Adjust the min index to backoff the overflow correction frequency, +     * so we don't waste too much CPU in overflow correction. If this +     * computation overflows we don't really care, we just need to make +     * sure it is at least minIndexToOverflowCorrect. +     */ +    U32 const adjustment = window.nbOverflowCorrections + 1; +    U32 const adjustedIndex = MAX(minIndexToOverflowCorrect * adjustment, +                                  minIndexToOverflowCorrect); +    U32 const indexLargeEnough = curr > adjustedIndex; + +    /* Only overflow correct early if the dictionary is invalidated already, +     * so we don't hurt compression ratio. +     */ +    U32 const dictionaryInvalidated = curr > maxDist + loadedDictEnd; + +    return indexLargeEnough && dictionaryInvalidated; +} +  /*   * ZSTD_window_needOverflowCorrection():   * Returns non-zero if the indices are getting too large and need overflow   * protection.   */  MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, +                                                  U32 cycleLog, +                                                  U32 maxDist, +                                                  U32 loadedDictEnd, +                                                  void const* src,                                                    void const* srcEnd)  {      U32 const curr = (U32)((BYTE const*)srcEnd - window.base); +    if (ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { +        if (ZSTD_window_canOverflowCorrect(window, cycleLog, maxDist, loadedDictEnd, src)) { +            return 1; +        } +    }      return curr > ZSTD_CURRENT_MAX;  } @@ -821,7 +1010,6 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,   *   * The least significant cycleLog bits of the indices must remain the same,   * which may be 0. Every index up to maxDist in the past must be valid. - * NOTE: (maxDist & cycleMask) must be zero.   */  MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,                                             U32 maxDist, void const* src) @@ -845,32 +1033,52 @@ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,       * 3. (cctx->lowLimit + 1<<windowLog) < 1<<32:       *    windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32.       */ -    U32 const cycleMask = (1U << cycleLog) - 1; +    U32 const cycleSize = 1u << cycleLog; +    U32 const cycleMask = cycleSize - 1;      U32 const curr = (U32)((BYTE const*)src - window->base); -    U32 const currentCycle0 = curr & cycleMask; -    /* Exclude zero so that newCurrent - maxDist >= 1. */ -    U32 const currentCycle1 = currentCycle0 == 0 ? (1U << cycleLog) : currentCycle0; -    U32 const newCurrent = currentCycle1 + maxDist; +    U32 const currentCycle = curr & cycleMask; +    /* Ensure newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX. 
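     * Illustrative numbers : curr == 0x40000123 with cycleLog == 10
     * (cycleMask == 0x3FF) gives currentCycle == 0x123 ; with
     * maxDist == 1<<17,
     *     newCurrent = 0x123 + 0 + 0x20000  = 0x20123,
     *     correction = 0x40000123 - 0x20123 = 0x3FFE0000,
     * and (curr & cycleMask) == (newCurrent & cycleMask) == 0x123,
     * so positions keep their slots in the cyclic chain/tree tables.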
*/ +    U32 const currentCycleCorrection = currentCycle < ZSTD_WINDOW_START_INDEX +                                     ? MAX(cycleSize, ZSTD_WINDOW_START_INDEX) +                                     : 0; +    U32 const newCurrent = currentCycle +                         + currentCycleCorrection +                         + MAX(maxDist, cycleSize);      U32 const correction = curr - newCurrent; -    assert((maxDist & cycleMask) == 0); +    /* maxDist must be a power of two so that: +     *   (newCurrent & cycleMask) == (curr & cycleMask) +     * This is required to not corrupt the chains / binary tree. +     */ +    assert((maxDist & (maxDist - 1)) == 0); +    assert((curr & cycleMask) == (newCurrent & cycleMask));      assert(curr > newCurrent); -    /* Loose bound, should be around 1<<29 (see above) */ -    assert(correction > 1<<28); +    if (!ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { +        /* Loose bound, should be around 1<<29 (see above) */ +        assert(correction > 1<<28); +    }      window->base += correction;      window->dictBase += correction; -    if (window->lowLimit <= correction) window->lowLimit = 1; -    else window->lowLimit -= correction; -    if (window->dictLimit <= correction) window->dictLimit = 1; -    else window->dictLimit -= correction; +    if (window->lowLimit < correction + ZSTD_WINDOW_START_INDEX) { +        window->lowLimit = ZSTD_WINDOW_START_INDEX; +    } else { +        window->lowLimit -= correction; +    } +    if (window->dictLimit < correction + ZSTD_WINDOW_START_INDEX) { +        window->dictLimit = ZSTD_WINDOW_START_INDEX; +    } else { +        window->dictLimit -= correction; +    }      /* Ensure we can still reference the full window. */      assert(newCurrent >= maxDist); -    assert(newCurrent - maxDist >= 1); +    assert(newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX);      /* Ensure that lowLimit and dictLimit didn't underflow. */      assert(window->lowLimit <= newCurrent);      assert(window->dictLimit <= newCurrent); +    ++window->nbOverflowCorrections; +      DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction,               window->lowLimit);      return correction; @@ -975,11 +1183,13 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window,  MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) {      ZSTD_memset(window, 0, sizeof(*window)); -    window->base = (BYTE const*)""; -    window->dictBase = (BYTE const*)""; -    window->dictLimit = 1;    /* start from 1, so that 1st position is valid */ -    window->lowLimit = 1;     /* it ensures first and later CCtx usages compress the same */ -    window->nextSrc = window->base + 1;   /* see issue #1241 */ +    window->base = (BYTE const*)" "; +    window->dictBase = (BYTE const*)" "; +    ZSTD_STATIC_ASSERT(ZSTD_DUBT_UNSORTED_MARK < ZSTD_WINDOW_START_INDEX); /* Start above ZSTD_DUBT_UNSORTED_MARK */ +    window->dictLimit = ZSTD_WINDOW_START_INDEX;    /* start from >0, so that 1st position is valid */ +    window->lowLimit = ZSTD_WINDOW_START_INDEX;     /* it ensures first and later CCtx usages compress the same */ +    window->nextSrc = window->base + ZSTD_WINDOW_START_INDEX;   /* see issue #1241 */ +    window->nbOverflowCorrections = 0;  }  /* @@ -990,7 +1200,8 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) {   * Returns non-zero if the segment is contiguous.   
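 * A sketch of the intended calling pattern (buffer names hypothetical) :
 *
 *     ZSTD_window_update(&win, buf, len, 0);        - fresh window : returns 0
 *     ZSTD_window_update(&win, buf + len, len2, 0); - adjacent in memory : returns 1
 *     ZSTD_window_update(&win, other, len3, 0);     - returns 0, and the previous
 *                                                     segment becomes the extDict
 *
 * Passing forceNonContiguous != 0 requests the extDict split even when
 * src == window->nextSrc.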
*/  MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, -                                  void const* src, size_t srcSize) +                                  void const* src, size_t srcSize, +                                  int forceNonContiguous)  {      BYTE const* const ip = (BYTE const*)src;      U32 contiguous = 1; @@ -1000,7 +1211,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,      assert(window->base != NULL);      assert(window->dictBase != NULL);      /* Check if blocks follow each other */ -    if (src != window->nextSrc) { +    if (src != window->nextSrc || forceNonContiguous) {          /* not contiguous */          size_t const distanceFromBase = (size_t)(window->nextSrc - window->base);          DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit); @@ -1030,15 +1241,15 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,   */  MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog)  { -    U32    const maxDistance = 1U << windowLog; -    U32    const lowestValid = ms->window.lowLimit; -    U32    const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; -    U32    const isDictionary = (ms->loadedDictEnd != 0); +    U32 const maxDistance = 1U << windowLog; +    U32 const lowestValid = ms->window.lowLimit; +    U32 const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; +    U32 const isDictionary = (ms->loadedDictEnd != 0);      /* When using a dictionary the entire dictionary is valid if a single byte of the dictionary       * is within the window. We invalidate the dictionary (and set loadedDictEnd to 0) when it isn't       * valid for the entire block. So this check is sufficient to find the lowest valid match index.       */ -    U32    const matchLowest = isDictionary ? lowestValid : withinWindow; +    U32 const matchLowest = isDictionary ? 
lowestValid : withinWindow;      return matchLowest;  } diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c index 655bcda4d1f1..52b0a8059aba 100644 --- a/lib/zstd/compress/zstd_compress_literals.c +++ b/lib/zstd/compress/zstd_compress_literals.c @@ -73,7 +73,8 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,                                void* dst, size_t dstCapacity,                          const void* src, size_t srcSize,                                void* entropyWorkspace, size_t entropyWorkspaceSize, -                        const int bmi2) +                        const int bmi2, +                        unsigned suspectUncompressible)  {      size_t const minGain = ZSTD_minGain(srcSize, strategy);      size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); @@ -105,11 +106,11 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,              HUF_compress1X_repeat(                  ostart+lhSize, dstCapacity-lhSize, src, srcSize,                  HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, -                (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) : +                (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) :              HUF_compress4X_repeat(                  ostart+lhSize, dstCapacity-lhSize, src, srcSize,                  HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, -                (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2); +                (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible);          if (repeat != HUF_repeat_none) {              /* reused the existing table */              DEBUGLOG(5, "Reusing previous huffman table"); @@ -117,7 +118,7 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,          }      } -    if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) { +    if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {          ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));          return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);      } diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h index 9904c0cd30a0..9775fb97cb70 100644 --- a/lib/zstd/compress/zstd_compress_literals.h +++ b/lib/zstd/compress/zstd_compress_literals.h @@ -18,12 +18,14 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src,  size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); +/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */  size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,                                ZSTD_hufCTables_t* nextHuf,                                ZSTD_strategy strategy, int disableLiteralCompression,                                void* dst, size_t dstCapacity,                          const void* src, size_t srcSize,                                void* entropyWorkspace, size_t entropyWorkspaceSize, -                        const int bmi2); +                        const int bmi2, +                        unsigned suspectUncompressible);  #endif /* ZSTD_COMPRESS_LITERALS_H */ diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c index dcfcdc9cc5e8..21ddc1b37acf 100644 --- 
a/lib/zstd/compress/zstd_compress_sequences.c +++ b/lib/zstd/compress/zstd_compress_sequences.c @@ -85,6 +85,8 @@ static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t  {      unsigned cost = 0;      unsigned s; + +    assert(total > 0);      for (s = 0; s <= max; ++s) {          unsigned norm = (unsigned)((256 * count[s]) / total);          if (count[s] != 0 && norm == 0) @@ -273,10 +275,11 @@ ZSTD_buildCTable(void* dst, size_t dstCapacity,          assert(nbSeq_1 > 1);          assert(entropyWorkspaceSize >= sizeof(ZSTD_BuildCTableWksp));          (void)entropyWorkspaceSize; -        FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), ""); -        {   size_t const NCountSize = FSE_writeNCount(op, oend - op, wksp->norm, max, tableLog);   /* overflow protected */ +        FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "FSE_normalizeCount failed"); +        assert(oend >= op); +        {   size_t const NCountSize = FSE_writeNCount(op, (size_t)(oend - op), wksp->norm, max, tableLog);   /* overflow protected */              FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed"); -            FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), ""); +            FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "FSE_buildCTable_wksp failed");              return NCountSize;          }      } @@ -310,19 +313,19 @@ ZSTD_encodeSequences_body(      FSE_initCState2(&stateLitLength,   CTable_LitLength,   llCodeTable[nbSeq-1]);      BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);      if (MEM_32bits()) BIT_flushBits(&blockStream); -    BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]); +    BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTable[nbSeq-1]]);      if (MEM_32bits()) BIT_flushBits(&blockStream);      if (longOffsets) {          U32 const ofBits = ofCodeTable[nbSeq-1];          unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);          if (extraBits) { -            BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits); +            BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, extraBits);              BIT_flushBits(&blockStream);          } -        BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits, +        BIT_addBits(&blockStream, sequences[nbSeq-1].offBase >> extraBits,                      ofBits - extraBits);      } else { -        BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]); +        BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[nbSeq-1]);      }      BIT_flushBits(&blockStream); @@ -336,8 +339,8 @@ ZSTD_encodeSequences_body(              U32  const mlBits = ML_bits[mlCode];              DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u",                          (unsigned)sequences[n].litLength, -                        (unsigned)sequences[n].matchLength + MINMATCH, -                        (unsigned)sequences[n].offset); +                        (unsigned)sequences[n].mlBase + MINMATCH, +                        (unsigned)sequences[n].offBase);                                                                              /* 32b*/  /* 64b*/                                                                    
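            /* The right-hand column comments track worst-case bits accumulated
             * in the bit-container between flushes on 32-bit / 64-bit targets:
             * a flush always leaves at most 7 bits behind (the "(7)" baseline),
             * which is why an extra flush is forced on 64-bit whenever
             * ofBits + mlBits + llBits > 56. */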
          /* (7)*/  /* (7)*/              FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode);       /* 15 */  /* 15 */ @@ -348,18 +351,18 @@ ZSTD_encodeSequences_body(                  BIT_flushBits(&blockStream);                                /* (7)*/              BIT_addBits(&blockStream, sequences[n].litLength, llBits);              if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream); -            BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); +            BIT_addBits(&blockStream, sequences[n].mlBase, mlBits);              if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream);              if (longOffsets) {                  unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);                  if (extraBits) { -                    BIT_addBits(&blockStream, sequences[n].offset, extraBits); +                    BIT_addBits(&blockStream, sequences[n].offBase, extraBits);                      BIT_flushBits(&blockStream);                            /* (7)*/                  } -                BIT_addBits(&blockStream, sequences[n].offset >> extraBits, +                BIT_addBits(&blockStream, sequences[n].offBase >> extraBits,                              ofBits - extraBits);                            /* 31 */              } else { -                BIT_addBits(&blockStream, sequences[n].offset, ofBits);     /* 31 */ +                BIT_addBits(&blockStream, sequences[n].offBase, ofBits);     /* 31 */              }              BIT_flushBits(&blockStream);                                    /* (7)*/              DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr)); @@ -396,7 +399,7 @@ ZSTD_encodeSequences_default(  #if DYNAMIC_BMI2 -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t  ZSTD_encodeSequences_bmi2(              void* dst, size_t dstCapacity,              FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c index b0610b255653..17d836cc84e8 100644 --- a/lib/zstd/compress/zstd_compress_superblock.c +++ b/lib/zstd/compress/zstd_compress_superblock.c @@ -15,289 +15,10 @@  #include "../common/zstd_internal.h"  /* ZSTD_getSequenceLength */  #include "hist.h"                     /* HIST_countFast_wksp */ -#include "zstd_compress_internal.h" +#include "zstd_compress_internal.h"   /* ZSTD_[huf|fse|entropy]CTablesMetadata_t */  #include "zstd_compress_sequences.h"  #include "zstd_compress_literals.h" -/*-************************************* -*  Superblock entropy buffer structs -***************************************/ -/* ZSTD_hufCTablesMetadata_t : - *  Stores Literals Block Type for a super-block in hType, and - *  huffman tree description in hufDesBuffer. - *  hufDesSize refers to the size of huffman tree description in bytes. - *  This metadata is populated in ZSTD_buildSuperBlockEntropy_literal() */ -typedef struct { -    symbolEncodingType_e hType; -    BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; -    size_t hufDesSize; -} ZSTD_hufCTablesMetadata_t; - -/* ZSTD_fseCTablesMetadata_t : - *  Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and - *  fse tables in fseTablesBuffer. - *  fseTablesSize refers to the size of fse tables in bytes. 
- *  This metadata is populated in ZSTD_buildSuperBlockEntropy_sequences() */ -typedef struct { -    symbolEncodingType_e llType; -    symbolEncodingType_e ofType; -    symbolEncodingType_e mlType; -    BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; -    size_t fseTablesSize; -    size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_compressSubBlock_sequences() */ -} ZSTD_fseCTablesMetadata_t; - -typedef struct { -    ZSTD_hufCTablesMetadata_t hufMetadata; -    ZSTD_fseCTablesMetadata_t fseMetadata; -} ZSTD_entropyCTablesMetadata_t; - - -/* ZSTD_buildSuperBlockEntropy_literal() : - *  Builds entropy for the super-block literals. - *  Stores literals block type (raw, rle, compressed, repeat) and - *  huffman description table to hufMetadata. - *  @return : size of huffman description table or error code */ -static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t srcSize, -                                            const ZSTD_hufCTables_t* prevHuf, -                                                  ZSTD_hufCTables_t* nextHuf, -                                                  ZSTD_hufCTablesMetadata_t* hufMetadata, -                                                  const int disableLiteralsCompression, -                                                  void* workspace, size_t wkspSize) -{ -    BYTE* const wkspStart = (BYTE*)workspace; -    BYTE* const wkspEnd = wkspStart + wkspSize; -    BYTE* const countWkspStart = wkspStart; -    unsigned* const countWksp = (unsigned*)workspace; -    const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); -    BYTE* const nodeWksp = countWkspStart + countWkspSize; -    const size_t nodeWkspSize = wkspEnd-nodeWksp; -    unsigned maxSymbolValue = 255; -    unsigned huffLog = HUF_TABLELOG_DEFAULT; -    HUF_repeat repeat = prevHuf->repeatMode; - -    DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_literal (srcSize=%zu)", srcSize); - -    /* Prepare nextEntropy assuming reusing the existing table */ -    ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - -    if (disableLiteralsCompression) { -        DEBUGLOG(5, "set_basic - disabled"); -        hufMetadata->hType = set_basic; -        return 0; -    } - -    /* small ? don't even attempt compression (speed opt) */ -#   define COMPRESS_LITERALS_SIZE_MIN 63 -    {   size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; -        if (srcSize <= minLitSize) { -            DEBUGLOG(5, "set_basic - too small"); -            hufMetadata->hType = set_basic; -            return 0; -        } -    } - -    /* Scan input and build symbol stats */ -    {   size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); -        FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); -        if (largest == srcSize) { -            DEBUGLOG(5, "set_rle"); -            hufMetadata->hType = set_rle; -            return 0; -        } -        if (largest <= (srcSize >> 7)+4) { -            DEBUGLOG(5, "set_basic - no gain"); -            hufMetadata->hType = set_basic; -            return 0; -        } -    } - -    /* Validate the previous Huffman table */ -    if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { -        repeat = HUF_repeat_none; -    } - -    /* Build Huffman Tree */ -    ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); -    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); -    {   size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, -                                                    maxSymbolValue, huffLog, -                                                    nodeWksp, nodeWkspSize); -        FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); -        huffLog = (U32)maxBits; -        {   /* Build and write the CTable */ -            size_t const newCSize = HUF_estimateCompressedSize( -                    (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); -            size_t const hSize = HUF_writeCTable_wksp( -                    hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), -                    (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, -                    nodeWksp, nodeWkspSize); -            /* Check against repeating the previous CTable */ -            if (repeat != HUF_repeat_none) { -                size_t const oldCSize = HUF_estimateCompressedSize( -                        (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); -                if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { -                    DEBUGLOG(5, "set_repeat - smaller"); -                    ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); -                    hufMetadata->hType = set_repeat; -                    return 0; -                } -            } -            if (newCSize + hSize >= srcSize) { -                DEBUGLOG(5, "set_basic - no gains"); -                ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); -                hufMetadata->hType = set_basic; -                return 0; -            } -            DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); -            hufMetadata->hType = set_compressed; -            nextHuf->repeatMode = HUF_repeat_check; -            return hSize; -        } -    } -} - -/* ZSTD_buildSuperBlockEntropy_sequences() : - *  Builds entropy for the super-block sequences. - *  Stores symbol compression modes and fse table to fseMetadata. 
- *  @return : size of fse tables or error code */ -static size_t ZSTD_buildSuperBlockEntropy_sequences(seqStore_t* seqStorePtr, -                                              const ZSTD_fseCTables_t* prevEntropy, -                                                    ZSTD_fseCTables_t* nextEntropy, -                                              const ZSTD_CCtx_params* cctxParams, -                                                    ZSTD_fseCTablesMetadata_t* fseMetadata, -                                                    void* workspace, size_t wkspSize) -{ -    BYTE* const wkspStart = (BYTE*)workspace; -    BYTE* const wkspEnd = wkspStart + wkspSize; -    BYTE* const countWkspStart = wkspStart; -    unsigned* const countWksp = (unsigned*)workspace; -    const size_t countWkspSize = (MaxSeq + 1) * sizeof(unsigned); -    BYTE* const cTableWksp = countWkspStart + countWkspSize; -    const size_t cTableWkspSize = wkspEnd-cTableWksp; -    ZSTD_strategy const strategy = cctxParams->cParams.strategy; -    FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; -    FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; -    FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; -    const BYTE* const ofCodeTable = seqStorePtr->ofCode; -    const BYTE* const llCodeTable = seqStorePtr->llCode; -    const BYTE* const mlCodeTable = seqStorePtr->mlCode; -    size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; -    BYTE* const ostart = fseMetadata->fseTablesBuffer; -    BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); -    BYTE* op = ostart; - -    assert(cTableWkspSize >= (1 << MaxFSELog) * sizeof(FSE_FUNCTION_TYPE)); -    DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_sequences (nbSeq=%zu)", nbSeq); -    ZSTD_memset(workspace, 0, wkspSize); - -    fseMetadata->lastCountSize = 0; -    /* convert length/distances into codes */ -    ZSTD_seqToCodes(seqStorePtr); -    /* build CTable for Literal Lengths */ -    {   U32 LLtype; -        unsigned max = MaxLL; -        size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, llCodeTable, nbSeq, workspace, wkspSize);  /* can't fail */ -        DEBUGLOG(5, "Building LL table"); -        nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; -        LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, -                                        countWksp, max, mostFrequent, nbSeq, -                                        LLFSELog, prevEntropy->litlengthCTable, -                                        LL_defaultNorm, LL_defaultNormLog, -                                        ZSTD_defaultAllowed, strategy); -        assert(set_basic < set_compressed && set_rle < set_compressed); -        assert(!(LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ -        {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, -                                                    countWksp, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL, -                                                    prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable), -                                                    cTableWksp, cTableWkspSize); -            FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed"); -            if (LLtype == set_compressed) -                fseMetadata->lastCountSize = countSize; -            op += countSize; -    
        fseMetadata->llType = (symbolEncodingType_e) LLtype; -    }   } -    /* build CTable for Offsets */ -    {   U32 Offtype; -        unsigned max = MaxOff; -        size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, ofCodeTable, nbSeq, workspace, wkspSize);  /* can't fail */ -        /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ -        ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; -        DEBUGLOG(5, "Building OF table"); -        nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; -        Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, -                                        countWksp, max, mostFrequent, nbSeq, -                                        OffFSELog, prevEntropy->offcodeCTable, -                                        OF_defaultNorm, OF_defaultNormLog, -                                        defaultPolicy, strategy); -        assert(!(Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ -        {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, -                                                    countWksp, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, -                                                    prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable), -                                                    cTableWksp, cTableWkspSize); -            FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed"); -            if (Offtype == set_compressed) -                fseMetadata->lastCountSize = countSize; -            op += countSize; -            fseMetadata->ofType = (symbolEncodingType_e) Offtype; -    }   } -    /* build CTable for MatchLengths */ -    {   U32 MLtype; -        unsigned max = MaxML; -        size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, mlCodeTable, nbSeq, workspace, wkspSize);   /* can't fail */ -        DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); -        nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; -        MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, -                                        countWksp, max, mostFrequent, nbSeq, -                                        MLFSELog, prevEntropy->matchlengthCTable, -                                        ML_defaultNorm, ML_defaultNormLog, -                                        ZSTD_defaultAllowed, strategy); -        assert(!(MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ -        {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, -                                                    countWksp, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML, -                                                    prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable), -                                                    cTableWksp, cTableWkspSize); -            FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed"); -            if (MLtype == set_compressed) -                fseMetadata->lastCountSize = countSize; -            op += countSize; -            fseMetadata->mlType = 
(symbolEncodingType_e) MLtype; -    }   } -    assert((size_t) (op-ostart) <= sizeof(fseMetadata->fseTablesBuffer)); -    return op-ostart; -} - - -/* ZSTD_buildSuperBlockEntropy() : - *  Builds entropy for the super-block. - *  @return : 0 on success or error code */ -static size_t -ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr, -                      const ZSTD_entropyCTables_t* prevEntropy, -                            ZSTD_entropyCTables_t* nextEntropy, -                      const ZSTD_CCtx_params* cctxParams, -                            ZSTD_entropyCTablesMetadata_t* entropyMetadata, -                            void* workspace, size_t wkspSize) -{ -    size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; -    DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy"); -    entropyMetadata->hufMetadata.hufDesSize = -        ZSTD_buildSuperBlockEntropy_literal(seqStorePtr->litStart, litSize, -                                            &prevEntropy->huf, &nextEntropy->huf, -                                            &entropyMetadata->hufMetadata, -                                            ZSTD_disableLiteralsCompression(cctxParams), -                                            workspace, wkspSize); -    FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildSuperBlockEntropy_literal failed"); -    entropyMetadata->fseMetadata.fseTablesSize = -        ZSTD_buildSuperBlockEntropy_sequences(seqStorePtr, -                                              &prevEntropy->fse, &nextEntropy->fse, -                                              cctxParams, -                                              &entropyMetadata->fseMetadata, -                                              workspace, wkspSize); -    FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildSuperBlockEntropy_sequences failed"); -    return 0; -} -  /* ZSTD_compressSubBlock_literal() :   *  Compresses literals section for a sub-block.   *  When we have to write the Huffman table we will sometimes choose a header @@ -411,8 +132,7 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef*      const seqDef* sp = sstart;      size_t matchLengthSum = 0;      size_t litLengthSum = 0; -    /* Only used by assert(), suppress unused variable warnings in production. 
*/ -    (void)litLengthSum; +    (void)(litLengthSum); /* suppress unused variable warning on some environments */      while (send-sp > 0) {          ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp);          litLengthSum += seqLen.litLength; @@ -605,7 +325,7 @@ static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t lit  static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type,                          const BYTE* codeTable, unsigned maxCode,                          size_t nbSeq, const FSE_CTable* fseCTable, -                        const U32* additionalBits, +                        const U8* additionalBits,                          short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,                          void* workspace, size_t wkspSize)  { @@ -646,8 +366,9 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable,                                                    void* workspace, size_t wkspSize,                                                    int writeEntropy)  { -    size_t sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ +    size_t const sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */      size_t cSeqSizeEstimate = 0; +    if (nbSeq == 0) return sequencesSectionHeaderSize;      cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff,                                           nbSeq, fseTables->offcodeCTable, NULL,                                           OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, @@ -754,7 +475,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,          /* I think there is an optimization opportunity here.           * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful           * since it recalculates estimate from scratch. -         * For example, it would recount literal distribution and symbol codes everytime. +         * For example, it would recount literal distribution and symbol codes every time.           
*/          cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount,                                                         &nextCBlock->entropy, entropyMetadata, @@ -818,7 +539,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,              repcodes_t rep;              ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));              for (seq = sstart; seq < sp; ++seq) { -                rep = ZSTD_updateRep(rep.rep, seq->offset - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); +                ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);              }              ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));          } @@ -833,7 +554,7 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,                                 unsigned lastBlock) {      ZSTD_entropyCTablesMetadata_t entropyMetadata; -    FORWARD_IF_ERROR(ZSTD_buildSuperBlockEntropy(&zc->seqStore, +    FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore,            &zc->blockState.prevCBlock->entropy,            &zc->blockState.nextCBlock->entropy,            &zc->appliedParams, diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h index 98e359adf5d4..349fc923c355 100644 --- a/lib/zstd/compress/zstd_cwksp.h +++ b/lib/zstd/compress/zstd_cwksp.h @@ -32,6 +32,10 @@  #define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128  #endif + +/* Set our tables and aligneds to align by 64 bytes */ +#define ZSTD_CWKSP_ALIGNMENT_BYTES 64 +  /*-*************************************  *  Structures  ***************************************/ @@ -114,10 +118,11 @@ typedef enum {   * - Tables: these are any of several different datastructures (hash tables,   *   chain tables, binary trees) that all respect a common format: they are   *   uint32_t arrays, all of whose values are between 0 and (nextSrc - base). - *   Their sizes depend on the cparams. + *   Their sizes depend on the cparams. These tables are 64-byte aligned.   *   * - Aligned: these buffers are used for various purposes that require 4 byte - *   alignment, but don't require any initialization before they're used. + *   alignment, but don't require any initialization before they're used. These + *   buffers are each aligned to 64 bytes.   *   * - Buffers: these buffers are used for various purposes that don't require   *   any alignment or initialization before they're used. This means they can @@ -130,8 +135,7 @@ typedef enum {   *   * 1. Objects   * 2. Buffers - * 3. Aligned - * 4. Tables + * 3. Aligned/Tables   *   * Attempts to reserve objects of different types out of order will fail.   */ @@ -184,6 +188,8 @@ MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) {   * Since tables aren't currently redzoned, you don't need to call through this   * to figure out how much space you need for the matchState tables. Everything   * else is though. + * + * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size().   */  MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) {      if (size == 0) @@ -191,66 +197,139 @@ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) {      return size;  } -MEM_STATIC void ZSTD_cwksp_internal_advance_phase( -        ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) { +/* + * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes. + * Used to determine the number of bytes required for a given "aligned". 
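 * e.g., assuming no ASAN redzone (so ZSTD_cwksp_alloc_size() returns its
 * input unchanged), ZSTD_cwksp_aligned_alloc_size(1) == 64 and
 * ZSTD_cwksp_aligned_alloc_size(100) == 128.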
+ */ +MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { +    return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES)); +} + +/* + * Returns the amount of additional space the cwksp must allocate + * for internal purposes (currently only alignment). + */ +MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +    /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +     * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +     * to align the beginning of the aligned section. +     * +     * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +     * aligneds being sized in multiples of 64 bytes. +     */ +    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; +    return slackSpace; +} + + +/* + * Return the number of additional bytes required to align a pointer to the given number of bytes. + * alignBytes must be a power of two. + */ +MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) { +    size_t const alignBytesMask = alignBytes - 1; +    size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; +    assert((alignBytes & alignBytesMask) == 0); +    assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); +    return bytes; +} + +/* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, + * which counts from the end of the wksp (as opposed to the object/table segment). + * + * Returns a pointer to the beginning of that space. + */ +MEM_STATIC void* +ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes) +{ +    void* const alloc = (BYTE*)ws->allocStart - bytes; +    void* const bottom = ws->tableEnd; +    DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", +        alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); +    ZSTD_cwksp_assert_internal_consistency(ws); +    assert(alloc >= bottom); +    if (alloc < bottom) { +        DEBUGLOG(4, "cwksp: alloc failed!"); +        ws->allocFailed = 1; +        return NULL; +    } +    /* the area is reserved from the end of wksp. +     * If it overlaps with tableValidEnd, it voids guarantees on values' range */ +    if (alloc < ws->tableValidEnd) { +        ws->tableValidEnd = alloc; +    } +    ws->allocStart = alloc; +    return alloc; +} + +/* + * Moves the cwksp to the next phase, and does any necessary allocations. + * cwksp initialization must necessarily go through each phase in order. + * Returns 0 on success, or a zstd error code. + */ +MEM_STATIC size_t +ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) +{      assert(phase >= ws->phase);      if (phase > ws->phase) { +        /* Going from allocating objects to allocating buffers */          if (ws->phase < ZSTD_cwksp_alloc_buffers &&                  phase >= ZSTD_cwksp_alloc_buffers) {              ws->tableValidEnd = ws->objectEnd;          } + +        /* Going from allocating buffers to allocating aligneds/tables */          if (ws->phase < ZSTD_cwksp_alloc_aligned &&                  phase >= ZSTD_cwksp_alloc_aligned) { -            /* If unaligned allocations down from a too-large top have left us -             * unaligned, we need to realign our alloc ptr. Technically, this -             * can consume space that is unaccounted for in the neededSpace -             * calculation. 
However, I believe this can only happen when the -             * workspace is too large, and specifically when it is too large -             * by a larger margin than the space that will be consumed. */ -            /* TODO: cleaner, compiler warning friendly way to do this??? */ -            ws->allocStart = (BYTE*)ws->allocStart - ((size_t)ws->allocStart & (sizeof(U32)-1)); -            if (ws->allocStart < ws->tableValidEnd) { -                ws->tableValidEnd = ws->allocStart; +            {   /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ +                size_t const bytesToAlign = +                    ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +                DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +                ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +                RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +                                memory_allocation, "aligned phase - alignment initial allocation failed!");              } -        } +            {   /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ +                void* const alloc = ws->objectEnd; +                size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +                void* const objectEnd = (BYTE*)alloc + bytesToAlign; +                DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); +                RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, +                                "table phase - alignment initial allocation failed!"); +                ws->objectEnd = objectEnd; +                ws->tableEnd = objectEnd;  /* table area starts being empty */ +                if (ws->tableValidEnd < ws->tableEnd) { +                    ws->tableValidEnd = ws->tableEnd; +        }   }   }          ws->phase = phase; +        ZSTD_cwksp_assert_internal_consistency(ws);      } +    return 0;  }  /*   * Returns whether this object/buffer/etc was allocated in this workspace.   */ -MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) { +MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) +{      return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd);  }  /*   * Internal function. Do not use directly.   
*/ -MEM_STATIC void* ZSTD_cwksp_reserve_internal( -        ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) { +MEM_STATIC void* +ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) +{      void* alloc; -    void* bottom = ws->tableEnd; -    ZSTD_cwksp_internal_advance_phase(ws, phase); -    alloc = (BYTE *)ws->allocStart - bytes; - -    if (bytes == 0) +    if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || bytes == 0) {          return NULL; +    } -    DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", -        alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); -    ZSTD_cwksp_assert_internal_consistency(ws); -    assert(alloc >= bottom); -    if (alloc < bottom) { -        DEBUGLOG(4, "cwksp: alloc failed!"); -        ws->allocFailed = 1; -        return NULL; -    } -    if (alloc < ws->tableValidEnd) { -        ws->tableValidEnd = alloc; -    } -    ws->allocStart = alloc; +    alloc = ZSTD_cwksp_reserve_internal_buffer_space(ws, bytes);      return alloc; @@ -259,33 +338,44 @@ MEM_STATIC void* ZSTD_cwksp_reserve_internal(  /*   * Reserves and returns unaligned memory.   */ -MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) { +MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) +{      return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers);  }  /* - * Reserves and returns memory sized on and aligned on sizeof(unsigned). + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).   */ -MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) { -    assert((bytes & (sizeof(U32)-1)) == 0); -    return ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, sizeof(U32)), ZSTD_cwksp_alloc_aligned); +MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) +{ +    void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), +                                            ZSTD_cwksp_alloc_aligned); +    assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); +    return ptr;  }  /* - * Aligned on sizeof(unsigned). These buffers have the special property that + * Aligned on 64 bytes. These buffers have the special property that   * their values remain constrained, allowing us to re-use them without   * memset()-ing them.   
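 * e.g. a match-finder hash table reserved here only ever holds indices in
 * [0, nextSrc - base), so a later compression in the same workspace may
 * reuse the stale entries : out-of-window indices are rejected by the match
 * finders' range checks rather than by zeroing the table.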
*/ -MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { +MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) +{      const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; -    void* alloc = ws->tableEnd; -    void* end = (BYTE *)alloc + bytes; -    void* top = ws->allocStart; +    void* alloc; +    void* end; +    void* top; + +    if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +        return NULL; +    } +    alloc = ws->tableEnd; +    end = (BYTE *)alloc + bytes; +    top = ws->allocStart;      DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining",          alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);      assert((bytes & (sizeof(U32)-1)) == 0); -    ZSTD_cwksp_internal_advance_phase(ws, phase);      ZSTD_cwksp_assert_internal_consistency(ws);      assert(end <= top);      if (end > top) { @@ -296,27 +386,31 @@ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) {      ws->tableEnd = end; +    assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); +    assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);      return alloc;  }  /*   * Aligned on sizeof(void*). + * Note : should happen only once, at workspace first initialization   */ -MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) { -    size_t roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*)); +MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) +{ +    size_t const roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*));      void* alloc = ws->objectEnd;      void* end = (BYTE*)alloc + roundedBytes; -    DEBUGLOG(5, +    DEBUGLOG(4,          "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes remaining",          alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - roundedBytes); -    assert(((size_t)alloc & (sizeof(void*)-1)) == 0); -    assert((bytes & (sizeof(void*)-1)) == 0); +    assert((size_t)alloc % ZSTD_ALIGNOF(void*) == 0); +    assert(bytes % ZSTD_ALIGNOF(void*) == 0);      ZSTD_cwksp_assert_internal_consistency(ws);      /* we must be in the first phase, no advance is possible */      if (ws->phase != ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd) { -        DEBUGLOG(4, "cwksp: object alloc failed!"); +        DEBUGLOG(3, "cwksp: object alloc failed!");          ws->allocFailed = 1;          return NULL;      } @@ -328,7 +422,8 @@ MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) {      return alloc;  } -MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) { +MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) +{      DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty"); @@ -451,6 +546,24 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {  *  Functions Checking Free Space  ***************************************/ +/* ZSTD_cwksp_estimated_space_within_bounds() : + * Returns whether the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +                                                        size_t const estimatedSpace, int resizedWorkspace) { +    if (resizedWorkspace) { +        /* Resized/newly allocated wksp should have exact bounds */ +        return ZSTD_cwksp_used(ws) == estimatedSpace; +    } else { +        /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +         * than estimatedSpace. 
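         * For example, with estimatedSpace == 1000, a reused workspace is
         * accepted for any actual usage in [937, 1063], while a resized or
         * freshly allocated workspace must match 1000 exactly.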
See the comments in zstd_cwksp.h for details. +         */ +        return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +    } +} + +  MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) {      return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd);  } diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c index b0424d23ac57..76933dea2624 100644 --- a/lib/zstd/compress/zstd_double_fast.c +++ b/lib/zstd/compress/zstd_double_fast.c @@ -48,10 +48,216 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,  FORCE_INLINE_TEMPLATE -size_t ZSTD_compressBlock_doubleFast_generic( +size_t ZSTD_compressBlock_doubleFast_noDict_generic( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize, U32 const mls /* template */) +{ +    ZSTD_compressionParameters const* cParams = &ms->cParams; +    U32* const hashLong = ms->hashTable; +    const U32 hBitsL = cParams->hashLog; +    U32* const hashSmall = ms->chainTable; +    const U32 hBitsS = cParams->chainLog; +    const BYTE* const base = ms->window.base; +    const BYTE* const istart = (const BYTE*)src; +    const BYTE* anchor = istart; +    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); +    /* presumes that, if there is a dictionary, it must be using Attach mode */ +    const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); +    const BYTE* const prefixLowest = base + prefixLowestIndex; +    const BYTE* const iend = istart + srcSize; +    const BYTE* const ilimit = iend - HASH_READ_SIZE; +    U32 offset_1=rep[0], offset_2=rep[1]; +    U32 offsetSaved = 0; + +    size_t mLength; +    U32 offset; +    U32 curr; + +    /* how many positions to search before increasing step size */ +    const size_t kStepIncr = 1 << kSearchStrength; +    /* the position at which to increment the step size if no match is found */ +    const BYTE* nextStep; +    size_t step; /* the current step size */ + +    size_t hl0; /* the long hash at ip */ +    size_t hl1; /* the long hash at ip1 */ + +    U32 idxl0; /* the long match index for ip */ +    U32 idxl1; /* the long match index for ip1 */ + +    const BYTE* matchl0; /* the long match for ip */ +    const BYTE* matchs0; /* the short match for ip */ +    const BYTE* matchl1; /* the long match for ip1 */ + +    const BYTE* ip = istart; /* the current position */ +    const BYTE* ip1; /* the next position */ + +    DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); + +    /* init */ +    ip += ((ip - prefixLowest) == 0); +    { +        U32 const current = (U32)(ip - base); +        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); +        U32 const maxRep = current - windowLow; +        if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +        if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; +    } + +    /* Outer Loop: one iteration per match found and stored */ +    while (1) { +        step = 1; +        nextStep = ip + kStepIncr; +        ip1 = ip + step; + +        if (ip1 > ilimit) { +            goto _cleanup; +        } + +        hl0 = ZSTD_hashPtr(ip, hBitsL, 8); +        idxl0 = hashLong[hl0]; +        matchl0 = base + idxl0; + +        /* Inner Loop: one iteration per search / position */ +        do { +            const size_t hs0 = ZSTD_hashPtr(ip, hBitsS, mls); +            const U32 idxs0 = hashSmall[hs0]; +            curr = 
(U32)(ip-base); +            matchs0 = base + idxs0; + +            hashLong[hl0] = hashSmall[hs0] = curr;   /* update hash tables */ + +            /* check noDict repcode */ +            if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { +                mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; +                ip++; +                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +                goto _match_stored; +            } + +            hl1 = ZSTD_hashPtr(ip1, hBitsL, 8); + +            if (idxl0 > prefixLowestIndex) { +                /* check prefix long match */ +                if (MEM_read64(matchl0) == MEM_read64(ip)) { +                    mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; +                    offset = (U32)(ip-matchl0); +                    while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ +                    goto _match_found; +                } +            } + +            idxl1 = hashLong[hl1]; +            matchl1 = base + idxl1; + +            if (idxs0 > prefixLowestIndex) { +                /* check prefix short match */ +                if (MEM_read32(matchs0) == MEM_read32(ip)) { +                    goto _search_next_long; +                } +            } + +            if (ip1 >= nextStep) { +                PREFETCH_L1(ip1 + 64); +                PREFETCH_L1(ip1 + 128); +                step++; +                nextStep += kStepIncr; +            } +            ip = ip1; +            ip1 += step; + +            hl0 = hl1; +            idxl0 = idxl1; +            matchl0 = matchl1; +    #if defined(__aarch64__) +            PREFETCH_L1(ip+256); +    #endif +        } while (ip1 <= ilimit); + +_cleanup: +        /* save reps for next block */ +        rep[0] = offset_1 ? offset_1 : offsetSaved; +        rep[1] = offset_2 ? offset_2 : offsetSaved; + +        /* Return the last literals size */ +        return (size_t)(iend - anchor); + +_search_next_long: + +        /* check prefix long +1 match */ +        if (idxl1 > prefixLowestIndex) { +            if (MEM_read64(matchl1) == MEM_read64(ip1)) { +                ip = ip1; +                mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8; +                offset = (U32)(ip-matchl1); +                while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */ +                goto _match_found; +            } +        } + +        /* if no long +1 match, explore the short match we found */ +        mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; +        offset = (U32)(ip - matchs0); +        while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */ + +        /* fall-through */ + +_match_found: /* requires ip, offset, mLength */ +        offset_2 = offset_1; +        offset_1 = offset; + +        if (step < 4) { +            /* It is unsafe to write this value back to the hashtable when ip1 is +             * greater than or equal to the new ip we will have after we're done +             * processing this match. Rather than perform that test directly +             * (ip1 >= ip + mLength), which costs speed in practice, we do a simpler +             * more predictable test. 
The minimum match length, even if
+             * we take a short match, is 4 bytes, so as long as step (the
+             * initial distance between ip and ip1) is less than 4, we know
+             * that ip1 < the new ip. */
+            hashLong[hl1] = (U32)(ip1 - base);
+        }
+
+        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+
+_match_stored:
+        /* match found */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Complementary insertion */
+            /* done after iLimit test, as candidates could be > iend-8 */
+            {   U32 const indexToInsert = curr+2;
+                hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert;
+                hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+                hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert;
+                hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base);
+            }
+
+            /* check immediate repcode */
+            while ( (ip <= ilimit)
+                 && ( (offset_2>0)
+                    & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+                /* store sequence */
+                size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;  /* swap offset_2 <=> offset_1 */
+                hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
+                hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength);
+                ip += rLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+            }
+        }
+    }
+}
+
+
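Before the dictMatchState variant, a minimal standalone sketch of the stride acceleration used by the loop above: step grows by one each time kStepIncr positions in a row fail to produce a match, so incompressible regions are sampled increasingly sparsely. All toy_* names are invented for illustration, and is_match stands in for the real hash-table probe:

    #include <stddef.h>

    /* Stand-in for "position p starts a usable match" (assumption). */
    typedef int (*toy_match_fn)(const unsigned char *p);

    /* Scan [start, end) with a stride that grows by one every kStepIncr
     * positions searched without success, mirroring step/nextStep above. */
    static const unsigned char *toy_accelerated_scan(const unsigned char *start,
                                                     const unsigned char *end,
                                                     toy_match_fn is_match)
    {
        const size_t kStepIncr = (size_t)1 << 8;  /* cf. 1 << kSearchStrength */
        const unsigned char *nextStep = start + kStepIncr;
        const unsigned char *ip = start;
        size_t step = 1;

        while (ip < end) {
            if (is_match(ip))
                return ip;   /* the real code then restarts with step = 1 */
            if (ip >= nextStep) {
                step++;      /* nothing found for a while: skip faster */
                nextStep += kStepIncr;
            }
            ip += step;
        }
        return NULL;
    }

+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize,
-        U32 const mls /* template */, ZSTD_dictMode_e const dictMode)
+        U32 const mls /* template */)
 {
     ZSTD_compressionParameters const* cParams = &ms->cParams;
     U32* const hashLong = ms->hashTable;
@@ -72,54 +278,30 @@ size_t ZSTD_compressBlock_doubleFast_generic(
     U32 offsetSaved = 0;
 
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
-    const ZSTD_compressionParameters* const dictCParams =
-                                     dictMode == ZSTD_dictMatchState ?
-                                     &dms->cParams : NULL;
-    const U32* const dictHashLong  = dictMode == ZSTD_dictMatchState ?
-                                     dms->hashTable : NULL;
-    const U32* const dictHashSmall = dictMode == ZSTD_dictMatchState ?
-                                     dms->chainTable : NULL;
-    const U32 dictStartIndex       = dictMode == ZSTD_dictMatchState ?
-                                     dms->window.dictLimit : 0;
-    const BYTE* const dictBase     = dictMode == ZSTD_dictMatchState ?
-                                     dms->window.base : NULL;
-    const BYTE* const dictStart    = dictMode == ZSTD_dictMatchState ?
-                                     dictBase + dictStartIndex : NULL;
-    const BYTE* const dictEnd      = dictMode == ZSTD_dictMatchState ?
-                                     dms->window.nextSrc : NULL;
-    const U32 dictIndexDelta       = dictMode == ZSTD_dictMatchState ?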
-                                     prefixLowestIndex - (U32)(dictEnd - dictBase) : -                                     0; -    const U32 dictHBitsL           = dictMode == ZSTD_dictMatchState ? -                                     dictCParams->hashLog : hBitsL; -    const U32 dictHBitsS           = dictMode == ZSTD_dictMatchState ? -                                     dictCParams->chainLog : hBitsS; +    const ZSTD_compressionParameters* const dictCParams = &dms->cParams; +    const U32* const dictHashLong  = dms->hashTable; +    const U32* const dictHashSmall = dms->chainTable; +    const U32 dictStartIndex       = dms->window.dictLimit; +    const BYTE* const dictBase     = dms->window.base; +    const BYTE* const dictStart    = dictBase + dictStartIndex; +    const BYTE* const dictEnd      = dms->window.nextSrc; +    const U32 dictIndexDelta       = prefixLowestIndex - (U32)(dictEnd - dictBase); +    const U32 dictHBitsL           = dictCParams->hashLog; +    const U32 dictHBitsS           = dictCParams->chainLog;      const U32 dictAndPrefixLength  = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); -    DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_generic"); - -    assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState); +    DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic");      /* if a dictionary is attached, it must be within window range */ -    if (dictMode == ZSTD_dictMatchState) { -        assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); -    } +    assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);      /* init */      ip += (dictAndPrefixLength == 0); -    if (dictMode == ZSTD_noDict) { -        U32 const curr = (U32)(ip - base); -        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); -        U32 const maxRep = curr - windowLow; -        if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; -        if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; -    } -    if (dictMode == ZSTD_dictMatchState) { -        /* dictMatchState repCode checks don't currently handle repCode == 0 -         * disabling. */ -        assert(offset_1 <= dictAndPrefixLength); -        assert(offset_2 <= dictAndPrefixLength); -    } + +    /* dictMatchState repCode checks don't currently handle repCode == 0 +     * disabling. */ +    assert(offset_1 <= dictAndPrefixLength); +    assert(offset_2 <= dictAndPrefixLength);      /* Main Search Loop */      while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */ @@ -135,29 +317,18 @@ size_t ZSTD_compressBlock_doubleFast_generic(          const BYTE* matchLong = base + matchIndexL;          const BYTE* match = base + matchIndexS;          const U32 repIndex = curr + 1 - offset_1; -        const BYTE* repMatch = (dictMode == ZSTD_dictMatchState -                            && repIndex < prefixLowestIndex) ? +        const BYTE* repMatch = (repIndex < prefixLowestIndex) ?                                 
dictBase + (repIndex - dictIndexDelta) :                                 base + repIndex;          hashLong[h2] = hashSmall[h] = curr;   /* update hash tables */ -        /* check dictMatchState repcode */ -        if (dictMode == ZSTD_dictMatchState -            && ((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) +        /* check repcode */ +        if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)              && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {              const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;              mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;              ip++; -            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); -            goto _match_stored; -        } - -        /* check noDict repcode */ -        if ( dictMode == ZSTD_noDict -          && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { -            mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; -            ip++; -            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); +            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);              goto _match_stored;          } @@ -169,7 +340,7 @@ size_t ZSTD_compressBlock_doubleFast_generic(                  while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */                  goto _match_found;              } -        } else if (dictMode == ZSTD_dictMatchState) { +        } else {              /* check dictMatchState long match */              U32 const dictMatchIndexL = dictHashLong[dictHL];              const BYTE* dictMatchL = dictBase + dictMatchIndexL; @@ -187,7 +358,7 @@ size_t ZSTD_compressBlock_doubleFast_generic(              if (MEM_read32(match) == MEM_read32(ip)) {                  goto _search_next_long;              } -        } else if (dictMode == ZSTD_dictMatchState) { +        } else {              /* check dictMatchState short match */              U32 const dictMatchIndexS = dictHashSmall[dictHS];              match = dictBase + dictMatchIndexS; @@ -220,7 +391,7 @@ _search_next_long:                      while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */                      goto _match_found;                  } -            } else if (dictMode == ZSTD_dictMatchState) { +            } else {                  /* check dict long +1 match */                  U32 const dictMatchIndexL3 = dictHashLong[dictHLNext];                  const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; @@ -234,7 +405,7 @@ _search_next_long:          }   }   }          /* if no long +1 match, explore the short match we found */ -        if (dictMode == ZSTD_dictMatchState && matchIndexS < prefixLowestIndex) { +        if (matchIndexS < prefixLowestIndex) {              mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4;              offset = (U32)(curr - matchIndexS);              while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ @@ -248,7 +419,7 @@ _match_found:          offset_2 = offset_1;          offset_1 = offset; -        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); +        
ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);  _match_stored:          /* match found */ @@ -266,43 +437,27 @@ _match_stored:              }              /* check immediate repcode */ -            if (dictMode == ZSTD_dictMatchState) { -                while (ip <= ilimit) { -                    U32 const current2 = (U32)(ip-base); -                    U32 const repIndex2 = current2 - offset_2; -                    const BYTE* repMatch2 = dictMode == ZSTD_dictMatchState -                        && repIndex2 < prefixLowestIndex ? -                            dictBase + repIndex2 - dictIndexDelta : -                            base + repIndex2; -                    if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) -                       && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { -                        const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend; -                        size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; -                        U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */ -                        ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); -                        hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; -                        hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; -                        ip += repLength2; -                        anchor = ip; -                        continue; -                    } -                    break; -            }   } - -            if (dictMode == ZSTD_noDict) { -                while ( (ip <= ilimit) -                     && ( (offset_2>0) -                        & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { -                    /* store sequence */ -                    size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; -                    U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;  /* swap offset_2 <=> offset_1 */ -                    hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); -                    hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); -                    ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, rLength-MINMATCH); -                    ip += rLength; +            while (ip <= ilimit) { +                U32 const current2 = (U32)(ip-base); +                U32 const repIndex2 = current2 - offset_2; +                const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ? +                        dictBase + repIndex2 - dictIndexDelta : +                        base + repIndex2; +                if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +                    const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; +                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; +                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */ +                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +                    hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; +                    hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; +                    ip += repLength2;                      anchor = ip; -                    continue;   /* faster when present ... (?) */ -        }   }   } +                    continue; +                } +                break; +            } +        }      }   /* while (ip < ilimit) */      /* save reps for next block */ @@ -313,6 +468,24 @@ _match_stored:      return (size_t)(iend - anchor);  } +#define ZSTD_GEN_DFAST_FN(dictMode, mls)                                                                 \ +    static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls(                                      \ +            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],                          \ +            void const* src, size_t srcSize)                                                             \ +    {                                                                                                    \ +        return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \ +    } + +ZSTD_GEN_DFAST_FN(noDict, 4) +ZSTD_GEN_DFAST_FN(noDict, 5) +ZSTD_GEN_DFAST_FN(noDict, 6) +ZSTD_GEN_DFAST_FN(noDict, 7) + +ZSTD_GEN_DFAST_FN(dictMatchState, 4) +ZSTD_GEN_DFAST_FN(dictMatchState, 5) +ZSTD_GEN_DFAST_FN(dictMatchState, 6) +ZSTD_GEN_DFAST_FN(dictMatchState, 7) +  size_t ZSTD_compressBlock_doubleFast(          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -323,13 +496,13 @@ size_t ZSTD_compressBlock_doubleFast(      {      default: /* includes case 3 */      case 4 : -        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict); +        return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, src, srcSize);      case 5 : -        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict); +        return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, src, srcSize);      case 6 : -        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict); +        return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, src, srcSize);      case 7 : -        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict); +        return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, src, srcSize);      }  } @@ -343,13 +516,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState(      {      default: /* includes case 3 */      case 4 : -        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState); +        return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore, rep, src, srcSize);      case 5 : -        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState); +        return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore, rep, src, srcSize);      case 6 : -        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, 
srcSize, 6, ZSTD_dictMatchState); +        return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore, rep, src, srcSize);      case 7 : -        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState); +        return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore, rep, src, srcSize);      }  } @@ -385,7 +558,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(      /* if extDict is invalidated due to maxDistance, switch to "regular" variant */      if (prefixStartIndex == dictStartIndex) -        return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, mls, ZSTD_noDict); +        return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSize);      /* Search Loop */      while (ip < ilimit) {  /* < instead of <=, because (ip+1) */ @@ -407,12 +580,12 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(          hashSmall[hSmall] = hashLong[hLong] = curr;   /* update hash table */          if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ -            & (repIndex > dictStartIndex)) +            & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */            && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {              const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;              mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;              ip++; -            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); +            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);          } else {              if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {                  const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; @@ -423,7 +596,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(                  while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; }   /* catch up */                  offset_2 = offset_1;                  offset_1 = offset; -                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); +                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);              } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {                  size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); @@ -448,7 +621,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(                  }                  offset_2 = offset_1;                  offset_1 = offset; -                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); +                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);              } else {                  ip += ((ip-anchor) >> kSearchStrength) + 1; @@ -475,12 +648,12 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(                  U32 const repIndex2 = current2 - offset_2;                  const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? 
dictBase + repIndex2 : base + repIndex2;                  if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3)   /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ -                    & (repIndex2 > dictStartIndex)) +                    & (offset_2 <= current2 - dictStartIndex))                    && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {                      const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;                      size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;                      U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */ -                    ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); +                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);                      hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;                      hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;                      ip += repLength2; @@ -498,6 +671,10 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(      return (size_t)(iend - anchor);  } +ZSTD_GEN_DFAST_FN(extDict, 4) +ZSTD_GEN_DFAST_FN(extDict, 5) +ZSTD_GEN_DFAST_FN(extDict, 6) +ZSTD_GEN_DFAST_FN(extDict, 7)  size_t ZSTD_compressBlock_doubleFast_extDict(          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -508,12 +685,12 @@ size_t ZSTD_compressBlock_doubleFast_extDict(      {      default: /* includes case 3 */      case 4 : -        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); +        return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, src, srcSize);      case 5 : -        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); +        return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, src, srcSize);      case 6 : -        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); +        return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, src, srcSize);      case 7 : -        return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); +        return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize);      }  } diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c index 96b7d48e2868..a752e6beab52 100644 --- a/lib/zstd/compress/zstd_fast.c +++ b/lib/zstd/compress/zstd_fast.c @@ -43,145 +43,294 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,  } +/* + * If you squint hard enough (and ignore repcodes), the search operation at any + * given position is broken into 4 stages: + * + * 1. Hash   (map position to hash value via input read) + * 2. Lookup (map hash val to index via hashtable read) + * 3. Load   (map index to value at that position via input read) + * 4. Compare + * + * Each of these steps involves a memory read at an address which is computed + * from the previous step. This means these steps must be sequenced and their + * latencies are cumulative. + * + * Rather than do 1->2->3->4 sequentially for a single position before moving + * onto the next, this implementation interleaves these operations across the + * next few positions: + * + * R = Repcode Read & Compare + * H = Hash + * T = Table Lookup + * M = Match Read & Compare + * + * Pos | Time --> + * ----+------------------- + * N   | ... M + * N+1 | ...   
TM
+ * N+2 |    R H   T M
+ * N+3 |         H    TM
+ * N+4 |           R H   T M
+ * N+5 |                H   ...
+ * N+6 |                  R ...
+ *
+ * This is very much analogous to the pipelining of execution in a CPU. And just
+ * like a CPU, we have to dump the pipeline when we find a match (i.e., take a
+ * branch).
+ *
+ * When this happens, we throw away our current state, and do the following prep
+ * to re-enter the loop:
+ *
+ * Pos | Time -->
+ * ----+-------------------
+ * N   | H T
+ * N+1 |  H
+ *
+ * This is also the work we do at the beginning to enter the loop initially.
+ */
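As a rough standalone illustration of the pipelining described above: the hash and table lookup for position i+1 are issued while the match compare for position i is still outstanding, so the dependent load chains of neighbouring positions overlap in an out-of-order core. toy_hash4 and toy_pipelined_search are invented names; the real kernel function follows.

    #include <stdint.h>
    #include <string.h>

    #define TOY_TABLE_BITS 12

    /* Toy 4-byte hash, standing in for ZSTD_hashPtr(). */
    static uint32_t toy_hash4(const uint8_t *p)
    {
        uint32_t v;
        memcpy(&v, p, 4);
        return (v * 2654435761u) >> (32 - TOY_TABLE_BITS);
    }

    /* Return the position of the first 4-byte match, or -1. */
    static long toy_pipelined_search(const uint8_t *src, size_t len)
    {
        uint32_t table[1u << TOY_TABLE_BITS];
        uint32_t h0, idx0;
        size_t i;

        memset(table, 0xFF, sizeof(table));   /* 0xFFFFFFFF == empty slot */
        if (len < 6)
            return -1;

        h0 = toy_hash4(src);                  /* prep: Hash + Table for i == 0 */
        idx0 = table[h0];

        for (i = 0; i + 5 < len; i++) {
            table[h0] = (uint32_t)i;          /* write back entry for i */
            if (idx0 != 0xFFFFFFFFu &&        /* Match read + compare for i */
                memcmp(src + idx0, src + i, 4) == 0)
                return (long)i;               /* found: "dump the pipeline" */
            h0 = toy_hash4(src + i + 1);      /* Hash for i+1; the compare    */
            idx0 = table[h0];                 /* above may still be in flight */
        }
        return -1;
    }

 FORCE_INLINE_TEMPLATE size_t
-ZSTD_compressBlock_fast_generic(
+ZSTD_compressBlock_fast_noDict_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize,
-        U32 const mls)
+        U32 const mls, U32 const hasStep)
 {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32* const hashTable = ms->hashTable;
     U32 const hlog = cParams->hashLog;
     /* support stepSize of 0 */
-    size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
+    size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2;
     const BYTE* const base = ms->window.base;
     const BYTE* const istart = (const BYTE*)src;
-    /* We check ip0 (ip + 0) and ip1 (ip + 1) each loop */
-    const BYTE* ip0 = istart;
-    const BYTE* ip1;
-    const BYTE* anchor = istart;
     const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
     const U32   prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
     const BYTE* const prefixStart = base + prefixStartIndex;
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
-    U32 offset_1=rep[0], offset_2=rep[1];
+
+    const BYTE* anchor = istart;
+    const BYTE* ip0 = istart;
+    const BYTE* ip1;
+    const BYTE* ip2;
+    const BYTE* ip3;
+    U32 current0;
+
+    U32 rep_offset1 = rep[0];
+    U32 rep_offset2 = rep[1];
     U32 offsetSaved = 0;
 
-    /* init */
+    size_t hash0; /* hash for ip0 */
+    size_t hash1; /* hash for ip1 */
+    U32 idx; /* match idx for ip0 */
+    U32 mval; /* src value at match idx */
+
+    U32 offcode;
+    const BYTE* match0;
+    size_t mLength;
+
+    /* ip0 and ip1 are always adjacent. The targetLength skipping and
+     * uncompressibility acceleration is applied to every other position,
+     * matching the behavior of #1562. step therefore represents the gap
+     * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
+    size_t step;
+    const BYTE* nextStep;
+    const size_t kStepIncr = (1 << (kSearchStrength - 1));
+
     DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
     ip0 += (ip0 == prefixStart);
-    ip1 = ip0 + 1;
     {   U32 const curr = (U32)(ip0 - base);
         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
         U32 const maxRep = curr - windowLow;
-        if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
-        if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+        if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0;
+        if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0;
     }
 
-    /* Main Search Loop */
-#ifdef __INTEL_COMPILER
-    /* From intel 'The vector pragma indicates that the loop should be
-     * vectorized if it is legal to do so'. 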
Can be used together with -     * #pragma ivdep (but have opted to exclude that because intel -     * warns against using it).*/ -    #pragma vector always -#endif -    while (ip1 < ilimit) {   /* < instead of <=, because check at ip0+2 */ -        size_t mLength; -        BYTE const* ip2 = ip0 + 2; -        size_t const h0 = ZSTD_hashPtr(ip0, hlog, mls); -        U32 const val0 = MEM_read32(ip0); -        size_t const h1 = ZSTD_hashPtr(ip1, hlog, mls); -        U32 const val1 = MEM_read32(ip1); -        U32 const current0 = (U32)(ip0-base); -        U32 const current1 = (U32)(ip1-base); -        U32 const matchIndex0 = hashTable[h0]; -        U32 const matchIndex1 = hashTable[h1]; -        BYTE const* repMatch = ip2 - offset_1; -        const BYTE* match0 = base + matchIndex0; -        const BYTE* match1 = base + matchIndex1; -        U32 offcode; - -#if defined(__aarch64__) -        PREFETCH_L1(ip0+256); -#endif - -        hashTable[h0] = current0;   /* update hash table */ -        hashTable[h1] = current1;   /* update hash table */ - -        assert(ip0 + 1 == ip1); - -        if ((offset_1 > 0) & (MEM_read32(repMatch) == MEM_read32(ip2))) { -            mLength = (ip2[-1] == repMatch[-1]) ? 1 : 0; -            ip0 = ip2 - mLength; -            match0 = repMatch - mLength; +    /* start each op */ +_start: /* Requires: ip0 */ + +    step = stepSize; +    nextStep = ip0 + kStepIncr; + +    /* calculate positions, ip0 - anchor == 0, so we skip step calc */ +    ip1 = ip0 + 1; +    ip2 = ip0 + step; +    ip3 = ip2 + 1; + +    if (ip3 >= ilimit) { +        goto _cleanup; +    } + +    hash0 = ZSTD_hashPtr(ip0, hlog, mls); +    hash1 = ZSTD_hashPtr(ip1, hlog, mls); + +    idx = hashTable[hash0]; + +    do { +        /* load repcode match for ip[2]*/ +        const U32 rval = MEM_read32(ip2 - rep_offset1); + +        /* write back hash table entry */ +        current0 = (U32)(ip0 - base); +        hashTable[hash0] = current0; + +        /* check repcode at ip[2] */ +        if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) { +            ip0 = ip2; +            match0 = ip0 - rep_offset1; +            mLength = ip0[-1] == match0[-1]; +            ip0 -= mLength; +            match0 -= mLength; +            offcode = STORE_REPCODE_1;              mLength += 4; -            offcode = 0;              goto _match;          } -        if ((matchIndex0 > prefixStartIndex) && MEM_read32(match0) == val0) { -            /* found a regular match */ -            goto _offset; + +        /* load match for ip[0] */ +        if (idx >= prefixStartIndex) { +            mval = MEM_read32(base + idx); +        } else { +            mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */          } -        if ((matchIndex1 > prefixStartIndex) && MEM_read32(match1) == val1) { -            /* found a regular match after one literal */ -            ip0 = ip1; -            match0 = match1; + +        /* check match at ip[0] */ +        if (MEM_read32(ip0) == mval) { +            /* found a match! 
*/              goto _offset;          } -        {   size_t const step = ((size_t)(ip0-anchor) >> (kSearchStrength - 1)) + stepSize; -            assert(step >= 2); -            ip0 += step; -            ip1 += step; -            continue; + +        /* lookup ip[1] */ +        idx = hashTable[hash1]; + +        /* hash ip[2] */ +        hash0 = hash1; +        hash1 = ZSTD_hashPtr(ip2, hlog, mls); + +        /* advance to next positions */ +        ip0 = ip1; +        ip1 = ip2; +        ip2 = ip3; + +        /* write back hash table entry */ +        current0 = (U32)(ip0 - base); +        hashTable[hash0] = current0; + +        /* load match for ip[0] */ +        if (idx >= prefixStartIndex) { +            mval = MEM_read32(base + idx); +        } else { +            mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */          } -_offset: /* Requires: ip0, match0 */ -        /* Compute the offset code */ -        offset_2 = offset_1; -        offset_1 = (U32)(ip0-match0); -        offcode = offset_1 + ZSTD_REP_MOVE; -        mLength = 4; -        /* Count the backwards match length */ -        while (((ip0>anchor) & (match0>prefixStart)) -             && (ip0[-1] == match0[-1])) { ip0--; match0--; mLength++; } /* catch up */ -_match: /* Requires: ip0, match0, offcode */ -        /* Count the forward length */ -        mLength += ZSTD_count(ip0+mLength, match0+mLength, iend); -        ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, offcode, mLength-MINMATCH); -        /* match found */ -        ip0 += mLength; -        anchor = ip0; +        /* check match at ip[0] */ +        if (MEM_read32(ip0) == mval) { +            /* found a match! */ +            goto _offset; +        } -        if (ip0 <= ilimit) { -            /* Fill Table */ -            assert(base+current0+2 > istart);  /* check base overflow */ -            hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */ -            hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); - -            if (offset_2 > 0) { /* offset_2==0 means offset_2 is invalidated */ -                while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - offset_2)) ) { -                    /* store sequence */ -                    size_t const rLength = ZSTD_count(ip0+4, ip0+4-offset_2, iend) + 4; -                    { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */ -                    hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); -                    ip0 += rLength; -                    ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, 0 /*offCode*/, rLength-MINMATCH); -                    anchor = ip0; -                    continue;   /* faster when present (confirmed on gcc-8) ... (?) */ -        }   }   } -        ip1 = ip0 + 1; -    } +        /* lookup ip[1] */ +        idx = hashTable[hash1]; + +        /* hash ip[2] */ +        hash0 = hash1; +        hash1 = ZSTD_hashPtr(ip2, hlog, mls); + +        /* advance to next positions */ +        ip0 = ip1; +        ip1 = ip2; +        ip2 = ip0 + step; +        ip3 = ip1 + step; + +        /* calculate step */ +        if (ip2 >= nextStep) { +            step++; +            PREFETCH_L1(ip1 + 64); +            PREFETCH_L1(ip1 + 128); +            nextStep += kStepIncr; +        } +    } while (ip3 < ilimit); + +_cleanup: +    /* Note that there are probably still a couple positions we could search. 
+     * However, it seems to be a meaningful performance hit to try to search +     * them. So let's not. */      /* save reps for next block */ -    rep[0] = offset_1 ? offset_1 : offsetSaved; -    rep[1] = offset_2 ? offset_2 : offsetSaved; +    rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +    rep[1] = rep_offset2 ? rep_offset2 : offsetSaved;      /* Return the last literals size */      return (size_t)(iend - anchor); + +_offset: /* Requires: ip0, idx */ + +    /* Compute the offset code. */ +    match0 = base + idx; +    rep_offset2 = rep_offset1; +    rep_offset1 = (U32)(ip0-match0); +    offcode = STORE_OFFSET(rep_offset1); +    mLength = 4; + +    /* Count the backwards match length. */ +    while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) { +        ip0--; +        match0--; +        mLength++; +    } + +_match: /* Requires: ip0, match0, offcode */ + +    /* Count the forward length. */ +    mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend); + +    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); + +    ip0 += mLength; +    anchor = ip0; + +    /* write next hash table entry */ +    if (ip1 < ip0) { +        hashTable[hash1] = (U32)(ip1 - base); +    } + +    /* Fill table and check for immediate repcode. */ +    if (ip0 <= ilimit) { +        /* Fill Table */ +        assert(base+current0+2 > istart);  /* check base overflow */ +        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */ +        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + +        if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */ +            while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) { +                /* store sequence */ +                size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4; +                { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ +                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); +                ip0 += rLength; +                ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); +                anchor = ip0; +                continue;   /* faster when present (confirmed on gcc-8) ... (?) 
*/
+    }   }   }
+
+    goto _start;
+}
+
+#define ZSTD_GEN_FAST_FN(dictMode, mls, step)                                                     \
+    static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step(                            \
+            ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],                   \
+            void const* src, size_t srcSize)                                                      \
+    {                                                                                             \
+        return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \
+    }
+
+ZSTD_GEN_FAST_FN(noDict, 4, 1)
+ZSTD_GEN_FAST_FN(noDict, 5, 1)
+ZSTD_GEN_FAST_FN(noDict, 6, 1)
+ZSTD_GEN_FAST_FN(noDict, 7, 1)
+
+ZSTD_GEN_FAST_FN(noDict, 4, 0)
+ZSTD_GEN_FAST_FN(noDict, 5, 0)
+ZSTD_GEN_FAST_FN(noDict, 6, 0)
+ZSTD_GEN_FAST_FN(noDict, 7, 0)
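The ZSTD_GEN_FAST_FN wrappers above (like ZSTD_GEN_DFAST_FN earlier in this patch) stamp out one copy of the FORCE_INLINE_TEMPLATE generic body per (mls, step) pair, so both parameters become compile-time constants inside each specialization. A minimal sketch of the same trick, using invented names (sum_generic, GEN_SUM_FN, sum_dispatch):

    #include <stddef.h>

    /* Generic worker: 'stride' is a runtime value here... */
    static inline size_t sum_generic(const int *a, size_t n, size_t stride)
    {
        size_t i, total = 0;
        for (i = 0; i < n; i += stride)
            total += (size_t)a[i];
        return total;
    }

    /* ...but every stamped wrapper passes a literal, so each specialization
     * is compiled with a constant stride the optimizer can unroll against. */
    #define GEN_SUM_FN(stride)                                    \
        static size_t sum_stride_##stride(const int *a, size_t n) \
        {                                                         \
            return sum_generic(a, n, stride);                     \
        }

    GEN_SUM_FN(1)
    GEN_SUM_FN(2)

    /* Runtime dispatch happens once, at the boundary, much like the
     * switch in ZSTD_compressBlock_fast() just below. */
    static size_t sum_dispatch(const int *a, size_t n, int accelerated)
    {
        return accelerated ? sum_stride_2(a, n) : sum_stride_1(a, n);
    }

 
 size_t ZSTD_compressBlock_fast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@@ -189,24 +338,40 @@ size_t ZSTD_compressBlock_fast(
 {
     U32 const mls = ms->cParams.minMatch;
     assert(ms->dictMatchState == NULL);
-    switch(mls)
-    {
-    default: /* includes case 3 */
-    case 4 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4);
-    case 5 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5);
-    case 6 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6);
-    case 7 :
-        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7);
+    if (ms->cParams.targetLength > 1) {
+        switch(mls)
+        {
+        default: /* includes case 3 */
+        case 4 :
+            return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize);
+        case 5 :
+            return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize);
+        case 6 :
+            return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize);
+        case 7 :
+            return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
+        }
+    } else {
+        switch(mls)
+        {
+        default: /* includes case 3 */
+        case 4 :
+            return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize);
+        case 5 :
+            return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize);
+        case 6 :
+            return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize);
+        case 7 :
+            return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
+        }
+    }
 }
 
 
 FORCE_INLINE_TEMPLATE
 size_t ZSTD_compressBlock_fast_dictMatchState_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        void const* src, size_t srcSize, U32 const mls)
+        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
 {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32* const hashTable = ms->hashTable;
@@ -242,6 +407,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
     assert(endIndex - prefixStartIndex <= maxDistance);
     (void)maxDistance; (void)endIndex;   /* these variables are not used when assert() is disabled */
 
+    (void)hasStep; /* not currently specialized on whether it's accelerated */
+
     /* ensure there will be no underflow
      * when translating a dict index into a local 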
index */      assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); @@ -272,7 +439,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(              const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;              mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;              ip++; -            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); +            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);          } else if ( (matchIndex <= prefixStartIndex) ) {              size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls);              U32 const dictMatchIndex = dictHashTable[dictHash]; @@ -292,7 +459,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(                  } /* catch up */                  offset_2 = offset_1;                  offset_1 = offset; -                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); +                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);              }          } else if (MEM_read32(match) != MEM_read32(ip)) {              /* it's not a match, and we're not going to check the dictionary */ @@ -307,7 +474,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(                   && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */              offset_2 = offset_1;              offset_1 = offset; -            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); +            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);          }          /* match found */ @@ -332,7 +499,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(                      const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend;                      size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;                      U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */ -                    ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); +                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);                      hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;                      ip += repLength2;                      anchor = ip; @@ -351,6 +518,12 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(      return (size_t)(iend - anchor);  } + +ZSTD_GEN_FAST_FN(dictMatchState, 4, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 5, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 6, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 7, 0) +  size_t ZSTD_compressBlock_fast_dictMatchState(          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],          void const* src, size_t srcSize) @@ -361,20 +534,20 @@ size_t ZSTD_compressBlock_fast_dictMatchState(      {      default: /* includes case 3 */      case 4 : -        return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4); +        return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize);      case 5 : -        return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5); +        return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize);      case 6 : -        return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6); +        return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize);      case 7 : -        return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7); +        return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize);      }  }  static size_t ZSTD_compressBlock_fast_extDict_generic(          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], -        void const* src, size_t srcSize, U32 const mls) +        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)  {      const ZSTD_compressionParameters* const cParams = &ms->cParams;      U32* const hashTable = ms->hashTable; @@ -398,11 +571,13 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(      const BYTE* const ilimit = iend - 8;      U32 offset_1=rep[0], offset_2=rep[1]; +    (void)hasStep; /* not currently specialized on whether it's accelerated */ +      DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);      /* switch to "regular" variant if extDict is invalidated due to maxDistance */      if (prefixStartIndex == dictStartIndex) -        return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, mls); +        return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);      /* Search Loop */      while (ip < ilimit) {  /* < instead of <=, because (ip+1) */ @@ -416,14 +591,14 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(          const BYTE* const repMatch = repBase + repIndex;          hashTable[h] = curr;   /* update hash table */          DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); -        assert(offset_1 <= curr +1);   /* check repIndex */ -        if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex)) +        if ( ( ((U32)((prefixStartIndex-1) 
- repIndex) >= 3) /* intentional underflow */ +             & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */             && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {              const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;              size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4;              ip++; -            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, rLength-MINMATCH); +            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength);              ip += rLength;              anchor = ip;          } else { @@ -439,7 +614,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(                  size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;                  while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */                  offset_2 = offset_1; offset_1 = offset;  /* update offset history */ -                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); +                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);                  ip += mLength;                  anchor = ip;          }   } @@ -453,12 +628,12 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(                  U32 const current2 = (U32)(ip-base);                  U32 const repIndex2 = current2 - offset_2;                  const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; -                if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex))  /* intentional overflow */ +                if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex))  /* intentional overflow */                     && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {                      const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend;                      size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;                      { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */ -                    ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, 0 /*offcode*/, repLength2-MINMATCH); +                    ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2);                      hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;                      ip += repLength2;                      anchor = ip; @@ -475,6 +650,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(      return (size_t)(iend - anchor);  } +ZSTD_GEN_FAST_FN(extDict, 4, 0) +ZSTD_GEN_FAST_FN(extDict, 5, 0) +ZSTD_GEN_FAST_FN(extDict, 6, 0) +ZSTD_GEN_FAST_FN(extDict, 7, 0)  size_t ZSTD_compressBlock_fast_extDict(          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -485,12 +664,12 @@ size_t ZSTD_compressBlock_fast_extDict(      {      default: /* includes case 3 */      case 4 : -        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); +        return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize);      case 5 : -        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); +        return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize);      case 6 : -        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); +        return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize);      case 7 : -        return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); +        return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize);      }  } diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c index fb54d4e28a2b..0298a01a7504 100644 --- a/lib/zstd/compress/zstd_lazy.c +++ b/lib/zstd/compress/zstd_lazy.c @@ -61,7 +61,7 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,   *  assumption : curr >= btlow == (curr - btmask)   *  doesn't fail */  static void -ZSTD_insertDUBT1(ZSTD_matchState_t* ms, +ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,                   U32 curr, const BYTE* inputEnd,                   U32 nbCompares, U32 btLow,                   const ZSTD_dictMode_e dictMode) @@ -151,7 +151,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,  static size_t  ZSTD_DUBT_findBetterDictMatch ( -        ZSTD_matchState_t* ms, +        const ZSTD_matchState_t* ms,          const BYTE* const ip, const BYTE* const iend,          size_t* offsetPtr,          size_t bestLength, @@ -197,8 +197,8 @@ ZSTD_DUBT_findBetterDictMatch (              U32 matchIndex = dictMatchIndex + dictIndexDelta;              if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {                  DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", -                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex); -                bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex; +                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +                bestLength = 
matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);              }              if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */                  break;   /* drop, to guarantee consistency (miss a little bit of compression) */ @@ -218,7 +218,7 @@ ZSTD_DUBT_findBetterDictMatch (      }      if (bestLength >= MINMATCH) { -        U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; +        U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;          DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",                      curr, (U32)bestLength, (U32)*offsetPtr, mIndex);      } @@ -328,7 +328,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,                  if (matchLength > matchEndIdx - matchIndex)                      matchEndIdx = matchIndex + (U32)matchLength;                  if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) -                    bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex; +                    bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);                  if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */                      if (dictMode == ZSTD_dictMatchState) {                          nbCompares = 0; /* in addition to avoiding checking any @@ -368,7 +368,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,          assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */          ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */          if (bestLength >= MINMATCH) { -            U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; +            U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;              DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",                          curr, (U32)bestLength, (U32)*offsetPtr, mIndex);          } @@ -391,91 +391,9 @@ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,      return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);  } - -static size_t -ZSTD_BtFindBestMatch_selectMLS (  ZSTD_matchState_t* ms, -                            const BYTE* ip, const BYTE* const iLimit, -                                  size_t* offsetPtr) -{ -    switch(ms->cParams.minMatch) -    { -    default : /* includes case 3 */ -    case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); -    case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); -    case 7 : -    case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); -    } -} - - -static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS ( -                        ZSTD_matchState_t* ms, -                        const BYTE* ip, const BYTE* const iLimit, -                        size_t* offsetPtr) -{ -    switch(ms->cParams.minMatch) -    { -    default : /* includes case 3 */ -    case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); -    case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); -    case 7 : -    case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); -    } -} - - -static size_t ZSTD_BtFindBestMatch_extDict_selectMLS ( -          
              ZSTD_matchState_t* ms, -                        const BYTE* ip, const BYTE* const iLimit, -                        size_t* offsetPtr) -{ -    switch(ms->cParams.minMatch) -    { -    default : /* includes case 3 */ -    case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); -    case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); -    case 7 : -    case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); -    } -} - - -  /* ********************************* -*  Hash Chain +* Dedicated dict search  ***********************************/ -#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)] - -/* Update chains up to ip (excluded) -   Assumption : always within prefix (i.e. not within extDict) */ -FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( -                        ZSTD_matchState_t* ms, -                        const ZSTD_compressionParameters* const cParams, -                        const BYTE* ip, U32 const mls) -{ -    U32* const hashTable  = ms->hashTable; -    const U32 hashLog = cParams->hashLog; -    U32* const chainTable = ms->chainTable; -    const U32 chainMask = (1 << cParams->chainLog) - 1; -    const BYTE* const base = ms->window.base; -    const U32 target = (U32)(ip - base); -    U32 idx = ms->nextToUpdate; - -    while(idx < target) { /* catch up */ -        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); -        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; -        hashTable[h] = idx; -        idx++; -    } - -    ms->nextToUpdate = target; -    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; -} - -U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { -    const ZSTD_compressionParameters* const cParams = &ms->cParams; -    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); -}  void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)  { @@ -485,7 +403,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B      U32* const chainTable = ms->chainTable;      U32 const chainSize = 1 << ms->cParams.chainLog;      U32 idx = ms->nextToUpdate; -    U32 const minChain = chainSize < target ? target - chainSize : idx; +    U32 const minChain = chainSize < target - idx ? target - chainSize : idx;      U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;      U32 const cacheSize = bucketSize - 1;      U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize; @@ -499,13 +417,12 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B      U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;      U32* const tmpHashTable = hashTable;      U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog); -    U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog; +    U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;      U32 const tmpMinChain = tmpChainSize < target ? 
target - tmpChainSize : idx; -      U32 hashIdx;      assert(ms->cParams.chainLog <= 24); -    assert(ms->cParams.hashLog >= ms->cParams.chainLog); +    assert(ms->cParams.hashLog > ms->cParams.chainLog);      assert(idx != 0);      assert(tmpMinChain <= minChain); @@ -536,7 +453,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B              if (count == cacheSize) {                  for (count = 0; count < chainLimit;) {                      if (i < minChain) { -                        if (!i || countBeyondMinChain++ > cacheSize) { +                        if (!i || ++countBeyondMinChain > cacheSize) {                              /* only allow pulling `cacheSize` number of entries                               * into the cache or chainTable beyond `minChain`,                               * to replace the entries pulled out of the @@ -592,10 +509,143 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B      ms->nextToUpdate = target;  } +/* Returns the longest match length found in the dedicated dict search structure. + * If none are longer than the argument ml, then ml will be returned. + */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts, +                                            const ZSTD_matchState_t* const dms, +                                            const BYTE* const ip, const BYTE* const iLimit, +                                            const BYTE* const prefixStart, const U32 curr, +                                            const U32 dictLimit, const size_t ddsIdx) { +    const U32 ddsLowestIndex  = dms->window.dictLimit; +    const BYTE* const ddsBase = dms->window.base; +    const BYTE* const ddsEnd  = dms->window.nextSrc; +    const U32 ddsSize         = (U32)(ddsEnd - ddsBase); +    const U32 ddsIndexDelta   = dictLimit - ddsSize; +    const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); +    const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? 
nbAttempts : bucketSize - 1; +    U32 ddsAttempt; +    U32 matchIndex; + +    for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { +        PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); +    } + +    { +        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; +        U32 const chainIndex = chainPackedPointer >> 8; + +        PREFETCH_L1(&dms->chainTable[chainIndex]); +    } + +    for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) { +        size_t currentMl=0; +        const BYTE* match; +        matchIndex = dms->hashTable[ddsIdx + ddsAttempt]; +        match = ddsBase + matchIndex; + +        if (!matchIndex) { +            return ml; +        } + +        /* guaranteed by table construction */ +        (void)ddsLowestIndex; +        assert(matchIndex >= ddsLowestIndex); +        assert(match+4 <= ddsEnd); +        if (MEM_read32(match) == MEM_read32(ip)) { +            /* assumption : matchIndex <= dictLimit-4 (by table construction) */ +            currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; +        } + +        /* save best solution */ +        if (currentMl > ml) { +            ml = currentMl; +            *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); +            if (ip+currentMl == iLimit) { +                /* best possible, avoids read overflow on next attempt */ +                return ml; +            } +        } +    } + +    { +        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; +        U32 chainIndex = chainPackedPointer >> 8; +        U32 const chainLength = chainPackedPointer & 0xFF; +        U32 const chainAttempts = nbAttempts - ddsAttempt; +        U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts; +        U32 chainAttempt; + +        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) { +            PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]); +        } + +        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) { +            size_t currentMl=0; +            const BYTE* match; +            matchIndex = dms->chainTable[chainIndex]; +            match = ddsBase + matchIndex; + +            /* guaranteed by table construction */ +            assert(matchIndex >= ddsLowestIndex); +            assert(match+4 <= ddsEnd); +            if (MEM_read32(match) == MEM_read32(ip)) { +                /* assumption : matchIndex <= dictLimit-4 (by table construction) */ +                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; +            } + +            /* save best solution */ +            if (currentMl > ml) { +                ml = currentMl; +                *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); +                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ +            } +        } +    } +    return ml; +} + + +/* ********************************* +*  Hash Chain +***********************************/ +#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)] + +/* Update chains up to ip (excluded) +   Assumption : always within prefix (i.e. 
not within extDict) */ +FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( +                        ZSTD_matchState_t* ms, +                        const ZSTD_compressionParameters* const cParams, +                        const BYTE* ip, U32 const mls) +{ +    U32* const hashTable  = ms->hashTable; +    const U32 hashLog = cParams->hashLog; +    U32* const chainTable = ms->chainTable; +    const U32 chainMask = (1 << cParams->chainLog) - 1; +    const BYTE* const base = ms->window.base; +    const U32 target = (U32)(ip - base); +    U32 idx = ms->nextToUpdate; + +    while(idx < target) { /* catch up */ +        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); +        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; +        hashTable[h] = idx; +        idx++; +    } + +    ms->nextToUpdate = target; +    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; +} + +U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { +    const ZSTD_compressionParameters* const cParams = &ms->cParams; +    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); +}  /* inlining is important to hardwire a hot branch (template emulation) */  FORCE_INLINE_TEMPLATE -size_t ZSTD_HcFindBestMatch_generic ( +size_t ZSTD_HcFindBestMatch(                          ZSTD_matchState_t* ms,                          const BYTE* const ip, const BYTE* const iLimit,                          size_t* offsetPtr, @@ -653,7 +703,7 @@ size_t ZSTD_HcFindBestMatch_generic (          /* save best solution */          if (currentMl > ml) {              ml = currentMl; -            *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE; +            *offsetPtr = STORE_OFFSET(curr - matchIndex);              if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */          } @@ -663,90 +713,8 @@ size_t ZSTD_HcFindBestMatch_generic (      assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */      if (dictMode == ZSTD_dedicatedDictSearch) { -        const U32 ddsLowestIndex  = dms->window.dictLimit; -        const BYTE* const ddsBase = dms->window.base; -        const BYTE* const ddsEnd  = dms->window.nextSrc; -        const U32 ddsSize         = (U32)(ddsEnd - ddsBase); -        const U32 ddsIndexDelta   = dictLimit - ddsSize; -        const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); -        const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? 
nbAttempts : bucketSize - 1; -        U32 ddsAttempt; - -        for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { -            PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); -        } - -        { -            U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; -            U32 const chainIndex = chainPackedPointer >> 8; - -            PREFETCH_L1(&dms->chainTable[chainIndex]); -        } - -        for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) { -            size_t currentMl=0; -            const BYTE* match; -            matchIndex = dms->hashTable[ddsIdx + ddsAttempt]; -            match = ddsBase + matchIndex; - -            if (!matchIndex) { -                return ml; -            } - -            /* guaranteed by table construction */ -            (void)ddsLowestIndex; -            assert(matchIndex >= ddsLowestIndex); -            assert(match+4 <= ddsEnd); -            if (MEM_read32(match) == MEM_read32(ip)) { -                /* assumption : matchIndex <= dictLimit-4 (by table construction) */ -                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; -            } - -            /* save best solution */ -            if (currentMl > ml) { -                ml = currentMl; -                *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE; -                if (ip+currentMl == iLimit) { -                    /* best possible, avoids read overflow on next attempt */ -                    return ml; -                } -            } -        } - -        { -            U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; -            U32 chainIndex = chainPackedPointer >> 8; -            U32 const chainLength = chainPackedPointer & 0xFF; -            U32 const chainAttempts = nbAttempts - ddsAttempt; -            U32 const chainLimit = chainAttempts > chainLength ? 
chainLength : chainAttempts; -            U32 chainAttempt; - -            for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) { -                PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]); -            } - -            for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) { -                size_t currentMl=0; -                const BYTE* match; -                matchIndex = dms->chainTable[chainIndex]; -                match = ddsBase + matchIndex; - -                /* guaranteed by table construction */ -                assert(matchIndex >= ddsLowestIndex); -                assert(match+4 <= ddsEnd); -                if (MEM_read32(match) == MEM_read32(ip)) { -                    /* assumption : matchIndex <= dictLimit-4 (by table construction) */ -                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; -                } - -                /* save best solution */ -                if (currentMl > ml) { -                    ml = currentMl; -                    *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE; -                    if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ -                } -            } -        } +        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms, +                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);      } else if (dictMode == ZSTD_dictMatchState) {          const U32* const dmsChainTable = dms->chainTable;          const U32 dmsChainSize         = (1 << dms->cParams.chainLog); @@ -770,7 +738,8 @@ size_t ZSTD_HcFindBestMatch_generic (              /* save best solution */              if (currentMl > ml) {                  ml = currentMl; -                *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE; +                assert(curr > matchIndex + dmsIndexDelta); +                *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));                  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */              } @@ -783,75 +752,725 @@ size_t ZSTD_HcFindBestMatch_generic (      return ml;  } +/* ********************************* +* (SIMD) Row-based matchfinder +***********************************/ +/* Constants for row-based hash */ +#define ZSTD_ROW_HASH_TAG_OFFSET 16     /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +#define ZSTD_ROW_HASH_TAG_BITS 8        /* nb bits to use for the tag */ +#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) +#define ZSTD_ROW_HASH_MAX_ENTRIES 64    /* absolute maximum number of entries per row, for all configurations */ + +#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1) -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS ( -                        ZSTD_matchState_t* ms, -                        const BYTE* ip, const BYTE* const iLimit, -                        size_t* offsetPtr) +typedef U64 ZSTD_VecMask;   /* Clarifies when we are interacting with a U64 representing a mask of matches */ + +/* ZSTD_VecMask_next(): + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. 
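
A standalone illustration of the software fallback that follows may help: ~val & (val - 1) isolates exactly the bits below the lowest set bit, and a SWAR population count of that mask is the trailing-zero count. A minimal sketch (hypothetical helper name, same constants as the fallback):

#include <assert.h>
#include <stdint.h>

static unsigned ctz64_sw(uint64_t val)   /* hypothetical standalone name */
{
    assert(val != 0);
    val = ~val & (val - 1);  /* ones exactly in the positions below the lowest set bit */
    /* SWAR popcount of that mask == number of trailing zeroes */
    val = val - ((val >> 1) & 0x5555555555555555ULL);
    val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
    return (unsigned)((((val + (val >> 4)) & 0x0F0F0F0F0F0F0F0FULL)
                       * 0x0101010101010101ULL) >> 56);
}

/* ctz64_sw(0x8) == 3, ctz64_sw(0x6) == 1, ctz64_sw(0x1) == 0 */
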
+ */ +static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +    assert(val != 0); +#   if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +    if (sizeof(size_t) == 4) { +        U32 mostSignificantWord = (U32)(val >> 32); +        U32 leastSignificantWord = (U32)val; +        if (leastSignificantWord == 0) { +            return 32 + (U32)__builtin_ctz(mostSignificantWord); +        } else { +            return (U32)__builtin_ctz(leastSignificantWord); +        } +    } else { +        return (U32)__builtin_ctzll(val); +    } +#   else +    /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +     * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +     */ +    val = ~val & (val - 1ULL); /* Lowest set bit mask */ +    val = val - ((val >> 1) & 0x5555555555555555); +    val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +    return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +#   endif +} + +/* ZSTD_rotateRight_*(): + * Rotates a bitfield to the right by "count" bits. + * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts + */ +FORCE_INLINE_TEMPLATE +U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +    assert(count < 64); +    count &= 0x3F; /* for fickle pattern recognition */ +    return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +} + +FORCE_INLINE_TEMPLATE +U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +    assert(count < 32); +    count &= 0x1F; /* for fickle pattern recognition */ +    return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +} + +FORCE_INLINE_TEMPLATE +U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +    assert(count < 16); +    count &= 0x0F; /* for fickle pattern recognition */ +    return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); +} + +/* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" + * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) + */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +  U32 const next = (*tagRow - 1) & rowMask; +  *tagRow = (BYTE)next; +  return next; +} + +/* ZSTD_isAligned(): + * Checks that a pointer is aligned to "align" bytes which must be a power of 2. + */ +MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { +    assert((align & (align - 1)) == 0); +    return (((size_t)ptr) & (align - 1)) == 0; +} + +/* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. 
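
To make ZSTD_row_nextIndex() above concrete: each row behaves as a ring buffer whose head index is stored in byte 0 of the tag row, and insertion walks backwards, so the newest entry always sits at the head. A toy model (16-entry row; names and sizes here are illustrative only):

#include <stdio.h>

int main(void)
{
    unsigned char head = 0;         /* lives in tagRow[0] in the real layout */
    unsigned const rowMask = 15;    /* 16 entries per row */
    unsigned i;
    for (i = 0; i < 3; i++) {
        unsigned const next = (head - 1u) & rowMask;  /* the ZSTD_row_nextIndex() step */
        head = (unsigned char)next;
        printf("insertion %u lands in slot %u\n", i, next);  /* slots 15, 14, 13 */
    }
    return 0;
}

This backwards walk is also why the match bitfield is rotated right by the head value: after rotation, bit 0 of the mask corresponds to the most recently inserted entry in the row.
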
+ */ +FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { +    PREFETCH_L1(hashTable + relRow); +    if (rowLog >= 5) { +        PREFETCH_L1(hashTable + relRow + 16); +        /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */ +    } +    PREFETCH_L1(tagTable + relRow); +    if (rowLog == 6) { +        PREFETCH_L1(tagTable + relRow + 32); +    } +    assert(rowLog == 4 || rowLog == 5 || rowLog == 6); +    assert(ZSTD_isAligned(hashTable + relRow, 64));                 /* prefetched hash row always 64-byte aligned */ +    assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */ +} + +/* ZSTD_row_fillHashCache(): + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, +                                   U32 const rowLog, U32 const mls, +                                   U32 idx, const BYTE* const iLimit)  { -    switch(ms->cParams.minMatch) -    { -    default : /* includes case 3 */ -    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); -    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); -    case 7 : -    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); +    U32 const* const hashTable = ms->hashTable; +    U16 const* const tagTable = ms->tagTable; +    U32 const hashLog = ms->rowHashLog; +    U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); +    U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + +    for (; idx < lim; ++idx) { +        U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); +        U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; +        ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); +        ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;      } + +    DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1], +                                                     ms->hashCache[2], ms->hashCache[3], ms->hashCache[4], +                                                     ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);  } +/* ZSTD_row_nextCachedHash(): + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. 
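
In other words, the cache is a small ring of hashes computed ZSTD_ROW_HASH_CACHE_SIZE positions ahead of the search, so each row can be prefetched long before it is probed. A minimal model, assuming a cache size of 8 (consistent with the 8-slot debug print above) and a stand-in hash:

#include <inttypes.h>
#include <stdio.h>

#define CACHE_SIZE 8u
#define CACHE_MASK (CACHE_SIZE - 1u)

static uint32_t toy_hash(uint32_t pos) { return pos * 2654435761u; }  /* stand-in hash */

int main(void)
{
    uint32_t cache[CACHE_SIZE];
    uint32_t idx;
    for (idx = 0; idx < CACHE_SIZE; idx++)       /* fillHashCache: prime the ring */
        cache[idx & CACHE_MASK] = toy_hash(idx);
    for (idx = 0; idx < 4; idx++) {              /* nextCachedHash: consume, then refill */
        uint32_t const hash = cache[idx & CACHE_MASK];
        cache[idx & CACHE_MASK] = toy_hash(idx + CACHE_SIZE);  /* real code prefetches idx+8's row here */
        printf("position %" PRIu32 " uses hash %08" PRIx32 "\n", idx, hash);
    }
    return 0;
}
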
+ */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +                                                  U16 const* tagTable, BYTE const* base, +                                                  U32 idx, U32 const hashLog, +                                                  U32 const rowLog, U32 const mls) +{ +    U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); +    U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; +    ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); +    {   U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +        cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash; +        return hash; +    } +} -static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS ( -                        ZSTD_matchState_t* ms, -                        const BYTE* ip, const BYTE* const iLimit, -                        size_t* offsetPtr) +/* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, +                                                        U32 updateStartIdx, U32 const updateEndIdx, +                                                        U32 const mls, U32 const rowLog, +                                                        U32 const rowMask, U32 const useCache)  { -    switch(ms->cParams.minMatch) -    { -    default : /* includes case 3 */ -    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); -    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); -    case 7 : -    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); +    U32* const hashTable = ms->hashTable; +    U16* const tagTable = ms->tagTable; +    U32 const hashLog = ms->rowHashLog; +    const BYTE* const base = ms->window.base; + +    DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); +    for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +                                  : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); +        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; +        U32* const row = hashTable + relRow; +        BYTE* tagRow = (BYTE*)(tagTable + relRow);  /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. +                                                       Explicit cast allows us to get exact desired position within each row */ +        U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +        assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +        ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; +        row[pos] = updateStartIdx;      }  } +/* ZSTD_row_update_internal(): + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. 
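
Concretely, with the constants defined in the function body below (a skip threshold of 384, then 96 head and 32 tail positions updated), the policy works out as in this small worked example:

#include <stdio.h>

int main(void)
{
    unsigned const kSkipThreshold = 384, kHead = 96, kTail = 32;  /* as in the code below */
    unsigned const idx = 1000, target = 1600;  /* a 600-position gap left by a long match */
    if (target - idx > kSkipThreshold) {
        printf("update [%u,%u) and [%u,%u), skipping %u positions\n",
               idx, idx + kHead, target - kTail, target,
               (target - kTail) - (idx + kHead));  /* 472 of the 600 are skipped */
    }
    return 0;
}
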
+ */ +FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, +                                                    U32 const mls, U32 const rowLog, +                                                    U32 const rowMask, U32 const useCache) +{ +    U32 idx = ms->nextToUpdate; +    const BYTE* const base = ms->window.base; +    const U32 target = (U32)(ip - base); +    const U32 kSkipThreshold = 384; +    const U32 kMaxMatchStartPositionsToUpdate = 96; +    const U32 kMaxMatchEndPositionsToUpdate = 32; + +    if (useCache) { +        /* Only skip positions when using hash cache, i.e. +         * if we are loading a dict, don't skip anything. +         * If we decide to skip, then we only update a set number +         * of positions at the beginning and end of the match. +         */ +        if (UNLIKELY(target - idx > kSkipThreshold)) { +            U32 const bound = idx + kMaxMatchStartPositionsToUpdate; +            ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache); +            idx = target - kMaxMatchEndPositionsToUpdate; +            ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1); +        } +    } +    assert(target >= idx); +    ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache); +    ms->nextToUpdate = target; +} -static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS ( -                        ZSTD_matchState_t* ms, -                        const BYTE* ip, const BYTE* const iLimit, -                        size_t* offsetPtr) +/* ZSTD_row_update(): + * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary + * processing. + */ +void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { +    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); +    const U32 rowMask = (1u << rowLog) - 1; +    const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + +    DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); +} + +#if defined(ZSTD_ARCH_X86_SSE2) +FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)  { -    switch(ms->cParams.minMatch) -    { -    default : /* includes case 3 */ -    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch); -    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch); -    case 7 : -    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch); +    const __m128i comparisonMask = _mm_set1_epi8((char)tag); +    int matches[4] = {0}; +    int i; +    assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4); +    for (i=0; i<nbChunks; i++) { +        const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i)); +        const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask); +        matches[i] = _mm_movemask_epi8(equalMask);      } +    if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head); +    if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head); +    assert(nbChunks == 4); +    return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);  } +#endif +/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches + 
* the hash at the nth position in a row of the tagTable. + * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield + * to match up with the actual layout of the entries within the hashTable */ +FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) +{ +    const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; +    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); +    assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); + +#if defined(ZSTD_ARCH_X86_SSE2) + +    return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); + +#else /* SW or NEON-LE */ + +# if defined(ZSTD_ARCH_ARM_NEON) +  /* This NEON path only works for little endian - otherwise use SWAR below */ +    if (MEM_isLittleEndian()) { +        if (rowEntries == 16) { +            const uint8x16_t chunk = vld1q_u8(src); +            const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +            const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +            const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +            const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +            const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +            const U16 hi = (U16)vgetq_lane_u8(t3, 8); +            const U16 lo = (U16)vgetq_lane_u8(t3, 0); +            return ZSTD_rotateRight_U16((hi << 8) | lo, head); +        } else if (rowEntries == 32) { +            const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +            const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +            const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +            const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); +            const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +            const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +            const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +            const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +            const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +            const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +            const uint8x8x2_t t3 = vuzp_u8(t2, t0); +            const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +            const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +            return ZSTD_rotateRight_U32(matches, head); +        } else { /* rowEntries == 64 */ +            const uint8x16x4_t chunk = vld4q_u8(src); +            const uint8x16_t dup = vdupq_n_u8(tag); +            const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +            const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +            const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +            const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); + +            const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +            const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +            const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +            const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +            const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +            const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +            return ZSTD_rotateRight_U64(matches, head); +        } +    } +# endif /* ZSTD_ARCH_ARM_NEON */ +    /* SWAR */ +    {   const size_t chunkSize = sizeof(size_t); +        const size_t 
shiftAmount = ((chunkSize * 8) - chunkSize); +        const size_t xFF = ~((size_t)0); +        const size_t x01 = xFF / 0xFF; +        const size_t x80 = x01 << 7; +        const size_t splatChar = tag * x01; +        ZSTD_VecMask matches = 0; +        int i = rowEntries - chunkSize; +        assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8)); +        if (MEM_isLittleEndian()) { /* runtime check so have two loops */ +            const size_t extractMagic = (xFF / 0x7F) >> chunkSize; +            do { +                size_t chunk = MEM_readST(&src[i]); +                chunk ^= splatChar; +                chunk = (((chunk | x80) - x01) | chunk) & x80; +                matches <<= chunkSize; +                matches |= (chunk * extractMagic) >> shiftAmount; +                i -= chunkSize; +            } while (i >= 0); +        } else { /* big endian: reverse bits during extraction */ +            const size_t msb = xFF ^ (xFF >> 1); +            const size_t extractMagic = (msb / 0x1FF) | msb; +            do { +                size_t chunk = MEM_readST(&src[i]); +                chunk ^= splatChar; +                chunk = (((chunk | x80) - x01) | chunk) & x80; +                matches <<= chunkSize; +                matches |= ((chunk >> 7) * extractMagic) >> shiftAmount; +                i -= chunkSize; +            } while (i >= 0); +        } +        matches = ~matches; +        if (rowEntries == 16) { +            return ZSTD_rotateRight_U16((U16)matches, head); +        } else if (rowEntries == 32) { +            return ZSTD_rotateRight_U32((U32)matches, head); +        } else { +            return ZSTD_rotateRight_U64((U64)matches, head); +        } +    } +#endif +} -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( +/* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: + *      - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" + *      - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines + *        which row to insert into. + *      - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can + *        be considered as a circular buffer with a "head" index that resides in the tagTable. + *      - Also insert the "tag" into the equivalent row and position in the tagTable. + *          - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. + *                  The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, + *                  for alignment/performance reasons, leaving some bytes unused. + * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and + *   generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. 
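
Condensed into scalar code, the whole mechanism is small: the SSE2, NEON and SWAR paths above all just build a bitfield with one bit per row entry whose tag equals the current short hash, and the search loop then visits candidates by counting trailing zeroes and clearing one bit per attempt. A minimal stand-in (head rotation omitted for brevity; __builtin_ctzll assumed available, as in the GCC path above):

#include <stdint.h>
#include <stdio.h>

/* Scalar stand-in for ZSTD_row_getMatchMask(): bit n is set iff tags[n] == tag. */
static uint64_t row_match_mask(const uint8_t* tags, uint8_t tag, unsigned rowEntries)
{
    uint64_t mask = 0;
    unsigned n;
    for (n = 0; n < rowEntries; n++)
        mask |= (uint64_t)(tags[n] == tag) << n;
    return mask;
}

int main(void)
{
    uint8_t const tags[16] = { 0x3a, 0x01, 0x3a, 0x02, 0x05, 0x3a };  /* rest zero */
    uint64_t matches = row_match_mask(tags, 0x3a, 16);
    /* Visit candidates as the matchfinder does: ctz locates the next set bit,
     * and matches &= matches - 1 clears it before the next attempt. */
    for (; matches > 0; matches &= matches - 1)
        printf("candidate in row slot %d\n", __builtin_ctzll(matches));  /* 0, 2, 5 */
    return 0;
}
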
+ */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_RowFindBestMatch(                          ZSTD_matchState_t* ms, -                        const BYTE* ip, const BYTE* const iLimit, -                        size_t* offsetPtr) +                        const BYTE* const ip, const BYTE* const iLimit, +                        size_t* offsetPtr, +                        const U32 mls, const ZSTD_dictMode_e dictMode, +                        const U32 rowLog)  { -    switch(ms->cParams.minMatch) -    { -    default : /* includes case 3 */ -    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); -    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); -    case 7 : -    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); +    U32* const hashTable = ms->hashTable; +    U16* const tagTable = ms->tagTable; +    U32* const hashCache = ms->hashCache; +    const U32 hashLog = ms->rowHashLog; +    const ZSTD_compressionParameters* const cParams = &ms->cParams; +    const BYTE* const base = ms->window.base; +    const BYTE* const dictBase = ms->window.dictBase; +    const U32 dictLimit = ms->window.dictLimit; +    const BYTE* const prefixStart = base + dictLimit; +    const BYTE* const dictEnd = dictBase + dictLimit; +    const U32 curr = (U32)(ip-base); +    const U32 maxDistance = 1U << cParams->windowLog; +    const U32 lowestValid = ms->window.lowLimit; +    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; +    const U32 isDictionary = (ms->loadedDictEnd != 0); +    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance; +    const U32 rowEntries = (1U << rowLog); +    const U32 rowMask = rowEntries - 1; +    const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ +    U32 nbAttempts = 1U << cappedSearchLog; +    size_t ml=4-1; + +    /* DMS/DDS variables that may be referenced laster */ +    const ZSTD_matchState_t* const dms = ms->dictMatchState; + +    /* Initialize the following variables to satisfy static analyzer */ +    size_t ddsIdx = 0; +    U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */ +    U32 dmsTag = 0; +    U32* dmsRow = NULL; +    BYTE* dmsTagRow = NULL; + +    if (dictMode == ZSTD_dedicatedDictSearch) { +        const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; +        {   /* Prefetch DDS hashtable entry */ +            ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG; +            PREFETCH_L1(&dms->hashTable[ddsIdx]); +        } +        ddsExtraAttempts = cParams->searchLog > rowLog ? 
1U << (cParams->searchLog - rowLog) : 0; +    } + +    if (dictMode == ZSTD_dictMatchState) { +        /* Prefetch DMS rows */ +        U32* const dmsHashTable = dms->hashTable; +        U16* const dmsTagTable = dms->tagTable; +        U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); +        U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; +        dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; +        dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow); +        dmsRow = dmsHashTable + dmsRelRow; +        ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog); +    } + +    /* Update the hashTable and tagTable up to (but not including) ip */ +    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); +    {   /* Get the hash for ip, compute the appropriate row */ +        U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); +        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; +        U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; +        U32* const row = hashTable + relRow; +        BYTE* tagRow = (BYTE*)(tagTable + relRow); +        U32 const head = *tagRow & rowMask; +        U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; +        size_t numMatches = 0; +        size_t currMatch = 0; +        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); + +        /* Cycle through the matches and prefetch */ +        for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +            U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; +            U32 const matchIndex = row[matchPos]; +            assert(numMatches < rowEntries); +            if (matchIndex < lowLimit) +                break; +            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { +                PREFETCH_L1(base + matchIndex); +            } else { +                PREFETCH_L1(dictBase + matchIndex); +            } +            matchBuffer[numMatches++] = matchIndex; +        } + +        /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop +           in ZSTD_row_update_internal() at the next search. 
*/ +        { +            U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +            tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; +            row[pos] = ms->nextToUpdate++; +        } + +        /* Return the longest match */ +        for (; currMatch < numMatches; ++currMatch) { +            U32 const matchIndex = matchBuffer[currMatch]; +            size_t currentMl=0; +            assert(matchIndex < curr); +            assert(matchIndex >= lowLimit); + +            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { +                const BYTE* const match = base + matchIndex; +                assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */ +                if (match[ml] == ip[ml])   /* potentially better */ +                    currentMl = ZSTD_count(ip, match, iLimit); +            } else { +                const BYTE* const match = dictBase + matchIndex; +                assert(match+4 <= dictEnd); +                if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */ +                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; +            } + +            /* Save best solution */ +            if (currentMl > ml) { +                ml = currentMl; +                *offsetPtr = STORE_OFFSET(curr - matchIndex); +                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ +            } +        } +    } + +    assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ +    if (dictMode == ZSTD_dedicatedDictSearch) { +        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms, +                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); +    } else if (dictMode == ZSTD_dictMatchState) { +        /* TODO: Measure and potentially add prefetching to DMS */ +        const U32 dmsLowestIndex       = dms->window.dictLimit; +        const BYTE* const dmsBase      = dms->window.base; +        const BYTE* const dmsEnd       = dms->window.nextSrc; +        const U32 dmsSize              = (U32)(dmsEnd - dmsBase); +        const U32 dmsIndexDelta        = dictLimit - dmsSize; + +        {   U32 const head = *dmsTagRow & rowMask; +            U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; +            size_t numMatches = 0; +            size_t currMatch = 0; +            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); + +            for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +                U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; +                U32 const matchIndex = dmsRow[matchPos]; +                if (matchIndex < dmsLowestIndex) +                    break; +                PREFETCH_L1(dmsBase + matchIndex); +                matchBuffer[numMatches++] = matchIndex; +            } + +            /* Return the longest match */ +            for (; currMatch < numMatches; ++currMatch) { +                U32 const matchIndex = matchBuffer[currMatch]; +                size_t currentMl=0; +                assert(matchIndex >= dmsLowestIndex); +                assert(matchIndex < curr); + +                {   const BYTE* const match = dmsBase + matchIndex; +                    assert(match+4 <= dmsEnd); +                    if (MEM_read32(match) == MEM_read32(ip)) +         
               currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; +                } + +                if (currentMl > ml) { +                    ml = currentMl; +                    assert(curr > matchIndex + dmsIndexDelta); +                    *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); +                    if (ip+currentMl == iLimit) break; +                } +            } +        }      } +    return ml;  } +/* + * Generate search functions templated on (dictMode, mls, rowLog). + * These functions are outlined for code size & compilation time. + * ZSTD_searchMax() dispatches to the correct implementation function. + * + * TODO: The start of the search function involves loading and calculating a + * bunch of constants from the ZSTD_matchState_t. These computations could be + * done in an initialization function, and saved somewhere in the match state. + * Then we could pass a pointer to the saved state instead of the match state, + * and avoid duplicate computations. + * + * TODO: Move the match re-winding into searchMax. This improves compression + * ratio, and unlocks further simplifications with the next TODO. + * + * TODO: Try moving the repcode search into searchMax. After the re-winding + * and repcode search are in searchMax, there is no more logic in the match + * finder loop that requires knowledge about the dictMode. So we should be + * able to avoid force inlining it, and we can join the extDict loop with + * the single segment loop. It should go in searchMax instead of its own + * function to avoid having multiple virtual function calls per search. + */ + +#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls +#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls +#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog + +#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE + +#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls)                                           \ +    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)(                      \ +            ZSTD_matchState_t* ms,                                                     \ +            const BYTE* ip, const BYTE* const iLimit,                                  \ +            size_t* offBasePtr)                                                        \ +    {                                                                                  \ +        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                           \ +        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \ +    }                                                                                  \ + +#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls)                                          \ +    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)(                     \ +            ZSTD_matchState_t* ms,                                                    \ +            const BYTE* ip, const BYTE* const iLimit,                                 \ +            size_t* offsetPtr)                                                        \ +    {                                                                                 \ +        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                          \ +        return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ +    }                                                         
                        \ + +#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)                                          \ +    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(                     \ +            ZSTD_matchState_t* ms,                                                             \ +            const BYTE* ip, const BYTE* const iLimit,                                          \ +            size_t* offsetPtr)                                                                 \ +    {                                                                                          \ +        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                                   \ +        assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog);                               \ +        return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \ +    }                                                                                          \ + +#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ +    X(dictMode, mls, 4)                        \ +    X(dictMode, mls, 5)                        \ +    X(dictMode, mls, 6) + +#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \ +    ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4)      \ +    ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5)      \ +    ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6) + +#define ZSTD_FOR_EACH_MLS(X, dictMode) \ +    X(dictMode, 4)                     \ +    X(dictMode, 5)                     \ +    X(dictMode, 6) + +#define ZSTD_FOR_EACH_DICT_MODE(X, ...) \ +    X(__VA_ARGS__, noDict)              \ +    X(__VA_ARGS__, extDict)             \ +    X(__VA_ARGS__, dictMatchState)      \ +    X(__VA_ARGS__, dedicatedDictSearch) + +/* Generate row search fns for each combination of (dictMode, mls, rowLog) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN) +/* Generate binary Tree search fns for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN) +/* Generate hash chain search fns for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN) + +typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e; + +#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls)                         \ +    case mls:                                                             \ +        return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls)                         \ +    case mls:                                                             \ +        return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog)                         \ +    case rowLog:                                                                   \ +        return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr); + +#define ZSTD_SWITCH_MLS(X, dictMode)   \ +    switch (mls) {                     \ +        ZSTD_FOR_EACH_MLS(X, dictMode) \ +    } + +#define ZSTD_SWITCH_ROWLOG(dictMode, mls)                                    \ +    case mls:                                                                \ +        switch (rowLog) {                                                    \ +            ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \ +        }                                                                    \ +        
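
For readability, this is what one generated, outlined dispatch target looks like once the preprocessor has run; it is hand-expanded from GEN_ZSTD_HC_SEARCH_FN above, not additional source:

/* GEN_ZSTD_HC_SEARCH_FN(noDict, 4) expands (whitespace adjusted) to: */
FORCE_NOINLINE size_t ZSTD_HcFindBestMatch_noDict_4(
        ZSTD_matchState_t* ms,
        const BYTE* ip, const BYTE* const iLimit,
        size_t* offsetPtr)
{
    assert(MAX(4, MIN(6, ms->cParams.minMatch)) == 4);
    return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
}

Because mls and dictMode are compile-time constants inside each such function, the FORCE_INLINE_TEMPLATE search body specializes completely; ZSTD_searchMax() below then reaches the right specialization through plain switch statements, avoiding the indirect calls that the comment further down notes are costly under Spectre/Meltdown mitigations.
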
ZSTD_UNREACHABLE;                                                    \ +        break; + +#define ZSTD_SWITCH_SEARCH_METHOD(dictMode)                       \ +    switch (searchMethod) {                                       \ +        case search_hashChain:                                    \ +            ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \ +            break;                                                \ +        case search_binaryTree:                                   \ +            ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \ +            break;                                                \ +        case search_rowHash:                                      \ +            ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode)         \ +            break;                                                \ +    }                                                             \ +    ZSTD_UNREACHABLE; + +/* + * Searches for the longest match at @p ip. + * Dispatches to the correct implementation function based on the + * (searchMethod, dictMode, mls, rowLog). We use switch statements + * here instead of using an indirect function call through a function + * pointer because after Spectre and Meltdown mitigations, indirect + * function calls can be very costly, especially in the kernel. + * + * NOTE: dictMode and searchMethod should be templated, so those switch + * statements should be optimized out. Only the mls & rowLog switches + * should be left. + * + * @param ms The match state. + * @param ip The position to search at. + * @param iend The end of the input data. + * @param[out] offsetPtr Stores the match offset into this pointer. + * @param mls The minimum search length, in the range [4, 6]. + * @param rowLog The row log (if applicable), in the range [4, 6]. + * @param searchMethod The search method to use (templated). + * @param dictMode The dictMode (templated). + * + * @returns The length of the longest match found, or < mls if no match is found. + * If a match is found its offset is stored in @p offsetPtr. + */ +FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( +    ZSTD_matchState_t* ms, +    const BYTE* ip, +    const BYTE* iend, +    size_t* offsetPtr, +    U32 const mls, +    U32 const rowLog, +    searchMethod_e const searchMethod, +    ZSTD_dictMode_e const dictMode) +{ +    if (dictMode == ZSTD_noDict) { +        ZSTD_SWITCH_SEARCH_METHOD(noDict) +    } else if (dictMode == ZSTD_extDict) { +        ZSTD_SWITCH_SEARCH_METHOD(extDict) +    } else if (dictMode == ZSTD_dictMatchState) { +        ZSTD_SWITCH_SEARCH_METHOD(dictMatchState) +    } else if (dictMode == ZSTD_dedicatedDictSearch) { +        ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch) +    } +    ZSTD_UNREACHABLE; +    return 0; +} +  /* *******************************  *  Common parser - lazy strategy  *********************************/ -typedef enum { search_hashChain, search_binaryTree } searchMethod_e;  FORCE_INLINE_TEMPLATE size_t  ZSTD_compressBlock_lazy_generic( @@ -865,41 +1484,13 @@ ZSTD_compressBlock_lazy_generic(      const BYTE* ip = istart;      const BYTE* anchor = istart;      const BYTE* const iend = istart + srcSize; -    const BYTE* const ilimit = iend - 8; +    const BYTE* const ilimit = (searchMethod == search_rowHash) ? 
iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;      const BYTE* const base = ms->window.base;      const U32 prefixLowestIndex = ms->window.dictLimit;      const BYTE* const prefixLowest = base + prefixLowestIndex; +    const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); +    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); -    typedef size_t (*searchMax_f)( -                        ZSTD_matchState_t* ms, -                        const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); - -    /* -     * This table is indexed first by the four ZSTD_dictMode_e values, and then -     * by the two searchMethod_e values. NULLs are placed for configurations -     * that should never occur (extDict modes go to the other implementation -     * below and there is no DDSS for binary tree search yet). -     */ -    const searchMax_f searchFuncs[4][2] = { -        { -            ZSTD_HcFindBestMatch_selectMLS, -            ZSTD_BtFindBestMatch_selectMLS -        }, -        { -            NULL, -            NULL -        }, -        { -            ZSTD_HcFindBestMatch_dictMatchState_selectMLS, -            ZSTD_BtFindBestMatch_dictMatchState_selectMLS -        }, -        { -            ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS, -            NULL -        } -    }; - -    searchMax_f const searchMax = searchFuncs[dictMode][searchMethod == search_binaryTree];      U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;      const int isDMS = dictMode == ZSTD_dictMatchState; @@ -915,11 +1506,7 @@ ZSTD_compressBlock_lazy_generic(                                       0;      const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest)); -    assert(searchMax != NULL); - -    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode); - -    /* init */ +    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);      ip += (dictAndPrefixLength == 0);      if (dictMode == ZSTD_noDict) {          U32 const curr = (U32)(ip - base); @@ -935,6 +1522,12 @@ ZSTD_compressBlock_lazy_generic(          assert(offset_2 <= dictAndPrefixLength);      } +    if (searchMethod == search_rowHash) { +        ZSTD_row_fillHashCache(ms, base, rowLog, +                            MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +                            ms->nextToUpdate, ilimit); +    } +      /* Match Loop */  #if defined(__x86_64__)      /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the @@ -944,8 +1537,9 @@ ZSTD_compressBlock_lazy_generic(  #endif      while (ip < ilimit) {          size_t matchLength=0; -        size_t offset=0; +        size_t offcode=STORE_REPCODE_1;          const BYTE* start=ip+1; +        DEBUGLOG(7, "search baseline (depth 0)");          /* check repCode */          if (isDxS) { @@ -969,9 +1563,9 @@ ZSTD_compressBlock_lazy_generic(          /* first search (depth 0) */          {   size_t offsetFound = 999999999; -            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); +            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode);              if (ml2 > matchLength) -                matchLength = ml2, start = ip, offset=offsetFound; +                matchLength = ml2, start = ip, offcode=offsetFound;          }          if (matchLength < 4) { @@ -982,14 +1576,15 @@ ZSTD_compressBlock_lazy_generic(          /* let's try to find a better solution */          if (depth>=1)          while 
(ip<ilimit) { +            DEBUGLOG(7, "search depth 1");              ip ++;              if ( (dictMode == ZSTD_noDict) -              && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { +              && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {                  size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;                  int const gain2 = (int)(mlRep * 3); -                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); +                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);                  if ((mlRep >= 4) && (gain2 > gain1)) -                    matchLength = mlRep, offset = 0, start = ip; +                    matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;              }              if (isDxS) {                  const U32 repIndex = (U32)(ip - base) - offset_1; @@ -1001,30 +1596,31 @@ ZSTD_compressBlock_lazy_generic(                      const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;                      size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;                      int const gain2 = (int)(mlRep * 3); -                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); +                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);                      if ((mlRep >= 4) && (gain2 > gain1)) -                        matchLength = mlRep, offset = 0, start = ip; +                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;                  }              }              {   size_t offset2=999999999; -                size_t const ml2 = searchMax(ms, ip, iend, &offset2); -                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */ -                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); +                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */ +                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);                  if ((ml2 >= 4) && (gain2 > gain1)) { -                    matchLength = ml2, offset = offset2, start = ip; +                    matchLength = ml2, offcode = offset2, start = ip;                      continue;   /* search a better one */              }   }              /* let's find an even better one */              if ((depth==2) && (ip<ilimit)) { +                DEBUGLOG(7, "search depth 2");                  ip ++;                  if ( (dictMode == ZSTD_noDict) -                  && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { +                  && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {                      size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;                      int const gain2 = (int)(mlRep * 4); -                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); +                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);                      if ((mlRep >= 4) && (gain2 > gain1)) -                        matchLength = mlRep, offset = 
0, start = ip; +                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;                  }                  if (isDxS) {                      const U32 repIndex = (U32)(ip - base) - offset_1; @@ -1036,46 +1632,45 @@ ZSTD_compressBlock_lazy_generic(                          const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;                          size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;                          int const gain2 = (int)(mlRep * 4); -                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); +                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);                          if ((mlRep >= 4) && (gain2 > gain1)) -                            matchLength = mlRep, offset = 0, start = ip; +                            matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;                      }                  }                  {   size_t offset2=999999999; -                    size_t const ml2 = searchMax(ms, ip, iend, &offset2); -                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */ -                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); +                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */ +                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);                      if ((ml2 >= 4) && (gain2 > gain1)) { -                        matchLength = ml2, offset = offset2, start = ip; +                        matchLength = ml2, offcode = offset2, start = ip;                          continue;              }   }   }              break;  /* nothing found : store previous solution */          }          /* NOTE: -         * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior. -         * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which -         * overflows the pointer, which is undefined behavior. +         * Pay attention that `start[-value]` can lead to strange undefined behavior +         * notably if `value` is unsigned, resulting in a large positive `-value`.           */          /* catch up */ -        if (offset) { +        if (STORED_IS_OFFSET(offcode)) {              if (dictMode == ZSTD_noDict) { -                while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest)) -                     && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) )  /* only search for offset within prefix */ +                while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +                     && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) )  /* only search for offset within prefix */                      { start--; matchLength++; }              }              if (isDxS) { -                U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); +                U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));                  const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;                  const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? 
dictLowest : prefixLowest;                  while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */              } -            offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); +            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);          }          /* store sequence */  _storeSequence: -        {   size_t const litLength = start - anchor; -            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); +        {   size_t const litLength = (size_t)(start - anchor); +            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);              anchor = ip = start + matchLength;          } @@ -1091,8 +1686,8 @@ _storeSequence:                     && (MEM_read32(repMatch) == MEM_read32(ip)) ) {                      const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;                      matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; -                    offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap offset_2 <=> offset_1 */ -                    ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); +                    offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset_2 <=> offset_1 */ +                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);                      ip += matchLength;                      anchor = ip;                      continue; @@ -1106,8 +1701,8 @@ _storeSequence:                   && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {                  /* store sequence */                  matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; -                offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */ -                ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); +                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);                  ip += matchLength;                  anchor = ip;                  continue;   /* faster when present ... (?) 
*/ @@ -1200,6 +1795,70 @@ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(      return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);  } +/* Row-based matchfinder */ +size_t ZSTD_compressBlock_lazy2_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize) +{ +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize) +{ +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_greedy_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize) +{ +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize) +{ +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy_dictMatchState_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize) +{ +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_greedy_dictMatchState_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize) +{ +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); +} + + +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize) +{ +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize) +{ +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize) +{ +    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); +}  FORCE_INLINE_TEMPLATE  size_t ZSTD_compressBlock_lazy_extDict_generic( @@ -1212,7 +1871,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(      const BYTE* ip = istart;      const BYTE* anchor = istart;      const BYTE* const iend = istart + srcSize; -    const BYTE* const ilimit = iend - 8; +    const BYTE* const ilimit = searchMethod == search_rowHash ? 
iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;      const BYTE* const base = ms->window.base;      const U32 dictLimit = ms->window.dictLimit;      const BYTE* const prefixStart = base + dictLimit; @@ -1220,18 +1879,20 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(      const BYTE* const dictEnd  = dictBase + dictLimit;      const BYTE* const dictStart  = dictBase + ms->window.lowLimit;      const U32 windowLog = ms->cParams.windowLog; - -    typedef size_t (*searchMax_f)( -                        ZSTD_matchState_t* ms, -                        const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); -    searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS; +    const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); +    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);      U32 offset_1 = rep[0], offset_2 = rep[1]; -    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic"); +    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);      /* init */      ip += (ip == prefixStart); +    if (searchMethod == search_rowHash) { +        ZSTD_row_fillHashCache(ms, base, rowLog, +                               MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +                               ms->nextToUpdate, ilimit); +    }      /* Match Loop */  #if defined(__x86_64__) @@ -1242,7 +1903,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(  #endif      while (ip < ilimit) {          size_t matchLength=0; -        size_t offset=0; +        size_t offcode=STORE_REPCODE_1;          const BYTE* start=ip+1;          U32 curr = (U32)(ip-base); @@ -1251,7 +1912,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(              const U32 repIndex = (U32)(curr+1 - offset_1);              const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;              const BYTE* const repMatch = repBase + repIndex; -            if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow))   /* intentional overflow */ +            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ +               & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */              if (MEM_read32(ip+1) == MEM_read32(repMatch)) {                  /* repcode detected we should take it */                  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; @@ -1261,9 +1923,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(          /* first search (depth 0) */          {   size_t offsetFound = 999999999; -            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); +            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict);              if (ml2 > matchLength) -                matchLength = ml2, start = ip, offset=offsetFound; +                matchLength = ml2, start = ip, offcode=offsetFound;          }          if (matchLength < 4) { @@ -1277,29 +1939,30 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(              ip ++;              curr++;              /* check repCode */ -            if (offset) { +            if (offcode) {                  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);                  const U32 repIndex = (U32)(curr - offset_1);                  const BYTE* const repBase = repIndex < dictLimit ? 
dictBase : base;                  const BYTE* const repMatch = repBase + repIndex; -                if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow))  /* intentional overflow */ +                if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments  */ +                   & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */                  if (MEM_read32(ip) == MEM_read32(repMatch)) {                      /* repcode detected */                      const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;                      size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;                      int const gain2 = (int)(repLength * 3); -                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); +                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);                      if ((repLength >= 4) && (gain2 > gain1)) -                        matchLength = repLength, offset = 0, start = ip; +                        matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;              }   }              /* search match, depth 1 */              {   size_t offset2=999999999; -                size_t const ml2 = searchMax(ms, ip, iend, &offset2); -                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */ -                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); +                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */ +                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);                  if ((ml2 >= 4) && (gain2 > gain1)) { -                    matchLength = ml2, offset = offset2, start = ip; +                    matchLength = ml2, offcode = offset2, start = ip;                      continue;   /* search a better one */              }   } @@ -1308,47 +1971,48 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(                  ip ++;                  curr++;                  /* check repCode */ -                if (offset) { +                if (offcode) {                      const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);                      const U32 repIndex = (U32)(curr - offset_1);                      const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;                      const BYTE* const repMatch = repBase + repIndex; -                    if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow))  /* intentional overflow */ +                    if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments  */ +                       & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */                      if (MEM_read32(ip) == MEM_read32(repMatch)) {                          /* repcode detected */                          const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend;                          size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;                          int const gain2 = (int)(repLength * 4); -                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); +                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);                          if ((repLength >= 4) && (gain2 > gain1)) -                            matchLength = repLength, offset = 0, start = ip; +                            matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;                  }   }                  /* search match, depth 2 */                  {   size_t offset2=999999999; -                    size_t const ml2 = searchMax(ms, ip, iend, &offset2); -                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */ -                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); +                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */ +                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);                      if ((ml2 >= 4) && (gain2 > gain1)) { -                        matchLength = ml2, offset = offset2, start = ip; +                        matchLength = ml2, offcode = offset2, start = ip;                          continue;              }   }   }              break;  /* nothing found : store previous solution */          }          /* catch up */ -        if (offset) { -            U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); +        if (STORED_IS_OFFSET(offcode)) { +            U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));              const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;              const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;              while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */ -            offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); +            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);          }          /* store sequence */  _storeSequence: -        {   size_t const litLength = start - anchor; -            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); +        {   size_t const litLength = (size_t)(start - anchor); +            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);              anchor = ip = start + matchLength;          } @@ -1359,13 +2023,14 @@ _storeSequence:              const U32 repIndex = repCurrent - offset_2;              const BYTE* const repBase = repIndex < dictLimit ? 
dictBase : base;              const BYTE* const repMatch = repBase + repIndex; -            if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow))  /* intentional overflow */ +            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments  */ +               & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */              if (MEM_read32(ip) == MEM_read32(repMatch)) {                  /* repcode detected we should take it */                  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;                  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; -                offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap offset history */ -                ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); +                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset history */ +                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);                  ip += matchLength;                  anchor = ip;                  continue;   /* faster when present ... (?) */ @@ -1412,3 +2077,26 @@ size_t ZSTD_compressBlock_btlazy2_extDict(  {      return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);  } + +size_t ZSTD_compressBlock_greedy_extDict_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize) +{ +    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); +} + +size_t ZSTD_compressBlock_lazy_extDict_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize) + +{ +    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); +} + +size_t ZSTD_compressBlock_lazy2_extDict_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize) + +{ +    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); +} diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h index 2fc5a6182134..e5bdf4df8dde 100644 --- a/lib/zstd/compress/zstd_lazy.h +++ b/lib/zstd/compress/zstd_lazy.h @@ -23,6 +23,7 @@  #define ZSTD_LAZY_DDSS_BUCKET_LOG 2  U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); +void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip);  void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); @@ -40,6 +41,15 @@ size_t ZSTD_compressBlock_lazy(  size_t ZSTD_compressBlock_greedy(          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],          void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize);  size_t ZSTD_compressBlock_btlazy2_dictMatchState(          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 
rep[ZSTD_REP_NUM], @@ -53,6 +63,15 @@ size_t ZSTD_compressBlock_lazy_dictMatchState(  size_t ZSTD_compressBlock_greedy_dictMatchState(          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],          void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dictMatchState_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dictMatchState_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize);  size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -63,6 +82,15 @@ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(  size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],          void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize);  size_t ZSTD_compressBlock_greedy_extDict(          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -73,9 +101,19 @@ size_t ZSTD_compressBlock_lazy_extDict(  size_t ZSTD_compressBlock_lazy2_extDict(          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],          void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_extDict_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_extDict_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_extDict_row( +        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +        void const* src, size_t srcSize);  size_t ZSTD_compressBlock_btlazy2_extDict(          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],          void const* src, size_t srcSize); +          #endif /* ZSTD_LAZY_H */ diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c index 8ef7e88a5add..dd86fc83e7dd 100644 --- a/lib/zstd/compress/zstd_ldm.c +++ b/lib/zstd/compress/zstd_ldm.c @@ -57,6 +57,33 @@ static void ZSTD_ldm_gear_init(ldmRollingHashState_t* state, ldmParams_t const*      }  } +/* ZSTD_ldm_gear_reset() + * Feeds [data, data + minMatchLength) into the hash without registering any + * splits. This effectively resets the hash state. This is used when skipping + * over data, either at the beginning of a block, or skipping sections. 
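+ *
+ * Illustrative sketch only (not part of this patch; `skipDest` is a
+ * hypothetical position preceded by at least minMatchLength bytes):
+ *
+ *   ZSTD_ldm_gear_reset(&hashState, skipDest - minMatchLength, minMatchLength);
+ *
+ * after which feeding resumes at skipDest, exactly as the skip path in
+ * ZSTD_ldm_generateSequences_internal() below does with `anchor`.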
+ */ +static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state, +                                BYTE const* data, size_t minMatchLength) +{ +    U64 hash = state->rolling; +    size_t n = 0; + +#define GEAR_ITER_ONCE() do {                                  \ +        hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ +        n += 1;                                                \ +    } while (0) +    while (n + 3 < minMatchLength) { +        GEAR_ITER_ONCE(); +        GEAR_ITER_ONCE(); +        GEAR_ITER_ONCE(); +        GEAR_ITER_ONCE(); +    } +    while (n < minMatchLength) { +        GEAR_ITER_ONCE(); +    } +#undef GEAR_ITER_ONCE +} +  /* ZSTD_ldm_gear_feed():   *   * Registers in the splits array all the split points found in the first @@ -132,12 +159,12 @@ size_t ZSTD_ldm_getTableSize(ldmParams_t params)      size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog);      size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize)                             + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t)); -    return params.enableLdm ? totalSize : 0; +    return params.enableLdm == ZSTD_ps_enable ? totalSize : 0;  }  size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize)  { -    return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0; +    return params.enableLdm == ZSTD_ps_enable ? (maxChunkSize / params.minMatchLength) : 0;  }  /* ZSTD_ldm_getBucket() : @@ -255,7 +282,7 @@ void ZSTD_ldm_fillHashTable(      while (ip < iend) {          size_t hashed;          unsigned n; -         +          numSplits = 0;          hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits); @@ -327,16 +354,8 @@ static size_t ZSTD_ldm_generateSequences_internal(      /* Initialize the rolling hash state with the first minMatchLength bytes */      ZSTD_ldm_gear_init(&hashState, params); -    { -        size_t n = 0; - -        while (n < minMatchLength) { -            numSplits = 0; -            n += ZSTD_ldm_gear_feed(&hashState, ip + n, minMatchLength - n, -                                    splits, &numSplits); -        } -        ip += minMatchLength; -    } +    ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength); +    ip += minMatchLength;      while (ip < ilimit) {          size_t hashed; @@ -361,6 +380,7 @@ static size_t ZSTD_ldm_generateSequences_internal(          for (n = 0; n < numSplits; n++) {              size_t forwardMatchLength = 0, backwardMatchLength = 0,                     bestMatchLength = 0, mLength; +            U32 offset;              BYTE const* const split = candidates[n].split;              U32 const checksum = candidates[n].checksum;              U32 const hash = candidates[n].hash; @@ -428,9 +448,9 @@ static size_t ZSTD_ldm_generateSequences_internal(              }              /* Match found */ +            offset = (U32)(split - base) - bestEntry->offset;              mLength = forwardMatchLength + backwardMatchLength;              { -                U32 const offset = (U32)(split - base) - bestEntry->offset;                  rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size;                  /* Out of sequence storage */ @@ -447,6 +467,21 @@ static size_t ZSTD_ldm_generateSequences_internal(              ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params);              anchor = split + forwardMatchLength; + +            /* If we find a match that ends after the data that we've hashed +             * then we have a repeating, overlapping, pattern. E.g. all zeros. 
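+             * (With this gear hash, hash = (hash << 1) + gearTab[byte], so
+             * after 64 identical input bytes the 64-bit state reaches a fixed
+             * point and a long run hashes identically at every position.)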
+             * If one repetition of the pattern matches our `stopMask` then all +             * repetitions will. We don't need to insert them all into our table, +             * only the first one. So skip over overlapping matches. +             * This is a major speed boost (20x) for compressing a single byte +             * repeated, when that byte ends up in the table. +             */ +            if (anchor > ip + hashed) { +                ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength); +                /* Continue the outer loop at anchor (ip + hashed == anchor). */ +                ip = anchor - hashed; +                break; +            }          }          ip += hashed; @@ -500,7 +535,7 @@ size_t ZSTD_ldm_generateSequences(          assert(chunkStart < iend);          /* 1. Perform overflow correction if necessary. */ -        if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) { +        if (ZSTD_window_needOverflowCorrection(ldmState->window, 0, maxDist, ldmState->loadedDictEnd, chunkStart, chunkEnd)) {              U32 const ldmHSize = 1U << params->hashLog;              U32 const correction = ZSTD_window_correctOverflow(                  &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart); @@ -544,7 +579,9 @@ size_t ZSTD_ldm_generateSequences(      return 0;  } -void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) { +void +ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) +{      while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) {          rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos;          if (srcSize <= seq->litLength) { @@ -622,12 +659,13 @@ void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) {  size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,      ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +    ZSTD_paramSwitch_e useRowMatchFinder,      void const* src, size_t srcSize)  {      const ZSTD_compressionParameters* const cParams = &ms->cParams;      unsigned const minMatch = cParams->minMatch;      ZSTD_blockCompressor const blockCompressor = -        ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms)); +        ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms));      /* Input bounds */      BYTE const* const istart = (BYTE const*)src;      BYTE const* const iend = istart + srcSize; @@ -673,8 +711,8 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,              rep[0] = sequence.offset;              /* Store the sequence */              ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, -                          sequence.offset + ZSTD_REP_MOVE, -                          sequence.matchLength - MINMATCH); +                          STORE_OFFSET(sequence.offset), +                          sequence.matchLength);              ip += sequence.matchLength;          }      } diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h index 25b25270b72e..fbc6a5e88fd7 100644 --- a/lib/zstd/compress/zstd_ldm.h +++ b/lib/zstd/compress/zstd_ldm.h @@ -63,6 +63,7 @@   */  size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,              ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +            ZSTD_paramSwitch_e useRowMatchFinder,              void const* src, size_t srcSize);  /* diff --git 
a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h index e5c24d856b0a..647f865be290 100644 --- a/lib/zstd/compress/zstd_ldm_geartab.h +++ b/lib/zstd/compress/zstd_ldm_geartab.h @@ -11,7 +11,10 @@  #ifndef ZSTD_LDM_GEARTAB_H  #define ZSTD_LDM_GEARTAB_H -static U64 ZSTD_ldm_gearTab[256] = { +#include "../common/compiler.h" /* UNUSED_ATTR */ +#include "../common/mem.h"      /* U64 */ + +static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = {      0xf5b8f72c5f77775c, 0x84935f266b7ac412, 0xb647ada9ca730ccc,      0xb065bb4b114fb1de, 0x34584e7e8c3a9fd0, 0x4e97e17c6ae26b05,      0x3a03d743bc99a604, 0xcecd042422c4044f, 0x76de76c58524259e, diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c index dfc55e3e8119..fd82acfda62f 100644 --- a/lib/zstd/compress/zstd_opt.c +++ b/lib/zstd/compress/zstd_opt.c @@ -8,25 +8,12 @@   * You may select, at your option, one of the above-listed licenses.   */ -/* - * Disable inlining for the optimal parser for the kernel build. - * It is unlikely to be used in the kernel, and where it is used - * latency shouldn't matter because it is very slow to begin with. - * We prefer a ~180KB binary size win over faster optimal parsing. - * - * TODO(https://github.com/facebook/zstd/issues/2862): - * Improve the code size of the optimal parser in general, so we - * don't need this hack for the kernel build. - */ -#define ZSTD_NO_INLINE 1 -  #include "zstd_compress_internal.h"  #include "hist.h"  #include "zstd_opt.h"  #define ZSTD_LITFREQ_ADD    2   /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ -#define ZSTD_FREQ_DIV       4   /* log factor when using previous stats to init next stats */  #define ZSTD_MAX_PRICE     (1<<30)  #define ZSTD_PREDEF_THRESHOLD 1024   /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ @@ -36,11 +23,11 @@  *  Price functions for optimal parser  ***************************************/ -#if 0    /* approximation at bit level */ +#if 0    /* approximation at bit level (for tests) */  #  define BITCOST_ACCURACY 0  #  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) -#  define WEIGHT(stat)  ((void)opt, ZSTD_bitWeight(stat)) -#elif 0  /* fractional bit accuracy */ +#  define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) +#elif 0  /* fractional bit accuracy (for tests) */  #  define BITCOST_ACCURACY 8  #  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)  #  define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) @@ -78,7 +65,7 @@ MEM_STATIC double ZSTD_fCost(U32 price)  static int ZSTD_compressedLiterals(optState_t const* const optPtr)  { -    return optPtr->literalCompressionMode != ZSTD_lcm_uncompressed; +    return optPtr->literalCompressionMode != ZSTD_ps_disable;  }  static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel) @@ -91,25 +78,46 @@ static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel)  } -/* ZSTD_downscaleStat() : - * reduce all elements in table by a factor 2^(ZSTD_FREQ_DIV+malus) - * return the resulting sum of elements */ -static U32 ZSTD_downscaleStat(unsigned* table, U32 lastEltIndex, int malus) +static U32 sum_u32(const unsigned table[], size_t nbElts) +{ +    size_t n; +    U32 total = 0; +    for (n=0; n<nbElts; n++) { +        total += table[n]; +    } +    return total; +} + +static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift)  {      U32 s, sum=0; -    DEBUGLOG(5, "ZSTD_downscaleStat (nbElts=%u)", (unsigned)lastEltIndex+1); -    
assert(ZSTD_FREQ_DIV+malus > 0 && ZSTD_FREQ_DIV+malus < 31); +    DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); +    assert(shift < 30);      for (s=0; s<lastEltIndex+1; s++) { -        table[s] = 1 + (table[s] >> (ZSTD_FREQ_DIV+malus)); +        table[s] = 1 + (table[s] >> shift);          sum += table[s];      }      return sum;  } +/* ZSTD_scaleStats() : + * reduce all elements in table if the sum is too large + * return the resulting sum of elements */ +static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) +{ +    U32 const prevsum = sum_u32(table, lastEltIndex+1); +    U32 const factor = prevsum >> logTarget; +    DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); +    assert(logTarget < 30); +    if (factor <= 1) return prevsum; +    return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); +} +  /* ZSTD_rescaleFreqs() :   * if first block (detected by optPtr->litLengthSum == 0) : init statistics   *    take hints from dictionary if there is one - *    or init from zero, using src for literals stats, or flat 1 for match symbols + *    and init from zero if there is none, + *    using src for literals stats, and baseline stats for sequence symbols   * otherwise downscale existing stats, to be used as seed for next block.   */  static void @@ -138,7 +146,7 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,                  optPtr->litSum = 0;                  for (lit=0; lit<=MaxLit; lit++) {                      U32 const scaleLog = 11;   /* scale to 2K */ -                    U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit); +                    U32 const bitCost = HUF_getNbBitsFromCTable(optPtr->symbolCosts->huf.CTable, lit);                      assert(bitCost <= scaleLog);                      optPtr->litFreq[lit] = bitCost ? 
1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;                      optPtr->litSum += optPtr->litFreq[lit]; @@ -186,14 +194,19 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,              if (compressedLiterals) {                  unsigned lit = MaxLit;                  HIST_count_simple(optPtr->litFreq, &lit, src, srcSize);   /* use raw first block to init statistics */ -                optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1); +                optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8);              } -            {   unsigned ll; -                for (ll=0; ll<=MaxLL; ll++) -                    optPtr->litLengthFreq[ll] = 1; +            {   unsigned const baseLLfreqs[MaxLL+1] = { +                    4, 2, 1, 1, 1, 1, 1, 1, +                    1, 1, 1, 1, 1, 1, 1, 1, +                    1, 1, 1, 1, 1, 1, 1, 1, +                    1, 1, 1, 1, 1, 1, 1, 1, +                    1, 1, 1, 1 +                }; +                ZSTD_memcpy(optPtr->litLengthFreq, baseLLfreqs, sizeof(baseLLfreqs)); +                optPtr->litLengthSum = sum_u32(baseLLfreqs, MaxLL+1);              } -            optPtr->litLengthSum = MaxLL+1;              {   unsigned ml;                  for (ml=0; ml<=MaxML; ml++) @@ -201,21 +214,26 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,              }              optPtr->matchLengthSum = MaxML+1; -            {   unsigned of; -                for (of=0; of<=MaxOff; of++) -                    optPtr->offCodeFreq[of] = 1; +            {   unsigned const baseOFCfreqs[MaxOff+1] = { +                    6, 2, 1, 1, 2, 3, 4, 4, +                    4, 3, 2, 1, 1, 1, 1, 1, +                    1, 1, 1, 1, 1, 1, 1, 1, +                    1, 1, 1, 1, 1, 1, 1, 1 +                }; +                ZSTD_memcpy(optPtr->offCodeFreq, baseOFCfreqs, sizeof(baseOFCfreqs)); +                optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1);              } -            optPtr->offCodeSum = MaxOff+1; +          }      } else {   /* new block : re-use previous statistics, scaled down */          if (compressedLiterals) -            optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1); -        optPtr->litLengthSum = ZSTD_downscaleStat(optPtr->litLengthFreq, MaxLL, 0); -        optPtr->matchLengthSum = ZSTD_downscaleStat(optPtr->matchLengthFreq, MaxML, 0); -        optPtr->offCodeSum = ZSTD_downscaleStat(optPtr->offCodeFreq, MaxOff, 0); +            optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +        optPtr->litLengthSum = ZSTD_scaleStats(optPtr->litLengthFreq, MaxLL, 11); +        optPtr->matchLengthSum = ZSTD_scaleStats(optPtr->matchLengthFreq, MaxML, 11); +        optPtr->offCodeSum = ZSTD_scaleStats(optPtr->offCodeFreq, MaxOff, 11);      }      ZSTD_setBasePrices(optPtr, optLevel); @@ -251,7 +269,16 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,   * cost of literalLength symbol */  static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel)  { -    if (optPtr->priceType == zop_predef) return WEIGHT(litLength, optLevel); +    assert(litLength <= ZSTD_BLOCKSIZE_MAX); +    if (optPtr->priceType == zop_predef) +        return WEIGHT(litLength, optLevel); +    /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +     * because it isn't representable in the zstd format. So instead just +     * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +     * would be all literals. 
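+     *
+     * Concretely, the fallback below evaluates to
+     *   BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, ...)
+     * i.e. exactly one bit, in BITCOST fixed-point units, on top of the
+     * price of the largest representable litLength.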
+     */ +    if (litLength == ZSTD_BLOCKSIZE_MAX) +        return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel);      /* dynamic statistics */      {   U32 const llCode = ZSTD_LLcode(litLength); @@ -264,15 +291,17 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP  /* ZSTD_getMatchPrice() :   * Provides the cost of the match part (offset + matchLength) of a sequence   * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. - * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */ + * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */  FORCE_INLINE_TEMPLATE U32 -ZSTD_getMatchPrice(U32 const offset, +ZSTD_getMatchPrice(U32 const offcode,                     U32 const matchLength,               const optState_t* const optPtr,                     int const optLevel)  {      U32 price; -    U32 const offCode = ZSTD_highbit32(offset+1); +    U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode));      U32 const mlBase = matchLength - MINMATCH;      assert(matchLength >= MINMATCH); @@ -315,8 +344,8 @@ static void ZSTD_updateStats(optState_t* const optPtr,          optPtr->litLengthSum++;      } -    /* match offset code (0-2=>repCode; 3+=>offset+2) */ -    {   U32 const offCode = ZSTD_highbit32(offsetCode+1); +    /* offset code : expected to follow storeSeq() numeric representation */ +    {   U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode));          assert(offCode <= MaxOff);          optPtr->offCodeFreq[offCode]++;          optPtr->offCodeSum++; @@ -350,7 +379,7 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length)  /* Update hashTable3 up to ip (excluded)     Assumption : always within prefix (i.e. not within extDict) */ -static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, +static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,                                                U32* nextToUpdate3,                                                const BYTE* const ip)  { @@ -376,11 +405,13 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms,  *  Binary Tree search  ***************************************/  /* ZSTD_insertBt1() : add one or multiple positions to tree. - *  ip : assumed <= iend-8 . + * @param ip assumed <= iend-8 . + * @param target The target of ZSTD_updateTree_internal() - we are filling to this position   * @return : nb of positions added */  static U32 ZSTD_insertBt1( -                ZSTD_matchState_t* ms, +                const ZSTD_matchState_t* ms,                  const BYTE* const ip, const BYTE* const iend, +                U32 const target,                  U32 const mls, const int extDict)  {      const ZSTD_compressionParameters* const cParams = &ms->cParams; @@ -403,7 +434,10 @@ static U32 ZSTD_insertBt1(      U32* smallerPtr = bt + 2*(curr&btMask);      U32* largerPtr  = smallerPtr + 1;      U32 dummy32;   /* to be nullified at the end */ -    U32 const windowLow = ms->window.lowLimit; +    /* windowLow is based on target because +     * we only need positions that will be in the window at the end of the tree update. 
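+     * (Made-up example: with windowLog=10 and target=5000, windowLow comes
+     * out near 5000 - (1<<10) = 3976, so candidate indices below that are
+     * skipped even if some of them are still inside the window at curr.)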
+     */ +    U32 const windowLow = ZSTD_getLowestMatchIndex(ms, target, cParams->windowLog);      U32 matchEndIdx = curr+8+1;      size_t bestLength = 8;      U32 nbCompares = 1U << cParams->searchLog; @@ -416,6 +450,7 @@ static U32 ZSTD_insertBt1(      DEBUGLOG(8, "ZSTD_insertBt1 (%u)", curr); +    assert(curr <= target);      assert(ip <= iend-8);   /* required for h calculation */      hashTable[h] = curr;   /* Update Hash Table */ @@ -504,7 +539,7 @@ void ZSTD_updateTree_internal(                  idx, target, dictMode);      while(idx < target) { -        U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, mls, dictMode == ZSTD_extDict); +        U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, target, mls, dictMode == ZSTD_extDict);          assert(idx < (U32)(idx + forward));          idx += forward;      } @@ -609,7 +644,7 @@ U32 ZSTD_insertBtAndGetAllMatches (                  DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",                              repCode, ll0, repOffset, repLen);                  bestLength = repLen; -                matches[mnum].off = repCode - ll0; +                matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1);  /* expect value between 1 and 3 */                  matches[mnum].len = (U32)repLen;                  mnum++;                  if ( (repLen > sufficient_len) @@ -638,7 +673,7 @@ U32 ZSTD_insertBtAndGetAllMatches (                  bestLength = mlen;                  assert(curr > matchIndex3);                  assert(mnum==0);  /* no prior solution */ -                matches[0].off = (curr - matchIndex3) + ZSTD_REP_MOVE; +                matches[0].off = STORE_OFFSET(curr - matchIndex3);                  matches[0].len = (U32)mlen;                  mnum = 1;                  if ( (mlen > sufficient_len) | @@ -647,7 +682,7 @@ U32 ZSTD_insertBtAndGetAllMatches (                      return 1;          }   }   }          /* no dictMatchState lookup: dicts don't have a populated HC3 table */ -    } +    }  /* if (mls == 3) */      hashTable[h] = curr;   /* Update Hash Table */ @@ -672,20 +707,19 @@ U32 ZSTD_insertBtAndGetAllMatches (          if (matchLength > bestLength) {              DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", -                    (U32)matchLength, curr - matchIndex, curr - matchIndex + ZSTD_REP_MOVE); +                    (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));              assert(matchEndIdx > matchIndex);              if (matchLength > matchEndIdx - matchIndex)                  matchEndIdx = matchIndex + (U32)matchLength;              bestLength = matchLength; -            matches[mnum].off = (curr - matchIndex) + ZSTD_REP_MOVE; +            matches[mnum].off = STORE_OFFSET(curr - matchIndex);              matches[mnum].len = (U32)matchLength;              mnum++;              if ( (matchLength > ZSTD_OPT_NUM)                 | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) {                  if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */                  break; /* drop, to preserve bt consistency (miss a little bit of compression) */ -            } -        } +        }   }          if (match[matchLength] < ip[matchLength]) {              /* match smaller than current */ @@ -721,18 +755,17 @@ U32 ZSTD_insertBtAndGetAllMatches (              if (matchLength > bestLength) {                  matchIndex = dictMatchIndex + dmsIndexDelta;                  DEBUGLOG(8, "found 
dms match of length %u at distance %u (offCode=%u)", -                        (U32)matchLength, curr - matchIndex, curr - matchIndex + ZSTD_REP_MOVE); +                        (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));                  if (matchLength > matchEndIdx - matchIndex)                      matchEndIdx = matchIndex + (U32)matchLength;                  bestLength = matchLength; -                matches[mnum].off = (curr - matchIndex) + ZSTD_REP_MOVE; +                matches[mnum].off = STORE_OFFSET(curr - matchIndex);                  matches[mnum].len = (U32)matchLength;                  mnum++;                  if ( (matchLength > ZSTD_OPT_NUM)                     | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) {                      break;   /* drop, to guarantee consistency (miss a little bit of compression) */ -                } -            } +            }   }              if (dictMatchIndex <= dmsBtLow) { break; }   /* beyond tree size, stop the search */              if (match[matchLength] < ip[matchLength]) { @@ -742,39 +775,91 @@ U32 ZSTD_insertBtAndGetAllMatches (                  /* match is larger than current */                  commonLengthLarger = matchLength;                  dictMatchIndex = nextPtr[0]; -            } -        } -    } +    }   }   }  /* if (dictMode == ZSTD_dictMatchState) */      assert(matchEndIdx > curr+8);      ms->nextToUpdate = matchEndIdx - 8;  /* skip repetitive patterns */      return mnum;  } - -FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( -                        ZSTD_match_t* matches,   /* store result (match found, increasing size) in this table */ -                        ZSTD_matchState_t* ms, -                        U32* nextToUpdate3, -                        const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode, -                        const U32 rep[ZSTD_REP_NUM], -                        U32 const ll0, -                        U32 const lengthToBeat) +typedef U32 (*ZSTD_getAllMatchesFn)( +    ZSTD_match_t*, +    ZSTD_matchState_t*, +    U32*, +    const BYTE*, +    const BYTE*, +    const U32 rep[ZSTD_REP_NUM], +    U32 const ll0, +    U32 const lengthToBeat); + +FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( +        ZSTD_match_t* matches, +        ZSTD_matchState_t* ms, +        U32* nextToUpdate3, +        const BYTE* ip, +        const BYTE* const iHighLimit, +        const U32 rep[ZSTD_REP_NUM], +        U32 const ll0, +        U32 const lengthToBeat, +        const ZSTD_dictMode_e dictMode, +        const U32 mls)  { -    const ZSTD_compressionParameters* const cParams = &ms->cParams; -    U32 const matchLengthSearch = cParams->minMatch; -    DEBUGLOG(8, "ZSTD_BtGetAllMatches"); -    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */ -    ZSTD_updateTree_internal(ms, ip, iHighLimit, matchLengthSearch, dictMode); -    switch(matchLengthSearch) -    { -    case 3 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 3); -    default : -    case 4 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 4); -    case 5 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 5); -    case 7 : -    case 6 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 6); +    
assert(BOUNDED(3, ms->cParams.minMatch, 6) == mls); +    DEBUGLOG(8, "ZSTD_BtGetAllMatches(dictMode=%d, mls=%u)", (int)dictMode, mls); +    if (ip < ms->window.base + ms->nextToUpdate) +        return 0;   /* skipped area */ +    ZSTD_updateTree_internal(ms, ip, iHighLimit, mls, dictMode); +    return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, mls); +} + +#define ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls) ZSTD_btGetAllMatches_##dictMode##_##mls + +#define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls)            \ +    static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)(      \ +            ZSTD_match_t* matches,                             \ +            ZSTD_matchState_t* ms,                             \ +            U32* nextToUpdate3,                                \ +            const BYTE* ip,                                    \ +            const BYTE* const iHighLimit,                      \ +            const U32 rep[ZSTD_REP_NUM],                       \ +            U32 const ll0,                                     \ +            U32 const lengthToBeat)                            \ +    {                                                          \ +        return ZSTD_btGetAllMatches_internal(                  \ +                matches, ms, nextToUpdate3, ip, iHighLimit,    \ +                rep, ll0, lengthToBeat, ZSTD_##dictMode, mls); \ +    } + +#define GEN_ZSTD_BT_GET_ALL_MATCHES(dictMode)  \ +    GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 3)  \ +    GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 4)  \ +    GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 5)  \ +    GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 6) + +GEN_ZSTD_BT_GET_ALL_MATCHES(noDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(extDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState) + +#define ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMode)  \ +    {                                            \ +        ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 3), \ +        ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 4), \ +        ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 5), \ +        ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 6)  \      } + +static ZSTD_getAllMatchesFn +ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const dictMode) +{ +    ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = { +        ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict), +        ZSTD_BT_GET_ALL_MATCHES_ARRAY(extDict), +        ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMatchState) +    }; +    U32 const mls = BOUNDED(3, ms->cParams.minMatch, 6); +    assert((U32)dictMode < 3); +    assert(mls - 3 < 4); +    return getAllMatchesFns[(int)dictMode][mls - 3];  }  /* *********************** @@ -783,16 +868,18 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (  /* Struct containing info needed to make decision about ldm inclusion */  typedef struct { -    rawSeqStore_t seqStore;         /* External match candidates store for this block */ -    U32 startPosInBlock;            /* Start position of the current match candidate */ -    U32 endPosInBlock;              /* End position of the current match candidate */ -    U32 offset;                     /* Offset of the match candidate */ +    rawSeqStore_t seqStore;   /* External match candidates store for this block */ +    U32 startPosInBlock;      /* Start position of the current match candidate */ +    U32 endPosInBlock;        /* End position of the current match candidate */ +    U32 offset;               /* Offset of the match candidate */  } ZSTD_optLdm_t;  /* 
 
 /* ***********************
@@ -783,16 +868,18 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
 /* Struct containing info needed to make decision about ldm inclusion */
 typedef struct {
-    rawSeqStore_t seqStore;         /* External match candidates store for this block */
-    U32 startPosInBlock;            /* Start position of the current match candidate */
-    U32 endPosInBlock;              /* End position of the current match candidate */
-    U32 offset;                     /* Offset of the match candidate */
+    rawSeqStore_t seqStore;   /* External match candidates store for this block */
+    U32 startPosInBlock;      /* Start position of the current match candidate */
+    U32 endPosInBlock;        /* End position of the current match candidate */
+    U32 offset;               /* Offset of the match candidate */
 } ZSTD_optLdm_t;
 
 /* ZSTD_optLdm_skipRawSeqStoreBytes():
- * Moves forward in rawSeqStore by nbBytes, which will update the fields 'pos' and 'posInSequence'.
+ * Moves forward in @rawSeqStore by @nbBytes,
+ * which will update the fields 'pos' and 'posInSequence'.
  */
-static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) {
+static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes)
+{
     U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes);
     while (currPos && rawSeqStore->pos < rawSeqStore->size) {
         rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos];
@@ -813,8 +900,10 @@ static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t
  * Calculates the beginning and end of the next match in the current block.
  * Updates 'pos' and 'posInSequence' of the ldmSeqStore.
  */
-static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock,
-                                                   U32 blockBytesRemaining) {
+static void
+ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock,
+                                       U32 blockBytesRemaining)
+{
     rawSeq currSeq;
     U32 currBlockEndPos;
     U32 literalsBytesRemaining;
@@ -826,8 +915,8 @@ static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 cu
         optLdm->endPosInBlock = UINT_MAX;
         return;
     }
-    /* Calculate appropriate bytes left in matchLength and litLength after adjusting
-       based on ldmSeqStore->posInSequence */
+    /* Calculate appropriate bytes left in matchLength and litLength
+     * after adjusting based on ldmSeqStore->posInSequence */
     currSeq = optLdm->seqStore.seq[optLdm->seqStore.pos];
     assert(optLdm->seqStore.posInSequence <= currSeq.litLength + currSeq.matchLength);
     currBlockEndPos = currPosInBlock + blockBytesRemaining;
@@ -863,15 +952,16 @@ static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 cu
 }
 
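/*
 * Illustrative model of the cursor logic in
 * ZSTD_optLdm_skipRawSeqStoreBytes() above; not part of the patch, the
 * types are simplified stand-ins. Each raw sequence covers litLength +
 * matchLength bytes of the block, and the (pos, posInSequence) cursor is
 * advanced by whole sequences until the remainder fits inside one.
 */
#include <stddef.h>

typedef struct { unsigned litLength, matchLength; } demo_rawSeq;
typedef struct {
    const demo_rawSeq* seq;
    size_t pos;             /* index of the current sequence */
    size_t posInSequence;   /* bytes already consumed within it */
    size_t size;
} demo_rawSeqStore;

static void demo_skip_bytes(demo_rawSeqStore* s, size_t nbBytes)
{
    size_t currPos = s->posInSequence + nbBytes;
    while (currPos && s->pos < s->size) {
        size_t const seqLen = (size_t)s->seq[s->pos].litLength
                            + s->seq[s->pos].matchLength;
        if (currPos >= seqLen) {
            currPos -= seqLen;            /* consume the whole sequence */
            s->pos++;
        } else {
            s->posInSequence = currPos;   /* stop inside this sequence */
            break;
        }
    }
    if (currPos == 0 || s->pos >= s->size)
        s->posInSequence = 0;
}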
 /* ZSTD_optLdm_maybeAddMatch():
- * Adds a match if it's long enough, based on it's 'matchStartPosInBlock'
- * and 'matchEndPosInBlock', into 'matches'. Maintains the correct ordering of 'matches'
+ * Adds a match if it's long enough,
+ * based on its 'matchStartPosInBlock' and 'matchEndPosInBlock',
+ * into 'matches'. Maintains the correct ordering of 'matches'.
  */
 static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
-                                      ZSTD_optLdm_t* optLdm, U32 currPosInBlock) {
-    U32 posDiff = currPosInBlock - optLdm->startPosInBlock;
+                                      const ZSTD_optLdm_t* optLdm, U32 currPosInBlock)
+{
+    U32 const posDiff = currPosInBlock - optLdm->startPosInBlock;
     /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */
-    U32 candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff;
-    U32 candidateOffCode = optLdm->offset + ZSTD_REP_MOVE;
+    U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff;
 
     /* Ensure that current block position is not outside of the match */
     if (currPosInBlock < optLdm->startPosInBlock
@@ -881,6 +971,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
     }
 
     if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) {
+        U32 const candidateOffCode = STORE_OFFSET(optLdm->offset);
         DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u",
                  candidateOffCode, candidateMatchLength, currPosInBlock);
         matches[*nbMatches].len = candidateMatchLength;
@@ -892,8 +983,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
 /* ZSTD_optLdm_processMatchCandidate():
  * Wrapper function to update ldm seq store and call ldm functions as necessary.
  */
-static void ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, ZSTD_match_t* matches, U32* nbMatches,
-                                              U32 currPosInBlock, U32 remainingBytes) {
+static void
+ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm,
+                                  ZSTD_match_t* matches, U32* nbMatches,
+                                  U32 currPosInBlock, U32 remainingBytes)
+{
     if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) {
         return;
     }
@@ -904,19 +998,19 @@ static void ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, ZSTD_match_
              * at the end of a match from the ldm seq store, and will often be some bytes
              * over beyond matchEndPosInBlock. As such, we need to correct for these "overshoots"
              */
-            U32 posOvershoot = currPosInBlock - optLdm->endPosInBlock;
+            U32 const posOvershoot = currPosInBlock - optLdm->endPosInBlock;
             ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, posOvershoot);
-        } 
+        }
         ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes);
     }
     ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock);
 }
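/*
 * Illustrative sketch of the invariant ZSTD_optLdm_maybeAddMatch()
 * preserves above; not part of the patch, demo_* names are hypothetical.
 * 'matches' is kept sorted by ascending length, so an LDM candidate is
 * only appended when it is strictly longer than the current longest match
 * and there is room left in the table.
 */
typedef struct { unsigned off, len; } demo_match;

#define DEMO_OPT_NUM 1024   /* capacity bound, standing in for ZSTD_OPT_NUM */

static void demo_maybe_add(demo_match* matches, unsigned* nbMatches,
                           unsigned candOffCode, unsigned candLen)
{
    if (*nbMatches == 0
        || (candLen > matches[*nbMatches - 1].len && *nbMatches < DEMO_OPT_NUM)) {
        matches[*nbMatches].off = candOffCode;  /* already offset-coded */
        matches[*nbMatches].len = candLen;
        (*nbMatches)++;
    }
}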
+
 /*-*******************************
 *  Optimal parser
 *********************************/
-
 static U32 ZSTD_totalLen(ZSTD_optimal_t sol)
 {
     return sol.litlen + sol.mlen;
 }
@@ -957,6 +1051,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
     const BYTE* const prefixStart = base + ms->window.dictLimit;
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
 
+    ZSTD_getAllMatchesFn getAllMatches = ZSTD_selectBtGetAllMatches(ms, dictMode);
+
     U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
     U32 const minMatch = (cParams->minMatch == 3) ? 3 : 4;
     U32 nextToUpdate3 = ms->nextToUpdate;
@@ -984,7 +1080,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
         /* find first match */
         {   U32 const litlen = (U32)(ip - anchor);
             U32 const ll0 = !litlen;
-            U32 nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, ip, iend, dictMode, rep, ll0, minMatch);
+            U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch);
             ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
                                               (U32)(ip-istart), (U32)(iend - ip));
             if (!nbMatches) { ip++; continue; }
@@ -998,18 +1094,18 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
              * in every price. We include the literal length to avoid negative
              * prices when we subtract the previous literal length.
              */
-            opt[0].price = ZSTD_litLengthPrice(litlen, optStatePtr, optLevel);
+            opt[0].price = (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel);
 
             /* large match -> immediate encoding */
             {   U32 const maxML = matches[nbMatches-1].len;
-                U32 const maxOffset = matches[nbMatches-1].off;
+                U32 const maxOffcode = matches[nbMatches-1].off;
                 DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series",
-                            nbMatches, maxML, maxOffset, (U32)(ip-prefixStart));
+                            nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart));
 
                 if (maxML > sufficient_len) {
                     lastSequence.litlen = litlen;
                     lastSequence.mlen = maxML;
-                    lastSequence.off = maxOffset;
+                    lastSequence.off = maxOffcode;
                     DEBUGLOG(6, "large match (%u>%u), immediate encoding",
                                 maxML, sufficient_len);
                     cur = 0;
@@ -1018,24 +1114,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
             }   }
 
             /* set prices for first matches starting position == 0 */
-            {   U32 const literalsPrice = opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
+            assert(opt[0].price >= 0);
+            {   U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
                 U32 pos;
                 U32 matchNb;
                 for (pos = 1; pos < minMatch; pos++) {
                     opt[pos].price = ZSTD_MAX_PRICE;   /* mlen, litlen and price will be fixed during forward scanning */
                 }
                 for (matchNb = 0; matchNb < nbMatches; matchNb++) {
-                    U32 const offset = matches[matchNb].off;
+                    U32 const offcode = matches[matchNb].off;
                     U32 const end = matches[matchNb].len;
                     for ( ; pos <= end ; pos++ ) {
-                        U32 const matchPrice = ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel);
+                        U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel);
                         U32 const sequencePrice = literalsPrice + matchPrice;
                         DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
                                     pos, ZSTD_fCost(sequencePrice));
                         opt[pos].mlen = pos;
-                        opt[pos].off = offset;
+                        opt[pos].off = offcode;
                         opt[pos].litlen = litlen;
-                        opt[pos].price = sequencePrice;
+                        opt[pos].price = (int)sequencePrice;
                 }   }
                 last_pos = pos-1;
             }
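/*
 * Illustrative sketch of why the hunks above cast prices to int; not part
 * of the patch, demo_cost() is a hypothetical stand-in for the
 * ZSTD_*Price() helpers (which return U32). Extending a literal run
 * re-prices it as "add one raw literal, swap the old litlen cost for the
 * new one", i.e. it subtracts a previous cost, so the arithmetic is done
 * in a signed type before comparing against the stored price.
 */
static unsigned demo_cost(unsigned litlen) { return 10 + 3 * litlen; }

static int demo_extend_run_price(int prevPrice, unsigned rawLitCost,
                                 unsigned litlen)
{
    return prevPrice
         + (int)rawLitCost
         + (int)demo_cost(litlen)        /* price of the grown run...      */
         - (int)demo_cost(litlen - 1);   /* ...minus the run it supersedes */
}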
@@ -1050,9 +1147,9 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
             /* Fix current position with one literal if cheaper */
             {   U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1;
                 int const price = opt[cur-1].price
-                                + ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel)
-                                + ZSTD_litLengthPrice(litlen, optStatePtr, optLevel)
-                                - ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel);
+                                + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel)
+                                + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel)
+                                - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel);
                 assert(price < 1000000000); /* overflow check */
                 if (price <= opt[cur].price) {
                     DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)",
@@ -1078,7 +1175,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
             assert(cur >= opt[cur].mlen);
             if (opt[cur].mlen != 0) {
                 U32 const prev = cur - opt[cur].mlen;
-                repcodes_t newReps = ZSTD_updateRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0);
+                repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0);
                 ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t));
             } else {
                 ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t));
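/*
 * Illustrative sketch of the by-value style of ZSTD_newRep() used above;
 * not part of the patch, and simplified to the "new real offset" case
 * only (repcode hits reorder the history instead). Returning a fresh
 * struct lets the parser keep one tentative repcode history per position
 * and commit it with a plain memcpy, as the surrounding code does.
 */
typedef struct { unsigned rep[3]; } demo_repcodes;

static demo_repcodes demo_new_rep(demo_repcodes const history,
                                  unsigned newOffset)
{
    demo_repcodes r;
    r.rep[2] = history.rep[1];   /* older entries shift down    */
    r.rep[1] = history.rep[0];
    r.rep[0] = newOffset;        /* most recent offset in front */
    return r;                    /* by value: caller copies it  */
}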
@@ -1095,11 +1192,12 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                 continue;  /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
             }
 
+            assert(opt[cur].price >= 0);
             {   U32 const ll0 = (opt[cur].mlen != 0);
                 U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
-                U32 const previousPrice = opt[cur].price;
+                U32 const previousPrice = (U32)opt[cur].price;
                 U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
-                U32 nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch);
+                U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch);
                 U32 matchNb;
 
                 ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
@@ -1137,7 +1235,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                     for (mlen = lastML; mlen >= startML; mlen--) {  /* scan downward */
                         U32 const pos = cur + mlen;
-                        int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
+                        int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
 
                         if ((pos > last_pos) || (price < opt[pos].price)) {
                             DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)",
@@ -1167,7 +1265,7 @@ _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
          * update them while traversing the sequences.
          */
         if (lastSequence.mlen != 0) {
-            repcodes_t reps = ZSTD_updateRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0);
+            repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0);
             ZSTD_memcpy(rep, &reps, sizeof(reps));
         } else {
             ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t));
@@ -1211,7 +1309,7 @@ _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
                     assert(anchor + llen <= iend);
                     ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen);
-                    ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen-MINMATCH);
+                    ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen);
                     anchor += advance;
                     ip = anchor;
             }   }
@@ -1223,38 +1321,30 @@ _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
     return (size_t)(iend - anchor);
 }
 
+static size_t ZSTD_compressBlock_opt0(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
+{
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode);
+}
+
+static size_t ZSTD_compressBlock_opt2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
+{
+    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode);
+}
 
 size_t ZSTD_compressBlock_btopt(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressBlock_btopt");
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_noDict);
+    return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
 }
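/*
 * Illustrative sketch of the wrapper pattern behind
 * ZSTD_compressBlock_opt0/opt2 above; not part of the patch, demo_* names
 * are hypothetical. Each wrapper passes optLevel as a literal into the
 * inlinable generic body, so the compiler emits two specialized copies in
 * which branches on optLevel can fold away, and the public entry points
 * then pick a copy by name instead of repeating the literal.
 */
#include <stddef.h>

static inline size_t demo_block_generic(const unsigned char* src,
                                        size_t srcSize, int const optLevel)
{
    /* stand-in body: a real parser would use optLevel to pick heuristics */
    return optLevel ? srcSize / 2 : srcSize;
}

static size_t demo_block_opt0(const unsigned char* src, size_t srcSize)
{
    return demo_block_generic(src, srcSize, 0 /* optLevel */);
}

static size_t demo_block_opt2(const unsigned char* src, size_t srcSize)
{
    return demo_block_generic(src, srcSize, 2 /* optLevel */);
}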
-
-/* used in 2-pass strategy */
-static U32 ZSTD_upscaleStat(unsigned* table, U32 lastEltIndex, int bonus)
-{
-    U32 s, sum=0;
-    assert(ZSTD_FREQ_DIV+bonus >= 0);
-    for (s=0; s<lastEltIndex+1; s++) {
-        table[s] <<= ZSTD_FREQ_DIV+bonus;
-        table[s]--;
-        sum += table[s];
-    }
-    return sum;
-}
-
-/* used in 2-pass strategy */
-MEM_STATIC void ZSTD_upscaleStats(optState_t* optPtr)
-{
-    if (ZSTD_compressedLiterals(optPtr))
-        optPtr->litSum = ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0);
-    optPtr->litLengthSum = ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL, 0);
-    optPtr->matchLengthSum = ZSTD_upscaleStat(optPtr->matchLengthFreq, MaxML, 0);
-    optPtr->offCodeSum = ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 0);
-}
 
 /* ZSTD_initStats_ultra():
  * make a first compression pass, just to seed stats with more accurate starting values.
@@ -1276,7 +1366,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
     assert(ms->window.dictLimit == ms->window.lowLimit);   /* no dictionary */
     assert(ms->window.dictLimit - ms->nextToUpdate <= 1);  /* no prefix (note: intentional overflow, defined as 2-complement) */
 
-    ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);   /* generate stats into ms->opt*/
+    ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict);   /* generate stats into ms->opt*/
 
     /* invalidate first scan from history */
     ZSTD_resetSeqStore(seqStore);
@@ -1285,8 +1375,6 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
     ms->window.lowLimit = ms->window.dictLimit;
     ms->nextToUpdate = ms->window.dictLimit;
 
-    /* re-inforce weight of collected statistics */
-    ZSTD_upscaleStats(&ms->opt);
 }
 
 size_t ZSTD_compressBlock_btultra(
@@ -1294,7 +1382,7 @@ size_t ZSTD_compressBlock_btultra(
         const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize);
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
+    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
 }
 
 size_t ZSTD_compressBlock_btultra2(
@@ -1322,35 +1410,35 @@ size_t ZSTD_compressBlock_btultra2(
         ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
     }
 
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
+    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
 }
 
 size_t ZSTD_compressBlock_btopt_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
 }
 
 size_t ZSTD_compressBlock_btultra_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
 }
 
 size_t ZSTD_compressBlock_btopt_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_extDict);
+    return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
 }
 
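/*
 * Illustrative sketch of the two-pass shape of ZSTD_initStats_ultra()
 * above; not part of the patch, demo_* names are hypothetical and the
 * "parser" is reduced to literal counting. Pass 1 runs only to populate
 * the frequency tables; its output and history are discarded before the
 * real pass, and with this patch the former "upscale stats" re-weighting
 * between the passes is dropped as well.
 */
#include <stddef.h>
#include <string.h>

typedef struct { unsigned litFreq[256]; } demo_stats;

/* stand-in for one parser pass: gathers statistics as a side effect */
static size_t demo_parse(demo_stats* st, const unsigned char* src, size_t n)
{
    size_t i;
    for (i = 0; i < n; i++)
        st->litFreq[src[i]]++;
    return n;   /* bytes consumed; a real pass would emit sequences */
}

static size_t demo_two_pass(demo_stats* st, const unsigned char* src, size_t n)
{
    memset(st, 0, sizeof(*st));
    (void)demo_parse(st, src, n);   /* pass 1: keep only the statistics */
    return demo_parse(st, src, n);  /* pass 2: prices benefit from them */
}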
 size_t ZSTD_compressBlock_btultra_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_extDict);
+    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
 }
 
 /* note : no btultra2 variant for extDict nor dictMatchState,
