diff options
Diffstat (limited to 'lib/zstd/compress/huf_compress.c')
| -rw-r--r-- | lib/zstd/compress/huf_compress.c | 999 |
1 files changed, 778 insertions, 221 deletions
diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compress.c index f76a526bfa54..0b229f5d2ae2 100644 --- a/lib/zstd/compress/huf_compress.c +++ b/lib/zstd/compress/huf_compress.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* ****************************************************************** * Huffman encoder, part of New Generation Entropy library - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * * You can contact the author at : * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -26,9 +27,9 @@ #include "hist.h" #define FSE_STATIC_LINKING_ONLY /* FSE_optimalTableLog_internal */ #include "../common/fse.h" /* header compression */ -#define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" #include "../common/error_private.h" +#include "../common/bits.h" /* ZSTD_highbit32 */ /* ************************************************************** @@ -39,17 +40,93 @@ /* ************************************************************** -* Utils +* Required declarations ****************************************************************/ -unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue) +typedef struct nodeElt_s { + U32 count; + U16 parent; + BYTE byte; + BYTE nbBits; +} nodeElt; + + +/* ************************************************************** +* Debug Traces +****************************************************************/ + +#if DEBUGLEVEL >= 2 + +static size_t showU32(const U32* arr, size_t size) { - return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); + size_t u; + for (u=0; u<size; u++) { + RAWLOG(6, " %u", arr[u]); (void)arr; + } + RAWLOG(6, " \n"); + return size; } +static size_t HUF_getNbBits(HUF_CElt elt); + +static size_t showCTableBits(const HUF_CElt* ctable, size_t size) +{ + size_t u; + for (u=0; u<size; u++) { + RAWLOG(6, " %zu", HUF_getNbBits(ctable[u])); (void)ctable; + } + RAWLOG(6, " \n"); + return size; + +} + +static size_t showHNodeSymbols(const nodeElt* hnode, size_t size) +{ + size_t u; + for (u=0; u<size; u++) { + RAWLOG(6, " %u", hnode[u].byte); (void)hnode; + } + RAWLOG(6, " \n"); + return size; +} + +static size_t showHNodeBits(const nodeElt* hnode, size_t size) +{ + size_t u; + for (u=0; u<size; u++) { + RAWLOG(6, " %u", hnode[u].nbBits); (void)hnode; + } + RAWLOG(6, " \n"); + return size; +} + +#endif + /* ******************************************************* * HUF : Huffman block compression *********************************************************/ +#define HUF_WORKSPACE_MAX_ALIGNMENT 8 + +static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePtr, size_t align) +{ + size_t const mask = align - 1; + size_t const rem = (size_t)workspace & mask; + size_t const add = (align - rem) & mask; + BYTE* const aligned = (BYTE*)workspace + add; + assert((align & (align - 1)) == 0); /* pow 2 */ + assert(align <= HUF_WORKSPACE_MAX_ALIGNMENT); + if (*workspaceSizePtr >= add) { + assert(add < align); + assert(((size_t)aligned & mask) == 0); + *workspaceSizePtr -= add; + return aligned; + } else { + *workspaceSizePtr = 0; + return NULL; + } +} + + /* HUF_compressWeights() : * Same as FSE_compress(), but dedicated to huff0's weights compression. * The use case needs much less stack memory. @@ -64,7 +141,10 @@ typedef struct { S16 norm[HUF_TABLELOG_MAX+1]; } HUF_CompressWeightsWksp; -static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightTable, size_t wtSize, void* workspace, size_t workspaceSize) +static size_t +HUF_compressWeights(void* dst, size_t dstSize, + const void* weightTable, size_t wtSize, + void* workspace, size_t workspaceSize) { BYTE* const ostart = (BYTE*) dst; BYTE* op = ostart; @@ -72,7 +152,7 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT unsigned maxSymbolValue = HUF_TABLELOG_MAX; U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER; - HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)workspace; + HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENERIC); @@ -103,6 +183,59 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT return (size_t)(op-ostart); } +static size_t HUF_getNbBits(HUF_CElt elt) +{ + return elt & 0xFF; +} + +static size_t HUF_getNbBitsFast(HUF_CElt elt) +{ + return elt; +} + +static size_t HUF_getValue(HUF_CElt elt) +{ + return elt & ~(size_t)0xFF; +} + +static size_t HUF_getValueFast(HUF_CElt elt) +{ + return elt; +} + +static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits) +{ + assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX); + *elt = nbBits; +} + +static void HUF_setValue(HUF_CElt* elt, size_t value) +{ + size_t const nbBits = HUF_getNbBits(*elt); + if (nbBits > 0) { + assert((value >> nbBits) == 0); + *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits); + } +} + +HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable) +{ + HUF_CTableHeader header; + ZSTD_memcpy(&header, ctable, sizeof(header)); + return header; +} + +static void HUF_writeCTableHeader(HUF_CElt* ctable, U32 tableLog, U32 maxSymbolValue) +{ + HUF_CTableHeader header; + HUF_STATIC_ASSERT(sizeof(ctable[0]) == sizeof(header)); + ZSTD_memset(&header, 0, sizeof(header)); + assert(tableLog < 256); + header.tableLog = (BYTE)tableLog; + assert(maxSymbolValue < 256); + header.maxSymbolValue = (BYTE)maxSymbolValue; + ZSTD_memcpy(ctable, &header, sizeof(header)); +} typedef struct { HUF_CompressWeightsWksp wksp; @@ -114,9 +247,15 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize) { + HUF_CElt const* const ct = CTable + 1; BYTE* op = (BYTE*)dst; U32 n; - HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)workspace; + HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); + + HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp)); + + assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); + assert(HUF_readCTableHeader(CTable).tableLog == huffLog); /* check conditions */ if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); @@ -127,9 +266,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, for (n=1; n<huffLog+1; n++) wksp->bitsToWeight[n] = (BYTE)(huffLog + 1 - n); for (n=0; n<maxSymbolValue; n++) - wksp->huffWeight[n] = wksp->bitsToWeight[CTable[n].nbBits]; + wksp->huffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])]; /* attempt weights compression by FSE */ + if (maxDstSize < 1) return ERROR(dstSize_tooSmall); { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) ); if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed */ op[0] = (BYTE)hSize; @@ -146,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, return ((maxSymbolValue+1)/2) + 1; } -/*! HUF_writeCTable() : - `CTable` : Huffman tree to save, using huf representation. - @return : size of saved CTable */ -size_t HUF_writeCTable (void* dst, size_t maxDstSize, - const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) -{ - HUF_WriteCTableWksp wksp; - return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); -} - size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) { @@ -163,6 +293,7 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */ U32 tableLog = 0; U32 nbSymbols = 0; + HUF_CElt* const ct = CTable + 1; /* get symbol weights */ CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize)); @@ -172,6 +303,10 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + *maxSymbolValuePtr = nbSymbols - 1; + + HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); + /* Prepare base value per rank */ { U32 n, nextRankStart = 0; for (n=1; n<=tableLog; n++) { @@ -183,13 +318,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void /* fill nbBits */ { U32 n; for (n=0; n<nbSymbols; n++) { const U32 w = huffWeight[n]; - CTable[n].nbBits = (BYTE)(tableLog + 1 - w) & -(w != 0); + HUF_setNbBits(ct + n, (BYTE)(tableLog + 1 - w) & -(w != 0)); } } /* fill val */ { U16 nbPerRank[HUF_TABLELOG_MAX+2] = {0}; /* support w=0=>n=tableLog+1 */ U16 valPerRank[HUF_TABLELOG_MAX+2] = {0}; - { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; } + { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[HUF_getNbBits(ct[n])]++; } /* determine stating value per rank */ valPerRank[tableLog+1] = 0; /* for w==0 */ { U16 min = 0; @@ -199,77 +334,74 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void min >>= 1; } } /* assign value within rank, symbol order */ - { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; } + { U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); } } - *maxSymbolValuePtr = nbSymbols - 1; return readSize; } -U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue) +U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue) { - const HUF_CElt* table = (const HUF_CElt*)symbolTable; + const HUF_CElt* const ct = CTable + 1; assert(symbolValue <= HUF_SYMBOLVALUE_MAX); - return table[symbolValue].nbBits; + if (symbolValue > HUF_readCTableHeader(CTable).maxSymbolValue) + return 0; + return (U32)HUF_getNbBits(ct[symbolValue]); } -typedef struct nodeElt_s { - U32 count; - U16 parent; - BYTE byte; - BYTE nbBits; -} nodeElt; - /* * HUF_setMaxHeight(): - * Enforces maxNbBits on the Huffman tree described in huffNode. + * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. * - * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts - * the tree to so that it is a valid canonical Huffman tree. + * It attempts to convert all nodes with nbBits > @targetNbBits + * to employ @targetNbBits instead. Then it adjusts the tree + * so that it remains a valid canonical Huffman tree. * * @pre The sum of the ranks of each symbol == 2^largestBits, * where largestBits == huffNode[lastNonNull].nbBits. * @post The sum of the ranks of each symbol == 2^largestBits, - * where largestBits is the return value <= maxNbBits. + * where largestBits is the return value (expected <= targetNbBits). * - * @param huffNode The Huffman tree modified in place to enforce maxNbBits. + * @param huffNode The Huffman tree modified in place to enforce targetNbBits. + * It's presumed sorted, from most frequent to rarest symbol. * @param lastNonNull The symbol with the lowest count in the Huffman tree. - * @param maxNbBits The maximum allowed number of bits, which the Huffman tree + * @param targetNbBits The allowed number of bits, which the Huffman tree * may not respect. After this function the Huffman tree will - * respect maxNbBits. - * @return The maximum number of bits of the Huffman tree after adjustment, - * necessarily no more than maxNbBits. + * respect targetNbBits. + * @return The maximum number of bits of the Huffman tree after adjustment. */ -static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) +static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) { const U32 largestBits = huffNode[lastNonNull].nbBits; - /* early exit : no elt > maxNbBits, so the tree is already valid. */ - if (largestBits <= maxNbBits) return largestBits; + /* early exit : no elt > targetNbBits, so the tree is already valid. */ + if (largestBits <= targetNbBits) return largestBits; + + DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); /* there are several too large elements (at least >= 2) */ { int totalCost = 0; - const U32 baseCost = 1 << (largestBits - maxNbBits); + const U32 baseCost = 1 << (largestBits - targetNbBits); int n = (int)lastNonNull; - /* Adjust any ranks > maxNbBits to maxNbBits. + /* Adjust any ranks > targetNbBits to targetNbBits. * Compute totalCost, which is how far the sum of the ranks is * we are over 2^largestBits after adjust the offending ranks. */ - while (huffNode[n].nbBits > maxNbBits) { + while (huffNode[n].nbBits > targetNbBits) { totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); - huffNode[n].nbBits = (BYTE)maxNbBits; + huffNode[n].nbBits = (BYTE)targetNbBits; n--; } - /* n stops at huffNode[n].nbBits <= maxNbBits */ - assert(huffNode[n].nbBits <= maxNbBits); - /* n end at index of smallest symbol using < maxNbBits */ - while (huffNode[n].nbBits == maxNbBits) --n; + /* n stops at huffNode[n].nbBits <= targetNbBits */ + assert(huffNode[n].nbBits <= targetNbBits); + /* n end at index of smallest symbol using < targetNbBits */ + while (huffNode[n].nbBits == targetNbBits) --n; - /* renorm totalCost from 2^largestBits to 2^maxNbBits + /* renorm totalCost from 2^largestBits to 2^targetNbBits * note : totalCost is necessarily a multiple of baseCost */ - assert((totalCost & (baseCost - 1)) == 0); - totalCost >>= (largestBits - maxNbBits); + assert(((U32)totalCost & (baseCost - 1)) == 0); + totalCost >>= (largestBits - targetNbBits); assert(totalCost > 0); /* repay normalized cost */ @@ -278,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) /* Get pos of last (smallest = lowest cum. count) symbol per rank */ ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); - { U32 currentNbBits = maxNbBits; + { U32 currentNbBits = targetNbBits; int pos; for (pos=n ; pos >= 0; pos--) { if (huffNode[pos].nbBits >= currentNbBits) continue; - currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ - rankLast[maxNbBits-currentNbBits] = (U32)pos; + currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ + rankLast[targetNbBits-currentNbBits] = (U32)pos; } } while (totalCost > 0) { /* Try to reduce the next power of 2 above totalCost because we * gain back half the rank. */ - U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; + U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { U32 const highPos = rankLast[nBitsToDecrease]; U32 const lowPos = rankLast[nBitsToDecrease-1]; @@ -330,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) rankLast[nBitsToDecrease] = noSymbol; else { rankLast[nBitsToDecrease]--; - if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) + if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ } } /* while (totalCost > 0) */ @@ -342,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) * TODO. */ while (totalCost < 0) { /* Sometimes, cost correction overshoot */ - /* special case : no rank 1 symbol (using maxNbBits-1); - * let's create one from largest rank 0 (using maxNbBits). + /* special case : no rank 1 symbol (using targetNbBits-1); + * let's create one from largest rank 0 (using targetNbBits). */ if (rankLast[1] == noSymbol) { - while (huffNode[n].nbBits == maxNbBits) n--; + while (huffNode[n].nbBits == targetNbBits) n--; huffNode[n+1].nbBits--; assert(n >= 0); rankLast[1] = (U32)(n+1); @@ -360,26 +492,122 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) } /* repay normalized cost */ } /* there are several too large elements (at least >= 2) */ - return maxNbBits; + return targetNbBits; } typedef struct { - U32 base; - U32 curr; + U16 base; + U16 curr; } rankPos; -typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; +typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; -#define RANK_POSITION_TABLE_SIZE 32 +/* Number of buckets available for HUF_sort() */ +#define RANK_POSITION_TABLE_SIZE 192 typedef struct { huffNodeTable huffNodeTbl; rankPos rankPosition[RANK_POSITION_TABLE_SIZE]; } HUF_buildCTable_wksp_tables; +/* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing. + * Strategy is to use as many buckets as possible for representing distinct + * counts while using the remainder to represent all "large" counts. + * + * To satisfy this requirement for 192 buckets, we can do the following: + * Let buckets 0-166 represent distinct counts of [0, 166] + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ +#define RANK_POSITION_MAX_COUNT_LOG 32 +#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) +#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + +/* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. + */ +static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count + : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; +} + +/* Helper swap function for HUF_quickSortPartition() */ +static void HUF_swapNodes(nodeElt* a, nodeElt* b) { + nodeElt tmp = *a; + *a = *b; + *b = tmp; +} + +/* Returns 0 if the huffNode array is not sorted by descending count */ +MEM_STATIC int HUF_isSorted(nodeElt huffNode[], U32 const maxSymbolValue1) { + U32 i; + for (i = 1; i < maxSymbolValue1; ++i) { + if (huffNode[i].count > huffNode[i-1].count) { + return 0; + } + } + return 1; +} + +/* Insertion sort by descending order */ +HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) { + int i; + int const size = high-low+1; + huffNode += low; + for (i = 1; i < size; ++i) { + nodeElt const key = huffNode[i]; + int j = i - 1; + while (j >= 0 && huffNode[j].count < key.count) { + huffNode[j + 1] = huffNode[j]; + j--; + } + huffNode[j + 1] = key; + } +} + +/* Pivot helper function for quicksort. */ +static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) { + /* Simply select rightmost element as pivot. "Better" selectors like + * median-of-three don't experimentally appear to have any benefit. + */ + U32 const pivot = arr[high].count; + int i = low - 1; + int j = low; + for ( ; j < high; j++) { + if (arr[j].count > pivot) { + i++; + HUF_swapNodes(&arr[i], &arr[j]); + } + } + HUF_swapNodes(&arr[i + 1], &arr[high]); + return i + 1; +} + +/* Classic quicksort by descending with partially iterative calls + * to reduce worst case callstack size. + */ +static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) { + int const kInsertionSortThreshold = 8; + if (high - low < kInsertionSortThreshold) { + HUF_insertionSort(arr, low, high); + return; + } + while (low < high) { + int const idx = HUF_quickSortPartition(arr, low, high); + if (idx - low < high - idx) { + HUF_simpleQuickSort(arr, low, idx - 1); + low = idx + 1; + } else { + HUF_simpleQuickSort(arr, idx + 1, high); + high = idx - 1; + } + } +} + /* * HUF_sort(): * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order. + * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket. * * @param[out] huffNode Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled. * Must have (maxSymbolValue + 1) entries. @@ -387,42 +615,51 @@ typedef struct { * @param[in] maxSymbolValue Maximum symbol value. * @param rankPosition This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries. */ -static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition) -{ - int n; - int const maxSymbolValue1 = (int)maxSymbolValue + 1; +static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) { + U32 n; + U32 const maxSymbolValue1 = maxSymbolValue+1; /* Compute base and set curr to base. - * For symbol s let lowerRank = BIT_highbit32(count[n]+1) and rank = lowerRank + 1. - * Then 2^lowerRank <= count[n]+1 <= 2^rank. + * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1. + * See HUF_getIndex to see bucketing strategy. * We attribute each symbol to lowerRank's base value, because we want to know where * each rank begins in the output, so for rank R we want to count ranks R+1 and above. */ ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE); for (n = 0; n < maxSymbolValue1; ++n) { - U32 lowerRank = BIT_highbit32(count[n] + 1); + U32 lowerRank = HUF_getIndex(count[n]); + assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1); rankPosition[lowerRank].base++; } + assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0); + /* Set up the rankPosition table */ for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) { rankPosition[n-1].base += rankPosition[n].base; rankPosition[n-1].curr = rankPosition[n-1].base; } - /* Sort */ + + /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */ for (n = 0; n < maxSymbolValue1; ++n) { U32 const c = count[n]; - U32 const r = BIT_highbit32(c+1) + 1; - U32 pos = rankPosition[r].curr++; - /* Insert into the correct position in the rank. - * We have at most 256 symbols, so this insertion should be fine. - */ - while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) { - huffNode[pos] = huffNode[pos-1]; - pos--; - } + U32 const r = HUF_getIndex(c) + 1; + U32 const pos = rankPosition[r].curr++; + assert(pos < maxSymbolValue1); huffNode[pos].count = c; huffNode[pos].byte = (BYTE)n; } + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { + int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); + HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1); + } + } + + assert(HUF_isSorted(huffNode, maxSymbolValue1)); } @@ -446,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) int lowS, lowN; int nodeNb = STARTNODE; int n, nodeRoot; + DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); /* init for parents */ nonNullRank = (int)maxSymbolValue; while(huffNode[nonNullRank].count == 0) nonNullRank--; @@ -472,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) for (n=0; n<=nonNullRank; n++) huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); + return nonNullRank; } @@ -487,6 +727,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) */ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits) { + HUF_CElt* const ct = CTable + 1; /* fill result into ctable (val, nbBits) */ int n; U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0}; @@ -502,130 +743,381 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i min >>= 1; } } for (n=0; n<alphabetSize; n++) - CTable[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */ + HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ for (n=0; n<alphabetSize; n++) - CTable[n].val = valPerRank[CTable[n].nbBits]++; /* assign value within rank, symbol order */ + HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); /* assign value within rank, symbol order */ + + HUF_writeCTableHeader(CTable, maxNbBits, maxSymbolValue); } -size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize) +size_t +HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, + void* workSpace, size_t wkspSize) { - HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)workSpace; + HUF_buildCTable_wksp_tables* const wksp_tables = + (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32)); nodeElt* const huffNode0 = wksp_tables->huffNodeTbl; nodeElt* const huffNode = huffNode0+1; int nonNullRank; + HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); + + DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); + /* safety checks */ - if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) - return ERROR(workSpace_tooSmall); + return ERROR(workSpace_tooSmall); if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) - return ERROR(maxSymbolValue_tooLarge); + return ERROR(maxSymbolValue_tooLarge); ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); /* sort, decreasing order */ HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); + DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); /* build tree */ nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); - /* enforce maxTableLog */ + /* determine and enforce maxTableLog */ maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ - HUF_buildCTableFromTree(tree, huffNode, nonNullRank, maxSymbolValue, maxNbBits); + HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits); return maxNbBits; } size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { + HUF_CElt const* ct = CTable + 1; size_t nbBits = 0; int s; for (s = 0; s <= (int)maxSymbolValue; ++s) { - nbBits += CTable[s].nbBits * count[s]; + nbBits += HUF_getNbBits(ct[s]) * count[s]; } return nbBits >> 3; } int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { - int bad = 0; - int s; - for (s = 0; s <= (int)maxSymbolValue; ++s) { - bad |= (count[s] != 0) & (CTable[s].nbBits == 0); - } - return !bad; + HUF_CTableHeader header = HUF_readCTableHeader(CTable); + HUF_CElt const* ct = CTable + 1; + int bad = 0; + int s; + + assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); + + if (header.maxSymbolValue < maxSymbolValue) + return 0; + + for (s = 0; s <= (int)maxSymbolValue; ++s) { + bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); + } + return !bad; } size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +/* HUF_CStream_t: + * Huffman uses its own BIT_CStream_t implementation. + * There are three major differences from BIT_CStream_t: + * 1. HUF_addBits() takes a HUF_CElt (size_t) which is + * the pair (nbBits, value) in the format: + * format: + * - Bits [0, 4) = nbBits + * - Bits [4, 64 - nbBits) = 0 + * - Bits [64 - nbBits, 64) = value + * 2. The bitContainer is built from the upper bits and + * right shifted. E.g. to add a new value of N bits + * you right shift the bitContainer by N, then or in + * the new value into the N upper bits. + * 3. The bitstream has two bit containers. You can add + * bits to the second container and merge them into + * the first container. + */ + +#define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8) + +typedef struct { + size_t bitContainer[2]; + size_t bitPos[2]; + + BYTE* startPtr; + BYTE* ptr; + BYTE* endPtr; +} HUF_CStream_t; + +/*! HUF_initCStream(): + * Initializes the bitstream. + * @returns 0 or an error code. + */ +static size_t HUF_initCStream(HUF_CStream_t* bitC, + void* startPtr, size_t dstCapacity) +{ + ZSTD_memset(bitC, 0, sizeof(*bitC)); + bitC->startPtr = (BYTE*)startPtr; + bitC->ptr = bitC->startPtr; + bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]); + if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall); + return 0; +} + +/*! HUF_addBits(): + * Adds the symbol stored in HUF_CElt elt to the bitstream. + * + * @param elt The element we're adding. This is a (nbBits, value) pair. + * See the HUF_CStream_t docs for the format. + * @param idx Insert into the bitstream at this idx. + * @param kFast This is a template parameter. If the bitstream is guaranteed + * to have at least 4 unused bits after this call it may be 1, + * otherwise it must be 0. HUF_addBits() is faster when fast is set. + */ +FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast) +{ + assert(idx <= 1); + assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX); + /* This is efficient on x86-64 with BMI2 because shrx + * only reads the low 6 bits of the register. The compiler + * knows this and elides the mask. When fast is set, + * every operation can use the same value loaded from elt. + */ + bitC->bitContainer[idx] >>= HUF_getNbBits(elt); + bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt); + /* We only read the low 8 bits of bitC->bitPos[idx] so it + * doesn't matter that the high bits have noise from the value. + */ + bitC->bitPos[idx] += HUF_getNbBitsFast(elt); + assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); + /* The last 4-bits of elt are dirty if fast is set, + * so we must not be overwriting bits that have already been + * inserted into the bit container. + */ +#if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); + size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. */ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); + /* We didn't overwrite any bits in the bit container. */ + assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); + (void)dirtyBits; + } +#endif +} + +FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC) +{ + bitC->bitContainer[1] = 0; + bitC->bitPos[1] = 0; +} + +/*! HUF_mergeIndex1() : + * Merges the bit container @ index 1 into the bit container @ index 0 + * and zeros the bit container @ index 1. + */ +FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC) +{ + assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER); + bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF); + bitC->bitContainer[0] |= bitC->bitContainer[1]; + bitC->bitPos[0] += bitC->bitPos[1]; + assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER); +} + +/*! HUF_flushBits() : +* Flushes the bits in the bit container @ index 0. +* +* @post bitPos will be < 8. +* @param kFast If kFast is set then we must know a-priori that +* the bit container will not overflow. +*/ +FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast) +{ + /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */ + size_t const nbBits = bitC->bitPos[0] & 0xFF; + size_t const nbBytes = nbBits >> 3; + /* The top nbBits bits of bitContainer are the ones we need. */ + size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits); + /* Mask bitPos to account for the bytes we consumed. */ + bitC->bitPos[0] &= 7; + assert(nbBits > 0); + assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8); + assert(bitC->ptr <= bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitContainer); + bitC->ptr += nbBytes; + assert(!kFast || bitC->ptr <= bitC->endPtr); + if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr; + /* bitContainer doesn't need to be modified because the leftover + * bits are already the top bitPos bits. And we don't care about + * noise in the lower values. + */ +} + +/*! HUF_endMark() + * @returns The Huffman stream end mark: A 1-bit value = 1. + */ +static HUF_CElt HUF_endMark(void) +{ + HUF_CElt endMark; + HUF_setNbBits(&endMark, 1); + HUF_setValue(&endMark, 1); + return endMark; +} + +/*! HUF_closeCStream() : + * @return Size of CStream, in bytes, + * or 0 if it could not fit into dstBuffer */ +static size_t HUF_closeCStream(HUF_CStream_t* bitC) +{ + HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0); + HUF_flushBits(bitC, /* kFast */ 0); + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ + return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } +} + FORCE_INLINE_TEMPLATE void -HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable) +HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast) { - BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); + HUF_addBits(bitCPtr, CTable[symbol], idx, fast); } -#define HUF_FLUSHBITS(s) BIT_flushBits(s) +FORCE_INLINE_TEMPLATE void +HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC, + const BYTE* ip, size_t srcSize, + const HUF_CElt* ct, + int kUnroll, int kFastFlush, int kLastFast) +{ + /* Join to kUnroll */ + int n = (int)srcSize; + int rem = n % kUnroll; + if (rem > 0) { + for (; rem > 0; --rem) { + HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0); + } + HUF_flushBits(bitC, kFastFlush); + } + assert(n % kUnroll == 0); -#define HUF_FLUSHBITS_1(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream) + /* Join to 2 * kUnroll */ + if (n % (2 * kUnroll)) { + int u; + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast); + HUF_flushBits(bitC, kFastFlush); + n -= kUnroll; + } + assert(n % (2 * kUnroll) == 0); + + for (; n>0; n-= 2 * kUnroll) { + /* Encode kUnroll symbols into the bitstream @ index 0. */ + int u; + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast); + HUF_flushBits(bitC, kFastFlush); + /* Encode kUnroll symbols into the bitstream @ index 1. + * This allows us to start filling the bit container + * without any data dependencies. + */ + HUF_zeroIndex1(bitC); + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast); + /* Merge bitstream @ index 1 into the bitstream @ index 0 */ + HUF_mergeIndex1(bitC); + HUF_flushBits(bitC, kFastFlush); + } + assert(n == 0); + +} + +/* + * Returns a tight upper bound on the output space needed by Huffman + * with 8 bytes buffer to handle over-writes. If the output is at least + * this large we don't need to do bounds checks during Huffman encoding. + */ +static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog) +{ + return ((srcSize * tableLog) >> 3) + 8; +} -#define HUF_FLUSHBITS_2(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream) FORCE_INLINE_TEMPLATE size_t HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { + U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; + HUF_CElt const* ct = CTable + 1; const BYTE* ip = (const BYTE*) src; BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstSize; - BYTE* op = ostart; - size_t n; - BIT_CStream_t bitC; + HUF_CStream_t bitC; /* init */ if (dstSize < 8) return 0; /* not enough space to compress */ - { size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op)); + { BYTE* op = ostart; + size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); if (HUF_isError(initErr)) return 0; } - n = srcSize & ~3; /* join to mod 4 */ - switch (srcSize & 3) - { - case 3: - HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); - HUF_FLUSHBITS_2(&bitC); - ZSTD_FALLTHROUGH; - case 2: - HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); - HUF_FLUSHBITS_1(&bitC); - ZSTD_FALLTHROUGH; - case 1: - HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); - HUF_FLUSHBITS(&bitC); - ZSTD_FALLTHROUGH; - case 0: ZSTD_FALLTHROUGH; - default: break; - } - - for (; n>0; n-=4) { /* note : n&3==0 at this stage */ - HUF_encodeSymbol(&bitC, ip[n- 1], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 2], CTable); - HUF_FLUSHBITS_2(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 3], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 4], CTable); - HUF_FLUSHBITS(&bitC); - } - - return BIT_closeCStream(&bitC); + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 2 : 4, /* kFast */ 0, /* kLastFast */ 0); + else { + if (MEM_32bits()) { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: ZSTD_FALLTHROUGH; + case 9: ZSTD_FALLTHROUGH; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 7: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } else { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 9: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 7: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 6: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } + } + assert(bitC.ptr <= bitC.endPtr); + + return HUF_closeCStream(&bitC); } #if DYNAMIC_BMI2 -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) @@ -644,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, static size_t HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, const void* src, size_t srcSize, - const HUF_CElt* CTable, const int bmi2) + const HUF_CElt* CTable, const int flags) { - if (bmi2) { + if (flags & HUF_flags_bmi2) { return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); } return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); @@ -657,24 +1149,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, static size_t HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, const void* src, size_t srcSize, - const HUF_CElt* CTable, const int bmi2) + const HUF_CElt* CTable, const int flags) { - (void)bmi2; + (void)flags; return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); } #endif -size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) { - return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); } - static size_t HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, const void* src, size_t srcSize, - const HUF_CElt* CTable, int bmi2) + const HUF_CElt* CTable, int flags) { size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ const BYTE* ip = (const BYTE*) src; @@ -688,27 +1179,24 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, op += 6; /* jumpTable */ assert(op <= oend); - { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize==0) return 0; - assert(cSize <= 65535); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart, (U16)cSize); op += cSize; } ip += segmentSize; assert(op <= oend); - { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize==0) return 0; - assert(cSize <= 65535); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+2, (U16)cSize); op += cSize; } ip += segmentSize; assert(op <= oend); - { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize==0) return 0; - assert(cSize <= 65535); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+4, (U16)cSize); op += cSize; } @@ -716,17 +1204,17 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, ip += segmentSize; assert(op <= oend); assert(ip <= iend); - { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); - if (cSize==0) return 0; + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; op += cSize; } return (size_t)(op-ostart); } -size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) { - return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); } typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; @@ -734,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; static size_t HUF_compressCTable_internal( BYTE* const ostart, BYTE* op, BYTE* const oend, const void* src, size_t srcSize, - HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) + HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) { size_t const cSize = (nbStreams==HUF_singleStream) ? - HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : - HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); + HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : + HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); if (HUF_isError(cSize)) { return cSize; } if (cSize==0) { return 0; } /* uncompressible */ op += cSize; @@ -750,35 +1238,113 @@ static size_t HUF_compressCTable_internal( typedef struct { unsigned count[HUF_SYMBOLVALUE_MAX + 1]; - HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1]; + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)]; union { HUF_buildCTable_wksp_tables buildCTable_wksp; HUF_WriteCTableWksp writeCTable_wksp; + U32 hist_wksp[HIST_WKSP_SIZE_U32]; } wksps; } HUF_compress_tables_t; +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + +unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) +{ + unsigned cardinality = 0; + unsigned i; + + for (i = 0; i < maxSymbolValue + 1; i++) { + if (count[i] != 0) cardinality += 1; + } + + return cardinality; +} + +unsigned HUF_minTableLog(unsigned symbolCardinality) +{ + U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; + return minBitsSymbols; +} + +unsigned HUF_optimalTableLog( + unsigned maxTableLog, + size_t srcSize, + unsigned maxSymbolValue, + void* workSpace, size_t wkspSize, + HUF_CElt* table, + const unsigned* count, + int flags) +{ + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); + + if (!(flags & HUF_flags_optimalDepth)) { + /* cheap evaluation, based on FSE */ + return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); + } + + { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); + size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); + size_t hSize, newSize; + const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); + const unsigned minTableLog = HUF_minTableLog(symbolCardinality); + size_t optSize = ((size_t) ~0) - 1; + unsigned optLog = maxTableLog, optLogGuess; + + DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); + + /* Search until size increases */ + for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { + DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); + + { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); + if (ERR_isError(maxBits)) continue; + + if (maxBits < optLogGuess && optLogGuess > minTableLog) break; + + hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); + } + + if (ERR_isError(hSize)) continue; + + newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; + + if (newSize > optSize + 1) { + break; + } + + if (newSize < optSize) { + optSize = newSize; + optLog = optLogGuess; + } + } + assert(optLog <= HUF_TABLELOG_MAX); + return optLog; + } +} + /* HUF_compress_internal() : * `workSpace_align4` must be aligned on 4-bytes boundaries, - * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U32 unsigned */ + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ static size_t HUF_compress_internal (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, HUF_nbStreams_e nbStreams, - void* workSpace_align4, size_t wkspSize, - HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, - const int bmi2) + void* workSpace, size_t wkspSize, + HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) { - HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace_align4; + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstSize; BYTE* op = ostart; - HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE); - assert(((size_t)workSpace_align4 & 3) == 0); /* must be aligned on 4-bytes boundaries */ + DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); /* checks & inits */ - if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall); + if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall); if (!srcSize) return 0; /* Uncompressed */ if (!dstSize) return 0; /* cannot fit anything within dst budget */ if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */ @@ -788,17 +1354,34 @@ HUF_compress_internal (void* dst, size_t dstSize, if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; /* Heuristic : If old table is valid, use it for small inputs */ - if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { + if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, - nbStreams, oldHufTable, bmi2); + nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); + if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; + DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; + } + { unsigned maxSymbolValueEnd = maxSymbolValue; + CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestEnd; + } + if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0; /* heuristic : probably not compressible enough */ } /* Scan input and build symbol stats */ - { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace_align4, wkspSize) ); + { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) ); if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ } + DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); /* Check validity of previous table */ if ( repeat @@ -807,22 +1390,20 @@ HUF_compress_internal (void* dst, size_t dstSize, *repeat = HUF_repeat_none; } /* Heuristic : use existing table for small inputs */ - if (preferRepeat && repeat && *repeat != HUF_repeat_none) { + if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, - nbStreams, oldHufTable, bmi2); + nbStreams, oldHufTable, flags); } /* Build Huffman Tree */ - huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, maxSymbolValue, huffLog, &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); CHECK_F(maxBits); huffLog = (U32)maxBits; - /* Zero unused symbols in CTable, so we can check it for validity */ - ZSTD_memset(table->CTable + (maxSymbolValue + 1), 0, - sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt))); + DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); } /* Write table description header */ @@ -835,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t dstSize, if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, - nbStreams, oldHufTable, bmi2); + nbStreams, oldHufTable, flags); } } /* Use the new huffman table */ @@ -847,59 +1428,35 @@ HUF_compress_internal (void* dst, size_t dstSize, } return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, - nbStreams, table->CTable, bmi2); -} - - -size_t HUF_compress1X_wksp (void* dst, size_t dstSize, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned huffLog, - void* workSpace, size_t wkspSize) -{ - return HUF_compress_internal(dst, dstSize, src, srcSize, - maxSymbolValue, huffLog, HUF_singleStream, - workSpace, wkspSize, - NULL, NULL, 0, 0 /*bmi2*/); + nbStreams, table->CTable, flags); } size_t HUF_compress1X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) + HUF_CElt* hufTable, HUF_repeat* repeat, int flags) { + DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_singleStream, workSpace, wkspSize, hufTable, - repeat, preferRepeat, bmi2); + repeat, flags); } /* HUF_compress4X_repeat(): * compress input using 4 streams. - * provide workspace to generate compression tables */ -size_t HUF_compress4X_wksp (void* dst, size_t dstSize, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned huffLog, - void* workSpace, size_t wkspSize) -{ - return HUF_compress_internal(dst, dstSize, src, srcSize, - maxSymbolValue, huffLog, HUF_fourStreams, - workSpace, wkspSize, - NULL, NULL, 0, 0 /*bmi2*/); -} - -/* HUF_compress4X_repeat(): - * compress input using 4 streams. - * re-use an existing huffman compression table */ + * consider skipping quickly + * reuse an existing huffman compression table */ size_t HUF_compress4X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) + HUF_CElt* hufTable, HUF_repeat* repeat, int flags) { + DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_fourStreams, workSpace, wkspSize, - hufTable, repeat, preferRepeat, bmi2); + hufTable, repeat, flags); } - |
