aboutsummaryrefslogtreecommitdiff
path: root/internal/flatex/huffman.go
diff options
context:
space:
mode:
authorGravatar Runxi Yu2025-11-25 08:00:00 +0800
committerGravatar Runxi Yu2025-11-25 08:00:00 +0800
commit94a6325ee646be4a06ca0646fcd797b2a9c74581 (patch)
tree1b9dd5018a7c5871e9f8204b7b6899006ce1d4c7 /internal/flatex/huffman.go
parentflatex: Remove unused cruft (diff)
signatureNo signature
flatex: Restructure a little
Diffstat (limited to 'internal/flatex/huffman.go')
-rw-r--r--internal/flatex/huffman.go245
1 files changed, 245 insertions, 0 deletions
diff --git a/internal/flatex/huffman.go b/internal/flatex/huffman.go
new file mode 100644
index 00000000..32172dbb
--- /dev/null
+++ b/internal/flatex/huffman.go
@@ -0,0 +1,245 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package flatex implements the DEFLATE compressed data format, described in
+// RFC 1951. The [compress/gzip] and [compress/zlib] packages implement access
+// to DEFLATE-based file formats.
+package flatex
+
+import (
+ "math/bits"
+ "strconv"
+ "sync"
+)
+
+const (
+ // The special code used to mark the end of a block.
+ endBlockMarker = 256
+ maxCodeLen = 16 // max length of Huffman code
+ maxMatchOffset = 1 << 15 // The largest match offset
+ // The next three numbers come from the RFC section 3.2.7, with the
+ // additional proviso in section 3.2.5 which implies that distance codes
+ // 30 and 31 should never occur in compressed data.
+ maxNumLit = 286
+ maxNumDist = 30
+ numCodes = 19 // number of codes in Huffman meta-code
+)
+
+// A CorruptInputError reports the presence of corrupt input at a given offset.
+type CorruptInputError int64
+
+func (e CorruptInputError) Error() string {
+ return "flate: corrupt input before offset " + strconv.FormatInt(int64(e), 10)
+}
+
+// The data structure for decoding Huffman tables is based on that of
+// zlib. There is a lookup table of a fixed bit width (huffmanChunkBits),
+// For codes smaller than the table width, there are multiple entries
+// (each combination of trailing bits has the same value). For codes
+// larger than the table width, the table contains a link to an overflow
+// table. The width of each entry in the link table is the maximum code
+// size minus the chunk width.
+//
+// Note that you can do a lookup in the table even without all bits
+// filled. Since the extra bits are zero, and the DEFLATE Huffman codes
+// have the property that shorter codes come before longer ones, the
+// bit length estimate in the result is a lower bound on the actual
+// number of bits.
+//
+// See the following:
+// https://github.com/madler/zlib/raw/master/doc/algorithm.txt
+
+// chunk & 15 is number of bits
+// chunk >> 4 is value, including table link
+
+const (
+ huffmanChunkBits = 9
+ huffmanNumChunks = 1 << huffmanChunkBits
+ huffmanCountMask = 15
+ huffmanValueShift = 4
+)
+
+type huffmanDecoder struct {
+ min int // the minimum code length
+ chunks [huffmanNumChunks]uint32 // chunks as described above
+ links [][]uint32 // overflow links
+ linkMask uint32 // mask the width of the link table
+}
+
+// Initialize Huffman decoding tables from array of code lengths.
+// Following this function, h is guaranteed to be initialized into a complete
+// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
+// degenerate case where the tree has only a single symbol with length 1. Empty
+// trees are permitted.
+func (h *huffmanDecoder) init(lengths []int) bool {
+ // Sanity enables additional runtime tests during Huffman
+ // table construction. It's intended to be used during
+ // development to supplement the currently ad-hoc unit tests.
+ const sanity = false
+
+ if h.min != 0 {
+ *h = huffmanDecoder{}
+ }
+
+ // Count number of codes of each length,
+ // compute min and max length.
+ var count [maxCodeLen]int
+ var min, max int
+ for _, n := range lengths {
+ if n == 0 {
+ continue
+ }
+ if min == 0 || n < min {
+ min = n
+ }
+ if n > max {
+ max = n
+ }
+ count[n]++
+ }
+
+ // Empty tree. The decompressor.huffSym function will fail later if the tree
+ // is used. Technically, an empty tree is only valid for the HDIST tree and
+ // not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree
+ // is guaranteed to fail since it will attempt to use the tree to decode the
+ // codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is
+ // guaranteed to fail later since the compressed data section must be
+ // composed of at least one symbol (the end-of-block marker).
+ if max == 0 {
+ return true
+ }
+
+ code := 0
+ var nextcode [maxCodeLen]int
+ for i := min; i <= max; i++ {
+ code <<= 1
+ nextcode[i] = code
+ code += count[i]
+ }
+
+ // Check that the coding is complete (i.e., that we've
+ // assigned all 2-to-the-max possible bit sequences).
+ // Exception: To be compatible with zlib, we also need to
+ // accept degenerate single-code codings. See also
+ // TestDegenerateHuffmanCoding.
+ if code != 1<<uint(max) && (code != 1 || max != 1) {
+ return false
+ }
+
+ h.min = min
+ if max > huffmanChunkBits {
+ numLinks := 1 << (uint(max) - huffmanChunkBits)
+ h.linkMask = uint32(numLinks - 1)
+
+ // create link tables
+ link := nextcode[huffmanChunkBits+1] >> 1
+ h.links = make([][]uint32, huffmanNumChunks-link)
+ for j := uint(link); j < huffmanNumChunks; j++ {
+ reverse := int(bits.Reverse16(uint16(j)))
+ reverse >>= uint(16 - huffmanChunkBits)
+ off := j - uint(link)
+ if sanity && h.chunks[reverse] != 0 {
+ panic("impossible: overwriting existing chunk")
+ }
+ h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1))
+ h.links[off] = make([]uint32, numLinks)
+ }
+ }
+
+ for i, n := range lengths {
+ if n == 0 {
+ continue
+ }
+ code := nextcode[n]
+ nextcode[n]++
+ chunk := uint32(i<<huffmanValueShift | n)
+ reverse := int(bits.Reverse16(uint16(code)))
+ reverse >>= uint(16 - n)
+ if n <= huffmanChunkBits {
+ for off := reverse; off < len(h.chunks); off += 1 << uint(n) {
+ // We should never need to overwrite
+ // an existing chunk. Also, 0 is
+ // never a valid chunk, because the
+ // lower 4 "count" bits should be
+ // between 1 and 15.
+ if sanity && h.chunks[off] != 0 {
+ panic("impossible: overwriting existing chunk")
+ }
+ h.chunks[off] = chunk
+ }
+ } else {
+ j := reverse & (huffmanNumChunks - 1)
+ if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 {
+ // Longer codes should have been
+ // associated with a link table above.
+ panic("impossible: not an indirect chunk")
+ }
+ value := h.chunks[j] >> huffmanValueShift
+ linktab := h.links[value]
+ reverse >>= huffmanChunkBits
+ for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) {
+ if sanity && linktab[off] != 0 {
+ panic("impossible: overwriting existing chunk")
+ }
+ linktab[off] = chunk
+ }
+ }
+ }
+
+ if sanity {
+ // Above we've sanity checked that we never overwrote
+ // an existing entry. Here we additionally check that
+ // we filled the tables completely.
+ for i, chunk := range h.chunks {
+ if chunk == 0 {
+ // As an exception, in the degenerate
+ // single-code case, we allow odd
+ // chunks to be missing.
+ if code == 1 && i%2 == 1 {
+ continue
+ }
+ panic("impossible: missing chunk")
+ }
+ }
+ for _, linktab := range h.links {
+ for _, chunk := range linktab {
+ if chunk == 0 {
+ panic("impossible: missing chunk")
+ }
+ }
+ }
+ }
+
+ return true
+}
+
+// RFC 1951 section 3.2.7.
+// Compression with dynamic Huffman codes
+var codeOrder = [...]int{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
+
+var (
+ // Initialize the fixedHuffmanDecoder only once upon first use.
+ fixedOnce sync.Once
+ fixedHuffmanDecoder huffmanDecoder
+)
+
+func fixedHuffmanDecoderInit() {
+ fixedOnce.Do(func() {
+ // These come from the RFC section 3.2.6.
+ var bits [288]int
+ for i := 0; i < 144; i++ {
+ bits[i] = 8
+ }
+ for i := 144; i < 256; i++ {
+ bits[i] = 9
+ }
+ for i := 256; i < 280; i++ {
+ bits[i] = 7
+ }
+ for i := 280; i < 288; i++ {
+ bits[i] = 8
+ }
+ fixedHuffmanDecoder.init(bits[:])
+ })
+}