From 94a6325ee646be4a06ca0646fcd797b2a9c74581 Mon Sep 17 00:00:00 2001 From: Runxi Yu Date: Tue, 25 Nov 2025 08:00:00 +0800 Subject: flatex: Restructure a little --- internal/flatex/decompress.go | 38 ++++++ internal/flatex/decompress_bytes.go | 56 --------- internal/flatex/huffman.go | 245 ++++++++++++++++++++++++++++++++++++ internal/flatex/inflate.go | 245 ------------------------------------ internal/flatex/slice_inflate.go | 11 ++ 5 files changed, 294 insertions(+), 301 deletions(-) create mode 100644 internal/flatex/decompress.go delete mode 100644 internal/flatex/decompress_bytes.go create mode 100644 internal/flatex/huffman.go delete mode 100644 internal/flatex/inflate.go (limited to 'internal') diff --git a/internal/flatex/decompress.go b/internal/flatex/decompress.go new file mode 100644 index 00000000..641b6e7a --- /dev/null +++ b/internal/flatex/decompress.go @@ -0,0 +1,38 @@ +package flatex + +import ( + "io" + + "git.sr.ht/~runxiyu/furgit/internal/bufpool" +) + +func DecompressSized(src []byte, sizeHint int) (bufpool.Buffer, int, error) { + d := sliceInflaterPool.Get().(*sliceInflater) + defer sliceInflaterPool.Put(d) + + if err := d.reset(src); err != nil { + return bufpool.Buffer{}, 0, err + } + + out := bufpool.Borrow(sizeHint) + out.Resize(0) + + for { + if len(d.toRead) > 0 { + out.Append(d.toRead) + d.toRead = nil + continue + } + if d.err != nil { + if d.err == io.EOF { + return out, d.pos, nil + } + out.Release() + return bufpool.Buffer{}, 0, d.err + } + d.step(d) + if d.err != nil && len(d.toRead) == 0 { + d.toRead = d.window.readFlush() + } + } +} diff --git a/internal/flatex/decompress_bytes.go b/internal/flatex/decompress_bytes.go deleted file mode 100644 index 45a8b834..00000000 --- a/internal/flatex/decompress_bytes.go +++ /dev/null @@ -1,56 +0,0 @@ -package flatex - -import ( - "io" - "sync" - - "git.sr.ht/~runxiyu/furgit/internal/bufpool" -) - -type bufferDecompressor struct { - inflater sliceInflater -} - -var bufferDecompressorPool = sync.Pool{ - New: func() any { - fixedHuffmanDecoderInit() - d := &bufferDecompressor{ - inflater: sliceInflater{ - bits: new([maxNumLit + maxNumDist]int), - codebits: new([numCodes]int), - }, - } - return d - }, -} - -func DecompressSized(src []byte, sizeHint int) (bufpool.Buffer, int, error) { - d := bufferDecompressorPool.Get().(*bufferDecompressor) - defer bufferDecompressorPool.Put(d) - - if err := d.inflater.reset(src); err != nil { - return bufpool.Buffer{}, 0, err - } - - out := bufpool.Borrow(sizeHint) - out.Resize(0) - - for { - if len(d.inflater.toRead) > 0 { - out.Append(d.inflater.toRead) - d.inflater.toRead = nil - continue - } - if d.inflater.err != nil { - if d.inflater.err == io.EOF { - return out, d.inflater.pos, nil - } - out.Release() - return bufpool.Buffer{}, 0, d.inflater.err - } - d.inflater.step(&d.inflater) - if d.inflater.err != nil && len(d.inflater.toRead) == 0 { - d.inflater.toRead = d.inflater.window.readFlush() - } - } -} diff --git a/internal/flatex/huffman.go b/internal/flatex/huffman.go new file mode 100644 index 00000000..32172dbb --- /dev/null +++ b/internal/flatex/huffman.go @@ -0,0 +1,245 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package flatex implements the DEFLATE compressed data format, described in +// RFC 1951. The [compress/gzip] and [compress/zlib] packages implement access +// to DEFLATE-based file formats. +package flatex + +import ( + "math/bits" + "strconv" + "sync" +) + +const ( + // The special code used to mark the end of a block. + endBlockMarker = 256 + maxCodeLen = 16 // max length of Huffman code + maxMatchOffset = 1 << 15 // The largest match offset + // The next three numbers come from the RFC section 3.2.7, with the + // additional proviso in section 3.2.5 which implies that distance codes + // 30 and 31 should never occur in compressed data. + maxNumLit = 286 + maxNumDist = 30 + numCodes = 19 // number of codes in Huffman meta-code +) + +// A CorruptInputError reports the presence of corrupt input at a given offset. +type CorruptInputError int64 + +func (e CorruptInputError) Error() string { + return "flate: corrupt input before offset " + strconv.FormatInt(int64(e), 10) +} + +// The data structure for decoding Huffman tables is based on that of +// zlib. There is a lookup table of a fixed bit width (huffmanChunkBits), +// For codes smaller than the table width, there are multiple entries +// (each combination of trailing bits has the same value). For codes +// larger than the table width, the table contains a link to an overflow +// table. The width of each entry in the link table is the maximum code +// size minus the chunk width. +// +// Note that you can do a lookup in the table even without all bits +// filled. Since the extra bits are zero, and the DEFLATE Huffman codes +// have the property that shorter codes come before longer ones, the +// bit length estimate in the result is a lower bound on the actual +// number of bits. +// +// See the following: +// https://github.com/madler/zlib/raw/master/doc/algorithm.txt + +// chunk & 15 is number of bits +// chunk >> 4 is value, including table link + +const ( + huffmanChunkBits = 9 + huffmanNumChunks = 1 << huffmanChunkBits + huffmanCountMask = 15 + huffmanValueShift = 4 +) + +type huffmanDecoder struct { + min int // the minimum code length + chunks [huffmanNumChunks]uint32 // chunks as described above + links [][]uint32 // overflow links + linkMask uint32 // mask the width of the link table +} + +// Initialize Huffman decoding tables from array of code lengths. +// Following this function, h is guaranteed to be initialized into a complete +// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a +// degenerate case where the tree has only a single symbol with length 1. Empty +// trees are permitted. +func (h *huffmanDecoder) init(lengths []int) bool { + // Sanity enables additional runtime tests during Huffman + // table construction. It's intended to be used during + // development to supplement the currently ad-hoc unit tests. + const sanity = false + + if h.min != 0 { + *h = huffmanDecoder{} + } + + // Count number of codes of each length, + // compute min and max length. + var count [maxCodeLen]int + var min, max int + for _, n := range lengths { + if n == 0 { + continue + } + if min == 0 || n < min { + min = n + } + if n > max { + max = n + } + count[n]++ + } + + // Empty tree. The decompressor.huffSym function will fail later if the tree + // is used. Technically, an empty tree is only valid for the HDIST tree and + // not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree + // is guaranteed to fail since it will attempt to use the tree to decode the + // codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is + // guaranteed to fail later since the compressed data section must be + // composed of at least one symbol (the end-of-block marker). + if max == 0 { + return true + } + + code := 0 + var nextcode [maxCodeLen]int + for i := min; i <= max; i++ { + code <<= 1 + nextcode[i] = code + code += count[i] + } + + // Check that the coding is complete (i.e., that we've + // assigned all 2-to-the-max possible bit sequences). + // Exception: To be compatible with zlib, we also need to + // accept degenerate single-code codings. See also + // TestDegenerateHuffmanCoding. + if code != 1< huffmanChunkBits { + numLinks := 1 << (uint(max) - huffmanChunkBits) + h.linkMask = uint32(numLinks - 1) + + // create link tables + link := nextcode[huffmanChunkBits+1] >> 1 + h.links = make([][]uint32, huffmanNumChunks-link) + for j := uint(link); j < huffmanNumChunks; j++ { + reverse := int(bits.Reverse16(uint16(j))) + reverse >>= uint(16 - huffmanChunkBits) + off := j - uint(link) + if sanity && h.chunks[reverse] != 0 { + panic("impossible: overwriting existing chunk") + } + h.chunks[reverse] = uint32(off<>= uint(16 - n) + if n <= huffmanChunkBits { + for off := reverse; off < len(h.chunks); off += 1 << uint(n) { + // We should never need to overwrite + // an existing chunk. Also, 0 is + // never a valid chunk, because the + // lower 4 "count" bits should be + // between 1 and 15. + if sanity && h.chunks[off] != 0 { + panic("impossible: overwriting existing chunk") + } + h.chunks[off] = chunk + } + } else { + j := reverse & (huffmanNumChunks - 1) + if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 { + // Longer codes should have been + // associated with a link table above. + panic("impossible: not an indirect chunk") + } + value := h.chunks[j] >> huffmanValueShift + linktab := h.links[value] + reverse >>= huffmanChunkBits + for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) { + if sanity && linktab[off] != 0 { + panic("impossible: overwriting existing chunk") + } + linktab[off] = chunk + } + } + } + + if sanity { + // Above we've sanity checked that we never overwrote + // an existing entry. Here we additionally check that + // we filled the tables completely. + for i, chunk := range h.chunks { + if chunk == 0 { + // As an exception, in the degenerate + // single-code case, we allow odd + // chunks to be missing. + if code == 1 && i%2 == 1 { + continue + } + panic("impossible: missing chunk") + } + } + for _, linktab := range h.links { + for _, chunk := range linktab { + if chunk == 0 { + panic("impossible: missing chunk") + } + } + } + } + + return true +} + +// RFC 1951 section 3.2.7. +// Compression with dynamic Huffman codes +var codeOrder = [...]int{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15} + +var ( + // Initialize the fixedHuffmanDecoder only once upon first use. + fixedOnce sync.Once + fixedHuffmanDecoder huffmanDecoder +) + +func fixedHuffmanDecoderInit() { + fixedOnce.Do(func() { + // These come from the RFC section 3.2.6. + var bits [288]int + for i := 0; i < 144; i++ { + bits[i] = 8 + } + for i := 144; i < 256; i++ { + bits[i] = 9 + } + for i := 256; i < 280; i++ { + bits[i] = 7 + } + for i := 280; i < 288; i++ { + bits[i] = 8 + } + fixedHuffmanDecoder.init(bits[:]) + }) +} diff --git a/internal/flatex/inflate.go b/internal/flatex/inflate.go deleted file mode 100644 index a556dc42..00000000 --- a/internal/flatex/inflate.go +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package flatex implements the DEFLATE compressed data format, described in -// RFC 1951. The [compress/gzip] and [compress/zlib] packages implement access -// to DEFLATE-based file formats. -package flatex - -import ( - "math/bits" - "strconv" - "sync" -) - -const ( - // The special code used to mark the end of a block. - endBlockMarker = 256 - maxCodeLen = 16 // max length of Huffman code - maxMatchOffset = 1 << 15 // The largest match offset - // The next three numbers come from the RFC section 3.2.7, with the - // additional proviso in section 3.2.5 which implies that distance codes - // 30 and 31 should never occur in compressed data. - maxNumLit = 286 - maxNumDist = 30 - numCodes = 19 // number of codes in Huffman meta-code -) - -var ( - // Initialize the fixedHuffmanDecoder only once upon first use. - fixedOnce sync.Once - fixedHuffmanDecoder huffmanDecoder -) - -// A CorruptInputError reports the presence of corrupt input at a given offset. -type CorruptInputError int64 - -func (e CorruptInputError) Error() string { - return "flate: corrupt input before offset " + strconv.FormatInt(int64(e), 10) -} - -// The data structure for decoding Huffman tables is based on that of -// zlib. There is a lookup table of a fixed bit width (huffmanChunkBits), -// For codes smaller than the table width, there are multiple entries -// (each combination of trailing bits has the same value). For codes -// larger than the table width, the table contains a link to an overflow -// table. The width of each entry in the link table is the maximum code -// size minus the chunk width. -// -// Note that you can do a lookup in the table even without all bits -// filled. Since the extra bits are zero, and the DEFLATE Huffman codes -// have the property that shorter codes come before longer ones, the -// bit length estimate in the result is a lower bound on the actual -// number of bits. -// -// See the following: -// https://github.com/madler/zlib/raw/master/doc/algorithm.txt - -// chunk & 15 is number of bits -// chunk >> 4 is value, including table link - -const ( - huffmanChunkBits = 9 - huffmanNumChunks = 1 << huffmanChunkBits - huffmanCountMask = 15 - huffmanValueShift = 4 -) - -type huffmanDecoder struct { - min int // the minimum code length - chunks [huffmanNumChunks]uint32 // chunks as described above - links [][]uint32 // overflow links - linkMask uint32 // mask the width of the link table -} - -// Initialize Huffman decoding tables from array of code lengths. -// Following this function, h is guaranteed to be initialized into a complete -// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a -// degenerate case where the tree has only a single symbol with length 1. Empty -// trees are permitted. -func (h *huffmanDecoder) init(lengths []int) bool { - // Sanity enables additional runtime tests during Huffman - // table construction. It's intended to be used during - // development to supplement the currently ad-hoc unit tests. - const sanity = false - - if h.min != 0 { - *h = huffmanDecoder{} - } - - // Count number of codes of each length, - // compute min and max length. - var count [maxCodeLen]int - var min, max int - for _, n := range lengths { - if n == 0 { - continue - } - if min == 0 || n < min { - min = n - } - if n > max { - max = n - } - count[n]++ - } - - // Empty tree. The decompressor.huffSym function will fail later if the tree - // is used. Technically, an empty tree is only valid for the HDIST tree and - // not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree - // is guaranteed to fail since it will attempt to use the tree to decode the - // codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is - // guaranteed to fail later since the compressed data section must be - // composed of at least one symbol (the end-of-block marker). - if max == 0 { - return true - } - - code := 0 - var nextcode [maxCodeLen]int - for i := min; i <= max; i++ { - code <<= 1 - nextcode[i] = code - code += count[i] - } - - // Check that the coding is complete (i.e., that we've - // assigned all 2-to-the-max possible bit sequences). - // Exception: To be compatible with zlib, we also need to - // accept degenerate single-code codings. See also - // TestDegenerateHuffmanCoding. - if code != 1< huffmanChunkBits { - numLinks := 1 << (uint(max) - huffmanChunkBits) - h.linkMask = uint32(numLinks - 1) - - // create link tables - link := nextcode[huffmanChunkBits+1] >> 1 - h.links = make([][]uint32, huffmanNumChunks-link) - for j := uint(link); j < huffmanNumChunks; j++ { - reverse := int(bits.Reverse16(uint16(j))) - reverse >>= uint(16 - huffmanChunkBits) - off := j - uint(link) - if sanity && h.chunks[reverse] != 0 { - panic("impossible: overwriting existing chunk") - } - h.chunks[reverse] = uint32(off<>= uint(16 - n) - if n <= huffmanChunkBits { - for off := reverse; off < len(h.chunks); off += 1 << uint(n) { - // We should never need to overwrite - // an existing chunk. Also, 0 is - // never a valid chunk, because the - // lower 4 "count" bits should be - // between 1 and 15. - if sanity && h.chunks[off] != 0 { - panic("impossible: overwriting existing chunk") - } - h.chunks[off] = chunk - } - } else { - j := reverse & (huffmanNumChunks - 1) - if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 { - // Longer codes should have been - // associated with a link table above. - panic("impossible: not an indirect chunk") - } - value := h.chunks[j] >> huffmanValueShift - linktab := h.links[value] - reverse >>= huffmanChunkBits - for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) { - if sanity && linktab[off] != 0 { - panic("impossible: overwriting existing chunk") - } - linktab[off] = chunk - } - } - } - - if sanity { - // Above we've sanity checked that we never overwrote - // an existing entry. Here we additionally check that - // we filled the tables completely. - for i, chunk := range h.chunks { - if chunk == 0 { - // As an exception, in the degenerate - // single-code case, we allow odd - // chunks to be missing. - if code == 1 && i%2 == 1 { - continue - } - panic("impossible: missing chunk") - } - } - for _, linktab := range h.links { - for _, chunk := range linktab { - if chunk == 0 { - panic("impossible: missing chunk") - } - } - } - } - - return true -} - -// RFC 1951 section 3.2.7. -// Compression with dynamic Huffman codes -var codeOrder = [...]int{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15} - -func fixedHuffmanDecoderInit() { - fixedOnce.Do(func() { - // These come from the RFC section 3.2.6. - var bits [288]int - for i := 0; i < 144; i++ { - bits[i] = 8 - } - for i := 144; i < 256; i++ { - bits[i] = 9 - } - for i := 256; i < 280; i++ { - bits[i] = 7 - } - for i := 280; i < 288; i++ { - bits[i] = 8 - } - fixedHuffmanDecoder.init(bits[:]) - }) -} diff --git a/internal/flatex/slice_inflate.go b/internal/flatex/slice_inflate.go index 3e07e744..f9120143 100644 --- a/internal/flatex/slice_inflate.go +++ b/internal/flatex/slice_inflate.go @@ -3,6 +3,7 @@ package flatex import ( "io" "math/bits" + "sync" ) // sliceInflater is a specialized DEFLATE decoder that reads directly from an @@ -33,6 +34,16 @@ type sliceInflater struct { copyDist int } +var sliceInflaterPool = sync.Pool{ + New: func() any { + fixedHuffmanDecoderInit() + return &sliceInflater{ + bits: new([maxNumLit + maxNumDist]int), + codebits: new([numCodes]int), + } + }, +} + func (f *sliceInflater) reset(src []byte) error { bits := f.bits codebits := f.codebits -- cgit v1.3.1-10-gc9f91