aboutsummaryrefslogtreecommitdiff
path: root/internal/format
diff options
context:
space:
mode:
Diffstat (limited to 'internal/format')
-rw-r--r--internal/format/packfile/delta/apply.go122
-rw-r--r--internal/format/packfile/delta/apply_test.go174
-rw-r--r--internal/format/packfile/delta/header.go98
-rw-r--r--internal/format/packfile/delta/header_test.go88
-rw-r--r--internal/format/packfile/entry_header.go11
-rw-r--r--internal/format/packfile/entry_header_test.go13
-rw-r--r--internal/format/packfile/header.go64
-rw-r--r--internal/format/packidx/bloom/bloom.go180
-rw-r--r--internal/format/packidx/bloom/bloom_test.go159
-rw-r--r--internal/format/packidx/bloom/doc.go138
-rw-r--r--internal/format/packidx/bloom/lookup.go42
-rw-r--r--internal/format/packidx/bloom/lookup_test.go32
-rw-r--r--internal/format/packidx/bloom/roundtrip_test.go92
-rw-r--r--internal/format/packidx/bloom/write.go164
-rw-r--r--internal/format/packidx/bloom/write_test.go95
-rw-r--r--internal/format/packidx/doc.go3
-rw-r--r--internal/format/packidx/helpers_test.go86
-rw-r--r--internal/format/packidx/lookup.go121
-rw-r--r--internal/format/packidx/lookup_test.go99
-rw-r--r--internal/format/packidx/packidx.go197
-rw-r--r--internal/format/packidx/packidx_test.go124
-rw-r--r--internal/format/packidx/roundtrip_test.go88
-rw-r--r--internal/format/packidx/write.go129
-rw-r--r--internal/format/packidx/write_test.go112
-rw-r--r--internal/format/packrev/doc.go3
-rw-r--r--internal/format/packrev/helpers_test.go75
-rw-r--r--internal/format/packrev/packrev.go124
-rw-r--r--internal/format/packrev/packrev_test.go160
-rw-r--r--internal/format/packrev/write.go79
-rw-r--r--internal/format/packrev/write_test.go135
30 files changed, 2990 insertions, 17 deletions
diff --git a/internal/format/packfile/delta/apply.go b/internal/format/packfile/delta/apply.go
new file mode 100644
index 00000000..4210d1b3
--- /dev/null
+++ b/internal/format/packfile/delta/apply.go
@@ -0,0 +1,122 @@
+package delta
+
+import (
+ "fmt"
+
+ "lindenii.org/go/lgo/intconv"
+)
+
+// MaxChainDepth is the maximum supported delta chain length.
+// Resolvers reject chains deeper than this
+// to bound recursion depth and reconstruction work.
+const MaxChainDepth = 1 << 12
+
+// Apply applies one inflated delta payload to base
+// and returns the reconstructed result.
+//
+// delta must include the leading base/result size header.
+func Apply(base, delta []byte) ([]byte, error) {
+ baseSize, resultSize, pos, err := ParseHeaderSizes(delta)
+ if err != nil {
+ return nil, err
+ }
+
+ if baseSize != uint64(len(base)) {
+ return nil, fmt.Errorf("%w: base size mismatch", ErrMalformedDelta)
+ }
+
+ outLen, err := intconv.Uint64ToInt(resultSize)
+ if err != nil {
+ return nil, fmt.Errorf("%w: result size overflows int", ErrMalformedDelta)
+ }
+
+ out := make([]byte, outLen)
+ outPos := 0
+
+ for pos < len(delta) {
+ op := delta[pos]
+ pos++
+
+ switch {
+ case op&0x80 != 0:
+ outPos, err = applyCopy(out, outPos, base, delta, &pos, op)
+ case op != 0:
+ outPos, err = applyInsert(out, outPos, delta, &pos, int(op))
+ default:
+ err = fmt.Errorf("%w: invalid opcode 0", ErrMalformedDelta)
+ }
+
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ if outPos != len(out) {
+ return nil, fmt.Errorf("%w: result size mismatch", ErrMalformedDelta)
+ }
+
+ return out, nil
+}
+
+// applyCopy executes one copy instruction,
+// copying a base range into out,
+// and returns the new output position.
+func applyCopy(out []byte, outPos int, base, delta []byte, pos *int, op byte) (int, error) {
+ off, err := parseCopyOperand(delta, pos, op, 0, 4)
+ if err != nil {
+ return 0, err
+ }
+
+ n, err := parseCopyOperand(delta, pos, op, 4, 3)
+ if err != nil {
+ return 0, err
+ }
+
+ if n == 0 {
+ n = 0x10000
+ }
+
+ if off+n > len(base) || outPos+n > len(out) {
+ return 0, fmt.Errorf("%w: copy out of bounds", ErrMalformedDelta)
+ }
+
+ copy(out[outPos:outPos+n], base[off:off+n])
+
+ return outPos + n, nil
+}
+
+// applyInsert executes one insert instruction,
+// copying n literal delta bytes into out,
+// and returns the new output position.
+func applyInsert(out []byte, outPos int, delta []byte, pos *int, n int) (int, error) {
+ if *pos+n > len(delta) || outPos+n > len(out) {
+ return 0, fmt.Errorf("%w: insert out of bounds", ErrMalformedDelta)
+ }
+
+ copy(out[outPos:outPos+n], delta[*pos:*pos+n])
+ *pos += n
+
+ return outPos + n, nil
+}
+
+// parseCopyOperand assembles one little-endian copy instruction operand
+// from the operand bytes selected by op's flag bits
+// firstBit through firstBit+count-1.
+func parseCopyOperand(delta []byte, pos *int, op byte, firstBit uint, count int) (int, error) {
+ value := 0
+
+ for i := range count {
+ if op&(1<<(firstBit+uint(i))) == 0 {
+ continue
+ }
+
+ if *pos >= len(delta) {
+ return 0, fmt.Errorf("%w: truncated copy operand", ErrMalformedDelta)
+ }
+
+ value |= int(delta[*pos]) << (8 * i)
+ *pos++
+ }
+
+ return value, nil
+}
diff --git a/internal/format/packfile/delta/apply_test.go b/internal/format/packfile/delta/apply_test.go
new file mode 100644
index 00000000..bdf454d9
--- /dev/null
+++ b/internal/format/packfile/delta/apply_test.go
@@ -0,0 +1,174 @@
+package delta_test
+
+import (
+ "bytes"
+ "errors"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packfile/delta"
+)
+
+func TestApplyInsert(t *testing.T) {
+ t.Parallel()
+
+ data := delta.AppendHeaderSizes(nil, 0, 5)
+ data = append(data, 5, 'h', 'e', 'l', 'l', 'o')
+
+ out, err := delta.Apply(nil, data)
+ if err != nil {
+ t.Fatalf("Apply: %v", err)
+ }
+
+ if string(out) != "hello" {
+ t.Fatalf("Apply = %q, want %q", out, "hello")
+ }
+}
+
+func TestApplyCopy(t *testing.T) {
+ t.Parallel()
+
+ base := []byte("0123456789")
+
+ // One offset byte and one size byte.
+ data := delta.AppendHeaderSizes(nil, 10, 4)
+ data = append(data, 0x91, 3, 4)
+
+ out, err := delta.Apply(base, data)
+ if err != nil {
+ t.Fatalf("Apply: %v", err)
+ }
+
+ if string(out) != "3456" {
+ t.Fatalf("Apply = %q, want %q", out, "3456")
+ }
+}
+
+func TestApplyCopyImplicitSize(t *testing.T) {
+ t.Parallel()
+
+ base := bytes.Repeat([]byte{0xab}, 0x10000+10)
+
+ // Offset 0 and the implicit copy size 0x10000.
+ data := delta.AppendHeaderSizes(nil, uint64(len(base)), 0x10000)
+ data = append(data, 0x80)
+
+ out, err := delta.Apply(base, data)
+ if err != nil {
+ t.Fatalf("Apply: %v", err)
+ }
+
+ if !bytes.Equal(out, base[:0x10000]) {
+ t.Fatalf("Apply implicit-size copy mismatch")
+ }
+}
+
+func TestApplyMixed(t *testing.T) {
+ t.Parallel()
+
+ base := []byte("the quick brown fox")
+
+ data := delta.AppendHeaderSizes(nil, uint64(len(base)), 9)
+ data = append(data, 0x91, 4, 5) // copy "quick"
+ data = append(data, 3, '!', '?', '!') // insert "!?!"
+ data = append(data, 0x91, 16, 1) // copy "f"
+
+ out, err := delta.Apply(base, data)
+ if err != nil {
+ t.Fatalf("Apply: %v", err)
+ }
+
+ if string(out) != "quick!?!f" {
+ t.Fatalf("Apply = %q, want %q", out, "quick!?!f")
+ }
+}
+
+func TestApplyEmptyResult(t *testing.T) {
+ t.Parallel()
+
+ data := delta.AppendHeaderSizes(nil, 3, 0)
+
+ out, err := delta.Apply([]byte("abc"), data)
+ if err != nil {
+ t.Fatalf("Apply: %v", err)
+ }
+
+ if len(out) != 0 {
+ t.Fatalf("Apply = %q, want empty", out)
+ }
+}
+
+func TestApplyMalformed(t *testing.T) {
+ t.Parallel()
+
+ base := []byte("0123456789")
+
+ cases := []struct {
+ name string
+ ops []byte
+ baseSize uint64
+ resultSize uint64
+ }{
+ {
+ name: "opcode zero",
+ ops: []byte{0x00},
+ baseSize: 10,
+ resultSize: 1,
+ },
+ {
+ name: "truncated copy operand",
+ ops: []byte{0x91, 3},
+ baseSize: 10,
+ resultSize: 4,
+ },
+ {
+ name: "copy past base",
+ ops: []byte{0x91, 8, 5},
+ baseSize: 10,
+ resultSize: 5,
+ },
+ {
+ name: "copy past result",
+ ops: []byte{0x91, 0, 5},
+ baseSize: 10,
+ resultSize: 3,
+ },
+ {
+ name: "insert past delta",
+ ops: []byte{5, 'a', 'b'},
+ baseSize: 10,
+ resultSize: 5,
+ },
+ {
+ name: "insert past result",
+ ops: []byte{5, 'a', 'b', 'c', 'd', 'e'},
+ baseSize: 10,
+ resultSize: 3,
+ },
+ {
+ name: "base size mismatch",
+ ops: []byte{0x91, 0, 1},
+ baseSize: 9,
+ resultSize: 1,
+ },
+ {
+ name: "result shorter than declared",
+ ops: []byte{0x91, 0, 1},
+ baseSize: 10,
+ resultSize: 2,
+ },
+ }
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Parallel()
+
+ data := delta.AppendHeaderSizes(nil, tc.baseSize, tc.resultSize)
+ data = append(data, tc.ops...)
+
+ _, err := delta.Apply(base, data)
+ if !errors.Is(err, delta.ErrMalformedDelta) {
+ t.Fatalf("Apply error = %v, want ErrMalformedDelta", err)
+ }
+ })
+ }
+}
diff --git a/internal/format/packfile/delta/header.go b/internal/format/packfile/delta/header.go
new file mode 100644
index 00000000..d5dd93fb
--- /dev/null
+++ b/internal/format/packfile/delta/header.go
@@ -0,0 +1,98 @@
+package delta
+
+import (
+ "errors"
+ "fmt"
+)
+
+// ErrMalformedDelta reports that
+// a delta payload is truncated, overlong,
+// declares sizes that overflow uint64,
+// contains invalid instructions,
+// or does not match the supplied base or declared sizes.
+var ErrMalformedDelta = errors.New("internal/format/packfile/delta: malformed delta")
+
+// MaxSizeVarintLen is the maximum encoded length
+// of one delta header size varint.
+// Every uint64 size is encodable within this bound,
+// and parsing rejects longer encodings.
+const MaxSizeVarintLen = 10
+
+// MaxHeaderSizesLen is the maximum encoded length
+// of the base/result size header of a delta payload.
+//
+// Callers reading a delta from a stream may inflate
+// MaxHeaderSizesLen bytes
+// (or fewer if the payload ends sooner)
+// and parse with [ParseHeaderSizes];
+// no valid header is longer.
+const MaxHeaderSizesLen = 2 * MaxSizeVarintLen
+
+// ParseHeaderSizes parses the base size and result size varints
+// at the beginning of one inflated delta payload.
+func ParseHeaderSizes(data []byte) (baseSize, resultSize uint64, consumed int, err error) {
+ baseSize, consumed, err = parseSizeVarint(data, 0)
+ if err != nil {
+ return 0, 0, 0, err
+ }
+
+ resultSize, consumed, err = parseSizeVarint(data, consumed)
+ if err != nil {
+ return 0, 0, 0, err
+ }
+
+ return baseSize, resultSize, consumed, nil
+}
+
+// AppendHeaderSizes appends the base/result size header encoding
+// of one delta payload to dst.
+func AppendHeaderSizes(dst []byte, baseSize, resultSize uint64) []byte {
+ dst = appendSizeVarint(dst, baseSize)
+
+ return appendSizeVarint(dst, resultSize)
+}
+
+// appendSizeVarint appends the encoding of one delta header size varint to dst.
+func appendSizeVarint(dst []byte, value uint64) []byte {
+ for value >= 0x80 {
+ dst = append(dst, byte(value)|0x80)
+ value >>= 7
+ }
+
+ return append(dst, byte(value))
+}
+
+// parseSizeVarint parses one delta header size varint
+// beginning at pos,
+// and returns the position past it.
+func parseSizeVarint(data []byte, pos int) (value uint64, consumed int, err error) {
+ start := pos
+
+ shift := uint(0)
+
+ for {
+ if pos-start >= MaxSizeVarintLen {
+ return 0, 0, fmt.Errorf("%w: overlong size varint", ErrMalformedDelta)
+ }
+
+ if pos >= len(data) {
+ return 0, 0, fmt.Errorf("%w: truncated size varint", ErrMalformedDelta)
+ }
+
+ b := data[pos]
+ pos++
+
+ group := uint64(b & 0x7f)
+ if group<<shift>>shift != group {
+ return 0, 0, fmt.Errorf("%w: size overflows uint64", ErrMalformedDelta)
+ }
+
+ value |= group << shift
+
+ if b&0x80 == 0 {
+ return value, pos, nil
+ }
+
+ shift += 7
+ }
+}
diff --git a/internal/format/packfile/delta/header_test.go b/internal/format/packfile/delta/header_test.go
new file mode 100644
index 00000000..8d97674f
--- /dev/null
+++ b/internal/format/packfile/delta/header_test.go
@@ -0,0 +1,88 @@
+package delta_test
+
+import (
+ "bytes"
+ "errors"
+ "math"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packfile/delta"
+)
+
+func TestParseHeaderSizesRoundTrip(t *testing.T) {
+ t.Parallel()
+
+ cases := []struct {
+ name string
+ baseSize uint64
+ resultSize uint64
+ }{
+ {name: "zero", baseSize: 0, resultSize: 0},
+ {name: "small", baseSize: 5, resultSize: 130},
+ {name: "boundaries", baseSize: 127, resultSize: 128},
+ {name: "large", baseSize: 1 << 32, resultSize: 1 << 57},
+ {name: "max", baseSize: math.MaxUint64, resultSize: math.MaxUint64},
+ }
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Parallel()
+
+ data := delta.AppendHeaderSizes(nil, tc.baseSize, tc.resultSize)
+
+ wantConsumed := len(data)
+
+ data = append(data, 0xde, 0xad)
+
+ baseSize, resultSize, consumed, err := delta.ParseHeaderSizes(data)
+ if err != nil {
+ t.Fatalf("ParseHeaderSizes: %v", err)
+ }
+
+ if baseSize != tc.baseSize {
+ t.Fatalf("ParseHeaderSizes base size = %d, want %d", baseSize, tc.baseSize)
+ }
+
+ if resultSize != tc.resultSize {
+ t.Fatalf("ParseHeaderSizes result size = %d, want %d", resultSize, tc.resultSize)
+ }
+
+ if consumed != wantConsumed {
+ t.Fatalf("ParseHeaderSizes consumed = %d, want %d", consumed, wantConsumed)
+ }
+ })
+ }
+}
+
+func TestParseHeaderSizesMalformed(t *testing.T) {
+ t.Parallel()
+
+ cases := []struct {
+ name string
+ data []byte
+ }{
+ {name: "empty", data: []byte{}},
+ {name: "truncated first varint", data: []byte{0x80}},
+ {name: "missing second varint", data: []byte{0x05}},
+ {name: "truncated second varint", data: []byte{0x05, 0x80}},
+ {
+ name: "overflow",
+ data: append(bytes.Repeat([]byte{0xff}, 9), 0x7f),
+ },
+ {
+ name: "overlong",
+ data: append(bytes.Repeat([]byte{0x80}, 10), 0x00),
+ },
+ }
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Parallel()
+
+ _, _, _, err := delta.ParseHeaderSizes(tc.data)
+ if !errors.Is(err, delta.ErrMalformedDelta) {
+ t.Fatalf("ParseHeaderSizes error = %v, want ErrMalformedDelta", err)
+ }
+ })
+ }
+}
diff --git a/internal/format/packfile/entry_header.go b/internal/format/packfile/entry_header.go
index 1428600b..e5f2d62d 100644
--- a/internal/format/packfile/entry_header.go
+++ b/internal/format/packfile/entry_header.go
@@ -13,12 +13,6 @@ import (
// or declares a size that overflows uint64.
var ErrMalformedEntryHeader = errors.New("internal/format/packfile: malformed entry header")
-// ErrInvalidHashSize reports that
-// a supplied hash size is not a plausible object ID size.
-// This indicates a caller bug,
-// not malformed pack data.
-var ErrInvalidHashSize = errors.New("internal/format/packfile: invalid hash size")
-
// MaxTypeSizeLen is the maximum encoded length
// of the type/size prefix of an entry header.
// Every uint64 size is encodable within this bound,
@@ -73,7 +67,8 @@ type EntryHeader struct {
// from the beginning of data.
//
// hashSize must be the object ID size
-// of the pack's object format.
+// of the pack's object format;
+// ParseEntryHeader panics on implausible hash sizes.
//
// data need not contain the whole entry;
// [MaxEntryHeaderLen] bytes always suffice.
@@ -83,7 +78,7 @@ func ParseEntryHeader(data []byte, hashSize int) (EntryHeader, error) {
var zero EntryHeader
if hashSize <= 0 || hashSize > id.MaxObjectIDSize {
- return zero, fmt.Errorf("%w: %d", ErrInvalidHashSize, hashSize)
+ panic("internal/format/packfile: invalid hash size")
}
if len(data) == 0 {
diff --git a/internal/format/packfile/entry_header_test.go b/internal/format/packfile/entry_header_test.go
index 807c544e..79dc2740 100644
--- a/internal/format/packfile/entry_header_test.go
+++ b/internal/format/packfile/entry_header_test.go
@@ -130,10 +130,15 @@ func TestParseEntryHeaderBadHashSize(t *testing.T) {
t.Parallel()
for _, hashSize := range []int{-1, 0, id.MaxObjectIDSize + 1} {
- _, err := packfile.ParseEntryHeader([]byte{0x95, 0x05}, hashSize)
- if err == nil {
- t.Fatalf("ParseEntryHeader hash size %d: expected error", hashSize)
- }
+ func() {
+ defer func() {
+ if recover() == nil {
+ t.Fatalf("ParseEntryHeader hash size %d: expected panic", hashSize)
+ }
+ }()
+
+ _, _ = packfile.ParseEntryHeader([]byte{0x95, 0x05}, hashSize)
+ }()
}
}
diff --git a/internal/format/packfile/header.go b/internal/format/packfile/header.go
index 1a156f36..64c7d140 100644
--- a/internal/format/packfile/header.go
+++ b/internal/format/packfile/header.go
@@ -1,9 +1,63 @@
package packfile
-// Signature is the 4-byte "PACK" magic at the start of pack files.
-const Signature = 0x5041434b
+import (
+ "encoding/binary"
+ "errors"
+ "fmt"
+)
-// SupportedVersion reports whether a pack version is supported.
-func SupportedVersion(version uint32) bool {
- return version == 2
+const (
+ // signature is the 4-byte "PACK" magic at the start of pack files.
+ signature = 0x5041434b
+
+ // version is the only pack version this package reads or writes.
+ version = 2
+)
+
+// HeaderLen is the size of a pack header:
+// signature, version, and object count.
+const HeaderLen = 12
+
+// ErrMalformedHeader reports that
+// a pack header is truncated,
+// or has a bad signature or unsupported version.
+var ErrMalformedHeader = errors.New("internal/format/packfile: malformed pack header")
+
+// Header is one parsed pack header.
+type Header struct {
+ // ObjectCount is the number of object entries
+ // declared to follow the header.
+ ObjectCount uint32
+}
+
+// ParseHeader parses and validates the pack header
+// at the start of data.
+func ParseHeader(data []byte) (Header, error) {
+ var zero Header
+
+ if len(data) < HeaderLen {
+ return zero, fmt.Errorf("%w: truncated", ErrMalformedHeader)
+ }
+
+ if binary.BigEndian.Uint32(data[0:4]) != signature {
+ return zero, fmt.Errorf("%w: bad signature", ErrMalformedHeader)
+ }
+
+ parsedVersion := binary.BigEndian.Uint32(data[4:8])
+ if parsedVersion != version {
+ return zero, fmt.Errorf("%w: unsupported version %d", ErrMalformedHeader, parsedVersion)
+ }
+
+ return Header{
+ ObjectCount: binary.BigEndian.Uint32(data[8:12]),
+ }, nil
+}
+
+// AppendHeader appends an encoded pack header for objectCount to dst.
+func AppendHeader(dst []byte, objectCount uint32) []byte {
+ dst = binary.BigEndian.AppendUint32(dst, signature)
+ dst = binary.BigEndian.AppendUint32(dst, version)
+ dst = binary.BigEndian.AppendUint32(dst, objectCount)
+
+ return dst
}
diff --git a/internal/format/packidx/bloom/bloom.go b/internal/format/packidx/bloom/bloom.go
new file mode 100644
index 00000000..8b608221
--- /dev/null
+++ b/internal/format/packidx/bloom/bloom.go
@@ -0,0 +1,180 @@
+package bloom
+
+import (
+ "bytes"
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "math/bits"
+
+ "lindenii.org/go/furgit/object/id"
+)
+
+// ErrMalformedBloomFilter reports that
+// a Bloom filter is truncated,
+// has a bad signature, version, or hash function,
+// or has inconsistent parameters.
+var ErrMalformedBloomFilter = errors.New("internal/format/packidx/bloom: malformed bloom filter")
+
+const (
+ signature = 0x4944424c // "IDBL"
+ version = 1
+
+ // HeaderLen is the fixed header length in octets,
+ // i.e., the signature, version, hash function identifier,
+ // B, K, and the trailing zero padding.
+ HeaderLen = 64
+
+ // BucketLen is the length of one bucket in octets,
+ // chosen to match the most common cache-line size.
+ BucketLen = 64
+
+ // wordBits is the bit width of one bucket word.
+ wordBits = 64
+
+ // fieldBits is the width of one in-bucket position field.
+ fieldBits = 9
+)
+
+// checkParams validates the filter parameters
+// against one object hash size,
+// returning log2(bucketCount) on success.
+func checkParams(bucketCount uint32, k uint16, hashSize int) (uint, error) {
+ switch {
+ case bucketCount == 0 || bucketCount&(bucketCount-1) != 0:
+ return 0, errors.New("bucket count not a nonzero power of two") //nolint:err113
+ case k == 0:
+ return 0, errors.New("zero probe count") //nolint:err113
+ }
+
+ log2B := uint(bits.TrailingZeros32(bucketCount))
+ if log2B+fieldBits*uint(k) > uint(hashSize)*8 {
+ return 0, errors.New("parameters exceed hash length") //nolint:err113
+ }
+
+ return log2B, nil
+}
+
+// hashFunctionID returns the on-disk hash function identifier
+// for one object format.
+func hashFunctionID(objectFormat id.ObjectFormat) (uint32, error) {
+ switch objectFormat {
+ case id.ObjectFormatSHA1:
+ return 1, nil
+ case id.ObjectFormatSHA256:
+ return 2, nil
+ case id.ObjectFormatUnknown:
+ }
+
+ return 0, id.ErrInvalidObjectFormat
+}
+
+// Bloom is a parsed blocked Bloom filter view over borrowed bytes.
+//
+// Labels: Deps-Borrowed, Life-Parent, MT-Safe.
+type Bloom struct {
+ // data is the entire filter payload.
+ data []byte
+
+ // buckets is the bucket region, between the header and the trailer.
+ buckets []byte
+
+ // objectFormat is the filter's object format.
+ objectFormat id.ObjectFormat
+
+ // log2B is the base-2 logarithm of the bucket count,
+ // i.e. the number of leading object ID bits that select a bucket.
+ log2B uint
+
+ // k is the number of bits set and tested per object ID.
+ k int
+}
+
+// Parse parses one Bloom filter from data.
+//
+// Labels: Deps-Borrowed, Life-Parent.
+func Parse(data []byte, objectFormat id.ObjectFormat) (Bloom, error) {
+ var zero Bloom
+
+ wantHashID, err := hashFunctionID(objectFormat)
+ if err != nil {
+ return zero, err
+ }
+
+ hashSize := objectFormat.Size()
+
+ if len(data) < HeaderLen {
+ return zero, fmt.Errorf("%w: truncated", ErrMalformedBloomFilter)
+ }
+
+ if binary.BigEndian.Uint32(data) != signature {
+ return zero, fmt.Errorf("%w: bad signature", ErrMalformedBloomFilter)
+ }
+
+ if binary.BigEndian.Uint32(data[4:]) != version {
+ return zero, fmt.Errorf("%w: unsupported version", ErrMalformedBloomFilter)
+ }
+
+ if binary.BigEndian.Uint32(data[8:]) != wantHashID {
+ return zero, fmt.Errorf("%w: hash function mismatch", ErrMalformedBloomFilter)
+ }
+
+ bucketCount := binary.BigEndian.Uint32(data[12:])
+ k := binary.BigEndian.Uint16(data[16:])
+
+ for _, octet := range data[18:HeaderLen] {
+ if octet != 0 {
+ return zero, fmt.Errorf("%w: nonzero padding", ErrMalformedBloomFilter)
+ }
+ }
+
+ log2B, err := checkParams(bucketCount, k, hashSize)
+ if err != nil {
+ return zero, fmt.Errorf("%w: %w", ErrMalformedBloomFilter, err)
+ }
+
+ want := uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize) //#nosec G115
+ if uint64(len(data)) != want {
+ return zero, fmt.Errorf("%w: file size disagrees with bucket count", ErrMalformedBloomFilter)
+ }
+
+ return Bloom{
+ data: data,
+ buckets: data[HeaderLen : len(data)-2*hashSize],
+ objectFormat: objectFormat,
+ log2B: log2B,
+ k: int(k),
+ }, nil
+}
+
+// PackHash returns the pack hash recorded in the filter trailer.
+//
+// Labels: Life-Parent, Mut-No.
+func (f *Bloom) PackHash() []byte {
+ hashSize := f.objectFormat.Size()
+ end := len(f.data) - hashSize
+
+ return f.data[end-hashSize : end]
+}
+
+// Verify recomputes the filter's trailing checksum and reports any mismatch.
+//
+// Verify reads the whole filter,
+// so callers should treat it as a deliberate integrity check
+// rather than part of the open path.
+func (f *Bloom) Verify() error {
+ hashImpl, err := f.objectFormat.New()
+ if err != nil {
+ return fmt.Errorf("internal/format/packidx/bloom: %w", err)
+ }
+
+ checksumOff := len(f.data) - f.objectFormat.Size()
+
+ _, _ = hashImpl.Write(f.data[:checksumOff])
+
+ if !bytes.Equal(hashImpl.Sum(nil), f.data[checksumOff:]) {
+ return fmt.Errorf("%w: checksum mismatch", ErrMalformedBloomFilter)
+ }
+
+ return nil
+}
diff --git a/internal/format/packidx/bloom/bloom_test.go b/internal/format/packidx/bloom/bloom_test.go
new file mode 100644
index 00000000..bcfb4419
--- /dev/null
+++ b/internal/format/packidx/bloom/bloom_test.go
@@ -0,0 +1,159 @@
+package bloom_test
+
+import (
+ "encoding/binary"
+ "errors"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func validFilter(t *testing.T, format id.ObjectFormat) []byte {
+ // TODO: maybe testgit should have something like this?
+ t.Helper()
+
+ builder, err := bloom.NewBuilder(format, 4, 2, make([]byte, format.Size()))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ return builder.Bytes()
+}
+
+func otherFormat(t *testing.T, format id.ObjectFormat) id.ObjectFormat {
+ t.Helper()
+
+ for _, candidate := range id.SupportedObjectFormats() {
+ if candidate != format {
+ return candidate
+ }
+ }
+
+ t.Skip("only one supported object format")
+
+ return id.ObjectFormatUnknown
+}
+
+func TestParseValid(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ _, err := bloom.Parse(validFilter(t, format), format)
+ if err != nil {
+ t.Fatalf("Parse rejected a valid filter: %v", err)
+ }
+ })
+ }
+}
+
+func TestParseMalformed(t *testing.T) {
+ t.Parallel()
+
+ cases := []struct {
+ name string
+ mangle func(data []byte) []byte
+ }{
+ {"truncated", func(data []byte) []byte { return data[:bloom.HeaderLen-1] }},
+ {"bad signature", func(data []byte) []byte {
+ data[0] ^= 0xff
+
+ return data
+ }},
+ {"bad version", func(data []byte) []byte {
+ binary.BigEndian.PutUint32(data[4:], 99)
+
+ return data
+ }},
+ {"non power of two", func(data []byte) []byte {
+ binary.BigEndian.PutUint32(data[12:], 3)
+
+ return data
+ }},
+ {"zero probe count", func(data []byte) []byte {
+ binary.BigEndian.PutUint16(data[16:], 0)
+
+ return data
+ }},
+ {"parameters exceed hash", func(data []byte) []byte {
+ binary.BigEndian.PutUint32(data[12:], 1<<31)
+ binary.BigEndian.PutUint16(data[16:], 30)
+
+ return data
+ }},
+ {"nonzero padding", func(data []byte) []byte {
+ data[20] = 1
+
+ return data
+ }},
+ {"size disagrees", func(data []byte) []byte { return data[:len(data)-1] }},
+ }
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Parallel()
+
+ data := tc.mangle(append([]byte(nil), validFilter(t, format)...))
+
+ _, err := bloom.Parse(data, format)
+ if !errors.Is(err, bloom.ErrMalformedBloomFilter) {
+ t.Fatalf("Parse error = %v, want ErrMalformedBloomFilter", err)
+ }
+ })
+ }
+ })
+ }
+}
+
+// TestVerifyDetectsCorruption checks that Verify accepts a sound filter
+// and rejects one whose bucket bytes have been altered.
+func TestVerifyDetectsCorruption(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ data := validFilter(t, format)
+
+ filter, err := bloom.Parse(data, format)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ err = filter.Verify()
+ if err != nil {
+ t.Fatalf("Verify on a sound filter: %v", err)
+ }
+
+ data[bloom.HeaderLen] ^= 0xff
+
+ err = filter.Verify()
+ if !errors.Is(err, bloom.ErrMalformedBloomFilter) {
+ t.Fatalf("Verify error = %v, want ErrMalformedBloomFilter", err)
+ }
+ })
+ }
+}
+
+func TestParseHashMismatch(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ _, err := bloom.Parse(validFilter(t, format), otherFormat(t, format))
+ if !errors.Is(err, bloom.ErrMalformedBloomFilter) {
+ t.Fatalf("Parse error = %v, want ErrMalformedBloomFilter", err)
+ }
+ })
+ }
+}
diff --git a/internal/format/packidx/bloom/doc.go b/internal/format/packidx/bloom/doc.go
new file mode 100644
index 00000000..06ca57cd
--- /dev/null
+++ b/internal/format/packidx/bloom/doc.go
@@ -0,0 +1,138 @@
+// Package bloom provides a blocked Bloom filter
+// for pack indexes.
+//
+// A filter answers, from a single cache-line-sized read,
+// whether an object ID is definitely absent from the index it covers.
+// A lookup that must consult many packs
+// can then skip the full binary search
+// in every pack whose filter rejects the object,
+// decreasing the cost of misses.
+//
+// # Rationale
+//
+// Especially for server-side usage,
+// repacking is expensive,
+// and creating multi-pack-indexes is still rather expensive.
+// Incremental multi-pack-indexes partially solve this,
+// but having too many of them defeats the purpose,
+// since the indexes must still be walked in order
+// while performing expensive lookups.
+//
+// Instead, each multi-pack-index layer,
+// and each ordinary pack index,
+// may carry its own filter.
+// The indexes are still traversed in their usual order,
+// but the first step when traversing one
+// is to check whether it could possibly hold the wanted object.
+//
+// The filter is split into 64-octet buckets,
+// matching the most common cache-line size.
+// Some bits of the object ID choose the bucket,
+// and the rest choose several bit positions inside it,
+// so a lookup reads one 64-octet bucket
+// and checks whether all required bits are set.
+//
+// # Parameters
+//
+// A filter is parameterized by
+// the number of buckets B
+// and the number of bits set and tested per object ID, K.
+// All integers in the format are big endian.
+// The object ID is interpreted as a big-endian bitstring,
+// where bit offset 0 is the most significant bit of octet 0.
+// B must be a nonzero power of two,
+// K must be nonzero,
+// and log2(B) + 9*K must not exceed the hash length in bits.
+//
+// # File format
+//
+// A filter file is a 64-octet header,
+// then B buckets of 64 octets each,
+// then a two-hash trailer:
+//
+// - 4-octet signature: {'I', 'D', 'B', 'L'}.
+// - 4-octet version identifier (= 1).
+// - 4-octet object hash algorithm identifier
+// (= 1 for SHA-1, 2 for SHA-256).
+// - 4-octet B, the number of buckets.
+// - 2-octet K, the number of bits set and tested per object ID.
+// - 46-octet padding, which must be all zero.
+// - B buckets of 64 octets each.
+// - the pack trailer hash, which binds the filter to its pack.
+// - the checksum of everything before it, over the filter's hash function.
+//
+// The hash length is that of the object format,
+// so the trailer is 2 hashes wide
+// and the file size is exactly 64 + 64*B + 2*hashlen octets.
+//
+// A reader must validate that
+// the signature matches,
+// the version is supported,
+// the hash function identifier is recognized,
+// B is nonzero and a power of two,
+// K is nonzero,
+// log2(B) + 9*K does not exceed the hash length in bits,
+// the padding is all zero,
+// and the file size is exactly 64 + 64*B + 2*hashlen octets.
+//
+// # Binding and integrity
+//
+// The pack hash binds a filter to one pack;
+// a reader trusts a filter only when the recorded pack hash
+// matches the pack it accompanies.
+//
+// The checksum guards against corruption of the filter itself.
+// Recomputing it reads the whole file and rehashes it as fsck.
+//
+// # Lookup
+//
+// A lookup against one filter proceeds as follows:
+//
+// 1. Let b be the unsigned integer encoded
+// by the most significant log2(B) bits of the object ID.
+// B is a power of two, so 0 <= b < B.
+// 2. Select and read bucket b.
+// 3. For each 0 <= i < K,
+// take the i-th 9-bit field
+// from the 9*K bits that follow the bucket-selecting bits,
+// and let pi be the unsigned integer it encodes,
+// so 0 <= pi < 512.
+// Compute wi = pi >> 6 and bi = pi & 63,
+// so wi identifies one of the eight 64-bit words in bucket b
+// and bi identifies one bit within that word.
+// Within each 64-bit word,
+// bit index 0 is the most significant bit
+// and bit index 63 is the least significant bit.
+// Test whether bit bi is set in word wi of bucket b.
+//
+// If any test fails,
+// the object ID is definitely not in the covered index.
+// If all tests succeed,
+// the object ID may be in it.
+// Two of the K 9-bit fields can decode to the same pi,
+// so an insertion may set fewer than K distinct bits;
+// this only raises the false positive rate
+// and never causes a false negative.
+//
+// # Worked example
+//
+// Let B = 1 << 15 = 32768 and K = 8.
+// Then log2(B) = 15,
+// so each lookup uses 15 bits to choose the bucket
+// and 8*9 = 72 bits to choose the in-bucket positions,
+// for a total of 87 bits taken from the object ID.
+// A SHA-1 has 160 bits and a SHA-256 has 256 bits,
+// so both leave ample headroom.
+//
+// # Security considerations
+//
+// Object IDs are public unkeyed hashes,
+// so an adversary can mine packs
+// whose object IDs share a chosen prefix
+// to crowd objects into one bucket
+// and fill its bits.
+// In the worst case this renders some buckets useless,
+// making the filter degrade to "may contain" for those buckets,
+// but it never produces a false negative
+// and is not a significant denial-of-service vector.
+package bloom
diff --git a/internal/format/packidx/bloom/lookup.go b/internal/format/packidx/bloom/lookup.go
new file mode 100644
index 00000000..4a8d7bda
--- /dev/null
+++ b/internal/format/packidx/bloom/lookup.go
@@ -0,0 +1,42 @@
+package bloom
+
+import (
+ "encoding/binary"
+)
+
+// MayContain reports whether oid may be present
+// in the index covered by the filter.
+//
+// oid must be exactly the filter's hash size;
+// MayContain panics otherwise.
+//
+// Labels: Mut-No.
+func (f *Bloom) MayContain(oid []byte) bool {
+ if len(oid) != f.objectFormat.Size() {
+ panic("internal/format/packidx/bloom: invalid object ID length")
+ }
+
+ base := int(binary.BigEndian.Uint32(oid[:4])>>(32-f.log2B)) * BucketLen
+
+ for i := range f.k {
+ word, mask := probe(oid, f.log2B, i)
+ if binary.BigEndian.Uint64(f.buckets[base+word*8:])&mask == 0 {
+ return false
+ }
+ }
+
+ return true
+}
+
+// probe returns the bucket word index and single-bit mask
+// addressed by the i-th probe of oid.
+func probe(oid []byte, log2B uint, i int) (word int, mask uint64) {
+ bitOff := log2B + fieldBits*uint(i)
+ byteOff := bitOff >> 3
+ bitInByte := bitOff & 7
+
+ window := uint32(oid[byteOff])<<8 | uint32(oid[byteOff+1])
+ pi := (window >> (16 - bitInByte - fieldBits)) & 0x1ff
+
+ return int(pi >> 6), 1 << (wordBits - 1 - (pi & 63))
+}
diff --git a/internal/format/packidx/bloom/lookup_test.go b/internal/format/packidx/bloom/lookup_test.go
new file mode 100644
index 00000000..e6264f9a
--- /dev/null
+++ b/internal/format/packidx/bloom/lookup_test.go
@@ -0,0 +1,32 @@
+package bloom_test
+
+import (
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func TestMayContainBadLength(t *testing.T) {
+ t.Parallel()
+
+ format := id.ObjectFormatSHA256
+
+ builder, err := bloom.NewBuilder(format, 4, 2, make([]byte, format.Size()))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ filter, err := bloom.Parse(builder.Bytes(), format)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ defer func() {
+ if recover() == nil {
+ t.Fatal("MayContain did not panic on a short object ID")
+ }
+ }()
+
+ filter.MayContain(make([]byte, format.Size()-1))
+}
diff --git a/internal/format/packidx/bloom/roundtrip_test.go b/internal/format/packidx/bloom/roundtrip_test.go
new file mode 100644
index 00000000..28db17a5
--- /dev/null
+++ b/internal/format/packidx/bloom/roundtrip_test.go
@@ -0,0 +1,92 @@
+package bloom_test
+
+import (
+ "bytes"
+ "encoding/binary"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func makeOID(size int, seed uint64) []byte {
+ out := make([]byte, size)
+ state := seed
+
+ for i := 0; i < size; i += 8 {
+ state = state*6364136223846793005 + 1442695040888963407
+
+ var word [8]byte
+
+ binary.BigEndian.PutUint64(word[:], state)
+ copy(out[i:], word[:])
+ }
+
+ return out
+}
+
+func TestRoundTrip(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ const objects = 10000
+
+ bucketCount, k, err := bloom.RecommendParams(format, objects)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ size := format.Size()
+ packHash := makeOID(size, 0xC0FFEE)
+
+ builder, err := bloom.NewBuilder(format, bucketCount, k, packHash)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ for i := range objects {
+ builder.Add(makeOID(size, uint64(i)))
+ }
+
+ filter, err := bloom.Parse(builder.Bytes(), format)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if !bytes.Equal(filter.PackHash(), packHash) {
+ t.Fatalf("PackHash = %x, want %x", filter.PackHash(), packHash)
+ }
+
+ err = filter.Verify()
+ if err != nil {
+ t.Fatalf("Verify on a freshly built filter: %v", err)
+ }
+
+ for i := range objects {
+ if !filter.MayContain(makeOID(size, uint64(i))) {
+ t.Fatalf("false negative for added object %d", i)
+ }
+ }
+
+ const probes = 10000
+
+ falsePositives := 0
+
+ for i := range probes {
+ if filter.MayContain(makeOID(size, uint64(1)<<40+uint64(i))) {
+ falsePositives++
+ }
+ }
+
+ rate := float64(falsePositives) / float64(probes)
+ if rate > 0.05 {
+ t.Errorf("false positive rate %.4f exceeds 0.05", rate)
+ }
+
+ t.Logf("B=%d K=%d false positive rate %.4f", bucketCount, k, rate)
+ })
+ }
+}
diff --git a/internal/format/packidx/bloom/write.go b/internal/format/packidx/bloom/write.go
new file mode 100644
index 00000000..e6213a2c
--- /dev/null
+++ b/internal/format/packidx/bloom/write.go
@@ -0,0 +1,164 @@
+package bloom
+
+import (
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "hash"
+ "math/bits"
+
+ "lindenii.org/go/furgit/object/id"
+ "lindenii.org/go/lgo/intconv"
+)
+
+// ErrInvalidParameters reports that
+// the parameters supplied for a filter build
+// are not representable in the format.
+var ErrInvalidParameters = errors.New("internal/format/packidx/bloom: invalid parameters")
+
+// defaultK is the probe count used by [RecommendParams].
+//
+// With 512-bit buckets it keeps the false positive rate near one percent
+// at the target bucket load.
+const defaultK = 8
+
+// targetLoad is the object count per bucket that [RecommendParams] aims for.
+const targetLoad = 48
+
+// Builder accumulates object IDs into an in-memory Bloom filter
+// and serializes it.
+//
+// Labels: MT-Unsafe.
+type Builder struct {
+ // data is the full filter file, header and trailer included.
+ data []byte
+
+ // buckets aliases the bucket region of data, between header and trailer.
+ buckets []byte
+
+ // hashImpl computes the trailing checksum and gives the hash size.
+ hashImpl hash.Hash
+
+ log2B uint
+ k int
+}
+
+// NewBuilder creates a filter builder
+// for bucketCount buckets and k probes per object ID,
+// binding the filter to packHash.
+//
+// bucketCount must be a nonzero power of two,
+// k must be nonzero,
+// and log2(bucketCount) + 9*k must not exceed the hash length in bits.
+// packHash must be the pack's trailer hash;
+// NewBuilder panics when its length does not match the object format.
+func NewBuilder(objectFormat id.ObjectFormat, bucketCount uint32, k uint16, packHash []byte) (*Builder, error) {
+ hashID, err := hashFunctionID(objectFormat)
+ if err != nil {
+ return nil, err
+ }
+
+ hashImpl, err := objectFormat.New()
+ if err != nil {
+ return nil, fmt.Errorf("internal/format/packidx/bloom: %w", err)
+ }
+
+ hashSize := objectFormat.Size()
+
+ if len(packHash) != hashSize {
+ panic("internal/format/packidx/bloom: invalid pack hash length")
+ }
+
+ log2B, err := checkParams(bucketCount, k, hashSize)
+ if err != nil {
+ return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err)
+ }
+
+ total, err := intconv.Uint64ToInt(uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize)) //#nosec G115
+ if err != nil {
+ return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err)
+ }
+
+ data := make([]byte, total)
+ binary.BigEndian.PutUint32(data[0:], signature)
+ binary.BigEndian.PutUint32(data[4:], version)
+ binary.BigEndian.PutUint32(data[8:], hashID)
+ binary.BigEndian.PutUint32(data[12:], bucketCount)
+ binary.BigEndian.PutUint16(data[16:], k)
+
+ bucketsEnd := total - 2*hashSize
+ copy(data[bucketsEnd:], packHash)
+
+ return &Builder{
+ data: data,
+ buckets: data[HeaderLen:bucketsEnd],
+ hashImpl: hashImpl,
+ log2B: log2B,
+ k: int(k),
+ }, nil
+}
+
+// Add records oid in the filter.
+//
+// oid must be exactly the filter's hash size;
+// Add panics otherwise.
+func (b *Builder) Add(oid []byte) {
+ if len(oid) != b.hashImpl.Size() {
+ panic("internal/format/packidx/bloom: invalid object ID length")
+ }
+
+ base := int(binary.BigEndian.Uint32(oid[:4])>>(32-b.log2B)) * BucketLen
+
+ for i := range b.k {
+ word, mask := probe(oid, b.log2B, i)
+
+ off := base + word*8
+ set := binary.BigEndian.Uint64(b.buckets[off:]) | mask
+ binary.BigEndian.PutUint64(b.buckets[off:], set)
+ }
+}
+
+// Bytes returns the serialized filter, including its trailing checksum.
+//
+// Labels: Life-Parent, Mut-No.
+func (b *Builder) Bytes() []byte {
+ checksumOff := len(b.data) - b.hashImpl.Size()
+
+ b.hashImpl.Reset()
+ _, _ = b.hashImpl.Write(b.data[:checksumOff])
+ b.hashImpl.Sum(b.data[checksumOff:checksumOff])
+
+ return b.data
+}
+
+// RecommendParams returns filter parameters for an index of n objects,
+// targeting a false positive rate near one percent.
+func RecommendParams(objectFormat id.ObjectFormat, n int) (bucketCount uint32, k uint16, err error) {
+ hashSize := objectFormat.Size()
+ if hashSize == 0 {
+ return 0, 0, id.ErrInvalidObjectFormat
+ }
+
+ const maxPow2 = uint32(1) << 31
+
+ wanted := uint64(0)
+ if n > 0 {
+ wanted = (uint64(n) + targetLoad - 1) / targetLoad
+ }
+
+ switch {
+ case wanted <= 1:
+ bucketCount = 1
+ case wanted > uint64(maxPow2):
+ bucketCount = maxPow2
+ default:
+ bucketCount = uint32(1) << bits.Len64(wanted-1)
+ }
+
+ _, err = checkParams(bucketCount, defaultK, hashSize)
+ if err != nil {
+ return 0, 0, fmt.Errorf("%w: %w", ErrInvalidParameters, err)
+ }
+
+ return bucketCount, defaultK, nil
+}
diff --git a/internal/format/packidx/bloom/write_test.go b/internal/format/packidx/bloom/write_test.go
new file mode 100644
index 00000000..74173921
--- /dev/null
+++ b/internal/format/packidx/bloom/write_test.go
@@ -0,0 +1,95 @@
+package bloom_test
+
+import (
+ "errors"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func TestRecommendParams(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ for _, n := range []int{0, 1, 1000, 10000, 1000000} {
+ bucketCount, k, err := bloom.RecommendParams(format, n)
+ if err != nil {
+ t.Fatalf("n=%d: %v", n, err)
+ }
+
+ if bucketCount == 0 || bucketCount&(bucketCount-1) != 0 {
+ t.Errorf("n=%d: bucket count %d not a power of two", n, bucketCount)
+ }
+
+ _, err = bloom.NewBuilder(format, bucketCount, k, make([]byte, format.Size()))
+ if err != nil {
+ t.Errorf("n=%d: recommended parameters rejected: %v", n, err)
+ }
+ }
+ })
+ }
+}
+
+func TestNewBuilderRejects(t *testing.T) {
+ t.Parallel()
+
+ cases := []struct {
+ name string
+ bucketCount uint32
+ k uint16
+ }{
+ {"zero buckets", 0, 8},
+ {"non power of two", 3, 8},
+ {"zero probe count", 4, 0},
+ }
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Parallel()
+
+ _, err := bloom.NewBuilder(format, tc.bucketCount, tc.k, make([]byte, format.Size()))
+ if !errors.Is(err, bloom.ErrInvalidParameters) {
+ t.Fatalf("error = %v, want ErrInvalidParameters", err)
+ }
+ })
+ }
+ })
+ }
+}
+
+func TestNewBuilderBadPackHash(t *testing.T) {
+ t.Parallel()
+
+ defer func() {
+ if recover() == nil {
+ t.Fatal("NewBuilder did not panic on a short pack hash")
+ }
+ }()
+
+ _, _ = bloom.NewBuilder(id.ObjectFormatSHA256, 4, 2, make([]byte, id.ObjectFormatSHA256.Size()-1))
+}
+
+func TestAddBadLength(t *testing.T) {
+ t.Parallel()
+
+ builder, err := bloom.NewBuilder(id.ObjectFormatSHA256, 4, 2, make([]byte, id.ObjectFormatSHA256.Size()))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ defer func() {
+ if recover() == nil {
+ t.Fatal("Add did not panic on a short object ID")
+ }
+ }()
+
+ builder.Add(make([]byte, id.ObjectFormatSHA256.Size()-1))
+}
diff --git a/internal/format/packidx/doc.go b/internal/format/packidx/doc.go
new file mode 100644
index 00000000..5c6015b4
--- /dev/null
+++ b/internal/format/packidx/doc.go
@@ -0,0 +1,3 @@
+// Package packidx provides Git pack index (version 2) format
+// parsing and writing primitives.
+package packidx
diff --git a/internal/format/packidx/helpers_test.go b/internal/format/packidx/helpers_test.go
new file mode 100644
index 00000000..be9b47d1
--- /dev/null
+++ b/internal/format/packidx/helpers_test.go
@@ -0,0 +1,86 @@
+package packidx_test
+
+import (
+ "os"
+ "strconv"
+ "strings"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx"
+ "lindenii.org/go/furgit/internal/testgit"
+ "lindenii.org/go/furgit/object/id"
+)
+
+// makeGitPack seeds a repository,
+// packs the seeded objects with git pack-objects,
+// and returns the repository, the artifact path prefix,
+// and the packed object IDs.
+func makeGitPack(t *testing.T, objectFormat id.ObjectFormat) (*testgit.Repo, string, []id.ObjectID) {
+ t.Helper()
+
+ repo, err := testgit.NewRepo(t, testgit.RepoOptions{ObjectFormat: objectFormat})
+ if err != nil {
+ t.Fatalf("NewRepo: %v", err)
+ }
+
+ seeded, err := repo.SeedHistory(t)
+ if err != nil {
+ t.Fatalf("SeedHistory: %v", err)
+ }
+
+ oids := seeded.All()
+
+ prefix, err := repo.PackObjects(t, oids, testgit.PackObjectsOptions{})
+ if err != nil {
+ t.Fatalf("PackObjects: %v", err)
+ }
+
+ return repo, prefix, oids
+}
+
+// parseGitIdxFile reads and parses one .idx file produced by git.
+func parseGitIdxFile(t *testing.T, prefix string, objectFormat id.ObjectFormat) ([]byte, packidx.Packidx) {
+ t.Helper()
+
+ data, err := os.ReadFile(prefix + ".idx") //nolint:gosec
+ if err != nil {
+ t.Fatalf("ReadFile: %v", err)
+ }
+
+ idx, err := packidx.Parse(data, objectFormat.Size())
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+
+ return data, idx
+}
+
+// gitPackOffsets extracts the object-to-offset mapping
+// from git verify-pack -v output.
+func gitPackOffsets(t *testing.T, repo *testgit.Repo, idxPath string, objectFormat id.ObjectFormat) map[string]uint64 {
+ t.Helper()
+
+ out, err := repo.VerifyPack(t, idxPath)
+ if err != nil {
+ t.Fatalf("VerifyPack: %v", err)
+ }
+
+ hexLen := objectFormat.HexLen()
+ offsets := make(map[string]uint64)
+
+ for line := range strings.Lines(string(out)) {
+ fields := strings.Fields(line)
+ if len(fields) < 5 || len(fields[0]) != hexLen {
+ continue
+ }
+
+ offset, err := strconv.ParseUint(fields[4], 10, 64)
+ if err != nil {
+ continue
+ }
+
+ offsets[fields[0]] = offset
+ }
+
+ return offsets
+}
diff --git a/internal/format/packidx/lookup.go b/internal/format/packidx/lookup.go
new file mode 100644
index 00000000..e4b565b6
--- /dev/null
+++ b/internal/format/packidx/lookup.go
@@ -0,0 +1,121 @@
+package packidx
+
+import (
+ "bytes"
+ "encoding/binary"
+ "fmt"
+ "math/bits"
+)
+
+// Lookup searches the index for one object ID
+// and returns its pack file offset.
+//
+// oid must be exactly the index's hash size;
+// Lookup panics otherwise.
+func (idx *Packidx) Lookup(oid []byte) (offset uint64, found bool, err error) {
+ if len(oid) != idx.hashSize {
+ panic("internal/format/packidx: invalid object ID length")
+ }
+
+ lo, hi := idx.fanoutRange(oid[0])
+
+ // Object IDs are uniform for honest inputs,
+ // interp on next 8 octets converges in O(log log n).
+ //
+ // OIDs are public unkeyed hashes,
+ // so an attacker may sample/filter/mine prefix clusters,
+ // making interpolation mis-estimate every probe.
+ // See https://runxiyu.org/comp/ch4ht/
+ // for why cryptographic hash algorithms are insufficient.
+ //
+ // Cap the interp at bisect's probe count and finish by bisect;
+ // adversarial at O(log n) plus small interpolation overhead,
+ // honest exit well under the cap.
+ target := binary.BigEndian.Uint64(oid[1:9])
+
+ for budget := bits.Len(uint(hi - lo)); hi-lo > 8 && budget > 0; budget-- {
+ loKey := binary.BigEndian.Uint64(idx.OIDAt(lo)[1:9])
+ hiKey := binary.BigEndian.Uint64(idx.OIDAt(hi - 1)[1:9])
+
+ var mid int
+
+ switch {
+ case target <= loKey:
+ mid = lo
+ case target >= hiKey:
+ mid = hi - 1
+ default:
+ hi128, lo128 := bits.Mul64(target-loKey, uint64(hi-lo-1)) //#nosec G115
+ q, _ := bits.Div64(hi128, lo128, hiKey-loKey)
+ mid = lo + int(q) //#nosec G115
+ }
+
+ switch cmp := bytes.Compare(oid, idx.OIDAt(mid)); {
+ case cmp == 0:
+ offset, err = idx.OffsetAt(mid)
+ if err != nil {
+ return 0, false, err
+ }
+
+ return offset, true, nil
+ case cmp < 0:
+ hi = mid
+ default:
+ lo = mid + 1
+ }
+ }
+
+ // Interpolation narrowed or capped; bisect to finish.
+ for lo < hi {
+ mid := lo + (hi-lo)/2
+
+ cmp := bytes.Compare(oid, idx.OIDAt(mid))
+
+ switch {
+ case cmp == 0:
+ offset, err = idx.OffsetAt(mid)
+ if err != nil {
+ return 0, false, err
+ }
+
+ return offset, true, nil
+ case cmp < 0:
+ hi = mid
+ default:
+ lo = mid + 1
+ }
+ }
+
+ return 0, false, nil
+}
+
+// OffsetAt returns the pack file offset at one index position.
+//
+// OffsetAt panics when pos is out of range.
+func (idx *Packidx) OffsetAt(pos int) (uint64, error) {
+ idx.checkPos(pos)
+
+ raw := binary.BigEndian.Uint32(idx.data[idx.off32Off+4*pos:])
+ if raw&largeOffsetFlag == 0 {
+ return uint64(raw), nil
+ }
+
+ slot := raw &^ largeOffsetFlag
+ if uint64(slot) >= idx.off64Count {
+ return 0, fmt.Errorf("%w: 64-bit offset reference out of range", ErrMalformedPackIndex)
+ }
+
+ return binary.BigEndian.Uint64(idx.data[idx.off64Off+8*int(slot):]), nil
+}
+
+// fanoutRange returns the index position range [lo, hi)
+// of object IDs beginning with first.
+func (idx *Packidx) fanoutRange(first byte) (lo, hi int) {
+ hi = int(binary.BigEndian.Uint32(idx.data[headerLen+4*int(first):]))
+
+ if first > 0 {
+ lo = int(binary.BigEndian.Uint32(idx.data[headerLen+4*(int(first)-1):]))
+ }
+
+ return lo, hi
+}
diff --git a/internal/format/packidx/lookup_test.go b/internal/format/packidx/lookup_test.go
new file mode 100644
index 00000000..514aad7a
--- /dev/null
+++ b/internal/format/packidx/lookup_test.go
@@ -0,0 +1,99 @@
+package packidx_test
+
+import (
+ "bytes"
+ "encoding/binary"
+ "errors"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func TestLookupGitIndex(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ repo, prefix, oids := makeGitPack(t, objectFormat)
+ _, idx := parseGitIdxFile(t, prefix, objectFormat)
+
+ wantOffsets := gitPackOffsets(t, repo, prefix+".idx", objectFormat)
+ if len(wantOffsets) != len(oids) {
+ t.Fatalf("verify-pack offsets = %d entries, want %d", len(wantOffsets), len(oids))
+ }
+
+ for _, oid := range oids {
+ offset, found, err := idx.Lookup(oid.RawBytes())
+ if err != nil {
+ t.Fatalf("Lookup(%s): %v", oid, err)
+ }
+
+ if !found {
+ t.Fatalf("Lookup(%s) not found", oid)
+ }
+
+ if offset != wantOffsets[oid.String()] {
+ t.Fatalf("Lookup(%s) = %d, want %d", oid, offset, wantOffsets[oid.String()])
+ }
+ }
+ })
+ }
+}
+
+func TestLookupMissing(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ _, prefix, oids := makeGitPack(t, objectFormat)
+ _, idx := parseGitIdxFile(t, prefix, objectFormat)
+
+ missing := bytes.Clone(oids[0].RawBytes())
+ missing[len(missing)-1] ^= 0xff
+
+ _, found, err := idx.Lookup(missing)
+ if err != nil {
+ t.Fatalf("Lookup: %v", err)
+ }
+
+ if found {
+ t.Fatalf("Lookup of mutated oid unexpectedly found")
+ }
+ })
+ }
+}
+
+func TestOffsetAtMalformedLargeReference(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ hashSize := objectFormat.Size()
+
+ entries := syntheticEntries(3)
+ data := writeSyntheticIndex(t, objectFormat, entries)
+
+ // Mark the first 32-bit offset entry as a 64-bit reference;
+ // the index has no 64-bit offset table.
+ off32Off := 8 + 256*4 + len(entries)*hashSize + len(entries)*4
+ binary.BigEndian.PutUint32(data[off32Off:], 0x80000000)
+
+ idx, err := packidx.Parse(data, hashSize)
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+
+ _, err = idx.OffsetAt(0)
+ if !errors.Is(err, packidx.ErrMalformedPackIndex) {
+ t.Fatalf("OffsetAt error = %v, want ErrMalformedPackIndex", err)
+ }
+ })
+ }
+}
diff --git a/internal/format/packidx/packidx.go b/internal/format/packidx/packidx.go
new file mode 100644
index 00000000..ef2c93f4
--- /dev/null
+++ b/internal/format/packidx/packidx.go
@@ -0,0 +1,197 @@
+package packidx
+
+import (
+ "encoding/binary"
+ "errors"
+ "fmt"
+
+ "lindenii.org/go/furgit/object/id"
+ "lindenii.org/go/lgo/intconv"
+)
+
+// ErrMalformedPackIndex reports that
+// a pack index is truncated,
+// has a bad signature or unsupported version,
+// or has inconsistent tables.
+var ErrMalformedPackIndex = errors.New("internal/format/packidx: malformed pack index")
+
+const (
+ signature = 0xff744f63
+ version = 2
+
+ headerLen = 8
+ fanoutLen = 256 * 4
+
+ // largeOffsetFlag marks one 32-bit offset table entry
+ // as an index into the 64-bit offset table.
+ largeOffsetFlag = 0x80000000
+)
+
+// Packidx is one parsed pack index view over borrowed bytes.
+//
+// Labels: Deps-Borrowed, Life-Parent, MT-Safe.
+type Packidx struct {
+ // data is the entire pack index payload.
+ data []byte
+ // hashSize is the object ID size of the index's object format.
+ hashSize int
+
+ // numObjects is the object count from the last fanout entry.
+ numObjects int
+
+ // namesOff, crcOff, off32Off, and off64Off are
+ // the byte offsets of the object ID, CRC32,
+ // 32-bit offset, and 64-bit offset tables.
+ namesOff int
+ crcOff int
+ off32Off int
+ off64Off int
+ // off64Count is the number of 64-bit offset table entries.
+ off64Count uint64
+}
+
+// Parse parses one pack index from data.
+//
+// hashSize must be the object ID size
+// of the pack's object format;
+// Parse panics on implausible hash sizes.
+func Parse(data []byte, hashSize int) (Packidx, error) {
+ var zero Packidx
+
+ if hashSize <= 0 || hashSize > id.MaxObjectIDSize {
+ panic("internal/format/packidx: invalid hash size")
+ }
+
+ if len(data) < headerLen+fanoutLen+2*hashSize {
+ return zero, fmt.Errorf("%w: truncated", ErrMalformedPackIndex)
+ }
+
+ if binary.BigEndian.Uint32(data) != signature {
+ return zero, fmt.Errorf("%w: bad signature", ErrMalformedPackIndex)
+ }
+
+ if binary.BigEndian.Uint32(data[4:]) != version {
+ return zero, fmt.Errorf("%w: unsupported version", ErrMalformedPackIndex)
+ }
+
+ prev := uint32(0)
+
+ for i := range 256 {
+ count := binary.BigEndian.Uint32(data[headerLen+4*i:])
+ if count < prev {
+ return zero, fmt.Errorf("%w: non-monotonic fanout", ErrMalformedPackIndex)
+ }
+
+ prev = count
+ }
+
+ numObjects := uint64(prev)
+ hashSize64 := uint64(hashSize)
+
+ namesOff := uint64(headerLen + fanoutLen)
+ crcOff := namesOff + numObjects*hashSize64
+ off32Off := crcOff + 4*numObjects
+ off64Off := off32Off + 4*numObjects
+
+ minTotal := off64Off + 2*hashSize64
+
+ dataLen, err := intconv.IntToUint64(len(data))
+ if err != nil {
+ return zero, fmt.Errorf("%w: %w", ErrMalformedPackIndex, err)
+ }
+
+ if dataLen < minTotal {
+ return zero, fmt.Errorf("%w: tables exceed index size", ErrMalformedPackIndex)
+ }
+
+ off64Bytes := dataLen - minTotal
+ if off64Bytes%8 != 0 {
+ return zero, fmt.Errorf("%w: trailing table size not a 64-bit offset multiple", ErrMalformedPackIndex)
+ }
+
+ off64Count := off64Bytes / 8
+ if off64Count > numObjects {
+ return zero, fmt.Errorf("%w: more 64-bit offsets than objects", ErrMalformedPackIndex)
+ }
+
+ idx := Packidx{
+ data: data,
+ hashSize: hashSize,
+ off64Count: off64Count,
+ }
+
+ idx.numObjects, err = intconv.Uint64ToInt(numObjects)
+ if err != nil {
+ return zero, fmt.Errorf("%w: %w", ErrMalformedPackIndex, err)
+ }
+
+ idx.namesOff, err = intconv.Uint64ToInt(namesOff)
+ if err != nil {
+ return zero, fmt.Errorf("%w: %w", ErrMalformedPackIndex, err)
+ }
+
+ idx.crcOff, err = intconv.Uint64ToInt(crcOff)
+ if err != nil {
+ return zero, fmt.Errorf("%w: %w", ErrMalformedPackIndex, err)
+ }
+
+ idx.off32Off, err = intconv.Uint64ToInt(off32Off)
+ if err != nil {
+ return zero, fmt.Errorf("%w: %w", ErrMalformedPackIndex, err)
+ }
+
+ idx.off64Off, err = intconv.Uint64ToInt(off64Off)
+ if err != nil {
+ return zero, fmt.Errorf("%w: %w", ErrMalformedPackIndex, err)
+ }
+
+ return idx, nil
+}
+
+// NumObjects returns the number of objects in the index.
+func (idx *Packidx) NumObjects() int {
+ return idx.numObjects
+}
+
+// PackHash returns the pack hash recorded in the index trailer.
+//
+// Labels: Life-Parent, Mut-No.
+func (idx *Packidx) PackHash() []byte {
+ return idx.data[len(idx.data)-2*idx.hashSize : len(idx.data)-idx.hashSize]
+}
+
+// OIDAt returns the object ID bytes at one index position.
+// Positions follow object ID sort order.
+//
+// OIDAt panics when pos is out of range.
+//
+// Labels: Life-Parent, Mut-No.
+func (idx *Packidx) OIDAt(pos int) []byte {
+ idx.checkPos(pos)
+
+ start := idx.namesOff + pos*idx.hashSize
+
+ return idx.data[start : start+idx.hashSize]
+}
+
+// CRCAt returns the CRC32 of the packed entry data
+// at one index position.
+//
+// CRCAt panics when pos is out of range.
+func (idx *Packidx) CRCAt(pos int) uint32 {
+ idx.checkPos(pos)
+
+ return binary.BigEndian.Uint32(idx.data[idx.crcOff+4*pos:])
+}
+
+// checkPos panics when pos is not a valid index position.
+//
+// An out-of-range position is a caller bug
+// that slice bounds checking would not catch,
+// since the tables share one data slice;
+// an unchecked access would silently read other tables' bytes.
+func (idx *Packidx) checkPos(pos int) {
+ if pos < 0 || pos >= idx.numObjects {
+ panic("internal/format/packidx: index position out of range")
+ }
+}
diff --git a/internal/format/packidx/packidx_test.go b/internal/format/packidx/packidx_test.go
new file mode 100644
index 00000000..d225a993
--- /dev/null
+++ b/internal/format/packidx/packidx_test.go
@@ -0,0 +1,124 @@
+package packidx_test
+
+import (
+ "bytes"
+ "encoding/binary"
+ "errors"
+ "os"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func TestParseGitIndex(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ _, prefix, oids := makeGitPack(t, objectFormat)
+ _, idx := parseGitIdxFile(t, prefix, objectFormat)
+
+ if idx.NumObjects() != len(oids) {
+ t.Fatalf("NumObjects = %d, want %d", idx.NumObjects(), len(oids))
+ }
+
+ for pos := 1; pos < idx.NumObjects(); pos++ {
+ if bytes.Compare(idx.OIDAt(pos-1), idx.OIDAt(pos)) >= 0 {
+ t.Fatalf("OIDAt(%d) not sorted after predecessor", pos)
+ }
+ }
+
+ packData, err := os.ReadFile(prefix + ".pack") //nolint:gosec
+ if err != nil {
+ t.Fatalf("ReadFile: %v", err)
+ }
+
+ packTrailer := packData[len(packData)-objectFormat.Size():]
+ if !bytes.Equal(idx.PackHash(), packTrailer) {
+ t.Fatalf("PackHash does not match pack trailer")
+ }
+ })
+ }
+}
+
+func TestParseMalformed(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ hashSize := objectFormat.Size()
+
+ valid := writeSyntheticIndex(t, objectFormat, syntheticEntries(8))
+
+ corrupt := func(mutate func(data []byte) []byte) []byte {
+ return mutate(bytes.Clone(valid))
+ }
+
+ cases := []struct {
+ name string
+ data []byte
+ }{
+ {name: "empty", data: []byte{}},
+ {name: "truncated", data: corrupt(func(d []byte) []byte { return d[:20] })},
+ {
+ name: "bad signature",
+ data: corrupt(func(d []byte) []byte {
+ d[0] ^= 0xff
+
+ return d
+ }),
+ },
+ {
+ name: "bad version",
+ data: corrupt(func(d []byte) []byte {
+ d[7] = 3
+
+ return d
+ }),
+ },
+ {
+ name: "non-monotonic fanout",
+ data: corrupt(func(d []byte) []byte {
+ binary.BigEndian.PutUint32(d[8:], 0xffffffff)
+
+ return d
+ }),
+ },
+ {
+ name: "tables exceed index size",
+ data: corrupt(func(d []byte) []byte {
+ binary.BigEndian.PutUint32(d[8+255*4:], 0x00ffffff)
+
+ return d
+ }),
+ },
+ {
+ name: "trailing size not 64-bit multiple",
+ data: corrupt(func(d []byte) []byte { return append(d, 0xde, 0xad, 0xbe, 0xef) }),
+ },
+ {
+ name: "more 64-bit offsets than objects",
+ data: corrupt(func(d []byte) []byte {
+ return append(d, bytes.Repeat([]byte{0x00}, 9*8)...)
+ }),
+ },
+ }
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Parallel()
+
+ _, err := packidx.Parse(tc.data, hashSize)
+ if !errors.Is(err, packidx.ErrMalformedPackIndex) {
+ t.Fatalf("Parse error = %v, want ErrMalformedPackIndex", err)
+ }
+ })
+ }
+ })
+ }
+}
diff --git a/internal/format/packidx/roundtrip_test.go b/internal/format/packidx/roundtrip_test.go
new file mode 100644
index 00000000..f5bd82e7
--- /dev/null
+++ b/internal/format/packidx/roundtrip_test.go
@@ -0,0 +1,88 @@
+package packidx_test
+
+import (
+ "bytes"
+ "math"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func TestWriteRoundTrip(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ hashSize := objectFormat.Size()
+
+ entries := syntheticEntries(20)
+ entries[3].Offset = 1 << 33
+ entries[11].Offset = math.MaxUint64
+ entries[12].Offset = 0x7fffffff
+ entries[13].Offset = 0x80000000
+
+ data := writeSyntheticIndex(t, objectFormat, entries)
+
+ idx, err := packidx.Parse(data, hashSize)
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+
+ if idx.NumObjects() != len(entries) {
+ t.Fatalf("NumObjects = %d, want %d", idx.NumObjects(), len(entries))
+ }
+
+ if !bytes.Equal(idx.PackHash(), bytes.Repeat([]byte{0x5a}, hashSize)) {
+ t.Fatalf("PackHash mismatch")
+ }
+
+ for pos, entry := range entries {
+ if !bytes.Equal(idx.OIDAt(pos), entry.OID[:hashSize]) {
+ t.Fatalf("OIDAt(%d) mismatch", pos)
+ }
+
+ if idx.CRCAt(pos) != entry.CRC32 {
+ t.Fatalf("CRCAt(%d) = %x, want %x", pos, idx.CRCAt(pos), entry.CRC32)
+ }
+
+ offset, err := idx.OffsetAt(pos)
+ if err != nil {
+ t.Fatalf("OffsetAt(%d): %v", pos, err)
+ }
+
+ if offset != entry.Offset {
+ t.Fatalf("OffsetAt(%d) = %d, want %d", pos, offset, entry.Offset)
+ }
+
+ offset, found, err := idx.Lookup(entry.OID[:hashSize])
+ if err != nil {
+ t.Fatalf("Lookup(%d): %v", pos, err)
+ }
+
+ if !found || offset != entry.Offset {
+ t.Fatalf("Lookup(%d) = %d %v, want %d found", pos, offset, found, entry.Offset)
+ }
+ }
+ })
+ }
+}
+
+// writeSyntheticIndex writes one index over entries
+// with a fixed fake pack hash.
+func writeSyntheticIndex(t *testing.T, objectFormat id.ObjectFormat, entries []packidx.Entry) []byte {
+ t.Helper()
+
+ packHash := bytes.Repeat([]byte{0x5a}, objectFormat.Size())
+
+ var buf bytes.Buffer
+
+ err := packidx.Write(&buf, objectFormat, entries, packHash)
+ if err != nil {
+ t.Fatalf("Write: %v", err)
+ }
+
+ return buf.Bytes()
+}
diff --git a/internal/format/packidx/write.go b/internal/format/packidx/write.go
new file mode 100644
index 00000000..35b2805f
--- /dev/null
+++ b/internal/format/packidx/write.go
@@ -0,0 +1,129 @@
+package packidx
+
+import (
+ "bufio"
+ "bytes"
+ "errors"
+ "fmt"
+ "io"
+ "math"
+
+ "lindenii.org/go/furgit/internal/stickyio"
+ "lindenii.org/go/furgit/object/id"
+)
+
+// ErrInvalidEntries reports that
+// entries supplied for an index write
+// are unsorted, duplicated, or not representable
+// in the pack index format.
+var ErrInvalidEntries = errors.New("internal/format/packidx: invalid entries")
+
+// Entry is one object record for an index write.
+type Entry struct {
+ // OID holds the object ID bytes;
+ // only the first hash-size bytes are meaningful.
+ OID [id.MaxObjectIDSize]byte
+ // Offset is the entry's pack file offset.
+ Offset uint64
+ // CRC32 is the CRC32 of the entry's packed data.
+ CRC32 uint32
+}
+
+// Write writes one pack index over entries to w.
+//
+// entries must be sorted by object ID without duplicates.
+// packHash must be the pack's trailer hash;
+// Write panics when its length does not match the object format.
+func Write(w io.Writer, objectFormat id.ObjectFormat, entries []Entry, packHash []byte) error {
+ hashSize := objectFormat.Size()
+ if hashSize == 0 {
+ return id.ErrInvalidObjectFormat
+ }
+
+ if len(packHash) != hashSize {
+ panic("internal/format/packidx: invalid pack hash length")
+ }
+
+ if len(entries) > math.MaxUint32 {
+ return fmt.Errorf("%w: too many entries", ErrInvalidEntries)
+ }
+
+ for i := 1; i < len(entries); i++ {
+ if bytes.Compare(entries[i-1].OID[:hashSize], entries[i].OID[:hashSize]) >= 0 {
+ return fmt.Errorf("%w: not sorted by object ID", ErrInvalidEntries)
+ }
+ }
+
+ hashImpl, err := objectFormat.New()
+ if err != nil {
+ return fmt.Errorf("internal/format/packidx: %w", err)
+ }
+
+ bw := bufio.NewWriter(io.MultiWriter(w, hashImpl))
+ sw := stickyio.New(bw)
+
+ sw.PutUint32(signature)
+ sw.PutUint32(version)
+
+ var counts [256]uint32
+ for i := range entries {
+ counts[entries[i].OID[0]]++
+ }
+
+ cumulative := uint32(0)
+ for _, count := range counts {
+ cumulative += count
+ sw.PutUint32(cumulative)
+ }
+
+ for i := range entries {
+ sw.Put(entries[i].OID[:hashSize])
+ }
+
+ for i := range entries {
+ sw.PutUint32(entries[i].CRC32)
+ }
+
+ largeOffsets := make([]uint64, 0, len(entries))
+
+ for i := range entries {
+ offset := entries[i].Offset
+ if offset < largeOffsetFlag {
+ sw.PutUint32(uint32(offset))
+
+ continue
+ }
+
+ slot := len(largeOffsets)
+ if slot >= largeOffsetFlag {
+ return fmt.Errorf("%w: too many large offsets", ErrInvalidEntries)
+ }
+
+ sw.PutUint32(largeOffsetFlag | uint32(slot))
+
+ largeOffsets = append(largeOffsets, offset)
+ }
+
+ for _, offset := range largeOffsets {
+ sw.PutUint64(offset)
+ }
+
+ sw.Put(packHash)
+
+ err = sw.Err()
+ if err != nil {
+ return fmt.Errorf("internal/format/packidx: %w", err)
+ }
+
+ err = bw.Flush()
+ if err != nil {
+ return fmt.Errorf("internal/format/packidx: %w", err)
+ }
+
+ _, err = w.Write(hashImpl.Sum(nil))
+ if err != nil {
+ return fmt.Errorf("internal/format/packidx: %w", err)
+ }
+
+ return nil
+}
diff --git a/internal/format/packidx/write_test.go b/internal/format/packidx/write_test.go
new file mode 100644
index 00000000..68df3ece
--- /dev/null
+++ b/internal/format/packidx/write_test.go
@@ -0,0 +1,112 @@
+package packidx_test
+
+import (
+ "bytes"
+ "errors"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx"
+ "lindenii.org/go/furgit/object/id"
+)
+
+// syntheticEntries builds n distinct entries sorted by object ID,
+// spread over the fanout table.
+func syntheticEntries(n int) []packidx.Entry {
+ entries := make([]packidx.Entry, n)
+ for i := range entries {
+ entries[i].OID[0] = byte(i * 7)
+ entries[i].OID[1] = byte(i + 1)
+ entries[i].Offset = uint64(i+1) * 100
+ entries[i].CRC32 = uint32(i+1) * 0x01010101
+ }
+
+ return entries
+}
+
+func TestWriteMatchesGit(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ _, prefix, _ := makeGitPack(t, objectFormat)
+ gitData, idx := parseGitIdxFile(t, prefix, objectFormat)
+
+ entries := make([]packidx.Entry, idx.NumObjects())
+ for pos := range entries {
+ copy(entries[pos].OID[:], idx.OIDAt(pos))
+ entries[pos].CRC32 = idx.CRCAt(pos)
+
+ offset, err := idx.OffsetAt(pos)
+ if err != nil {
+ t.Fatalf("OffsetAt(%d): %v", pos, err)
+ }
+
+ entries[pos].Offset = offset
+ }
+
+ var buf bytes.Buffer
+
+ err := packidx.Write(&buf, objectFormat, entries, idx.PackHash())
+ if err != nil {
+ t.Fatalf("Write: %v", err)
+ }
+
+ if !bytes.Equal(buf.Bytes(), gitData) {
+ t.Fatalf("Write output differs from git's index (%d vs %d bytes)", buf.Len(), len(gitData))
+ }
+ })
+ }
+}
+
+func TestWriteInvalidEntries(t *testing.T) {
+ t.Parallel()
+
+ unsorted := syntheticEntries(3)
+ unsorted[0], unsorted[2] = unsorted[2], unsorted[0]
+
+ duplicated := syntheticEntries(3)
+ duplicated[1] = duplicated[0]
+
+ cases := []struct {
+ name string
+ entries []packidx.Entry
+ }{
+ {name: "unsorted", entries: unsorted},
+ {name: "duplicated", entries: duplicated},
+ }
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ packHash := bytes.Repeat([]byte{0x5a}, objectFormat.Size())
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Parallel()
+
+ err := packidx.Write(&bytes.Buffer{}, objectFormat, tc.entries, packHash)
+ if !errors.Is(err, packidx.ErrInvalidEntries) {
+ t.Fatalf("Write error = %v, want ErrInvalidEntries", err)
+ }
+ })
+ }
+ })
+ }
+}
+
+func TestWriteBadPackHashPanics(t *testing.T) {
+ t.Parallel()
+
+ objectFormat := id.ObjectFormatSHA256
+
+ defer func() {
+ if recover() == nil {
+ t.Fatalf("Write with short pack hash: expected panic")
+ }
+ }()
+
+ _ = packidx.Write(&bytes.Buffer{}, objectFormat, nil, []byte{0x01})
+}
diff --git a/internal/format/packrev/doc.go b/internal/format/packrev/doc.go
new file mode 100644
index 00000000..6ce8113e
--- /dev/null
+++ b/internal/format/packrev/doc.go
@@ -0,0 +1,3 @@
+// Package packrev provides Git pack reverse index (version 1) format
+// parsing and writing primitives.
+package packrev
diff --git a/internal/format/packrev/helpers_test.go b/internal/format/packrev/helpers_test.go
new file mode 100644
index 00000000..2d781669
--- /dev/null
+++ b/internal/format/packrev/helpers_test.go
@@ -0,0 +1,75 @@
+package packrev_test
+
+import (
+ "cmp"
+ "slices"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx"
+ "lindenii.org/go/furgit/internal/testgit"
+ "lindenii.org/go/furgit/object/id"
+ "lindenii.org/go/lgo/intconv"
+)
+
+// makeGitPack seeds a repository,
+// packs the seeded objects with git pack-objects
+// including a reverse index,
+// and returns the artifact path prefix.
+func makeGitPack(t *testing.T, objectFormat id.ObjectFormat) string {
+ t.Helper()
+
+ repo, err := testgit.NewRepo(t, testgit.RepoOptions{ObjectFormat: objectFormat})
+ if err != nil {
+ t.Fatalf("NewRepo: %v", err)
+ }
+
+ seeded, err := repo.SeedHistory(t)
+ if err != nil {
+ t.Fatalf("SeedHistory: %v", err)
+ }
+
+ prefix, err := repo.PackObjects(t, seeded.All(), testgit.PackObjectsOptions{RevIndex: true})
+ if err != nil {
+ t.Fatalf("PackObjects: %v", err)
+ }
+
+ return prefix
+}
+
+// packOrderPositions derives the pack-offset-order index positions
+// from one parsed pack index.
+func packOrderPositions(t *testing.T, idx *packidx.Packidx) []uint32 {
+ t.Helper()
+
+ type pair struct {
+ offset uint64
+ position uint32
+ }
+
+ pairs := make([]pair, 0, idx.NumObjects())
+
+ for pos := range idx.NumObjects() {
+ offset, err := idx.OffsetAt(pos)
+ if err != nil {
+ t.Fatalf("OffsetAt(%d): %v", pos, err)
+ }
+
+ position, err := intconv.IntToUint32(pos)
+ if err != nil {
+ t.Fatalf("IntToUint32(%d): %v", pos, err)
+ }
+
+ pairs = append(pairs, pair{offset: offset, position: position})
+ }
+
+ slices.SortFunc(pairs, func(a, b pair) int {
+ return cmp.Compare(a.offset, b.offset)
+ })
+
+ positions := make([]uint32, 0, len(pairs))
+ for _, p := range pairs {
+ positions = append(positions, p.position)
+ }
+
+ return positions
+}
diff --git a/internal/format/packrev/packrev.go b/internal/format/packrev/packrev.go
new file mode 100644
index 00000000..3a6dc2de
--- /dev/null
+++ b/internal/format/packrev/packrev.go
@@ -0,0 +1,124 @@
+package packrev
+
+import (
+ "encoding/binary"
+ "errors"
+ "fmt"
+
+ "lindenii.org/go/furgit/object/id"
+ "lindenii.org/go/lgo/intconv"
+)
+
+// ErrMalformedReverseIndex reports that
+// a pack reverse index is truncated,
+// has a bad signature, version, or hash function,
+// or contains invalid index positions.
+var ErrMalformedReverseIndex = errors.New("internal/format/packrev: malformed pack reverse index")
+
+const (
+ signature = 0x52494458 // "RIDX"
+ version = 1
+
+ headerLen = 12
+)
+
+// hashFunctionID returns the on-disk hash function identifier
+// for one object format.
+func hashFunctionID(objectFormat id.ObjectFormat) (uint32, error) {
+ switch objectFormat {
+ case id.ObjectFormatSHA1:
+ return 1, nil
+ case id.ObjectFormatSHA256:
+ return 2, nil
+ case id.ObjectFormatUnknown:
+ }
+
+ return 0, id.ErrInvalidObjectFormat
+}
+
+// Packrev is a parsed pack reverse index view over borrowed bytes.
+//
+// Labels: Deps-Borrowed, Life-Parent, MT-Safe.
+type Packrev struct {
+ // data is the entire pack reverse index payload.
+ data []byte
+ // hashSize is the object ID size of the object format.
+ hashSize int
+ // numObjects is the number of index position entries.
+ numObjects int
+}
+
+// Parse parses a pack reverse index from data.
+func Parse(data []byte, objectFormat id.ObjectFormat) (Packrev, error) {
+ var zero Packrev
+
+ wantHashID, err := hashFunctionID(objectFormat)
+ if err != nil {
+ return zero, err
+ }
+
+ hashSize := objectFormat.Size()
+
+ if len(data) < headerLen+2*hashSize {
+ return zero, fmt.Errorf("%w: truncated", ErrMalformedReverseIndex)
+ }
+
+ if binary.BigEndian.Uint32(data) != signature {
+ return zero, fmt.Errorf("%w: bad signature", ErrMalformedReverseIndex)
+ }
+
+ if binary.BigEndian.Uint32(data[4:]) != version {
+ return zero, fmt.Errorf("%w: unsupported version", ErrMalformedReverseIndex)
+ }
+
+ if binary.BigEndian.Uint32(data[8:]) != wantHashID {
+ return zero, fmt.Errorf("%w: hash function mismatch", ErrMalformedReverseIndex)
+ }
+
+ positionBytes := len(data) - headerLen - 2*hashSize
+ if positionBytes%4 != 0 {
+ return zero, fmt.Errorf("%w: position table size not a 32-bit multiple", ErrMalformedReverseIndex)
+ }
+
+ return Packrev{
+ data: data,
+ hashSize: hashSize,
+ numObjects: positionBytes / 4,
+ }, nil
+}
+
+// NumObjects returns the number of index position entries.
+func (rev *Packrev) NumObjects() int {
+ return rev.numObjects
+}
+
+// PackHash returns the pack hash recorded in the trailer.
+//
+// Labels: Life-Parent, Mut-No.
+func (rev *Packrev) PackHash() []byte {
+ return rev.data[len(rev.data)-2*rev.hashSize : len(rev.data)-rev.hashSize]
+}
+
+// PositionAt returns the pack index position
+// of the object at a pack offset order position.
+//
+// PositionAt panics when packOrder is out of range,
+// and errors when the stored position is not a valid index position.
+func (rev *Packrev) PositionAt(packOrder int) (int, error) {
+ if packOrder < 0 || packOrder >= rev.numObjects {
+ panic("internal/format/packrev: pack order position out of range")
+ }
+
+ stored := binary.BigEndian.Uint32(rev.data[headerLen+4*packOrder:])
+
+ position, err := intconv.Uint32ToInt(stored)
+ if err != nil {
+ return 0, fmt.Errorf("%w: %w", ErrMalformedReverseIndex, err)
+ }
+
+ if position >= rev.numObjects {
+ return 0, fmt.Errorf("%w: index position out of range", ErrMalformedReverseIndex)
+ }
+
+ return position, nil
+}
diff --git a/internal/format/packrev/packrev_test.go b/internal/format/packrev/packrev_test.go
new file mode 100644
index 00000000..b644e15e
--- /dev/null
+++ b/internal/format/packrev/packrev_test.go
@@ -0,0 +1,160 @@
+package packrev_test
+
+import (
+ "bytes"
+ "encoding/binary"
+ "errors"
+ "os"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx"
+ "lindenii.org/go/furgit/internal/format/packrev"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func TestParseGitReverseIndex(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ prefix := makeGitPack(t, objectFormat)
+
+ revData, err := os.ReadFile(prefix + ".rev") //nolint:gosec
+ if err != nil {
+ t.Fatalf("ReadFile: %v", err)
+ }
+
+ rev, err := packrev.Parse(revData, objectFormat)
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+
+ idxData, err := os.ReadFile(prefix + ".idx") //nolint:gosec
+ if err != nil {
+ t.Fatalf("ReadFile: %v", err)
+ }
+
+ idx, err := packidx.Parse(idxData, objectFormat.Size())
+ if err != nil {
+ t.Fatalf("packidx.Parse: %v", err)
+ }
+
+ if rev.NumObjects() != idx.NumObjects() {
+ t.Fatalf("NumObjects = %d, want %d", rev.NumObjects(), idx.NumObjects())
+ }
+
+ packData, err := os.ReadFile(prefix + ".pack") //nolint:gosec
+ if err != nil {
+ t.Fatalf("ReadFile: %v", err)
+ }
+
+ packTrailer := packData[len(packData)-objectFormat.Size():]
+ if !bytes.Equal(rev.PackHash(), packTrailer) {
+ t.Fatalf("PackHash does not match pack trailer")
+ }
+
+ want := packOrderPositions(t, &idx)
+
+ for packOrder, wantPosition := range want {
+ position, err := rev.PositionAt(packOrder)
+ if err != nil {
+ t.Fatalf("PositionAt(%d): %v", packOrder, err)
+ }
+
+ if position != int(wantPosition) {
+ t.Fatalf("PositionAt(%d) = %d, want %d", packOrder, position, wantPosition)
+ }
+ }
+ })
+ }
+}
+
+func TestParseMalformed(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ valid := writeSyntheticRev(t, objectFormat, []uint32{2, 0, 1, 3})
+
+ corrupt := func(mutate func(data []byte) []byte) []byte {
+ return mutate(bytes.Clone(valid))
+ }
+
+ cases := []struct {
+ name string
+ data []byte
+ }{
+ {name: "empty", data: []byte{}},
+ {name: "truncated", data: corrupt(func(d []byte) []byte { return d[:10] })},
+ {
+ name: "bad signature",
+ data: corrupt(func(d []byte) []byte {
+ d[0] ^= 0xff
+
+ return d
+ }),
+ },
+ {
+ name: "bad version",
+ data: corrupt(func(d []byte) []byte {
+ d[7] = 2
+
+ return d
+ }),
+ },
+ {
+ name: "hash function mismatch",
+ data: corrupt(func(d []byte) []byte {
+ d[11] ^= 0xff
+
+ return d
+ }),
+ },
+ {
+ name: "position table size not 32-bit multiple",
+ data: corrupt(func(d []byte) []byte { return append(d, 0xde, 0xad) }),
+ },
+ }
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Parallel()
+
+ _, err := packrev.Parse(tc.data, objectFormat)
+ if !errors.Is(err, packrev.ErrMalformedReverseIndex) {
+ t.Fatalf("Parse error = %v, want ErrMalformedReverseIndex", err)
+ }
+ })
+ }
+ })
+ }
+}
+
+func TestPositionAtMalformed(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ data := writeSyntheticRev(t, objectFormat, []uint32{2, 0, 1, 3})
+
+ // Corrupt the first stored position to one past the object count.
+ binary.BigEndian.PutUint32(data[12:], 4)
+
+ rev, err := packrev.Parse(data, objectFormat)
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+
+ _, err = rev.PositionAt(0)
+ if !errors.Is(err, packrev.ErrMalformedReverseIndex) {
+ t.Fatalf("PositionAt error = %v, want ErrMalformedReverseIndex", err)
+ }
+ })
+ }
+}
diff --git a/internal/format/packrev/write.go b/internal/format/packrev/write.go
new file mode 100644
index 00000000..399c9157
--- /dev/null
+++ b/internal/format/packrev/write.go
@@ -0,0 +1,79 @@
+package packrev
+
+import (
+ "bufio"
+ "errors"
+ "fmt"
+ "io"
+ "math"
+
+ "lindenii.org/go/furgit/internal/stickyio"
+ "lindenii.org/go/furgit/object/id"
+)
+
+// ErrInvalidPositions reports that
+// positions supplied for a reverse index write
+// are out of range or too numerous.
+var ErrInvalidPositions = errors.New("internal/format/packrev: invalid positions")
+
+// Write writes one pack reverse index to w.
+//
+// positions holds, for each object in pack offset order,
+// the object's pack index position.
+// packHash must be the pack's trailer hash;
+// Write panics when its length does not match the object format.
+func Write(w io.Writer, objectFormat id.ObjectFormat, positions []uint32, packHash []byte) error {
+ hashID, err := hashFunctionID(objectFormat)
+ if err != nil {
+ return err
+ }
+
+ if len(packHash) != objectFormat.Size() {
+ panic("internal/format/packrev: invalid pack hash length")
+ }
+
+ if len(positions) > math.MaxUint32 {
+ return fmt.Errorf("%w: too many positions", ErrInvalidPositions)
+ }
+
+ for _, position := range positions {
+ if uint64(position) >= uint64(len(positions)) {
+ return fmt.Errorf("%w: index position out of range", ErrInvalidPositions)
+ }
+ }
+
+ hashImpl, err := objectFormat.New()
+ if err != nil {
+ return fmt.Errorf("internal/format/packrev: %w", err)
+ }
+
+ bw := bufio.NewWriter(io.MultiWriter(w, hashImpl))
+ sw := stickyio.New(bw)
+
+ sw.PutUint32(signature)
+ sw.PutUint32(version)
+ sw.PutUint32(hashID)
+
+ for _, position := range positions {
+ sw.PutUint32(position)
+ }
+
+ sw.Put(packHash)
+
+ err = sw.Err()
+ if err != nil {
+ return fmt.Errorf("internal/format/packrev: %w", err)
+ }
+
+ err = bw.Flush()
+ if err != nil {
+ return fmt.Errorf("internal/format/packrev: %w", err)
+ }
+
+ _, err = w.Write(hashImpl.Sum(nil))
+ if err != nil {
+ return fmt.Errorf("internal/format/packrev: %w", err)
+ }
+
+ return nil
+}
diff --git a/internal/format/packrev/write_test.go b/internal/format/packrev/write_test.go
new file mode 100644
index 00000000..b5c1fcb9
--- /dev/null
+++ b/internal/format/packrev/write_test.go
@@ -0,0 +1,135 @@
+package packrev_test
+
+import (
+ "bytes"
+ "errors"
+ "os"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx"
+ "lindenii.org/go/furgit/internal/format/packrev"
+ "lindenii.org/go/furgit/object/id"
+)
+
+// writeSyntheticRev writes one reverse index over positions
+// with a fixed fake pack hash.
+func writeSyntheticRev(t *testing.T, objectFormat id.ObjectFormat, positions []uint32) []byte {
+ t.Helper()
+
+ packHash := bytes.Repeat([]byte{0x5a}, objectFormat.Size())
+
+ var buf bytes.Buffer
+
+ err := packrev.Write(&buf, objectFormat, positions, packHash)
+ if err != nil {
+ t.Fatalf("Write: %v", err)
+ }
+
+ return buf.Bytes()
+}
+
+func TestWriteRoundTrip(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ positions := []uint32{8, 6, 7, 5, 3, 0, 4, 1, 2}
+ data := writeSyntheticRev(t, objectFormat, positions)
+
+ rev, err := packrev.Parse(data, objectFormat)
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+
+ if rev.NumObjects() != len(positions) {
+ t.Fatalf("NumObjects = %d, want %d", rev.NumObjects(), len(positions))
+ }
+
+ if !bytes.Equal(rev.PackHash(), bytes.Repeat([]byte{0x5a}, objectFormat.Size())) {
+ t.Fatalf("PackHash mismatch")
+ }
+
+ for packOrder, want := range positions {
+ position, err := rev.PositionAt(packOrder)
+ if err != nil {
+ t.Fatalf("PositionAt(%d): %v", packOrder, err)
+ }
+
+ if position != int(want) {
+ t.Fatalf("PositionAt(%d) = %d, want %d", packOrder, position, want)
+ }
+ }
+ })
+ }
+}
+
+func TestWriteMatchesGit(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ prefix := makeGitPack(t, objectFormat)
+
+ gitData, err := os.ReadFile(prefix + ".rev") //nolint:gosec
+ if err != nil {
+ t.Fatalf("ReadFile: %v", err)
+ }
+
+ idxData, err := os.ReadFile(prefix + ".idx") //nolint:gosec
+ if err != nil {
+ t.Fatalf("ReadFile: %v", err)
+ }
+
+ idx, err := packidx.Parse(idxData, objectFormat.Size())
+ if err != nil {
+ t.Fatalf("packidx.Parse: %v", err)
+ }
+
+ positions := packOrderPositions(t, &idx)
+
+ var buf bytes.Buffer
+
+ err = packrev.Write(&buf, objectFormat, positions, idx.PackHash())
+ if err != nil {
+ t.Fatalf("Write: %v", err)
+ }
+
+ if !bytes.Equal(buf.Bytes(), gitData) {
+ t.Fatalf("Write output differs from git's reverse index (%d vs %d bytes)", buf.Len(), len(gitData))
+ }
+ })
+ }
+}
+
+func TestWriteInvalidPositions(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ packHash := bytes.Repeat([]byte{0x5a}, objectFormat.Size())
+
+ err := packrev.Write(&bytes.Buffer{}, objectFormat, []uint32{0, 5}, packHash)
+ if !errors.Is(err, packrev.ErrInvalidPositions) {
+ t.Fatalf("Write error = %v, want ErrInvalidPositions", err)
+ }
+ })
+ }
+}
+
+func TestWriteBadPackHashPanics(t *testing.T) {
+ t.Parallel()
+
+ defer func() {
+ if recover() == nil {
+ t.Fatalf("Write with short pack hash: expected panic")
+ }
+ }()
+
+ _ = packrev.Write(&bytes.Buffer{}, id.ObjectFormatSHA256, nil, []byte{0x01})
+}