diff options
Diffstat (limited to 'internal/format/packidx')
| -rw-r--r-- | internal/format/packidx/bloom/bloom.go | 180 | ||||
| -rw-r--r-- | internal/format/packidx/bloom/bloom_test.go | 159 | ||||
| -rw-r--r-- | internal/format/packidx/bloom/doc.go | 138 | ||||
| -rw-r--r-- | internal/format/packidx/bloom/lookup.go | 42 | ||||
| -rw-r--r-- | internal/format/packidx/bloom/lookup_test.go | 32 | ||||
| -rw-r--r-- | internal/format/packidx/bloom/roundtrip_test.go | 92 | ||||
| -rw-r--r-- | internal/format/packidx/bloom/write.go | 164 | ||||
| -rw-r--r-- | internal/format/packidx/bloom/write_test.go | 95 | ||||
| -rw-r--r-- | internal/format/packidx/doc.go | 3 | ||||
| -rw-r--r-- | internal/format/packidx/helpers_test.go | 86 | ||||
| -rw-r--r-- | internal/format/packidx/lookup.go | 121 | ||||
| -rw-r--r-- | internal/format/packidx/lookup_test.go | 99 | ||||
| -rw-r--r-- | internal/format/packidx/packidx.go | 197 | ||||
| -rw-r--r-- | internal/format/packidx/packidx_test.go | 124 | ||||
| -rw-r--r-- | internal/format/packidx/roundtrip_test.go | 88 | ||||
| -rw-r--r-- | internal/format/packidx/write.go | 129 | ||||
| -rw-r--r-- | internal/format/packidx/write_test.go | 112 |
17 files changed, 1861 insertions, 0 deletions
diff --git a/internal/format/packidx/bloom/bloom.go b/internal/format/packidx/bloom/bloom.go new file mode 100644 index 00000000..8b608221 --- /dev/null +++ b/internal/format/packidx/bloom/bloom.go @@ -0,0 +1,180 @@ +package bloom + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "math/bits" + + "lindenii.org/go/furgit/object/id" +) + +// ErrMalformedBloomFilter reports that +// a Bloom filter is truncated, +// has a bad signature, version, or hash function, +// or has inconsistent parameters. +var ErrMalformedBloomFilter = errors.New("internal/format/packidx/bloom: malformed bloom filter") + +const ( + signature = 0x4944424c // "IDBL" + version = 1 + + // HeaderLen is the fixed header length in octets, + // i.e., the signature, version, hash function identifier, + // B, K, and the trailing zero padding. + HeaderLen = 64 + + // BucketLen is the length of one bucket in octets, + // chosen to match the most common cache-line size. + BucketLen = 64 + + // wordBits is the bit width of one bucket word. + wordBits = 64 + + // fieldBits is the width of one in-bucket position field. + fieldBits = 9 +) + +// checkParams validates the filter parameters +// against one object hash size, +// returning log2(bucketCount) on success. +func checkParams(bucketCount uint32, k uint16, hashSize int) (uint, error) { + switch { + case bucketCount == 0 || bucketCount&(bucketCount-1) != 0: + return 0, errors.New("bucket count not a nonzero power of two") //nolint:err113 + case k == 0: + return 0, errors.New("zero probe count") //nolint:err113 + } + + log2B := uint(bits.TrailingZeros32(bucketCount)) + if log2B+fieldBits*uint(k) > uint(hashSize)*8 { + return 0, errors.New("parameters exceed hash length") //nolint:err113 + } + + return log2B, nil +} + +// hashFunctionID returns the on-disk hash function identifier +// for one object format. +func hashFunctionID(objectFormat id.ObjectFormat) (uint32, error) { + switch objectFormat { + case id.ObjectFormatSHA1: + return 1, nil + case id.ObjectFormatSHA256: + return 2, nil + case id.ObjectFormatUnknown: + } + + return 0, id.ErrInvalidObjectFormat +} + +// Bloom is a parsed blocked Bloom filter view over borrowed bytes. +// +// Labels: Deps-Borrowed, Life-Parent, MT-Safe. +type Bloom struct { + // data is the entire filter payload. + data []byte + + // buckets is the bucket region, between the header and the trailer. + buckets []byte + + // objectFormat is the filter's object format. + objectFormat id.ObjectFormat + + // log2B is the base-2 logarithm of the bucket count, + // i.e. the number of leading object ID bits that select a bucket. + log2B uint + + // k is the number of bits set and tested per object ID. + k int +} + +// Parse parses one Bloom filter from data. +// +// Labels: Deps-Borrowed, Life-Parent. +func Parse(data []byte, objectFormat id.ObjectFormat) (Bloom, error) { + var zero Bloom + + wantHashID, err := hashFunctionID(objectFormat) + if err != nil { + return zero, err + } + + hashSize := objectFormat.Size() + + if len(data) < HeaderLen { + return zero, fmt.Errorf("%w: truncated", ErrMalformedBloomFilter) + } + + if binary.BigEndian.Uint32(data) != signature { + return zero, fmt.Errorf("%w: bad signature", ErrMalformedBloomFilter) + } + + if binary.BigEndian.Uint32(data[4:]) != version { + return zero, fmt.Errorf("%w: unsupported version", ErrMalformedBloomFilter) + } + + if binary.BigEndian.Uint32(data[8:]) != wantHashID { + return zero, fmt.Errorf("%w: hash function mismatch", ErrMalformedBloomFilter) + } + + bucketCount := binary.BigEndian.Uint32(data[12:]) + k := binary.BigEndian.Uint16(data[16:]) + + for _, octet := range data[18:HeaderLen] { + if octet != 0 { + return zero, fmt.Errorf("%w: nonzero padding", ErrMalformedBloomFilter) + } + } + + log2B, err := checkParams(bucketCount, k, hashSize) + if err != nil { + return zero, fmt.Errorf("%w: %w", ErrMalformedBloomFilter, err) + } + + want := uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize) //#nosec G115 + if uint64(len(data)) != want { + return zero, fmt.Errorf("%w: file size disagrees with bucket count", ErrMalformedBloomFilter) + } + + return Bloom{ + data: data, + buckets: data[HeaderLen : len(data)-2*hashSize], + objectFormat: objectFormat, + log2B: log2B, + k: int(k), + }, nil +} + +// PackHash returns the pack hash recorded in the filter trailer. +// +// Labels: Life-Parent, Mut-No. +func (f *Bloom) PackHash() []byte { + hashSize := f.objectFormat.Size() + end := len(f.data) - hashSize + + return f.data[end-hashSize : end] +} + +// Verify recomputes the filter's trailing checksum and reports any mismatch. +// +// Verify reads the whole filter, +// so callers should treat it as a deliberate integrity check +// rather than part of the open path. +func (f *Bloom) Verify() error { + hashImpl, err := f.objectFormat.New() + if err != nil { + return fmt.Errorf("internal/format/packidx/bloom: %w", err) + } + + checksumOff := len(f.data) - f.objectFormat.Size() + + _, _ = hashImpl.Write(f.data[:checksumOff]) + + if !bytes.Equal(hashImpl.Sum(nil), f.data[checksumOff:]) { + return fmt.Errorf("%w: checksum mismatch", ErrMalformedBloomFilter) + } + + return nil +} diff --git a/internal/format/packidx/bloom/bloom_test.go b/internal/format/packidx/bloom/bloom_test.go new file mode 100644 index 00000000..bcfb4419 --- /dev/null +++ b/internal/format/packidx/bloom/bloom_test.go @@ -0,0 +1,159 @@ +package bloom_test + +import ( + "encoding/binary" + "errors" + "testing" + + "lindenii.org/go/furgit/internal/format/packidx/bloom" + "lindenii.org/go/furgit/object/id" +) + +func validFilter(t *testing.T, format id.ObjectFormat) []byte { + // TODO: maybe testgit should have something like this? + t.Helper() + + builder, err := bloom.NewBuilder(format, 4, 2, make([]byte, format.Size())) + if err != nil { + t.Fatal(err) + } + + return builder.Bytes() +} + +func otherFormat(t *testing.T, format id.ObjectFormat) id.ObjectFormat { + t.Helper() + + for _, candidate := range id.SupportedObjectFormats() { + if candidate != format { + return candidate + } + } + + t.Skip("only one supported object format") + + return id.ObjectFormatUnknown +} + +func TestParseValid(t *testing.T) { + t.Parallel() + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + _, err := bloom.Parse(validFilter(t, format), format) + if err != nil { + t.Fatalf("Parse rejected a valid filter: %v", err) + } + }) + } +} + +func TestParseMalformed(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + mangle func(data []byte) []byte + }{ + {"truncated", func(data []byte) []byte { return data[:bloom.HeaderLen-1] }}, + {"bad signature", func(data []byte) []byte { + data[0] ^= 0xff + + return data + }}, + {"bad version", func(data []byte) []byte { + binary.BigEndian.PutUint32(data[4:], 99) + + return data + }}, + {"non power of two", func(data []byte) []byte { + binary.BigEndian.PutUint32(data[12:], 3) + + return data + }}, + {"zero probe count", func(data []byte) []byte { + binary.BigEndian.PutUint16(data[16:], 0) + + return data + }}, + {"parameters exceed hash", func(data []byte) []byte { + binary.BigEndian.PutUint32(data[12:], 1<<31) + binary.BigEndian.PutUint16(data[16:], 30) + + return data + }}, + {"nonzero padding", func(data []byte) []byte { + data[20] = 1 + + return data + }}, + {"size disagrees", func(data []byte) []byte { return data[:len(data)-1] }}, + } + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + data := tc.mangle(append([]byte(nil), validFilter(t, format)...)) + + _, err := bloom.Parse(data, format) + if !errors.Is(err, bloom.ErrMalformedBloomFilter) { + t.Fatalf("Parse error = %v, want ErrMalformedBloomFilter", err) + } + }) + } + }) + } +} + +// TestVerifyDetectsCorruption checks that Verify accepts a sound filter +// and rejects one whose bucket bytes have been altered. +func TestVerifyDetectsCorruption(t *testing.T) { + t.Parallel() + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + data := validFilter(t, format) + + filter, err := bloom.Parse(data, format) + if err != nil { + t.Fatal(err) + } + + err = filter.Verify() + if err != nil { + t.Fatalf("Verify on a sound filter: %v", err) + } + + data[bloom.HeaderLen] ^= 0xff + + err = filter.Verify() + if !errors.Is(err, bloom.ErrMalformedBloomFilter) { + t.Fatalf("Verify error = %v, want ErrMalformedBloomFilter", err) + } + }) + } +} + +func TestParseHashMismatch(t *testing.T) { + t.Parallel() + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + _, err := bloom.Parse(validFilter(t, format), otherFormat(t, format)) + if !errors.Is(err, bloom.ErrMalformedBloomFilter) { + t.Fatalf("Parse error = %v, want ErrMalformedBloomFilter", err) + } + }) + } +} diff --git a/internal/format/packidx/bloom/doc.go b/internal/format/packidx/bloom/doc.go new file mode 100644 index 00000000..06ca57cd --- /dev/null +++ b/internal/format/packidx/bloom/doc.go @@ -0,0 +1,138 @@ +// Package bloom provides a blocked Bloom filter +// for pack indexes. +// +// A filter answers, from a single cache-line-sized read, +// whether an object ID is definitely absent from the index it covers. +// A lookup that must consult many packs +// can then skip the full binary search +// in every pack whose filter rejects the object, +// decreasing the cost of misses. +// +// # Rationale +// +// Especially for server-side usage, +// repacking is expensive, +// and creating multi-pack-indexes is still rather expensive. +// Incremental multi-pack-indexes partially solve this, +// but having too many of them defeats the purpose, +// since the indexes must still be walked in order +// while performing expensive lookups. +// +// Instead, each multi-pack-index layer, +// and each ordinary pack index, +// may carry its own filter. +// The indexes are still traversed in their usual order, +// but the first step when traversing one +// is to check whether it could possibly hold the wanted object. +// +// The filter is split into 64-octet buckets, +// matching the most common cache-line size. +// Some bits of the object ID choose the bucket, +// and the rest choose several bit positions inside it, +// so a lookup reads one 64-octet bucket +// and checks whether all required bits are set. +// +// # Parameters +// +// A filter is parameterized by +// the number of buckets B +// and the number of bits set and tested per object ID, K. +// All integers in the format are big endian. +// The object ID is interpreted as a big-endian bitstring, +// where bit offset 0 is the most significant bit of octet 0. +// B must be a nonzero power of two, +// K must be nonzero, +// and log2(B) + 9*K must not exceed the hash length in bits. +// +// # File format +// +// A filter file is a 64-octet header, +// then B buckets of 64 octets each, +// then a two-hash trailer: +// +// - 4-octet signature: {'I', 'D', 'B', 'L'}. +// - 4-octet version identifier (= 1). +// - 4-octet object hash algorithm identifier +// (= 1 for SHA-1, 2 for SHA-256). +// - 4-octet B, the number of buckets. +// - 2-octet K, the number of bits set and tested per object ID. +// - 46-octet padding, which must be all zero. +// - B buckets of 64 octets each. +// - the pack trailer hash, which binds the filter to its pack. +// - the checksum of everything before it, over the filter's hash function. +// +// The hash length is that of the object format, +// so the trailer is 2 hashes wide +// and the file size is exactly 64 + 64*B + 2*hashlen octets. +// +// A reader must validate that +// the signature matches, +// the version is supported, +// the hash function identifier is recognized, +// B is nonzero and a power of two, +// K is nonzero, +// log2(B) + 9*K does not exceed the hash length in bits, +// the padding is all zero, +// and the file size is exactly 64 + 64*B + 2*hashlen octets. +// +// # Binding and integrity +// +// The pack hash binds a filter to one pack; +// a reader trusts a filter only when the recorded pack hash +// matches the pack it accompanies. +// +// The checksum guards against corruption of the filter itself. +// Recomputing it reads the whole file and rehashes it as fsck. +// +// # Lookup +// +// A lookup against one filter proceeds as follows: +// +// 1. Let b be the unsigned integer encoded +// by the most significant log2(B) bits of the object ID. +// B is a power of two, so 0 <= b < B. +// 2. Select and read bucket b. +// 3. For each 0 <= i < K, +// take the i-th 9-bit field +// from the 9*K bits that follow the bucket-selecting bits, +// and let pi be the unsigned integer it encodes, +// so 0 <= pi < 512. +// Compute wi = pi >> 6 and bi = pi & 63, +// so wi identifies one of the eight 64-bit words in bucket b +// and bi identifies one bit within that word. +// Within each 64-bit word, +// bit index 0 is the most significant bit +// and bit index 63 is the least significant bit. +// Test whether bit bi is set in word wi of bucket b. +// +// If any test fails, +// the object ID is definitely not in the covered index. +// If all tests succeed, +// the object ID may be in it. +// Two of the K 9-bit fields can decode to the same pi, +// so an insertion may set fewer than K distinct bits; +// this only raises the false positive rate +// and never causes a false negative. +// +// # Worked example +// +// Let B = 1 << 15 = 32768 and K = 8. +// Then log2(B) = 15, +// so each lookup uses 15 bits to choose the bucket +// and 8*9 = 72 bits to choose the in-bucket positions, +// for a total of 87 bits taken from the object ID. +// A SHA-1 has 160 bits and a SHA-256 has 256 bits, +// so both leave ample headroom. +// +// # Security considerations +// +// Object IDs are public unkeyed hashes, +// so an adversary can mine packs +// whose object IDs share a chosen prefix +// to crowd objects into one bucket +// and fill its bits. +// In the worst case this renders some buckets useless, +// making the filter degrade to "may contain" for those buckets, +// but it never produces a false negative +// and is not a significant denial-of-service vector. +package bloom diff --git a/internal/format/packidx/bloom/lookup.go b/internal/format/packidx/bloom/lookup.go new file mode 100644 index 00000000..4a8d7bda --- /dev/null +++ b/internal/format/packidx/bloom/lookup.go @@ -0,0 +1,42 @@ +package bloom + +import ( + "encoding/binary" +) + +// MayContain reports whether oid may be present +// in the index covered by the filter. +// +// oid must be exactly the filter's hash size; +// MayContain panics otherwise. +// +// Labels: Mut-No. +func (f *Bloom) MayContain(oid []byte) bool { + if len(oid) != f.objectFormat.Size() { + panic("internal/format/packidx/bloom: invalid object ID length") + } + + base := int(binary.BigEndian.Uint32(oid[:4])>>(32-f.log2B)) * BucketLen + + for i := range f.k { + word, mask := probe(oid, f.log2B, i) + if binary.BigEndian.Uint64(f.buckets[base+word*8:])&mask == 0 { + return false + } + } + + return true +} + +// probe returns the bucket word index and single-bit mask +// addressed by the i-th probe of oid. +func probe(oid []byte, log2B uint, i int) (word int, mask uint64) { + bitOff := log2B + fieldBits*uint(i) + byteOff := bitOff >> 3 + bitInByte := bitOff & 7 + + window := uint32(oid[byteOff])<<8 | uint32(oid[byteOff+1]) + pi := (window >> (16 - bitInByte - fieldBits)) & 0x1ff + + return int(pi >> 6), 1 << (wordBits - 1 - (pi & 63)) +} diff --git a/internal/format/packidx/bloom/lookup_test.go b/internal/format/packidx/bloom/lookup_test.go new file mode 100644 index 00000000..e6264f9a --- /dev/null +++ b/internal/format/packidx/bloom/lookup_test.go @@ -0,0 +1,32 @@ +package bloom_test + +import ( + "testing" + + "lindenii.org/go/furgit/internal/format/packidx/bloom" + "lindenii.org/go/furgit/object/id" +) + +func TestMayContainBadLength(t *testing.T) { + t.Parallel() + + format := id.ObjectFormatSHA256 + + builder, err := bloom.NewBuilder(format, 4, 2, make([]byte, format.Size())) + if err != nil { + t.Fatal(err) + } + + filter, err := bloom.Parse(builder.Bytes(), format) + if err != nil { + t.Fatal(err) + } + + defer func() { + if recover() == nil { + t.Fatal("MayContain did not panic on a short object ID") + } + }() + + filter.MayContain(make([]byte, format.Size()-1)) +} diff --git a/internal/format/packidx/bloom/roundtrip_test.go b/internal/format/packidx/bloom/roundtrip_test.go new file mode 100644 index 00000000..28db17a5 --- /dev/null +++ b/internal/format/packidx/bloom/roundtrip_test.go @@ -0,0 +1,92 @@ +package bloom_test + +import ( + "bytes" + "encoding/binary" + "testing" + + "lindenii.org/go/furgit/internal/format/packidx/bloom" + "lindenii.org/go/furgit/object/id" +) + +func makeOID(size int, seed uint64) []byte { + out := make([]byte, size) + state := seed + + for i := 0; i < size; i += 8 { + state = state*6364136223846793005 + 1442695040888963407 + + var word [8]byte + + binary.BigEndian.PutUint64(word[:], state) + copy(out[i:], word[:]) + } + + return out +} + +func TestRoundTrip(t *testing.T) { + t.Parallel() + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + const objects = 10000 + + bucketCount, k, err := bloom.RecommendParams(format, objects) + if err != nil { + t.Fatal(err) + } + + size := format.Size() + packHash := makeOID(size, 0xC0FFEE) + + builder, err := bloom.NewBuilder(format, bucketCount, k, packHash) + if err != nil { + t.Fatal(err) + } + + for i := range objects { + builder.Add(makeOID(size, uint64(i))) + } + + filter, err := bloom.Parse(builder.Bytes(), format) + if err != nil { + t.Fatal(err) + } + + if !bytes.Equal(filter.PackHash(), packHash) { + t.Fatalf("PackHash = %x, want %x", filter.PackHash(), packHash) + } + + err = filter.Verify() + if err != nil { + t.Fatalf("Verify on a freshly built filter: %v", err) + } + + for i := range objects { + if !filter.MayContain(makeOID(size, uint64(i))) { + t.Fatalf("false negative for added object %d", i) + } + } + + const probes = 10000 + + falsePositives := 0 + + for i := range probes { + if filter.MayContain(makeOID(size, uint64(1)<<40+uint64(i))) { + falsePositives++ + } + } + + rate := float64(falsePositives) / float64(probes) + if rate > 0.05 { + t.Errorf("false positive rate %.4f exceeds 0.05", rate) + } + + t.Logf("B=%d K=%d false positive rate %.4f", bucketCount, k, rate) + }) + } +} diff --git a/internal/format/packidx/bloom/write.go b/internal/format/packidx/bloom/write.go new file mode 100644 index 00000000..e6213a2c --- /dev/null +++ b/internal/format/packidx/bloom/write.go @@ -0,0 +1,164 @@ +package bloom + +import ( + "encoding/binary" + "errors" + "fmt" + "hash" + "math/bits" + + "lindenii.org/go/furgit/object/id" + "lindenii.org/go/lgo/intconv" +) + +// ErrInvalidParameters reports that +// the parameters supplied for a filter build +// are not representable in the format. +var ErrInvalidParameters = errors.New("internal/format/packidx/bloom: invalid parameters") + +// defaultK is the probe count used by [RecommendParams]. +// +// With 512-bit buckets it keeps the false positive rate near one percent +// at the target bucket load. +const defaultK = 8 + +// targetLoad is the object count per bucket that [RecommendParams] aims for. +const targetLoad = 48 + +// Builder accumulates object IDs into an in-memory Bloom filter +// and serializes it. +// +// Labels: MT-Unsafe. +type Builder struct { + // data is the full filter file, header and trailer included. + data []byte + + // buckets aliases the bucket region of data, between header and trailer. + buckets []byte + + // hashImpl computes the trailing checksum and gives the hash size. + hashImpl hash.Hash + + log2B uint + k int +} + +// NewBuilder creates a filter builder +// for bucketCount buckets and k probes per object ID, +// binding the filter to packHash. +// +// bucketCount must be a nonzero power of two, +// k must be nonzero, +// and log2(bucketCount) + 9*k must not exceed the hash length in bits. +// packHash must be the pack's trailer hash; +// NewBuilder panics when its length does not match the object format. +func NewBuilder(objectFormat id.ObjectFormat, bucketCount uint32, k uint16, packHash []byte) (*Builder, error) { + hashID, err := hashFunctionID(objectFormat) + if err != nil { + return nil, err + } + + hashImpl, err := objectFormat.New() + if err != nil { + return nil, fmt.Errorf("internal/format/packidx/bloom: %w", err) + } + + hashSize := objectFormat.Size() + + if len(packHash) != hashSize { + panic("internal/format/packidx/bloom: invalid pack hash length") + } + + log2B, err := checkParams(bucketCount, k, hashSize) + if err != nil { + return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err) + } + + total, err := intconv.Uint64ToInt(uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize)) //#nosec G115 + if err != nil { + return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err) + } + + data := make([]byte, total) + binary.BigEndian.PutUint32(data[0:], signature) + binary.BigEndian.PutUint32(data[4:], version) + binary.BigEndian.PutUint32(data[8:], hashID) + binary.BigEndian.PutUint32(data[12:], bucketCount) + binary.BigEndian.PutUint16(data[16:], k) + + bucketsEnd := total - 2*hashSize + copy(data[bucketsEnd:], packHash) + + return &Builder{ + data: data, + buckets: data[HeaderLen:bucketsEnd], + hashImpl: hashImpl, + log2B: log2B, + k: int(k), + }, nil +} + +// Add records oid in the filter. +// +// oid must be exactly the filter's hash size; +// Add panics otherwise. +func (b *Builder) Add(oid []byte) { + if len(oid) != b.hashImpl.Size() { + panic("internal/format/packidx/bloom: invalid object ID length") + } + + base := int(binary.BigEndian.Uint32(oid[:4])>>(32-b.log2B)) * BucketLen + + for i := range b.k { + word, mask := probe(oid, b.log2B, i) + + off := base + word*8 + set := binary.BigEndian.Uint64(b.buckets[off:]) | mask + binary.BigEndian.PutUint64(b.buckets[off:], set) + } +} + +// Bytes returns the serialized filter, including its trailing checksum. +// +// Labels: Life-Parent, Mut-No. +func (b *Builder) Bytes() []byte { + checksumOff := len(b.data) - b.hashImpl.Size() + + b.hashImpl.Reset() + _, _ = b.hashImpl.Write(b.data[:checksumOff]) + b.hashImpl.Sum(b.data[checksumOff:checksumOff]) + + return b.data +} + +// RecommendParams returns filter parameters for an index of n objects, +// targeting a false positive rate near one percent. +func RecommendParams(objectFormat id.ObjectFormat, n int) (bucketCount uint32, k uint16, err error) { + hashSize := objectFormat.Size() + if hashSize == 0 { + return 0, 0, id.ErrInvalidObjectFormat + } + + const maxPow2 = uint32(1) << 31 + + wanted := uint64(0) + if n > 0 { + wanted = (uint64(n) + targetLoad - 1) / targetLoad + } + + switch { + case wanted <= 1: + bucketCount = 1 + case wanted > uint64(maxPow2): + bucketCount = maxPow2 + default: + bucketCount = uint32(1) << bits.Len64(wanted-1) + } + + _, err = checkParams(bucketCount, defaultK, hashSize) + if err != nil { + return 0, 0, fmt.Errorf("%w: %w", ErrInvalidParameters, err) + } + + return bucketCount, defaultK, nil +} diff --git a/internal/format/packidx/bloom/write_test.go b/internal/format/packidx/bloom/write_test.go new file mode 100644 index 00000000..74173921 --- /dev/null +++ b/internal/format/packidx/bloom/write_test.go @@ -0,0 +1,95 @@ +package bloom_test + +import ( + "errors" + "testing" + + "lindenii.org/go/furgit/internal/format/packidx/bloom" + "lindenii.org/go/furgit/object/id" +) + +func TestRecommendParams(t *testing.T) { + t.Parallel() + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + for _, n := range []int{0, 1, 1000, 10000, 1000000} { + bucketCount, k, err := bloom.RecommendParams(format, n) + if err != nil { + t.Fatalf("n=%d: %v", n, err) + } + + if bucketCount == 0 || bucketCount&(bucketCount-1) != 0 { + t.Errorf("n=%d: bucket count %d not a power of two", n, bucketCount) + } + + _, err = bloom.NewBuilder(format, bucketCount, k, make([]byte, format.Size())) + if err != nil { + t.Errorf("n=%d: recommended parameters rejected: %v", n, err) + } + } + }) + } +} + +func TestNewBuilderRejects(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + bucketCount uint32 + k uint16 + }{ + {"zero buckets", 0, 8}, + {"non power of two", 3, 8}, + {"zero probe count", 4, 0}, + } + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + _, err := bloom.NewBuilder(format, tc.bucketCount, tc.k, make([]byte, format.Size())) + if !errors.Is(err, bloom.ErrInvalidParameters) { + t.Fatalf("error = %v, want ErrInvalidParameters", err) + } + }) + } + }) + } +} + +func TestNewBuilderBadPackHash(t *testing.T) { + t.Parallel() + + defer func() { + if recover() == nil { + t.Fatal("NewBuilder did not panic on a short pack hash") + } + }() + + _, _ = bloom.NewBuilder(id.ObjectFormatSHA256, 4, 2, make([]byte, id.ObjectFormatSHA256.Size()-1)) +} + +func TestAddBadLength(t *testing.T) { + t.Parallel() + + builder, err := bloom.NewBuilder(id.ObjectFormatSHA256, 4, 2, make([]byte, id.ObjectFormatSHA256.Size())) + if err != nil { + t.Fatal(err) + } + + defer func() { + if recover() == nil { + t.Fatal("Add did not panic on a short object ID") + } + }() + + builder.Add(make([]byte, id.ObjectFormatSHA256.Size()-1)) +} diff --git a/internal/format/packidx/doc.go b/internal/format/packidx/doc.go new file mode 100644 index 00000000..5c6015b4 --- /dev/null +++ b/internal/format/packidx/doc.go @@ -0,0 +1,3 @@ +// Package packidx provides Git pack index (version 2) format +// parsing and writing primitives. +package packidx diff --git a/internal/format/packidx/helpers_test.go b/internal/format/packidx/helpers_test.go new file mode 100644 index 00000000..be9b47d1 --- /dev/null +++ b/internal/format/packidx/helpers_test.go @@ -0,0 +1,86 @@ +package packidx_test + +import ( + "os" + "strconv" + "strings" + "testing" + + "lindenii.org/go/furgit/internal/format/packidx" + "lindenii.org/go/furgit/internal/testgit" + "lindenii.org/go/furgit/object/id" +) + +// makeGitPack seeds a repository, +// packs the seeded objects with git pack-objects, +// and returns the repository, the artifact path prefix, +// and the packed object IDs. +func makeGitPack(t *testing.T, objectFormat id.ObjectFormat) (*testgit.Repo, string, []id.ObjectID) { + t.Helper() + + repo, err := testgit.NewRepo(t, testgit.RepoOptions{ObjectFormat: objectFormat}) + if err != nil { + t.Fatalf("NewRepo: %v", err) + } + + seeded, err := repo.SeedHistory(t) + if err != nil { + t.Fatalf("SeedHistory: %v", err) + } + + oids := seeded.All() + + prefix, err := repo.PackObjects(t, oids, testgit.PackObjectsOptions{}) + if err != nil { + t.Fatalf("PackObjects: %v", err) + } + + return repo, prefix, oids +} + +// parseGitIdxFile reads and parses one .idx file produced by git. +func parseGitIdxFile(t *testing.T, prefix string, objectFormat id.ObjectFormat) ([]byte, packidx.Packidx) { + t.Helper() + + data, err := os.ReadFile(prefix + ".idx") //nolint:gosec + if err != nil { + t.Fatalf("ReadFile: %v", err) + } + + idx, err := packidx.Parse(data, objectFormat.Size()) + if err != nil { + t.Fatalf("Parse: %v", err) + } + + return data, idx +} + +// gitPackOffsets extracts the object-to-offset mapping +// from git verify-pack -v output. +func gitPackOffsets(t *testing.T, repo *testgit.Repo, idxPath string, objectFormat id.ObjectFormat) map[string]uint64 { + t.Helper() + + out, err := repo.VerifyPack(t, idxPath) + if err != nil { + t.Fatalf("VerifyPack: %v", err) + } + + hexLen := objectFormat.HexLen() + offsets := make(map[string]uint64) + + for line := range strings.Lines(string(out)) { + fields := strings.Fields(line) + if len(fields) < 5 || len(fields[0]) != hexLen { + continue + } + + offset, err := strconv.ParseUint(fields[4], 10, 64) + if err != nil { + continue + } + + offsets[fields[0]] = offset + } + + return offsets +} diff --git a/internal/format/packidx/lookup.go b/internal/format/packidx/lookup.go new file mode 100644 index 00000000..e4b565b6 --- /dev/null +++ b/internal/format/packidx/lookup.go @@ -0,0 +1,121 @@ +package packidx + +import ( + "bytes" + "encoding/binary" + "fmt" + "math/bits" +) + +// Lookup searches the index for one object ID +// and returns its pack file offset. +// +// oid must be exactly the index's hash size; +// Lookup panics otherwise. +func (idx *Packidx) Lookup(oid []byte) (offset uint64, found bool, err error) { + if len(oid) != idx.hashSize { + panic("internal/format/packidx: invalid object ID length") + } + + lo, hi := idx.fanoutRange(oid[0]) + + // Object IDs are uniform for honest inputs, + // interp on next 8 octets converges in O(log log n). + // + // OIDs are public unkeyed hashes, + // so an attacker may sample/filter/mine prefix clusters, + // making interpolation mis-estimate every probe. + // See https://runxiyu.org/comp/ch4ht/ + // for why cryptographic hash algorithms are insufficient. + // + // Cap the interp at bisect's probe count and finish by bisect; + // adversarial at O(log n) plus small interpolation overhead, + // honest exit well under the cap. + target := binary.BigEndian.Uint64(oid[1:9]) + + for budget := bits.Len(uint(hi - lo)); hi-lo > 8 && budget > 0; budget-- { + loKey := binary.BigEndian.Uint64(idx.OIDAt(lo)[1:9]) + hiKey := binary.BigEndian.Uint64(idx.OIDAt(hi - 1)[1:9]) + + var mid int + + switch { + case target <= loKey: + mid = lo + case target >= hiKey: + mid = hi - 1 + default: + hi128, lo128 := bits.Mul64(target-loKey, uint64(hi-lo-1)) //#nosec G115 + q, _ := bits.Div64(hi128, lo128, hiKey-loKey) + mid = lo + int(q) //#nosec G115 + } + + switch cmp := bytes.Compare(oid, idx.OIDAt(mid)); { + case cmp == 0: + offset, err = idx.OffsetAt(mid) + if err != nil { + return 0, false, err + } + + return offset, true, nil + case cmp < 0: + hi = mid + default: + lo = mid + 1 + } + } + + // Interpolation narrowed or capped; bisect to finish. + for lo < hi { + mid := lo + (hi-lo)/2 + + cmp := bytes.Compare(oid, idx.OIDAt(mid)) + + switch { + case cmp == 0: + offset, err = idx.OffsetAt(mid) + if err != nil { + return 0, false, err + } + + return offset, true, nil + case cmp < 0: + hi = mid + default: + lo = mid + 1 + } + } + + return 0, false, nil +} + +// OffsetAt returns the pack file offset at one index position. +// +// OffsetAt panics when pos is out of range. +func (idx *Packidx) OffsetAt(pos int) (uint64, error) { + idx.checkPos(pos) + + raw := binary.BigEndian.Uint32(idx.data[idx.off32Off+4*pos:]) + if raw&largeOffsetFlag == 0 { + return uint64(raw), nil + } + + slot := raw &^ largeOffsetFlag + if uint64(slot) >= idx.off64Count { + return 0, fmt.Errorf("%w: 64-bit offset reference out of range", ErrMalformedPackIndex) + } + + return binary.BigEndian.Uint64(idx.data[idx.off64Off+8*int(slot):]), nil +} + +// fanoutRange returns the index position range [lo, hi) +// of object IDs beginning with first. +func (idx *Packidx) fanoutRange(first byte) (lo, hi int) { + hi = int(binary.BigEndian.Uint32(idx.data[headerLen+4*int(first):])) + + if first > 0 { + lo = int(binary.BigEndian.Uint32(idx.data[headerLen+4*(int(first)-1):])) + } + + return lo, hi +} diff --git a/internal/format/packidx/lookup_test.go b/internal/format/packidx/lookup_test.go new file mode 100644 index 00000000..514aad7a --- /dev/null +++ b/internal/format/packidx/lookup_test.go @@ -0,0 +1,99 @@ +package packidx_test + +import ( + "bytes" + "encoding/binary" + "errors" + "testing" + + "lindenii.org/go/furgit/internal/format/packidx" + "lindenii.org/go/furgit/object/id" +) + +func TestLookupGitIndex(t *testing.T) { + t.Parallel() + + for _, objectFormat := range id.SupportedObjectFormats() { + t.Run(objectFormat.String(), func(t *testing.T) { + t.Parallel() + + repo, prefix, oids := makeGitPack(t, objectFormat) + _, idx := parseGitIdxFile(t, prefix, objectFormat) + + wantOffsets := gitPackOffsets(t, repo, prefix+".idx", objectFormat) + if len(wantOffsets) != len(oids) { + t.Fatalf("verify-pack offsets = %d entries, want %d", len(wantOffsets), len(oids)) + } + + for _, oid := range oids { + offset, found, err := idx.Lookup(oid.RawBytes()) + if err != nil { + t.Fatalf("Lookup(%s): %v", oid, err) + } + + if !found { + t.Fatalf("Lookup(%s) not found", oid) + } + + if offset != wantOffsets[oid.String()] { + t.Fatalf("Lookup(%s) = %d, want %d", oid, offset, wantOffsets[oid.String()]) + } + } + }) + } +} + +func TestLookupMissing(t *testing.T) { + t.Parallel() + + for _, objectFormat := range id.SupportedObjectFormats() { + t.Run(objectFormat.String(), func(t *testing.T) { + t.Parallel() + + _, prefix, oids := makeGitPack(t, objectFormat) + _, idx := parseGitIdxFile(t, prefix, objectFormat) + + missing := bytes.Clone(oids[0].RawBytes()) + missing[len(missing)-1] ^= 0xff + + _, found, err := idx.Lookup(missing) + if err != nil { + t.Fatalf("Lookup: %v", err) + } + + if found { + t.Fatalf("Lookup of mutated oid unexpectedly found") + } + }) + } +} + +func TestOffsetAtMalformedLargeReference(t *testing.T) { + t.Parallel() + + for _, objectFormat := range id.SupportedObjectFormats() { + t.Run(objectFormat.String(), func(t *testing.T) { + t.Parallel() + + hashSize := objectFormat.Size() + + entries := syntheticEntries(3) + data := writeSyntheticIndex(t, objectFormat, entries) + + // Mark the first 32-bit offset entry as a 64-bit reference; + // the index has no 64-bit offset table. + off32Off := 8 + 256*4 + len(entries)*hashSize + len(entries)*4 + binary.BigEndian.PutUint32(data[off32Off:], 0x80000000) + + idx, err := packidx.Parse(data, hashSize) + if err != nil { + t.Fatalf("Parse: %v", err) + } + + _, err = idx.OffsetAt(0) + if !errors.Is(err, packidx.ErrMalformedPackIndex) { + t.Fatalf("OffsetAt error = %v, want ErrMalformedPackIndex", err) + } + }) + } +} diff --git a/internal/format/packidx/packidx.go b/internal/format/packidx/packidx.go new file mode 100644 index 00000000..ef2c93f4 --- /dev/null +++ b/internal/format/packidx/packidx.go @@ -0,0 +1,197 @@ +package packidx + +import ( + "encoding/binary" + "errors" + "fmt" + + "lindenii.org/go/furgit/object/id" + "lindenii.org/go/lgo/intconv" +) + +// ErrMalformedPackIndex reports that +// a pack index is truncated, +// has a bad signature or unsupported version, +// or has inconsistent tables. +var ErrMalformedPackIndex = errors.New("internal/format/packidx: malformed pack index") + +const ( + signature = 0xff744f63 + version = 2 + + headerLen = 8 + fanoutLen = 256 * 4 + + // largeOffsetFlag marks one 32-bit offset table entry + // as an index into the 64-bit offset table. + largeOffsetFlag = 0x80000000 +) + +// Packidx is one parsed pack index view over borrowed bytes. +// +// Labels: Deps-Borrowed, Life-Parent, MT-Safe. +type Packidx struct { + // data is the entire pack index payload. + data []byte + // hashSize is the object ID size of the index's object format. + hashSize int + + // numObjects is the object count from the last fanout entry. + numObjects int + + // namesOff, crcOff, off32Off, and off64Off are + // the byte offsets of the object ID, CRC32, + // 32-bit offset, and 64-bit offset tables. + namesOff int + crcOff int + off32Off int + off64Off int + // off64Count is the number of 64-bit offset table entries. + off64Count uint64 +} + +// Parse parses one pack index from data. +// +// hashSize must be the object ID size +// of the pack's object format; +// Parse panics on implausible hash sizes. +func Parse(data []byte, hashSize int) (Packidx, error) { + var zero Packidx + + if hashSize <= 0 || hashSize > id.MaxObjectIDSize { + panic("internal/format/packidx: invalid hash size") + } + + if len(data) < headerLen+fanoutLen+2*hashSize { + return zero, fmt.Errorf("%w: truncated", ErrMalformedPackIndex) + } + + if binary.BigEndian.Uint32(data) != signature { + return zero, fmt.Errorf("%w: bad signature", ErrMalformedPackIndex) + } + + if binary.BigEndian.Uint32(data[4:]) != version { + return zero, fmt.Errorf("%w: unsupported version", ErrMalformedPackIndex) + } + + prev := uint32(0) + + for i := range 256 { + count := binary.BigEndian.Uint32(data[headerLen+4*i:]) + if count < prev { + return zero, fmt.Errorf("%w: non-monotonic fanout", ErrMalformedPackIndex) + } + + prev = count + } + + numObjects := uint64(prev) + hashSize64 := uint64(hashSize) + + namesOff := uint64(headerLen + fanoutLen) + crcOff := namesOff + numObjects*hashSize64 + off32Off := crcOff + 4*numObjects + off64Off := off32Off + 4*numObjects + + minTotal := off64Off + 2*hashSize64 + + dataLen, err := intconv.IntToUint64(len(data)) + if err != nil { + return zero, fmt.Errorf("%w: %w", ErrMalformedPackIndex, err) + } + + if dataLen < minTotal { + return zero, fmt.Errorf("%w: tables exceed index size", ErrMalformedPackIndex) + } + + off64Bytes := dataLen - minTotal + if off64Bytes%8 != 0 { + return zero, fmt.Errorf("%w: trailing table size not a 64-bit offset multiple", ErrMalformedPackIndex) + } + + off64Count := off64Bytes / 8 + if off64Count > numObjects { + return zero, fmt.Errorf("%w: more 64-bit offsets than objects", ErrMalformedPackIndex) + } + + idx := Packidx{ + data: data, + hashSize: hashSize, + off64Count: off64Count, + } + + idx.numObjects, err = intconv.Uint64ToInt(numObjects) + if err != nil { + return zero, fmt.Errorf("%w: %w", ErrMalformedPackIndex, err) + } + + idx.namesOff, err = intconv.Uint64ToInt(namesOff) + if err != nil { + return zero, fmt.Errorf("%w: %w", ErrMalformedPackIndex, err) + } + + idx.crcOff, err = intconv.Uint64ToInt(crcOff) + if err != nil { + return zero, fmt.Errorf("%w: %w", ErrMalformedPackIndex, err) + } + + idx.off32Off, err = intconv.Uint64ToInt(off32Off) + if err != nil { + return zero, fmt.Errorf("%w: %w", ErrMalformedPackIndex, err) + } + + idx.off64Off, err = intconv.Uint64ToInt(off64Off) + if err != nil { + return zero, fmt.Errorf("%w: %w", ErrMalformedPackIndex, err) + } + + return idx, nil +} + +// NumObjects returns the number of objects in the index. +func (idx *Packidx) NumObjects() int { + return idx.numObjects +} + +// PackHash returns the pack hash recorded in the index trailer. +// +// Labels: Life-Parent, Mut-No. +func (idx *Packidx) PackHash() []byte { + return idx.data[len(idx.data)-2*idx.hashSize : len(idx.data)-idx.hashSize] +} + +// OIDAt returns the object ID bytes at one index position. +// Positions follow object ID sort order. +// +// OIDAt panics when pos is out of range. +// +// Labels: Life-Parent, Mut-No. +func (idx *Packidx) OIDAt(pos int) []byte { + idx.checkPos(pos) + + start := idx.namesOff + pos*idx.hashSize + + return idx.data[start : start+idx.hashSize] +} + +// CRCAt returns the CRC32 of the packed entry data +// at one index position. +// +// CRCAt panics when pos is out of range. +func (idx *Packidx) CRCAt(pos int) uint32 { + idx.checkPos(pos) + + return binary.BigEndian.Uint32(idx.data[idx.crcOff+4*pos:]) +} + +// checkPos panics when pos is not a valid index position. +// +// An out-of-range position is a caller bug +// that slice bounds checking would not catch, +// since the tables share one data slice; +// an unchecked access would silently read other tables' bytes. +func (idx *Packidx) checkPos(pos int) { + if pos < 0 || pos >= idx.numObjects { + panic("internal/format/packidx: index position out of range") + } +} diff --git a/internal/format/packidx/packidx_test.go b/internal/format/packidx/packidx_test.go new file mode 100644 index 00000000..d225a993 --- /dev/null +++ b/internal/format/packidx/packidx_test.go @@ -0,0 +1,124 @@ +package packidx_test + +import ( + "bytes" + "encoding/binary" + "errors" + "os" + "testing" + + "lindenii.org/go/furgit/internal/format/packidx" + "lindenii.org/go/furgit/object/id" +) + +func TestParseGitIndex(t *testing.T) { + t.Parallel() + + for _, objectFormat := range id.SupportedObjectFormats() { + t.Run(objectFormat.String(), func(t *testing.T) { + t.Parallel() + + _, prefix, oids := makeGitPack(t, objectFormat) + _, idx := parseGitIdxFile(t, prefix, objectFormat) + + if idx.NumObjects() != len(oids) { + t.Fatalf("NumObjects = %d, want %d", idx.NumObjects(), len(oids)) + } + + for pos := 1; pos < idx.NumObjects(); pos++ { + if bytes.Compare(idx.OIDAt(pos-1), idx.OIDAt(pos)) >= 0 { + t.Fatalf("OIDAt(%d) not sorted after predecessor", pos) + } + } + + packData, err := os.ReadFile(prefix + ".pack") //nolint:gosec + if err != nil { + t.Fatalf("ReadFile: %v", err) + } + + packTrailer := packData[len(packData)-objectFormat.Size():] + if !bytes.Equal(idx.PackHash(), packTrailer) { + t.Fatalf("PackHash does not match pack trailer") + } + }) + } +} + +func TestParseMalformed(t *testing.T) { + t.Parallel() + + for _, objectFormat := range id.SupportedObjectFormats() { + t.Run(objectFormat.String(), func(t *testing.T) { + t.Parallel() + + hashSize := objectFormat.Size() + + valid := writeSyntheticIndex(t, objectFormat, syntheticEntries(8)) + + corrupt := func(mutate func(data []byte) []byte) []byte { + return mutate(bytes.Clone(valid)) + } + + cases := []struct { + name string + data []byte + }{ + {name: "empty", data: []byte{}}, + {name: "truncated", data: corrupt(func(d []byte) []byte { return d[:20] })}, + { + name: "bad signature", + data: corrupt(func(d []byte) []byte { + d[0] ^= 0xff + + return d + }), + }, + { + name: "bad version", + data: corrupt(func(d []byte) []byte { + d[7] = 3 + + return d + }), + }, + { + name: "non-monotonic fanout", + data: corrupt(func(d []byte) []byte { + binary.BigEndian.PutUint32(d[8:], 0xffffffff) + + return d + }), + }, + { + name: "tables exceed index size", + data: corrupt(func(d []byte) []byte { + binary.BigEndian.PutUint32(d[8+255*4:], 0x00ffffff) + + return d + }), + }, + { + name: "trailing size not 64-bit multiple", + data: corrupt(func(d []byte) []byte { return append(d, 0xde, 0xad, 0xbe, 0xef) }), + }, + { + name: "more 64-bit offsets than objects", + data: corrupt(func(d []byte) []byte { + return append(d, bytes.Repeat([]byte{0x00}, 9*8)...) + }), + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + _, err := packidx.Parse(tc.data, hashSize) + if !errors.Is(err, packidx.ErrMalformedPackIndex) { + t.Fatalf("Parse error = %v, want ErrMalformedPackIndex", err) + } + }) + } + }) + } +} diff --git a/internal/format/packidx/roundtrip_test.go b/internal/format/packidx/roundtrip_test.go new file mode 100644 index 00000000..f5bd82e7 --- /dev/null +++ b/internal/format/packidx/roundtrip_test.go @@ -0,0 +1,88 @@ +package packidx_test + +import ( + "bytes" + "math" + "testing" + + "lindenii.org/go/furgit/internal/format/packidx" + "lindenii.org/go/furgit/object/id" +) + +func TestWriteRoundTrip(t *testing.T) { + t.Parallel() + + for _, objectFormat := range id.SupportedObjectFormats() { + t.Run(objectFormat.String(), func(t *testing.T) { + t.Parallel() + + hashSize := objectFormat.Size() + + entries := syntheticEntries(20) + entries[3].Offset = 1 << 33 + entries[11].Offset = math.MaxUint64 + entries[12].Offset = 0x7fffffff + entries[13].Offset = 0x80000000 + + data := writeSyntheticIndex(t, objectFormat, entries) + + idx, err := packidx.Parse(data, hashSize) + if err != nil { + t.Fatalf("Parse: %v", err) + } + + if idx.NumObjects() != len(entries) { + t.Fatalf("NumObjects = %d, want %d", idx.NumObjects(), len(entries)) + } + + if !bytes.Equal(idx.PackHash(), bytes.Repeat([]byte{0x5a}, hashSize)) { + t.Fatalf("PackHash mismatch") + } + + for pos, entry := range entries { + if !bytes.Equal(idx.OIDAt(pos), entry.OID[:hashSize]) { + t.Fatalf("OIDAt(%d) mismatch", pos) + } + + if idx.CRCAt(pos) != entry.CRC32 { + t.Fatalf("CRCAt(%d) = %x, want %x", pos, idx.CRCAt(pos), entry.CRC32) + } + + offset, err := idx.OffsetAt(pos) + if err != nil { + t.Fatalf("OffsetAt(%d): %v", pos, err) + } + + if offset != entry.Offset { + t.Fatalf("OffsetAt(%d) = %d, want %d", pos, offset, entry.Offset) + } + + offset, found, err := idx.Lookup(entry.OID[:hashSize]) + if err != nil { + t.Fatalf("Lookup(%d): %v", pos, err) + } + + if !found || offset != entry.Offset { + t.Fatalf("Lookup(%d) = %d %v, want %d found", pos, offset, found, entry.Offset) + } + } + }) + } +} + +// writeSyntheticIndex writes one index over entries +// with a fixed fake pack hash. +func writeSyntheticIndex(t *testing.T, objectFormat id.ObjectFormat, entries []packidx.Entry) []byte { + t.Helper() + + packHash := bytes.Repeat([]byte{0x5a}, objectFormat.Size()) + + var buf bytes.Buffer + + err := packidx.Write(&buf, objectFormat, entries, packHash) + if err != nil { + t.Fatalf("Write: %v", err) + } + + return buf.Bytes() +} diff --git a/internal/format/packidx/write.go b/internal/format/packidx/write.go new file mode 100644 index 00000000..35b2805f --- /dev/null +++ b/internal/format/packidx/write.go @@ -0,0 +1,129 @@ +package packidx + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "io" + "math" + + "lindenii.org/go/furgit/internal/stickyio" + "lindenii.org/go/furgit/object/id" +) + +// ErrInvalidEntries reports that +// entries supplied for an index write +// are unsorted, duplicated, or not representable +// in the pack index format. +var ErrInvalidEntries = errors.New("internal/format/packidx: invalid entries") + +// Entry is one object record for an index write. +type Entry struct { + // OID holds the object ID bytes; + // only the first hash-size bytes are meaningful. + OID [id.MaxObjectIDSize]byte + // Offset is the entry's pack file offset. + Offset uint64 + // CRC32 is the CRC32 of the entry's packed data. + CRC32 uint32 +} + +// Write writes one pack index over entries to w. +// +// entries must be sorted by object ID without duplicates. +// packHash must be the pack's trailer hash; +// Write panics when its length does not match the object format. +func Write(w io.Writer, objectFormat id.ObjectFormat, entries []Entry, packHash []byte) error { + hashSize := objectFormat.Size() + if hashSize == 0 { + return id.ErrInvalidObjectFormat + } + + if len(packHash) != hashSize { + panic("internal/format/packidx: invalid pack hash length") + } + + if len(entries) > math.MaxUint32 { + return fmt.Errorf("%w: too many entries", ErrInvalidEntries) + } + + for i := 1; i < len(entries); i++ { + if bytes.Compare(entries[i-1].OID[:hashSize], entries[i].OID[:hashSize]) >= 0 { + return fmt.Errorf("%w: not sorted by object ID", ErrInvalidEntries) + } + } + + hashImpl, err := objectFormat.New() + if err != nil { + return fmt.Errorf("internal/format/packidx: %w", err) + } + + bw := bufio.NewWriter(io.MultiWriter(w, hashImpl)) + sw := stickyio.New(bw) + + sw.PutUint32(signature) + sw.PutUint32(version) + + var counts [256]uint32 + for i := range entries { + counts[entries[i].OID[0]]++ + } + + cumulative := uint32(0) + for _, count := range counts { + cumulative += count + sw.PutUint32(cumulative) + } + + for i := range entries { + sw.Put(entries[i].OID[:hashSize]) + } + + for i := range entries { + sw.PutUint32(entries[i].CRC32) + } + + largeOffsets := make([]uint64, 0, len(entries)) + + for i := range entries { + offset := entries[i].Offset + if offset < largeOffsetFlag { + sw.PutUint32(uint32(offset)) + + continue + } + + slot := len(largeOffsets) + if slot >= largeOffsetFlag { + return fmt.Errorf("%w: too many large offsets", ErrInvalidEntries) + } + + sw.PutUint32(largeOffsetFlag | uint32(slot)) + + largeOffsets = append(largeOffsets, offset) + } + + for _, offset := range largeOffsets { + sw.PutUint64(offset) + } + + sw.Put(packHash) + + err = sw.Err() + if err != nil { + return fmt.Errorf("internal/format/packidx: %w", err) + } + + err = bw.Flush() + if err != nil { + return fmt.Errorf("internal/format/packidx: %w", err) + } + + _, err = w.Write(hashImpl.Sum(nil)) + if err != nil { + return fmt.Errorf("internal/format/packidx: %w", err) + } + + return nil +} diff --git a/internal/format/packidx/write_test.go b/internal/format/packidx/write_test.go new file mode 100644 index 00000000..68df3ece --- /dev/null +++ b/internal/format/packidx/write_test.go @@ -0,0 +1,112 @@ +package packidx_test + +import ( + "bytes" + "errors" + "testing" + + "lindenii.org/go/furgit/internal/format/packidx" + "lindenii.org/go/furgit/object/id" +) + +// syntheticEntries builds n distinct entries sorted by object ID, +// spread over the fanout table. +func syntheticEntries(n int) []packidx.Entry { + entries := make([]packidx.Entry, n) + for i := range entries { + entries[i].OID[0] = byte(i * 7) + entries[i].OID[1] = byte(i + 1) + entries[i].Offset = uint64(i+1) * 100 + entries[i].CRC32 = uint32(i+1) * 0x01010101 + } + + return entries +} + +func TestWriteMatchesGit(t *testing.T) { + t.Parallel() + + for _, objectFormat := range id.SupportedObjectFormats() { + t.Run(objectFormat.String(), func(t *testing.T) { + t.Parallel() + + _, prefix, _ := makeGitPack(t, objectFormat) + gitData, idx := parseGitIdxFile(t, prefix, objectFormat) + + entries := make([]packidx.Entry, idx.NumObjects()) + for pos := range entries { + copy(entries[pos].OID[:], idx.OIDAt(pos)) + entries[pos].CRC32 = idx.CRCAt(pos) + + offset, err := idx.OffsetAt(pos) + if err != nil { + t.Fatalf("OffsetAt(%d): %v", pos, err) + } + + entries[pos].Offset = offset + } + + var buf bytes.Buffer + + err := packidx.Write(&buf, objectFormat, entries, idx.PackHash()) + if err != nil { + t.Fatalf("Write: %v", err) + } + + if !bytes.Equal(buf.Bytes(), gitData) { + t.Fatalf("Write output differs from git's index (%d vs %d bytes)", buf.Len(), len(gitData)) + } + }) + } +} + +func TestWriteInvalidEntries(t *testing.T) { + t.Parallel() + + unsorted := syntheticEntries(3) + unsorted[0], unsorted[2] = unsorted[2], unsorted[0] + + duplicated := syntheticEntries(3) + duplicated[1] = duplicated[0] + + cases := []struct { + name string + entries []packidx.Entry + }{ + {name: "unsorted", entries: unsorted}, + {name: "duplicated", entries: duplicated}, + } + + for _, objectFormat := range id.SupportedObjectFormats() { + t.Run(objectFormat.String(), func(t *testing.T) { + t.Parallel() + + packHash := bytes.Repeat([]byte{0x5a}, objectFormat.Size()) + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + err := packidx.Write(&bytes.Buffer{}, objectFormat, tc.entries, packHash) + if !errors.Is(err, packidx.ErrInvalidEntries) { + t.Fatalf("Write error = %v, want ErrInvalidEntries", err) + } + }) + } + }) + } +} + +func TestWriteBadPackHashPanics(t *testing.T) { + t.Parallel() + + objectFormat := id.ObjectFormatSHA256 + + defer func() { + if recover() == nil { + t.Fatalf("Write with short pack hash: expected panic") + } + }() + + _ = packidx.Write(&bytes.Buffer{}, objectFormat, nil, []byte{0x01}) +} |
