diff options
| author | 2026-06-14 14:11:46 +0000 | |
|---|---|---|
| committer | 2026-06-14 14:11:46 +0000 | |
| commit | 56db52ce91661de515a8581b1b3d0d5500e2c8f7 (patch) | |
| tree | 957aade58bc6e9b35f9b0faff5d7052e2a47f7cf /internal/format | |
| parent | object/store/packed: Skip if bloom filter says impossible (diff) | |
internal/format/packidx/bloom: Add trailers
Diffstat (limited to 'internal/format')
| -rw-r--r-- | internal/format/packidx/bloom/bloom.go | 55 | ||||
| -rw-r--r-- | internal/format/packidx/bloom/lookup.go | 2 | ||||
| -rw-r--r-- | internal/format/packidx/bloom/write.go | 48 |
3 files changed, 83 insertions, 22 deletions
diff --git a/internal/format/packidx/bloom/bloom.go b/internal/format/packidx/bloom/bloom.go index 2605a8e1..71de2618 100644 --- a/internal/format/packidx/bloom/bloom.go +++ b/internal/format/packidx/bloom/bloom.go @@ -1,6 +1,7 @@ package bloom import ( + "bytes" "encoding/binary" "errors" "fmt" @@ -72,18 +73,21 @@ func hashFunctionID(objectFormat id.ObjectFormat) (uint32, error) { // // Labels: Deps-Borrowed, Life-Parent, MT-Safe. type Bloom struct { - // buckets is the bucket region, after the header. + // data is the entire filter payload. + data []byte + + // buckets is the bucket region, between the header and the trailer. buckets []byte + // objectFormat is the filter's object format. + objectFormat id.ObjectFormat + // log2B is the base-2 logarithm of the bucket count, // i.e. the number of leading object ID bits that select a bucket. log2B uint // k is the number of bits set and tested per object ID. k int - - // hashSize is the object ID size of the filter's object format. - hashSize int } // Parse parses one Bloom filter from data. @@ -129,15 +133,48 @@ func Parse(data []byte, objectFormat id.ObjectFormat) (Bloom, error) { return zero, fmt.Errorf("%w: %w", ErrMalformedBloomFilter, err) } - want := uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + want := uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize) if uint64(len(data)) != want { return zero, fmt.Errorf("%w: file size disagrees with bucket count", ErrMalformedBloomFilter) } return Bloom{ - buckets: data[HeaderLen:], - log2B: log2B, - k: int(k), - hashSize: hashSize, + data: data, + buckets: data[HeaderLen : len(data)-2*hashSize], + objectFormat: objectFormat, + log2B: log2B, + k: int(k), }, nil } + +// PackHash returns the pack hash recorded in the filter trailer. +// +// Labels: Life-Parent, Mut-No. +func (f *Bloom) PackHash() []byte { + hashSize := f.objectFormat.Size() + end := len(f.data) - hashSize + + return f.data[end-hashSize : end] +} + +// Verify recomputes the filter's trailing checksum and reports any mismatch. +// +// Verify reads the whole filter, +// so callers should treat it as a deliberate integrity check +// rather than part of the open path. +func (f *Bloom) Verify() error { + hashImpl, err := f.objectFormat.New() + if err != nil { + return fmt.Errorf("internal/format/packidx/bloom: %w", err) + } + + checksumOff := len(f.data) - f.objectFormat.Size() + + _, _ = hashImpl.Write(f.data[:checksumOff]) + + if !bytes.Equal(hashImpl.Sum(nil), f.data[checksumOff:]) { + return fmt.Errorf("%w: checksum mismatch", ErrMalformedBloomFilter) + } + + return nil +} diff --git a/internal/format/packidx/bloom/lookup.go b/internal/format/packidx/bloom/lookup.go index 37470a15..4a8d7bda 100644 --- a/internal/format/packidx/bloom/lookup.go +++ b/internal/format/packidx/bloom/lookup.go @@ -12,7 +12,7 @@ import ( // // Labels: Mut-No. func (f *Bloom) MayContain(oid []byte) bool { - if len(oid) != f.hashSize { + if len(oid) != f.objectFormat.Size() { panic("internal/format/packidx/bloom: invalid object ID length") } diff --git a/internal/format/packidx/bloom/write.go b/internal/format/packidx/bloom/write.go index a76897ac..431b7a7d 100644 --- a/internal/format/packidx/bloom/write.go +++ b/internal/format/packidx/bloom/write.go @@ -4,6 +4,7 @@ import ( "encoding/binary" "errors" "fmt" + "hash" "math/bits" "lindenii.org/go/furgit/object/id" @@ -29,37 +30,51 @@ const targetLoad = 48 // // Labels: MT-Unsafe. type Builder struct { - // data is the full filter file, header included. + // data is the full filter file, header and trailer included. data []byte - // buckets aliases the bucket region of data, after the header. + // buckets aliases the bucket region of data, between header and trailer. buckets []byte - log2B uint - k int - hashSize int + // hashImpl computes the trailing checksum and gives the hash size. + hashImpl hash.Hash + + log2B uint + k int } // NewBuilder creates a filter builder -// for bucketCount buckets and k probes per object ID. +// for bucketCount buckets and k probes per object ID, +// binding the filter to packHash. // // bucketCount must be a nonzero power of two, // k must be nonzero, // and log2(bucketCount) + 9*k must not exceed the hash length in bits. -func NewBuilder(objectFormat id.ObjectFormat, bucketCount uint32, k uint16) (*Builder, error) { +// packHash must be the pack's trailer hash; +// NewBuilder panics when its length does not match the object format. +func NewBuilder(objectFormat id.ObjectFormat, bucketCount uint32, k uint16, packHash []byte) (*Builder, error) { hashID, err := hashFunctionID(objectFormat) if err != nil { return nil, err } + hashImpl, err := objectFormat.New() + if err != nil { + return nil, fmt.Errorf("internal/format/packidx/bloom: %w", err) + } + hashSize := objectFormat.Size() + if len(packHash) != hashSize { + panic("internal/format/packidx/bloom: invalid pack hash length") + } + log2B, err := checkParams(bucketCount, k, hashSize) if err != nil { return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err) } - total, err := intconv.Uint64ToInt(uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount)) + total, err := intconv.Uint64ToInt(uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize)) if err != nil { return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err) } @@ -71,12 +86,15 @@ func NewBuilder(objectFormat id.ObjectFormat, bucketCount uint32, k uint16) (*Bu binary.BigEndian.PutUint32(data[12:], bucketCount) binary.BigEndian.PutUint16(data[16:], k) + bucketsEnd := total - 2*hashSize + copy(data[bucketsEnd:], packHash) + return &Builder{ data: data, - buckets: data[HeaderLen:], + buckets: data[HeaderLen:bucketsEnd], + hashImpl: hashImpl, log2B: log2B, k: int(k), - hashSize: hashSize, }, nil } @@ -85,7 +103,7 @@ func NewBuilder(objectFormat id.ObjectFormat, bucketCount uint32, k uint16) (*Bu // oid must be exactly the filter's hash size; // Add panics otherwise. func (b *Builder) Add(oid []byte) { - if len(oid) != b.hashSize { + if len(oid) != b.hashImpl.Size() { panic("internal/format/packidx/bloom: invalid object ID length") } @@ -100,10 +118,16 @@ func (b *Builder) Add(oid []byte) { } } -// Bytes returns the serialized filter. +// Bytes returns the serialized filter, including its trailing checksum. // // Labels: Life-Parent, Mut-No. func (b *Builder) Bytes() []byte { + checksumOff := len(b.data) - b.hashImpl.Size() + + b.hashImpl.Reset() + _, _ = b.hashImpl.Write(b.data[:checksumOff]) + b.hashImpl.Sum(b.data[checksumOff:checksumOff]) + return b.data } |
