aboutsummaryrefslogtreecommitdiff
path: root/internal/format
diff options
context:
space:
mode:
authorGravatar Runxi Yu2026-06-14 14:11:46 +0000
committerGravatar Runxi Yu2026-06-14 14:11:46 +0000
commit56db52ce91661de515a8581b1b3d0d5500e2c8f7 (patch)
tree957aade58bc6e9b35f9b0faff5d7052e2a47f7cf /internal/format
parentobject/store/packed: Skip if bloom filter says impossible (diff)
internal/format/packidx/bloom: Add trailers
Diffstat (limited to 'internal/format')
-rw-r--r--internal/format/packidx/bloom/bloom.go55
-rw-r--r--internal/format/packidx/bloom/lookup.go2
-rw-r--r--internal/format/packidx/bloom/write.go48
3 files changed, 83 insertions, 22 deletions
diff --git a/internal/format/packidx/bloom/bloom.go b/internal/format/packidx/bloom/bloom.go
index 2605a8e1..71de2618 100644
--- a/internal/format/packidx/bloom/bloom.go
+++ b/internal/format/packidx/bloom/bloom.go
@@ -1,6 +1,7 @@
package bloom
import (
+ "bytes"
"encoding/binary"
"errors"
"fmt"
@@ -72,18 +73,21 @@ func hashFunctionID(objectFormat id.ObjectFormat) (uint32, error) {
//
// Labels: Deps-Borrowed, Life-Parent, MT-Safe.
type Bloom struct {
- // buckets is the bucket region, after the header.
+ // data is the entire filter payload.
+ data []byte
+
+ // buckets is the bucket region, between the header and the trailer.
buckets []byte
+ // objectFormat is the filter's object format.
+ objectFormat id.ObjectFormat
+
// log2B is the base-2 logarithm of the bucket count,
// i.e. the number of leading object ID bits that select a bucket.
log2B uint
// k is the number of bits set and tested per object ID.
k int
-
- // hashSize is the object ID size of the filter's object format.
- hashSize int
}
// Parse parses one Bloom filter from data.
@@ -129,15 +133,48 @@ func Parse(data []byte, objectFormat id.ObjectFormat) (Bloom, error) {
return zero, fmt.Errorf("%w: %w", ErrMalformedBloomFilter, err)
}
- want := uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount)
+ want := uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize)
if uint64(len(data)) != want {
return zero, fmt.Errorf("%w: file size disagrees with bucket count", ErrMalformedBloomFilter)
}
return Bloom{
- buckets: data[HeaderLen:],
- log2B: log2B,
- k: int(k),
- hashSize: hashSize,
+ data: data,
+ buckets: data[HeaderLen : len(data)-2*hashSize],
+ objectFormat: objectFormat,
+ log2B: log2B,
+ k: int(k),
}, nil
}
+
+// PackHash returns the pack hash recorded in the filter trailer.
+//
+// Labels: Life-Parent, Mut-No.
+func (f *Bloom) PackHash() []byte {
+ hashSize := f.objectFormat.Size()
+ end := len(f.data) - hashSize
+
+ return f.data[end-hashSize : end]
+}
+
+// Verify recomputes the filter's trailing checksum and reports any mismatch.
+//
+// Verify reads the whole filter,
+// so callers should treat it as a deliberate integrity check
+// rather than part of the open path.
+func (f *Bloom) Verify() error {
+ hashImpl, err := f.objectFormat.New()
+ if err != nil {
+ return fmt.Errorf("internal/format/packidx/bloom: %w", err)
+ }
+
+ checksumOff := len(f.data) - f.objectFormat.Size()
+
+ _, _ = hashImpl.Write(f.data[:checksumOff])
+
+ if !bytes.Equal(hashImpl.Sum(nil), f.data[checksumOff:]) {
+ return fmt.Errorf("%w: checksum mismatch", ErrMalformedBloomFilter)
+ }
+
+ return nil
+}
diff --git a/internal/format/packidx/bloom/lookup.go b/internal/format/packidx/bloom/lookup.go
index 37470a15..4a8d7bda 100644
--- a/internal/format/packidx/bloom/lookup.go
+++ b/internal/format/packidx/bloom/lookup.go
@@ -12,7 +12,7 @@ import (
//
// Labels: Mut-No.
func (f *Bloom) MayContain(oid []byte) bool {
- if len(oid) != f.hashSize {
+ if len(oid) != f.objectFormat.Size() {
panic("internal/format/packidx/bloom: invalid object ID length")
}
diff --git a/internal/format/packidx/bloom/write.go b/internal/format/packidx/bloom/write.go
index a76897ac..431b7a7d 100644
--- a/internal/format/packidx/bloom/write.go
+++ b/internal/format/packidx/bloom/write.go
@@ -4,6 +4,7 @@ import (
"encoding/binary"
"errors"
"fmt"
+ "hash"
"math/bits"
"lindenii.org/go/furgit/object/id"
@@ -29,37 +30,51 @@ const targetLoad = 48
//
// Labels: MT-Unsafe.
type Builder struct {
- // data is the full filter file, header included.
+ // data is the full filter file, header and trailer included.
data []byte
- // buckets aliases the bucket region of data, after the header.
+ // buckets aliases the bucket region of data, between header and trailer.
buckets []byte
- log2B uint
- k int
- hashSize int
+ // hashImpl computes the trailing checksum and gives the hash size.
+ hashImpl hash.Hash
+
+ log2B uint
+ k int
}
// NewBuilder creates a filter builder
-// for bucketCount buckets and k probes per object ID.
+// for bucketCount buckets and k probes per object ID,
+// binding the filter to packHash.
//
// bucketCount must be a nonzero power of two,
// k must be nonzero,
// and log2(bucketCount) + 9*k must not exceed the hash length in bits.
-func NewBuilder(objectFormat id.ObjectFormat, bucketCount uint32, k uint16) (*Builder, error) {
+// packHash must be the pack's trailer hash;
+// NewBuilder panics when its length does not match the object format.
+func NewBuilder(objectFormat id.ObjectFormat, bucketCount uint32, k uint16, packHash []byte) (*Builder, error) {
hashID, err := hashFunctionID(objectFormat)
if err != nil {
return nil, err
}
+ hashImpl, err := objectFormat.New()
+ if err != nil {
+ return nil, fmt.Errorf("internal/format/packidx/bloom: %w", err)
+ }
+
hashSize := objectFormat.Size()
+ if len(packHash) != hashSize {
+ panic("internal/format/packidx/bloom: invalid pack hash length")
+ }
+
log2B, err := checkParams(bucketCount, k, hashSize)
if err != nil {
return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err)
}
- total, err := intconv.Uint64ToInt(uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount))
+ total, err := intconv.Uint64ToInt(uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize))
if err != nil {
return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err)
}
@@ -71,12 +86,15 @@ func NewBuilder(objectFormat id.ObjectFormat, bucketCount uint32, k uint16) (*Bu
binary.BigEndian.PutUint32(data[12:], bucketCount)
binary.BigEndian.PutUint16(data[16:], k)
+ bucketsEnd := total - 2*hashSize
+ copy(data[bucketsEnd:], packHash)
+
return &Builder{
data: data,
- buckets: data[HeaderLen:],
+ buckets: data[HeaderLen:bucketsEnd],
+ hashImpl: hashImpl,
log2B: log2B,
k: int(k),
- hashSize: hashSize,
}, nil
}
@@ -85,7 +103,7 @@ func NewBuilder(objectFormat id.ObjectFormat, bucketCount uint32, k uint16) (*Bu
// oid must be exactly the filter's hash size;
// Add panics otherwise.
func (b *Builder) Add(oid []byte) {
- if len(oid) != b.hashSize {
+ if len(oid) != b.hashImpl.Size() {
panic("internal/format/packidx/bloom: invalid object ID length")
}
@@ -100,10 +118,16 @@ func (b *Builder) Add(oid []byte) {
}
}
-// Bytes returns the serialized filter.
+// Bytes returns the serialized filter, including its trailing checksum.
//
// Labels: Life-Parent, Mut-No.
func (b *Builder) Bytes() []byte {
+ checksumOff := len(b.data) - b.hashImpl.Size()
+
+ b.hashImpl.Reset()
+ _, _ = b.hashImpl.Write(b.data[:checksumOff])
+ b.hashImpl.Sum(b.data[checksumOff:checksumOff])
+
return b.data
}