package bloom
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"math/bits"
"lindenii.org/go/furgit/object/id"
)
// ErrMalformedBloomFilter reports that
// a Bloom filter is truncated,
// has a bad signature, version, or hash function,
// or has inconsistent parameters.
var ErrMalformedBloomFilter = errors.New("internal/format/packidx/bloom: malformed bloom filter")
const (
signature = 0x4944424c // "IDBL"
version = 1
// HeaderLen is the fixed header length in octets,
// i.e., the signature, version, hash function identifier,
// B, K, and the trailing zero padding.
HeaderLen = 64
// BucketLen is the length of one bucket in octets,
// chosen to match the most common cache-line size.
BucketLen = 64
// wordBits is the bit width of one bucket word.
wordBits = 64
// fieldBits is the width of one in-bucket position field.
fieldBits = 9
)
// checkParams validates the filter parameters
// against one object hash size,
// returning log2(bucketCount) on success.
func checkParams(bucketCount uint32, k uint16, hashSize int) (uint, error) {
switch {
case bucketCount == 0 || bucketCount&(bucketCount-1) != 0:
return 0, errors.New("bucket count not a nonzero power of two") //nolint:err113
case k == 0:
return 0, errors.New("zero probe count") //nolint:err113
}
log2B := uint(bits.TrailingZeros32(bucketCount))
if log2B+fieldBits*uint(k) > uint(hashSize)*8 {
return 0, errors.New("parameters exceed hash length") //nolint:err113
}
return log2B, nil
}
// hashFunctionID returns the on-disk hash function identifier
// for one object format.
func hashFunctionID(objectFormat id.ObjectFormat) (uint32, error) {
switch objectFormat {
case id.ObjectFormatSHA1:
return 1, nil
case id.ObjectFormatSHA256:
return 2, nil
case id.ObjectFormatUnknown:
}
return 0, id.ErrInvalidObjectFormat
}
// Bloom is a parsed blocked Bloom filter view over borrowed bytes.
//
// Labels: Deps-Borrowed, Life-Parent, MT-Safe.
type Bloom struct {
// data is the entire filter payload.
data []byte
// buckets is the bucket region, between the header and the trailer.
buckets []byte
// objectFormat is the filter's object format.
objectFormat id.ObjectFormat
// log2B is the base-2 logarithm of the bucket count,
// i.e. the number of leading object ID bits that select a bucket.
log2B uint
// k is the number of bits set and tested per object ID.
k int
}
// Parse parses one Bloom filter from data.
//
// Labels: Deps-Borrowed, Life-Parent.
func Parse(data []byte, objectFormat id.ObjectFormat) (Bloom, error) {
var zero Bloom
wantHashID, err := hashFunctionID(objectFormat)
if err != nil {
return zero, err
}
hashSize := objectFormat.Size()
if len(data) < HeaderLen {
return zero, fmt.Errorf("%w: truncated", ErrMalformedBloomFilter)
}
if binary.BigEndian.Uint32(data) != signature {
return zero, fmt.Errorf("%w: bad signature", ErrMalformedBloomFilter)
}
if binary.BigEndian.Uint32(data[4:]) != version {
return zero, fmt.Errorf("%w: unsupported version", ErrMalformedBloomFilter)
}
if binary.BigEndian.Uint32(data[8:]) != wantHashID {
return zero, fmt.Errorf("%w: hash function mismatch", ErrMalformedBloomFilter)
}
bucketCount := binary.BigEndian.Uint32(data[12:])
k := binary.BigEndian.Uint16(data[16:])
for _, octet := range data[18:HeaderLen] {
if octet != 0 {
return zero, fmt.Errorf("%w: nonzero padding", ErrMalformedBloomFilter)
}
}
log2B, err := checkParams(bucketCount, k, hashSize)
if err != nil {
return zero, fmt.Errorf("%w: %w", ErrMalformedBloomFilter, err)
}
want := uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize) //#nosec G115
if uint64(len(data)) != want {
return zero, fmt.Errorf("%w: file size disagrees with bucket count", ErrMalformedBloomFilter)
}
return Bloom{
data: data,
buckets: data[HeaderLen : len(data)-2*hashSize],
objectFormat: objectFormat,
log2B: log2B,
k: int(k),
}, nil
}
// PackHash returns the pack hash recorded in the filter trailer.
//
// Labels: Life-Parent, Mut-No.
func (f *Bloom) PackHash() []byte {
hashSize := f.objectFormat.Size()
end := len(f.data) - hashSize
return f.data[end-hashSize : end]
}
// Verify recomputes the filter's trailing checksum and reports any mismatch.
//
// Verify reads the whole filter,
// so callers should treat it as a deliberate integrity check
// rather than part of the open path.
func (f *Bloom) Verify() error {
hashImpl, err := f.objectFormat.New()
if err != nil {
return fmt.Errorf("internal/format/packidx/bloom: %w", err)
}
checksumOff := len(f.data) - f.objectFormat.Size()
_, _ = hashImpl.Write(f.data[:checksumOff])
if !bytes.Equal(hashImpl.Sum(nil), f.data[checksumOff:]) {
return fmt.Errorf("%w: checksum mismatch", ErrMalformedBloomFilter)
}
return nil
}