aboutsummaryrefslogtreecommitdiff
package bloom

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"math/bits"

	"lindenii.org/go/furgit/object/id"
)

// ErrMalformedBloomFilter reports that
// a Bloom filter is truncated,
// has a bad signature, version, or hash function,
// or has inconsistent parameters.
var ErrMalformedBloomFilter = errors.New("internal/format/packidx/bloom: malformed bloom filter")

const (
	signature = 0x4944424c // "IDBL"
	version   = 1

	// HeaderLen is the fixed header length in octets,
	// i.e., the signature, version, hash function identifier,
	// B, K, and the trailing zero padding.
	HeaderLen = 64

	// BucketLen is the length of one bucket in octets,
	// chosen to match the most common cache-line size.
	BucketLen = 64

	// wordBits is the bit width of one bucket word.
	wordBits = 64

	// fieldBits is the width of one in-bucket position field.
	fieldBits = 9
)

// checkParams validates the filter parameters
// against one object hash size,
// returning log2(bucketCount) on success.
func checkParams(bucketCount uint32, k uint16, hashSize int) (uint, error) {
	switch {
	case bucketCount == 0 || bucketCount&(bucketCount-1) != 0:
		return 0, errors.New("bucket count not a nonzero power of two") //nolint:err113
	case k == 0:
		return 0, errors.New("zero probe count") //nolint:err113
	}

	log2B := uint(bits.TrailingZeros32(bucketCount))
	if log2B+fieldBits*uint(k) > uint(hashSize)*8 {
		return 0, errors.New("parameters exceed hash length") //nolint:err113
	}

	return log2B, nil
}

// hashFunctionID returns the on-disk hash function identifier
// for one object format.
func hashFunctionID(objectFormat id.ObjectFormat) (uint32, error) {
	switch objectFormat {
	case id.ObjectFormatSHA1:
		return 1, nil
	case id.ObjectFormatSHA256:
		return 2, nil
	case id.ObjectFormatUnknown:
	}

	return 0, id.ErrInvalidObjectFormat
}

// Bloom is a parsed blocked Bloom filter view over borrowed bytes.
//
// Labels: Deps-Borrowed, Life-Parent, MT-Safe.
type Bloom struct {
	// data is the entire filter payload.
	data []byte

	// buckets is the bucket region, between the header and the trailer.
	buckets []byte

	// objectFormat is the filter's object format.
	objectFormat id.ObjectFormat

	// log2B is the base-2 logarithm of the bucket count,
	// i.e. the number of leading object ID bits that select a bucket.
	log2B uint

	// k is the number of bits set and tested per object ID.
	k int
}

// Parse parses one Bloom filter from data.
//
// Labels: Deps-Borrowed, Life-Parent.
func Parse(data []byte, objectFormat id.ObjectFormat) (Bloom, error) {
	var zero Bloom

	wantHashID, err := hashFunctionID(objectFormat)
	if err != nil {
		return zero, err
	}

	hashSize := objectFormat.Size()

	if len(data) < HeaderLen {
		return zero, fmt.Errorf("%w: truncated", ErrMalformedBloomFilter)
	}

	if binary.BigEndian.Uint32(data) != signature {
		return zero, fmt.Errorf("%w: bad signature", ErrMalformedBloomFilter)
	}

	if binary.BigEndian.Uint32(data[4:]) != version {
		return zero, fmt.Errorf("%w: unsupported version", ErrMalformedBloomFilter)
	}

	if binary.BigEndian.Uint32(data[8:]) != wantHashID {
		return zero, fmt.Errorf("%w: hash function mismatch", ErrMalformedBloomFilter)
	}

	bucketCount := binary.BigEndian.Uint32(data[12:])
	k := binary.BigEndian.Uint16(data[16:])

	for _, octet := range data[18:HeaderLen] {
		if octet != 0 {
			return zero, fmt.Errorf("%w: nonzero padding", ErrMalformedBloomFilter)
		}
	}

	log2B, err := checkParams(bucketCount, k, hashSize)
	if err != nil {
		return zero, fmt.Errorf("%w: %w", ErrMalformedBloomFilter, err)
	}

	want := uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize) //#nosec G115
	if uint64(len(data)) != want {
		return zero, fmt.Errorf("%w: file size disagrees with bucket count", ErrMalformedBloomFilter)
	}

	return Bloom{
		data:         data,
		buckets:      data[HeaderLen : len(data)-2*hashSize],
		objectFormat: objectFormat,
		log2B:        log2B,
		k:            int(k),
	}, nil
}

// PackHash returns the pack hash recorded in the filter trailer.
//
// Labels: Life-Parent, Mut-No.
func (f *Bloom) PackHash() []byte {
	hashSize := f.objectFormat.Size()
	end := len(f.data) - hashSize

	return f.data[end-hashSize : end]
}

// Verify recomputes the filter's trailing checksum and reports any mismatch.
//
// Verify reads the whole filter,
// so callers should treat it as a deliberate integrity check
// rather than part of the open path.
func (f *Bloom) Verify() error {
	hashImpl, err := f.objectFormat.New()
	if err != nil {
		return fmt.Errorf("internal/format/packidx/bloom: %w", err)
	}

	checksumOff := len(f.data) - f.objectFormat.Size()

	_, _ = hashImpl.Write(f.data[:checksumOff])

	if !bytes.Equal(hashImpl.Sum(nil), f.data[checksumOff:]) {
		return fmt.Errorf("%w: checksum mismatch", ErrMalformedBloomFilter)
	}

	return nil
}