aboutsummaryrefslogtreecommitdiff
path: root/internal/format
diff options
context:
space:
mode:
authorGravatar Runxi Yu2026-06-14 13:21:43 +0000
committerGravatar Runxi Yu2026-06-14 13:21:43 +0000
commiteb643bd6cc46db8cf00f68b2ddf4a5d6afd4d252 (patch)
tree037dbd23281da795932bac0547f0d67347db9fd0 /internal/format
parentresearch: Add back here (diff)
internal/format/packidx/bloom: Add
Diffstat (limited to 'internal/format')
-rw-r--r--internal/format/packidx/bloom/bloom.go143
-rw-r--r--internal/format/packidx/bloom/bloom_test.go128
-rw-r--r--internal/format/packidx/bloom/doc.go122
-rw-r--r--internal/format/packidx/bloom/lookup.go42
-rw-r--r--internal/format/packidx/bloom/lookup_test.go32
-rw-r--r--internal/format/packidx/bloom/roundtrip_test.go80
-rw-r--r--internal/format/packidx/bloom/write.go151
-rw-r--r--internal/format/packidx/bloom/write_test.go108
8 files changed, 806 insertions, 0 deletions
diff --git a/internal/format/packidx/bloom/bloom.go b/internal/format/packidx/bloom/bloom.go
new file mode 100644
index 00000000..2605a8e1
--- /dev/null
+++ b/internal/format/packidx/bloom/bloom.go
@@ -0,0 +1,143 @@
+package bloom
+
+import (
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "math/bits"
+
+ "lindenii.org/go/furgit/object/id"
+)
+
+// ErrMalformedBloomFilter reports that
+// a Bloom filter is truncated,
+// has a bad signature, version, or hash function,
+// or has inconsistent parameters.
+var ErrMalformedBloomFilter = errors.New("internal/format/packidx/bloom: malformed bloom filter")
+
+const (
+ signature = 0x4944424c // "IDBL"
+ version = 1
+
+ // HeaderLen is the fixed header length in octets,
+ // i.e., the signature, version, hash function identifier,
+ // B, K, and the trailing zero padding.
+ HeaderLen = 64
+
+ // BucketLen is the length of one bucket in octets,
+ // chosen to match the most common cache-line size.
+ BucketLen = 64
+
+ // wordBits is the bit width of one bucket word.
+ wordBits = 64
+
+ // fieldBits is the width of one in-bucket position field.
+ fieldBits = 9
+)
+
+// checkParams validates the filter parameters
+// against one object hash size,
+// returning log2(bucketCount) on success.
+func checkParams(bucketCount uint32, k uint16, hashSize int) (uint, error) {
+ switch {
+ case bucketCount == 0 || bucketCount&(bucketCount-1) != 0:
+ return 0, errors.New("bucket count not a nonzero power of two") //nolint:err113
+ case k == 0:
+ return 0, errors.New("zero probe count") //nolint:err113
+ }
+
+ log2B := uint(bits.TrailingZeros32(bucketCount))
+ if log2B+fieldBits*uint(k) > uint(hashSize)*8 {
+ return 0, errors.New("parameters exceed hash length") //nolint:err113
+ }
+
+ return log2B, nil
+}
+
+// hashFunctionID returns the on-disk hash function identifier
+// for one object format.
+func hashFunctionID(objectFormat id.ObjectFormat) (uint32, error) {
+ switch objectFormat {
+ case id.ObjectFormatSHA1:
+ return 1, nil
+ case id.ObjectFormatSHA256:
+ return 2, nil
+ case id.ObjectFormatUnknown:
+ }
+
+ return 0, id.ErrInvalidObjectFormat
+}
+
+// Bloom is a parsed blocked Bloom filter view over borrowed bytes.
+//
+// Labels: Deps-Borrowed, Life-Parent, MT-Safe.
+type Bloom struct {
+ // buckets is the bucket region, after the header.
+ buckets []byte
+
+ // log2B is the base-2 logarithm of the bucket count,
+ // i.e. the number of leading object ID bits that select a bucket.
+ log2B uint
+
+ // k is the number of bits set and tested per object ID.
+ k int
+
+ // hashSize is the object ID size of the filter's object format.
+ hashSize int
+}
+
+// Parse parses one Bloom filter from data.
+//
+// Labels: Deps-Borrowed, Life-Parent.
+func Parse(data []byte, objectFormat id.ObjectFormat) (Bloom, error) {
+ var zero Bloom
+
+ wantHashID, err := hashFunctionID(objectFormat)
+ if err != nil {
+ return zero, err
+ }
+
+ hashSize := objectFormat.Size()
+
+ if len(data) < HeaderLen {
+ return zero, fmt.Errorf("%w: truncated", ErrMalformedBloomFilter)
+ }
+
+ if binary.BigEndian.Uint32(data) != signature {
+ return zero, fmt.Errorf("%w: bad signature", ErrMalformedBloomFilter)
+ }
+
+ if binary.BigEndian.Uint32(data[4:]) != version {
+ return zero, fmt.Errorf("%w: unsupported version", ErrMalformedBloomFilter)
+ }
+
+ if binary.BigEndian.Uint32(data[8:]) != wantHashID {
+ return zero, fmt.Errorf("%w: hash function mismatch", ErrMalformedBloomFilter)
+ }
+
+ bucketCount := binary.BigEndian.Uint32(data[12:])
+ k := binary.BigEndian.Uint16(data[16:])
+
+ for _, octet := range data[18:HeaderLen] {
+ if octet != 0 {
+ return zero, fmt.Errorf("%w: nonzero padding", ErrMalformedBloomFilter)
+ }
+ }
+
+ log2B, err := checkParams(bucketCount, k, hashSize)
+ if err != nil {
+ return zero, fmt.Errorf("%w: %w", ErrMalformedBloomFilter, err)
+ }
+
+ want := uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount)
+ if uint64(len(data)) != want {
+ return zero, fmt.Errorf("%w: file size disagrees with bucket count", ErrMalformedBloomFilter)
+ }
+
+ return Bloom{
+ buckets: data[HeaderLen:],
+ log2B: log2B,
+ k: int(k),
+ hashSize: hashSize,
+ }, nil
+}
diff --git a/internal/format/packidx/bloom/bloom_test.go b/internal/format/packidx/bloom/bloom_test.go
new file mode 100644
index 00000000..16295afd
--- /dev/null
+++ b/internal/format/packidx/bloom/bloom_test.go
@@ -0,0 +1,128 @@
+package bloom_test
+
+import (
+ "encoding/binary"
+ "errors"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func validFilter(t *testing.T, format id.ObjectFormat) []byte {
+ // TODO: maybe testgit should have something like this?
+ t.Helper()
+
+ builder, err := bloom.NewBuilder(format, 4, 2)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ return builder.Bytes()
+}
+
+func otherFormat(t *testing.T, format id.ObjectFormat) id.ObjectFormat {
+ t.Helper()
+
+ for _, candidate := range id.SupportedObjectFormats() {
+ if candidate != format {
+ return candidate
+ }
+ }
+
+ t.Skip("only one supported object format")
+
+ return id.ObjectFormatUnknown
+}
+
+func TestParseValid(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ _, err := bloom.Parse(validFilter(t, format), format)
+ if err != nil {
+ t.Fatalf("Parse rejected a valid filter: %v", err)
+ }
+ })
+ }
+}
+
+func TestParseMalformed(t *testing.T) {
+ t.Parallel()
+
+ cases := []struct {
+ name string
+ mangle func(data []byte) []byte
+ }{
+ {"truncated", func(data []byte) []byte { return data[:bloom.HeaderLen-1] }},
+ {"bad signature", func(data []byte) []byte {
+ data[0] ^= 0xff
+
+ return data
+ }},
+ {"bad version", func(data []byte) []byte {
+ binary.BigEndian.PutUint32(data[4:], 99)
+
+ return data
+ }},
+ {"non power of two", func(data []byte) []byte {
+ binary.BigEndian.PutUint32(data[12:], 3)
+
+ return data
+ }},
+ {"zero probe count", func(data []byte) []byte {
+ binary.BigEndian.PutUint16(data[16:], 0)
+
+ return data
+ }},
+ {"parameters exceed hash", func(data []byte) []byte {
+ binary.BigEndian.PutUint32(data[12:], 1<<31)
+ binary.BigEndian.PutUint16(data[16:], 30)
+
+ return data
+ }},
+ {"nonzero padding", func(data []byte) []byte {
+ data[20] = 1
+
+ return data
+ }},
+ {"size disagrees", func(data []byte) []byte { return data[:len(data)-1] }},
+ }
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Parallel()
+
+ data := tc.mangle(append([]byte(nil), validFilter(t, format)...))
+
+ _, err := bloom.Parse(data, format)
+ if !errors.Is(err, bloom.ErrMalformedBloomFilter) {
+ t.Fatalf("Parse error = %v, want ErrMalformedBloomFilter", err)
+ }
+ })
+ }
+ })
+ }
+}
+
+func TestParseHashMismatch(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ _, err := bloom.Parse(validFilter(t, format), otherFormat(t, format))
+ if !errors.Is(err, bloom.ErrMalformedBloomFilter) {
+ t.Fatalf("Parse error = %v, want ErrMalformedBloomFilter", err)
+ }
+ })
+ }
+}
diff --git a/internal/format/packidx/bloom/doc.go b/internal/format/packidx/bloom/doc.go
new file mode 100644
index 00000000..df7ded6c
--- /dev/null
+++ b/internal/format/packidx/bloom/doc.go
@@ -0,0 +1,122 @@
+// Package bloom provides a blocked Bloom filter
+// for pack indexes.
+//
+// A filter answers, from a single cache-line-sized read,
+// whether an object ID is definitely absent from the index it covers.
+// A lookup that must consult many packs
+// can then skip the full binary search
+// in every pack whose filter rejects the object,
+// decreasing the cost of misses.
+//
+// # Rationale
+//
+// Especially for server-side usage,
+// repacking is expensive,
+// and creating multi-pack-indexes is still rather expensive.
+// Incremental multi-pack-indexes partially solve this,
+// but having too many of them defeats the purpose,
+// since the indexes must still be walked in order
+// while performing expensive lookups.
+//
+// Instead, each multi-pack-index layer,
+// and each ordinary pack index,
+// may carry its own filter.
+// The indexes are still traversed in their usual order,
+// but the first step when traversing one
+// is to check whether it could possibly hold the wanted object.
+//
+// The filter is split into 64-octet buckets,
+// matching the most common cache-line size.
+// Some bits of the object ID choose the bucket,
+// and the rest choose several bit positions inside it,
+// so a lookup reads one 64-octet bucket
+// and checks whether all required bits are set.
+//
+// # Parameters
+//
+// A filter is parameterized by
+// the number of buckets B
+// and the number of bits set and tested per object ID, K.
+// All integers in the format are big endian.
+// The object ID is interpreted as a big-endian bitstring,
+// where bit offset 0 is the most significant bit of octet 0.
+// B must be a nonzero power of two,
+// K must be nonzero,
+// and log2(B) + 9*K must not exceed the hash length in bits.
+//
+// # File format
+//
+// A filter file is a 64-octet header
+// followed by B buckets of 64 octets each:
+//
+// - 4-octet signature: {'I', 'D', 'B', 'L'}.
+// - 4-octet version identifier (= 1).
+// - 4-octet object hash algorithm identifier
+// (= 1 for SHA-1, 2 for SHA-256).
+// - 4-octet B, the number of buckets.
+// - 2-octet K, the number of bits set and tested per object ID.
+// - 46-octet padding, which must be all zero.
+// - B buckets of 64 octets each.
+//
+// A reader must validate that
+// the signature matches,
+// the version is supported,
+// the hash function identifier is recognized,
+// B is nonzero and a power of two,
+// K is nonzero,
+// log2(B) + 9*K does not exceed the hash length in bits,
+// the padding is all zero,
+// and the file size is exactly 64 + 64*B octets.
+//
+// # Lookup
+//
+// A lookup against one filter proceeds as follows:
+//
+// 1. Let b be the unsigned integer encoded
+// by the most significant log2(B) bits of the object ID.
+// B is a power of two, so 0 <= b < B.
+// 2. Select and read bucket b.
+// 3. For each 0 <= i < K,
+// take the i-th 9-bit field
+// from the 9*K bits that follow the bucket-selecting bits,
+// and let pi be the unsigned integer it encodes,
+// so 0 <= pi < 512.
+// Compute wi = pi >> 6 and bi = pi & 63,
+// so wi identifies one of the eight 64-bit words in bucket b
+// and bi identifies one bit within that word.
+// Within each 64-bit word,
+// bit index 0 is the most significant bit
+// and bit index 63 is the least significant bit.
+// Test whether bit bi is set in word wi of bucket b.
+//
+// If any test fails,
+// the object ID is definitely not in the covered index.
+// If all tests succeed,
+// the object ID may be in it.
+// Two of the K 9-bit fields can decode to the same pi,
+// so an insertion may set fewer than K distinct bits;
+// this only raises the false positive rate
+// and never causes a false negative.
+//
+// # Worked example
+//
+// Let B = 1 << 15 = 32768 and K = 8.
+// Then log2(B) = 15,
+// so each lookup uses 15 bits to choose the bucket
+// and 8*9 = 72 bits to choose the in-bucket positions,
+// for a total of 87 bits taken from the object ID.
+// A SHA-1 has 160 bits and a SHA-256 has 256 bits,
+// so both leave ample headroom.
+//
+// # Security considerations
+//
+// Object IDs are public unkeyed hashes,
+// so an adversary can mine packs
+// whose object IDs share a chosen prefix
+// to crowd objects into one bucket
+// and fill its bits.
+// In the worst case this renders some buckets useless,
+// making the filter degrade to "may contain" for those buckets,
+// but it never produces a false negative
+// and is not a significant denial-of-service vector.
+package bloom
diff --git a/internal/format/packidx/bloom/lookup.go b/internal/format/packidx/bloom/lookup.go
new file mode 100644
index 00000000..37470a15
--- /dev/null
+++ b/internal/format/packidx/bloom/lookup.go
@@ -0,0 +1,42 @@
+package bloom
+
+import (
+ "encoding/binary"
+)
+
+// MayContain reports whether oid may be present
+// in the index covered by the filter.
+//
+// oid must be exactly the filter's hash size;
+// MayContain panics otherwise.
+//
+// Labels: Mut-No.
+func (f *Bloom) MayContain(oid []byte) bool {
+ if len(oid) != f.hashSize {
+ panic("internal/format/packidx/bloom: invalid object ID length")
+ }
+
+ base := int(binary.BigEndian.Uint32(oid[:4])>>(32-f.log2B)) * BucketLen
+
+ for i := range f.k {
+ word, mask := probe(oid, f.log2B, i)
+ if binary.BigEndian.Uint64(f.buckets[base+word*8:])&mask == 0 {
+ return false
+ }
+ }
+
+ return true
+}
+
+// probe returns the bucket word index and single-bit mask
+// addressed by the i-th probe of oid.
+func probe(oid []byte, log2B uint, i int) (word int, mask uint64) {
+ bitOff := log2B + fieldBits*uint(i)
+ byteOff := bitOff >> 3
+ bitInByte := bitOff & 7
+
+ window := uint32(oid[byteOff])<<8 | uint32(oid[byteOff+1])
+ pi := (window >> (16 - bitInByte - fieldBits)) & 0x1ff
+
+ return int(pi >> 6), 1 << (wordBits - 1 - (pi & 63))
+}
diff --git a/internal/format/packidx/bloom/lookup_test.go b/internal/format/packidx/bloom/lookup_test.go
new file mode 100644
index 00000000..adfbcce5
--- /dev/null
+++ b/internal/format/packidx/bloom/lookup_test.go
@@ -0,0 +1,32 @@
+package bloom_test
+
+import (
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func TestMayContainBadLength(t *testing.T) {
+ t.Parallel()
+
+ format := id.ObjectFormatSHA256
+
+ builder, err := bloom.NewBuilder(format, 4, 2)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ filter, err := bloom.Parse(builder.Bytes(), format)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ defer func() {
+ if recover() == nil {
+ t.Fatal("MayContain did not panic on a short object ID")
+ }
+ }()
+
+ filter.MayContain(make([]byte, format.Size()-1))
+}
diff --git a/internal/format/packidx/bloom/roundtrip_test.go b/internal/format/packidx/bloom/roundtrip_test.go
new file mode 100644
index 00000000..990c83e0
--- /dev/null
+++ b/internal/format/packidx/bloom/roundtrip_test.go
@@ -0,0 +1,80 @@
+package bloom_test
+
+import (
+ "encoding/binary"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func makeOID(size int, seed uint64) []byte {
+ out := make([]byte, size)
+ state := seed
+
+ for i := 0; i < size; i += 8 {
+ state = state*6364136223846793005 + 1442695040888963407
+
+ var word [8]byte
+
+ binary.BigEndian.PutUint64(word[:], state)
+ copy(out[i:], word[:])
+ }
+
+ return out
+}
+
+func TestRoundTrip(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ const objects = 10000
+
+ bucketCount, k, err := bloom.RecommendParams(format, objects)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ builder, err := bloom.NewBuilder(format, bucketCount, k)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ size := format.Size()
+ for i := range objects {
+ builder.Add(makeOID(size, uint64(i)))
+ }
+
+ filter, err := bloom.Parse(builder.Bytes(), format)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ for i := range objects {
+ if !filter.MayContain(makeOID(size, uint64(i))) {
+ t.Fatalf("false negative for added object %d", i)
+ }
+ }
+
+ const probes = 10000
+
+ falsePositives := 0
+
+ for i := range probes {
+ if filter.MayContain(makeOID(size, uint64(1)<<40+uint64(i))) {
+ falsePositives++
+ }
+ }
+
+ rate := float64(falsePositives) / float64(probes)
+ if rate > 0.05 {
+ t.Errorf("false positive rate %.4f exceeds 0.05", rate)
+ }
+
+ t.Logf("B=%d K=%d false positive rate %.4f", bucketCount, k, rate)
+ })
+ }
+}
diff --git a/internal/format/packidx/bloom/write.go b/internal/format/packidx/bloom/write.go
new file mode 100644
index 00000000..a9efbaae
--- /dev/null
+++ b/internal/format/packidx/bloom/write.go
@@ -0,0 +1,151 @@
+package bloom
+
+import (
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "io"
+ "math/bits"
+
+ "lindenii.org/go/furgit/object/id"
+ "lindenii.org/go/lgo/intconv"
+)
+
+// ErrInvalidParameters reports that
+// the parameters supplied for a filter build
+// are not representable in the format.
+var ErrInvalidParameters = errors.New("internal/format/packidx/bloom: invalid parameters")
+
+// defaultK is the probe count used by [RecommendParams].
+//
+// With 512-bit buckets it keeps the false positive rate near one percent
+// at the target bucket load.
+const defaultK = 8
+
+// targetLoad is the object count per bucket that [RecommendParams] aims for.
+const targetLoad = 48
+
+// Builder accumulates object IDs into an in-memory Bloom filter
+// and serializes it.
+//
+// Labels: MT-Unsafe.
+type Builder struct {
+ // data is the full filter file, header included.
+ data []byte
+
+ // buckets aliases the bucket region of data, after the header.
+ buckets []byte
+
+ log2B uint
+ k int
+ hashSize int
+}
+
+// NewBuilder creates a filter builder
+// for bucketCount buckets and k probes per object ID.
+//
+// bucketCount must be a nonzero power of two,
+// k must be nonzero,
+// and log2(bucketCount) + 9*k must not exceed the hash length in bits.
+func NewBuilder(objectFormat id.ObjectFormat, bucketCount uint32, k uint16) (*Builder, error) {
+ hashID, err := hashFunctionID(objectFormat)
+ if err != nil {
+ return nil, err
+ }
+
+ hashSize := objectFormat.Size()
+
+ log2B, err := checkParams(bucketCount, k, hashSize)
+ if err != nil {
+ return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err)
+ }
+
+ total, err := intconv.Uint64ToInt(uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount))
+ if err != nil {
+ return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err)
+ }
+
+ data := make([]byte, total)
+ binary.BigEndian.PutUint32(data[0:], signature)
+ binary.BigEndian.PutUint32(data[4:], version)
+ binary.BigEndian.PutUint32(data[8:], hashID)
+ binary.BigEndian.PutUint32(data[12:], bucketCount)
+ binary.BigEndian.PutUint16(data[16:], k)
+
+ return &Builder{
+ data: data,
+ buckets: data[HeaderLen:],
+ log2B: log2B,
+ k: int(k),
+ hashSize: hashSize,
+ }, nil
+}
+
+// Add records oid in the filter.
+//
+// oid must be exactly the filter's hash size;
+// Add panics otherwise.
+func (b *Builder) Add(oid []byte) {
+ if len(oid) != b.hashSize {
+ panic("internal/format/packidx/bloom: invalid object ID length")
+ }
+
+ base := int(binary.BigEndian.Uint32(oid[:4])>>(32-b.log2B)) * BucketLen
+
+ for i := range b.k {
+ word, mask := probe(oid, b.log2B, i)
+
+ off := base + word*8
+ set := binary.BigEndian.Uint64(b.buckets[off:]) | mask
+ binary.BigEndian.PutUint64(b.buckets[off:], set)
+ }
+}
+
+// Bytes returns the serialized filter.
+//
+// Labels: Life-Parent, Mut-No.
+func (b *Builder) Bytes() []byte {
+ return b.data
+}
+
+// WriteTo writes the serialized filter to w.
+func (b *Builder) WriteTo(w io.Writer) (int64, error) {
+ n, err := w.Write(b.data)
+ if err != nil {
+ return int64(n), fmt.Errorf("internal/format/packidx/bloom: %w", err)
+ }
+
+ return int64(n), nil
+}
+
+// RecommendParams returns filter parameters for an index of n objects,
+// targeting a false positive rate near one percent.
+func RecommendParams(objectFormat id.ObjectFormat, n int) (bucketCount uint32, k uint16, err error) {
+ hashSize := objectFormat.Size()
+ if hashSize == 0 {
+ return 0, 0, id.ErrInvalidObjectFormat
+ }
+
+ const maxPow2 = uint32(1) << 31
+
+ wanted := uint64(0)
+ if n > 0 {
+ wanted = (uint64(n) + targetLoad - 1) / targetLoad
+ }
+
+ switch {
+ case wanted <= 1:
+ bucketCount = 1
+ case wanted > uint64(maxPow2):
+ bucketCount = maxPow2
+ default:
+ bucketCount = uint32(1) << bits.Len64(wanted-1)
+ }
+
+ _, err = checkParams(bucketCount, defaultK, hashSize)
+ if err != nil {
+ return 0, 0, fmt.Errorf("%w: %w", ErrInvalidParameters, err)
+ }
+
+ return bucketCount, defaultK, nil
+}
diff --git a/internal/format/packidx/bloom/write_test.go b/internal/format/packidx/bloom/write_test.go
new file mode 100644
index 00000000..18532a33
--- /dev/null
+++ b/internal/format/packidx/bloom/write_test.go
@@ -0,0 +1,108 @@
+package bloom_test
+
+import (
+ "bytes"
+ "errors"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func TestRecommendParams(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ for _, n := range []int{0, 1, 1000, 10000, 1000000} {
+ bucketCount, k, err := bloom.RecommendParams(format, n)
+ if err != nil {
+ t.Fatalf("n=%d: %v", n, err)
+ }
+
+ if bucketCount == 0 || bucketCount&(bucketCount-1) != 0 {
+ t.Errorf("n=%d: bucket count %d not a power of two", n, bucketCount)
+ }
+
+ _, err = bloom.NewBuilder(format, bucketCount, k)
+ if err != nil {
+ t.Errorf("n=%d: recommended parameters rejected: %v", n, err)
+ }
+ }
+ })
+ }
+}
+
+func TestNewBuilderRejects(t *testing.T) {
+ t.Parallel()
+
+ cases := []struct {
+ name string
+ bucketCount uint32
+ k uint16
+ }{
+ {"zero buckets", 0, 8},
+ {"non power of two", 3, 8},
+ {"zero probe count", 4, 0},
+ }
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Parallel()
+
+ _, err := bloom.NewBuilder(format, tc.bucketCount, tc.k)
+ if !errors.Is(err, bloom.ErrInvalidParameters) {
+ t.Fatalf("error = %v, want ErrInvalidParameters", err)
+ }
+ })
+ }
+ })
+ }
+}
+
+func TestAddBadLength(t *testing.T) {
+ t.Parallel()
+
+ builder, err := bloom.NewBuilder(id.ObjectFormatSHA256, 4, 2)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ defer func() {
+ if recover() == nil {
+ t.Fatal("Add did not panic on a short object ID")
+ }
+ }()
+
+ builder.Add(make([]byte, id.ObjectFormatSHA256.Size()-1))
+}
+
+func TestWriteTo(t *testing.T) {
+ t.Parallel()
+
+ builder, err := bloom.NewBuilder(id.ObjectFormatSHA256, 4, 2)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ var buf bytes.Buffer
+
+ n, err := builder.WriteTo(&buf)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if int(n) != len(builder.Bytes()) {
+ t.Fatalf("WriteTo wrote %d bytes, want %d", n, len(builder.Bytes()))
+ }
+
+ if !bytes.Equal(buf.Bytes(), builder.Bytes()) {
+ t.Fatal("WriteTo output differs from Bytes")
+ }
+}