diff options
| author | 2026-01-30 15:08:46 +0100 | |
|---|---|---|
| committer | 2026-01-30 15:08:46 +0100 | |
| commit | b5ccb5e2f456a7f34d39990e60662665f6ea2af7 (patch) | |
| tree | 53a38d1439e050720be59f7bbd8042f71890efe8 /internal/bloom | |
| parent | difflines: Rename from diffbytes (diff) | |
| signature | No signature | |
bloom: Add bloom filter implementation for commit-graph
Diffstat (limited to 'internal/bloom')
| -rw-r--r-- | internal/bloom/bloom.go | 236 | ||||
| -rw-r--r-- | internal/bloom/errors.go | 5 |
2 files changed, 241 insertions, 0 deletions
diff --git a/internal/bloom/bloom.go b/internal/bloom/bloom.go new file mode 100644 index 00000000..c1a116d6 --- /dev/null +++ b/internal/bloom/bloom.go @@ -0,0 +1,236 @@ +// Package bloom provides a bloom filter implementation used for changed-path +// filters in Git commit graphs. +package bloom + +import "encoding/binary" + +const ( + // DataHeaderSize is the size of the BDAT header in commit-graph files. + DataHeaderSize = 3 * 4 + // DefaultMaxChange matches Git's default max-changed-paths behavior. + DefaultMaxChange = 512 +) + +// Settings describe the changed-paths Bloom filter parameters stored in +// commit-graph BDAT chunks. +// +// Obviously, they must match the repository's commit-graph settings to +// interpret filters correctly. +type Settings struct { + HashVersion uint32 + NumHashes uint32 + BitsPerEntry uint32 + MaxChangePaths uint32 +} + +// Filter represents a changed-paths Bloom filter associated with a commit. +// +// The filter encodes which paths changed between a commit and its first +// parent. Paths are expected to be in Git's slash-separated form and +// are queried using a path and its prefixes (e.g. "a/b/c", "a/b", "a"). +type Filter struct { + Data []byte + Version uint32 +} + +// ParseSettings reads Bloom filter settings from a BDAT chunk header. +func ParseSettings(bdat []byte) (*Settings, error) { + if len(bdat) < DataHeaderSize { + return nil, ErrInvalid + } + settings := &Settings{ + HashVersion: binary.BigEndian.Uint32(bdat[0:4]), + NumHashes: binary.BigEndian.Uint32(bdat[4:8]), + BitsPerEntry: binary.BigEndian.Uint32(bdat[8:12]), + MaxChangePaths: DefaultMaxChange, + } + return settings, nil +} + +// MightContain reports whether the Bloom filter may contain the given path. +// +// Evaluated against the full path and each of its directory prefixes. A true +// result indicates a possible match; false means the path definitely did not +// change. +func (f *Filter) MightContain(path []byte, settings *Settings) bool { + if f == nil || settings == nil { + return false + } + if len(f.Data) == 0 { + return false + } + keys := keyvec(path, settings) + for i := range keys { + if filterContainsKey(f, &keys[i], settings) { + return true + } + } + return false +} + +type key struct { + hashes []uint32 +} + +func keyvec(path []byte, settings *Settings) []key { + if len(path) == 0 { + return nil + } + count := 1 + for _, b := range path { + if b == '/' { + count++ + } + } + keys := make([]key, 0, count) + keys = append(keys, keyFill(path, settings)) + for i := len(path) - 1; i >= 0; i-- { + if path[i] == '/' { + keys = append(keys, keyFill(path[:i], settings)) + } + } + return keys +} + +func keyFill(path []byte, settings *Settings) key { + const seed0 = 0x293ae76f + const seed1 = 0x7e646e2c + var h0, h1 uint32 + if settings.HashVersion == 2 { + h0 = murmur3SeededV2(seed0, path) + h1 = murmur3SeededV2(seed1, path) + } else { + h0 = murmur3SeededV1(seed0, path) + h1 = murmur3SeededV1(seed1, path) + } + hashes := make([]uint32, settings.NumHashes) + for i := uint32(0); i < settings.NumHashes; i++ { + hashes[i] = h0 + i*h1 + } + return key{hashes: hashes} +} + +func filterContainsKey(filter *Filter, key *key, settings *Settings) bool { + if filter == nil || key == nil || settings == nil { + return false + } + if len(filter.Data) == 0 { + return false + } + mod := uint64(len(filter.Data)) * 8 + for _, h := range key.hashes { + idx := uint64(h) % mod + bytePos := idx / 8 + bit := byte(1 << (idx & 7)) + if filter.Data[bytePos]&bit == 0 { + return false + } + } + return true +} + +func murmur3SeededV2(seed uint32, data []byte) uint32 { + const ( + c1 = 0xcc9e2d51 + c2 = 0x1b873593 + r1 = 15 + r2 = 13 + m = 5 + n = 0xe6546b64 + ) + h := seed + nblocks := len(data) / 4 + for i := 0; i < nblocks; i++ { + k := uint32(data[4*i]) | + (uint32(data[4*i+1]) << 8) | + (uint32(data[4*i+2]) << 16) | + (uint32(data[4*i+3]) << 24) + k *= c1 + k = (k << r1) | (k >> (32 - r1)) + k *= c2 + + h ^= k + h = (h << r2) | (h >> (32 - r2)) + h = h*m + n + } + + var k1 uint32 + tail := data[nblocks*4:] + switch len(tail) & 3 { + case 3: + k1 ^= uint32(tail[2]) << 16 + fallthrough + case 2: + k1 ^= uint32(tail[1]) << 8 + fallthrough + case 1: + k1 ^= uint32(tail[0]) + k1 *= c1 + k1 = (k1 << r1) | (k1 >> (32 - r1)) + k1 *= c2 + h ^= k1 + } + + h ^= uint32(len(data)) + h ^= h >> 16 + h *= 0x85ebca6b + h ^= h >> 13 + h *= 0xc2b2ae35 + h ^= h >> 16 + return h +} + +func murmur3SeededV1(seed uint32, data []byte) uint32 { + const ( + c1 = 0xcc9e2d51 + c2 = 0x1b873593 + r1 = 15 + r2 = 13 + m = 5 + n = 0xe6546b64 + ) + h := seed + nblocks := len(data) / 4 + for i := 0; i < nblocks; i++ { + b0 := int8(data[4*i]) + b1 := int8(data[4*i+1]) + b2 := int8(data[4*i+2]) + b3 := int8(data[4*i+3]) + k := uint32(b0) | + (uint32(b1) << 8) | + (uint32(b2) << 16) | + (uint32(b3) << 24) + k *= c1 + k = (k << r1) | (k >> (32 - r1)) + k *= c2 + + h ^= k + h = (h << r2) | (h >> (32 - r2)) + h = h*m + n + } + + var k1 uint32 + tail := data[nblocks*4:] + switch len(tail) & 3 { + case 3: + k1 ^= uint32(int8(tail[2])) << 16 + fallthrough + case 2: + k1 ^= uint32(int8(tail[1])) << 8 + fallthrough + case 1: + k1 ^= uint32(int8(tail[0])) + k1 *= c1 + k1 = (k1 << r1) | (k1 >> (32 - r1)) + k1 *= c2 + h ^= k1 + } + + h ^= uint32(len(data)) + h ^= h >> 16 + h *= 0x85ebca6b + h ^= h >> 13 + h *= 0xc2b2ae35 + h ^= h >> 16 + return h +} diff --git a/internal/bloom/errors.go b/internal/bloom/errors.go new file mode 100644 index 00000000..fe38d1bc --- /dev/null +++ b/internal/bloom/errors.go @@ -0,0 +1,5 @@ +package bloom + +import "errors" + +var ErrInvalid = errors.New("bloom: invalid data") |
