aboutsummaryrefslogtreecommitdiff
path: root/internal/format/packidx/bloom/bloom.go
blob: 8b6082210601576455d0e7e3175645ce0bf62fa2 (about) (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
package bloom

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"math/bits"

	"lindenii.org/go/furgit/object/id"
)

// ErrMalformedBloomFilter reports that
// a Bloom filter is truncated,
// has a bad signature, version, or hash function,
// or has inconsistent parameters.
var ErrMalformedBloomFilter = errors.New("internal/format/packidx/bloom: malformed bloom filter")

const (
	signature = 0x4944424c // "IDBL"
	version   = 1

	// HeaderLen is the fixed header length in octets,
	// i.e., the signature, version, hash function identifier,
	// B, K, and the trailing zero padding.
	HeaderLen = 64

	// BucketLen is the length of one bucket in octets,
	// chosen to match the most common cache-line size.
	BucketLen = 64

	// wordBits is the bit width of one bucket word.
	wordBits = 64

	// fieldBits is the width of one in-bucket position field.
	fieldBits = 9
)

// checkParams validates the filter parameters
// against one object hash size,
// returning log2(bucketCount) on success.
func checkParams(bucketCount uint32, k uint16, hashSize int) (uint, error) {
	switch {
	case bucketCount == 0 || bucketCount&(bucketCount-1) != 0:
		return 0, errors.New("bucket count not a nonzero power of two") //nolint:err113
	case k == 0:
		return 0, errors.New("zero probe count") //nolint:err113
	}

	log2B := uint(bits.TrailingZeros32(bucketCount))
	if log2B+fieldBits*uint(k) > uint(hashSize)*8 {
		return 0, errors.New("parameters exceed hash length") //nolint:err113
	}

	return log2B, nil
}

// hashFunctionID returns the on-disk hash function identifier
// for one object format.
func hashFunctionID(objectFormat id.ObjectFormat) (uint32, error) {
	switch objectFormat {
	case id.ObjectFormatSHA1:
		return 1, nil
	case id.ObjectFormatSHA256:
		return 2, nil
	case id.ObjectFormatUnknown:
	}

	return 0, id.ErrInvalidObjectFormat
}

// Bloom is a parsed blocked Bloom filter view over borrowed bytes.
//
// Labels: Deps-Borrowed, Life-Parent, MT-Safe.
type Bloom struct {
	// data is the entire filter payload.
	data []byte

	// buckets is the bucket region, between the header and the trailer.
	buckets []byte

	// objectFormat is the filter's object format.
	objectFormat id.ObjectFormat

	// log2B is the base-2 logarithm of the bucket count,
	// i.e. the number of leading object ID bits that select a bucket.
	log2B uint

	// k is the number of bits set and tested per object ID.
	k int
}

// Parse parses one Bloom filter from data.
//
// Labels: Deps-Borrowed, Life-Parent.
func Parse(data []byte, objectFormat id.ObjectFormat) (Bloom, error) {
	var zero Bloom

	wantHashID, err := hashFunctionID(objectFormat)
	if err != nil {
		return zero, err
	}

	hashSize := objectFormat.Size()

	if len(data) < HeaderLen {
		return zero, fmt.Errorf("%w: truncated", ErrMalformedBloomFilter)
	}

	if binary.BigEndian.Uint32(data) != signature {
		return zero, fmt.Errorf("%w: bad signature", ErrMalformedBloomFilter)
	}

	if binary.BigEndian.Uint32(data[4:]) != version {
		return zero, fmt.Errorf("%w: unsupported version", ErrMalformedBloomFilter)
	}

	if binary.BigEndian.Uint32(data[8:]) != wantHashID {
		return zero, fmt.Errorf("%w: hash function mismatch", ErrMalformedBloomFilter)
	}

	bucketCount := binary.BigEndian.Uint32(data[12:])
	k := binary.BigEndian.Uint16(data[16:])

	for _, octet := range data[18:HeaderLen] {
		if octet != 0 {
			return zero, fmt.Errorf("%w: nonzero padding", ErrMalformedBloomFilter)
		}
	}

	log2B, err := checkParams(bucketCount, k, hashSize)
	if err != nil {
		return zero, fmt.Errorf("%w: %w", ErrMalformedBloomFilter, err)
	}

	want := uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize) //#nosec G115
	if uint64(len(data)) != want {
		return zero, fmt.Errorf("%w: file size disagrees with bucket count", ErrMalformedBloomFilter)
	}

	return Bloom{
		data:         data,
		buckets:      data[HeaderLen : len(data)-2*hashSize],
		objectFormat: objectFormat,
		log2B:        log2B,
		k:            int(k),
	}, nil
}

// PackHash returns the pack hash recorded in the filter trailer.
//
// Labels: Life-Parent, Mut-No.
func (f *Bloom) PackHash() []byte {
	hashSize := f.objectFormat.Size()
	end := len(f.data) - hashSize

	return f.data[end-hashSize : end]
}

// Verify recomputes the filter's trailing checksum and reports any mismatch.
//
// Verify reads the whole filter,
// so callers should treat it as a deliberate integrity check
// rather than part of the open path.
func (f *Bloom) Verify() error {
	hashImpl, err := f.objectFormat.New()
	if err != nil {
		return fmt.Errorf("internal/format/packidx/bloom: %w", err)
	}

	checksumOff := len(f.data) - f.objectFormat.Size()

	_, _ = hashImpl.Write(f.data[:checksumOff])

	if !bytes.Equal(hashImpl.Sum(nil), f.data[checksumOff:]) {
		return fmt.Errorf("%w: checksum mismatch", ErrMalformedBloomFilter)
	}

	return nil
}