aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.golangci.yaml4
-rw-r--r--cmd/doc.go3
-rw-r--r--cmd/explain-pack/delta.go120
-rw-r--r--cmd/explain-pack/doc.go10
-rw-r--r--cmd/explain-pack/entry.go257
-rw-r--r--cmd/explain-pack/fmt.go30
-rw-r--r--cmd/explain-pack/main.go240
-rw-r--r--cmd/explain-pack/resolve.go161
-rw-r--r--cmd/idx-bloom/doc.go8
-rw-r--r--cmd/idx-bloom/main.go99
-rw-r--r--internal/format/packidx/bloom/bloom.go180
-rw-r--r--internal/format/packidx/bloom/bloom_test.go159
-rw-r--r--internal/format/packidx/bloom/doc.go138
-rw-r--r--internal/format/packidx/bloom/lookup.go42
-rw-r--r--internal/format/packidx/bloom/lookup_test.go32
-rw-r--r--internal/format/packidx/bloom/roundtrip_test.go92
-rw-r--r--internal/format/packidx/bloom/write.go164
-rw-r--r--internal/format/packidx/bloom/write_test.go95
-rw-r--r--internal/format/packidx/lookup.go48
-rw-r--r--object/store/packed/internal/ingest/finalize.go45
-rw-r--r--object/store/packed/internal/ingest/result.go3
-rw-r--r--object/store/packed/internal/ingest/writepack_test.go77
-rw-r--r--object/store/packed/lookup.go4
-rw-r--r--object/store/packed/pack.go52
-rw-r--r--object/store/packed/quarantine.go2
-rw-r--r--object/tree/malformed_test.go1
-rw-r--r--research/dynamic_packfiles.txt179
-rw-r--r--research/packfile_bloom.txt133
28 files changed, 2369 insertions, 9 deletions
diff --git a/.golangci.yaml b/.golangci.yaml
index 0f2234be..cb9dec5c 100644
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -10,6 +10,10 @@ linters:
- path: internal/testgit
linters:
- err113 # test helpers report ad-hoc git failures; static errors add no value here
+ - path: cmd/
+ linters:
+ - err113
+ - wrapcheck
disable:
- dupword # extremely normal in tests and a pretty unnecessary linter
- goconst # unnecessary especially for our parsing code; many false positives
diff --git a/cmd/doc.go b/cmd/doc.go
new file mode 100644
index 00000000..119a6522
--- /dev/null
+++ b/cmd/doc.go
@@ -0,0 +1,3 @@
+// Package cmd provides some commands for testing furgit
+// and inspecting git repositories.
+package cmd
diff --git a/cmd/explain-pack/delta.go b/cmd/explain-pack/delta.go
new file mode 100644
index 00000000..22e32195
--- /dev/null
+++ b/cmd/explain-pack/delta.go
@@ -0,0 +1,120 @@
+package main
+
+import (
+ "fmt"
+)
+
+func (explainer *explainer) walkDelta(base, payload []byte, pos int) ([]byte, bool, error) {
+ explainer.printf("\tdelta\n")
+
+ building := base != nil
+
+ var result []byte
+
+ insn := 0
+
+ for pos < len(payload) {
+ op := payload[pos]
+ pos++
+ insn++
+
+ switch {
+ case op&0x80 != 0:
+ next, seg, err := explainer.decodeCopy(base, payload, pos, op)
+ if err != nil {
+ return nil, false, err
+ }
+
+ pos = next
+
+ if building {
+ result = append(result, seg...)
+ }
+ case op != 0:
+ next, lit, err := explainer.decodeInsert(payload, pos, int(op))
+ if err != nil {
+ return nil, false, err
+ }
+
+ pos = next
+
+ if building {
+ result = append(result, lit...)
+ }
+ default:
+ explainer.printf("\t\tinvalid opcode 0x00; stopping delta decode\n")
+
+ return nil, false, nil
+ }
+ }
+
+ if !building {
+ return nil, false, nil
+ }
+
+ return result, true, nil
+}
+
+func (explainer *explainer) decodeCopy(base, payload []byte, pos int, op byte) (int, []byte, error) {
+ offset := 0
+
+ for i := range 4 {
+ if op&(1<<uint(i)) == 0 {
+ continue
+ }
+
+ if pos >= len(payload) {
+ return 0, nil, fmt.Errorf("truncated copy offset")
+ }
+
+ offset |= int(payload[pos]) << (8 * uint(i))
+ pos++
+ }
+
+ size := 0
+
+ for i := range 3 {
+ if op&(1<<uint(4+i)) == 0 {
+ continue
+ }
+
+ if pos >= len(payload) {
+ return 0, nil, fmt.Errorf("truncated copy size")
+ }
+
+ size |= int(payload[pos]) << (8 * uint(i))
+ pos++
+ }
+
+ if size == 0 {
+ size = 0x10000
+ }
+
+ explainer.printf("\t\tcpy %d from %d\n", size, offset)
+
+ if base == nil {
+ return pos, nil, nil
+ }
+
+ if offset < 0 || offset+size > len(base) {
+ return 0, nil, fmt.Errorf("copy of %d byte(s) from base offset %d exceeds base length %d", size, offset, len(base))
+ }
+
+ seg := base[offset : offset+size]
+ hexBlock(explainer.out, "\t\t\t", seg)
+
+ return pos, seg, nil
+}
+
+func (explainer *explainer) decodeInsert(payload []byte, pos, n int) (int, []byte, error) {
+ if pos+n > len(payload) {
+ return 0, nil, fmt.Errorf("truncated insert payload")
+ }
+
+ lit := payload[pos : pos+n]
+
+ explainer.printf("\t\tins %d\n", n)
+ hexBlock(explainer.out, "\t\t\t", lit)
+
+ return pos + n, lit, nil
+}
diff --git a/cmd/explain-pack/doc.go b/cmd/explain-pack/doc.go
new file mode 100644
index 00000000..f5fcc986
--- /dev/null
+++ b/cmd/explain-pack/doc.go
@@ -0,0 +1,10 @@
+// Command explain-pack reads a Git packfile and writes a
+// human-readable explanation to stdout.
+//
+// With a pack filename argument
+// the pack is mmap'd
+// and a sibling .idx is used when present;
+// with no argument the pack is read from stdin.
+// A packfile does not record its object format,
+// so the format must be given with -format.
+package main
diff --git a/cmd/explain-pack/entry.go b/cmd/explain-pack/entry.go
new file mode 100644
index 00000000..0b796ef0
--- /dev/null
+++ b/cmd/explain-pack/entry.go
@@ -0,0 +1,257 @@
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+
+ "lindenii.org/go/furgit/internal/compress/zlib"
+ "lindenii.org/go/furgit/internal/format/packfile"
+ "lindenii.org/go/furgit/internal/format/packfile/delta"
+ "lindenii.org/go/furgit/object/tree"
+ "lindenii.org/go/lgo/intconv"
+)
+
+func (explainer *explainer) explainEntry(num, count, cursor int) (int, error) {
+ hashSize := explainer.objectFormat.Size()
+
+ header, err := packfile.ParseEntryHeader(explainer.data[cursor:], hashSize)
+ if err != nil {
+ return 0, fmt.Errorf("entry %d at offset %d: %w", num, cursor, err)
+ }
+
+ payloadStart := cursor + header.HeaderLen
+ if payloadStart > len(explainer.data) {
+ return 0, fmt.Errorf("entry %d at offset %d: header runs past the end of the pack", num, cursor)
+ }
+
+ payload, consumed, err := inflateAt(explainer.data[payloadStart:])
+ if err != nil {
+ return 0, fmt.Errorf("entry %d at offset %d: %w", num, cursor, err)
+ }
+
+ next := payloadStart + consumed
+
+ explainer.printf("object %d of %d\n", num, count)
+ explainer.printf("\tty\t%s\n", entryTypeLabel(header.Type))
+ explainer.printf("\tofs\t%d\n", cursor)
+ explainer.printf("\thdrsz\t%d\n", header.HeaderLen)
+ explainer.printf("\tsz\t%d\n", header.Size)
+
+ if uint64(len(payload)) != header.Size {
+ explainer.printf("\tnote\tdeclared %d byte(s) but inflated to %d\n", header.Size, len(payload))
+ }
+
+ if header.Type.IsBase() {
+ err = explainer.renderBase(cursor, header.Type, payload, consumed)
+ } else {
+ err = explainer.renderDelta(cursor, header, payload, consumed)
+ }
+
+ if err != nil {
+ return 0, fmt.Errorf("entry %d at offset %d: %w", num, cursor, err)
+ }
+
+ explainer.printf("\n")
+
+ return next, nil
+}
+
+func (explainer *explainer) renderBase(cursor int, entryType packfile.EntryType, content []byte, consumed int) error {
+ explainer.renderContent(entryType, content)
+
+ explainer.printf("\tzlib\t%d\n", consumed)
+
+ oid, err := explainer.recomputeOID(entryType, content)
+ if err != nil {
+ return err
+ }
+
+ explainer.printf("\toid\t%s\n", oid)
+
+ explainer.oidIndex[oid] = cursor
+ explainer.cache.Add(cursor, resolvedBase{entryType: entryType, content: content})
+
+ return nil
+}
+
+func (explainer *explainer) renderDelta(cursor int, header packfile.EntryHeader, payload []byte, consumed int) error {
+ baseSize, resultSize, pos, err := delta.ParseHeaderSizes(payload)
+ if err != nil {
+ return fmt.Errorf("delta header: %w", err)
+ }
+
+ err = explainer.renderBaseRef(cursor, header)
+ if err != nil {
+ return err
+ }
+
+ explainer.printf("\tbasesz\t%d\n", baseSize)
+ explainer.printf("\tnewsz\t%d\n", resultSize)
+
+ baseOffset, located, err := explainer.baseOffset(cursor, header)
+ if err != nil {
+ return err
+ }
+
+ var (
+ baseType packfile.EntryType
+ baseContent []byte
+ baseResolved bool
+ )
+
+ if located {
+ baseType, baseContent, baseResolved, err = explainer.reconstruct(baseOffset, 0)
+ if err != nil {
+ return err
+ }
+ }
+
+ var walkBase []byte
+ if baseResolved {
+ walkBase = baseContent
+ }
+
+ result, complete, err := explainer.walkDelta(walkBase, payload, pos)
+ if err != nil {
+ return err
+ }
+
+ explainer.printf("\tzlib\t%d\n", consumed)
+
+ switch {
+ case baseResolved && complete:
+ if uint64(len(result)) != resultSize {
+ explainer.printf("\tnote\tdelta produced %d byte(s) but declared %d\n", len(result), resultSize)
+ }
+
+ explainer.renderContent(baseType, result)
+
+ newOID, err := explainer.recomputeOID(baseType, result)
+ if err != nil {
+ return err
+ }
+
+ explainer.printf("\tnewoid\t%s\n", newOID)
+
+ explainer.oidIndex[newOID] = cursor
+ explainer.cache.Add(cursor, resolvedBase{entryType: baseType, content: result})
+ case !baseResolved:
+ explainer.printf("\tnote\tbase not available in this pack; cannot reconstruct\n")
+ default:
+ explainer.printf("\tnote\tdelta decode incomplete; cannot reconstruct\n")
+ }
+
+ return nil
+}
+
+func (explainer *explainer) renderBaseRef(cursor int, header packfile.EntryHeader) error {
+ switch header.Type {
+ case packfile.EntryTypeOfsDelta:
+ dist, err := intconv.Uint64ToInt(header.OfsDistance)
+ if err != nil {
+ return fmt.Errorf("ofs-delta distance overflows int: %w", err)
+ }
+
+ explainer.printf("\tbaseofs\t-%d = %d\n", dist, cursor-dist)
+ case packfile.EntryTypeRefDelta:
+ baseID, err := explainer.objectFormat.FromBytes(header.RefBase[:explainer.objectFormat.Size()])
+ if err != nil {
+ return fmt.Errorf("ref-delta base ID: %w", err)
+ }
+
+ explainer.printf("\tbaseoid\t%s\n", baseID)
+ case packfile.EntryTypeInvalid,
+ packfile.EntryTypeCommit,
+ packfile.EntryTypeTree,
+ packfile.EntryTypeBlob,
+ packfile.EntryTypeTag,
+ packfile.EntryTypeFuture:
+ }
+
+ return nil
+}
+
+func (explainer *explainer) renderContent(entryType packfile.EntryType, content []byte) {
+ switch entryType {
+ case packfile.EntryTypeCommit, packfile.EntryTypeTag:
+ explainer.printf("\tcontent\n")
+ indentBlock(explainer.out, "\t\t", content)
+ case packfile.EntryTypeTree:
+ explainer.renderTree(content)
+ case packfile.EntryTypeBlob,
+ packfile.EntryTypeOfsDelta,
+ packfile.EntryTypeRefDelta,
+ packfile.EntryTypeInvalid,
+ packfile.EntryTypeFuture:
+ explainer.printf("\thexdump\n")
+ hexBlock(explainer.out, "\t\t", content)
+ }
+}
+
+func (explainer *explainer) renderTree(content []byte) {
+ parsed, err := tree.Parse(content, explainer.objectFormat)
+ if err != nil {
+ explainer.printf("\thexdump\t(not a valid tree: %v)\n", err)
+ hexBlock(explainer.out, "\t\t", content)
+
+ return
+ }
+
+ explainer.printf("\ttree\n")
+
+ for _, entry := range parsed.Entries() {
+ mode := string(entry.Mode.Append(nil))
+ explainer.printf(
+ "\t\t%s %s %s\t%s\n",
+ mode, entry.Mode.ObjectType().Name(), entry.ID, entry.Name,
+ )
+ }
+}
+
+func inflateAt(data []byte) ([]byte, int, error) {
+ reader := bytes.NewReader(data)
+
+ zr, err := zlib.NewReader(reader)
+ if err != nil {
+ return nil, 0, fmt.Errorf("opening zlib stream: %w", err)
+ }
+
+ content, err := io.ReadAll(zr)
+ closeErr := zr.Close()
+
+ if err != nil {
+ return nil, 0, fmt.Errorf("inflating payload: %w", err)
+ }
+
+ if closeErr != nil {
+ return nil, 0, fmt.Errorf("closing zlib stream: %w", closeErr)
+ }
+
+ consumed := len(data) - reader.Len()
+
+ return content, consumed, nil
+}
+
+func entryTypeLabel(entryType packfile.EntryType) string {
+ switch entryType {
+ case packfile.EntryTypeCommit:
+ return "commit"
+ case packfile.EntryTypeTree:
+ return "tree"
+ case packfile.EntryTypeBlob:
+ return "blob"
+ case packfile.EntryTypeTag:
+ return "tag"
+ case packfile.EntryTypeOfsDelta:
+ return "ofs-delta"
+ case packfile.EntryTypeRefDelta:
+ return "ref-delta"
+ case packfile.EntryTypeInvalid:
+ return "invalid"
+ case packfile.EntryTypeFuture:
+ return "future"
+ default:
+ return fmt.Sprintf("unknown (%d)", entryType)
+ }
+}
diff --git a/cmd/explain-pack/fmt.go b/cmd/explain-pack/fmt.go
new file mode 100644
index 00000000..a3d1b333
--- /dev/null
+++ b/cmd/explain-pack/fmt.go
@@ -0,0 +1,30 @@
+package main
+
+import (
+ "bytes"
+ "encoding/hex"
+ "io"
+
+ "lindenii.org/go/furgit/internal/utils"
+)
+
+func indentBlock(out io.Writer, indent string, block []byte) {
+ lines := bytes.Split(block, []byte("\n"))
+ if n := len(lines); n > 0 && len(lines[n-1]) == 0 {
+ lines = lines[:n-1]
+ }
+
+ for _, line := range lines {
+ utils.BestEffortFprintf(out, "%s%s\n", indent, line)
+ }
+}
+
+func hexBlock(out io.Writer, indent string, data []byte) {
+ var buf bytes.Buffer
+
+ dumper := hex.Dumper(&buf)
+ _, _ = dumper.Write(data)
+ _ = dumper.Close()
+
+ indentBlock(out, indent, buf.Bytes())
+}
diff --git a/cmd/explain-pack/main.go b/cmd/explain-pack/main.go
new file mode 100644
index 00000000..af5b7480
--- /dev/null
+++ b/cmd/explain-pack/main.go
@@ -0,0 +1,240 @@
+package main
+
+import (
+ "bufio"
+ "bytes"
+ "encoding/hex"
+ "flag"
+ "fmt"
+ "io"
+ "os"
+ "strings"
+
+ "lindenii.org/go/furgit/internal/format/packfile"
+ "lindenii.org/go/furgit/internal/format/packidx"
+ "lindenii.org/go/furgit/internal/mmap"
+ "lindenii.org/go/furgit/internal/utils"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func main() {
+ format := flag.String("format", "", "object format of the pack: sha1 or sha256 (required)")
+
+ flag.Parse()
+
+ err := run(*format, flag.Args(), os.Stdin, os.Stdout)
+ if err != nil {
+ fmt.Fprintln(os.Stderr, "explain-pack:", err)
+ os.Exit(1)
+ }
+}
+
+type explainer struct {
+ data []byte
+ objectFormat id.ObjectFormat
+ out *bufio.Writer
+
+ idx *packidx.Packidx
+
+ cache *baseCache
+ oidIndex map[id.ObjectID]int
+}
+
+func run(format string, args []string, stdin io.Reader, stdout io.Writer) error {
+ if format == "" {
+ return fmt.Errorf("the -format flag is required (sha1 or sha256)")
+ }
+
+ objectFormat, err := id.ParseObjectFormat(format)
+ if err != nil {
+ return fmt.Errorf("invalid -format %q: %w", format, err)
+ }
+
+ if len(args) > 1 {
+ return fmt.Errorf("at most one pack file argument is accepted, got %d", len(args))
+ }
+
+ data, idx, closers, err := openInput(args, objectFormat, stdin)
+ if err != nil {
+ return err
+ }
+
+ defer func() {
+ for _, c := range closers {
+ _ = c.Close()
+ }
+ }()
+
+ out := bufio.NewWriter(stdout)
+
+ explainer := &explainer{
+ data: data,
+ objectFormat: objectFormat,
+ out: out,
+ idx: idx,
+ cache: newBaseCache(),
+ oidIndex: make(map[id.ObjectID]int),
+ }
+
+ err = explainer.explain()
+ if err != nil {
+ return err
+ }
+
+ return out.Flush()
+}
+
+func openInput(args []string, objectFormat id.ObjectFormat, stdin io.Reader) ([]byte, *packidx.Packidx, []io.Closer, error) {
+ if len(args) == 0 {
+ data, err := io.ReadAll(stdin)
+ if err != nil {
+ return nil, nil, nil, fmt.Errorf("reading pack from stdin: %w", err)
+ }
+
+ return data, nil, nil, nil
+ }
+
+ packPath := args[0]
+
+ packMapping, err := mapPath(packPath)
+ if err != nil {
+ return nil, nil, nil, err
+ }
+
+ closers := []io.Closer{packMapping}
+
+ idx, idxMapping, err := openIndex(packPath, objectFormat)
+ if err != nil {
+ _ = packMapping.Close()
+
+ return nil, nil, nil, err
+ }
+
+ if idxMapping != nil {
+ closers = append(closers, idxMapping)
+ }
+
+ return packMapping.Data(), idx, closers, nil
+}
+
+func openIndex(packPath string, objectFormat id.ObjectFormat) (*packidx.Packidx, *mmap.Mmap, error) {
+ idxPath := strings.TrimSuffix(packPath, ".pack") + ".idx"
+
+ file, err := os.Open(idxPath) //#nosec G304
+ if err != nil {
+ if os.IsNotExist(err) {
+ return nil, nil, nil
+ }
+
+ return nil, nil, fmt.Errorf("opening index %q: %w", idxPath, err)
+ }
+
+ defer func() { _ = file.Close() }()
+
+ mapping, err := mmap.Open(file)
+ if err != nil {
+ return nil, nil, fmt.Errorf("mapping index %q: %w", idxPath, err)
+ }
+
+ idx, err := packidx.Parse(mapping.Data(), objectFormat.Size())
+ if err != nil {
+ _ = mapping.Close()
+
+ return nil, nil, fmt.Errorf("parsing index %q: %w", idxPath, err)
+ }
+
+ return &idx, mapping, nil
+}
+
+func mapPath(path string) (*mmap.Mmap, error) {
+ file, err := os.Open(path) //#nosec G304
+ if err != nil {
+ return nil, fmt.Errorf("opening pack %q: %w", path, err)
+ }
+
+ defer func() { _ = file.Close() }()
+
+ mapping, err := mmap.Open(file)
+ if err != nil {
+ return nil, fmt.Errorf("mapping pack %q: %w", path, err)
+ }
+
+ return mapping, nil
+}
+
+func (explainer *explainer) printf(format string, args ...any) {
+ utils.BestEffortFprintf(explainer.out, format, args...)
+}
+
+func (explainer *explainer) explain() error {
+ hashSize := explainer.objectFormat.Size()
+
+ if len(explainer.data) < packfile.HeaderLen+hashSize {
+ return fmt.Errorf("pack is too short to contain a header and a %d-byte trailer", hashSize)
+ }
+
+ count, err := explainer.explainHeader()
+ if err != nil {
+ return err
+ }
+
+ cursor := packfile.HeaderLen
+
+ for num := 1; num <= count; num++ {
+ next, err := explainer.explainEntry(num, count, cursor)
+ if err != nil {
+ return err
+ }
+
+ cursor = next
+ }
+
+ return explainer.explainTrailer(cursor)
+}
+
+func (explainer *explainer) explainHeader() (int, error) {
+ header, err := packfile.ParseHeader(explainer.data[:packfile.HeaderLen])
+ if err != nil {
+ return 0, fmt.Errorf("pack header: %w", err)
+ }
+
+ explainer.printf("pack header\n")
+ explainer.printf("\tmagic\t\"PACK\"\n")
+ explainer.printf("\tversion\t2\n")
+ explainer.printf("\tobjects\t%d\n", header.ObjectCount)
+ explainer.printf("\n")
+
+ return int(header.ObjectCount), nil
+}
+
+func (explainer *explainer) explainTrailer(cursor int) error {
+ hashSize := explainer.objectFormat.Size()
+ trailerStart := len(explainer.data) - hashSize
+
+ if cursor != trailerStart {
+ explainer.printf(
+ "note\t%d byte(s) between the last entry and the trailer were unaccounted for\n",
+ trailerStart-cursor,
+ )
+ }
+
+ trailer := explainer.data[trailerStart:]
+
+ explainer.printf("pack trailer\n")
+ explainer.printf("\tchecksum\t%s\n", hex.EncodeToString(trailer))
+
+ hashImpl, err := explainer.objectFormat.New()
+ if err != nil {
+ return fmt.Errorf("object/store: %w", err)
+ }
+
+ _, _ = hashImpl.Write(explainer.data[:trailerStart])
+
+ if bytes.Equal(hashImpl.Sum(nil), trailer) {
+ explainer.printf("\trecomputed\tmatches\n")
+ } else {
+ explainer.printf("\trecomputed\tMISMATCH (corrupt pack or wrong -format)\n")
+ }
+
+ return nil
+}
diff --git a/cmd/explain-pack/resolve.go b/cmd/explain-pack/resolve.go
new file mode 100644
index 00000000..4396fe19
--- /dev/null
+++ b/cmd/explain-pack/resolve.go
@@ -0,0 +1,161 @@
+package main
+
+import (
+ "fmt"
+
+ "lindenii.org/go/furgit/internal/cache/clock"
+ "lindenii.org/go/furgit/internal/format/packfile"
+ "lindenii.org/go/furgit/internal/format/packfile/delta"
+ "lindenii.org/go/furgit/object/header"
+ "lindenii.org/go/furgit/object/id"
+ "lindenii.org/go/lgo/intconv"
+)
+
+const baseCacheMaxWeight = 64 << 20
+
+type resolvedBase struct {
+ entryType packfile.EntryType
+ content []byte
+}
+
+type baseCache = clock.Clock[int, resolvedBase]
+
+func newBaseCache() *baseCache {
+ return clock.New(baseCacheMaxWeight, func(_ int, base resolvedBase) uint64 {
+ return uint64(len(base.content)) + 32
+ })
+}
+
+func (explainer *explainer) reconstruct(offset, depth int) (packfile.EntryType, []byte, bool, error) {
+ var zero packfile.EntryType
+
+ if depth > delta.MaxChainDepth {
+ return zero, nil, false, fmt.Errorf("delta chain too deep at offset %d", offset)
+ }
+
+ if cached, ok := explainer.cache.Get(offset); ok {
+ return cached.entryType, cached.content, true, nil
+ }
+
+ header, err := packfile.ParseEntryHeader(explainer.data[offset:], explainer.objectFormat.Size())
+ if err != nil {
+ return zero, nil, false, fmt.Errorf("entry at offset %d: %w", offset, err)
+ }
+
+ payloadStart := offset + header.HeaderLen
+ if payloadStart > len(explainer.data) {
+ return zero, nil, false, fmt.Errorf("entry at offset %d: header runs past end of pack", offset)
+ }
+
+ if header.Type.IsBase() {
+ content, _, err := inflateAt(explainer.data[payloadStart:])
+ if err != nil {
+ return zero, nil, false, fmt.Errorf("entry at offset %d: %w", offset, err)
+ }
+
+ explainer.cache.Add(offset, resolvedBase{entryType: header.Type, content: content})
+
+ return header.Type, content, true, nil
+ }
+
+ baseOffset, ok, err := explainer.baseOffset(offset, header)
+ if err != nil {
+ return zero, nil, false, err
+ }
+
+ if !ok {
+ return zero, nil, false, nil
+ }
+
+ baseType, baseContent, ok, err := explainer.reconstruct(baseOffset, depth+1)
+ if err != nil || !ok {
+ return zero, nil, ok, err
+ }
+
+ payload, _, err := inflateAt(explainer.data[payloadStart:])
+ if err != nil {
+ return zero, nil, false, fmt.Errorf("entry at offset %d: %w", offset, err)
+ }
+
+ content, err := delta.Apply(baseContent, payload)
+ if err != nil {
+ return zero, nil, false, fmt.Errorf("entry at offset %d: %w", offset, err)
+ }
+
+ explainer.cache.Add(offset, resolvedBase{entryType: baseType, content: content})
+
+ return baseType, content, true, nil
+}
+
+func (explainer *explainer) baseOffset(offset int, header packfile.EntryHeader) (int, bool, error) {
+ switch header.Type {
+ case packfile.EntryTypeOfsDelta:
+ dist, err := intconv.Uint64ToInt(header.OfsDistance)
+ if err != nil || dist <= 0 || dist > offset {
+ return 0, false, fmt.Errorf("entry at offset %d: ofs-delta base out of bounds", offset)
+ }
+
+ return offset - dist, true, nil
+ case packfile.EntryTypeRefDelta:
+ refBytes := header.RefBase[:explainer.objectFormat.Size()]
+
+ if explainer.idx != nil {
+ baseOffsetU, found, err := explainer.idx.Lookup(refBytes)
+ if err != nil {
+ return 0, false, fmt.Errorf("entry at offset %d: index lookup: %w", offset, err)
+ }
+
+ if found {
+ baseOffset, err := intconv.Uint64ToInt(baseOffsetU)
+ if err != nil {
+ return 0, false, fmt.Errorf("entry at offset %d: index base offset overflows int: %w", offset, err)
+ }
+
+ return baseOffset, true, nil
+ }
+ }
+
+ baseID, err := explainer.objectFormat.FromBytes(refBytes)
+ if err != nil {
+ return 0, false, fmt.Errorf("entry at offset %d: %w", offset, err)
+ }
+
+ if baseOffset, found := explainer.oidIndex[baseID]; found {
+ return baseOffset, true, nil
+ }
+
+ return 0, false, nil
+ case packfile.EntryTypeInvalid,
+ packfile.EntryTypeCommit,
+ packfile.EntryTypeTree,
+ packfile.EntryTypeBlob,
+ packfile.EntryTypeTag,
+ packfile.EntryTypeFuture:
+ }
+
+ return 0, false, fmt.Errorf("entry at offset %d: not a delta entry", offset)
+}
+
+func (explainer *explainer) recomputeOID(entryType packfile.EntryType, content []byte) (id.ObjectID, error) {
+ var zero id.ObjectID
+
+ objectType, err := entryType.ObjectType()
+ if err != nil {
+ return zero, err
+ }
+
+ hashImpl, err := explainer.objectFormat.New()
+ if err != nil {
+ return zero, err
+ }
+
+ _, _ = hashImpl.Write(header.Append(nil, objectType, len(content)))
+ _, _ = hashImpl.Write(content)
+
+ oid, err := explainer.objectFormat.FromBytes(hashImpl.Sum(nil))
+ if err != nil {
+ return zero, err
+ }
+
+ return oid, nil
+}
diff --git a/cmd/idx-bloom/doc.go b/cmd/idx-bloom/doc.go
new file mode 100644
index 00000000..e7d4e818
--- /dev/null
+++ b/cmd/idx-bloom/doc.go
@@ -0,0 +1,8 @@
+// Command idx-bloom reads a Git pack index
+// and writes an IDBL Bloom filter over its object IDs to stdout.
+//
+// With an index filename argument the index is read from that file;
+// with no argument it is read from stdin.
+// A pack index does not record its object format,
+// so the format must be given with -format.
+package main
diff --git a/cmd/idx-bloom/main.go b/cmd/idx-bloom/main.go
new file mode 100644
index 00000000..fa471237
--- /dev/null
+++ b/cmd/idx-bloom/main.go
@@ -0,0 +1,99 @@
+package main
+
+import (
+ "flag"
+ "fmt"
+ "io"
+ "os"
+
+ "lindenii.org/go/furgit/internal/format/packidx"
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func main() {
+ format := flag.String("format", "", "object format of the index: sha1 or sha256 (required)")
+
+ flag.Parse()
+
+ err := run(*format, flag.Args(), os.Stdin, os.Stdout)
+ if err != nil {
+ fmt.Fprintln(os.Stderr, "idx-bloom:", err)
+ os.Exit(1)
+ }
+}
+
+func run(format string, args []string, stdin io.Reader, stdout io.Writer) error {
+ if format == "" {
+ return fmt.Errorf("the -format flag is required (sha1 or sha256)")
+ }
+
+ objectFormat, err := id.ParseObjectFormat(format)
+ if err != nil {
+ return fmt.Errorf("invalid -format %q: %w", format, err)
+ }
+
+ if len(args) > 1 {
+ return fmt.Errorf("at most one index file argument is accepted, got %d", len(args))
+ }
+
+ data, err := readInput(args, stdin)
+ if err != nil {
+ return err
+ }
+
+ index, err := packidx.Parse(data, objectFormat.Size())
+ if err != nil {
+ return fmt.Errorf("parsing index: %w", err)
+ }
+
+ filter, err := buildFilter(objectFormat, &index)
+ if err != nil {
+ return err
+ }
+
+ _, err = stdout.Write(filter)
+ if err != nil {
+ return fmt.Errorf("writing filter: %w", err)
+ }
+
+ return nil
+}
+
+func readInput(args []string, stdin io.Reader) ([]byte, error) {
+ if len(args) == 0 {
+ data, err := io.ReadAll(stdin)
+ if err != nil {
+ return nil, fmt.Errorf("reading index from stdin: %w", err)
+ }
+
+ return data, nil
+ }
+
+ data, err := os.ReadFile(args[0]) //#nosec G304
+ if err != nil {
+ return nil, fmt.Errorf("reading index %q: %w", args[0], err)
+ }
+
+ return data, nil
+}
+
+func buildFilter(objectFormat id.ObjectFormat, index *packidx.Packidx) ([]byte, error) {
+ objects := index.NumObjects()
+
+ bucketCount, k, err := bloom.RecommendParams(objectFormat, objects)
+ if err != nil {
+ return nil, fmt.Errorf("choosing parameters: %w", err)
+ }
+
+ builder, err := bloom.NewBuilder(objectFormat, bucketCount, k, index.PackHash())
+ if err != nil {
+ return nil, fmt.Errorf("creating builder: %w", err)
+ }
+
+ for pos := range objects {
+ builder.Add(index.OIDAt(pos))
+ }
+
+ return builder.Bytes(), nil
+}
diff --git a/internal/format/packidx/bloom/bloom.go b/internal/format/packidx/bloom/bloom.go
new file mode 100644
index 00000000..8b608221
--- /dev/null
+++ b/internal/format/packidx/bloom/bloom.go
@@ -0,0 +1,180 @@
+package bloom
+
+import (
+ "bytes"
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "math/bits"
+
+ "lindenii.org/go/furgit/object/id"
+)
+
+// ErrMalformedBloomFilter reports that
+// a Bloom filter is truncated,
+// has a bad signature, version, or hash function,
+// or has inconsistent parameters.
+var ErrMalformedBloomFilter = errors.New("internal/format/packidx/bloom: malformed bloom filter")
+
+const (
+ signature = 0x4944424c // "IDBL"
+ version = 1
+
+ // HeaderLen is the fixed header length in octets,
+ // i.e., the signature, version, hash function identifier,
+ // B, K, and the trailing zero padding.
+ HeaderLen = 64
+
+ // BucketLen is the length of one bucket in octets,
+ // chosen to match the most common cache-line size.
+ BucketLen = 64
+
+ // wordBits is the bit width of one bucket word.
+ wordBits = 64
+
+ // fieldBits is the width of one in-bucket position field.
+ fieldBits = 9
+)
+
+// checkParams validates the filter parameters
+// against one object hash size,
+// returning log2(bucketCount) on success.
+func checkParams(bucketCount uint32, k uint16, hashSize int) (uint, error) {
+ switch {
+ case bucketCount == 0 || bucketCount&(bucketCount-1) != 0:
+ return 0, errors.New("bucket count not a nonzero power of two") //nolint:err113
+ case k == 0:
+ return 0, errors.New("zero probe count") //nolint:err113
+ }
+
+ log2B := uint(bits.TrailingZeros32(bucketCount))
+ if log2B+fieldBits*uint(k) > uint(hashSize)*8 {
+ return 0, errors.New("parameters exceed hash length") //nolint:err113
+ }
+
+ return log2B, nil
+}
+
+// hashFunctionID returns the on-disk hash function identifier
+// for one object format.
+func hashFunctionID(objectFormat id.ObjectFormat) (uint32, error) {
+ switch objectFormat {
+ case id.ObjectFormatSHA1:
+ return 1, nil
+ case id.ObjectFormatSHA256:
+ return 2, nil
+ case id.ObjectFormatUnknown:
+ }
+
+ return 0, id.ErrInvalidObjectFormat
+}
+
+// Bloom is a parsed blocked Bloom filter view over borrowed bytes.
+//
+// Labels: Deps-Borrowed, Life-Parent, MT-Safe.
+type Bloom struct {
+ // data is the entire filter payload.
+ data []byte
+
+ // buckets is the bucket region, between the header and the trailer.
+ buckets []byte
+
+ // objectFormat is the filter's object format.
+ objectFormat id.ObjectFormat
+
+ // log2B is the base-2 logarithm of the bucket count,
+ // i.e. the number of leading object ID bits that select a bucket.
+ log2B uint
+
+ // k is the number of bits set and tested per object ID.
+ k int
+}
+
+// Parse parses one Bloom filter from data.
+//
+// Labels: Deps-Borrowed, Life-Parent.
+func Parse(data []byte, objectFormat id.ObjectFormat) (Bloom, error) {
+ var zero Bloom
+
+ wantHashID, err := hashFunctionID(objectFormat)
+ if err != nil {
+ return zero, err
+ }
+
+ hashSize := objectFormat.Size()
+
+ if len(data) < HeaderLen {
+ return zero, fmt.Errorf("%w: truncated", ErrMalformedBloomFilter)
+ }
+
+ if binary.BigEndian.Uint32(data) != signature {
+ return zero, fmt.Errorf("%w: bad signature", ErrMalformedBloomFilter)
+ }
+
+ if binary.BigEndian.Uint32(data[4:]) != version {
+ return zero, fmt.Errorf("%w: unsupported version", ErrMalformedBloomFilter)
+ }
+
+ if binary.BigEndian.Uint32(data[8:]) != wantHashID {
+ return zero, fmt.Errorf("%w: hash function mismatch", ErrMalformedBloomFilter)
+ }
+
+ bucketCount := binary.BigEndian.Uint32(data[12:])
+ k := binary.BigEndian.Uint16(data[16:])
+
+ for _, octet := range data[18:HeaderLen] {
+ if octet != 0 {
+ return zero, fmt.Errorf("%w: nonzero padding", ErrMalformedBloomFilter)
+ }
+ }
+
+ log2B, err := checkParams(bucketCount, k, hashSize)
+ if err != nil {
+ return zero, fmt.Errorf("%w: %w", ErrMalformedBloomFilter, err)
+ }
+
+ want := uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize) //#nosec G115
+ if uint64(len(data)) != want {
+ return zero, fmt.Errorf("%w: file size disagrees with bucket count", ErrMalformedBloomFilter)
+ }
+
+ return Bloom{
+ data: data,
+ buckets: data[HeaderLen : len(data)-2*hashSize],
+ objectFormat: objectFormat,
+ log2B: log2B,
+ k: int(k),
+ }, nil
+}
+
+// PackHash returns the pack hash recorded in the filter trailer.
+//
+// Labels: Life-Parent, Mut-No.
+func (f *Bloom) PackHash() []byte {
+ hashSize := f.objectFormat.Size()
+ end := len(f.data) - hashSize
+
+ return f.data[end-hashSize : end]
+}
+
+// Verify recomputes the filter's trailing checksum and reports any mismatch.
+//
+// Verify reads the whole filter,
+// so callers should treat it as a deliberate integrity check
+// rather than part of the open path.
+func (f *Bloom) Verify() error {
+ hashImpl, err := f.objectFormat.New()
+ if err != nil {
+ return fmt.Errorf("internal/format/packidx/bloom: %w", err)
+ }
+
+ checksumOff := len(f.data) - f.objectFormat.Size()
+
+ _, _ = hashImpl.Write(f.data[:checksumOff])
+
+ if !bytes.Equal(hashImpl.Sum(nil), f.data[checksumOff:]) {
+ return fmt.Errorf("%w: checksum mismatch", ErrMalformedBloomFilter)
+ }
+
+ return nil
+}
diff --git a/internal/format/packidx/bloom/bloom_test.go b/internal/format/packidx/bloom/bloom_test.go
new file mode 100644
index 00000000..bcfb4419
--- /dev/null
+++ b/internal/format/packidx/bloom/bloom_test.go
@@ -0,0 +1,159 @@
+package bloom_test
+
+import (
+ "encoding/binary"
+ "errors"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func validFilter(t *testing.T, format id.ObjectFormat) []byte {
+ // TODO: maybe testgit should have something like this?
+ t.Helper()
+
+ builder, err := bloom.NewBuilder(format, 4, 2, make([]byte, format.Size()))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ return builder.Bytes()
+}
+
+func otherFormat(t *testing.T, format id.ObjectFormat) id.ObjectFormat {
+ t.Helper()
+
+ for _, candidate := range id.SupportedObjectFormats() {
+ if candidate != format {
+ return candidate
+ }
+ }
+
+ t.Skip("only one supported object format")
+
+ return id.ObjectFormatUnknown
+}
+
+func TestParseValid(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ _, err := bloom.Parse(validFilter(t, format), format)
+ if err != nil {
+ t.Fatalf("Parse rejected a valid filter: %v", err)
+ }
+ })
+ }
+}
+
+func TestParseMalformed(t *testing.T) {
+ t.Parallel()
+
+ cases := []struct {
+ name string
+ mangle func(data []byte) []byte
+ }{
+ {"truncated", func(data []byte) []byte { return data[:bloom.HeaderLen-1] }},
+ {"bad signature", func(data []byte) []byte {
+ data[0] ^= 0xff
+
+ return data
+ }},
+ {"bad version", func(data []byte) []byte {
+ binary.BigEndian.PutUint32(data[4:], 99)
+
+ return data
+ }},
+ {"non power of two", func(data []byte) []byte {
+ binary.BigEndian.PutUint32(data[12:], 3)
+
+ return data
+ }},
+ {"zero probe count", func(data []byte) []byte {
+ binary.BigEndian.PutUint16(data[16:], 0)
+
+ return data
+ }},
+ {"parameters exceed hash", func(data []byte) []byte {
+ binary.BigEndian.PutUint32(data[12:], 1<<31)
+ binary.BigEndian.PutUint16(data[16:], 30)
+
+ return data
+ }},
+ {"nonzero padding", func(data []byte) []byte {
+ data[20] = 1
+
+ return data
+ }},
+ {"size disagrees", func(data []byte) []byte { return data[:len(data)-1] }},
+ }
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Parallel()
+
+ data := tc.mangle(append([]byte(nil), validFilter(t, format)...))
+
+ _, err := bloom.Parse(data, format)
+ if !errors.Is(err, bloom.ErrMalformedBloomFilter) {
+ t.Fatalf("Parse error = %v, want ErrMalformedBloomFilter", err)
+ }
+ })
+ }
+ })
+ }
+}
+
+// TestVerifyDetectsCorruption checks that Verify accepts a sound filter
+// and rejects one whose bucket bytes have been altered.
+func TestVerifyDetectsCorruption(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ data := validFilter(t, format)
+
+ filter, err := bloom.Parse(data, format)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ err = filter.Verify()
+ if err != nil {
+ t.Fatalf("Verify on a sound filter: %v", err)
+ }
+
+ data[bloom.HeaderLen] ^= 0xff
+
+ err = filter.Verify()
+ if !errors.Is(err, bloom.ErrMalformedBloomFilter) {
+ t.Fatalf("Verify error = %v, want ErrMalformedBloomFilter", err)
+ }
+ })
+ }
+}
+
+func TestParseHashMismatch(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ _, err := bloom.Parse(validFilter(t, format), otherFormat(t, format))
+ if !errors.Is(err, bloom.ErrMalformedBloomFilter) {
+ t.Fatalf("Parse error = %v, want ErrMalformedBloomFilter", err)
+ }
+ })
+ }
+}
diff --git a/internal/format/packidx/bloom/doc.go b/internal/format/packidx/bloom/doc.go
new file mode 100644
index 00000000..06ca57cd
--- /dev/null
+++ b/internal/format/packidx/bloom/doc.go
@@ -0,0 +1,138 @@
+// Package bloom provides a blocked Bloom filter
+// for pack indexes.
+//
+// A filter answers, from a single cache-line-sized read,
+// whether an object ID is definitely absent from the index it covers.
+// A lookup that must consult many packs
+// can then skip the full binary search
+// in every pack whose filter rejects the object,
+// decreasing the cost of misses.
+//
+// # Rationale
+//
+// Especially for server-side usage,
+// repacking is expensive,
+// and creating multi-pack-indexes is still rather expensive.
+// Incremental multi-pack-indexes partially solve this,
+// but having too many of them defeats the purpose,
+// since the indexes must still be walked in order
+// while performing expensive lookups.
+//
+// Instead, each multi-pack-index layer,
+// and each ordinary pack index,
+// may carry its own filter.
+// The indexes are still traversed in their usual order,
+// but the first step when traversing one
+// is to check whether it could possibly hold the wanted object.
+//
+// The filter is split into 64-octet buckets,
+// matching the most common cache-line size.
+// Some bits of the object ID choose the bucket,
+// and the rest choose several bit positions inside it,
+// so a lookup reads one 64-octet bucket
+// and checks whether all required bits are set.
+//
+// # Parameters
+//
+// A filter is parameterized by
+// the number of buckets B
+// and the number of bits set and tested per object ID, K.
+// All integers in the format are big endian.
+// The object ID is interpreted as a big-endian bitstring,
+// where bit offset 0 is the most significant bit of octet 0.
+// B must be a nonzero power of two,
+// K must be nonzero,
+// and log2(B) + 9*K must not exceed the hash length in bits.
+//
+// # File format
+//
+// A filter file is a 64-octet header,
+// then B buckets of 64 octets each,
+// then a two-hash trailer:
+//
+// - 4-octet signature: {'I', 'D', 'B', 'L'}.
+// - 4-octet version identifier (= 1).
+// - 4-octet object hash algorithm identifier
+// (= 1 for SHA-1, 2 for SHA-256).
+// - 4-octet B, the number of buckets.
+// - 2-octet K, the number of bits set and tested per object ID.
+// - 46-octet padding, which must be all zero.
+// - B buckets of 64 octets each.
+// - the pack trailer hash, which binds the filter to its pack.
+// - the checksum of everything before it, over the filter's hash function.
+//
+// The hash length is that of the object format,
+// so the trailer is 2 hashes wide
+// and the file size is exactly 64 + 64*B + 2*hashlen octets.
+//
+// A reader must validate that
+// the signature matches,
+// the version is supported,
+// the hash function identifier is recognized,
+// B is nonzero and a power of two,
+// K is nonzero,
+// log2(B) + 9*K does not exceed the hash length in bits,
+// the padding is all zero,
+// and the file size is exactly 64 + 64*B + 2*hashlen octets.
+//
+// # Binding and integrity
+//
+// The pack hash binds a filter to one pack;
+// a reader trusts a filter only when the recorded pack hash
+// matches the pack it accompanies.
+//
+// The checksum guards against corruption of the filter itself.
+// Recomputing it reads the whole file and rehashes it as fsck.
+//
+// # Lookup
+//
+// A lookup against one filter proceeds as follows:
+//
+// 1. Let b be the unsigned integer encoded
+// by the most significant log2(B) bits of the object ID.
+// B is a power of two, so 0 <= b < B.
+// 2. Select and read bucket b.
+// 3. For each 0 <= i < K,
+// take the i-th 9-bit field
+// from the 9*K bits that follow the bucket-selecting bits,
+// and let pi be the unsigned integer it encodes,
+// so 0 <= pi < 512.
+// Compute wi = pi >> 6 and bi = pi & 63,
+// so wi identifies one of the eight 64-bit words in bucket b
+// and bi identifies one bit within that word.
+// Within each 64-bit word,
+// bit index 0 is the most significant bit
+// and bit index 63 is the least significant bit.
+// Test whether bit bi is set in word wi of bucket b.
+//
+// If any test fails,
+// the object ID is definitely not in the covered index.
+// If all tests succeed,
+// the object ID may be in it.
+// Two of the K 9-bit fields can decode to the same pi,
+// so an insertion may set fewer than K distinct bits;
+// this only raises the false positive rate
+// and never causes a false negative.
+//
+// # Worked example
+//
+// Let B = 1 << 15 = 32768 and K = 8.
+// Then log2(B) = 15,
+// so each lookup uses 15 bits to choose the bucket
+// and 8*9 = 72 bits to choose the in-bucket positions,
+// for a total of 87 bits taken from the object ID.
+// A SHA-1 has 160 bits and a SHA-256 has 256 bits,
+// so both leave ample headroom.
+//
+// # Security considerations
+//
+// Object IDs are public unkeyed hashes,
+// so an adversary can mine packs
+// whose object IDs share a chosen prefix
+// to crowd objects into one bucket
+// and fill its bits.
+// In the worst case this renders some buckets useless,
+// making the filter degrade to "may contain" for those buckets,
+// but it never produces a false negative
+// and is not a significant denial-of-service vector.
+package bloom
diff --git a/internal/format/packidx/bloom/lookup.go b/internal/format/packidx/bloom/lookup.go
new file mode 100644
index 00000000..4a8d7bda
--- /dev/null
+++ b/internal/format/packidx/bloom/lookup.go
@@ -0,0 +1,42 @@
+package bloom
+
+import (
+ "encoding/binary"
+)
+
+// MayContain reports whether oid may be present
+// in the index covered by the filter.
+//
+// oid must be exactly the filter's hash size;
+// MayContain panics otherwise.
+//
+// Labels: Mut-No.
+func (f *Bloom) MayContain(oid []byte) bool {
+ if len(oid) != f.objectFormat.Size() {
+ panic("internal/format/packidx/bloom: invalid object ID length")
+ }
+
+ base := int(binary.BigEndian.Uint32(oid[:4])>>(32-f.log2B)) * BucketLen
+
+ for i := range f.k {
+ word, mask := probe(oid, f.log2B, i)
+ if binary.BigEndian.Uint64(f.buckets[base+word*8:])&mask == 0 {
+ return false
+ }
+ }
+
+ return true
+}
+
+// probe returns the bucket word index and single-bit mask
+// addressed by the i-th probe of oid.
+func probe(oid []byte, log2B uint, i int) (word int, mask uint64) {
+ bitOff := log2B + fieldBits*uint(i)
+ byteOff := bitOff >> 3
+ bitInByte := bitOff & 7
+
+ window := uint32(oid[byteOff])<<8 | uint32(oid[byteOff+1])
+ pi := (window >> (16 - bitInByte - fieldBits)) & 0x1ff
+
+ return int(pi >> 6), 1 << (wordBits - 1 - (pi & 63))
+}
diff --git a/internal/format/packidx/bloom/lookup_test.go b/internal/format/packidx/bloom/lookup_test.go
new file mode 100644
index 00000000..e6264f9a
--- /dev/null
+++ b/internal/format/packidx/bloom/lookup_test.go
@@ -0,0 +1,32 @@
+package bloom_test
+
+import (
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func TestMayContainBadLength(t *testing.T) {
+ t.Parallel()
+
+ format := id.ObjectFormatSHA256
+
+ builder, err := bloom.NewBuilder(format, 4, 2, make([]byte, format.Size()))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ filter, err := bloom.Parse(builder.Bytes(), format)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ defer func() {
+ if recover() == nil {
+ t.Fatal("MayContain did not panic on a short object ID")
+ }
+ }()
+
+ filter.MayContain(make([]byte, format.Size()-1))
+}
diff --git a/internal/format/packidx/bloom/roundtrip_test.go b/internal/format/packidx/bloom/roundtrip_test.go
new file mode 100644
index 00000000..28db17a5
--- /dev/null
+++ b/internal/format/packidx/bloom/roundtrip_test.go
@@ -0,0 +1,92 @@
+package bloom_test
+
+import (
+ "bytes"
+ "encoding/binary"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func makeOID(size int, seed uint64) []byte {
+ out := make([]byte, size)
+ state := seed
+
+ for i := 0; i < size; i += 8 {
+ state = state*6364136223846793005 + 1442695040888963407
+
+ var word [8]byte
+
+ binary.BigEndian.PutUint64(word[:], state)
+ copy(out[i:], word[:])
+ }
+
+ return out
+}
+
+func TestRoundTrip(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ const objects = 10000
+
+ bucketCount, k, err := bloom.RecommendParams(format, objects)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ size := format.Size()
+ packHash := makeOID(size, 0xC0FFEE)
+
+ builder, err := bloom.NewBuilder(format, bucketCount, k, packHash)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ for i := range objects {
+ builder.Add(makeOID(size, uint64(i)))
+ }
+
+ filter, err := bloom.Parse(builder.Bytes(), format)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if !bytes.Equal(filter.PackHash(), packHash) {
+ t.Fatalf("PackHash = %x, want %x", filter.PackHash(), packHash)
+ }
+
+ err = filter.Verify()
+ if err != nil {
+ t.Fatalf("Verify on a freshly built filter: %v", err)
+ }
+
+ for i := range objects {
+ if !filter.MayContain(makeOID(size, uint64(i))) {
+ t.Fatalf("false negative for added object %d", i)
+ }
+ }
+
+ const probes = 10000
+
+ falsePositives := 0
+
+ for i := range probes {
+ if filter.MayContain(makeOID(size, uint64(1)<<40+uint64(i))) {
+ falsePositives++
+ }
+ }
+
+ rate := float64(falsePositives) / float64(probes)
+ if rate > 0.05 {
+ t.Errorf("false positive rate %.4f exceeds 0.05", rate)
+ }
+
+ t.Logf("B=%d K=%d false positive rate %.4f", bucketCount, k, rate)
+ })
+ }
+}
diff --git a/internal/format/packidx/bloom/write.go b/internal/format/packidx/bloom/write.go
new file mode 100644
index 00000000..e6213a2c
--- /dev/null
+++ b/internal/format/packidx/bloom/write.go
@@ -0,0 +1,164 @@
+package bloom
+
+import (
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "hash"
+ "math/bits"
+
+ "lindenii.org/go/furgit/object/id"
+ "lindenii.org/go/lgo/intconv"
+)
+
+// ErrInvalidParameters reports that
+// the parameters supplied for a filter build
+// are not representable in the format.
+var ErrInvalidParameters = errors.New("internal/format/packidx/bloom: invalid parameters")
+
+// defaultK is the probe count used by [RecommendParams].
+//
+// With 512-bit buckets it keeps the false positive rate near one percent
+// at the target bucket load.
+const defaultK = 8
+
+// targetLoad is the object count per bucket that [RecommendParams] aims for.
+const targetLoad = 48
+
+// Builder accumulates object IDs into an in-memory Bloom filter
+// and serializes it.
+//
+// Labels: MT-Unsafe.
+type Builder struct {
+ // data is the full filter file, header and trailer included.
+ data []byte
+
+ // buckets aliases the bucket region of data, between header and trailer.
+ buckets []byte
+
+ // hashImpl computes the trailing checksum and gives the hash size.
+ hashImpl hash.Hash
+
+ log2B uint
+ k int
+}
+
+// NewBuilder creates a filter builder
+// for bucketCount buckets and k probes per object ID,
+// binding the filter to packHash.
+//
+// bucketCount must be a nonzero power of two,
+// k must be nonzero,
+// and log2(bucketCount) + 9*k must not exceed the hash length in bits.
+// packHash must be the pack's trailer hash;
+// NewBuilder panics when its length does not match the object format.
+func NewBuilder(objectFormat id.ObjectFormat, bucketCount uint32, k uint16, packHash []byte) (*Builder, error) {
+ hashID, err := hashFunctionID(objectFormat)
+ if err != nil {
+ return nil, err
+ }
+
+ hashImpl, err := objectFormat.New()
+ if err != nil {
+ return nil, fmt.Errorf("internal/format/packidx/bloom: %w", err)
+ }
+
+ hashSize := objectFormat.Size()
+
+ if len(packHash) != hashSize {
+ panic("internal/format/packidx/bloom: invalid pack hash length")
+ }
+
+ log2B, err := checkParams(bucketCount, k, hashSize)
+ if err != nil {
+ return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err)
+ }
+
+ total, err := intconv.Uint64ToInt(uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize)) //#nosec G115
+ if err != nil {
+ return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err)
+ }
+
+ data := make([]byte, total)
+ binary.BigEndian.PutUint32(data[0:], signature)
+ binary.BigEndian.PutUint32(data[4:], version)
+ binary.BigEndian.PutUint32(data[8:], hashID)
+ binary.BigEndian.PutUint32(data[12:], bucketCount)
+ binary.BigEndian.PutUint16(data[16:], k)
+
+ bucketsEnd := total - 2*hashSize
+ copy(data[bucketsEnd:], packHash)
+
+ return &Builder{
+ data: data,
+ buckets: data[HeaderLen:bucketsEnd],
+ hashImpl: hashImpl,
+ log2B: log2B,
+ k: int(k),
+ }, nil
+}
+
+// Add records oid in the filter.
+//
+// oid must be exactly the filter's hash size;
+// Add panics otherwise.
+func (b *Builder) Add(oid []byte) {
+ if len(oid) != b.hashImpl.Size() {
+ panic("internal/format/packidx/bloom: invalid object ID length")
+ }
+
+ base := int(binary.BigEndian.Uint32(oid[:4])>>(32-b.log2B)) * BucketLen
+
+ for i := range b.k {
+ word, mask := probe(oid, b.log2B, i)
+
+ off := base + word*8
+ set := binary.BigEndian.Uint64(b.buckets[off:]) | mask
+ binary.BigEndian.PutUint64(b.buckets[off:], set)
+ }
+}
+
+// Bytes returns the serialized filter, including its trailing checksum.
+//
+// Labels: Life-Parent, Mut-No.
+func (b *Builder) Bytes() []byte {
+ checksumOff := len(b.data) - b.hashImpl.Size()
+
+ b.hashImpl.Reset()
+ _, _ = b.hashImpl.Write(b.data[:checksumOff])
+ b.hashImpl.Sum(b.data[checksumOff:checksumOff])
+
+ return b.data
+}
+
+// RecommendParams returns filter parameters for an index of n objects,
+// targeting a false positive rate near one percent.
+func RecommendParams(objectFormat id.ObjectFormat, n int) (bucketCount uint32, k uint16, err error) {
+ hashSize := objectFormat.Size()
+ if hashSize == 0 {
+ return 0, 0, id.ErrInvalidObjectFormat
+ }
+
+ const maxPow2 = uint32(1) << 31
+
+ wanted := uint64(0)
+ if n > 0 {
+ wanted = (uint64(n) + targetLoad - 1) / targetLoad
+ }
+
+ switch {
+ case wanted <= 1:
+ bucketCount = 1
+ case wanted > uint64(maxPow2):
+ bucketCount = maxPow2
+ default:
+ bucketCount = uint32(1) << bits.Len64(wanted-1)
+ }
+
+ _, err = checkParams(bucketCount, defaultK, hashSize)
+ if err != nil {
+ return 0, 0, fmt.Errorf("%w: %w", ErrInvalidParameters, err)
+ }
+
+ return bucketCount, defaultK, nil
+}
diff --git a/internal/format/packidx/bloom/write_test.go b/internal/format/packidx/bloom/write_test.go
new file mode 100644
index 00000000..74173921
--- /dev/null
+++ b/internal/format/packidx/bloom/write_test.go
@@ -0,0 +1,95 @@
+package bloom_test
+
+import (
+ "errors"
+ "testing"
+
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
+ "lindenii.org/go/furgit/object/id"
+)
+
+func TestRecommendParams(t *testing.T) {
+ t.Parallel()
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ for _, n := range []int{0, 1, 1000, 10000, 1000000} {
+ bucketCount, k, err := bloom.RecommendParams(format, n)
+ if err != nil {
+ t.Fatalf("n=%d: %v", n, err)
+ }
+
+ if bucketCount == 0 || bucketCount&(bucketCount-1) != 0 {
+ t.Errorf("n=%d: bucket count %d not a power of two", n, bucketCount)
+ }
+
+ _, err = bloom.NewBuilder(format, bucketCount, k, make([]byte, format.Size()))
+ if err != nil {
+ t.Errorf("n=%d: recommended parameters rejected: %v", n, err)
+ }
+ }
+ })
+ }
+}
+
+func TestNewBuilderRejects(t *testing.T) {
+ t.Parallel()
+
+ cases := []struct {
+ name string
+ bucketCount uint32
+ k uint16
+ }{
+ {"zero buckets", 0, 8},
+ {"non power of two", 3, 8},
+ {"zero probe count", 4, 0},
+ }
+
+ for _, format := range id.SupportedObjectFormats() {
+ t.Run(format.String(), func(t *testing.T) {
+ t.Parallel()
+
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ t.Parallel()
+
+ _, err := bloom.NewBuilder(format, tc.bucketCount, tc.k, make([]byte, format.Size()))
+ if !errors.Is(err, bloom.ErrInvalidParameters) {
+ t.Fatalf("error = %v, want ErrInvalidParameters", err)
+ }
+ })
+ }
+ })
+ }
+}
+
+func TestNewBuilderBadPackHash(t *testing.T) {
+ t.Parallel()
+
+ defer func() {
+ if recover() == nil {
+ t.Fatal("NewBuilder did not panic on a short pack hash")
+ }
+ }()
+
+ _, _ = bloom.NewBuilder(id.ObjectFormatSHA256, 4, 2, make([]byte, id.ObjectFormatSHA256.Size()-1))
+}
+
+func TestAddBadLength(t *testing.T) {
+ t.Parallel()
+
+ builder, err := bloom.NewBuilder(id.ObjectFormatSHA256, 4, 2, make([]byte, id.ObjectFormatSHA256.Size()))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ defer func() {
+ if recover() == nil {
+ t.Fatal("Add did not panic on a short object ID")
+ }
+ }()
+
+ builder.Add(make([]byte, id.ObjectFormatSHA256.Size()-1))
+}
diff --git a/internal/format/packidx/lookup.go b/internal/format/packidx/lookup.go
index d1293f47..e4b565b6 100644
--- a/internal/format/packidx/lookup.go
+++ b/internal/format/packidx/lookup.go
@@ -4,6 +4,7 @@ import (
"bytes"
"encoding/binary"
"fmt"
+ "math/bits"
)
// Lookup searches the index for one object ID
@@ -18,6 +19,53 @@ func (idx *Packidx) Lookup(oid []byte) (offset uint64, found bool, err error) {
lo, hi := idx.fanoutRange(oid[0])
+ // Object IDs are uniform for honest inputs,
+ // interp on next 8 octets converges in O(log log n).
+ //
+ // OIDs are public unkeyed hashes,
+ // so an attacker may sample/filter/mine prefix clusters,
+ // making interpolation mis-estimate every probe.
+ // See https://runxiyu.org/comp/ch4ht/
+ // for why cryptographic hash algorithms are insufficient.
+ //
+ // Cap the interp at bisect's probe count and finish by bisect;
+ // adversarial at O(log n) plus small interpolation overhead,
+ // honest exit well under the cap.
+ target := binary.BigEndian.Uint64(oid[1:9])
+
+ for budget := bits.Len(uint(hi - lo)); hi-lo > 8 && budget > 0; budget-- {
+ loKey := binary.BigEndian.Uint64(idx.OIDAt(lo)[1:9])
+ hiKey := binary.BigEndian.Uint64(idx.OIDAt(hi - 1)[1:9])
+
+ var mid int
+
+ switch {
+ case target <= loKey:
+ mid = lo
+ case target >= hiKey:
+ mid = hi - 1
+ default:
+ hi128, lo128 := bits.Mul64(target-loKey, uint64(hi-lo-1)) //#nosec G115
+ q, _ := bits.Div64(hi128, lo128, hiKey-loKey)
+ mid = lo + int(q) //#nosec G115
+ }
+
+ switch cmp := bytes.Compare(oid, idx.OIDAt(mid)); {
+ case cmp == 0:
+ offset, err = idx.OffsetAt(mid)
+ if err != nil {
+ return 0, false, err
+ }
+
+ return offset, true, nil
+ case cmp < 0:
+ hi = mid
+ default:
+ lo = mid + 1
+ }
+ }
+
+ // Interpolation narrowed or capped; bisect to finish.
for lo < hi {
mid := lo + (hi-lo)/2
diff --git a/object/store/packed/internal/ingest/finalize.go b/object/store/packed/internal/ingest/finalize.go
index f0ab6622..afed996c 100644
--- a/object/store/packed/internal/ingest/finalize.go
+++ b/object/store/packed/internal/ingest/finalize.go
@@ -8,6 +8,7 @@ import (
"slices"
"lindenii.org/go/furgit/internal/format/packidx"
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
"lindenii.org/go/furgit/internal/format/packrev"
"lindenii.org/go/furgit/object/id"
"lindenii.org/go/lgo/intconv"
@@ -38,12 +39,27 @@ func (ingestion *ingestion) finalize() (Result, error) {
return Result{}, err
}
+ bloomBuilder, err := ingestion.buildBloom(entries, packHash)
+ if err != nil {
+ return Result{}, err
+ }
+
+ bloomTmp, err := ingestion.writeTemp("tmp_bloom_", func(w io.Writer) error {
+ _, err := w.Write(bloomBuilder.Bytes())
+
+ return err
+ })
+ if err != nil {
+ return Result{}, err
+ }
+
base := "pack-" + ingestion.packHash.String()
packFinal := base + ".pack"
idxFinal := base + ".idx"
revFinal := base + ".rev"
+ bloomFinal := base + ".bloom"
- // Link the pack and reverse index before the index,
+ // Link the pack, reverse index, and Bloom filter before the index,
// since the index is what publishes the pack to readers.
err = ingestion.link(ingestion.packTmp, packFinal)
if err != nil {
@@ -55,6 +71,11 @@ func (ingestion *ingestion) finalize() (Result, error) {
return Result{}, err
}
+ err = ingestion.link(bloomTmp, bloomFinal)
+ if err != nil {
+ return Result{}, err
+ }
+
err = ingestion.link(idxTmp, idxFinal)
if err != nil {
return Result{}, err
@@ -69,12 +90,34 @@ func (ingestion *ingestion) finalize() (Result, error) {
PackName: packFinal,
IdxName: idxFinal,
RevName: revFinal,
+ BloomName: bloomFinal,
PackHash: ingestion.packHash,
ObjectCount: objectCount,
ThinFixed: ingestion.thinFixed,
}, nil
}
+// buildBloom builds a Bloom filter over the index entries' object IDs,
+// bound to packHash.
+func (ingestion *ingestion) buildBloom(entries []packidx.Entry, packHash []byte) (*bloom.Builder, error) {
+ bucketCount, k, err := bloom.RecommendParams(ingestion.objectFormat, len(entries))
+ if err != nil {
+ return nil, fmt.Errorf("object/store/packed/internal/ingest: %w", err)
+ }
+
+ builder, err := bloom.NewBuilder(ingestion.objectFormat, bucketCount, k, packHash)
+ if err != nil {
+ return nil, fmt.Errorf("object/store/packed/internal/ingest: %w", err)
+ }
+
+ size := ingestion.objectFormat.Size()
+ for i := range entries {
+ builder.Add(entries[i].OID[:size])
+ }
+
+ return builder, nil
+}
+
// indexEntries returns the index entries in object-ID order
// and, for each record in pack order, its position in that index order.
func (ingestion *ingestion) indexEntries() ([]packidx.Entry, []uint32, error) {
diff --git a/object/store/packed/internal/ingest/result.go b/object/store/packed/internal/ingest/result.go
index 0ae5593a..9cd6ef1d 100644
--- a/object/store/packed/internal/ingest/result.go
+++ b/object/store/packed/internal/ingest/result.go
@@ -13,6 +13,9 @@ type Result struct {
// RevName is the destination-relative name of the written reverse index.
RevName string
+ // BloomName is the destination-relative name of the written Bloom filter.
+ BloomName string
+
// PackHash is the pack trailer hash
// shared by the pack, index, and reverse index.
PackHash id.ObjectID
diff --git a/object/store/packed/internal/ingest/writepack_test.go b/object/store/packed/internal/ingest/writepack_test.go
index 394d8f6e..adc0ba35 100644
--- a/object/store/packed/internal/ingest/writepack_test.go
+++ b/object/store/packed/internal/ingest/writepack_test.go
@@ -8,6 +8,8 @@ import (
"path/filepath"
"testing"
+ "lindenii.org/go/furgit/internal/format/packidx"
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
"lindenii.org/go/furgit/internal/testgit"
"lindenii.org/go/furgit/object/id"
"lindenii.org/go/furgit/object/store"
@@ -89,6 +91,81 @@ func TestWritePackMatchesGit(t *testing.T) {
}
}
+// TestWritePackBloom verifies that ingesting a pack writes a Bloom filter
+// that reports every object in the pack as present.
+func TestWritePackBloom(t *testing.T) {
+ t.Parallel()
+
+ for _, objectFormat := range id.SupportedObjectFormats() {
+ t.Run(objectFormat.String(), func(t *testing.T) {
+ t.Parallel()
+
+ repo, err := testgit.NewRepo(t, testgit.RepoOptions{ObjectFormat: objectFormat})
+ if err != nil {
+ t.Fatalf("NewRepo: %v", err)
+ }
+
+ seeded, err := repo.SeedHistory(t)
+ if err != nil {
+ t.Fatalf("SeedHistory: %v", err)
+ }
+
+ gitPrefix, err := repo.PackObjects(t, seeded.All(), testgit.PackObjectsOptions{
+ RevIndex: true,
+ Revs: false,
+ Exclude: nil,
+ })
+ if err != nil {
+ t.Fatalf("PackObjects: %v", err)
+ }
+
+ stream, err := os.ReadFile(gitPrefix + ".pack") //nolint:gosec
+ if err != nil {
+ t.Fatalf("ReadFile pack: %v", err)
+ }
+
+ dir, result := writePack(t, objectFormat, bytes.NewReader(stream), store.PackWriteOptions{
+ ThinBase: nil,
+ Progress: nil,
+ })
+
+ if result.BloomName == "" {
+ t.Fatal("BloomName is empty")
+ }
+
+ bloomBytes, err := os.ReadFile(filepath.Join(dir, result.BloomName)) //nolint:gosec
+ if err != nil {
+ t.Fatalf("ReadFile bloom: %v", err)
+ }
+
+ filter, err := bloom.Parse(bloomBytes, objectFormat)
+ if err != nil {
+ t.Fatalf("bloom.Parse: %v", err)
+ }
+
+ idxBytes, err := os.ReadFile(filepath.Join(dir, result.IdxName)) //nolint:gosec
+ if err != nil {
+ t.Fatalf("ReadFile idx: %v", err)
+ }
+
+ index, err := packidx.Parse(idxBytes, objectFormat.Size())
+ if err != nil {
+ t.Fatalf("packidx.Parse: %v", err)
+ }
+
+ if !bytes.Equal(filter.PackHash(), index.PackHash()) {
+ t.Fatalf("filter pack hash %x, want %x", filter.PackHash(), index.PackHash())
+ }
+
+ for pos := range index.NumObjects() {
+ if !filter.MayContain(index.OIDAt(pos)) {
+ t.Fatalf("filter rejects object at index position %d", pos)
+ }
+ }
+ })
+ }
+}
+
// TestWritePackEmpty verifies that a zero-object pack
// succeeds without writing any artifacts.
func TestWritePackEmpty(t *testing.T) {
diff --git a/object/store/packed/lookup.go b/object/store/packed/lookup.go
index e54d34b2..e06870a9 100644
--- a/object/store/packed/lookup.go
+++ b/object/store/packed/lookup.go
@@ -24,6 +24,10 @@ func (packed *Packed) lookup(objectID id.ObjectID) (*pack, int, error) {
oid := objectID.RawBytes()
for _, p := range packed.order.Keys() {
+ if p.filter != nil && !p.filter.MayContain(oid) {
+ continue
+ }
+
offsetU, found, err := p.idx.Lookup(oid)
if err != nil {
return nil, 0, fmt.Errorf("%w: pack %q: %w", ErrMalformedPackedStore, p.name, err)
diff --git a/object/store/packed/pack.go b/object/store/packed/pack.go
index dd43bc7a..9cd6162b 100644
--- a/object/store/packed/pack.go
+++ b/object/store/packed/pack.go
@@ -8,6 +8,7 @@ import (
"lindenii.org/go/furgit/internal/format/packfile"
"lindenii.org/go/furgit/internal/format/packidx"
+ "lindenii.org/go/furgit/internal/format/packidx/bloom"
"lindenii.org/go/furgit/internal/mmap"
"lindenii.org/go/furgit/object/id"
"lindenii.org/go/lgo/intconv"
@@ -36,6 +37,9 @@ type pack struct {
// and data aliases them.
dataMapping *mmap.Mmap
data []byte
+
+ bloomMapping *mmap.Mmap
+ filter *bloom.Bloom
}
// openPack opens, maps, and validates
@@ -69,15 +73,41 @@ func openPack(root *os.Root, name string, objectFormat id.ObjectFormat) (*pack,
return nil, fmt.Errorf("%w: pack %q: %w", ErrMalformedPackedStore, name, err)
}
+ bloomMapping, filter := openBloom(root, name, objectFormat, idx.PackHash())
+
return &pack{
- name: name,
- idxMapping: idxMapping,
- idx: idx,
- dataMapping: dataMapping,
- data: dataMapping.Data(),
+ name: name,
+ idxMapping: idxMapping,
+ idx: idx,
+ dataMapping: dataMapping,
+ data: dataMapping.Data(),
+ bloomMapping: bloomMapping,
+ filter: filter,
}, nil
}
+func openBloom(root *os.Root, name string, objectFormat id.ObjectFormat, packHash []byte) (*mmap.Mmap, *bloom.Bloom) {
+ mapping, err := mapFile(root, name+".bloom")
+ if err != nil {
+ return nil, nil
+ }
+
+ filter, err := bloom.Parse(mapping.Data(), objectFormat)
+ if err != nil {
+ _ = mapping.Close()
+
+ return nil, nil
+ }
+
+ if !bytes.Equal(filter.PackHash(), packHash) {
+ _ = mapping.Close()
+
+ return nil, nil
+ }
+
+ return mapping, &filter
+}
+
// mapFile opens and maps one file under root.
func mapFile(root *os.Root, name string) (*mmap.Mmap, error) {
file, err := root.Open(name)
@@ -125,10 +155,16 @@ func validatePackData(data []byte, idx *packidx.Packidx, hashSize int) error {
return nil
}
-// close releases the pack data and index mappings.
+// close releases the pack data, index, and filter mappings.
func (pack *pack) close() error {
- return errors.Join(
+ errs := []error{
pack.dataMapping.Close(),
pack.idxMapping.Close(),
- )
+ }
+
+ if pack.bloomMapping != nil {
+ errs = append(errs, pack.bloomMapping.Close())
+ }
+
+ return errors.Join(errs...)
}
diff --git a/object/store/packed/quarantine.go b/object/store/packed/quarantine.go
index 5e0b85cb..977a9543 100644
--- a/object/store/packed/quarantine.go
+++ b/object/store/packed/quarantine.go
@@ -156,6 +156,8 @@ func packPromotionPriority(name string) int {
return 1
case strings.HasPrefix(name, "pack-") && strings.HasSuffix(name, ".rev"):
return 2
+ case strings.HasPrefix(name, "pack-") && strings.HasSuffix(name, ".bloom"):
+ return 2
case strings.HasPrefix(name, "pack-") && strings.HasSuffix(name, ".idx"):
return 3
default:
diff --git a/object/tree/malformed_test.go b/object/tree/malformed_test.go
index ca00ea94..8a22b90f 100644
--- a/object/tree/malformed_test.go
+++ b/object/tree/malformed_test.go
@@ -44,6 +44,7 @@ func TestParseMalformed(t *testing.T) {
{name: "unsorted", body: append(record("100644", "b", size), record("100644", "a", size)...)},
{name: "duplicate", body: append(record("100644", "a", size), record("100644", "a", size)...)},
{name: "conflicting-tree-blob", body: append(record("100644", "foo", size), record("40000", "foo", size)...)},
+ {name: "conflicting-tree-blob-nonadjacent", body: append(append(record("100644", "foo", size), record("100644", "foo.c", size)...), record("40000", "foo", size)...)},
} {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
diff --git a/research/dynamic_packfiles.txt b/research/dynamic_packfiles.txt
new file mode 100644
index 00000000..e4fe7e54
--- /dev/null
+++ b/research/dynamic_packfiles.txt
@@ -0,0 +1,179 @@
+dynamic packfiles to append objects
+
+gc/refcount process punches page-sized holes in them for pages fully
+within the space of unwanted objects, after setting a tombstone mark
+
+holes are recorded in an index and re-used
+
+then, if desired, the repack process removes all the punched holes
+and anything surrounding from unwanted objects that are slightly out
+of the page boundary
+
+repack is not really git's repack algorithm, it's bascially just
+defragmentation.
+
+genreational bloom filters
+
+idx design
+==========
+
+so, let's first get our invariants and patterns clear.
+
+* fixed-length cryptographic object IDs
+* essentially uniform key distribution
+* exact lookup only, no range scans, no ordered iteration requirements
+* reads are extremely important
+* writes are mostly append-like
+* deletes/tombstones may happen later but are secondary
+
+1st design
+----------
+
+* mutable front index
+* immutable base index
+* period merge/compaction into a new base generation
+
+
+
+upload-pack/send-pack/defrag
+============================
+
+take current pack, remove dead objects/holes, filter objects out, record
+offsets and adjust ofs_deltas since they always go backwards, write the pack
+back; then stream written pack to client. two-step necessary because pack
+header includes object count; could have a custom new protocol that doesn't do
+so.
+
+random chat log dump
+====================
+<~runxiyu> ori: actually. i think my hashtable-ish .idx scheme doesn't work really well with e.g. "user provided us a small part of the hash"
+<~runxiyu> and when using the Git CLI, abbreviated hashes are extremely common....
+<~runxiyu> not lik ei'd need them in a *forge*
+<~runxiyu> but ugh
+<~runxiyu> i guess i'm going with some sort of b-tree :((
+<~runxiyu> ~~maybe i should just port gefs to git~~
+<&ori> runxiyu: why not? you should be able to pick the pages based on the prefix and then scan, no?
+<~rx> ori: i need to somehow munge the has to prevent page directory explosions
+<~rx> the hash*
+<~rx> e.g. siphash(objectid, secret)
+<~rx> otherwise an attacker could give you 10M objects that start with 00000 and whatnot
+<&ori> what's the worst case that would happen there, and is it exponentially worse than giving you 10M objects that start with anything?
+<&ori> I'm thinking that you can't generate a case worse than 256/nobject extra table lookups, assuming one bit per fanout..
+<~runxiyu> ori: for extendible hashing, yes, definitely worse
+<~runxiyu> the directory will expand a lot for no good reason
+<&ori> yes, but you have 256 bits of hash
+<&ori> how much is a lot worse?
+<&ori> what's the worst an attacker can do, and how is the impact worse than uploading 10M giant objects?
+<&ori> also, spotted a bag of kuai kuai keeping the cash register working today at a tea shop
+<~runxiyu> waitt
+<~runxiyu> hmmm
+ * runxiyu looks agagin if it's O(N) or O(2^N)
+<~runxiyu> well
+<~runxiyu> i think it should be a O(2^n) directory size when the attacker can control n bits prefix
+<&ori> what's the 'n' here?
+<~runxiyu> > can control n bits prefix
+<&ori> yeah, you run out of prefix pretty quickly, though
+<&ori> I'm not seeing how you could get an exponential blowup if you share pages
+<&ori> may be missing something, though
+<~runxiyu> hm
+<&ori> oh, wait, I see
+<&ori> no, wait
+<~runxiyu> i think im confusing myself too to some extent but something doesn't feel right
+<~runxiyu> urgh
+<~runxiyu> okay, rethinking this
+<~runxiyu> d is the global depth
+<~runxiyu> diretory size is 2^d
+<~runxiyu> B records per bucket
+<~runxiyu> whatever happens inside the bucket idc, let's say it's a linked list
+<~runxiyu> whatever happens inside the bucket idc, let's say it's an array* (linked lists suck)
+<~runxiyu> l <= d
+<~runxiyu> (l being the local depth of a bucket)
+<~runxiyu> normal: d = log^2(N/B)
+<&ori> ahh, I see.
+<~runxiyu> N is the object count
+<&ori> yes, so what if you binary searched the page directory, or made it multi-level
+<~runxiyu> an attacker could grab a giant repo and find commonly-prefixed objects, they don't need to brute force their own
+<~runxiyu> ori: remember we're trying to do something easy to add new objects into
+<~runxiyu> how'd you do that with a binary search?
+<~runxiyu> not sure what you mean by multi-level yet here
+<~runxiyu> well, it could just turn into a b+tree...
+<~runxiyu> hm
+<&ori> multilevel -- you have pd[0] using bits 0..n
+<~runxiyu> maybe an lmdb object store isn't too bad after all
+<&ori> pd[0][1] using bits n...m
+<&ori> etc
+<&ori> and the reason I was a bit confused was that I had thought the directory was a trie
+<&ori> rather than just an expanding top level directory
+<~runxiyu> ah
+<&ori> so, yeah, I was thinking you could make the page directory an actual trie
+<~runxiyu> sigh
+<~runxiyu> i guess abbreviated object IDs is something i can't really skip.
+<~runxiyu> ori: ill look into radix trees and LSM trees too
+<~runxiyu> well, you're basically suggesting a radix tree i guess
+<~runxiyu> well actually
+<~runxiyu> radix might not necessarily be the best trie here
+<~runxiyu> idk
+<~runxiyu> hm
+<~runxiyu> firstly im really heavy on reads
+<~runxiyu> and random keys with no sequential access
+<~runxiyu> ok LSM makes no sense
+<&hax[xor]> > O(2^N)
+<~runxiyu> ori: thoughts on how to make tries reasonable to use on disks?
+<&hax[xor]> that sounds like something is already very broken
+<~runxiyu> hax[xor]: wdym
+<&hax[xor]> directory size should absolutely not scale like that
+<~runxiyu> hax[xor]: maybe read up on how extendible hsahing works again?
+<&hax[xor]> probably but if that's how it scales it still sounds verybroken
+<~runxiyu> n is not the amount of objects
+<~runxiyu> it's a pathlogic condition caused by chosne-prefix keys
+<~runxiyu> (your keys are usually supposed to be hashed into something the attacker can't predict)
+<&hax[xor]> if you mean the directory size scales linearly with the number of objects the attacker puts in it... that sounds perfectly normal?
+<&ori> runxiyu: same as extendible hashing, just after you extend to, say, 8 bits, you stop splitting the page directory, and have subdirectories
+<~runxiyu> ori: that could make senes
+<~runxiyu> haven't thought it through
+<~runxiyu> directory size is 2^d, d being the global depth
+<~runxiyu> urgh i need to review for exams
+<~runxiyu> okay
+<~runxiyu> write amplification issue
+<~runxiyu> im not sure how significant this is for realistic git workloads
+<~runxiyu> i haven't counted, but there should be many, many, many more reads than writes
+<~runxiyu> if write amplification is really an issue
+<&ori> I may go wander around a bit.
+<~runxiyu> then ill just port gefs
+<~runxiyu> ori: do you mean IRL, or over dynamic pack data structures-
+<&ori> irl.
+<~runxiyu> alright that makes more sense :P
+<&ori> tomorrow I think I check out Jiufen
+<~runxiyu> frick i want to be able to type epsilon with compose
+<&ori> is that not possible?
+<~runxiyu> i don't seem to be able to
+<~runxiyu> but idk the compose tables on my system
+<~runxiyu> ε
+<~runxiyu> well
+<~runxiyu> unicode hex input always works :/
+<~runxiyu> OKAY FUCK
+<~runxiyu> I keep getting distracted by interesting things
+<~runxiyu> I need to review for my fucking exams
+-- Mode #chat [-q runxiyu] by runxiyu
+-- Mode #chat [-a runxiyu] by runxiyu
+-- #chat: You must be a channel halfop or higher to set channel mode b (ban).
+-- Mode #chat [+b mute:account:runxiyu] by runxiyu
+-- #chat: You cannot send messages to this channel whilst a m: (mute) extban is set matching you.
+-- #chat: You cannot send messages to this channel whilst a m: (mute) extban is set matching you.
+<&f_> does that even work?
+<&ori> for 9front, <alt>*e gives ε
+<&ori> but, don't remember the compose map
+<&ori> thought that there was a similar thing for all greek letters
+
+
+See also:
+https://github.com/inkandswitch/darn
+https://www.youtube.com/watch?v=nk4nefmguZk
+https://crates.io/crates/iroh-blobs
+https://crates.io/crates/bao-tree
+
+
+Actually, who cares about abbreviated hashes?
+Clients. Clients only.
+It will be a separate interface satisfied by "normal repos"
+but not satisfied by our store. Good enough.
diff --git a/research/packfile_bloom.txt b/research/packfile_bloom.txt
new file mode 100644
index 00000000..63acafbe
--- /dev/null
+++ b/research/packfile_bloom.txt
@@ -0,0 +1,133 @@
+Packfile bloom filter RFC
+=========================
+
+Problem
+-------
+
+Especially for server-side usages, repacking is extremely expensive, and
+creating multi-pack-indexes is still rather expensive. Incremental MIDX
+partially solves this, but would defeat the purpose of MIDX when there are too
+many of them, as Git would still have to walk the MIDXes in order while
+performing expensive indexing queries.
+
+Idea
+----
+
+Each MIDX layer, and each non-MIDX index, comes with a bloom filter. MIDXes and
+ordinary .idx files are still traversed in their usual order, but the first
+step when traversing them, is to check whether that index could possibly have
+the desired object, through a bloom filter.
+
+We will want the filters to be mmaped, and we want the lookup cost to be
+dominated by one cache-line read rather than using many scattered reads.
+Therefore, a blocked bloom filter is likely the right direction here. The steps
+are as follows:
+
+1. Split the filter into 64-octet buckets, since 64 octets is the most common
+ cache-line size.
+2. Use some bits of the object ID to choose the bucket.
+3. Use the rest of the key to choose several bit positions inside that bucket.
+4. A lookup thus reads one 64-octet bucket and checks whether all required bits
+ are set.
+
+Definitions
+-----------
+
+Let:
+
+ B := number of buckets
+ K := number of bits set and tested per object ID
+
+* All integers here are big endian.
+* The OID is to be interpreted as a big-endian bitstring, where bit offset 0
+ is the most significant bit of octet 0.
+* log2(B) + 9K <= hash length in bits.
+
+File layout
+-----------
+
+* 4-octet signature: {'I', 'D', 'B', 'L'}
+* 4-octet version identifier (= 1)
+* 4-octet object hash algorithm identifier (= 1 for SHA-1, 2 for SHA-256)
+* 4-octet B (number of buckets)
+* 2-octet K (number of bits set and tested per object ID)
+* 46-octet padding (must be all zeros)
+* B buckets of 64 octets each.
+
+Validation
+----------
+
+* Matching signature
+* Supported version (the rest of the rules are for this version)
+* Hash function identifier must be recognized
+* B must be nonzero and a power of two
+* K must be nonzero
+* log2(B) + 9K <= hash length in bits
+* Padding must be all zero
+* File size must be 64 + 64 * B octets
+
+Lookup procedure
+----------------
+
+1. Let b be the unsigned integer encoded by the most significant log2(B) bits
+ of OID. B is a power of two, and 0 <= b < B.
+2. Select and read bucket b.
+3. For each 0 <= i < K:
+ 1. Start immediately after the most significant log2(B) bits of OID, let the
+ i-th 9-bit field be the bits at offset 9 * i through 9 * i + 8 within the
+ next 9 * K bits of the OID.
+ 2. Let pi be the unsigned integer encoded by that 9-bit field.
+ Then, 0 <= pi < 512.
+ 3. Compute wi := pi >> 6, and bi := pi & 63.
+ Thus, wi identifies one of the 8 64-bit words in bucket b, and bi
+ identifies one bit within that word.
+ 4. Test whether bi is set in the word wi of bucket b. (Within each 64-bit
+ word, bit index 0 denotes the most significant bit, and bit index 63
+ denotes the least significant bit.)
+
+If any test fails, the OID is definitely not in the relevant idx.
+If all tests succeed, the OID may be in the relevant idx.
+
+Note that two of the K 9-bit fields can decode to the same pi, which means an
+insertion may set fewer than K distinct bits.
+
+Worked example
+--------------
+
+Let:
+
+ B = 1 << 15 = 32768
+ K = 8
+
+Then, log2(B) = 15. Each lookup thus uses 15 bits to choose the bucket
+and 8 * 9 = 72 bits to choose the in-bucket positions, for a total of
+87 bits taken from the object ID.
+
+1. Read the first 15 bits of OID and interpret them as b, where
+ 0 <= b < 32768.
+2. Read bucket b.
+3. For each 0 <= i < 8:
+ 1. Read the i-th 9-bit field from the next 72 bits of OID and interpret it
+ as pi, where 0 <= pi < 512.
+ 2. Compute: wi = pi >> 6, bi = pi & 63.
+ 3. Test whether bit bi is set in the word wi of bucket b.
+
+Security considerations
+------------------------
+
+An adversarial packfile where objects are (computationally intensive, even for
+SHA-1 as vulnerable as it is) constructed to have the same prefix for the
+relevant object format hash algorithm could be used to fill up the bloom
+filters, rendering some buckets useless. In the worst case, if they somehow
+fill all filters, this proposal's optimizations become useless, but would not
+be a significant DoS vector.
+
+TODOs
+-----
+
+* Consider dropping mmap (page read vs cachline read)
+* How should B and K be chosen?
+* How does creation/insert work? Note that packfiles and `.idx`es are immutable.
+* What are the sizes?
+* What are the false positive rates?
+* How are benchmarks?