diff options
28 files changed, 2369 insertions, 9 deletions
diff --git a/.golangci.yaml b/.golangci.yaml index 0f2234be..cb9dec5c 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -10,6 +10,10 @@ linters: - path: internal/testgit linters: - err113 # test helpers report ad-hoc git failures; static errors add no value here + - path: cmd/ + linters: + - err113 + - wrapcheck disable: - dupword # extremely normal in tests and a pretty unnecessary linter - goconst # unnecessary especially for our parsing code; many false positives diff --git a/cmd/doc.go b/cmd/doc.go new file mode 100644 index 00000000..119a6522 --- /dev/null +++ b/cmd/doc.go @@ -0,0 +1,3 @@ +// Package cmd provides some commands for testing furgit +// and inspecting git repositories. +package cmd diff --git a/cmd/explain-pack/delta.go b/cmd/explain-pack/delta.go new file mode 100644 index 00000000..22e32195 --- /dev/null +++ b/cmd/explain-pack/delta.go @@ -0,0 +1,120 @@ +package main + +import ( + "fmt" +) + +func (explainer *explainer) walkDelta(base, payload []byte, pos int) ([]byte, bool, error) { + explainer.printf("\tdelta\n") + + building := base != nil + + var result []byte + + insn := 0 + + for pos < len(payload) { + op := payload[pos] + pos++ + insn++ + + switch { + case op&0x80 != 0: + next, seg, err := explainer.decodeCopy(base, payload, pos, op) + if err != nil { + return nil, false, err + } + + pos = next + + if building { + result = append(result, seg...) + } + case op != 0: + next, lit, err := explainer.decodeInsert(payload, pos, int(op)) + if err != nil { + return nil, false, err + } + + pos = next + + if building { + result = append(result, lit...) + } + default: + explainer.printf("\t\tinvalid opcode 0x00; stopping delta decode\n") + + return nil, false, nil + } + } + + if !building { + return nil, false, nil + } + + return result, true, nil +} + +func (explainer *explainer) decodeCopy(base, payload []byte, pos int, op byte) (int, []byte, error) { + offset := 0 + + for i := range 4 { + if op&(1<<uint(i)) == 0 { + continue + } + + if pos >= len(payload) { + return 0, nil, fmt.Errorf("truncated copy offset") + } + + offset |= int(payload[pos]) << (8 * uint(i)) + pos++ + } + + size := 0 + + for i := range 3 { + if op&(1<<uint(4+i)) == 0 { + continue + } + + if pos >= len(payload) { + return 0, nil, fmt.Errorf("truncated copy size") + } + + size |= int(payload[pos]) << (8 * uint(i)) + pos++ + } + + if size == 0 { + size = 0x10000 + } + + explainer.printf("\t\tcpy %d from %d\n", size, offset) + + if base == nil { + return pos, nil, nil + } + + if offset < 0 || offset+size > len(base) { + return 0, nil, fmt.Errorf("copy of %d byte(s) from base offset %d exceeds base length %d", size, offset, len(base)) + } + + seg := base[offset : offset+size] + hexBlock(explainer.out, "\t\t\t", seg) + + return pos, seg, nil +} + +func (explainer *explainer) decodeInsert(payload []byte, pos, n int) (int, []byte, error) { + if pos+n > len(payload) { + return 0, nil, fmt.Errorf("truncated insert payload") + } + + lit := payload[pos : pos+n] + + explainer.printf("\t\tins %d\n", n) + hexBlock(explainer.out, "\t\t\t", lit) + + return pos + n, lit, nil +} diff --git a/cmd/explain-pack/doc.go b/cmd/explain-pack/doc.go new file mode 100644 index 00000000..f5fcc986 --- /dev/null +++ b/cmd/explain-pack/doc.go @@ -0,0 +1,10 @@ +// Command explain-pack reads a Git packfile and writes a +// human-readable explanation to stdout. +// +// With a pack filename argument +// the pack is mmap'd +// and a sibling .idx is used when present; +// with no argument the pack is read from stdin. +// A packfile does not record its object format, +// so the format must be given with -format. +package main diff --git a/cmd/explain-pack/entry.go b/cmd/explain-pack/entry.go new file mode 100644 index 00000000..0b796ef0 --- /dev/null +++ b/cmd/explain-pack/entry.go @@ -0,0 +1,257 @@ +package main + +import ( + "bytes" + "fmt" + "io" + + "lindenii.org/go/furgit/internal/compress/zlib" + "lindenii.org/go/furgit/internal/format/packfile" + "lindenii.org/go/furgit/internal/format/packfile/delta" + "lindenii.org/go/furgit/object/tree" + "lindenii.org/go/lgo/intconv" +) + +func (explainer *explainer) explainEntry(num, count, cursor int) (int, error) { + hashSize := explainer.objectFormat.Size() + + header, err := packfile.ParseEntryHeader(explainer.data[cursor:], hashSize) + if err != nil { + return 0, fmt.Errorf("entry %d at offset %d: %w", num, cursor, err) + } + + payloadStart := cursor + header.HeaderLen + if payloadStart > len(explainer.data) { + return 0, fmt.Errorf("entry %d at offset %d: header runs past the end of the pack", num, cursor) + } + + payload, consumed, err := inflateAt(explainer.data[payloadStart:]) + if err != nil { + return 0, fmt.Errorf("entry %d at offset %d: %w", num, cursor, err) + } + + next := payloadStart + consumed + + explainer.printf("object %d of %d\n", num, count) + explainer.printf("\tty\t%s\n", entryTypeLabel(header.Type)) + explainer.printf("\tofs\t%d\n", cursor) + explainer.printf("\thdrsz\t%d\n", header.HeaderLen) + explainer.printf("\tsz\t%d\n", header.Size) + + if uint64(len(payload)) != header.Size { + explainer.printf("\tnote\tdeclared %d byte(s) but inflated to %d\n", header.Size, len(payload)) + } + + if header.Type.IsBase() { + err = explainer.renderBase(cursor, header.Type, payload, consumed) + } else { + err = explainer.renderDelta(cursor, header, payload, consumed) + } + + if err != nil { + return 0, fmt.Errorf("entry %d at offset %d: %w", num, cursor, err) + } + + explainer.printf("\n") + + return next, nil +} + +func (explainer *explainer) renderBase(cursor int, entryType packfile.EntryType, content []byte, consumed int) error { + explainer.renderContent(entryType, content) + + explainer.printf("\tzlib\t%d\n", consumed) + + oid, err := explainer.recomputeOID(entryType, content) + if err != nil { + return err + } + + explainer.printf("\toid\t%s\n", oid) + + explainer.oidIndex[oid] = cursor + explainer.cache.Add(cursor, resolvedBase{entryType: entryType, content: content}) + + return nil +} + +func (explainer *explainer) renderDelta(cursor int, header packfile.EntryHeader, payload []byte, consumed int) error { + baseSize, resultSize, pos, err := delta.ParseHeaderSizes(payload) + if err != nil { + return fmt.Errorf("delta header: %w", err) + } + + err = explainer.renderBaseRef(cursor, header) + if err != nil { + return err + } + + explainer.printf("\tbasesz\t%d\n", baseSize) + explainer.printf("\tnewsz\t%d\n", resultSize) + + baseOffset, located, err := explainer.baseOffset(cursor, header) + if err != nil { + return err + } + + var ( + baseType packfile.EntryType + baseContent []byte + baseResolved bool + ) + + if located { + baseType, baseContent, baseResolved, err = explainer.reconstruct(baseOffset, 0) + if err != nil { + return err + } + } + + var walkBase []byte + if baseResolved { + walkBase = baseContent + } + + result, complete, err := explainer.walkDelta(walkBase, payload, pos) + if err != nil { + return err + } + + explainer.printf("\tzlib\t%d\n", consumed) + + switch { + case baseResolved && complete: + if uint64(len(result)) != resultSize { + explainer.printf("\tnote\tdelta produced %d byte(s) but declared %d\n", len(result), resultSize) + } + + explainer.renderContent(baseType, result) + + newOID, err := explainer.recomputeOID(baseType, result) + if err != nil { + return err + } + + explainer.printf("\tnewoid\t%s\n", newOID) + + explainer.oidIndex[newOID] = cursor + explainer.cache.Add(cursor, resolvedBase{entryType: baseType, content: result}) + case !baseResolved: + explainer.printf("\tnote\tbase not available in this pack; cannot reconstruct\n") + default: + explainer.printf("\tnote\tdelta decode incomplete; cannot reconstruct\n") + } + + return nil +} + +func (explainer *explainer) renderBaseRef(cursor int, header packfile.EntryHeader) error { + switch header.Type { + case packfile.EntryTypeOfsDelta: + dist, err := intconv.Uint64ToInt(header.OfsDistance) + if err != nil { + return fmt.Errorf("ofs-delta distance overflows int: %w", err) + } + + explainer.printf("\tbaseofs\t-%d = %d\n", dist, cursor-dist) + case packfile.EntryTypeRefDelta: + baseID, err := explainer.objectFormat.FromBytes(header.RefBase[:explainer.objectFormat.Size()]) + if err != nil { + return fmt.Errorf("ref-delta base ID: %w", err) + } + + explainer.printf("\tbaseoid\t%s\n", baseID) + case packfile.EntryTypeInvalid, + packfile.EntryTypeCommit, + packfile.EntryTypeTree, + packfile.EntryTypeBlob, + packfile.EntryTypeTag, + packfile.EntryTypeFuture: + } + + return nil +} + +func (explainer *explainer) renderContent(entryType packfile.EntryType, content []byte) { + switch entryType { + case packfile.EntryTypeCommit, packfile.EntryTypeTag: + explainer.printf("\tcontent\n") + indentBlock(explainer.out, "\t\t", content) + case packfile.EntryTypeTree: + explainer.renderTree(content) + case packfile.EntryTypeBlob, + packfile.EntryTypeOfsDelta, + packfile.EntryTypeRefDelta, + packfile.EntryTypeInvalid, + packfile.EntryTypeFuture: + explainer.printf("\thexdump\n") + hexBlock(explainer.out, "\t\t", content) + } +} + +func (explainer *explainer) renderTree(content []byte) { + parsed, err := tree.Parse(content, explainer.objectFormat) + if err != nil { + explainer.printf("\thexdump\t(not a valid tree: %v)\n", err) + hexBlock(explainer.out, "\t\t", content) + + return + } + + explainer.printf("\ttree\n") + + for _, entry := range parsed.Entries() { + mode := string(entry.Mode.Append(nil)) + explainer.printf( + "\t\t%s %s %s\t%s\n", + mode, entry.Mode.ObjectType().Name(), entry.ID, entry.Name, + ) + } +} + +func inflateAt(data []byte) ([]byte, int, error) { + reader := bytes.NewReader(data) + + zr, err := zlib.NewReader(reader) + if err != nil { + return nil, 0, fmt.Errorf("opening zlib stream: %w", err) + } + + content, err := io.ReadAll(zr) + closeErr := zr.Close() + + if err != nil { + return nil, 0, fmt.Errorf("inflating payload: %w", err) + } + + if closeErr != nil { + return nil, 0, fmt.Errorf("closing zlib stream: %w", closeErr) + } + + consumed := len(data) - reader.Len() + + return content, consumed, nil +} + +func entryTypeLabel(entryType packfile.EntryType) string { + switch entryType { + case packfile.EntryTypeCommit: + return "commit" + case packfile.EntryTypeTree: + return "tree" + case packfile.EntryTypeBlob: + return "blob" + case packfile.EntryTypeTag: + return "tag" + case packfile.EntryTypeOfsDelta: + return "ofs-delta" + case packfile.EntryTypeRefDelta: + return "ref-delta" + case packfile.EntryTypeInvalid: + return "invalid" + case packfile.EntryTypeFuture: + return "future" + default: + return fmt.Sprintf("unknown (%d)", entryType) + } +} diff --git a/cmd/explain-pack/fmt.go b/cmd/explain-pack/fmt.go new file mode 100644 index 00000000..a3d1b333 --- /dev/null +++ b/cmd/explain-pack/fmt.go @@ -0,0 +1,30 @@ +package main + +import ( + "bytes" + "encoding/hex" + "io" + + "lindenii.org/go/furgit/internal/utils" +) + +func indentBlock(out io.Writer, indent string, block []byte) { + lines := bytes.Split(block, []byte("\n")) + if n := len(lines); n > 0 && len(lines[n-1]) == 0 { + lines = lines[:n-1] + } + + for _, line := range lines { + utils.BestEffortFprintf(out, "%s%s\n", indent, line) + } +} + +func hexBlock(out io.Writer, indent string, data []byte) { + var buf bytes.Buffer + + dumper := hex.Dumper(&buf) + _, _ = dumper.Write(data) + _ = dumper.Close() + + indentBlock(out, indent, buf.Bytes()) +} diff --git a/cmd/explain-pack/main.go b/cmd/explain-pack/main.go new file mode 100644 index 00000000..af5b7480 --- /dev/null +++ b/cmd/explain-pack/main.go @@ -0,0 +1,240 @@ +package main + +import ( + "bufio" + "bytes" + "encoding/hex" + "flag" + "fmt" + "io" + "os" + "strings" + + "lindenii.org/go/furgit/internal/format/packfile" + "lindenii.org/go/furgit/internal/format/packidx" + "lindenii.org/go/furgit/internal/mmap" + "lindenii.org/go/furgit/internal/utils" + "lindenii.org/go/furgit/object/id" +) + +func main() { + format := flag.String("format", "", "object format of the pack: sha1 or sha256 (required)") + + flag.Parse() + + err := run(*format, flag.Args(), os.Stdin, os.Stdout) + if err != nil { + fmt.Fprintln(os.Stderr, "explain-pack:", err) + os.Exit(1) + } +} + +type explainer struct { + data []byte + objectFormat id.ObjectFormat + out *bufio.Writer + + idx *packidx.Packidx + + cache *baseCache + oidIndex map[id.ObjectID]int +} + +func run(format string, args []string, stdin io.Reader, stdout io.Writer) error { + if format == "" { + return fmt.Errorf("the -format flag is required (sha1 or sha256)") + } + + objectFormat, err := id.ParseObjectFormat(format) + if err != nil { + return fmt.Errorf("invalid -format %q: %w", format, err) + } + + if len(args) > 1 { + return fmt.Errorf("at most one pack file argument is accepted, got %d", len(args)) + } + + data, idx, closers, err := openInput(args, objectFormat, stdin) + if err != nil { + return err + } + + defer func() { + for _, c := range closers { + _ = c.Close() + } + }() + + out := bufio.NewWriter(stdout) + + explainer := &explainer{ + data: data, + objectFormat: objectFormat, + out: out, + idx: idx, + cache: newBaseCache(), + oidIndex: make(map[id.ObjectID]int), + } + + err = explainer.explain() + if err != nil { + return err + } + + return out.Flush() +} + +func openInput(args []string, objectFormat id.ObjectFormat, stdin io.Reader) ([]byte, *packidx.Packidx, []io.Closer, error) { + if len(args) == 0 { + data, err := io.ReadAll(stdin) + if err != nil { + return nil, nil, nil, fmt.Errorf("reading pack from stdin: %w", err) + } + + return data, nil, nil, nil + } + + packPath := args[0] + + packMapping, err := mapPath(packPath) + if err != nil { + return nil, nil, nil, err + } + + closers := []io.Closer{packMapping} + + idx, idxMapping, err := openIndex(packPath, objectFormat) + if err != nil { + _ = packMapping.Close() + + return nil, nil, nil, err + } + + if idxMapping != nil { + closers = append(closers, idxMapping) + } + + return packMapping.Data(), idx, closers, nil +} + +func openIndex(packPath string, objectFormat id.ObjectFormat) (*packidx.Packidx, *mmap.Mmap, error) { + idxPath := strings.TrimSuffix(packPath, ".pack") + ".idx" + + file, err := os.Open(idxPath) //#nosec G304 + if err != nil { + if os.IsNotExist(err) { + return nil, nil, nil + } + + return nil, nil, fmt.Errorf("opening index %q: %w", idxPath, err) + } + + defer func() { _ = file.Close() }() + + mapping, err := mmap.Open(file) + if err != nil { + return nil, nil, fmt.Errorf("mapping index %q: %w", idxPath, err) + } + + idx, err := packidx.Parse(mapping.Data(), objectFormat.Size()) + if err != nil { + _ = mapping.Close() + + return nil, nil, fmt.Errorf("parsing index %q: %w", idxPath, err) + } + + return &idx, mapping, nil +} + +func mapPath(path string) (*mmap.Mmap, error) { + file, err := os.Open(path) //#nosec G304 + if err != nil { + return nil, fmt.Errorf("opening pack %q: %w", path, err) + } + + defer func() { _ = file.Close() }() + + mapping, err := mmap.Open(file) + if err != nil { + return nil, fmt.Errorf("mapping pack %q: %w", path, err) + } + + return mapping, nil +} + +func (explainer *explainer) printf(format string, args ...any) { + utils.BestEffortFprintf(explainer.out, format, args...) +} + +func (explainer *explainer) explain() error { + hashSize := explainer.objectFormat.Size() + + if len(explainer.data) < packfile.HeaderLen+hashSize { + return fmt.Errorf("pack is too short to contain a header and a %d-byte trailer", hashSize) + } + + count, err := explainer.explainHeader() + if err != nil { + return err + } + + cursor := packfile.HeaderLen + + for num := 1; num <= count; num++ { + next, err := explainer.explainEntry(num, count, cursor) + if err != nil { + return err + } + + cursor = next + } + + return explainer.explainTrailer(cursor) +} + +func (explainer *explainer) explainHeader() (int, error) { + header, err := packfile.ParseHeader(explainer.data[:packfile.HeaderLen]) + if err != nil { + return 0, fmt.Errorf("pack header: %w", err) + } + + explainer.printf("pack header\n") + explainer.printf("\tmagic\t\"PACK\"\n") + explainer.printf("\tversion\t2\n") + explainer.printf("\tobjects\t%d\n", header.ObjectCount) + explainer.printf("\n") + + return int(header.ObjectCount), nil +} + +func (explainer *explainer) explainTrailer(cursor int) error { + hashSize := explainer.objectFormat.Size() + trailerStart := len(explainer.data) - hashSize + + if cursor != trailerStart { + explainer.printf( + "note\t%d byte(s) between the last entry and the trailer were unaccounted for\n", + trailerStart-cursor, + ) + } + + trailer := explainer.data[trailerStart:] + + explainer.printf("pack trailer\n") + explainer.printf("\tchecksum\t%s\n", hex.EncodeToString(trailer)) + + hashImpl, err := explainer.objectFormat.New() + if err != nil { + return fmt.Errorf("object/store: %w", err) + } + + _, _ = hashImpl.Write(explainer.data[:trailerStart]) + + if bytes.Equal(hashImpl.Sum(nil), trailer) { + explainer.printf("\trecomputed\tmatches\n") + } else { + explainer.printf("\trecomputed\tMISMATCH (corrupt pack or wrong -format)\n") + } + + return nil +} diff --git a/cmd/explain-pack/resolve.go b/cmd/explain-pack/resolve.go new file mode 100644 index 00000000..4396fe19 --- /dev/null +++ b/cmd/explain-pack/resolve.go @@ -0,0 +1,161 @@ +package main + +import ( + "fmt" + + "lindenii.org/go/furgit/internal/cache/clock" + "lindenii.org/go/furgit/internal/format/packfile" + "lindenii.org/go/furgit/internal/format/packfile/delta" + "lindenii.org/go/furgit/object/header" + "lindenii.org/go/furgit/object/id" + "lindenii.org/go/lgo/intconv" +) + +const baseCacheMaxWeight = 64 << 20 + +type resolvedBase struct { + entryType packfile.EntryType + content []byte +} + +type baseCache = clock.Clock[int, resolvedBase] + +func newBaseCache() *baseCache { + return clock.New(baseCacheMaxWeight, func(_ int, base resolvedBase) uint64 { + return uint64(len(base.content)) + 32 + }) +} + +func (explainer *explainer) reconstruct(offset, depth int) (packfile.EntryType, []byte, bool, error) { + var zero packfile.EntryType + + if depth > delta.MaxChainDepth { + return zero, nil, false, fmt.Errorf("delta chain too deep at offset %d", offset) + } + + if cached, ok := explainer.cache.Get(offset); ok { + return cached.entryType, cached.content, true, nil + } + + header, err := packfile.ParseEntryHeader(explainer.data[offset:], explainer.objectFormat.Size()) + if err != nil { + return zero, nil, false, fmt.Errorf("entry at offset %d: %w", offset, err) + } + + payloadStart := offset + header.HeaderLen + if payloadStart > len(explainer.data) { + return zero, nil, false, fmt.Errorf("entry at offset %d: header runs past end of pack", offset) + } + + if header.Type.IsBase() { + content, _, err := inflateAt(explainer.data[payloadStart:]) + if err != nil { + return zero, nil, false, fmt.Errorf("entry at offset %d: %w", offset, err) + } + + explainer.cache.Add(offset, resolvedBase{entryType: header.Type, content: content}) + + return header.Type, content, true, nil + } + + baseOffset, ok, err := explainer.baseOffset(offset, header) + if err != nil { + return zero, nil, false, err + } + + if !ok { + return zero, nil, false, nil + } + + baseType, baseContent, ok, err := explainer.reconstruct(baseOffset, depth+1) + if err != nil || !ok { + return zero, nil, ok, err + } + + payload, _, err := inflateAt(explainer.data[payloadStart:]) + if err != nil { + return zero, nil, false, fmt.Errorf("entry at offset %d: %w", offset, err) + } + + content, err := delta.Apply(baseContent, payload) + if err != nil { + return zero, nil, false, fmt.Errorf("entry at offset %d: %w", offset, err) + } + + explainer.cache.Add(offset, resolvedBase{entryType: baseType, content: content}) + + return baseType, content, true, nil +} + +func (explainer *explainer) baseOffset(offset int, header packfile.EntryHeader) (int, bool, error) { + switch header.Type { + case packfile.EntryTypeOfsDelta: + dist, err := intconv.Uint64ToInt(header.OfsDistance) + if err != nil || dist <= 0 || dist > offset { + return 0, false, fmt.Errorf("entry at offset %d: ofs-delta base out of bounds", offset) + } + + return offset - dist, true, nil + case packfile.EntryTypeRefDelta: + refBytes := header.RefBase[:explainer.objectFormat.Size()] + + if explainer.idx != nil { + baseOffsetU, found, err := explainer.idx.Lookup(refBytes) + if err != nil { + return 0, false, fmt.Errorf("entry at offset %d: index lookup: %w", offset, err) + } + + if found { + baseOffset, err := intconv.Uint64ToInt(baseOffsetU) + if err != nil { + return 0, false, fmt.Errorf("entry at offset %d: index base offset overflows int: %w", offset, err) + } + + return baseOffset, true, nil + } + } + + baseID, err := explainer.objectFormat.FromBytes(refBytes) + if err != nil { + return 0, false, fmt.Errorf("entry at offset %d: %w", offset, err) + } + + if baseOffset, found := explainer.oidIndex[baseID]; found { + return baseOffset, true, nil + } + + return 0, false, nil + case packfile.EntryTypeInvalid, + packfile.EntryTypeCommit, + packfile.EntryTypeTree, + packfile.EntryTypeBlob, + packfile.EntryTypeTag, + packfile.EntryTypeFuture: + } + + return 0, false, fmt.Errorf("entry at offset %d: not a delta entry", offset) +} + +func (explainer *explainer) recomputeOID(entryType packfile.EntryType, content []byte) (id.ObjectID, error) { + var zero id.ObjectID + + objectType, err := entryType.ObjectType() + if err != nil { + return zero, err + } + + hashImpl, err := explainer.objectFormat.New() + if err != nil { + return zero, err + } + + _, _ = hashImpl.Write(header.Append(nil, objectType, len(content))) + _, _ = hashImpl.Write(content) + + oid, err := explainer.objectFormat.FromBytes(hashImpl.Sum(nil)) + if err != nil { + return zero, err + } + + return oid, nil +} diff --git a/cmd/idx-bloom/doc.go b/cmd/idx-bloom/doc.go new file mode 100644 index 00000000..e7d4e818 --- /dev/null +++ b/cmd/idx-bloom/doc.go @@ -0,0 +1,8 @@ +// Command idx-bloom reads a Git pack index +// and writes an IDBL Bloom filter over its object IDs to stdout. +// +// With an index filename argument the index is read from that file; +// with no argument it is read from stdin. +// A pack index does not record its object format, +// so the format must be given with -format. +package main diff --git a/cmd/idx-bloom/main.go b/cmd/idx-bloom/main.go new file mode 100644 index 00000000..fa471237 --- /dev/null +++ b/cmd/idx-bloom/main.go @@ -0,0 +1,99 @@ +package main + +import ( + "flag" + "fmt" + "io" + "os" + + "lindenii.org/go/furgit/internal/format/packidx" + "lindenii.org/go/furgit/internal/format/packidx/bloom" + "lindenii.org/go/furgit/object/id" +) + +func main() { + format := flag.String("format", "", "object format of the index: sha1 or sha256 (required)") + + flag.Parse() + + err := run(*format, flag.Args(), os.Stdin, os.Stdout) + if err != nil { + fmt.Fprintln(os.Stderr, "idx-bloom:", err) + os.Exit(1) + } +} + +func run(format string, args []string, stdin io.Reader, stdout io.Writer) error { + if format == "" { + return fmt.Errorf("the -format flag is required (sha1 or sha256)") + } + + objectFormat, err := id.ParseObjectFormat(format) + if err != nil { + return fmt.Errorf("invalid -format %q: %w", format, err) + } + + if len(args) > 1 { + return fmt.Errorf("at most one index file argument is accepted, got %d", len(args)) + } + + data, err := readInput(args, stdin) + if err != nil { + return err + } + + index, err := packidx.Parse(data, objectFormat.Size()) + if err != nil { + return fmt.Errorf("parsing index: %w", err) + } + + filter, err := buildFilter(objectFormat, &index) + if err != nil { + return err + } + + _, err = stdout.Write(filter) + if err != nil { + return fmt.Errorf("writing filter: %w", err) + } + + return nil +} + +func readInput(args []string, stdin io.Reader) ([]byte, error) { + if len(args) == 0 { + data, err := io.ReadAll(stdin) + if err != nil { + return nil, fmt.Errorf("reading index from stdin: %w", err) + } + + return data, nil + } + + data, err := os.ReadFile(args[0]) //#nosec G304 + if err != nil { + return nil, fmt.Errorf("reading index %q: %w", args[0], err) + } + + return data, nil +} + +func buildFilter(objectFormat id.ObjectFormat, index *packidx.Packidx) ([]byte, error) { + objects := index.NumObjects() + + bucketCount, k, err := bloom.RecommendParams(objectFormat, objects) + if err != nil { + return nil, fmt.Errorf("choosing parameters: %w", err) + } + + builder, err := bloom.NewBuilder(objectFormat, bucketCount, k, index.PackHash()) + if err != nil { + return nil, fmt.Errorf("creating builder: %w", err) + } + + for pos := range objects { + builder.Add(index.OIDAt(pos)) + } + + return builder.Bytes(), nil +} diff --git a/internal/format/packidx/bloom/bloom.go b/internal/format/packidx/bloom/bloom.go new file mode 100644 index 00000000..8b608221 --- /dev/null +++ b/internal/format/packidx/bloom/bloom.go @@ -0,0 +1,180 @@ +package bloom + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "math/bits" + + "lindenii.org/go/furgit/object/id" +) + +// ErrMalformedBloomFilter reports that +// a Bloom filter is truncated, +// has a bad signature, version, or hash function, +// or has inconsistent parameters. +var ErrMalformedBloomFilter = errors.New("internal/format/packidx/bloom: malformed bloom filter") + +const ( + signature = 0x4944424c // "IDBL" + version = 1 + + // HeaderLen is the fixed header length in octets, + // i.e., the signature, version, hash function identifier, + // B, K, and the trailing zero padding. + HeaderLen = 64 + + // BucketLen is the length of one bucket in octets, + // chosen to match the most common cache-line size. + BucketLen = 64 + + // wordBits is the bit width of one bucket word. + wordBits = 64 + + // fieldBits is the width of one in-bucket position field. + fieldBits = 9 +) + +// checkParams validates the filter parameters +// against one object hash size, +// returning log2(bucketCount) on success. +func checkParams(bucketCount uint32, k uint16, hashSize int) (uint, error) { + switch { + case bucketCount == 0 || bucketCount&(bucketCount-1) != 0: + return 0, errors.New("bucket count not a nonzero power of two") //nolint:err113 + case k == 0: + return 0, errors.New("zero probe count") //nolint:err113 + } + + log2B := uint(bits.TrailingZeros32(bucketCount)) + if log2B+fieldBits*uint(k) > uint(hashSize)*8 { + return 0, errors.New("parameters exceed hash length") //nolint:err113 + } + + return log2B, nil +} + +// hashFunctionID returns the on-disk hash function identifier +// for one object format. +func hashFunctionID(objectFormat id.ObjectFormat) (uint32, error) { + switch objectFormat { + case id.ObjectFormatSHA1: + return 1, nil + case id.ObjectFormatSHA256: + return 2, nil + case id.ObjectFormatUnknown: + } + + return 0, id.ErrInvalidObjectFormat +} + +// Bloom is a parsed blocked Bloom filter view over borrowed bytes. +// +// Labels: Deps-Borrowed, Life-Parent, MT-Safe. +type Bloom struct { + // data is the entire filter payload. + data []byte + + // buckets is the bucket region, between the header and the trailer. + buckets []byte + + // objectFormat is the filter's object format. + objectFormat id.ObjectFormat + + // log2B is the base-2 logarithm of the bucket count, + // i.e. the number of leading object ID bits that select a bucket. + log2B uint + + // k is the number of bits set and tested per object ID. + k int +} + +// Parse parses one Bloom filter from data. +// +// Labels: Deps-Borrowed, Life-Parent. +func Parse(data []byte, objectFormat id.ObjectFormat) (Bloom, error) { + var zero Bloom + + wantHashID, err := hashFunctionID(objectFormat) + if err != nil { + return zero, err + } + + hashSize := objectFormat.Size() + + if len(data) < HeaderLen { + return zero, fmt.Errorf("%w: truncated", ErrMalformedBloomFilter) + } + + if binary.BigEndian.Uint32(data) != signature { + return zero, fmt.Errorf("%w: bad signature", ErrMalformedBloomFilter) + } + + if binary.BigEndian.Uint32(data[4:]) != version { + return zero, fmt.Errorf("%w: unsupported version", ErrMalformedBloomFilter) + } + + if binary.BigEndian.Uint32(data[8:]) != wantHashID { + return zero, fmt.Errorf("%w: hash function mismatch", ErrMalformedBloomFilter) + } + + bucketCount := binary.BigEndian.Uint32(data[12:]) + k := binary.BigEndian.Uint16(data[16:]) + + for _, octet := range data[18:HeaderLen] { + if octet != 0 { + return zero, fmt.Errorf("%w: nonzero padding", ErrMalformedBloomFilter) + } + } + + log2B, err := checkParams(bucketCount, k, hashSize) + if err != nil { + return zero, fmt.Errorf("%w: %w", ErrMalformedBloomFilter, err) + } + + want := uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize) //#nosec G115 + if uint64(len(data)) != want { + return zero, fmt.Errorf("%w: file size disagrees with bucket count", ErrMalformedBloomFilter) + } + + return Bloom{ + data: data, + buckets: data[HeaderLen : len(data)-2*hashSize], + objectFormat: objectFormat, + log2B: log2B, + k: int(k), + }, nil +} + +// PackHash returns the pack hash recorded in the filter trailer. +// +// Labels: Life-Parent, Mut-No. +func (f *Bloom) PackHash() []byte { + hashSize := f.objectFormat.Size() + end := len(f.data) - hashSize + + return f.data[end-hashSize : end] +} + +// Verify recomputes the filter's trailing checksum and reports any mismatch. +// +// Verify reads the whole filter, +// so callers should treat it as a deliberate integrity check +// rather than part of the open path. +func (f *Bloom) Verify() error { + hashImpl, err := f.objectFormat.New() + if err != nil { + return fmt.Errorf("internal/format/packidx/bloom: %w", err) + } + + checksumOff := len(f.data) - f.objectFormat.Size() + + _, _ = hashImpl.Write(f.data[:checksumOff]) + + if !bytes.Equal(hashImpl.Sum(nil), f.data[checksumOff:]) { + return fmt.Errorf("%w: checksum mismatch", ErrMalformedBloomFilter) + } + + return nil +} diff --git a/internal/format/packidx/bloom/bloom_test.go b/internal/format/packidx/bloom/bloom_test.go new file mode 100644 index 00000000..bcfb4419 --- /dev/null +++ b/internal/format/packidx/bloom/bloom_test.go @@ -0,0 +1,159 @@ +package bloom_test + +import ( + "encoding/binary" + "errors" + "testing" + + "lindenii.org/go/furgit/internal/format/packidx/bloom" + "lindenii.org/go/furgit/object/id" +) + +func validFilter(t *testing.T, format id.ObjectFormat) []byte { + // TODO: maybe testgit should have something like this? + t.Helper() + + builder, err := bloom.NewBuilder(format, 4, 2, make([]byte, format.Size())) + if err != nil { + t.Fatal(err) + } + + return builder.Bytes() +} + +func otherFormat(t *testing.T, format id.ObjectFormat) id.ObjectFormat { + t.Helper() + + for _, candidate := range id.SupportedObjectFormats() { + if candidate != format { + return candidate + } + } + + t.Skip("only one supported object format") + + return id.ObjectFormatUnknown +} + +func TestParseValid(t *testing.T) { + t.Parallel() + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + _, err := bloom.Parse(validFilter(t, format), format) + if err != nil { + t.Fatalf("Parse rejected a valid filter: %v", err) + } + }) + } +} + +func TestParseMalformed(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + mangle func(data []byte) []byte + }{ + {"truncated", func(data []byte) []byte { return data[:bloom.HeaderLen-1] }}, + {"bad signature", func(data []byte) []byte { + data[0] ^= 0xff + + return data + }}, + {"bad version", func(data []byte) []byte { + binary.BigEndian.PutUint32(data[4:], 99) + + return data + }}, + {"non power of two", func(data []byte) []byte { + binary.BigEndian.PutUint32(data[12:], 3) + + return data + }}, + {"zero probe count", func(data []byte) []byte { + binary.BigEndian.PutUint16(data[16:], 0) + + return data + }}, + {"parameters exceed hash", func(data []byte) []byte { + binary.BigEndian.PutUint32(data[12:], 1<<31) + binary.BigEndian.PutUint16(data[16:], 30) + + return data + }}, + {"nonzero padding", func(data []byte) []byte { + data[20] = 1 + + return data + }}, + {"size disagrees", func(data []byte) []byte { return data[:len(data)-1] }}, + } + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + data := tc.mangle(append([]byte(nil), validFilter(t, format)...)) + + _, err := bloom.Parse(data, format) + if !errors.Is(err, bloom.ErrMalformedBloomFilter) { + t.Fatalf("Parse error = %v, want ErrMalformedBloomFilter", err) + } + }) + } + }) + } +} + +// TestVerifyDetectsCorruption checks that Verify accepts a sound filter +// and rejects one whose bucket bytes have been altered. +func TestVerifyDetectsCorruption(t *testing.T) { + t.Parallel() + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + data := validFilter(t, format) + + filter, err := bloom.Parse(data, format) + if err != nil { + t.Fatal(err) + } + + err = filter.Verify() + if err != nil { + t.Fatalf("Verify on a sound filter: %v", err) + } + + data[bloom.HeaderLen] ^= 0xff + + err = filter.Verify() + if !errors.Is(err, bloom.ErrMalformedBloomFilter) { + t.Fatalf("Verify error = %v, want ErrMalformedBloomFilter", err) + } + }) + } +} + +func TestParseHashMismatch(t *testing.T) { + t.Parallel() + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + _, err := bloom.Parse(validFilter(t, format), otherFormat(t, format)) + if !errors.Is(err, bloom.ErrMalformedBloomFilter) { + t.Fatalf("Parse error = %v, want ErrMalformedBloomFilter", err) + } + }) + } +} diff --git a/internal/format/packidx/bloom/doc.go b/internal/format/packidx/bloom/doc.go new file mode 100644 index 00000000..06ca57cd --- /dev/null +++ b/internal/format/packidx/bloom/doc.go @@ -0,0 +1,138 @@ +// Package bloom provides a blocked Bloom filter +// for pack indexes. +// +// A filter answers, from a single cache-line-sized read, +// whether an object ID is definitely absent from the index it covers. +// A lookup that must consult many packs +// can then skip the full binary search +// in every pack whose filter rejects the object, +// decreasing the cost of misses. +// +// # Rationale +// +// Especially for server-side usage, +// repacking is expensive, +// and creating multi-pack-indexes is still rather expensive. +// Incremental multi-pack-indexes partially solve this, +// but having too many of them defeats the purpose, +// since the indexes must still be walked in order +// while performing expensive lookups. +// +// Instead, each multi-pack-index layer, +// and each ordinary pack index, +// may carry its own filter. +// The indexes are still traversed in their usual order, +// but the first step when traversing one +// is to check whether it could possibly hold the wanted object. +// +// The filter is split into 64-octet buckets, +// matching the most common cache-line size. +// Some bits of the object ID choose the bucket, +// and the rest choose several bit positions inside it, +// so a lookup reads one 64-octet bucket +// and checks whether all required bits are set. +// +// # Parameters +// +// A filter is parameterized by +// the number of buckets B +// and the number of bits set and tested per object ID, K. +// All integers in the format are big endian. +// The object ID is interpreted as a big-endian bitstring, +// where bit offset 0 is the most significant bit of octet 0. +// B must be a nonzero power of two, +// K must be nonzero, +// and log2(B) + 9*K must not exceed the hash length in bits. +// +// # File format +// +// A filter file is a 64-octet header, +// then B buckets of 64 octets each, +// then a two-hash trailer: +// +// - 4-octet signature: {'I', 'D', 'B', 'L'}. +// - 4-octet version identifier (= 1). +// - 4-octet object hash algorithm identifier +// (= 1 for SHA-1, 2 for SHA-256). +// - 4-octet B, the number of buckets. +// - 2-octet K, the number of bits set and tested per object ID. +// - 46-octet padding, which must be all zero. +// - B buckets of 64 octets each. +// - the pack trailer hash, which binds the filter to its pack. +// - the checksum of everything before it, over the filter's hash function. +// +// The hash length is that of the object format, +// so the trailer is 2 hashes wide +// and the file size is exactly 64 + 64*B + 2*hashlen octets. +// +// A reader must validate that +// the signature matches, +// the version is supported, +// the hash function identifier is recognized, +// B is nonzero and a power of two, +// K is nonzero, +// log2(B) + 9*K does not exceed the hash length in bits, +// the padding is all zero, +// and the file size is exactly 64 + 64*B + 2*hashlen octets. +// +// # Binding and integrity +// +// The pack hash binds a filter to one pack; +// a reader trusts a filter only when the recorded pack hash +// matches the pack it accompanies. +// +// The checksum guards against corruption of the filter itself. +// Recomputing it reads the whole file and rehashes it as fsck. +// +// # Lookup +// +// A lookup against one filter proceeds as follows: +// +// 1. Let b be the unsigned integer encoded +// by the most significant log2(B) bits of the object ID. +// B is a power of two, so 0 <= b < B. +// 2. Select and read bucket b. +// 3. For each 0 <= i < K, +// take the i-th 9-bit field +// from the 9*K bits that follow the bucket-selecting bits, +// and let pi be the unsigned integer it encodes, +// so 0 <= pi < 512. +// Compute wi = pi >> 6 and bi = pi & 63, +// so wi identifies one of the eight 64-bit words in bucket b +// and bi identifies one bit within that word. +// Within each 64-bit word, +// bit index 0 is the most significant bit +// and bit index 63 is the least significant bit. +// Test whether bit bi is set in word wi of bucket b. +// +// If any test fails, +// the object ID is definitely not in the covered index. +// If all tests succeed, +// the object ID may be in it. +// Two of the K 9-bit fields can decode to the same pi, +// so an insertion may set fewer than K distinct bits; +// this only raises the false positive rate +// and never causes a false negative. +// +// # Worked example +// +// Let B = 1 << 15 = 32768 and K = 8. +// Then log2(B) = 15, +// so each lookup uses 15 bits to choose the bucket +// and 8*9 = 72 bits to choose the in-bucket positions, +// for a total of 87 bits taken from the object ID. +// A SHA-1 has 160 bits and a SHA-256 has 256 bits, +// so both leave ample headroom. +// +// # Security considerations +// +// Object IDs are public unkeyed hashes, +// so an adversary can mine packs +// whose object IDs share a chosen prefix +// to crowd objects into one bucket +// and fill its bits. +// In the worst case this renders some buckets useless, +// making the filter degrade to "may contain" for those buckets, +// but it never produces a false negative +// and is not a significant denial-of-service vector. +package bloom diff --git a/internal/format/packidx/bloom/lookup.go b/internal/format/packidx/bloom/lookup.go new file mode 100644 index 00000000..4a8d7bda --- /dev/null +++ b/internal/format/packidx/bloom/lookup.go @@ -0,0 +1,42 @@ +package bloom + +import ( + "encoding/binary" +) + +// MayContain reports whether oid may be present +// in the index covered by the filter. +// +// oid must be exactly the filter's hash size; +// MayContain panics otherwise. +// +// Labels: Mut-No. +func (f *Bloom) MayContain(oid []byte) bool { + if len(oid) != f.objectFormat.Size() { + panic("internal/format/packidx/bloom: invalid object ID length") + } + + base := int(binary.BigEndian.Uint32(oid[:4])>>(32-f.log2B)) * BucketLen + + for i := range f.k { + word, mask := probe(oid, f.log2B, i) + if binary.BigEndian.Uint64(f.buckets[base+word*8:])&mask == 0 { + return false + } + } + + return true +} + +// probe returns the bucket word index and single-bit mask +// addressed by the i-th probe of oid. +func probe(oid []byte, log2B uint, i int) (word int, mask uint64) { + bitOff := log2B + fieldBits*uint(i) + byteOff := bitOff >> 3 + bitInByte := bitOff & 7 + + window := uint32(oid[byteOff])<<8 | uint32(oid[byteOff+1]) + pi := (window >> (16 - bitInByte - fieldBits)) & 0x1ff + + return int(pi >> 6), 1 << (wordBits - 1 - (pi & 63)) +} diff --git a/internal/format/packidx/bloom/lookup_test.go b/internal/format/packidx/bloom/lookup_test.go new file mode 100644 index 00000000..e6264f9a --- /dev/null +++ b/internal/format/packidx/bloom/lookup_test.go @@ -0,0 +1,32 @@ +package bloom_test + +import ( + "testing" + + "lindenii.org/go/furgit/internal/format/packidx/bloom" + "lindenii.org/go/furgit/object/id" +) + +func TestMayContainBadLength(t *testing.T) { + t.Parallel() + + format := id.ObjectFormatSHA256 + + builder, err := bloom.NewBuilder(format, 4, 2, make([]byte, format.Size())) + if err != nil { + t.Fatal(err) + } + + filter, err := bloom.Parse(builder.Bytes(), format) + if err != nil { + t.Fatal(err) + } + + defer func() { + if recover() == nil { + t.Fatal("MayContain did not panic on a short object ID") + } + }() + + filter.MayContain(make([]byte, format.Size()-1)) +} diff --git a/internal/format/packidx/bloom/roundtrip_test.go b/internal/format/packidx/bloom/roundtrip_test.go new file mode 100644 index 00000000..28db17a5 --- /dev/null +++ b/internal/format/packidx/bloom/roundtrip_test.go @@ -0,0 +1,92 @@ +package bloom_test + +import ( + "bytes" + "encoding/binary" + "testing" + + "lindenii.org/go/furgit/internal/format/packidx/bloom" + "lindenii.org/go/furgit/object/id" +) + +func makeOID(size int, seed uint64) []byte { + out := make([]byte, size) + state := seed + + for i := 0; i < size; i += 8 { + state = state*6364136223846793005 + 1442695040888963407 + + var word [8]byte + + binary.BigEndian.PutUint64(word[:], state) + copy(out[i:], word[:]) + } + + return out +} + +func TestRoundTrip(t *testing.T) { + t.Parallel() + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + const objects = 10000 + + bucketCount, k, err := bloom.RecommendParams(format, objects) + if err != nil { + t.Fatal(err) + } + + size := format.Size() + packHash := makeOID(size, 0xC0FFEE) + + builder, err := bloom.NewBuilder(format, bucketCount, k, packHash) + if err != nil { + t.Fatal(err) + } + + for i := range objects { + builder.Add(makeOID(size, uint64(i))) + } + + filter, err := bloom.Parse(builder.Bytes(), format) + if err != nil { + t.Fatal(err) + } + + if !bytes.Equal(filter.PackHash(), packHash) { + t.Fatalf("PackHash = %x, want %x", filter.PackHash(), packHash) + } + + err = filter.Verify() + if err != nil { + t.Fatalf("Verify on a freshly built filter: %v", err) + } + + for i := range objects { + if !filter.MayContain(makeOID(size, uint64(i))) { + t.Fatalf("false negative for added object %d", i) + } + } + + const probes = 10000 + + falsePositives := 0 + + for i := range probes { + if filter.MayContain(makeOID(size, uint64(1)<<40+uint64(i))) { + falsePositives++ + } + } + + rate := float64(falsePositives) / float64(probes) + if rate > 0.05 { + t.Errorf("false positive rate %.4f exceeds 0.05", rate) + } + + t.Logf("B=%d K=%d false positive rate %.4f", bucketCount, k, rate) + }) + } +} diff --git a/internal/format/packidx/bloom/write.go b/internal/format/packidx/bloom/write.go new file mode 100644 index 00000000..e6213a2c --- /dev/null +++ b/internal/format/packidx/bloom/write.go @@ -0,0 +1,164 @@ +package bloom + +import ( + "encoding/binary" + "errors" + "fmt" + "hash" + "math/bits" + + "lindenii.org/go/furgit/object/id" + "lindenii.org/go/lgo/intconv" +) + +// ErrInvalidParameters reports that +// the parameters supplied for a filter build +// are not representable in the format. +var ErrInvalidParameters = errors.New("internal/format/packidx/bloom: invalid parameters") + +// defaultK is the probe count used by [RecommendParams]. +// +// With 512-bit buckets it keeps the false positive rate near one percent +// at the target bucket load. +const defaultK = 8 + +// targetLoad is the object count per bucket that [RecommendParams] aims for. +const targetLoad = 48 + +// Builder accumulates object IDs into an in-memory Bloom filter +// and serializes it. +// +// Labels: MT-Unsafe. +type Builder struct { + // data is the full filter file, header and trailer included. + data []byte + + // buckets aliases the bucket region of data, between header and trailer. + buckets []byte + + // hashImpl computes the trailing checksum and gives the hash size. + hashImpl hash.Hash + + log2B uint + k int +} + +// NewBuilder creates a filter builder +// for bucketCount buckets and k probes per object ID, +// binding the filter to packHash. +// +// bucketCount must be a nonzero power of two, +// k must be nonzero, +// and log2(bucketCount) + 9*k must not exceed the hash length in bits. +// packHash must be the pack's trailer hash; +// NewBuilder panics when its length does not match the object format. +func NewBuilder(objectFormat id.ObjectFormat, bucketCount uint32, k uint16, packHash []byte) (*Builder, error) { + hashID, err := hashFunctionID(objectFormat) + if err != nil { + return nil, err + } + + hashImpl, err := objectFormat.New() + if err != nil { + return nil, fmt.Errorf("internal/format/packidx/bloom: %w", err) + } + + hashSize := objectFormat.Size() + + if len(packHash) != hashSize { + panic("internal/format/packidx/bloom: invalid pack hash length") + } + + log2B, err := checkParams(bucketCount, k, hashSize) + if err != nil { + return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err) + } + + total, err := intconv.Uint64ToInt(uint64(HeaderLen) + uint64(BucketLen)*uint64(bucketCount) + 2*uint64(hashSize)) //#nosec G115 + if err != nil { + return nil, fmt.Errorf("%w: %w", ErrInvalidParameters, err) + } + + data := make([]byte, total) + binary.BigEndian.PutUint32(data[0:], signature) + binary.BigEndian.PutUint32(data[4:], version) + binary.BigEndian.PutUint32(data[8:], hashID) + binary.BigEndian.PutUint32(data[12:], bucketCount) + binary.BigEndian.PutUint16(data[16:], k) + + bucketsEnd := total - 2*hashSize + copy(data[bucketsEnd:], packHash) + + return &Builder{ + data: data, + buckets: data[HeaderLen:bucketsEnd], + hashImpl: hashImpl, + log2B: log2B, + k: int(k), + }, nil +} + +// Add records oid in the filter. +// +// oid must be exactly the filter's hash size; +// Add panics otherwise. +func (b *Builder) Add(oid []byte) { + if len(oid) != b.hashImpl.Size() { + panic("internal/format/packidx/bloom: invalid object ID length") + } + + base := int(binary.BigEndian.Uint32(oid[:4])>>(32-b.log2B)) * BucketLen + + for i := range b.k { + word, mask := probe(oid, b.log2B, i) + + off := base + word*8 + set := binary.BigEndian.Uint64(b.buckets[off:]) | mask + binary.BigEndian.PutUint64(b.buckets[off:], set) + } +} + +// Bytes returns the serialized filter, including its trailing checksum. +// +// Labels: Life-Parent, Mut-No. +func (b *Builder) Bytes() []byte { + checksumOff := len(b.data) - b.hashImpl.Size() + + b.hashImpl.Reset() + _, _ = b.hashImpl.Write(b.data[:checksumOff]) + b.hashImpl.Sum(b.data[checksumOff:checksumOff]) + + return b.data +} + +// RecommendParams returns filter parameters for an index of n objects, +// targeting a false positive rate near one percent. +func RecommendParams(objectFormat id.ObjectFormat, n int) (bucketCount uint32, k uint16, err error) { + hashSize := objectFormat.Size() + if hashSize == 0 { + return 0, 0, id.ErrInvalidObjectFormat + } + + const maxPow2 = uint32(1) << 31 + + wanted := uint64(0) + if n > 0 { + wanted = (uint64(n) + targetLoad - 1) / targetLoad + } + + switch { + case wanted <= 1: + bucketCount = 1 + case wanted > uint64(maxPow2): + bucketCount = maxPow2 + default: + bucketCount = uint32(1) << bits.Len64(wanted-1) + } + + _, err = checkParams(bucketCount, defaultK, hashSize) + if err != nil { + return 0, 0, fmt.Errorf("%w: %w", ErrInvalidParameters, err) + } + + return bucketCount, defaultK, nil +} diff --git a/internal/format/packidx/bloom/write_test.go b/internal/format/packidx/bloom/write_test.go new file mode 100644 index 00000000..74173921 --- /dev/null +++ b/internal/format/packidx/bloom/write_test.go @@ -0,0 +1,95 @@ +package bloom_test + +import ( + "errors" + "testing" + + "lindenii.org/go/furgit/internal/format/packidx/bloom" + "lindenii.org/go/furgit/object/id" +) + +func TestRecommendParams(t *testing.T) { + t.Parallel() + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + for _, n := range []int{0, 1, 1000, 10000, 1000000} { + bucketCount, k, err := bloom.RecommendParams(format, n) + if err != nil { + t.Fatalf("n=%d: %v", n, err) + } + + if bucketCount == 0 || bucketCount&(bucketCount-1) != 0 { + t.Errorf("n=%d: bucket count %d not a power of two", n, bucketCount) + } + + _, err = bloom.NewBuilder(format, bucketCount, k, make([]byte, format.Size())) + if err != nil { + t.Errorf("n=%d: recommended parameters rejected: %v", n, err) + } + } + }) + } +} + +func TestNewBuilderRejects(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + bucketCount uint32 + k uint16 + }{ + {"zero buckets", 0, 8}, + {"non power of two", 3, 8}, + {"zero probe count", 4, 0}, + } + + for _, format := range id.SupportedObjectFormats() { + t.Run(format.String(), func(t *testing.T) { + t.Parallel() + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + _, err := bloom.NewBuilder(format, tc.bucketCount, tc.k, make([]byte, format.Size())) + if !errors.Is(err, bloom.ErrInvalidParameters) { + t.Fatalf("error = %v, want ErrInvalidParameters", err) + } + }) + } + }) + } +} + +func TestNewBuilderBadPackHash(t *testing.T) { + t.Parallel() + + defer func() { + if recover() == nil { + t.Fatal("NewBuilder did not panic on a short pack hash") + } + }() + + _, _ = bloom.NewBuilder(id.ObjectFormatSHA256, 4, 2, make([]byte, id.ObjectFormatSHA256.Size()-1)) +} + +func TestAddBadLength(t *testing.T) { + t.Parallel() + + builder, err := bloom.NewBuilder(id.ObjectFormatSHA256, 4, 2, make([]byte, id.ObjectFormatSHA256.Size())) + if err != nil { + t.Fatal(err) + } + + defer func() { + if recover() == nil { + t.Fatal("Add did not panic on a short object ID") + } + }() + + builder.Add(make([]byte, id.ObjectFormatSHA256.Size()-1)) +} diff --git a/internal/format/packidx/lookup.go b/internal/format/packidx/lookup.go index d1293f47..e4b565b6 100644 --- a/internal/format/packidx/lookup.go +++ b/internal/format/packidx/lookup.go @@ -4,6 +4,7 @@ import ( "bytes" "encoding/binary" "fmt" + "math/bits" ) // Lookup searches the index for one object ID @@ -18,6 +19,53 @@ func (idx *Packidx) Lookup(oid []byte) (offset uint64, found bool, err error) { lo, hi := idx.fanoutRange(oid[0]) + // Object IDs are uniform for honest inputs, + // interp on next 8 octets converges in O(log log n). + // + // OIDs are public unkeyed hashes, + // so an attacker may sample/filter/mine prefix clusters, + // making interpolation mis-estimate every probe. + // See https://runxiyu.org/comp/ch4ht/ + // for why cryptographic hash algorithms are insufficient. + // + // Cap the interp at bisect's probe count and finish by bisect; + // adversarial at O(log n) plus small interpolation overhead, + // honest exit well under the cap. + target := binary.BigEndian.Uint64(oid[1:9]) + + for budget := bits.Len(uint(hi - lo)); hi-lo > 8 && budget > 0; budget-- { + loKey := binary.BigEndian.Uint64(idx.OIDAt(lo)[1:9]) + hiKey := binary.BigEndian.Uint64(idx.OIDAt(hi - 1)[1:9]) + + var mid int + + switch { + case target <= loKey: + mid = lo + case target >= hiKey: + mid = hi - 1 + default: + hi128, lo128 := bits.Mul64(target-loKey, uint64(hi-lo-1)) //#nosec G115 + q, _ := bits.Div64(hi128, lo128, hiKey-loKey) + mid = lo + int(q) //#nosec G115 + } + + switch cmp := bytes.Compare(oid, idx.OIDAt(mid)); { + case cmp == 0: + offset, err = idx.OffsetAt(mid) + if err != nil { + return 0, false, err + } + + return offset, true, nil + case cmp < 0: + hi = mid + default: + lo = mid + 1 + } + } + + // Interpolation narrowed or capped; bisect to finish. for lo < hi { mid := lo + (hi-lo)/2 diff --git a/object/store/packed/internal/ingest/finalize.go b/object/store/packed/internal/ingest/finalize.go index f0ab6622..afed996c 100644 --- a/object/store/packed/internal/ingest/finalize.go +++ b/object/store/packed/internal/ingest/finalize.go @@ -8,6 +8,7 @@ import ( "slices" "lindenii.org/go/furgit/internal/format/packidx" + "lindenii.org/go/furgit/internal/format/packidx/bloom" "lindenii.org/go/furgit/internal/format/packrev" "lindenii.org/go/furgit/object/id" "lindenii.org/go/lgo/intconv" @@ -38,12 +39,27 @@ func (ingestion *ingestion) finalize() (Result, error) { return Result{}, err } + bloomBuilder, err := ingestion.buildBloom(entries, packHash) + if err != nil { + return Result{}, err + } + + bloomTmp, err := ingestion.writeTemp("tmp_bloom_", func(w io.Writer) error { + _, err := w.Write(bloomBuilder.Bytes()) + + return err + }) + if err != nil { + return Result{}, err + } + base := "pack-" + ingestion.packHash.String() packFinal := base + ".pack" idxFinal := base + ".idx" revFinal := base + ".rev" + bloomFinal := base + ".bloom" - // Link the pack and reverse index before the index, + // Link the pack, reverse index, and Bloom filter before the index, // since the index is what publishes the pack to readers. err = ingestion.link(ingestion.packTmp, packFinal) if err != nil { @@ -55,6 +71,11 @@ func (ingestion *ingestion) finalize() (Result, error) { return Result{}, err } + err = ingestion.link(bloomTmp, bloomFinal) + if err != nil { + return Result{}, err + } + err = ingestion.link(idxTmp, idxFinal) if err != nil { return Result{}, err @@ -69,12 +90,34 @@ func (ingestion *ingestion) finalize() (Result, error) { PackName: packFinal, IdxName: idxFinal, RevName: revFinal, + BloomName: bloomFinal, PackHash: ingestion.packHash, ObjectCount: objectCount, ThinFixed: ingestion.thinFixed, }, nil } +// buildBloom builds a Bloom filter over the index entries' object IDs, +// bound to packHash. +func (ingestion *ingestion) buildBloom(entries []packidx.Entry, packHash []byte) (*bloom.Builder, error) { + bucketCount, k, err := bloom.RecommendParams(ingestion.objectFormat, len(entries)) + if err != nil { + return nil, fmt.Errorf("object/store/packed/internal/ingest: %w", err) + } + + builder, err := bloom.NewBuilder(ingestion.objectFormat, bucketCount, k, packHash) + if err != nil { + return nil, fmt.Errorf("object/store/packed/internal/ingest: %w", err) + } + + size := ingestion.objectFormat.Size() + for i := range entries { + builder.Add(entries[i].OID[:size]) + } + + return builder, nil +} + // indexEntries returns the index entries in object-ID order // and, for each record in pack order, its position in that index order. func (ingestion *ingestion) indexEntries() ([]packidx.Entry, []uint32, error) { diff --git a/object/store/packed/internal/ingest/result.go b/object/store/packed/internal/ingest/result.go index 0ae5593a..9cd6ef1d 100644 --- a/object/store/packed/internal/ingest/result.go +++ b/object/store/packed/internal/ingest/result.go @@ -13,6 +13,9 @@ type Result struct { // RevName is the destination-relative name of the written reverse index. RevName string + // BloomName is the destination-relative name of the written Bloom filter. + BloomName string + // PackHash is the pack trailer hash // shared by the pack, index, and reverse index. PackHash id.ObjectID diff --git a/object/store/packed/internal/ingest/writepack_test.go b/object/store/packed/internal/ingest/writepack_test.go index 394d8f6e..adc0ba35 100644 --- a/object/store/packed/internal/ingest/writepack_test.go +++ b/object/store/packed/internal/ingest/writepack_test.go @@ -8,6 +8,8 @@ import ( "path/filepath" "testing" + "lindenii.org/go/furgit/internal/format/packidx" + "lindenii.org/go/furgit/internal/format/packidx/bloom" "lindenii.org/go/furgit/internal/testgit" "lindenii.org/go/furgit/object/id" "lindenii.org/go/furgit/object/store" @@ -89,6 +91,81 @@ func TestWritePackMatchesGit(t *testing.T) { } } +// TestWritePackBloom verifies that ingesting a pack writes a Bloom filter +// that reports every object in the pack as present. +func TestWritePackBloom(t *testing.T) { + t.Parallel() + + for _, objectFormat := range id.SupportedObjectFormats() { + t.Run(objectFormat.String(), func(t *testing.T) { + t.Parallel() + + repo, err := testgit.NewRepo(t, testgit.RepoOptions{ObjectFormat: objectFormat}) + if err != nil { + t.Fatalf("NewRepo: %v", err) + } + + seeded, err := repo.SeedHistory(t) + if err != nil { + t.Fatalf("SeedHistory: %v", err) + } + + gitPrefix, err := repo.PackObjects(t, seeded.All(), testgit.PackObjectsOptions{ + RevIndex: true, + Revs: false, + Exclude: nil, + }) + if err != nil { + t.Fatalf("PackObjects: %v", err) + } + + stream, err := os.ReadFile(gitPrefix + ".pack") //nolint:gosec + if err != nil { + t.Fatalf("ReadFile pack: %v", err) + } + + dir, result := writePack(t, objectFormat, bytes.NewReader(stream), store.PackWriteOptions{ + ThinBase: nil, + Progress: nil, + }) + + if result.BloomName == "" { + t.Fatal("BloomName is empty") + } + + bloomBytes, err := os.ReadFile(filepath.Join(dir, result.BloomName)) //nolint:gosec + if err != nil { + t.Fatalf("ReadFile bloom: %v", err) + } + + filter, err := bloom.Parse(bloomBytes, objectFormat) + if err != nil { + t.Fatalf("bloom.Parse: %v", err) + } + + idxBytes, err := os.ReadFile(filepath.Join(dir, result.IdxName)) //nolint:gosec + if err != nil { + t.Fatalf("ReadFile idx: %v", err) + } + + index, err := packidx.Parse(idxBytes, objectFormat.Size()) + if err != nil { + t.Fatalf("packidx.Parse: %v", err) + } + + if !bytes.Equal(filter.PackHash(), index.PackHash()) { + t.Fatalf("filter pack hash %x, want %x", filter.PackHash(), index.PackHash()) + } + + for pos := range index.NumObjects() { + if !filter.MayContain(index.OIDAt(pos)) { + t.Fatalf("filter rejects object at index position %d", pos) + } + } + }) + } +} + // TestWritePackEmpty verifies that a zero-object pack // succeeds without writing any artifacts. func TestWritePackEmpty(t *testing.T) { diff --git a/object/store/packed/lookup.go b/object/store/packed/lookup.go index e54d34b2..e06870a9 100644 --- a/object/store/packed/lookup.go +++ b/object/store/packed/lookup.go @@ -24,6 +24,10 @@ func (packed *Packed) lookup(objectID id.ObjectID) (*pack, int, error) { oid := objectID.RawBytes() for _, p := range packed.order.Keys() { + if p.filter != nil && !p.filter.MayContain(oid) { + continue + } + offsetU, found, err := p.idx.Lookup(oid) if err != nil { return nil, 0, fmt.Errorf("%w: pack %q: %w", ErrMalformedPackedStore, p.name, err) diff --git a/object/store/packed/pack.go b/object/store/packed/pack.go index dd43bc7a..9cd6162b 100644 --- a/object/store/packed/pack.go +++ b/object/store/packed/pack.go @@ -8,6 +8,7 @@ import ( "lindenii.org/go/furgit/internal/format/packfile" "lindenii.org/go/furgit/internal/format/packidx" + "lindenii.org/go/furgit/internal/format/packidx/bloom" "lindenii.org/go/furgit/internal/mmap" "lindenii.org/go/furgit/object/id" "lindenii.org/go/lgo/intconv" @@ -36,6 +37,9 @@ type pack struct { // and data aliases them. dataMapping *mmap.Mmap data []byte + + bloomMapping *mmap.Mmap + filter *bloom.Bloom } // openPack opens, maps, and validates @@ -69,15 +73,41 @@ func openPack(root *os.Root, name string, objectFormat id.ObjectFormat) (*pack, return nil, fmt.Errorf("%w: pack %q: %w", ErrMalformedPackedStore, name, err) } + bloomMapping, filter := openBloom(root, name, objectFormat, idx.PackHash()) + return &pack{ - name: name, - idxMapping: idxMapping, - idx: idx, - dataMapping: dataMapping, - data: dataMapping.Data(), + name: name, + idxMapping: idxMapping, + idx: idx, + dataMapping: dataMapping, + data: dataMapping.Data(), + bloomMapping: bloomMapping, + filter: filter, }, nil } +func openBloom(root *os.Root, name string, objectFormat id.ObjectFormat, packHash []byte) (*mmap.Mmap, *bloom.Bloom) { + mapping, err := mapFile(root, name+".bloom") + if err != nil { + return nil, nil + } + + filter, err := bloom.Parse(mapping.Data(), objectFormat) + if err != nil { + _ = mapping.Close() + + return nil, nil + } + + if !bytes.Equal(filter.PackHash(), packHash) { + _ = mapping.Close() + + return nil, nil + } + + return mapping, &filter +} + // mapFile opens and maps one file under root. func mapFile(root *os.Root, name string) (*mmap.Mmap, error) { file, err := root.Open(name) @@ -125,10 +155,16 @@ func validatePackData(data []byte, idx *packidx.Packidx, hashSize int) error { return nil } -// close releases the pack data and index mappings. +// close releases the pack data, index, and filter mappings. func (pack *pack) close() error { - return errors.Join( + errs := []error{ pack.dataMapping.Close(), pack.idxMapping.Close(), - ) + } + + if pack.bloomMapping != nil { + errs = append(errs, pack.bloomMapping.Close()) + } + + return errors.Join(errs...) } diff --git a/object/store/packed/quarantine.go b/object/store/packed/quarantine.go index 5e0b85cb..977a9543 100644 --- a/object/store/packed/quarantine.go +++ b/object/store/packed/quarantine.go @@ -156,6 +156,8 @@ func packPromotionPriority(name string) int { return 1 case strings.HasPrefix(name, "pack-") && strings.HasSuffix(name, ".rev"): return 2 + case strings.HasPrefix(name, "pack-") && strings.HasSuffix(name, ".bloom"): + return 2 case strings.HasPrefix(name, "pack-") && strings.HasSuffix(name, ".idx"): return 3 default: diff --git a/object/tree/malformed_test.go b/object/tree/malformed_test.go index ca00ea94..8a22b90f 100644 --- a/object/tree/malformed_test.go +++ b/object/tree/malformed_test.go @@ -44,6 +44,7 @@ func TestParseMalformed(t *testing.T) { {name: "unsorted", body: append(record("100644", "b", size), record("100644", "a", size)...)}, {name: "duplicate", body: append(record("100644", "a", size), record("100644", "a", size)...)}, {name: "conflicting-tree-blob", body: append(record("100644", "foo", size), record("40000", "foo", size)...)}, + {name: "conflicting-tree-blob-nonadjacent", body: append(append(record("100644", "foo", size), record("100644", "foo.c", size)...), record("40000", "foo", size)...)}, } { t.Run(tc.name, func(t *testing.T) { t.Parallel() diff --git a/research/dynamic_packfiles.txt b/research/dynamic_packfiles.txt new file mode 100644 index 00000000..e4fe7e54 --- /dev/null +++ b/research/dynamic_packfiles.txt @@ -0,0 +1,179 @@ +dynamic packfiles to append objects + +gc/refcount process punches page-sized holes in them for pages fully +within the space of unwanted objects, after setting a tombstone mark + +holes are recorded in an index and re-used + +then, if desired, the repack process removes all the punched holes +and anything surrounding from unwanted objects that are slightly out +of the page boundary + +repack is not really git's repack algorithm, it's bascially just +defragmentation. + +genreational bloom filters + +idx design +========== + +so, let's first get our invariants and patterns clear. + +* fixed-length cryptographic object IDs +* essentially uniform key distribution +* exact lookup only, no range scans, no ordered iteration requirements +* reads are extremely important +* writes are mostly append-like +* deletes/tombstones may happen later but are secondary + +1st design +---------- + +* mutable front index +* immutable base index +* period merge/compaction into a new base generation + + + +upload-pack/send-pack/defrag +============================ + +take current pack, remove dead objects/holes, filter objects out, record +offsets and adjust ofs_deltas since they always go backwards, write the pack +back; then stream written pack to client. two-step necessary because pack +header includes object count; could have a custom new protocol that doesn't do +so. + +random chat log dump +==================== +<~runxiyu> ori: actually. i think my hashtable-ish .idx scheme doesn't work really well with e.g. "user provided us a small part of the hash" +<~runxiyu> and when using the Git CLI, abbreviated hashes are extremely common.... +<~runxiyu> not lik ei'd need them in a *forge* +<~runxiyu> but ugh +<~runxiyu> i guess i'm going with some sort of b-tree :(( +<~runxiyu> ~~maybe i should just port gefs to git~~ +<&ori> runxiyu: why not? you should be able to pick the pages based on the prefix and then scan, no? +<~rx> ori: i need to somehow munge the has to prevent page directory explosions +<~rx> the hash* +<~rx> e.g. siphash(objectid, secret) +<~rx> otherwise an attacker could give you 10M objects that start with 00000 and whatnot +<&ori> what's the worst case that would happen there, and is it exponentially worse than giving you 10M objects that start with anything? +<&ori> I'm thinking that you can't generate a case worse than 256/nobject extra table lookups, assuming one bit per fanout.. +<~runxiyu> ori: for extendible hashing, yes, definitely worse +<~runxiyu> the directory will expand a lot for no good reason +<&ori> yes, but you have 256 bits of hash +<&ori> how much is a lot worse? +<&ori> what's the worst an attacker can do, and how is the impact worse than uploading 10M giant objects? +<&ori> also, spotted a bag of kuai kuai keeping the cash register working today at a tea shop +<~runxiyu> waitt +<~runxiyu> hmmm + * runxiyu looks agagin if it's O(N) or O(2^N) +<~runxiyu> well +<~runxiyu> i think it should be a O(2^n) directory size when the attacker can control n bits prefix +<&ori> what's the 'n' here? +<~runxiyu> > can control n bits prefix +<&ori> yeah, you run out of prefix pretty quickly, though +<&ori> I'm not seeing how you could get an exponential blowup if you share pages +<&ori> may be missing something, though +<~runxiyu> hm +<&ori> oh, wait, I see +<&ori> no, wait +<~runxiyu> i think im confusing myself too to some extent but something doesn't feel right +<~runxiyu> urgh +<~runxiyu> okay, rethinking this +<~runxiyu> d is the global depth +<~runxiyu> diretory size is 2^d +<~runxiyu> B records per bucket +<~runxiyu> whatever happens inside the bucket idc, let's say it's a linked list +<~runxiyu> whatever happens inside the bucket idc, let's say it's an array* (linked lists suck) +<~runxiyu> l <= d +<~runxiyu> (l being the local depth of a bucket) +<~runxiyu> normal: d = log^2(N/B) +<&ori> ahh, I see. +<~runxiyu> N is the object count +<&ori> yes, so what if you binary searched the page directory, or made it multi-level +<~runxiyu> an attacker could grab a giant repo and find commonly-prefixed objects, they don't need to brute force their own +<~runxiyu> ori: remember we're trying to do something easy to add new objects into +<~runxiyu> how'd you do that with a binary search? +<~runxiyu> not sure what you mean by multi-level yet here +<~runxiyu> well, it could just turn into a b+tree... +<~runxiyu> hm +<&ori> multilevel -- you have pd[0] using bits 0..n +<~runxiyu> maybe an lmdb object store isn't too bad after all +<&ori> pd[0][1] using bits n...m +<&ori> etc +<&ori> and the reason I was a bit confused was that I had thought the directory was a trie +<&ori> rather than just an expanding top level directory +<~runxiyu> ah +<&ori> so, yeah, I was thinking you could make the page directory an actual trie +<~runxiyu> sigh +<~runxiyu> i guess abbreviated object IDs is something i can't really skip. +<~runxiyu> ori: ill look into radix trees and LSM trees too +<~runxiyu> well, you're basically suggesting a radix tree i guess +<~runxiyu> well actually +<~runxiyu> radix might not necessarily be the best trie here +<~runxiyu> idk +<~runxiyu> hm +<~runxiyu> firstly im really heavy on reads +<~runxiyu> and random keys with no sequential access +<~runxiyu> ok LSM makes no sense +<&hax[xor]> > O(2^N) +<~runxiyu> ori: thoughts on how to make tries reasonable to use on disks? +<&hax[xor]> that sounds like something is already very broken +<~runxiyu> hax[xor]: wdym +<&hax[xor]> directory size should absolutely not scale like that +<~runxiyu> hax[xor]: maybe read up on how extendible hsahing works again? +<&hax[xor]> probably but if that's how it scales it still sounds verybroken +<~runxiyu> n is not the amount of objects +<~runxiyu> it's a pathlogic condition caused by chosne-prefix keys +<~runxiyu> (your keys are usually supposed to be hashed into something the attacker can't predict) +<&hax[xor]> if you mean the directory size scales linearly with the number of objects the attacker puts in it... that sounds perfectly normal? +<&ori> runxiyu: same as extendible hashing, just after you extend to, say, 8 bits, you stop splitting the page directory, and have subdirectories +<~runxiyu> ori: that could make senes +<~runxiyu> haven't thought it through +<~runxiyu> directory size is 2^d, d being the global depth +<~runxiyu> urgh i need to review for exams +<~runxiyu> okay +<~runxiyu> write amplification issue +<~runxiyu> im not sure how significant this is for realistic git workloads +<~runxiyu> i haven't counted, but there should be many, many, many more reads than writes +<~runxiyu> if write amplification is really an issue +<&ori> I may go wander around a bit. +<~runxiyu> then ill just port gefs +<~runxiyu> ori: do you mean IRL, or over dynamic pack data structures- +<&ori> irl. +<~runxiyu> alright that makes more sense :P +<&ori> tomorrow I think I check out Jiufen +<~runxiyu> frick i want to be able to type epsilon with compose +<&ori> is that not possible? +<~runxiyu> i don't seem to be able to +<~runxiyu> but idk the compose tables on my system +<~runxiyu> ε +<~runxiyu> well +<~runxiyu> unicode hex input always works :/ +<~runxiyu> OKAY FUCK +<~runxiyu> I keep getting distracted by interesting things +<~runxiyu> I need to review for my fucking exams +-- Mode #chat [-q runxiyu] by runxiyu +-- Mode #chat [-a runxiyu] by runxiyu +-- #chat: You must be a channel halfop or higher to set channel mode b (ban). +-- Mode #chat [+b mute:account:runxiyu] by runxiyu +-- #chat: You cannot send messages to this channel whilst a m: (mute) extban is set matching you. +-- #chat: You cannot send messages to this channel whilst a m: (mute) extban is set matching you. +<&f_> does that even work? +<&ori> for 9front, <alt>*e gives ε +<&ori> but, don't remember the compose map +<&ori> thought that there was a similar thing for all greek letters + + +See also: +https://github.com/inkandswitch/darn +https://www.youtube.com/watch?v=nk4nefmguZk +https://crates.io/crates/iroh-blobs +https://crates.io/crates/bao-tree + + +Actually, who cares about abbreviated hashes? +Clients. Clients only. +It will be a separate interface satisfied by "normal repos" +but not satisfied by our store. Good enough. diff --git a/research/packfile_bloom.txt b/research/packfile_bloom.txt new file mode 100644 index 00000000..63acafbe --- /dev/null +++ b/research/packfile_bloom.txt @@ -0,0 +1,133 @@ +Packfile bloom filter RFC +========================= + +Problem +------- + +Especially for server-side usages, repacking is extremely expensive, and +creating multi-pack-indexes is still rather expensive. Incremental MIDX +partially solves this, but would defeat the purpose of MIDX when there are too +many of them, as Git would still have to walk the MIDXes in order while +performing expensive indexing queries. + +Idea +---- + +Each MIDX layer, and each non-MIDX index, comes with a bloom filter. MIDXes and +ordinary .idx files are still traversed in their usual order, but the first +step when traversing them, is to check whether that index could possibly have +the desired object, through a bloom filter. + +We will want the filters to be mmaped, and we want the lookup cost to be +dominated by one cache-line read rather than using many scattered reads. +Therefore, a blocked bloom filter is likely the right direction here. The steps +are as follows: + +1. Split the filter into 64-octet buckets, since 64 octets is the most common + cache-line size. +2. Use some bits of the object ID to choose the bucket. +3. Use the rest of the key to choose several bit positions inside that bucket. +4. A lookup thus reads one 64-octet bucket and checks whether all required bits + are set. + +Definitions +----------- + +Let: + + B := number of buckets + K := number of bits set and tested per object ID + +* All integers here are big endian. +* The OID is to be interpreted as a big-endian bitstring, where bit offset 0 + is the most significant bit of octet 0. +* log2(B) + 9K <= hash length in bits. + +File layout +----------- + +* 4-octet signature: {'I', 'D', 'B', 'L'} +* 4-octet version identifier (= 1) +* 4-octet object hash algorithm identifier (= 1 for SHA-1, 2 for SHA-256) +* 4-octet B (number of buckets) +* 2-octet K (number of bits set and tested per object ID) +* 46-octet padding (must be all zeros) +* B buckets of 64 octets each. + +Validation +---------- + +* Matching signature +* Supported version (the rest of the rules are for this version) +* Hash function identifier must be recognized +* B must be nonzero and a power of two +* K must be nonzero +* log2(B) + 9K <= hash length in bits +* Padding must be all zero +* File size must be 64 + 64 * B octets + +Lookup procedure +---------------- + +1. Let b be the unsigned integer encoded by the most significant log2(B) bits + of OID. B is a power of two, and 0 <= b < B. +2. Select and read bucket b. +3. For each 0 <= i < K: + 1. Start immediately after the most significant log2(B) bits of OID, let the + i-th 9-bit field be the bits at offset 9 * i through 9 * i + 8 within the + next 9 * K bits of the OID. + 2. Let pi be the unsigned integer encoded by that 9-bit field. + Then, 0 <= pi < 512. + 3. Compute wi := pi >> 6, and bi := pi & 63. + Thus, wi identifies one of the 8 64-bit words in bucket b, and bi + identifies one bit within that word. + 4. Test whether bi is set in the word wi of bucket b. (Within each 64-bit + word, bit index 0 denotes the most significant bit, and bit index 63 + denotes the least significant bit.) + +If any test fails, the OID is definitely not in the relevant idx. +If all tests succeed, the OID may be in the relevant idx. + +Note that two of the K 9-bit fields can decode to the same pi, which means an +insertion may set fewer than K distinct bits. + +Worked example +-------------- + +Let: + + B = 1 << 15 = 32768 + K = 8 + +Then, log2(B) = 15. Each lookup thus uses 15 bits to choose the bucket +and 8 * 9 = 72 bits to choose the in-bucket positions, for a total of +87 bits taken from the object ID. + +1. Read the first 15 bits of OID and interpret them as b, where + 0 <= b < 32768. +2. Read bucket b. +3. For each 0 <= i < 8: + 1. Read the i-th 9-bit field from the next 72 bits of OID and interpret it + as pi, where 0 <= pi < 512. + 2. Compute: wi = pi >> 6, bi = pi & 63. + 3. Test whether bit bi is set in the word wi of bucket b. + +Security considerations +------------------------ + +An adversarial packfile where objects are (computationally intensive, even for +SHA-1 as vulnerable as it is) constructed to have the same prefix for the +relevant object format hash algorithm could be used to fill up the bloom +filters, rendering some buckets useless. In the worst case, if they somehow +fill all filters, this proposal's optimizations become useless, but would not +be a significant DoS vector. + +TODOs +----- + +* Consider dropping mmap (page read vs cachline read) +* How should B and K be chosen? +* How does creation/insert work? Note that packfiles and `.idx`es are immutable. +* What are the sizes? +* What are the false positive rates? +* How are benchmarks? |
