From 64ebbb5e32ee2f44bdd09fae5bb1153dfd1aad22 Mon Sep 17 00:00:00 2001 From: Runxi Yu Date: Sun, 14 Jun 2026 11:58:31 +0000 Subject: internal/format/packidx: Lookup interpolation heuristic On my copy of linux.git with a few packs, -.32 on all-miss lookups, -.15 on all-hit lookups, -.05 on walking all objects in the root tree. For microbenchmarks, -.4 on honest workloads, and +.85 on fully adversarial workloads. Might be able to improve the adversarial one in the future, but it's a rare case not worth optimizing too much for. --- internal/format/packidx/lookup.go | 47 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'internal/format') diff --git a/internal/format/packidx/lookup.go b/internal/format/packidx/lookup.go index d1293f47..a9196aff 100644 --- a/internal/format/packidx/lookup.go +++ b/internal/format/packidx/lookup.go @@ -4,6 +4,7 @@ import ( "bytes" "encoding/binary" "fmt" + "math/bits" ) // Lookup searches the index for one object ID @@ -18,6 +19,52 @@ func (idx *Packidx) Lookup(oid []byte) (offset uint64, found bool, err error) { lo, hi := idx.fanoutRange(oid[0]) + // Object IDs are uniform for honest inputs, + // interp on next 8 octets converges in O(log log n). + // + // OIDs are public unkeyed hashes, + // so an attacker may sample/filter/mine prefix clusters, + // making interpolation mis-estimate every probe. + // See https://runxiyu.org/comp/ch4ht/ + // for why cryptographic hash algorithms are insufficient. + // + // Cap the interp at bisect's probe count and finish by bisect; + // adversarial at O(log n) plus small interpolation overhead, + // honest exit well under the cap. + target := binary.BigEndian.Uint64(oid[1:9]) + + for budget := bits.Len(uint(hi - lo)); hi-lo > 8 && budget > 0; budget-- { + loKey := binary.BigEndian.Uint64(idx.OIDAt(lo)[1:9]) + hiKey := binary.BigEndian.Uint64(idx.OIDAt(hi - 1)[1:9]) + + var mid int + + switch { + case target <= loKey: + mid = lo + case target >= hiKey: + mid = hi - 1 + default: + frac := float64(target-loKey) / float64(hiKey-loKey) + mid = lo + int(frac*float64(hi-lo-1)) + } + + switch cmp := bytes.Compare(oid, idx.OIDAt(mid)); { + case cmp == 0: + offset, err = idx.OffsetAt(mid) + if err != nil { + return 0, false, err + } + + return offset, true, nil + case cmp < 0: + hi = mid + default: + lo = mid + 1 + } + } + + // Interpolation narrowed or capped; bisect to finish. for lo < hi { mid := lo + (hi-lo)/2 -- cgit v1.3.1-10-gc9f91