diff options
| -rw-r--r-- | delta_read_apply.go | 159 | ||||
| -rw-r--r-- | delta_test.go | 55 | ||||
| -rw-r--r-- | delta_write_deltify.go | 306 | ||||
| -rw-r--r-- | delta_write_encode.go | 108 | ||||
| -rw-r--r-- | delta_write_select.go | 56 | ||||
| -rw-r--r-- | internal/murmurhash2/murmurhash2.go | 42 | ||||
| -rw-r--r-- | packed_read_pack.go | 150 | ||||
| -rw-r--r-- | packed_write_pack.go | 123 | ||||
| -rw-r--r-- | packed_write_test.go | 328 |
9 files changed, 1157 insertions, 170 deletions
diff --git a/delta_read_apply.go b/delta_read_apply.go new file mode 100644 index 00000000..48233292 --- /dev/null +++ b/delta_read_apply.go @@ -0,0 +1,159 @@ +package furgit + +import ( + "io" + + "codeberg.org/lindenii/furgit/internal/bufpool" +) + +func packDeltaReadOfsDistance(data []byte) (uint64, int, error) { + if len(data) == 0 { + return 0, 0, io.ErrUnexpectedEOF + } + b := data[0] + dist := uint64(b & 0x7f) + consumed := 1 + for (b & 0x80) != 0 { + if consumed >= len(data) { + return 0, 0, io.ErrUnexpectedEOF + } + b = data[consumed] + consumed++ + dist += 1 + dist <<= 7 + dist |= uint64(b & 0x7f) + } + return dist, consumed, nil +} + +func packDeltaApply(base, delta bufpool.Buffer) (bufpool.Buffer, error) { + pos := 0 + baseBytes := base.Bytes() + deltaBytes := delta.Bytes() + srcSize, err := packVarintRead(deltaBytes, &pos) + if err != nil { + return bufpool.Buffer{}, err + } + dstSize, err := packVarintRead(deltaBytes, &pos) + if err != nil { + return bufpool.Buffer{}, err + } + if srcSize != len(baseBytes) { + return bufpool.Buffer{}, ErrInvalidObject + } + out := bufpool.Borrow(dstSize) + out.Resize(dstSize) + outBytes := out.Bytes() + outPos := 0 + + for pos < len(deltaBytes) { + op := deltaBytes[pos] + pos++ + switch { + case op&0x80 != 0: + off := 0 + n := 0 + if op&0x01 != 0 { + if pos >= len(deltaBytes) { + out.Release() + return bufpool.Buffer{}, ErrInvalidObject + } + off |= int(deltaBytes[pos]) + pos++ + } + if op&0x02 != 0 { + if pos >= len(deltaBytes) { + out.Release() + return bufpool.Buffer{}, ErrInvalidObject + } + off |= int(deltaBytes[pos]) << 8 + pos++ + } + if op&0x04 != 0 { + if pos >= len(deltaBytes) { + out.Release() + return bufpool.Buffer{}, ErrInvalidObject + } + off |= int(deltaBytes[pos]) << 16 + pos++ + } + if op&0x08 != 0 { + if pos >= len(deltaBytes) { + out.Release() + return bufpool.Buffer{}, ErrInvalidObject + } + off |= int(deltaBytes[pos]) << 24 + pos++ + } + if op&0x10 != 0 { + if pos >= len(deltaBytes) { + out.Release() + return bufpool.Buffer{}, ErrInvalidObject + } + n |= int(deltaBytes[pos]) + pos++ + } + if op&0x20 != 0 { + if pos >= len(deltaBytes) { + out.Release() + return bufpool.Buffer{}, ErrInvalidObject + } + n |= int(deltaBytes[pos]) << 8 + pos++ + } + if op&0x40 != 0 { + if pos >= len(deltaBytes) { + out.Release() + return bufpool.Buffer{}, ErrInvalidObject + } + n |= int(deltaBytes[pos]) << 16 + pos++ + } + if n == 0 { + n = 0x10000 + } + if off+n > len(baseBytes) || outPos+n > len(outBytes) { + out.Release() + return bufpool.Buffer{}, ErrInvalidObject + } + copy(outBytes[outPos:], baseBytes[off:off+n]) + outPos += n + case op != 0: + n := int(op) + if pos+n > len(deltaBytes) || outPos+n > len(outBytes) { + out.Release() + return bufpool.Buffer{}, ErrInvalidObject + } + copy(outBytes[outPos:], deltaBytes[pos:pos+n]) + pos += n + outPos += n + default: + out.Release() + return bufpool.Buffer{}, ErrInvalidObject + } + } + + if outPos != len(outBytes) { + out.Release() + return bufpool.Buffer{}, ErrInvalidObject + } + return out, nil +} + +func packVarintRead(buf []byte, pos *int) (int, error) { + res := 0 + shift := 0 + for { + if *pos >= len(buf) { + return 0, ErrInvalidObject + } + b := buf[*pos] + *pos++ + res |= int(b&0x7f) << shift + if (b & 0x80) == 0 { + break + } + shift += 7 + } + return res, nil +} diff --git a/delta_test.go b/delta_test.go new file mode 100644 index 00000000..f1ee5840 --- /dev/null +++ b/delta_test.go @@ -0,0 +1,55 @@ +package furgit + +import ( + "bytes" + "testing" + + "codeberg.org/lindenii/furgit/internal/bufpool" +) + +func TestDeltaEncodeApplyRoundtrip(t *testing.T) { + base := []byte("Hello, world!") + target := []byte("Hello, brave new world!") + + instr := []deltaInstruction{ + {copy: true, offset: 0, length: 7}, + {copy: false, data: []byte("brave new "), length: len("brave new ")}, + {copy: true, offset: 7, length: 6}, + } + delta, err := deltaEncode(len(base), len(target), instr) + if err != nil { + t.Fatalf("deltaEncode error: %v", err) + } + + out, err := packDeltaApply(bufpool.FromOwned(base), bufpool.FromOwned(delta)) + if err != nil { + t.Fatalf("packDeltaApply error: %v", err) + } + defer out.Release() + if !bytes.Equal(out.Bytes(), target) { + t.Fatalf("delta apply mismatch: got %q, want %q", out.Bytes(), target) + } +} + +func TestDeltifyRoundtrip(t *testing.T) { + base := []byte("The quick brown fox jumps over the lazy dog.") + target := []byte("The quick red fox jumps over the very lazy dog.") + + instr, err := deltify(base, target, 0) + if err != nil { + t.Fatalf("deltify error: %v", err) + } + delta, err := deltaEncode(len(base), len(target), instr) + if err != nil { + t.Fatalf("deltaEncode error: %v", err) + } + + out, err := packDeltaApply(bufpool.FromOwned(base), bufpool.FromOwned(delta)) + if err != nil { + t.Fatalf("packDeltaApply error: %v", err) + } + defer out.Release() + if !bytes.Equal(out.Bytes(), target) { + t.Fatalf("delta apply mismatch: got %q, want %q", out.Bytes(), target) + } +} diff --git a/delta_write_deltify.go b/delta_write_deltify.go new file mode 100644 index 00000000..cd3d6aa8 --- /dev/null +++ b/delta_write_deltify.go @@ -0,0 +1,306 @@ +package furgit + +import ( + "bytes" + + "codeberg.org/lindenii/furgit/internal/murmurhash2" +) + +const ( + deltaMinChunk = 32 + deltaMaxChunk = 8192 + deltaSplitMask = (1 << 8) - 1 + deltaStretchMax = (1 << 24) - 1 +) + +var deltaGearTab = [...]uint32{ + 0x67ed26b7, 0x32da500c, 0x53d0fee0, 0xce387dc7, 0xcd406d90, 0x2e83a4d4, + 0x9fc9a38d, 0xb67259dc, 0xca6b1722, 0x6d2ea08c, 0x235cea2e, 0x3149bb5f, + 0x1beda787, 0x2a6b77d5, 0x2f22d9ac, 0x91fc0544, 0xe413acfa, 0x5a30ff7a, + 0xad6fdde0, 0x444fd0f5, 0x7ad87864, 0x58c5ff05, 0x8d2ec336, 0x2371f853, + 0x550f8572, 0x6aa448dd, 0x7c9ddbcf, 0x95221e14, 0x2a82ec33, 0xcbec5a78, + 0xc6795a0d, 0x243995b7, 0x1c909a2f, 0x4fded51c, 0x635d334b, 0x0e2b9999, + 0x2702968d, 0x856de1d5, 0x3325d60e, 0xeb6a7502, 0xec2a9844, 0x0905835a, + 0xa1820375, 0xa4be5cab, 0x96a6c058, 0x2c2ccd70, 0xba40fce3, 0xd794c46b, + 0x8fbae83e, 0xc3aa7899, 0x3d3ff8ed, 0xa0d42b5b, 0x571c0c97, 0xd2811516, + 0xf7e7b96c, 0x4fd2fcbd, 0xe2fdec94, 0x282cc436, 0x78e8e95c, 0x80a3b613, + 0xcfbee20c, 0xd4a32d1c, 0x2a12ff13, 0x6af82936, 0xe5630258, 0x8efa6a98, + 0x294fb2d1, 0xdeb57086, 0x5f0fddb3, 0xeceda7ce, 0x4c87305f, 0x3a6d3307, + 0xe22d2942, 0x9d060217, 0x1e42ed02, 0xb6f63b52, 0x4367f39f, 0x055cf262, + 0x03a461b2, 0x5ef9e382, 0x386bc03a, 0x2a1e79c7, 0xf1a0058b, 0xd4d2dea9, + 0x56baf37d, 0x5daff6cc, 0xf03a951d, 0xaef7de45, 0xa8f4581e, 0x3960b555, + 0xffbfff6d, 0xbe702a23, 0x8f5b6d6f, 0x061739fb, 0x98696f47, 0x3fd596d4, + 0x151eac6b, 0xa9fcc4f5, 0x69181a12, 0x3ac5a107, 0xb5198fe7, 0x96bcb1da, + 0x1b5ddf8e, 0xc757d650, 0x65865c3a, 0x8fc0a41a, 0x87435536, 0x99eda6f2, + 0x41874794, 0x29cff4e8, 0xb70efd9a, 0x3103f6e7, 0x84d2453b, 0x15a450bd, + 0x74f49af1, 0x60f664b1, 0xa1c86935, 0xfdafbce1, 0xe36353e3, 0x5d9ba739, + 0xbc0559ba, 0x708b0054, 0xd41d808c, 0xb2f31723, 0x9027c41f, 0xf136d165, + 0xb5374b12, 0x9420a6ac, 0x273958b6, 0xe6c2fad0, 0xebdc1f21, 0xfb33af8b, + 0xc71c25cd, 0xe9a2d8e5, 0xbeb38a50, 0xbceb7cc2, 0x4e4e73f0, 0xcd6c251d, + 0xde4c032c, 0x4b04ac30, 0x725b8b21, 0x4eb8c33b, 0x20d07b75, 0x0567aa63, + 0xb56b2bb7, 0xc1f5fd3a, 0xcafd35ca, 0x470dd4da, 0xfe4f94cd, 0xfb8de424, + 0xe8dbcf40, 0xfe50a37a, 0x62db5b5d, 0xf32f4ab6, 0x2c4a8a51, 0x18473dc0, + 0xfe0cbb6e, 0xfe399efd, 0xdf34ecc9, 0x6ccd5055, 0x46097073, 0x139135c2, + 0x721c76f6, 0x1c6a94b4, 0x6eee014d, 0x8a508e02, 0x3da538f5, 0x280d394f, + 0x5248a0c4, 0x3ce94c6c, 0x9a71ad3a, 0x8493dd05, 0xe43f0ab6, 0x18e4ed42, + 0x6c5c0e09, 0x42b06ec9, 0x8d330343, 0xa45b6f59, 0x2a573c0c, 0xd7fd3de6, + 0xeedeab68, 0x5c84dafc, 0xbbd1b1a8, 0xa3ce1ad1, 0x85b70bed, 0xb6add07f, + 0xa531309c, 0x8f8ab852, 0x564de332, 0xeac9ed0c, 0x73da402c, 0x3ec52761, + 0x43af2f4d, 0xd6ff45c8, 0x4c367462, 0xd553bd6a, 0x44724855, 0x3b2aa728, + 0x56e5eb65, 0xeaf16173, 0x33fa42ff, 0xd714bb5d, 0xfbd0a3b9, 0xaf517134, + 0x9416c8cd, 0x534cf94f, 0x548947c2, 0x34193569, 0x32f4389a, 0xfe7028bc, + 0xed73b1ed, 0x9db95770, 0x468e3922, 0x0440c3cd, 0x60059a62, 0x33504562, + 0x2b229fbd, 0x5174dca5, 0xf7028752, 0xd63c6aa8, 0x31276f38, 0x0646721c, + 0xb0191da8, 0xe00e6de0, 0x9eac1a6e, 0x9f7628a5, 0xed6c06ea, 0x0bb8af15, + 0xf119fb12, 0x38693c1c, 0x732bc0fe, 0x84953275, 0xb82ec888, 0x33a4f1b3, + 0x3099835e, 0x028a8782, 0x5fdd51d7, 0xc6c717b3, 0xb06caf71, 0x17c8c111, + 0x61bad754, 0x9fd03061, 0xe09df1af, 0x3bc9eb73, 0x85878413, 0x9889aaf2, + 0x3f5a9e46, 0x42c9f01f, 0x9984a4f4, 0xd5de43cc, 0xd294daed, 0xbecba2d2, + 0xf1f6e72c, 0x5551128a, 0x83af87e2, 0x6f0342ba, +} + +type deltaBlock struct { + length int + offset int + hash uint32 +} + +type deltaTable struct { + blocks []deltaBlock + nblocks int + nalloc int + + offs []uint32 + len int + size int +} + +type deltaInstruction struct { + copy bool + offset int + length int + data []byte +} + +func deltify(base, target []byte, seed uint32) ([]deltaInstruction, error) { + if base == nil { + base = []byte{} + } + if target == nil { + target = []byte{} + } + dt, err := deltaTableInitMem(base, 0, len(base), seed) + if err != nil { + return nil, err + } + return deltifyMemMem(target, 0, len(target), seed, dt, base, 0, len(base)) +} + +func deltifyMemMem(target []byte, fileoffset, filesize int, seed uint32, dt *deltaTable, base []byte, baseOffset0, baseSize int) ([]deltaInstruction, error) { + const allocChunkSize = 64 + instr := make([]deltaInstruction, 0, allocChunkSize) + offset0 := fileoffset + + for fileoffset < filesize { + blocklen := deltaNextBlockLen(target, fileoffset, filesize) + if blocklen == 0 { + if fileoffset < filesize { + data := target[fileoffset:filesize] + instr = append(instr, deltaInstruction{ + copy: false, + offset: fileoffset - offset0, + length: len(data), + data: data, + }) + } + break + } + + block := dt.lookupBlock(target[fileoffset:fileoffset+blocklen], blocklen, seed, base, baseOffset0) + if block != nil { + if err := deltaStretchMemMem(base, baseOffset0, baseSize, block, target, fileoffset+blocklen, filesize, &blocklen); err != nil { + return nil, err + } + instr = append(instr, deltaInstruction{ + copy: true, + offset: block.offset, + length: blocklen, + }) + } else { + data := target[fileoffset : fileoffset+blocklen] + instr = append(instr, deltaInstruction{ + copy: false, + offset: fileoffset - offset0, + length: len(data), + data: data, + }) + } + fileoffset += blocklen + } + + return instr, nil +} + +func deltaTableInitMem(data []byte, fileoffset, filesize int, seed uint32) (*deltaTable, error) { + dt := &deltaTable{ + nalloc: 128, + size: 128, + } + dt.blocks = make([]deltaBlock, dt.nalloc) + dt.offs = make([]uint32, dt.size) + + offset0 := fileoffset + for fileoffset < filesize { + blocklen := deltaNextBlockLen(data, fileoffset, filesize) + if blocklen == 0 { + break + } + h := deltaHashBlock(data[fileoffset:fileoffset+blocklen], seed) + if err := dt.addBlock(data, offset0, blocklen, fileoffset-offset0, h); err != nil { + return nil, err + } + fileoffset += blocklen + } + return dt, nil +} + +func deltaHashBlock(p []byte, seed uint32) uint32 { + return murmurhash2.Sum32(p, seed) +} + +func deltaNextBlockLen(data []byte, fileoffset, filesize int) int { + if fileoffset >= filesize || filesize-fileoffset < deltaMinChunk { + return 0 + } + p := fileoffset + deltaMinChunk + end := fileoffset + deltaMaxChunk + if end > filesize { + end = filesize + } + var gh uint32 + for p < end { + gh = (gh << 1) + deltaGearTab[data[p]] + p++ + if (gh & deltaSplitMask) == 0 { + break + } + } + return p - fileoffset +} + +func (dt *deltaTable) addBlock(data []byte, offset0, length, offset int, h uint32) error { + if length == 0 { + return nil + } + if err := dt.resize(); err != nil { + return err + } + idx, found := dt.lookupSlot(data, offset0, offset, length, h) + if found { + return nil + } + dt.offs[idx] = uint32(dt.nblocks + 1) + dt.blocks[dt.nblocks] = deltaBlock{ + length: length, + offset: offset, + hash: h, + } + dt.nblocks++ + dt.len++ + return nil +} + +func (dt *deltaTable) lookupBlock(chunk []byte, length int, seed uint32, base []byte, baseOffset0 int) *deltaBlock { + if dt == nil || dt.size == 0 { + return nil + } + h := deltaHashBlock(chunk, seed) + for i := int(h % uint32(dt.size)); dt.offs[i] != 0; i = (i + 1) % dt.size { + block := &dt.blocks[dt.offs[i]-1] + if block.hash != h || block.length != length { + continue + } + baseOff := baseOffset0 + block.offset + if baseOff < 0 || baseOff+length > len(base) { + continue + } + if bytes.Equal(base[baseOff:baseOff+length], chunk) { + return block + } + } + return nil +} + +func (dt *deltaTable) lookupSlot(data []byte, offset0, offset, length int, h uint32) (int, bool) { + if dt == nil || dt.size == 0 { + return 0, false + } + for i := int(h % uint32(dt.size)); ; i = (i + 1) % dt.size { + if dt.offs[i] == 0 { + return i, false + } + block := &dt.blocks[dt.offs[i]-1] + if block.hash != h || block.length != length { + continue + } + left := offset0 + block.offset + right := offset0 + offset + if left < 0 || right < 0 || left+length > len(data) || right+length > len(data) { + continue + } + if bytes.Equal(data[left:left+length], data[right:right+length]) { + return i, true + } + } +} + +func (dt *deltaTable) resize() error { + if dt.nblocks == dt.nalloc { + newAlloc := dt.nalloc + 256 + newBlocks := make([]deltaBlock, newAlloc) + copy(newBlocks, dt.blocks[:dt.nblocks]) + dt.blocks = newBlocks + dt.nalloc = newAlloc + } + + if dt.offs == nil || (dt.len*100)/dt.size > 70 { + oldSize := dt.size + dt.size *= 2 + if dt.size == 0 { + dt.size = oldSize + return ErrInvalidObject + } + dt.offs = make([]uint32, dt.size) + for i := 0; i < dt.nblocks; i++ { + block := &dt.blocks[i] + idx := int(block.hash % uint32(dt.size)) + for dt.offs[idx] != 0 { + idx = (idx + 1) % dt.size + } + dt.offs[idx] = uint32(i + 1) + } + } + + return nil +} + +func deltaStretchMemMem(base []byte, baseOffset0, baseSize int, block *deltaBlock, data []byte, fileoffset, filesize int, blocklen *int) error { + baseOffset := baseOffset0 + block.offset + *blocklen + if baseOffset > baseSize { + return ErrInvalidObject + } + if fileoffset > filesize { + return ErrInvalidObject + } + maxlen := baseSize - baseOffset + if avail := filesize - fileoffset; avail < maxlen { + maxlen = avail + } + for i := 0; i < maxlen && *blocklen < deltaStretchMax-1; i++ { + if data[fileoffset+i] != base[baseOffset+i] { + break + } + *blocklen++ + } + return nil +} diff --git a/delta_write_encode.go b/delta_write_encode.go new file mode 100644 index 00000000..66c77052 --- /dev/null +++ b/delta_write_encode.go @@ -0,0 +1,108 @@ +package furgit + +const deltaCopyDefaultLen = 0x10000 + +func deltaEncode(baseSize, resultSize int, instr []deltaInstruction) ([]byte, error) { + if baseSize < 0 || resultSize < 0 { + return nil, ErrInvalidObject + } + out := make([]byte, 0, 64) + baseHdr, err := packVarintEncode(baseSize) + if err != nil { + return nil, err + } + out = append(out, baseHdr...) + resultHdr, err := packVarintEncode(resultSize) + if err != nil { + return nil, err + } + out = append(out, resultHdr...) + + for _, ins := range instr { + if ins.copy { + if ins.offset < 0 || ins.length <= 0 { + return nil, ErrInvalidObject + } + if ins.offset > 0xffffffff { + return nil, ErrInvalidObject + } + if ins.length > 0xffffff { + return nil, ErrInvalidObject + } + op := byte(0x80) + var tmp [7]byte + pos := 0 + + off := ins.offset + for i := 0; i < 4; i++ { + op |= 1 << i + tmp[pos] = byte(off & 0xff) + pos++ + off >>= 8 + if off == 0 { + break + } + } + if off != 0 { + return nil, ErrInvalidObject + } + + n := ins.length + if n != deltaCopyDefaultLen { + for i := 0; i < 3 && n > 0; i++ { + op |= 1 << (i + 4) + tmp[pos] = byte(n & 0xff) + pos++ + n >>= 8 + } + if n > 0 { + return nil, ErrInvalidObject + } + } + + out = append(out, op) + out = append(out, tmp[:pos]...) + continue + } + + data := ins.data + if ins.length > 0 { + if len(data) < ins.length { + return nil, ErrInvalidObject + } + data = data[:ins.length] + } + if len(data) == 0 { + return nil, ErrInvalidObject + } + for len(data) > 0 { + n := len(data) + if n > 127 { + n = 127 + } + out = append(out, byte(n)) + out = append(out, data[:n]...) + data = data[n:] + } + } + + return out, nil +} + +func deltaTry(base, target []byte, seed uint32, minSavings int) ([]byte, bool) { + if minSavings < 0 { + minSavings = 0 + } + instr, err := deltify(base, target, seed) + if err != nil { + return nil, false + } + delta, err := deltaEncode(len(base), len(target), instr) + if err != nil { + return nil, false + } + if len(delta)+minSavings >= len(target) { + return nil, false + } + return delta, true +} diff --git a/delta_write_select.go b/delta_write_select.go new file mode 100644 index 00000000..ad8e0d0e --- /dev/null +++ b/delta_write_select.go @@ -0,0 +1,56 @@ +package furgit + +const defaultDeltaWindow = 64 + +type objectToPack struct { + id Hash + ty ObjectType + body []byte + offset uint64 + deltaDepth int +} + +type deltaContext struct { + window int + candidates []*objectToPack +} + +func (ctx *deltaContext) addCandidate(obj *objectToPack) { + if ctx.window <= 0 { + return + } + ctx.candidates = append(ctx.candidates, obj) + if len(ctx.candidates) > ctx.window { + over := len(ctx.candidates) - ctx.window + ctx.candidates = ctx.candidates[over:] + } +} + +func pickDeltaBase(ctx *deltaContext, obj *objectToPack, seed uint32, minSavings, maxDepth int) (*objectToPack, []byte) { + if ctx == nil || len(ctx.candidates) == 0 { + return nil, nil + } + if maxDepth <= 0 { + maxDepth = 1 + } + var bestBase *objectToPack + var bestDelta []byte + for i := len(ctx.candidates) - 1; i >= 0; i-- { + base := ctx.candidates[i] + if base.ty != ObjectTypeBlob { + continue + } + if base.deltaDepth >= maxDepth { + continue + } + delta, ok := deltaTry(base.body, obj.body, seed, minSavings) + if !ok { + continue + } + if bestDelta == nil || len(delta) < len(bestDelta) { + bestDelta = delta + bestBase = base + } + } + return bestBase, bestDelta +} diff --git a/internal/murmurhash2/murmurhash2.go b/internal/murmurhash2/murmurhash2.go new file mode 100644 index 00000000..c728f932 --- /dev/null +++ b/internal/murmurhash2/murmurhash2.go @@ -0,0 +1,42 @@ +// Package murmurhash2 provides a non-cryptographic hash. +package murmurhash2 + +import "encoding/binary" + +// Sum32 computes the MurmurHash2 value for key with the provided seed. +func Sum32(key []byte, seed uint32) uint32 { + const ( + m uint32 = 0x5bd1e995 + r uint32 = 24 + ) + + h := seed ^ uint32(len(key)) + i := 0 + for len(key)-i >= 4 { + k := binary.LittleEndian.Uint32(key[i:]) + k *= m + k ^= k >> r + k *= m + + h *= m + h ^= k + i += 4 + } + + switch len(key) - i { + case 3: + h ^= uint32(key[i+2]) << 16 + fallthrough + case 2: + h ^= uint32(key[i+1]) << 8 + fallthrough + case 1: + h ^= uint32(key[i]) + h *= m + } + + h ^= h >> 13 + h *= m + h ^= h >> 15 + return h +} diff --git a/packed_read_pack.go b/packed_read_pack.go index 628fa258..593fe97f 100644 --- a/packed_read_pack.go +++ b/packed_read_pack.go @@ -108,24 +108,6 @@ func packSectionInflate(pf *packFile, start uint64, sizeHint int) (bufpool.Buffe return body, nil } -func packDeltaReadOfsDistance(data []byte) (uint64, int, error) { - if len(data) == 0 { - return 0, 0, io.ErrUnexpectedEOF - } - b := data[0] - dist := uint64(b & 0x7f) - consumed := 1 - for (b & 0x80) != 0 { - if consumed >= len(data) { - return 0, 0, io.ErrUnexpectedEOF - } - b = data[consumed] - consumed++ - dist = ((dist + 1) << 7) + uint64(b&0x7f) - } - return dist, consumed, nil -} - type packKey struct { path string ofs uint64 @@ -344,138 +326,6 @@ func (repo *Repository) packBodyResolveWithin(pf *packFile, ofs uint64) (ObjectT return resultTy, body, nil } -func packDeltaApply(base, delta bufpool.Buffer) (bufpool.Buffer, error) { - pos := 0 - baseBytes := base.Bytes() - deltaBytes := delta.Bytes() - srcSize, err := packVarintRead(deltaBytes, &pos) - if err != nil { - return bufpool.Buffer{}, err - } - dstSize, err := packVarintRead(deltaBytes, &pos) - if err != nil { - return bufpool.Buffer{}, err - } - if srcSize != len(baseBytes) { - return bufpool.Buffer{}, ErrInvalidObject - } - out := bufpool.Borrow(dstSize) - out.Resize(dstSize) - outBytes := out.Bytes() - outPos := 0 - - for pos < len(deltaBytes) { - op := deltaBytes[pos] - pos++ - switch { - case op&0x80 != 0: - off := 0 - n := 0 - if op&0x01 != 0 { - if pos >= len(deltaBytes) { - out.Release() - return bufpool.Buffer{}, ErrInvalidObject - } - off |= int(deltaBytes[pos]) - pos++ - } - if op&0x02 != 0 { - if pos >= len(deltaBytes) { - out.Release() - return bufpool.Buffer{}, ErrInvalidObject - } - off |= int(deltaBytes[pos]) << 8 - pos++ - } - if op&0x04 != 0 { - if pos >= len(deltaBytes) { - out.Release() - return bufpool.Buffer{}, ErrInvalidObject - } - off |= int(deltaBytes[pos]) << 16 - pos++ - } - if op&0x08 != 0 { - if pos >= len(deltaBytes) { - out.Release() - return bufpool.Buffer{}, ErrInvalidObject - } - off |= int(deltaBytes[pos]) << 24 - pos++ - } - if op&0x10 != 0 { - if pos >= len(deltaBytes) { - out.Release() - return bufpool.Buffer{}, ErrInvalidObject - } - n |= int(deltaBytes[pos]) - pos++ - } - if op&0x20 != 0 { - if pos >= len(deltaBytes) { - out.Release() - return bufpool.Buffer{}, ErrInvalidObject - } - n |= int(deltaBytes[pos]) << 8 - pos++ - } - if op&0x40 != 0 { - if pos >= len(deltaBytes) { - out.Release() - return bufpool.Buffer{}, ErrInvalidObject - } - n |= int(deltaBytes[pos]) << 16 - pos++ - } - if n == 0 { - n = 0x10000 - } - if off+n > len(baseBytes) || outPos+n > len(outBytes) { - out.Release() - return bufpool.Buffer{}, ErrInvalidObject - } - copy(outBytes[outPos:], baseBytes[off:off+n]) - outPos += n - case op != 0: - n := int(op) - if pos+n > len(deltaBytes) || outPos+n > len(outBytes) { - out.Release() - return bufpool.Buffer{}, ErrInvalidObject - } - copy(outBytes[outPos:], deltaBytes[pos:pos+n]) - pos += n - outPos += n - default: - out.Release() - return bufpool.Buffer{}, ErrInvalidObject - } - } - - if outPos != len(outBytes) { - out.Release() - return bufpool.Buffer{}, ErrInvalidObject - } - return out, nil -} - -func packVarintRead(buf []byte, pos *int) (int, error) { - res := 0 - shift := 0 - for { - if *pos >= len(buf) { - return 0, ErrInvalidObject - } - b := buf[*pos] - *pos++ - res |= int(b&0x7f) << shift - if (b & 0x80) == 0 { - break - } - shift += 7 - } - return res, nil -} - type packFile struct { relPath string size int64 diff --git a/packed_write_pack.go b/packed_write_pack.go index 435c9edb..0bcaf9ca 100644 --- a/packed_write_pack.go +++ b/packed_write_pack.go @@ -115,19 +115,84 @@ func (pw *packWriter) WriteObject(ty ObjectType, body []byte) error { } func (pw *packWriter) WriteOfsDelta(baseOffset uint64, baseSize, resultSize int, delta []byte) error { - _ = baseOffset - _ = baseSize - _ = resultSize - _ = delta - return errPackDeltaUnimplemented + if pw == nil || !pw.wroteHeader { + return ErrInvalidObject + } + if baseSize < 0 || resultSize < 0 { + return ErrInvalidObject + } + if delta == nil { + delta = []byte{} + } + deltaSize := len(delta) + if deltaSize <= 0 { + return ErrInvalidObject + } + currentOffset := pw.bytesWritten + if baseOffset >= currentOffset { + return ErrInvalidObject + } + dist := currentOffset - baseOffset + + hdr, err := packHeaderEncode(ObjectTypeOfsDelta, deltaSize) + if err != nil { + return err + } + if err := pw.writePacked(hdr); err != nil { + return err + } + ofs, err := packOfsEncode(dist) + if err != nil { + return err + } + if err := pw.writePacked(ofs); err != nil { + return err + } + + zw := zlib.NewWriter(&packHashWriter{pw: pw}) + if _, err := zw.Write(delta); err != nil { + _ = zw.Close() + return err + } + return zw.Close() } func (pw *packWriter) WriteRefDelta(base Hash, baseSize, resultSize int, delta []byte) error { - _ = base - _ = baseSize - _ = resultSize - _ = delta - return errPackDeltaUnimplemented + if pw == nil || !pw.wroteHeader { + return ErrInvalidObject + } + if baseSize < 0 || resultSize < 0 { + return ErrInvalidObject + } + if delta == nil { + delta = []byte{} + } + deltaSize := len(delta) + if deltaSize <= 0 { + return ErrInvalidObject + } + baseBytes := base.Bytes() + if len(baseBytes) == 0 { + return ErrInvalidObject + } + + hdr, err := packHeaderEncode(ObjectTypeRefDelta, deltaSize) + if err != nil { + return err + } + if err := pw.writePacked(hdr); err != nil { + return err + } + if err := pw.writePacked(baseBytes); err != nil { + return err + } + + zw := zlib.NewWriter(&packHashWriter{pw: pw}) + if _, err := zw.Write(delta); err != nil { + _ = zw.Close() + return err + } + return zw.Close() } func (pw *packWriter) Close() (Hash, error) { @@ -237,7 +302,7 @@ func (repo *Repository) packWrite(w io.Writer, objects []Hash, opts packWriteOpt if repo == nil { return Hash{}, ErrInvalidObject } - if opts.EnableDeltas || opts.EnableThinPack { + if opts.EnableThinPack { return Hash{}, errPackDeltaUnimplemented } if len(objects) > int(^uint32(0)) { @@ -252,13 +317,45 @@ func (repo *Repository) packWrite(w io.Writer, objects []Hash, opts packWriteOpt return Hash{}, err } + var dctx deltaContext + if opts.EnableDeltas { + dctx.window = defaultDeltaWindow + } + deltaSeed := uint32(0) + for _, id := range objects { ty, body, err := repo.ReadObjectTypeRaw(id) if err != nil { return Hash{}, err } - if err := pw.WriteObject(ty, body); err != nil { - return Hash{}, err + obj := &objectToPack{ + id: id, + ty: ty, + body: body, + } + startOffset := pw.bytesWritten + wroteDelta := false + + if opts.EnableDeltas && ty == ObjectTypeBlob { + base, delta := pickDeltaBase(&dctx, obj, deltaSeed, opts.MinDeltaSavings, opts.MaxDeltaDepth) + if base != nil && delta != nil { + if err := pw.WriteOfsDelta(base.offset, len(base.body), len(body), delta); err != nil { + return Hash{}, err + } + wroteDelta = true + obj.deltaDepth = base.deltaDepth + 1 + } + } + if !wroteDelta { + if err := pw.WriteObject(ty, body); err != nil { + return Hash{}, err + } + obj.deltaDepth = 0 + } + obj.offset = startOffset + + if opts.EnableDeltas && ty == ObjectTypeBlob { + dctx.addCandidate(obj) } } diff --git a/packed_write_test.go b/packed_write_test.go index da7ecfa7..08f4484b 100644 --- a/packed_write_test.go +++ b/packed_write_test.go @@ -3,12 +3,16 @@ package furgit import ( "bytes" "crypto/rand" - "errors" "fmt" "os" + "os/exec" "path/filepath" "strings" "testing" + + "codeberg.org/lindenii/furgit/internal/adler32" + "codeberg.org/lindenii/furgit/internal/bufpool" + "codeberg.org/lindenii/furgit/internal/flatex" ) func TestPackHeaderEncodeParseRoundtrip(t *testing.T) { @@ -169,7 +173,20 @@ func TestPackWriteNoDeltas(t *testing.T) { _ = os.Remove(idxPath) }() - _ = gitCmd(t, repoPath, "index-pack", "-o", idxPath, packPath) + if err := checkPackStream(packPath, repo.hashAlgo, len(objects)); err != nil { + t.Fatalf("pack stream invalid: %v", err) + } + + cmd := exec.Command("git", "index-pack", "-o", idxPath, packPath) + cmd.Dir = repoPath + cmd.Env = append(os.Environ(), + "GIT_CONFIG_GLOBAL=/dev/null", + "GIT_CONFIG_SYSTEM=/dev/null", + ) + output, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git index-pack failed: %v\n%s", err, output) + } verifyOut := gitCmd(t, repoPath, "verify-pack", "-v", idxPath) seen := make(map[string]struct{}) @@ -204,21 +221,318 @@ func TestPackWriteNoDeltas(t *testing.T) { _ = gitCmd(t, repoPath, "fsck", "--full", "--strict") } -func TestPackWriteDeltasUnimplemented(t *testing.T) { +func TestPackWriteDeltas(t *testing.T) { repoPath, cleanup := setupTestRepo(t) defer cleanup() + workDir, cleanupWork := setupWorkDir(t) + defer cleanupWork() + + const ( + fileCount = 200 + fileSize = 2048 + ) + base := bytes.Repeat([]byte("delta-base-"), fileSize/10) + for i := 0; i < fileCount; i++ { + buf := make([]byte, len(base)) + copy(buf, base) + buf[i%len(buf)] ^= byte(i) + name := filepath.Join(workDir, fmt.Sprintf("delta%04d.txt", i)) + if err := os.WriteFile(name, buf, 0o644); err != nil { + t.Fatalf("failed to write %s: %v", name, err) + } + } + + gitCmd(t, repoPath, "--work-tree="+workDir, "add", ".") + gitCmd(t, repoPath, "--work-tree="+workDir, "commit", "-m", "Delta commit") + commitHash := gitCmd(t, repoPath, "rev-parse", "HEAD") + + commitBody := gitCatFile(t, repoPath, "commit", commitHash) + lines := bytes.Split(commitBody, []byte{'\n'}) + if len(lines) == 0 || !bytes.HasPrefix(lines[0], []byte("tree ")) { + t.Fatalf("commit missing tree header") + } + treeHash := strings.TrimSpace(string(bytes.TrimPrefix(lines[0], []byte("tree ")))) + + lsTree := gitCmd(t, repoPath, "ls-tree", "-r", treeHash) + var blobHashes []string + for _, line := range strings.Split(lsTree, "\n") { + if line == "" { + continue + } + fields := strings.Fields(line) + if len(fields) < 3 { + t.Fatalf("unexpected ls-tree line: %q", line) + } + blobHashes = append(blobHashes, fields[2]) + } + repo, err := OpenRepository(repoPath) if err != nil { t.Fatalf("OpenRepository failed: %v", err) } defer func() { _ = repo.Close() }() - buf := new(bytes.Buffer) - _, err = repo.packWrite(buf, nil, packWriteOptions{EnableDeltas: true}) - if !errors.Is(err, errPackDeltaUnimplemented) { - t.Fatalf("expected errPackDeltaUnimplemented, got %v", err) + var objects []Hash + commitID, _ := repo.ParseHash(commitHash) + objects = append(objects, commitID) + treeID, _ := repo.ParseHash(treeHash) + objects = append(objects, treeID) + for _, bh := range blobHashes { + id, _ := repo.ParseHash(bh) + objects = append(objects, id) + } + expectedOids := append([]string{commitHash, treeHash}, blobHashes...) + + packDir := filepath.Join(repoPath, "objects", "pack") + if err := os.MkdirAll(packDir, 0o755); err != nil { + t.Fatalf("failed to create pack dir: %v", err) + } + pf, err := os.CreateTemp(packDir, "furgit-delta-test-*.pack") + if err != nil { + t.Fatalf("failed to create pack file: %v", err) + } + packPath := pf.Name() + idxPath := strings.TrimSuffix(packPath, ".pack") + ".idx" + if _, err := repo.packWrite(pf, objects, packWriteOptions{ + EnableDeltas: true, + MinDeltaSavings: 1, + }); err != nil { + _ = pf.Close() + t.Fatalf("packWrite failed: %v", err) + } + if err := pf.Close(); err != nil { + t.Fatalf("failed to close pack file: %v", err) + } + + defer func() { + _ = os.Remove(packPath) + _ = os.Remove(idxPath) + }() + + if err := checkPackStream(packPath, repo.hashAlgo, len(objects)); err != nil { + t.Fatalf("pack stream invalid: %v", err) + } + + cmd := exec.Command("git", "index-pack", "-o", idxPath, packPath) + cmd.Dir = repoPath + cmd.Env = append(os.Environ(), + "GIT_CONFIG_GLOBAL=/dev/null", + "GIT_CONFIG_SYSTEM=/dev/null", + ) + output, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git index-pack failed: %v\n%s", err, output) + } + + verifyOut := gitCmd(t, repoPath, "verify-pack", "-v", idxPath) + seen := make(map[string]struct{}) + for _, line := range strings.Split(verifyOut, "\n") { + if strings.TrimSpace(line) == "" { + continue + } + if strings.HasPrefix(line, "chain length") || strings.HasPrefix(line, "non delta") { + continue + } + parts := strings.Fields(line) + if len(parts) == 0 { + continue + } + seen[parts[0]] = struct{}{} + } + for _, oid := range expectedOids { + if _, ok := seen[oid]; !ok { + t.Fatalf("verify-pack missing object %s", oid) + } + } + + for _, oid := range expectedOids { + if err := removeLooseObject(repoPath, oid); err != nil { + t.Fatalf("remove loose object %s: %v", oid, err) + } + } + for _, oid := range expectedOids { + _ = gitCmd(t, repoPath, "cat-file", "-p", oid) + } + + _ = gitCmd(t, repoPath, "fsck", "--full", "--strict") +} + +func checkPackStream(path string, algo hashAlgorithm, objectCount int) error { + data, err := os.ReadFile(path) + if err != nil { + return err + } + if len(data) < 12 { + return ErrInvalidObject + } + if readBE32(data[0:4]) != packMagic || readBE32(data[4:8]) != packVersion2 { + return ErrInvalidObject + } + pos := 12 + hashSize := algo.Size() + type objEntry struct { + offset uint64 + ty ObjectType + body []byte + } + byOffset := make(map[uint64]objEntry, objectCount) + byHash := make(map[string]objEntry, objectCount) + for i := 0; i < objectCount; i++ { + objOffset := uint64(pos) + ty, size, consumed, err := packHeaderParse(data[pos:]) + if err != nil { + return fmt.Errorf("obj %d header at %d: %v", i, pos, err) + } + pos += consumed + baseOffset := uint64(0) + baseTy := ObjectTypeInvalid + var baseBody []byte + var baseHash Hash + switch ty { + case ObjectTypeOfsDelta: + dist, distConsumed, err := packDeltaReadOfsDistance(data[pos:]) + if err != nil { + return fmt.Errorf("obj %d ofs at %d: %v", i, pos, err) + } + pos += distConsumed + if dist == 0 || dist > objOffset { + return fmt.Errorf("obj %d ofs at %d: invalid dist", i, pos) + } + baseOffset = objOffset - dist + base, ok := byOffset[baseOffset] + if !ok { + return fmt.Errorf("obj %d ofs at %d: missing base", i, pos) + } + baseTy = base.ty + baseBody = base.body + case ObjectTypeRefDelta: + if pos+hashSize > len(data) { + return ErrInvalidObject + } + copy(baseHash.data[:], data[pos:pos+hashSize]) + baseHash.algo = algo + baseEntry, ok := byHash[baseHash.String()] + if !ok { + return fmt.Errorf("obj %d ref base not found", i) + } + baseTy = baseEntry.ty + baseBody = baseEntry.body + pos += hashSize + default: + } + + payload, zconsumed, err := consumeZlibStream(data[pos:], size) + if err != nil { + return fmt.Errorf("obj %d zlib at %d: %v", i, pos, err) + } + pos += zconsumed + switch ty { + case ObjectTypeOfsDelta: + if baseBody == nil { + return fmt.Errorf("obj %d missing base body", i) + } + baseSize, resultSize, err := readDeltaSizes(payload) + if err != nil { + return fmt.Errorf("obj %d delta sizes: %v", i, err) + } + if baseSize != len(baseBody) { + return fmt.Errorf("obj %d delta base size mismatch: got %d want %d", i, baseSize, len(baseBody)) + } + out, err := packDeltaApply(bufpool.FromOwned(baseBody), bufpool.FromOwned(payload)) + if err != nil { + return fmt.Errorf("obj %d delta apply: %v", i, err) + } + body := append([]byte(nil), out.Bytes()...) + out.Release() + if resultSize != len(body) { + return fmt.Errorf("obj %d delta result size mismatch: got %d want %d", i, len(body), resultSize) + } + byOffset[objOffset] = objEntry{offset: objOffset, ty: baseTy, body: body} + case ObjectTypeRefDelta: + if baseBody == nil { + return fmt.Errorf("obj %d missing ref base body", i) + } + baseSize, resultSize, err := readDeltaSizes(payload) + if err != nil { + return fmt.Errorf("obj %d ref delta sizes: %v", i, err) + } + if baseSize != len(baseBody) { + return fmt.Errorf("obj %d ref delta base size mismatch: got %d want %d", i, baseSize, len(baseBody)) + } + out, err := packDeltaApply(bufpool.FromOwned(baseBody), bufpool.FromOwned(payload)) + if err != nil { + return fmt.Errorf("obj %d ref delta apply: %v", i, err) + } + body := append([]byte(nil), out.Bytes()...) + out.Release() + if resultSize != len(body) { + return fmt.Errorf("obj %d ref delta result size mismatch: got %d want %d", i, len(body), resultSize) + } + byOffset[objOffset] = objEntry{offset: objOffset, ty: baseTy, body: body} + default: + if size >= 0 && len(payload) != size { + return fmt.Errorf("obj %d size mismatch: got %d want %d", i, len(payload), size) + } + body := append([]byte(nil), payload...) + byOffset[objOffset] = objEntry{offset: objOffset, ty: ty, body: body} + } + + entry := byOffset[objOffset] + if entry.body != nil && entry.ty != ObjectTypeInvalid { + hdr, err := headerForType(entry.ty, entry.body) + if err != nil { + return err + } + raw := append(hdr, entry.body...) + hash := algo.Sum(raw) + byHash[hash.String()] = entry + } + } + return nil +} + +func consumeZlibStream(src []byte, sizeHint int) ([]byte, int, error) { + if len(src) < 6 { + return nil, 0, fmt.Errorf("zlib too short") + } + cmf := src[0] + flg := src[1] + if (cmf&0x0f != 8) || (cmf>>4 > 7) || (uint16(cmf)<<8|uint16(flg))%31 != 0 { + return nil, 0, fmt.Errorf("zlib header invalid") + } + if flg&0x20 != 0 { + return nil, 0, fmt.Errorf("zlib dict not supported") + } + deflateData := src[2:] + out, consumed, err := flatex.DecompressSized(deflateData, sizeHint) + if err != nil { + return nil, 0, err + } + payload := append([]byte(nil), out.Bytes()...) + out.Release() + total := 2 + consumed + 4 + if total > len(src) { + return nil, 0, fmt.Errorf("zlib truncated") + } + expected := readBE32(src[2+consumed : 2+consumed+4]) + if expected != adler32.Checksum(payload) { + return nil, 0, fmt.Errorf("zlib checksum mismatch") + } + return payload, total, nil +} + +func readDeltaSizes(delta []byte) (int, int, error) { + pos := 0 + baseSize, err := packVarintRead(delta, &pos) + if err != nil { + return 0, 0, err + } + resultSize, err := packVarintRead(delta, &pos) + if err != nil { + return 0, 0, err } + return baseSize, resultSize, nil } func removeLooseObject(repoPath, oid string) error { |
