diff options
| author | 2025-11-19 08:00:00 +0800 | |
|---|---|---|
| committer | 2025-11-19 08:00:00 +0800 | |
| commit | 676f08cc4b0deb397033598ac96c0d0272fcae05 (patch) | |
| tree | a174be9361b43e9d2b031f70c9df15401f0adf00 /internal/adler32 | |
| parent | Well, we use a dependency (golang.org/x/sys/cpu) now... (diff) | |
| signature | No signature | |
SIMD with AVX2 on supported AMD64 machines
Some help from gpt-5.1-thinking taken: used wrong register size for
the weighted sum at first, so it was truncating the second half of
our block; also there was an overflow from the modulus and stuff.
Unfortunately the AVX2 adler32 is only about 20% faster than the
generic version which doesn't make for much.
Diffstat (limited to 'internal/adler32')
| -rw-r--r-- | internal/adler32/adler32.go | 92 | ||||
| -rw-r--r-- | internal/adler32/adler32_amd64.go | 130 | ||||
| -rw-r--r-- | internal/adler32/adler32_amd64_avx2.s | 173 | ||||
| -rw-r--r-- | internal/adler32/adler32_generic.go | 96 | ||||
| -rw-r--r-- | internal/adler32/bench_test.go | 24 |
5 files changed, 423 insertions, 92 deletions
diff --git a/internal/adler32/adler32.go b/internal/adler32/adler32.go index f8a289f5..c349cd5d 100644 --- a/internal/adler32/adler32.go +++ b/internal/adler32/adler32.go @@ -84,98 +84,6 @@ func (d *digest) Clone() (hash.Cloner, error) { return &r, nil } -// Add p to the running checksum d. -func update(d digest, p []byte) digest { - s1, s2 := uint32(d&0xffff), uint32(d>>16) - - for len(p) > 0 { - var q []byte - if len(p) > nmax { - p, q = p[:nmax], p[nmax:] - } - - for len(p) >= 32 { - v := p[:32] - p = p[32:] - - s1 += uint32(v[0]) - s2 += s1 - s1 += uint32(v[1]) - s2 += s1 - s1 += uint32(v[2]) - s2 += s1 - s1 += uint32(v[3]) - s2 += s1 - s1 += uint32(v[4]) - s2 += s1 - s1 += uint32(v[5]) - s2 += s1 - s1 += uint32(v[6]) - s2 += s1 - s1 += uint32(v[7]) - s2 += s1 - s1 += uint32(v[8]) - s2 += s1 - s1 += uint32(v[9]) - s2 += s1 - s1 += uint32(v[10]) - s2 += s1 - s1 += uint32(v[11]) - s2 += s1 - s1 += uint32(v[12]) - s2 += s1 - s1 += uint32(v[13]) - s2 += s1 - s1 += uint32(v[14]) - s2 += s1 - s1 += uint32(v[15]) - s2 += s1 - s1 += uint32(v[16]) - s2 += s1 - s1 += uint32(v[17]) - s2 += s1 - s1 += uint32(v[18]) - s2 += s1 - s1 += uint32(v[19]) - s2 += s1 - s1 += uint32(v[20]) - s2 += s1 - s1 += uint32(v[21]) - s2 += s1 - s1 += uint32(v[22]) - s2 += s1 - s1 += uint32(v[23]) - s2 += s1 - s1 += uint32(v[24]) - s2 += s1 - s1 += uint32(v[25]) - s2 += s1 - s1 += uint32(v[26]) - s2 += s1 - s1 += uint32(v[27]) - s2 += s1 - s1 += uint32(v[28]) - s2 += s1 - s1 += uint32(v[29]) - s2 += s1 - s1 += uint32(v[30]) - s2 += s1 - s1 += uint32(v[31]) - s2 += s1 - } - - for _, x := range p { - s1 += uint32(x) - s2 += s1 - } - - s1 %= mod - s2 %= mod - p = q - } - return digest(s2<<16 | s1) -} - func (d *digest) Write(p []byte) (nn int, err error) { *d = update(*d, p) return len(p), nil diff --git a/internal/adler32/adler32_amd64.go b/internal/adler32/adler32_amd64.go new file mode 100644 index 00000000..a109ccc9 --- /dev/null +++ b/internal/adler32/adler32_amd64.go @@ -0,0 +1,130 @@ +//go:build amd64 && !purego + +package adler32 + +import "golang.org/x/sys/cpu" + +var updateFn func(d digest, p []byte) digest = updateGeneric + +func init() { + switch { + case cpu.X86.HasAVX2: + updateFn = updateAVX2 + default: + updateFn = updateGeneric + } +} + +func update(d digest, p []byte) digest { + return updateFn(d, p) +} + +func updateGeneric(d digest, p []byte) digest { + s1, s2 := uint32(d&0xffff), uint32(d>>16) + + for len(p) > 0 { + var q []byte + if len(p) > nmax { + p, q = p[:nmax], p[nmax:] + } + + for len(p) >= 32 { + v := p[:32] + p = p[32:] + + s1 += uint32(v[0]) + s2 += s1 + s1 += uint32(v[1]) + s2 += s1 + s1 += uint32(v[2]) + s2 += s1 + s1 += uint32(v[3]) + s2 += s1 + s1 += uint32(v[4]) + s2 += s1 + s1 += uint32(v[5]) + s2 += s1 + s1 += uint32(v[6]) + s2 += s1 + s1 += uint32(v[7]) + s2 += s1 + s1 += uint32(v[8]) + s2 += s1 + s1 += uint32(v[9]) + s2 += s1 + s1 += uint32(v[10]) + s2 += s1 + s1 += uint32(v[11]) + s2 += s1 + s1 += uint32(v[12]) + s2 += s1 + s1 += uint32(v[13]) + s2 += s1 + s1 += uint32(v[14]) + s2 += s1 + s1 += uint32(v[15]) + s2 += s1 + s1 += uint32(v[16]) + s2 += s1 + s1 += uint32(v[17]) + s2 += s1 + s1 += uint32(v[18]) + s2 += s1 + s1 += uint32(v[19]) + s2 += s1 + s1 += uint32(v[20]) + s2 += s1 + s1 += uint32(v[21]) + s2 += s1 + s1 += uint32(v[22]) + s2 += s1 + s1 += uint32(v[23]) + s2 += s1 + s1 += uint32(v[24]) + s2 += s1 + s1 += uint32(v[25]) + s2 += s1 + s1 += uint32(v[26]) + s2 += s1 + s1 += uint32(v[27]) + s2 += s1 + s1 += uint32(v[28]) + s2 += s1 + s1 += uint32(v[29]) + s2 += s1 + s1 += uint32(v[30]) + s2 += s1 + s1 += uint32(v[31]) + s2 += s1 + } + + for i := 0; i < len(p); i++ { + x := p[i] + s1 += uint32(x) + s2 += s1 + } + + s1 %= mod + s2 %= mod + p = q + } + + return digest(s2<<16 | s1) +} + +//go:noescape +func adler32AVX2(state uint32, b []byte) uint32 + +func updateAVX2(d digest, p []byte) digest { + s := uint32(d) + for len(p) > 0 { + chunk := p + if len(chunk) > nmax { + chunk = p[:nmax] + } + s = adler32AVX2(s, chunk) + + p = p[len(chunk):] + } + return digest(s) +} diff --git a/internal/adler32/adler32_amd64_avx2.s b/internal/adler32/adler32_amd64_avx2.s new file mode 100644 index 00000000..da8402a3 --- /dev/null +++ b/internal/adler32/adler32_amd64_avx2.s @@ -0,0 +1,173 @@ +//go:build amd64 && !purego + +#include "textflag.h" + +// func adler32AVX2(state uint32, b []byte) uint32 +// state = s2<<16 | s1 +// len(b) <= nmax enforced by Go wrapper +TEXT ·adler32AVX2(SB), NOSPLIT, $0-40 + // state uint32 at 0 + // b.ptr *byte at 8 + // b.len int at 16 + // b.cap int at 24 + // ret uint32 at 32 + + MOVL state+0(FP), AX + MOVQ b_base+8(FP), SI + MOVQ b_len+16(FP), CX + + // s1 in R10d, s2 in R11d + MOVL AX, R10 + ANDL $0xffff, R10 // s1 = low 16 bits + SHRL $16, AX + MOVL AX, R11 // s2 = high 16 bits + + // Just use the scalar tail if len < 32 + CMPQ CX, $32 + JLT tail + +loop32: + VMOVDQU (SI), X0 + VMOVDQU 16(SI), X1 + ADDQ $32, SI + SUBQ $32, CX + + // Split 32 bytes into uint16 lanes (four groups of eight bytes) + VPMOVZXBW X0, Y2 + VPMOVZXBW X1, Y3 + VEXTRACTI128 $0, Y2, X4 + VEXTRACTI128 $1, Y2, X5 + VEXTRACTI128 $0, Y3, X6 + VEXTRACTI128 $1, Y3, X7 + + // Process each group using prefix sums (only adds, no multiplies) + // Group 0 (bytes 0..7) + VMOVDQA X4, X8 + VPSLLDQ $2, X8, X9 + VPADDW X9, X8, X8 + VPSLLDQ $4, X8, X9 + VPADDW X9, X8, X8 + VPSLLDQ $8, X8, X9 + VPADDW X9, X8, X8 + VPEXTRW $7, X8, R8 + VPSRLDQ $8, X8, X9 + VPADDW X9, X8, X8 + VPSRLDQ $4, X8, X9 + VPADDW X9, X8, X8 + VPSRLDQ $2, X8, X9 + VPADDW X9, X8, X8 + VPEXTRW $0, X8, R9 + MOVL R10, R12 + SHLL $3, R12 + ADDL R12, R11 + ADDL R9, R11 + ADDL R8, R10 + + // Group 1 (bytes 8..15) + VMOVDQA X5, X8 + VPSLLDQ $2, X8, X9 + VPADDW X9, X8, X8 + VPSLLDQ $4, X8, X9 + VPADDW X9, X8, X8 + VPSLLDQ $8, X8, X9 + VPADDW X9, X8, X8 + VPEXTRW $7, X8, R8 + VPSRLDQ $8, X8, X9 + VPADDW X9, X8, X8 + VPSRLDQ $4, X8, X9 + VPADDW X9, X8, X8 + VPSRLDQ $2, X8, X9 + VPADDW X9, X8, X8 + VPEXTRW $0, X8, R9 + MOVL R10, R12 + SHLL $3, R12 + ADDL R12, R11 + ADDL R9, R11 + ADDL R8, R10 + + // Group 2 (bytes 16..23) + VMOVDQA X6, X8 + VPSLLDQ $2, X8, X9 + VPADDW X9, X8, X8 + VPSLLDQ $4, X8, X9 + VPADDW X9, X8, X8 + VPSLLDQ $8, X8, X9 + VPADDW X9, X8, X8 + VPEXTRW $7, X8, R8 + VPSRLDQ $8, X8, X9 + VPADDW X9, X8, X8 + VPSRLDQ $4, X8, X9 + VPADDW X9, X8, X8 + VPSRLDQ $2, X8, X9 + VPADDW X9, X8, X8 + VPEXTRW $0, X8, R9 + MOVL R10, R12 + SHLL $3, R12 + ADDL R12, R11 + ADDL R9, R11 + ADDL R8, R10 + + // Group 3 (bytes 24..31) + VMOVDQA X7, X8 + VPSLLDQ $2, X8, X9 + VPADDW X9, X8, X8 + VPSLLDQ $4, X8, X9 + VPADDW X9, X8, X8 + VPSLLDQ $8, X8, X9 + VPADDW X9, X8, X8 + VPEXTRW $7, X8, R8 + VPSRLDQ $8, X8, X9 + VPADDW X9, X8, X8 + VPSRLDQ $4, X8, X9 + VPADDW X9, X8, X8 + VPSRLDQ $2, X8, X9 + VPADDW X9, X8, X8 + VPEXTRW $0, X8, R9 + MOVL R10, R12 + SHLL $3, R12 + ADDL R12, R11 + ADDL R9, R11 + ADDL R8, R10 + + CMPQ CX, $32 + JGE loop32 + +tail: + TESTQ CX, CX + JEQ done + +tail_loop: + MOVBLZX (SI), R8 + INCQ SI + DECQ CX + + ADDL R8, R10 + ADDL R10, R11 + + TESTQ CX, CX + JNE tail_loop + +done: + MOVL $65521, R8 + + // Reduce s1 %= mod + MOVL R10, AX + XORL DX, DX + DIVL R8 + MOVL DX, R10 + + // Reduce s2 %= mod + MOVL R11, AX + XORL DX, DX + DIVL R8 + MOVL DX, R11 + + MOVL R11, AX + SHLL $16, AX + ANDL $0xffff, R10 + ORL R10, AX + + VZEROUPPER + + MOVL AX, ret+32(FP) + RET diff --git a/internal/adler32/adler32_generic.go b/internal/adler32/adler32_generic.go new file mode 100644 index 00000000..8ba330a5 --- /dev/null +++ b/internal/adler32/adler32_generic.go @@ -0,0 +1,96 @@ +//go:build !amd64 || purego + +package adler32 + +func update(d digest, p []byte) digest { + s1, s2 := uint32(d&0xffff), uint32(d>>16) + + for len(p) > 0 { + var q []byte + if len(p) > nmax { + p, q = p[:nmax], p[nmax:] + } + + for len(p) >= 32 { + v := p[:32] + p = p[32:] + + s1 += uint32(v[0]) + s2 += s1 + s1 += uint32(v[1]) + s2 += s1 + s1 += uint32(v[2]) + s2 += s1 + s1 += uint32(v[3]) + s2 += s1 + s1 += uint32(v[4]) + s2 += s1 + s1 += uint32(v[5]) + s2 += s1 + s1 += uint32(v[6]) + s2 += s1 + s1 += uint32(v[7]) + s2 += s1 + s1 += uint32(v[8]) + s2 += s1 + s1 += uint32(v[9]) + s2 += s1 + s1 += uint32(v[10]) + s2 += s1 + s1 += uint32(v[11]) + s2 += s1 + s1 += uint32(v[12]) + s2 += s1 + s1 += uint32(v[13]) + s2 += s1 + s1 += uint32(v[14]) + s2 += s1 + s1 += uint32(v[15]) + s2 += s1 + s1 += uint32(v[16]) + s2 += s1 + s1 += uint32(v[17]) + s2 += s1 + s1 += uint32(v[18]) + s2 += s1 + s1 += uint32(v[19]) + s2 += s1 + s1 += uint32(v[20]) + s2 += s1 + s1 += uint32(v[21]) + s2 += s1 + s1 += uint32(v[22]) + s2 += s1 + s1 += uint32(v[23]) + s2 += s1 + s1 += uint32(v[24]) + s2 += s1 + s1 += uint32(v[25]) + s2 += s1 + s1 += uint32(v[26]) + s2 += s1 + s1 += uint32(v[27]) + s2 += s1 + s1 += uint32(v[28]) + s2 += s1 + s1 += uint32(v[29]) + s2 += s1 + s1 += uint32(v[30]) + s2 += s1 + s1 += uint32(v[31]) + s2 += s1 + } + + for i := 0; i < len(p); i++ { + x := p[i] + s1 += uint32(x) + s2 += s1 + } + + s1 %= mod + s2 %= mod + p = q + } + + return digest(s2<<16 | s1) +} diff --git a/internal/adler32/bench_test.go b/internal/adler32/bench_test.go new file mode 100644 index 00000000..fd1e4710 --- /dev/null +++ b/internal/adler32/bench_test.go @@ -0,0 +1,24 @@ +package adler32_test + +import ( + "testing" + + "git.sr.ht/~runxiyu/furgit/internal/adler32" +) + +const benchmarkSize = 64 * 1024 + +var data = make([]byte, benchmarkSize) + +func init() { + for i := 0; i < benchmarkSize; i++ { + data[i] = byte(i % 256) + } +} + +func BenchmarkChecksum(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + adler32.Checksum(data) + } +} |
