aboutsummaryrefslogtreecommitdiff
path: root/internal/adler32
diff options
context:
space:
mode:
authorGravatar Runxi Yu2025-11-19 08:00:00 +0800
committerGravatar Runxi Yu2025-11-19 08:00:00 +0800
commit676f08cc4b0deb397033598ac96c0d0272fcae05 (patch)
treea174be9361b43e9d2b031f70c9df15401f0adf00 /internal/adler32
parentWell, we use a dependency (golang.org/x/sys/cpu) now... (diff)
signatureNo signature
SIMD with AVX2 on supported AMD64 machines
Some help from gpt-5.1-thinking taken: used wrong register size for the weighted sum at first, so it was truncating the second half of our block; also there was an overflow from the modulus and stuff. Unfortunately the AVX2 adler32 is only about 20% faster than the generic version which doesn't make for much.
Diffstat (limited to 'internal/adler32')
-rw-r--r--internal/adler32/adler32.go92
-rw-r--r--internal/adler32/adler32_amd64.go130
-rw-r--r--internal/adler32/adler32_amd64_avx2.s173
-rw-r--r--internal/adler32/adler32_generic.go96
-rw-r--r--internal/adler32/bench_test.go24
5 files changed, 423 insertions, 92 deletions
diff --git a/internal/adler32/adler32.go b/internal/adler32/adler32.go
index f8a289f5..c349cd5d 100644
--- a/internal/adler32/adler32.go
+++ b/internal/adler32/adler32.go
@@ -84,98 +84,6 @@ func (d *digest) Clone() (hash.Cloner, error) {
return &r, nil
}
-// Add p to the running checksum d.
-func update(d digest, p []byte) digest {
- s1, s2 := uint32(d&0xffff), uint32(d>>16)
-
- for len(p) > 0 {
- var q []byte
- if len(p) > nmax {
- p, q = p[:nmax], p[nmax:]
- }
-
- for len(p) >= 32 {
- v := p[:32]
- p = p[32:]
-
- s1 += uint32(v[0])
- s2 += s1
- s1 += uint32(v[1])
- s2 += s1
- s1 += uint32(v[2])
- s2 += s1
- s1 += uint32(v[3])
- s2 += s1
- s1 += uint32(v[4])
- s2 += s1
- s1 += uint32(v[5])
- s2 += s1
- s1 += uint32(v[6])
- s2 += s1
- s1 += uint32(v[7])
- s2 += s1
- s1 += uint32(v[8])
- s2 += s1
- s1 += uint32(v[9])
- s2 += s1
- s1 += uint32(v[10])
- s2 += s1
- s1 += uint32(v[11])
- s2 += s1
- s1 += uint32(v[12])
- s2 += s1
- s1 += uint32(v[13])
- s2 += s1
- s1 += uint32(v[14])
- s2 += s1
- s1 += uint32(v[15])
- s2 += s1
- s1 += uint32(v[16])
- s2 += s1
- s1 += uint32(v[17])
- s2 += s1
- s1 += uint32(v[18])
- s2 += s1
- s1 += uint32(v[19])
- s2 += s1
- s1 += uint32(v[20])
- s2 += s1
- s1 += uint32(v[21])
- s2 += s1
- s1 += uint32(v[22])
- s2 += s1
- s1 += uint32(v[23])
- s2 += s1
- s1 += uint32(v[24])
- s2 += s1
- s1 += uint32(v[25])
- s2 += s1
- s1 += uint32(v[26])
- s2 += s1
- s1 += uint32(v[27])
- s2 += s1
- s1 += uint32(v[28])
- s2 += s1
- s1 += uint32(v[29])
- s2 += s1
- s1 += uint32(v[30])
- s2 += s1
- s1 += uint32(v[31])
- s2 += s1
- }
-
- for _, x := range p {
- s1 += uint32(x)
- s2 += s1
- }
-
- s1 %= mod
- s2 %= mod
- p = q
- }
- return digest(s2<<16 | s1)
-}
-
func (d *digest) Write(p []byte) (nn int, err error) {
*d = update(*d, p)
return len(p), nil
diff --git a/internal/adler32/adler32_amd64.go b/internal/adler32/adler32_amd64.go
new file mode 100644
index 00000000..a109ccc9
--- /dev/null
+++ b/internal/adler32/adler32_amd64.go
@@ -0,0 +1,130 @@
+//go:build amd64 && !purego
+
+package adler32
+
+import "golang.org/x/sys/cpu"
+
+var updateFn func(d digest, p []byte) digest = updateGeneric
+
+func init() {
+ switch {
+ case cpu.X86.HasAVX2:
+ updateFn = updateAVX2
+ default:
+ updateFn = updateGeneric
+ }
+}
+
+func update(d digest, p []byte) digest {
+ return updateFn(d, p)
+}
+
+func updateGeneric(d digest, p []byte) digest {
+ s1, s2 := uint32(d&0xffff), uint32(d>>16)
+
+ for len(p) > 0 {
+ var q []byte
+ if len(p) > nmax {
+ p, q = p[:nmax], p[nmax:]
+ }
+
+ for len(p) >= 32 {
+ v := p[:32]
+ p = p[32:]
+
+ s1 += uint32(v[0])
+ s2 += s1
+ s1 += uint32(v[1])
+ s2 += s1
+ s1 += uint32(v[2])
+ s2 += s1
+ s1 += uint32(v[3])
+ s2 += s1
+ s1 += uint32(v[4])
+ s2 += s1
+ s1 += uint32(v[5])
+ s2 += s1
+ s1 += uint32(v[6])
+ s2 += s1
+ s1 += uint32(v[7])
+ s2 += s1
+ s1 += uint32(v[8])
+ s2 += s1
+ s1 += uint32(v[9])
+ s2 += s1
+ s1 += uint32(v[10])
+ s2 += s1
+ s1 += uint32(v[11])
+ s2 += s1
+ s1 += uint32(v[12])
+ s2 += s1
+ s1 += uint32(v[13])
+ s2 += s1
+ s1 += uint32(v[14])
+ s2 += s1
+ s1 += uint32(v[15])
+ s2 += s1
+ s1 += uint32(v[16])
+ s2 += s1
+ s1 += uint32(v[17])
+ s2 += s1
+ s1 += uint32(v[18])
+ s2 += s1
+ s1 += uint32(v[19])
+ s2 += s1
+ s1 += uint32(v[20])
+ s2 += s1
+ s1 += uint32(v[21])
+ s2 += s1
+ s1 += uint32(v[22])
+ s2 += s1
+ s1 += uint32(v[23])
+ s2 += s1
+ s1 += uint32(v[24])
+ s2 += s1
+ s1 += uint32(v[25])
+ s2 += s1
+ s1 += uint32(v[26])
+ s2 += s1
+ s1 += uint32(v[27])
+ s2 += s1
+ s1 += uint32(v[28])
+ s2 += s1
+ s1 += uint32(v[29])
+ s2 += s1
+ s1 += uint32(v[30])
+ s2 += s1
+ s1 += uint32(v[31])
+ s2 += s1
+ }
+
+ for i := 0; i < len(p); i++ {
+ x := p[i]
+ s1 += uint32(x)
+ s2 += s1
+ }
+
+ s1 %= mod
+ s2 %= mod
+ p = q
+ }
+
+ return digest(s2<<16 | s1)
+}
+
+//go:noescape
+func adler32AVX2(state uint32, b []byte) uint32
+
+func updateAVX2(d digest, p []byte) digest {
+ s := uint32(d)
+ for len(p) > 0 {
+ chunk := p
+ if len(chunk) > nmax {
+ chunk = p[:nmax]
+ }
+ s = adler32AVX2(s, chunk)
+
+ p = p[len(chunk):]
+ }
+ return digest(s)
+}
diff --git a/internal/adler32/adler32_amd64_avx2.s b/internal/adler32/adler32_amd64_avx2.s
new file mode 100644
index 00000000..da8402a3
--- /dev/null
+++ b/internal/adler32/adler32_amd64_avx2.s
@@ -0,0 +1,173 @@
+//go:build amd64 && !purego
+
+#include "textflag.h"
+
+// func adler32AVX2(state uint32, b []byte) uint32
+// state = s2<<16 | s1
+// len(b) <= nmax enforced by Go wrapper
+TEXT ·adler32AVX2(SB), NOSPLIT, $0-40
+ // state uint32 at 0
+ // b.ptr *byte at 8
+ // b.len int at 16
+ // b.cap int at 24
+ // ret uint32 at 32
+
+ MOVL state+0(FP), AX
+ MOVQ b_base+8(FP), SI
+ MOVQ b_len+16(FP), CX
+
+ // s1 in R10d, s2 in R11d
+ MOVL AX, R10
+ ANDL $0xffff, R10 // s1 = low 16 bits
+ SHRL $16, AX
+ MOVL AX, R11 // s2 = high 16 bits
+
+ // Just use the scalar tail if len < 32
+ CMPQ CX, $32
+ JLT tail
+
+loop32:
+ VMOVDQU (SI), X0
+ VMOVDQU 16(SI), X1
+ ADDQ $32, SI
+ SUBQ $32, CX
+
+ // Split 32 bytes into uint16 lanes (four groups of eight bytes)
+ VPMOVZXBW X0, Y2
+ VPMOVZXBW X1, Y3
+ VEXTRACTI128 $0, Y2, X4
+ VEXTRACTI128 $1, Y2, X5
+ VEXTRACTI128 $0, Y3, X6
+ VEXTRACTI128 $1, Y3, X7
+
+ // Process each group using prefix sums (only adds, no multiplies)
+ // Group 0 (bytes 0..7)
+ VMOVDQA X4, X8
+ VPSLLDQ $2, X8, X9
+ VPADDW X9, X8, X8
+ VPSLLDQ $4, X8, X9
+ VPADDW X9, X8, X8
+ VPSLLDQ $8, X8, X9
+ VPADDW X9, X8, X8
+ VPEXTRW $7, X8, R8
+ VPSRLDQ $8, X8, X9
+ VPADDW X9, X8, X8
+ VPSRLDQ $4, X8, X9
+ VPADDW X9, X8, X8
+ VPSRLDQ $2, X8, X9
+ VPADDW X9, X8, X8
+ VPEXTRW $0, X8, R9
+ MOVL R10, R12
+ SHLL $3, R12
+ ADDL R12, R11
+ ADDL R9, R11
+ ADDL R8, R10
+
+ // Group 1 (bytes 8..15)
+ VMOVDQA X5, X8
+ VPSLLDQ $2, X8, X9
+ VPADDW X9, X8, X8
+ VPSLLDQ $4, X8, X9
+ VPADDW X9, X8, X8
+ VPSLLDQ $8, X8, X9
+ VPADDW X9, X8, X8
+ VPEXTRW $7, X8, R8
+ VPSRLDQ $8, X8, X9
+ VPADDW X9, X8, X8
+ VPSRLDQ $4, X8, X9
+ VPADDW X9, X8, X8
+ VPSRLDQ $2, X8, X9
+ VPADDW X9, X8, X8
+ VPEXTRW $0, X8, R9
+ MOVL R10, R12
+ SHLL $3, R12
+ ADDL R12, R11
+ ADDL R9, R11
+ ADDL R8, R10
+
+ // Group 2 (bytes 16..23)
+ VMOVDQA X6, X8
+ VPSLLDQ $2, X8, X9
+ VPADDW X9, X8, X8
+ VPSLLDQ $4, X8, X9
+ VPADDW X9, X8, X8
+ VPSLLDQ $8, X8, X9
+ VPADDW X9, X8, X8
+ VPEXTRW $7, X8, R8
+ VPSRLDQ $8, X8, X9
+ VPADDW X9, X8, X8
+ VPSRLDQ $4, X8, X9
+ VPADDW X9, X8, X8
+ VPSRLDQ $2, X8, X9
+ VPADDW X9, X8, X8
+ VPEXTRW $0, X8, R9
+ MOVL R10, R12
+ SHLL $3, R12
+ ADDL R12, R11
+ ADDL R9, R11
+ ADDL R8, R10
+
+ // Group 3 (bytes 24..31)
+ VMOVDQA X7, X8
+ VPSLLDQ $2, X8, X9
+ VPADDW X9, X8, X8
+ VPSLLDQ $4, X8, X9
+ VPADDW X9, X8, X8
+ VPSLLDQ $8, X8, X9
+ VPADDW X9, X8, X8
+ VPEXTRW $7, X8, R8
+ VPSRLDQ $8, X8, X9
+ VPADDW X9, X8, X8
+ VPSRLDQ $4, X8, X9
+ VPADDW X9, X8, X8
+ VPSRLDQ $2, X8, X9
+ VPADDW X9, X8, X8
+ VPEXTRW $0, X8, R9
+ MOVL R10, R12
+ SHLL $3, R12
+ ADDL R12, R11
+ ADDL R9, R11
+ ADDL R8, R10
+
+ CMPQ CX, $32
+ JGE loop32
+
+tail:
+ TESTQ CX, CX
+ JEQ done
+
+tail_loop:
+ MOVBLZX (SI), R8
+ INCQ SI
+ DECQ CX
+
+ ADDL R8, R10
+ ADDL R10, R11
+
+ TESTQ CX, CX
+ JNE tail_loop
+
+done:
+ MOVL $65521, R8
+
+ // Reduce s1 %= mod
+ MOVL R10, AX
+ XORL DX, DX
+ DIVL R8
+ MOVL DX, R10
+
+ // Reduce s2 %= mod
+ MOVL R11, AX
+ XORL DX, DX
+ DIVL R8
+ MOVL DX, R11
+
+ MOVL R11, AX
+ SHLL $16, AX
+ ANDL $0xffff, R10
+ ORL R10, AX
+
+ VZEROUPPER
+
+ MOVL AX, ret+32(FP)
+ RET
diff --git a/internal/adler32/adler32_generic.go b/internal/adler32/adler32_generic.go
new file mode 100644
index 00000000..8ba330a5
--- /dev/null
+++ b/internal/adler32/adler32_generic.go
@@ -0,0 +1,96 @@
+//go:build !amd64 || purego
+
+package adler32
+
+func update(d digest, p []byte) digest {
+ s1, s2 := uint32(d&0xffff), uint32(d>>16)
+
+ for len(p) > 0 {
+ var q []byte
+ if len(p) > nmax {
+ p, q = p[:nmax], p[nmax:]
+ }
+
+ for len(p) >= 32 {
+ v := p[:32]
+ p = p[32:]
+
+ s1 += uint32(v[0])
+ s2 += s1
+ s1 += uint32(v[1])
+ s2 += s1
+ s1 += uint32(v[2])
+ s2 += s1
+ s1 += uint32(v[3])
+ s2 += s1
+ s1 += uint32(v[4])
+ s2 += s1
+ s1 += uint32(v[5])
+ s2 += s1
+ s1 += uint32(v[6])
+ s2 += s1
+ s1 += uint32(v[7])
+ s2 += s1
+ s1 += uint32(v[8])
+ s2 += s1
+ s1 += uint32(v[9])
+ s2 += s1
+ s1 += uint32(v[10])
+ s2 += s1
+ s1 += uint32(v[11])
+ s2 += s1
+ s1 += uint32(v[12])
+ s2 += s1
+ s1 += uint32(v[13])
+ s2 += s1
+ s1 += uint32(v[14])
+ s2 += s1
+ s1 += uint32(v[15])
+ s2 += s1
+ s1 += uint32(v[16])
+ s2 += s1
+ s1 += uint32(v[17])
+ s2 += s1
+ s1 += uint32(v[18])
+ s2 += s1
+ s1 += uint32(v[19])
+ s2 += s1
+ s1 += uint32(v[20])
+ s2 += s1
+ s1 += uint32(v[21])
+ s2 += s1
+ s1 += uint32(v[22])
+ s2 += s1
+ s1 += uint32(v[23])
+ s2 += s1
+ s1 += uint32(v[24])
+ s2 += s1
+ s1 += uint32(v[25])
+ s2 += s1
+ s1 += uint32(v[26])
+ s2 += s1
+ s1 += uint32(v[27])
+ s2 += s1
+ s1 += uint32(v[28])
+ s2 += s1
+ s1 += uint32(v[29])
+ s2 += s1
+ s1 += uint32(v[30])
+ s2 += s1
+ s1 += uint32(v[31])
+ s2 += s1
+ }
+
+ for i := 0; i < len(p); i++ {
+ x := p[i]
+ s1 += uint32(x)
+ s2 += s1
+ }
+
+ s1 %= mod
+ s2 %= mod
+ p = q
+ }
+
+ return digest(s2<<16 | s1)
+}
diff --git a/internal/adler32/bench_test.go b/internal/adler32/bench_test.go
new file mode 100644
index 00000000..fd1e4710
--- /dev/null
+++ b/internal/adler32/bench_test.go
@@ -0,0 +1,24 @@
+package adler32_test
+
+import (
+ "testing"
+
+ "git.sr.ht/~runxiyu/furgit/internal/adler32"
+)
+
+const benchmarkSize = 64 * 1024
+
+var data = make([]byte, benchmarkSize)
+
+func init() {
+ for i := 0; i < benchmarkSize; i++ {
+ data[i] = byte(i % 256)
+ }
+}
+
+func BenchmarkChecksum(b *testing.B) {
+ b.ReportAllocs()
+ for i := 0; i < b.N; i++ {
+ adler32.Checksum(data)
+ }
+}