diff options
| author | 2026-01-06 22:31:52 +0800 | |
|---|---|---|
| committer | 2026-01-06 22:31:52 +0800 | |
| commit | c85eeaccb831afddedac7bd5f1ba163ebac79fc1 (patch) | |
| tree | 5898c493825bb4c4b35cffb965c84eba140a7cfb /internal | |
| parent | adler32: Drop SSE3 support (diff) | |
| signature | No signature | |
Revert "adler32: Drop SSE3 support"
This reverts commit ba327e76c5b110044ec8ebb75630e79506dfbed8 and re-adds
SSE3 support at the request of Noisytoot.
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/adler32/adler32_amd64.go | 16 | ||||
| -rw-r--r-- | internal/adler32/adler32_sse3.go | 6 | ||||
| -rw-r--r-- | internal/adler32/adler32_sse3.s | 214 |
3 files changed, 232 insertions, 4 deletions
diff --git a/internal/adler32/adler32_amd64.go b/internal/adler32/adler32_amd64.go index 3fbcb4d8..88a854ed 100644 --- a/internal/adler32/adler32_amd64.go +++ b/internal/adler32/adler32_amd64.go @@ -15,6 +15,7 @@ import ( const Size = 4 var ( + hasSSE3 = cpu.X86.HasSSSE3 /* adler32_sse3 executes PMADDUBSW (66 0F 38 04), an SSSE3 instruction, so SSE3 alone is not a sufficient gate */ hasAVX2 = cpu.X86.HasAVX2 ) @@ -26,7 +27,7 @@ func (d *digest) Reset() { *d = 1 } // New returns a new hash.Hash32 computing the Adler-32 checksum. func New() hash.Hash32 { - if !hasAVX2 { + if !hasSSE3 { return adler32.New() } d := new(digest) @@ -59,7 +60,11 @@ func (d *digest) BlockSize() int { return 4 } func (d *digest) Write(data []byte) (nn int, err error) { if len(data) >= 64 { var h uint32 - h = adler32_avx2(uint32(*d), data) + if hasAVX2 { + h = adler32_avx2(uint32(*d), data) + } else { + h = adler32_sse3(uint32(*d), data) + } *d = digest(h) } else { h := update(uint32(*d), data) @@ -77,9 +82,12 @@ func (d *digest) Sum(in []byte) []byte { // Checksum returns the Adler-32 checksum of data. func Checksum(data []byte) uint32 { - if !hasAVX2 || len(data) < 64 { + if !hasSSE3 || len(data) < 64 { return update(1, data) } - return adler32_avx2(1, data) + if hasAVX2 { + return adler32_avx2(1, data) + } + return adler32_sse3(1, data) } diff --git a/internal/adler32/adler32_sse3.go b/internal/adler32/adler32_sse3.go new file mode 100644 index 00000000..8e8c8a9b --- /dev/null +++ b/internal/adler32/adler32_sse3.go @@ -0,0 +1,6 @@ +//go:build !purego && amd64 + +package adler32 + +//go:noescape +func adler32_sse3(in uint32, buf []byte) uint32 diff --git a/internal/adler32/adler32_sse3.s b/internal/adler32/adler32_sse3.s new file mode 100644 index 00000000..5880bab8 --- /dev/null +++ b/internal/adler32/adler32_sse3.s @@ -0,0 +1,214 @@ +//go:build !purego && amd64 + +#include "textflag.h" + +DATA weights_17_32<>+0x00(SB)/8, $0x191a1b1c1d1e1f20 +DATA weights_17_32<>+0x08(SB)/8, $0x1112131415161718 +GLOBL weights_17_32<>(SB), (RODATA|NOPTR), $16 + +DATA ones_u16<>+0x00(SB)/8, $0x0001000100010001 +DATA 
ones_u16<>+0x08(SB)/8, $0x0001000100010001 +GLOBL ones_u16<>(SB), (RODATA|NOPTR), $16 + +DATA weights_1_16<>+0x00(SB)/8, $0x090a0b0c0d0e0f10 +DATA weights_1_16<>+0x08(SB)/8, $0x0102030405060708 +GLOBL weights_1_16<>(SB), (RODATA|NOPTR), $16 + +TEXT ·adler32_sse3(SB), NOSPLIT, $0-36 + MOVLQZX in+0(FP), DI + MOVQ buf_base+8(FP), SI + MOVQ buf_len+16(FP), DX + MOVQ buf_cap+24(FP), CX + NOP + NOP + NOP + WORD $0xf889 + LONG $0xc8b70f44 + WORD $0xe8c1; BYTE $0x10 + WORD $0xd189 + WORD $0xe183; BYTE $0x1f + CMPQ DX, $0x20 + JAE block_loop_setup + WORD $0x8944; BYTE $0xcf + JMP tail_entry + +block_loop_setup: + SHRQ $0x5, DX + LONG $0xc0ef0f66 + MOVO weights_17_32<>(SB), X1 + MOVO ones_u16<>(SB), X2 + MOVO weights_1_16<>(SB), X3 + LONG $0x8071b841; WORD $0x8007 + +block_outer_loop: + CMPQ DX, $0xad + LONG $0x00adba41; WORD $0x0000 + LONG $0xd2420f4c + WORD $0x8944; BYTE $0xcf + LONG $0xfaaf0f41 + LONG $0xef6e0f66 + LONG $0xe06e0f66 + WORD $0x8944; BYTE $0xd0 + LONG $0xf6ef0f66 + +block_inner_loop: + LONG $0x3e6f0ff3 + LONG $0x6f0f4466; BYTE $0xc7 + LONG $0x04380f66; BYTE $0xf9 + LONG $0xfaf50f66 + LONG $0xfcfe0f66 + LONG $0x666f0ff3; BYTE $0x10 + LONG $0xeefe0f66 + LONG $0xf60f4466; BYTE $0xc0 + LONG $0xfe0f4466; BYTE $0xc6 + LONG $0xf46f0f66 + LONG $0xf0f60f66 + LONG $0xfe0f4166; BYTE $0xf0 + LONG $0x04380f66; BYTE $0xe3 + LONG $0xe2f50f66 + LONG $0xe7fe0f66 + ADDQ $0x20, SI + WORD $0xc8ff + JNE block_inner_loop + LONG $0xf5720f66; BYTE $0x05 + LONG $0xe5fe0f66 + LONG $0xee700f66; BYTE $0xb1 + LONG $0xeefe0f66 + LONG $0xf5700f66; BYTE $0xee + LONG $0xf5fe0f66 + LONG $0xf77e0f66 + WORD $0x0144; BYTE $0xcf + LONG $0xec700f66; BYTE $0xb1 + LONG $0xecfe0f66 + LONG $0xe5700f66; BYTE $0xee + LONG $0xe5fe0f66 + LONG $0xe07e0f66 + MOVQ DI, R9 + IMULQ R8, R9 + SHRQ $0x2f, R9 + LONG $0xf1c96945; WORD $0x00ff; BYTE $0x00 + WORD $0x2944; BYTE $0xcf + MOVQ AX, R9 + IMULQ R8, R9 + SHRQ $0x2f, R9 + LONG $0xf1c96945; WORD $0x00ff; BYTE $0x00 + WORD $0x2944; BYTE $0xc8 + WORD $0x8941; 
BYTE $0xf9 + SUBQ R10, DX + JNE block_outer_loop + +tail_entry: + WORD $0x8548; BYTE $0xc9 + JE return_result + CMPL CX, $0x10 + JB tail_bytes_setup + WORD $0xb60f; BYTE $0x16 + WORD $0xd701 + WORD $0xf801 + LONG $0x0156b60f + WORD $0xfa01 + WORD $0xd001 + LONG $0x027eb60f + WORD $0xd701 + WORD $0xf801 + LONG $0x0356b60f + WORD $0xfa01 + WORD $0xd001 + LONG $0x047eb60f + WORD $0xd701 + WORD $0xf801 + LONG $0x0556b60f + WORD $0xfa01 + WORD $0xd001 + LONG $0x067eb60f + WORD $0xd701 + WORD $0xf801 + LONG $0x0756b60f + WORD $0xfa01 + WORD $0xd001 + LONG $0x087eb60f + WORD $0xd701 + WORD $0xf801 + LONG $0x0956b60f + WORD $0xfa01 + WORD $0xd001 + LONG $0x0a7eb60f + WORD $0xd701 + WORD $0xf801 + LONG $0x0b56b60f + WORD $0xfa01 + WORD $0xd001 + LONG $0x0c7eb60f + WORD $0xd701 + WORD $0xf801 + LONG $0x0d56b60f + WORD $0xfa01 + WORD $0xd001 + LONG $0x46b60f44; BYTE $0x0e + WORD $0x0141; BYTE $0xd0 + WORD $0x0144; BYTE $0xc0 + LONG $0x0f7eb60f + WORD $0x0144; BYTE $0xc7 + WORD $0xf801 + ADDQ $-0x10, CX + JE final_reduce + ADDQ $0x10, SI + +tail_bytes_setup: + LEAQ -0x1(CX), DX + MOVQ CX, R9 + ANDQ $0x3, R9 + JE tail_dword_setup + XORL R8, R8 + +tail_byte_loop: + LONG $0x14b60f46; BYTE $0x06 + WORD $0x0144; BYTE $0xd7 + WORD $0xf801 + INCQ R8 + CMPQ R9, R8 + JNE tail_byte_loop + ADDQ R8, SI + SUBQ R8, CX + +tail_dword_setup: + CMPQ DX, $0x3 + JB final_reduce + XORL DX, DX + +tail_dword_loop: + LONG $0x04b60f44; BYTE $0x16 + WORD $0x0141; BYTE $0xf8 + WORD $0x0144; BYTE $0xc0 + LONG $0x167cb60f; BYTE $0x01 + WORD $0x0144; BYTE $0xc7 + WORD $0xf801 + LONG $0x44b60f44; WORD $0x0216 + WORD $0x0141; BYTE $0xf8 + WORD $0x0144; BYTE $0xc0 + LONG $0x167cb60f; BYTE $0x03 + WORD $0x0144; BYTE $0xc7 + WORD $0xf801 + ADDQ $0x4, DX + CMPQ CX, DX + JNE tail_dword_loop + +final_reduce: + LONG $0x000f8f8d; WORD $0xffff + CMPL DI, $0xfff1 + WORD $0x420f; BYTE $0xcf + WORD $0xc289 + LONG $0x078071be; BYTE $0x80 + IMULQ DX, SI + SHRQ $0x2f, SI + LONG $0xfff1d669; WORD $0x0000 + WORD $0xd029 + 
WORD $0xcf89 + +return_result: + WORD $0xe0c1; BYTE $0x10 + WORD $0xf809 + NOP + NOP + MOVL AX, ret+32(FP) + RET |
