1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
|
//go:build !purego && amd64
#include "textflag.h"
// adler32AVX2ByteWeights holds the per-byte multipliers for one 32-byte block.
// adler32_avx2 loads it as a single 32-byte vector and uses it as the signed
// operand of VPMADDUBSW. Each 8-byte DATA word is stored little-endian (this
// file is amd64-only), so the bytes in memory run 0x20, 0x1f, ..., 0x02, 0x01:
// the byte at offset i carries weight 32-i.
DATA adler32AVX2ByteWeights<>+0x00(SB)/8, $0x191a1b1c1d1e1f20
DATA adler32AVX2ByteWeights<>+0x08(SB)/8, $0x1112131415161718
DATA adler32AVX2ByteWeights<>+0x10(SB)/8, $0x090a0b0c0d0e0f10
DATA adler32AVX2ByteWeights<>+0x18(SB)/8, $0x0102030405060708
GLOBL adler32AVX2ByteWeights<>(SB), (RODATA|NOPTR), $32
// adler32AVX2WordOne is a single 16-bit 1. adler32_avx2 broadcasts it to every
// word lane (VPBROADCASTW) and uses it as the VPMADDWD multiplier, which turns
// that instruction into a plain "add adjacent 16-bit pairs into 32 bits".
DATA adler32AVX2WordOne<>+0x00(SB)/2, $0x0001
GLOBL adler32AVX2WordOne<>(SB), (RODATA|NOPTR), $2
// Modulus of both running sums (65521).
#define ADLER_BASE 0xfff1
// Upper bound on the bytes folded into the vector accumulators before they
// are reduced modulo ADLER_BASE (5552). Presumably zlib's NMAX, chosen so the
// 32-bit lanes cannot overflow — TODO confirm against the reference C source.
#define ADLER_NMAX 0x15b0
// Rounds a chunk length (<= ADLER_NMAX) down to a multiple of 32.
#define ADLER_CHUNK_MASK 0x1fe0
// Division by ADLER_BASE without DIV: for any unsigned 32-bit x,
// (x * ADLER_DIV_MAGIC) >> ADLER_DIV_SHIFT == x / 65521.
// ADLER_DIV_MAGIC is ceil(2^47 / 65521).
#define ADLER_DIV_MAGIC 2147975281
#define ADLER_DIV_SHIFT 0x2f

// adler32_avx2 folds the bytes of buf into a running Adler-32 checksum.
//
// Go prototype implied by the frame layout ($0-36) — confirm against the Go
// declaration, which is not part of this file:
//
//	func adler32_avx2(in uint32, buf []byte) uint32
//
// Arguments
//	in+0(FP)        checksum so far: bits 0-15 = sum A, bits 16-31 = sum B
//	buf_base+8(FP)  pointer to the data
//	buf_len+16(FP)  number of bytes
//	buf_cap+24(FP)  not read
// Result
//	ret+32(FP)      updated checksum, B<<16 | A, both reduced mod ADLER_BASE
//
// Special cases
//	buf_base == nil  returns 1 and ignores in
//	buf_len == 0     returns in unchanged
// NOTE(review): a nil Go slice therefore resets a running checksum to 1,
// whereas an empty non-nil slice leaves it alone. This matches the C zlib
// convention for a NULL buffer; confirm the Go callers expect it.
//
// The routine executes AVX2 instructions whenever buf_len >= 32; it performs
// no CPU feature check itself, so the caller must guarantee AVX2 support.
//
// Shape of the computation
//	1. While >= 32 bytes remain: take a chunk of min(remaining, ADLER_NMAX)
//	   rounded down to a multiple of 32, accumulate it 32 (or 2x32) bytes at
//	   a time in YMM registers, then reduce A and B modulo ADLER_BASE.
//	2. Fewer than 32 bytes left: one unrolled 16-byte step if possible, then
//	   1-3 single bytes, then groups of 4 bytes, then a final reduction.
//
// Register roles
//	SI  read pointer            DX  bytes remaining
//	AX  sum A (scalar)          CX  sum B (scalar)
//	DI  ADLER_DIV_MAGIC during the vector phase, scratch afterwards
//	R8-R10 scratch
//	Y0  zero                    Y1  byte weights 32..1
//	Y2  16-bit ones             Y3  B accumulator
//	Y4  A accumulator           Y5  sum of A as it stood before each block
//	Y6-Y8 scratch
TEXT ·adler32_avx2(SB), NOSPLIT, $0-36
	MOVLQZX in+0(FP), DI
	MOVQ buf_base+8(FP), SI
	MOVQ buf_len+16(FP), DX
	// buf_cap is deliberately not loaded: nothing below needs it.
	TESTQ SI, SI
	JE return_one
	MOVL DI, AX
	TESTQ DX, DX
	JE return_current
	// Split the incoming checksum: CX = B (high half), AX = A (low half).
	MOVL AX, CX
	SHRL $0x10, CX
	MOVWLZX AX, AX
	// Inputs shorter than one 32-byte block never touch the vector unit.
	CMPQ DX, $0x20
	JB scalar_unrolled16
	// Loop-invariant vector constants.
	MOVL $ADLER_DIV_MAGIC, DI
	VPXOR X0, X0, X0
	// Unaligned load: nothing in this file requests 32-byte alignment for
	// the table, so do not depend on where the linker happens to place it.
	VMOVDQU adler32AVX2ByteWeights<>(SB), Y1
	VPBROADCASTW adler32AVX2WordOne<>(SB), Y2
	JMP vector_outer

	// Reached only when the rounded chunk length is zero. That cannot happen
	// while DX >= 32, which holds on every entry to vector_outer.
vector_tail_init:
	VMOVDQA Y4, Y6
	VPXOR X5, X5, X5

	// End of a chunk. On entry: AX = bytes consumed in this chunk,
	// Y6 = A lanes, Y5 = accumulated "A before each block", Y3 = B lanes.
vector_reduce_finalize_chunk:
	SUBQ AX, DX
	// Every block contributes 32 * (A before that block) to B.
	VPSLLD $0x05, Y5, Y4
	VPADDD Y3, Y4, Y3
	// Horizontal sum of A. VPSADBW writes one result per 64-bit lane, so only
	// dwords 0 and 2 of each 128-bit half carry data.
	VEXTRACTI128 $0x1, Y6, X4
	VSHUFPS $0x88, X4, X6, X5
	VPSHUFD $0x88, X4, X4
	VPADDD X4, X5, X4
	VPSHUFD $0x55, X4, X5
	VPADDD X4, X5, X4
	VMOVD X4, AX
	// AX %= ADLER_BASE, using the multiply-and-shift quotient.
	MOVQ AX, CX
	IMULQ DI, CX
	SHRQ $ADLER_DIV_SHIFT, CX
	IMULL $ADLER_BASE, CX
	SUBL CX, AX
	// Horizontal sum of all eight B lanes.
	VEXTRACTI128 $0x1, Y3, X4
	VPADDD X3, X4, X3
	VPSHUFD $0xee, X3, X4
	VPADDD X4, X3, X3
	VPSHUFD $0x55, X3, X4
	VPADDD X3, X4, X3
	VMOVD X3, CX
	// CX %= ADLER_BASE.
	MOVQ CX, R8
	IMULQ DI, R8
	SHRQ $ADLER_DIV_SHIFT, R8
	IMULL $ADLER_BASE, R8
	SUBL R8, CX
	// Fewer than 32 bytes left: finish in scalar code.
	CMPQ DX, $0x1f
	JBE scalar_entry

	// Start of a chunk. Invariant: DX >= 32, AX and CX hold the current sums.
vector_outer:
	// Seed lane 0 of the vector accumulators; VMOVD clears the other lanes.
	VMOVD AX, X4
	VMOVD CX, X3
	// R8 = min(DX, ADLER_NMAX). MOVL leaves the flags from CMPQ intact.
	CMPQ DX, $ADLER_NMAX
	MOVL $ADLER_NMAX, R8
	CMOVQCS DX, R8
	// AX = chunk length rounded down to a multiple of 32.
	MOVL R8, AX
	ANDL $ADLER_CHUNK_MASK, AX
	JE vector_tail_init
	ADDQ $-0x20, R8
	VPXOR X5, X5, X5
	// Bit 5 of (length - 32) is set exactly when the chunk holds an even
	// number of 32-byte blocks; then the two-block loop can start at once.
	TESTL $0x20, R8
	JNE vector_block32_check
	// Odd block count: consume a single 32-byte block first.
	VMOVDQU 0(SI), Y5
	ADDQ $0x20, SI
	LEAQ -0x20(AX), CX
	// A += sum of the 32 bytes.
	VPSADBW Y0, Y5, Y6
	VPADDD Y4, Y6, Y6
	// B += sum of byte[i] * (32 - i).
	VPMADDUBSW Y1, Y5, Y5
	VPMADDWD Y2, Y5, Y5
	VPADDD Y3, Y5, Y3
	// Remember A as it was before this block, then commit the new A.
	VMOVDQA Y4, Y5
	VMOVDQA Y6, Y4
	CMPQ R8, $0x20
	JAE vector_block64_loop
	JMP vector_reduce_finalize_chunk

vector_block32_check:
	// CX counts down the bytes left for the two-block loop.
	MOVQ AX, CX
	CMPQ R8, $0x20
	JB vector_reduce_finalize_chunk

	// Two 32-byte blocks per iteration; CX is a positive multiple of 64.
vector_block64_loop:
	VMOVDQU 0(SI), Y6
	VMOVDQU 0x20(SI), Y7
	// First block: Y8 = A after it, Y5 += A before it.
	VPSADBW Y0, Y6, Y8
	VPADDD Y4, Y8, Y8
	VPADDD Y4, Y5, Y5
	VPMADDUBSW Y1, Y6, Y4
	VPMADDWD Y2, Y4, Y4
	VPADDD Y3, Y4, Y3
	ADDQ $0x40, SI
	// Second block: Y4 = A after it, Y5 += A before it (= Y8).
	VPSADBW Y0, Y7, Y4
	VPADDD Y4, Y8, Y4
	VPADDD Y5, Y8, Y5
	VPMADDUBSW Y1, Y7, Y6
	VPMADDWD Y2, Y6, Y6
	VPADDD Y3, Y6, Y3
	ADDQ $-0x40, CX
	JNE vector_block64_loop
	// The chunk epilogue expects the A lanes in Y6.
	VMOVDQA Y4, Y6
	JMP vector_reduce_finalize_chunk

	// Early exits, taken before any vector register has been written.
return_one:
	MOVL $0x1, AX
return_current:
	MOVL AX, ret+32(FP)
	RET

	// Scalar tail after the vector phase: AX and CX are already reduced.
scalar_entry:
	TESTQ DX, DX
	JE return_final

	// Also the entry point for inputs shorter than 32 bytes. DX is in 1..31.
scalar_unrolled16:
	CMPQ DX, $0x10
	JB scalar_byte_prelude
	// Sixteen bytes, fully unrolled. The running A alternates between AX and
	// DI (R8 for byte 14) to avoid register moves; B accumulates in CX.
	MOVBLZX 0(SI), DI
	ADDL DI, AX
	ADDL AX, CX
	MOVBLZX 0x1(SI), DI
	ADDL AX, DI
	ADDL DI, CX
	MOVBLZX 0x2(SI), AX
	ADDL DI, AX
	ADDL AX, CX
	MOVBLZX 0x3(SI), DI
	ADDL AX, DI
	ADDL DI, CX
	MOVBLZX 0x4(SI), AX
	ADDL DI, AX
	ADDL AX, CX
	MOVBLZX 0x5(SI), DI
	ADDL AX, DI
	ADDL DI, CX
	MOVBLZX 0x6(SI), AX
	ADDL DI, AX
	ADDL AX, CX
	MOVBLZX 0x7(SI), DI
	ADDL AX, DI
	ADDL DI, CX
	MOVBLZX 0x8(SI), AX
	ADDL DI, AX
	ADDL AX, CX
	MOVBLZX 0x9(SI), DI
	ADDL AX, DI
	ADDL DI, CX
	MOVBLZX 0xa(SI), AX
	ADDL DI, AX
	ADDL AX, CX
	MOVBLZX 0xb(SI), DI
	ADDL AX, DI
	ADDL DI, CX
	MOVBLZX 0xc(SI), AX
	ADDL DI, AX
	ADDL AX, CX
	MOVBLZX 0xd(SI), DI
	ADDL AX, DI
	ADDL DI, CX
	MOVBLZX 0xe(SI), R8
	ADDL DI, R8
	ADDL R8, CX
	MOVBLZX 0xf(SI), AX
	ADDL R8, AX
	ADDL AX, CX
	ADDQ $-0x10, DX
	JE scalar_finalize
	ADDQ $0x10, SI

	// 1..15 bytes remain. DI = count - 1 is kept for the "at least 4 bytes?"
	// test below; R9 = count mod 4 bytes are handled one at a time first.
scalar_byte_prelude:
	LEAQ -0x1(DX), DI
	MOVQ DX, R9
	ANDQ $0x3, R9
	JE scalar_dword_prelude
	XORL R8, R8
scalar_byte_prelude_loop:
	MOVBLZX 0(SI)(R8*1), R10
	ADDL R10, AX
	ADDL AX, CX
	INCQ R8
	CMPQ R9, R8
	JNE scalar_byte_prelude_loop
	ADDQ R8, SI
	SUBQ R8, DX

scalar_dword_prelude:
	// The count on entry to the prelude was below 4: nothing is left.
	CMPQ DI, $0x3
	JB scalar_finalize
	XORL DI, DI
	// DX is now a non-zero multiple of 4; DI is the byte offset.
scalar_dword_loop:
	MOVBLZX 0(SI)(DI*1), R8
	ADDL AX, R8
	ADDL R8, CX
	MOVBLZX 0x1(SI)(DI*1), AX
	ADDL R8, AX
	ADDL AX, CX
	MOVBLZX 0x2(SI)(DI*1), R8
	ADDL AX, R8
	ADDL R8, CX
	MOVBLZX 0x3(SI)(DI*1), AX
	ADDL R8, AX
	ADDL AX, CX
	ADDQ $0x4, DI
	CMPQ DX, DI
	JNE scalar_dword_loop

scalar_finalize:
	// A grew by at most 31 bytes since it was last below 2^16, so a single
	// conditional subtraction brings it below ADLER_BASE.
	LEAL -ADLER_BASE(AX), DX
	CMPL AX, $ADLER_BASE
	CMOVLCS AX, DX
	// B can be much larger: full reduction by multiply-and-shift.
	MOVL CX, AX
	MOVL $ADLER_DIV_MAGIC, SI
	IMULQ AX, SI
	SHRQ $ADLER_DIV_SHIFT, SI
	MOVL SI, AX
	IMULL $ADLER_BASE, AX
	SUBL AX, CX
	// Pack B<<16 | A.
	SHLL $0x10, CX
	ORL DX, CX
	MOVL CX, AX
	// Clear the upper YMM halves before returning to non-AVX code. This is
	// harmless on the path that never used the vector unit.
	VZEROUPPER
	MOVL AX, ret+32(FP)
	RET

	// The vector phase consumed everything; AX and CX are already reduced.
return_final:
	SHLL $0x10, CX
	ORL CX, AX
	VZEROUPPER
	MOVL AX, ret+32(FP)
	RET
|