|
66 |
.align 16 |
66 |
.align 16 |
67 |
.Lmul_enter: |
67 |
.Lmul_enter: |
68 |
mov ${num}d,${num}d |
68 |
mov ${num}d,${num}d |
69 |
mov `($win64?56:8)`(%rsp),%r10d # load 7th argument |
69 |
movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument |
|
|
70 |
lea .Linc(%rip),%r10 |
70 |
push %rbx |
71 |
push %rbx |
71 |
push %rbp |
72 |
push %rbp |
72 |
push %r12 |
73 |
push %r12 |
73 |
push %r13 |
74 |
push %r13 |
74 |
push %r14 |
75 |
push %r14 |
75 |
push %r15 |
76 |
push %r15 |
76 |
___ |
77 |
|
77 |
$code.=<<___ if ($win64); |
|
|
78 |
lea -0x28(%rsp),%rsp |
79 |
movaps %xmm6,(%rsp) |
80 |
movaps %xmm7,0x10(%rsp) |
81 |
.Lmul_alloca: |
78 |
.Lmul_alloca: |
82 |
___ |
|
|
83 |
$code.=<<___; |
84 |
mov %rsp,%rax |
79 |
mov %rsp,%rax |
85 |
lea 2($num),%r11 |
80 |
lea 2($num),%r11 |
86 |
neg %r11 |
81 |
neg %r11 |
87 |
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) |
82 |
lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8) |
88 |
and \$-1024,%rsp # minimize TLB usage |
83 |
and \$-1024,%rsp # minimize TLB usage |
89 |
|
84 |
|
90 |
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp |
85 |
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp |
91 |
.Lmul_body: |
86 |
.Lmul_body: |
92 |
mov $bp,%r12 # reassign $bp |
87 |
lea 128($bp),%r12 # reassign $bp (+size optimization) |
93 |
___ |
88 |
___ |
94 |
$bp="%r12"; |
89 |
$bp="%r12"; |
95 |
$STRIDE=2**5*8; # 5 is "window size" |
90 |
$STRIDE=2**5*8; # 5 is "window size" |
96 |
$N=$STRIDE/4; # should match cache line size |
91 |
$N=$STRIDE/4; # should match cache line size |
97 |
$code.=<<___; |
92 |
$code.=<<___; |
98 |
mov %r10,%r11 |
93 |
movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 |
99 |
shr \$`log($N/8)/log(2)`,%r10 |
94 |
movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 |
100 |
and \$`$N/8-1`,%r11 |
95 |
lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) |
101 |
not %r10 |
96 |
and \$-16,%r10 |
102 |
lea .Lmagic_masks(%rip),%rax |
97 |
|
103 |
and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" |
98 |
pshufd \$0,%xmm5,%xmm5 # broadcast index |
104 |
lea 96($bp,%r11,8),$bp # pointer within 1st cache line |
99 |
movdqa %xmm1,%xmm4 |
105 |
movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which |
100 |
movdqa %xmm1,%xmm2 |
106 |
movq 8(%rax,%r10,8),%xmm5 # cache line contains element |
101 |
___ |
107 |
movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument |
102 |
######################################################################## |
108 |
movq 24(%rax,%r10,8),%xmm7 |
103 |
# calculate mask by comparing 0..31 to index and save result to stack |
109 |
|
104 |
# |
110 |
movq `0*$STRIDE/4-96`($bp),%xmm0 |
105 |
$code.=<<___; |
111 |
movq `1*$STRIDE/4-96`($bp),%xmm1 |
106 |
paddd %xmm0,%xmm1 |
112 |
pand %xmm4,%xmm0 |
107 |
pcmpeqd %xmm5,%xmm0 # compare to 1,0 |
113 |
movq `2*$STRIDE/4-96`($bp),%xmm2 |
108 |
.byte 0x67 |
114 |
pand %xmm5,%xmm1 |
109 |
movdqa %xmm4,%xmm3 |
115 |
movq `3*$STRIDE/4-96`($bp),%xmm3 |
110 |
___ |
116 |
pand %xmm6,%xmm2 |
111 |
for($k=0;$k<$STRIDE/16-4;$k+=4) { |
117 |
por %xmm1,%xmm0 |
112 |
$code.=<<___; |
118 |
pand %xmm7,%xmm3 |
113 |
paddd %xmm1,%xmm2 |
|
|
114 |
pcmpeqd %xmm5,%xmm1 # compare to 3,2 |
115 |
movdqa %xmm0,`16*($k+0)+112`(%r10) |
116 |
movdqa %xmm4,%xmm0 |
117 |
|
118 |
paddd %xmm2,%xmm3 |
119 |
pcmpeqd %xmm5,%xmm2 # compare to 5,4 |
120 |
movdqa %xmm1,`16*($k+1)+112`(%r10) |
121 |
movdqa %xmm4,%xmm1 |
122 |
|
123 |
paddd %xmm3,%xmm0 |
124 |
pcmpeqd %xmm5,%xmm3 # compare to 7,6 |
125 |
movdqa %xmm2,`16*($k+2)+112`(%r10) |
126 |
movdqa %xmm4,%xmm2 |
127 |
|
128 |
paddd %xmm0,%xmm1 |
129 |
pcmpeqd %xmm5,%xmm0 |
130 |
movdqa %xmm3,`16*($k+3)+112`(%r10) |
131 |
movdqa %xmm4,%xmm3 |
132 |
___ |
133 |
} |
134 |
$code.=<<___; # last iteration can be optimized |
135 |
paddd %xmm1,%xmm2 |
136 |
pcmpeqd %xmm5,%xmm1 |
137 |
movdqa %xmm0,`16*($k+0)+112`(%r10) |
138 |
|
139 |
paddd %xmm2,%xmm3 |
140 |
.byte 0x67 |
141 |
pcmpeqd %xmm5,%xmm2 |
142 |
movdqa %xmm1,`16*($k+1)+112`(%r10) |
143 |
|
144 |
pcmpeqd %xmm5,%xmm3 |
145 |
movdqa %xmm2,`16*($k+2)+112`(%r10) |
146 |
pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register |
147 |
|
148 |
pand `16*($k+1)-128`($bp),%xmm1 |
149 |
pand `16*($k+2)-128`($bp),%xmm2 |
150 |
movdqa %xmm3,`16*($k+3)+112`(%r10) |
151 |
pand `16*($k+3)-128`($bp),%xmm3 |
152 |
por %xmm2,%xmm0 |
153 |
por %xmm3,%xmm1 |
154 |
___ |
155 |
for($k=0;$k<$STRIDE/16-4;$k+=4) { |
156 |
$code.=<<___; |
157 |
movdqa `16*($k+0)-128`($bp),%xmm4 |
158 |
movdqa `16*($k+1)-128`($bp),%xmm5 |
159 |
movdqa `16*($k+2)-128`($bp),%xmm2 |
160 |
pand `16*($k+0)+112`(%r10),%xmm4 |
161 |
movdqa `16*($k+3)-128`($bp),%xmm3 |
162 |
pand `16*($k+1)+112`(%r10),%xmm5 |
163 |
por %xmm4,%xmm0 |
164 |
pand `16*($k+2)+112`(%r10),%xmm2 |
165 |
por %xmm5,%xmm1 |
166 |
pand `16*($k+3)+112`(%r10),%xmm3 |
119 |
por %xmm2,%xmm0 |
167 |
por %xmm2,%xmm0 |
|
|
168 |
por %xmm3,%xmm1 |
169 |
___ |
170 |
} |
171 |
$code.=<<___; |
172 |
por %xmm1,%xmm0 |
173 |
pshufd \$0x4e,%xmm0,%xmm1 |
174 |
por %xmm1,%xmm0 |
120 |
lea $STRIDE($bp),$bp |
175 |
lea $STRIDE($bp),$bp |
121 |
por %xmm3,%xmm0 |
|
|
122 |
|
123 |
movq %xmm0,$m0 # m0=bp[0] |
176 |
movq %xmm0,$m0 # m0=bp[0] |
124 |
|
177 |
|
125 |
mov ($n0),$n0 # pull n0[0] value |
178 |
mov ($n0),$n0 # pull n0[0] value |
|
128 |
xor $i,$i # i=0 |
181 |
xor $i,$i # i=0 |
129 |
xor $j,$j # j=0 |
182 |
xor $j,$j # j=0 |
130 |
|
183 |
|
131 |
movq `0*$STRIDE/4-96`($bp),%xmm0 |
|
|
132 |
movq `1*$STRIDE/4-96`($bp),%xmm1 |
133 |
pand %xmm4,%xmm0 |
134 |
movq `2*$STRIDE/4-96`($bp),%xmm2 |
135 |
pand %xmm5,%xmm1 |
136 |
|
137 |
mov $n0,$m1 |
184 |
mov $n0,$m1 |
138 |
mulq $m0 # ap[0]*bp[0] |
185 |
mulq $m0 # ap[0]*bp[0] |
139 |
mov %rax,$lo0 |
186 |
mov %rax,$lo0 |
140 |
mov ($np),%rax |
187 |
mov ($np),%rax |
141 |
|
188 |
|
142 |
movq `3*$STRIDE/4-96`($bp),%xmm3 |
|
|
143 |
pand %xmm6,%xmm2 |
144 |
por %xmm1,%xmm0 |
145 |
pand %xmm7,%xmm3 |
146 |
|
147 |
imulq $lo0,$m1 # "tp[0]"*n0 |
189 |
imulq $lo0,$m1 # "tp[0]"*n0 |
148 |
mov %rdx,$hi0 |
190 |
mov %rdx,$hi0 |
149 |
|
191 |
|
150 |
por %xmm2,%xmm0 |
|
|
151 |
lea $STRIDE($bp),$bp |
152 |
por %xmm3,%xmm0 |
153 |
|
154 |
mulq $m1 # np[0]*m1 |
192 |
mulq $m1 # np[0]*m1 |
155 |
add %rax,$lo0 # discarded |
193 |
add %rax,$lo0 # discarded |
156 |
mov 8($ap),%rax |
194 |
mov 8($ap),%rax |
|
183 |
cmp $num,$j |
221 |
cmp $num,$j |
184 |
jne .L1st |
222 |
jne .L1st |
185 |
|
223 |
|
186 |
movq %xmm0,$m0 # bp[1] |
|
|
187 |
|
188 |
add %rax,$hi1 |
224 |
add %rax,$hi1 |
189 |
mov ($ap),%rax # ap[0] |
225 |
mov ($ap),%rax # ap[0] |
190 |
adc \$0,%rdx |
226 |
adc \$0,%rdx |
|
204 |
jmp .Louter |
240 |
jmp .Louter |
205 |
.align 16 |
241 |
.align 16 |
206 |
.Louter: |
242 |
.Louter: |
|
|
243 |
lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) |
244 |
and \$-16,%rdx |
245 |
pxor %xmm4,%xmm4 |
246 |
pxor %xmm5,%xmm5 |
247 |
___ |
248 |
for($k=0;$k<$STRIDE/16;$k+=4) { |
249 |
$code.=<<___; |
250 |
movdqa `16*($k+0)-128`($bp),%xmm0 |
251 |
movdqa `16*($k+1)-128`($bp),%xmm1 |
252 |
movdqa `16*($k+2)-128`($bp),%xmm2 |
253 |
movdqa `16*($k+3)-128`($bp),%xmm3 |
254 |
pand `16*($k+0)-128`(%rdx),%xmm0 |
255 |
pand `16*($k+1)-128`(%rdx),%xmm1 |
256 |
por %xmm0,%xmm4 |
257 |
pand `16*($k+2)-128`(%rdx),%xmm2 |
258 |
por %xmm1,%xmm5 |
259 |
pand `16*($k+3)-128`(%rdx),%xmm3 |
260 |
por %xmm2,%xmm4 |
261 |
por %xmm3,%xmm5 |
262 |
___ |
263 |
} |
264 |
$code.=<<___; |
265 |
por %xmm5,%xmm4 |
266 |
pshufd \$0x4e,%xmm4,%xmm0 |
267 |
por %xmm4,%xmm0 |
268 |
lea $STRIDE($bp),$bp |
269 |
movq %xmm0,$m0 # m0=bp[i] |
270 |
|
207 |
xor $j,$j # j=0 |
271 |
xor $j,$j # j=0 |
208 |
mov $n0,$m1 |
272 |
mov $n0,$m1 |
209 |
mov (%rsp),$lo0 |
273 |
mov (%rsp),$lo0 |
210 |
|
274 |
|
211 |
movq `0*$STRIDE/4-96`($bp),%xmm0 |
|
|
212 |
movq `1*$STRIDE/4-96`($bp),%xmm1 |
213 |
pand %xmm4,%xmm0 |
214 |
movq `2*$STRIDE/4-96`($bp),%xmm2 |
215 |
pand %xmm5,%xmm1 |
216 |
|
217 |
mulq $m0 # ap[0]*bp[i] |
275 |
mulq $m0 # ap[0]*bp[i] |
218 |
add %rax,$lo0 # ap[0]*bp[i]+tp[0] |
276 |
add %rax,$lo0 # ap[0]*bp[i]+tp[0] |
219 |
mov ($np),%rax |
277 |
mov ($np),%rax |
220 |
adc \$0,%rdx |
278 |
adc \$0,%rdx |
221 |
|
279 |
|
222 |
movq `3*$STRIDE/4-96`($bp),%xmm3 |
|
|
223 |
pand %xmm6,%xmm2 |
224 |
por %xmm1,%xmm0 |
225 |
pand %xmm7,%xmm3 |
226 |
|
227 |
imulq $lo0,$m1 # tp[0]*n0 |
280 |
imulq $lo0,$m1 # tp[0]*n0 |
228 |
mov %rdx,$hi0 |
281 |
mov %rdx,$hi0 |
229 |
|
282 |
|
230 |
por %xmm2,%xmm0 |
|
|
231 |
lea $STRIDE($bp),$bp |
232 |
por %xmm3,%xmm0 |
233 |
|
234 |
mulq $m1 # np[0]*m1 |
283 |
mulq $m1 # np[0]*m1 |
235 |
add %rax,$lo0 # discarded |
284 |
add %rax,$lo0 # discarded |
236 |
mov 8($ap),%rax |
285 |
mov 8($ap),%rax |
|
266 |
cmp $num,$j |
315 |
cmp $num,$j |
267 |
jne .Linner |
316 |
jne .Linner |
268 |
|
317 |
|
269 |
movq %xmm0,$m0 # bp[i+1] |
|
|
270 |
|
271 |
add %rax,$hi1 |
318 |
add %rax,$hi1 |
272 |
mov ($ap),%rax # ap[0] |
319 |
mov ($ap),%rax # ap[0] |
273 |
adc \$0,%rdx |
320 |
adc \$0,%rdx |
|
321 |
|
368 |
|
322 |
mov 8(%rsp,$num,8),%rsi # restore %rsp |
369 |
mov 8(%rsp,$num,8),%rsi # restore %rsp |
323 |
mov \$1,%rax |
370 |
mov \$1,%rax |
324 |
___ |
371 |
|
325 |
$code.=<<___ if ($win64); |
|
|
326 |
movaps (%rsi),%xmm6 |
327 |
movaps 0x10(%rsi),%xmm7 |
328 |
lea 0x28(%rsi),%rsi |
329 |
___ |
330 |
$code.=<<___; |
331 |
mov (%rsi),%r15 |
372 |
mov (%rsi),%r15 |
332 |
mov 8(%rsi),%r14 |
373 |
mov 8(%rsi),%r14 |
333 |
mov 16(%rsi),%r13 |
374 |
mov 16(%rsi),%r13 |
|
348 |
bn_mul4x_mont_gather5: |
389 |
bn_mul4x_mont_gather5: |
349 |
.Lmul4x_enter: |
390 |
.Lmul4x_enter: |
350 |
mov ${num}d,${num}d |
391 |
mov ${num}d,${num}d |
351 |
mov `($win64?56:8)`(%rsp),%r10d # load 7th argument |
392 |
movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument |
|
|
393 |
lea .Linc(%rip),%r10 |
352 |
push %rbx |
394 |
push %rbx |
353 |
push %rbp |
395 |
push %rbp |
354 |
push %r12 |
396 |
push %r12 |
355 |
push %r13 |
397 |
push %r13 |
356 |
push %r14 |
398 |
push %r14 |
357 |
push %r15 |
399 |
push %r15 |
358 |
___ |
400 |
|
359 |
$code.=<<___ if ($win64); |
|
|
360 |
lea -0x28(%rsp),%rsp |
361 |
movaps %xmm6,(%rsp) |
362 |
movaps %xmm7,0x10(%rsp) |
363 |
.Lmul4x_alloca: |
401 |
.Lmul4x_alloca: |
364 |
___ |
|
|
365 |
$code.=<<___; |
366 |
mov %rsp,%rax |
402 |
mov %rsp,%rax |
367 |
lea 4($num),%r11 |
403 |
lea 4($num),%r11 |
368 |
neg %r11 |
404 |
neg %r11 |
369 |
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) |
405 |
lea -256(%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)+256) |
370 |
and \$-1024,%rsp # minimize TLB usage |
406 |
and \$-1024,%rsp # minimize TLB usage |
371 |
|
407 |
|
372 |
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp |
408 |
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp |
373 |
.Lmul4x_body: |
409 |
.Lmul4x_body: |
374 |
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp |
410 |
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp |
375 |
mov %rdx,%r12 # reassign $bp |
411 |
lea 128(%rdx),%r12 # reassign $bp (+size optimization) |
376 |
___ |
412 |
___ |
377 |
$bp="%r12"; |
413 |
$bp="%r12"; |
378 |
$STRIDE=2**5*8; # 5 is "window size" |
414 |
$STRIDE=2**5*8; # 5 is "window size" |
379 |
$N=$STRIDE/4; # should match cache line size |
415 |
$N=$STRIDE/4; # should match cache line size |
380 |
$code.=<<___; |
416 |
$code.=<<___; |
381 |
mov %r10,%r11 |
417 |
movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 |
382 |
shr \$`log($N/8)/log(2)`,%r10 |
418 |
movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 |
383 |
and \$`$N/8-1`,%r11 |
419 |
lea 32-112(%rsp,$num,8),%r10# place the mask after tp[num+4] (+ICache optimization) |
384 |
not %r10 |
420 |
|
385 |
lea .Lmagic_masks(%rip),%rax |
421 |
pshufd \$0,%xmm5,%xmm5 # broadcast index |
386 |
and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" |
422 |
movdqa %xmm1,%xmm4 |
387 |
lea 96($bp,%r11,8),$bp # pointer within 1st cache line |
423 |
.byte 0x67,0x67 |
388 |
movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which |
424 |
movdqa %xmm1,%xmm2 |
389 |
movq 8(%rax,%r10,8),%xmm5 # cache line contains element |
425 |
___ |
390 |
movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument |
426 |
######################################################################## |
391 |
movq 24(%rax,%r10,8),%xmm7 |
427 |
# calculate mask by comparing 0..31 to index and save result to stack |
392 |
|
428 |
# |
393 |
movq `0*$STRIDE/4-96`($bp),%xmm0 |
429 |
$code.=<<___; |
394 |
movq `1*$STRIDE/4-96`($bp),%xmm1 |
430 |
paddd %xmm0,%xmm1 |
395 |
pand %xmm4,%xmm0 |
431 |
pcmpeqd %xmm5,%xmm0 # compare to 1,0 |
396 |
movq `2*$STRIDE/4-96`($bp),%xmm2 |
432 |
.byte 0x67 |
397 |
pand %xmm5,%xmm1 |
433 |
movdqa %xmm4,%xmm3 |
398 |
movq `3*$STRIDE/4-96`($bp),%xmm3 |
434 |
___ |
399 |
pand %xmm6,%xmm2 |
435 |
for($k=0;$k<$STRIDE/16-4;$k+=4) { |
400 |
por %xmm1,%xmm0 |
436 |
$code.=<<___; |
401 |
pand %xmm7,%xmm3 |
437 |
paddd %xmm1,%xmm2 |
|
|
438 |
pcmpeqd %xmm5,%xmm1 # compare to 3,2 |
439 |
movdqa %xmm0,`16*($k+0)+112`(%r10) |
440 |
movdqa %xmm4,%xmm0 |
441 |
|
442 |
paddd %xmm2,%xmm3 |
443 |
pcmpeqd %xmm5,%xmm2 # compare to 5,4 |
444 |
movdqa %xmm1,`16*($k+1)+112`(%r10) |
445 |
movdqa %xmm4,%xmm1 |
446 |
|
447 |
paddd %xmm3,%xmm0 |
448 |
pcmpeqd %xmm5,%xmm3 # compare to 7,6 |
449 |
movdqa %xmm2,`16*($k+2)+112`(%r10) |
450 |
movdqa %xmm4,%xmm2 |
451 |
|
452 |
paddd %xmm0,%xmm1 |
453 |
pcmpeqd %xmm5,%xmm0 |
454 |
movdqa %xmm3,`16*($k+3)+112`(%r10) |
455 |
movdqa %xmm4,%xmm3 |
456 |
___ |
457 |
} |
458 |
$code.=<<___; # last iteration can be optimized |
459 |
paddd %xmm1,%xmm2 |
460 |
pcmpeqd %xmm5,%xmm1 |
461 |
movdqa %xmm0,`16*($k+0)+112`(%r10) |
462 |
|
463 |
paddd %xmm2,%xmm3 |
464 |
.byte 0x67 |
465 |
pcmpeqd %xmm5,%xmm2 |
466 |
movdqa %xmm1,`16*($k+1)+112`(%r10) |
467 |
|
468 |
pcmpeqd %xmm5,%xmm3 |
469 |
movdqa %xmm2,`16*($k+2)+112`(%r10) |
470 |
pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register |
471 |
|
472 |
pand `16*($k+1)-128`($bp),%xmm1 |
473 |
pand `16*($k+2)-128`($bp),%xmm2 |
474 |
movdqa %xmm3,`16*($k+3)+112`(%r10) |
475 |
pand `16*($k+3)-128`($bp),%xmm3 |
476 |
por %xmm2,%xmm0 |
477 |
por %xmm3,%xmm1 |
478 |
___ |
479 |
for($k=0;$k<$STRIDE/16-4;$k+=4) { |
480 |
$code.=<<___; |
481 |
movdqa `16*($k+0)-128`($bp),%xmm4 |
482 |
movdqa `16*($k+1)-128`($bp),%xmm5 |
483 |
movdqa `16*($k+2)-128`($bp),%xmm2 |
484 |
pand `16*($k+0)+112`(%r10),%xmm4 |
485 |
movdqa `16*($k+3)-128`($bp),%xmm3 |
486 |
pand `16*($k+1)+112`(%r10),%xmm5 |
487 |
por %xmm4,%xmm0 |
488 |
pand `16*($k+2)+112`(%r10),%xmm2 |
489 |
por %xmm5,%xmm1 |
490 |
pand `16*($k+3)+112`(%r10),%xmm3 |
402 |
por %xmm2,%xmm0 |
491 |
por %xmm2,%xmm0 |
|
|
492 |
por %xmm3,%xmm1 |
493 |
___ |
494 |
} |
495 |
$code.=<<___; |
496 |
por %xmm1,%xmm0 |
497 |
pshufd \$0x4e,%xmm0,%xmm1 |
498 |
por %xmm1,%xmm0 |
403 |
lea $STRIDE($bp),$bp |
499 |
lea $STRIDE($bp),$bp |
404 |
por %xmm3,%xmm0 |
|
|
405 |
|
406 |
movq %xmm0,$m0 # m0=bp[0] |
500 |
movq %xmm0,$m0 # m0=bp[0] |
|
|
501 |
|
407 |
mov ($n0),$n0 # pull n0[0] value |
502 |
mov ($n0),$n0 # pull n0[0] value |
408 |
mov ($ap),%rax |
503 |
mov ($ap),%rax |
409 |
|
504 |
|
410 |
xor $i,$i # i=0 |
505 |
xor $i,$i # i=0 |
411 |
xor $j,$j # j=0 |
506 |
xor $j,$j # j=0 |
412 |
|
507 |
|
413 |
movq `0*$STRIDE/4-96`($bp),%xmm0 |
|
|
414 |
movq `1*$STRIDE/4-96`($bp),%xmm1 |
415 |
pand %xmm4,%xmm0 |
416 |
movq `2*$STRIDE/4-96`($bp),%xmm2 |
417 |
pand %xmm5,%xmm1 |
418 |
|
419 |
mov $n0,$m1 |
508 |
mov $n0,$m1 |
420 |
mulq $m0 # ap[0]*bp[0] |
509 |
mulq $m0 # ap[0]*bp[0] |
421 |
mov %rax,$A[0] |
510 |
mov %rax,$A[0] |
422 |
mov ($np),%rax |
511 |
mov ($np),%rax |
423 |
|
512 |
|
424 |
movq `3*$STRIDE/4-96`($bp),%xmm3 |
|
|
425 |
pand %xmm6,%xmm2 |
426 |
por %xmm1,%xmm0 |
427 |
pand %xmm7,%xmm3 |
428 |
|
429 |
imulq $A[0],$m1 # "tp[0]"*n0 |
513 |
imulq $A[0],$m1 # "tp[0]"*n0 |
430 |
mov %rdx,$A[1] |
514 |
mov %rdx,$A[1] |
431 |
|
515 |
|
432 |
por %xmm2,%xmm0 |
|
|
433 |
lea $STRIDE($bp),$bp |
434 |
por %xmm3,%xmm0 |
435 |
|
436 |
mulq $m1 # np[0]*m1 |
516 |
mulq $m1 # np[0]*m1 |
437 |
add %rax,$A[0] # discarded |
517 |
add %rax,$A[0] # discarded |
438 |
mov 8($ap),%rax |
518 |
mov 8($ap),%rax |
|
550 |
mov $N[1],-16(%rsp,$j,8) # tp[j-1] |
630 |
mov $N[1],-16(%rsp,$j,8) # tp[j-1] |
551 |
mov %rdx,$N[0] |
631 |
mov %rdx,$N[0] |
552 |
|
632 |
|
553 |
movq %xmm0,$m0 # bp[1] |
|
|
554 |
|
555 |
xor $N[1],$N[1] |
633 |
xor $N[1],$N[1] |
556 |
add $A[0],$N[0] |
634 |
add $A[0],$N[0] |
557 |
adc \$0,$N[1] |
635 |
adc \$0,$N[1] |
|
561 |
lea 1($i),$i # i++ |
639 |
lea 1($i),$i # i++ |
562 |
.align 4 |
640 |
.align 4 |
563 |
.Louter4x: |
641 |
.Louter4x: |
|
|
642 |
lea 32+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) |
643 |
pxor %xmm4,%xmm4 |
644 |
pxor %xmm5,%xmm5 |
645 |
___ |
646 |
for($k=0;$k<$STRIDE/16;$k+=4) { |
647 |
$code.=<<___; |
648 |
movdqa `16*($k+0)-128`($bp),%xmm0 |
649 |
movdqa `16*($k+1)-128`($bp),%xmm1 |
650 |
movdqa `16*($k+2)-128`($bp),%xmm2 |
651 |
movdqa `16*($k+3)-128`($bp),%xmm3 |
652 |
pand `16*($k+0)-128`(%rdx),%xmm0 |
653 |
pand `16*($k+1)-128`(%rdx),%xmm1 |
654 |
por %xmm0,%xmm4 |
655 |
pand `16*($k+2)-128`(%rdx),%xmm2 |
656 |
por %xmm1,%xmm5 |
657 |
pand `16*($k+3)-128`(%rdx),%xmm3 |
658 |
por %xmm2,%xmm4 |
659 |
por %xmm3,%xmm5 |
660 |
___ |
661 |
} |
662 |
$code.=<<___; |
663 |
por %xmm5,%xmm4 |
664 |
pshufd \$0x4e,%xmm4,%xmm0 |
665 |
por %xmm4,%xmm0 |
666 |
lea $STRIDE($bp),$bp |
667 |
movq %xmm0,$m0 # m0=bp[i] |
668 |
|
564 |
xor $j,$j # j=0 |
669 |
xor $j,$j # j=0 |
565 |
movq `0*$STRIDE/4-96`($bp),%xmm0 |
|
|
566 |
movq `1*$STRIDE/4-96`($bp),%xmm1 |
567 |
pand %xmm4,%xmm0 |
568 |
movq `2*$STRIDE/4-96`($bp),%xmm2 |
569 |
pand %xmm5,%xmm1 |
570 |
|
670 |
|
571 |
mov (%rsp),$A[0] |
671 |
mov (%rsp),$A[0] |
572 |
mov $n0,$m1 |
672 |
mov $n0,$m1 |
|
575 |
mov ($np),%rax |
675 |
mov ($np),%rax |
576 |
adc \$0,%rdx |
676 |
adc \$0,%rdx |
577 |
|
677 |
|
578 |
movq `3*$STRIDE/4-96`($bp),%xmm3 |
|
|
579 |
pand %xmm6,%xmm2 |
580 |
por %xmm1,%xmm0 |
581 |
pand %xmm7,%xmm3 |
582 |
|
583 |
imulq $A[0],$m1 # tp[0]*n0 |
678 |
imulq $A[0],$m1 # tp[0]*n0 |
584 |
mov %rdx,$A[1] |
679 |
mov %rdx,$A[1] |
585 |
|
680 |
|
586 |
por %xmm2,%xmm0 |
|
|
587 |
lea $STRIDE($bp),$bp |
588 |
por %xmm3,%xmm0 |
589 |
|
590 |
mulq $m1 # np[0]*m1 |
681 |
mulq $m1 # np[0]*m1 |
591 |
add %rax,$A[0] # "$N[0]", discarded |
682 |
add %rax,$A[0] # "$N[0]", discarded |
592 |
mov 8($ap),%rax |
683 |
mov 8($ap),%rax |
|
718 |
mov $N[0],-24(%rsp,$j,8) # tp[j-1] |
809 |
mov $N[0],-24(%rsp,$j,8) # tp[j-1] |
719 |
mov %rdx,$N[0] |
810 |
mov %rdx,$N[0] |
720 |
|
811 |
|
721 |
movq %xmm0,$m0 # bp[i+1] |
|
|
722 |
mov $N[1],-16(%rsp,$j,8) # tp[j-1] |
812 |
mov $N[1],-16(%rsp,$j,8) # tp[j-1] |
723 |
|
813 |
|
724 |
xor $N[1],$N[1] |
814 |
xor $N[1],$N[1] |
|
809 |
$code.=<<___; |
899 |
$code.=<<___; |
810 |
mov 8(%rsp,$num,8),%rsi # restore %rsp |
900 |
mov 8(%rsp,$num,8),%rsi # restore %rsp |
811 |
mov \$1,%rax |
901 |
mov \$1,%rax |
812 |
___ |
902 |
|
813 |
$code.=<<___ if ($win64); |
|
|
814 |
movaps (%rsi),%xmm6 |
815 |
movaps 0x10(%rsi),%xmm7 |
816 |
lea 0x28(%rsi),%rsi |
817 |
___ |
818 |
$code.=<<___; |
819 |
mov (%rsi),%r15 |
903 |
mov (%rsi),%r15 |
820 |
mov 8(%rsi),%r14 |
904 |
mov 8(%rsi),%r14 |
821 |
mov 16(%rsi),%r13 |
905 |
mov 16(%rsi),%r13 |
|
830 |
}}} |
914 |
}}} |
831 |
|
915 |
|
832 |
{ |
916 |
{ |
833 |
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order |
917 |
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") : # Win64 order |
834 |
("%rdi","%rsi","%rdx","%rcx"); # Unix order |
918 |
("%rdi","%rsi","%rdx","%ecx"); # Unix order |
835 |
my $out=$inp; |
919 |
my $out=$inp; |
836 |
my $STRIDE=2**5*8; |
920 |
my $STRIDE=2**5*8; |
837 |
my $N=$STRIDE/4; |
921 |
my $N=$STRIDE/4; |
|
859 |
.type bn_gather5,\@abi-omnipotent |
943 |
.type bn_gather5,\@abi-omnipotent |
860 |
.align 16 |
944 |
.align 16 |
861 |
bn_gather5: |
945 |
bn_gather5: |
862 |
___ |
946 |
.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases |
863 |
$code.=<<___ if ($win64); |
|
|
864 |
.LSEH_begin_bn_gather5: |
865 |
# I can't trust assembler to use specific encoding:-( |
947 |
# I can't trust assembler to use specific encoding:-( |
866 |
.byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp |
948 |
.byte 0x4c,0x8d,0x14,0x24 # lea (%rsp),%r10 |
867 |
.byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) |
949 |
.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 # sub $0x108,%rsp |
868 |
.byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) |
950 |
lea .Linc(%rip),%rax |
|
|
951 |
and \$-16,%rsp # shouldn't be formally required |
952 |
|
953 |
movd $idx,%xmm5 |
954 |
movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 |
955 |
movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 |
956 |
lea 128($tbl),%r11 # size optimization |
957 |
lea 128(%rsp),%rax # size optimization |
958 |
|
959 |
pshufd \$0,%xmm5,%xmm5 # broadcast $idx |
960 |
movdqa %xmm1,%xmm4 |
961 |
movdqa %xmm1,%xmm2 |
869 |
___ |
962 |
___ |
|
|
963 |
######################################################################## |
964 |
# calculate mask by comparing 0..31 to $idx and save result to stack |
965 |
# |
966 |
for($i=0;$i<$STRIDE/16;$i+=4) { |
967 |
$code.=<<___; |
968 |
paddd %xmm0,%xmm1 |
969 |
pcmpeqd %xmm5,%xmm0 # compare to 1,0 |
970 |
___ |
971 |
$code.=<<___ if ($i); |
972 |
movdqa %xmm3,`16*($i-1)-128`(%rax) |
973 |
___ |
974 |
$code.=<<___; |
975 |
movdqa %xmm4,%xmm3 |
976 |
|
977 |
paddd %xmm1,%xmm2 |
978 |
pcmpeqd %xmm5,%xmm1 # compare to 3,2 |
979 |
movdqa %xmm0,`16*($i+0)-128`(%rax) |
980 |
movdqa %xmm4,%xmm0 |
981 |
|
982 |
paddd %xmm2,%xmm3 |
983 |
pcmpeqd %xmm5,%xmm2 # compare to 5,4 |
984 |
movdqa %xmm1,`16*($i+1)-128`(%rax) |
985 |
movdqa %xmm4,%xmm1 |
986 |
|
987 |
paddd %xmm3,%xmm0 |
988 |
pcmpeqd %xmm5,%xmm3 # compare to 7,6 |
989 |
movdqa %xmm2,`16*($i+2)-128`(%rax) |
990 |
movdqa %xmm4,%xmm2 |
991 |
___ |
992 |
} |
870 |
$code.=<<___; |
993 |
$code.=<<___; |
871 |
mov $idx,%r11 |
994 |
movdqa %xmm3,`16*($i-1)-128`(%rax) |
872 |
shr \$`log($N/8)/log(2)`,$idx |
|
|
873 |
and \$`$N/8-1`,%r11 |
874 |
not $idx |
875 |
lea .Lmagic_masks(%rip),%rax |
876 |
and \$`2**5/($N/8)-1`,$idx # 5 is "window size" |
877 |
lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line |
878 |
movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which |
879 |
movq 8(%rax,$idx,8),%xmm5 # cache line contains element |
880 |
movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument |
881 |
movq 24(%rax,$idx,8),%xmm7 |
882 |
jmp .Lgather |
995 |
jmp .Lgather |
883 |
.align 16 |
|
|
884 |
.Lgather: |
885 |
movq `0*$STRIDE/4-96`($tbl),%xmm0 |
886 |
movq `1*$STRIDE/4-96`($tbl),%xmm1 |
887 |
pand %xmm4,%xmm0 |
888 |
movq `2*$STRIDE/4-96`($tbl),%xmm2 |
889 |
pand %xmm5,%xmm1 |
890 |
movq `3*$STRIDE/4-96`($tbl),%xmm3 |
891 |
pand %xmm6,%xmm2 |
892 |
por %xmm1,%xmm0 |
893 |
pand %xmm7,%xmm3 |
894 |
por %xmm2,%xmm0 |
895 |
lea $STRIDE($tbl),$tbl |
896 |
por %xmm3,%xmm0 |
897 |
|
996 |
|
|
|
997 |
.align 32 |
998 |
.Lgather: |
999 |
pxor %xmm4,%xmm4 |
1000 |
pxor %xmm5,%xmm5 |
1001 |
___ |
1002 |
for($i=0;$i<$STRIDE/16;$i+=4) { |
1003 |
$code.=<<___; |
1004 |
movdqa `16*($i+0)-128`(%r11),%xmm0 |
1005 |
movdqa `16*($i+1)-128`(%r11),%xmm1 |
1006 |
movdqa `16*($i+2)-128`(%r11),%xmm2 |
1007 |
pand `16*($i+0)-128`(%rax),%xmm0 |
1008 |
movdqa `16*($i+3)-128`(%r11),%xmm3 |
1009 |
pand `16*($i+1)-128`(%rax),%xmm1 |
1010 |
por %xmm0,%xmm4 |
1011 |
pand `16*($i+2)-128`(%rax),%xmm2 |
1012 |
por %xmm1,%xmm5 |
1013 |
pand `16*($i+3)-128`(%rax),%xmm3 |
1014 |
por %xmm2,%xmm4 |
1015 |
por %xmm3,%xmm5 |
1016 |
___ |
1017 |
} |
1018 |
$code.=<<___; |
1019 |
por %xmm5,%xmm4 |
1020 |
lea $STRIDE(%r11),%r11 |
1021 |
pshufd \$0x4e,%xmm4,%xmm0 |
1022 |
por %xmm4,%xmm0 |
898 |
movq %xmm0,($out) # m0=bp[0] |
1023 |
movq %xmm0,($out) # m0=bp[0] |
899 |
lea 8($out),$out |
1024 |
lea 8($out),$out |
900 |
sub \$1,$num |
1025 |
sub \$1,$num |
901 |
jnz .Lgather |
1026 |
jnz .Lgather |
902 |
___ |
1027 |
|
903 |
$code.=<<___ if ($win64); |
1028 |
lea (%r10),%rsp |
904 |
movaps (%rsp),%xmm6 |
|
|
905 |
movaps 0x10(%rsp),%xmm7 |
906 |
lea 0x28(%rsp),%rsp |
907 |
___ |
908 |
$code.=<<___; |
909 |
ret |
1029 |
ret |
910 |
.LSEH_end_bn_gather5: |
1030 |
.LSEH_end_bn_gather5: |
911 |
.size bn_gather5,.-bn_gather5 |
1031 |
.size bn_gather5,.-bn_gather5 |
|
913 |
} |
1033 |
} |
914 |
$code.=<<___; |
1034 |
$code.=<<___; |
915 |
.align 64 |
1035 |
.align 64 |
916 |
.Lmagic_masks: |
1036 |
.Linc: |
917 |
.long 0,0, 0,0, 0,0, -1,-1 |
1037 |
.long 0,0, 1,1 |
918 |
.long 0,0, 0,0, 0,0, 0,0 |
1038 |
.long 2,2, 2,2 |
919 |
.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
1039 |
.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
920 |
___ |
1040 |
___ |
921 |
|
1041 |
|
|
954 |
cmp %r10,%rbx # context->Rip<end of prologue label |
1074 |
cmp %r10,%rbx # context->Rip<end of prologue label |
955 |
jb .Lcommon_seh_tail |
1075 |
jb .Lcommon_seh_tail |
956 |
|
1076 |
|
957 |
lea `40+48`(%rax),%rax |
1077 |
lea 48(%rax),%rax |
958 |
|
1078 |
|
959 |
mov 4(%r11),%r10d # HandlerData[1] |
1079 |
mov 4(%r11),%r10d # HandlerData[1] |
960 |
lea (%rsi,%r10),%r10 # end of alloca label |
1080 |
lea (%rsi,%r10),%r10 # end of alloca label |
|
971 |
mov 192($context),%r10 # pull $num |
1091 |
mov 192($context),%r10 # pull $num |
972 |
mov 8(%rax,%r10,8),%rax # pull saved stack pointer |
1092 |
mov 8(%rax,%r10,8),%rax # pull saved stack pointer |
973 |
|
1093 |
|
974 |
movaps (%rax),%xmm0 |
1094 |
lea 48(%rax),%rax |
975 |
movaps 16(%rax),%xmm1 |
|
|
976 |
lea `40+48`(%rax),%rax |
977 |
|
1095 |
|
978 |
mov -8(%rax),%rbx |
1096 |
mov -8(%rax),%rbx |
979 |
mov -16(%rax),%rbp |
1097 |
mov -16(%rax),%rbp |
|
987 |
mov %r13,224($context) # restore context->R13 |
1105 |
mov %r13,224($context) # restore context->R13 |
988 |
mov %r14,232($context) # restore context->R14 |
1106 |
mov %r14,232($context) # restore context->R14 |
989 |
mov %r15,240($context) # restore context->R15 |
1107 |
mov %r15,240($context) # restore context->R15 |
990 |
movups %xmm0,512($context) # restore context->Xmm6 |
|
|
991 |
movups %xmm1,528($context) # restore context->Xmm7 |
992 |
|
1108 |
|
993 |
.Lcommon_seh_tail: |
1109 |
.Lcommon_seh_tail: |
994 |
mov 8(%rax),%rdi |
1110 |
mov 8(%rax),%rdi |
|
1057 |
.rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] |
1173 |
.rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] |
1058 |
.align 8 |
1174 |
.align 8 |
1059 |
.LSEH_info_bn_gather5: |
1175 |
.LSEH_info_bn_gather5: |
1060 |
.byte 0x01,0x0d,0x05,0x00 |
1176 |
.byte 0x01,0x0b,0x03,0x0a |
1061 |
.byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 |
1177 |
.byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 |
1062 |
.byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 |
1178 |
.byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp), set_frame r10 |
1063 |
.byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 |
|
|
1064 |
.align 8 |
1179 |
.align 8 |
1065 |
___ |
1180 |
___ |
1066 |
} |
1181 |
} |