|
Lines 66-125
bn_mul_mont_gather5:
Link Here
|
| 66 |
.align 16 |
66 |
.align 16 |
| 67 |
.Lmul_enter: |
67 |
.Lmul_enter: |
| 68 |
mov ${num}d,${num}d |
68 |
mov ${num}d,${num}d |
| 69 |
mov `($win64?56:8)`(%rsp),%r10d # load 7th argument |
69 |
movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument |
|
|
70 |
lea .Linc(%rip),%r10 |
| 70 |
push %rbx |
71 |
push %rbx |
| 71 |
push %rbp |
72 |
push %rbp |
| 72 |
push %r12 |
73 |
push %r12 |
| 73 |
push %r13 |
74 |
push %r13 |
| 74 |
push %r14 |
75 |
push %r14 |
| 75 |
push %r15 |
76 |
push %r15 |
| 76 |
___ |
77 |
|
| 77 |
$code.=<<___ if ($win64); |
|
|
| 78 |
lea -0x28(%rsp),%rsp |
| 79 |
movaps %xmm6,(%rsp) |
| 80 |
movaps %xmm7,0x10(%rsp) |
| 81 |
.Lmul_alloca: |
78 |
.Lmul_alloca: |
| 82 |
___ |
|
|
| 83 |
$code.=<<___; |
| 84 |
mov %rsp,%rax |
79 |
mov %rsp,%rax |
| 85 |
lea 2($num),%r11 |
80 |
lea 2($num),%r11 |
| 86 |
neg %r11 |
81 |
neg %r11 |
| 87 |
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) |
82 |
lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8) |
| 88 |
and \$-1024,%rsp # minimize TLB usage |
83 |
and \$-1024,%rsp # minimize TLB usage |
| 89 |
|
84 |
|
| 90 |
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp |
85 |
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp |
| 91 |
.Lmul_body: |
86 |
.Lmul_body: |
| 92 |
mov $bp,%r12 # reassign $bp |
87 |
lea 128($bp),%r12 # reassign $bp (+size optimization) |
| 93 |
___ |
88 |
___ |
| 94 |
$bp="%r12"; |
89 |
$bp="%r12"; |
| 95 |
$STRIDE=2**5*8; # 5 is "window size" |
90 |
$STRIDE=2**5*8; # 5 is "window size" |
| 96 |
$N=$STRIDE/4; # should match cache line size |
91 |
$N=$STRIDE/4; # should match cache line size |
| 97 |
$code.=<<___; |
92 |
$code.=<<___; |
| 98 |
mov %r10,%r11 |
93 |
movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 |
| 99 |
shr \$`log($N/8)/log(2)`,%r10 |
94 |
movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 |
| 100 |
and \$`$N/8-1`,%r11 |
95 |
lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) |
| 101 |
not %r10 |
96 |
and \$-16,%r10 |
| 102 |
lea .Lmagic_masks(%rip),%rax |
97 |
|
| 103 |
and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" |
98 |
pshufd \$0,%xmm5,%xmm5 # broadcast index |
| 104 |
lea 96($bp,%r11,8),$bp # pointer within 1st cache line |
99 |
movdqa %xmm1,%xmm4 |
| 105 |
movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which |
100 |
movdqa %xmm1,%xmm2 |
| 106 |
movq 8(%rax,%r10,8),%xmm5 # cache line contains element |
101 |
___ |
| 107 |
movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument |
102 |
######################################################################## |
| 108 |
movq 24(%rax,%r10,8),%xmm7 |
103 |
# calculate mask by comparing 0..31 to index and save result to stack |
| 109 |
|
104 |
# |
| 110 |
movq `0*$STRIDE/4-96`($bp),%xmm0 |
105 |
$code.=<<___; |
| 111 |
movq `1*$STRIDE/4-96`($bp),%xmm1 |
106 |
paddd %xmm0,%xmm1 |
| 112 |
pand %xmm4,%xmm0 |
107 |
pcmpeqd %xmm5,%xmm0 # compare to 1,0 |
| 113 |
movq `2*$STRIDE/4-96`($bp),%xmm2 |
108 |
.byte 0x67 |
| 114 |
pand %xmm5,%xmm1 |
109 |
movdqa %xmm4,%xmm3 |
| 115 |
movq `3*$STRIDE/4-96`($bp),%xmm3 |
110 |
___ |
| 116 |
pand %xmm6,%xmm2 |
111 |
for($k=0;$k<$STRIDE/16-4;$k+=4) { |
| 117 |
por %xmm1,%xmm0 |
112 |
$code.=<<___; |
| 118 |
pand %xmm7,%xmm3 |
113 |
paddd %xmm1,%xmm2 |
|
|
114 |
pcmpeqd %xmm5,%xmm1 # compare to 3,2 |
| 115 |
movdqa %xmm0,`16*($k+0)+112`(%r10) |
| 116 |
movdqa %xmm4,%xmm0 |
| 117 |
|
| 118 |
paddd %xmm2,%xmm3 |
| 119 |
pcmpeqd %xmm5,%xmm2 # compare to 5,4 |
| 120 |
movdqa %xmm1,`16*($k+1)+112`(%r10) |
| 121 |
movdqa %xmm4,%xmm1 |
| 122 |
|
| 123 |
paddd %xmm3,%xmm0 |
| 124 |
pcmpeqd %xmm5,%xmm3 # compare to 7,6 |
| 125 |
movdqa %xmm2,`16*($k+2)+112`(%r10) |
| 126 |
movdqa %xmm4,%xmm2 |
| 127 |
|
| 128 |
paddd %xmm0,%xmm1 |
| 129 |
pcmpeqd %xmm5,%xmm0 |
| 130 |
movdqa %xmm3,`16*($k+3)+112`(%r10) |
| 131 |
movdqa %xmm4,%xmm3 |
| 132 |
___ |
| 133 |
} |
| 134 |
$code.=<<___; # last iteration can be optimized |
| 135 |
paddd %xmm1,%xmm2 |
| 136 |
pcmpeqd %xmm5,%xmm1 |
| 137 |
movdqa %xmm0,`16*($k+0)+112`(%r10) |
| 138 |
|
| 139 |
paddd %xmm2,%xmm3 |
| 140 |
.byte 0x67 |
| 141 |
pcmpeqd %xmm5,%xmm2 |
| 142 |
movdqa %xmm1,`16*($k+1)+112`(%r10) |
| 143 |
|
| 144 |
pcmpeqd %xmm5,%xmm3 |
| 145 |
movdqa %xmm2,`16*($k+2)+112`(%r10) |
| 146 |
pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register |
| 147 |
|
| 148 |
pand `16*($k+1)-128`($bp),%xmm1 |
| 149 |
pand `16*($k+2)-128`($bp),%xmm2 |
| 150 |
movdqa %xmm3,`16*($k+3)+112`(%r10) |
| 151 |
pand `16*($k+3)-128`($bp),%xmm3 |
| 152 |
por %xmm2,%xmm0 |
| 153 |
por %xmm3,%xmm1 |
| 154 |
___ |
| 155 |
for($k=0;$k<$STRIDE/16-4;$k+=4) { |
| 156 |
$code.=<<___; |
| 157 |
movdqa `16*($k+0)-128`($bp),%xmm4 |
| 158 |
movdqa `16*($k+1)-128`($bp),%xmm5 |
| 159 |
movdqa `16*($k+2)-128`($bp),%xmm2 |
| 160 |
pand `16*($k+0)+112`(%r10),%xmm4 |
| 161 |
movdqa `16*($k+3)-128`($bp),%xmm3 |
| 162 |
pand `16*($k+1)+112`(%r10),%xmm5 |
| 163 |
por %xmm4,%xmm0 |
| 164 |
pand `16*($k+2)+112`(%r10),%xmm2 |
| 165 |
por %xmm5,%xmm1 |
| 166 |
pand `16*($k+3)+112`(%r10),%xmm3 |
| 119 |
por %xmm2,%xmm0 |
167 |
por %xmm2,%xmm0 |
|
|
168 |
por %xmm3,%xmm1 |
| 169 |
___ |
| 170 |
} |
| 171 |
$code.=<<___; |
| 172 |
por %xmm1,%xmm0 |
| 173 |
pshufd \$0x4e,%xmm0,%xmm1 |
| 174 |
por %xmm1,%xmm0 |
| 120 |
lea $STRIDE($bp),$bp |
175 |
lea $STRIDE($bp),$bp |
| 121 |
por %xmm3,%xmm0 |
|
|
| 122 |
|
| 123 |
movq %xmm0,$m0 # m0=bp[0] |
176 |
movq %xmm0,$m0 # m0=bp[0] |
| 124 |
|
177 |
|
| 125 |
mov ($n0),$n0 # pull n0[0] value |
178 |
mov ($n0),$n0 # pull n0[0] value |
|
Lines 128-156
$code.=<<___;
Link Here
|
| 128 |
xor $i,$i # i=0 |
181 |
xor $i,$i # i=0 |
| 129 |
xor $j,$j # j=0 |
182 |
xor $j,$j # j=0 |
| 130 |
|
183 |
|
| 131 |
movq `0*$STRIDE/4-96`($bp),%xmm0 |
|
|
| 132 |
movq `1*$STRIDE/4-96`($bp),%xmm1 |
| 133 |
pand %xmm4,%xmm0 |
| 134 |
movq `2*$STRIDE/4-96`($bp),%xmm2 |
| 135 |
pand %xmm5,%xmm1 |
| 136 |
|
| 137 |
mov $n0,$m1 |
184 |
mov $n0,$m1 |
| 138 |
mulq $m0 # ap[0]*bp[0] |
185 |
mulq $m0 # ap[0]*bp[0] |
| 139 |
mov %rax,$lo0 |
186 |
mov %rax,$lo0 |
| 140 |
mov ($np),%rax |
187 |
mov ($np),%rax |
| 141 |
|
188 |
|
| 142 |
movq `3*$STRIDE/4-96`($bp),%xmm3 |
|
|
| 143 |
pand %xmm6,%xmm2 |
| 144 |
por %xmm1,%xmm0 |
| 145 |
pand %xmm7,%xmm3 |
| 146 |
|
| 147 |
imulq $lo0,$m1 # "tp[0]"*n0 |
189 |
imulq $lo0,$m1 # "tp[0]"*n0 |
| 148 |
mov %rdx,$hi0 |
190 |
mov %rdx,$hi0 |
| 149 |
|
191 |
|
| 150 |
por %xmm2,%xmm0 |
|
|
| 151 |
lea $STRIDE($bp),$bp |
| 152 |
por %xmm3,%xmm0 |
| 153 |
|
| 154 |
mulq $m1 # np[0]*m1 |
192 |
mulq $m1 # np[0]*m1 |
| 155 |
add %rax,$lo0 # discarded |
193 |
add %rax,$lo0 # discarded |
| 156 |
mov 8($ap),%rax |
194 |
mov 8($ap),%rax |
|
Lines 183-190
$code.=<<___;
Link Here
|
| 183 |
cmp $num,$j |
221 |
cmp $num,$j |
| 184 |
jne .L1st |
222 |
jne .L1st |
| 185 |
|
223 |
|
| 186 |
movq %xmm0,$m0 # bp[1] |
|
|
| 187 |
|
| 188 |
add %rax,$hi1 |
224 |
add %rax,$hi1 |
| 189 |
mov ($ap),%rax # ap[0] |
225 |
mov ($ap),%rax # ap[0] |
| 190 |
adc \$0,%rdx |
226 |
adc \$0,%rdx |
|
Lines 204-236
$code.=<<___;
Link Here
|
| 204 |
jmp .Louter |
240 |
jmp .Louter |
| 205 |
.align 16 |
241 |
.align 16 |
| 206 |
.Louter: |
242 |
.Louter: |
|
|
243 |
lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) |
| 244 |
and \$-16,%rdx |
| 245 |
pxor %xmm4,%xmm4 |
| 246 |
pxor %xmm5,%xmm5 |
| 247 |
___ |
| 248 |
for($k=0;$k<$STRIDE/16;$k+=4) { |
| 249 |
$code.=<<___; |
| 250 |
movdqa `16*($k+0)-128`($bp),%xmm0 |
| 251 |
movdqa `16*($k+1)-128`($bp),%xmm1 |
| 252 |
movdqa `16*($k+2)-128`($bp),%xmm2 |
| 253 |
movdqa `16*($k+3)-128`($bp),%xmm3 |
| 254 |
pand `16*($k+0)-128`(%rdx),%xmm0 |
| 255 |
pand `16*($k+1)-128`(%rdx),%xmm1 |
| 256 |
por %xmm0,%xmm4 |
| 257 |
pand `16*($k+2)-128`(%rdx),%xmm2 |
| 258 |
por %xmm1,%xmm5 |
| 259 |
pand `16*($k+3)-128`(%rdx),%xmm3 |
| 260 |
por %xmm2,%xmm4 |
| 261 |
por %xmm3,%xmm5 |
| 262 |
___ |
| 263 |
} |
| 264 |
$code.=<<___; |
| 265 |
por %xmm5,%xmm4 |
| 266 |
pshufd \$0x4e,%xmm4,%xmm0 |
| 267 |
por %xmm4,%xmm0 |
| 268 |
lea $STRIDE($bp),$bp |
| 269 |
movq %xmm0,$m0 # m0=bp[i] |
| 270 |
|
| 207 |
xor $j,$j # j=0 |
271 |
xor $j,$j # j=0 |
| 208 |
mov $n0,$m1 |
272 |
mov $n0,$m1 |
| 209 |
mov (%rsp),$lo0 |
273 |
mov (%rsp),$lo0 |
| 210 |
|
274 |
|
| 211 |
movq `0*$STRIDE/4-96`($bp),%xmm0 |
|
|
| 212 |
movq `1*$STRIDE/4-96`($bp),%xmm1 |
| 213 |
pand %xmm4,%xmm0 |
| 214 |
movq `2*$STRIDE/4-96`($bp),%xmm2 |
| 215 |
pand %xmm5,%xmm1 |
| 216 |
|
| 217 |
mulq $m0 # ap[0]*bp[i] |
275 |
mulq $m0 # ap[0]*bp[i] |
| 218 |
add %rax,$lo0 # ap[0]*bp[i]+tp[0] |
276 |
add %rax,$lo0 # ap[0]*bp[i]+tp[0] |
| 219 |
mov ($np),%rax |
277 |
mov ($np),%rax |
| 220 |
adc \$0,%rdx |
278 |
adc \$0,%rdx |
| 221 |
|
279 |
|
| 222 |
movq `3*$STRIDE/4-96`($bp),%xmm3 |
|
|
| 223 |
pand %xmm6,%xmm2 |
| 224 |
por %xmm1,%xmm0 |
| 225 |
pand %xmm7,%xmm3 |
| 226 |
|
| 227 |
imulq $lo0,$m1 # tp[0]*n0 |
280 |
imulq $lo0,$m1 # tp[0]*n0 |
| 228 |
mov %rdx,$hi0 |
281 |
mov %rdx,$hi0 |
| 229 |
|
282 |
|
| 230 |
por %xmm2,%xmm0 |
|
|
| 231 |
lea $STRIDE($bp),$bp |
| 232 |
por %xmm3,%xmm0 |
| 233 |
|
| 234 |
mulq $m1 # np[0]*m1 |
283 |
mulq $m1 # np[0]*m1 |
| 235 |
add %rax,$lo0 # discarded |
284 |
add %rax,$lo0 # discarded |
| 236 |
mov 8($ap),%rax |
285 |
mov 8($ap),%rax |
|
Lines 266-273
$code.=<<___;
Link Here
|
| 266 |
cmp $num,$j |
315 |
cmp $num,$j |
| 267 |
jne .Linner |
316 |
jne .Linner |
| 268 |
|
317 |
|
| 269 |
movq %xmm0,$m0 # bp[i+1] |
|
|
| 270 |
|
| 271 |
add %rax,$hi1 |
318 |
add %rax,$hi1 |
| 272 |
mov ($ap),%rax # ap[0] |
319 |
mov ($ap),%rax # ap[0] |
| 273 |
adc \$0,%rdx |
320 |
adc \$0,%rdx |
|
Lines 321-333
$code.=<<___;
Link Here
|
| 321 |
|
368 |
|
| 322 |
mov 8(%rsp,$num,8),%rsi # restore %rsp |
369 |
mov 8(%rsp,$num,8),%rsi # restore %rsp |
| 323 |
mov \$1,%rax |
370 |
mov \$1,%rax |
| 324 |
___ |
371 |
|
| 325 |
$code.=<<___ if ($win64); |
|
|
| 326 |
movaps (%rsi),%xmm6 |
| 327 |
movaps 0x10(%rsi),%xmm7 |
| 328 |
lea 0x28(%rsi),%rsi |
| 329 |
___ |
| 330 |
$code.=<<___; |
| 331 |
mov (%rsi),%r15 |
372 |
mov (%rsi),%r15 |
| 332 |
mov 8(%rsi),%r14 |
373 |
mov 8(%rsi),%r14 |
| 333 |
mov 16(%rsi),%r13 |
374 |
mov 16(%rsi),%r13 |
|
Lines 348-438
$code.=<<___;
Link Here
|
| 348 |
bn_mul4x_mont_gather5: |
389 |
bn_mul4x_mont_gather5: |
| 349 |
.Lmul4x_enter: |
390 |
.Lmul4x_enter: |
| 350 |
mov ${num}d,${num}d |
391 |
mov ${num}d,${num}d |
| 351 |
mov `($win64?56:8)`(%rsp),%r10d # load 7th argument |
392 |
movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument |
|
|
393 |
lea .Linc(%rip),%r10 |
| 352 |
push %rbx |
394 |
push %rbx |
| 353 |
push %rbp |
395 |
push %rbp |
| 354 |
push %r12 |
396 |
push %r12 |
| 355 |
push %r13 |
397 |
push %r13 |
| 356 |
push %r14 |
398 |
push %r14 |
| 357 |
push %r15 |
399 |
push %r15 |
| 358 |
___ |
400 |
|
| 359 |
$code.=<<___ if ($win64); |
|
|
| 360 |
lea -0x28(%rsp),%rsp |
| 361 |
movaps %xmm6,(%rsp) |
| 362 |
movaps %xmm7,0x10(%rsp) |
| 363 |
.Lmul4x_alloca: |
401 |
.Lmul4x_alloca: |
| 364 |
___ |
|
|
| 365 |
$code.=<<___; |
| 366 |
mov %rsp,%rax |
402 |
mov %rsp,%rax |
| 367 |
lea 4($num),%r11 |
403 |
lea 4($num),%r11 |
| 368 |
neg %r11 |
404 |
neg %r11 |
| 369 |
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) |
405 |
lea -256(%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)+256) |
| 370 |
and \$-1024,%rsp # minimize TLB usage |
406 |
and \$-1024,%rsp # minimize TLB usage |
| 371 |
|
407 |
|
| 372 |
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp |
408 |
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp |
| 373 |
.Lmul4x_body: |
409 |
.Lmul4x_body: |
| 374 |
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp |
410 |
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp |
| 375 |
mov %rdx,%r12 # reassign $bp |
411 |
lea 128(%rdx),%r12 # reassign $bp (+size optimization) |
| 376 |
___ |
412 |
___ |
| 377 |
$bp="%r12"; |
413 |
$bp="%r12"; |
| 378 |
$STRIDE=2**5*8; # 5 is "window size" |
414 |
$STRIDE=2**5*8; # 5 is "window size" |
| 379 |
$N=$STRIDE/4; # should match cache line size |
415 |
$N=$STRIDE/4; # should match cache line size |
| 380 |
$code.=<<___; |
416 |
$code.=<<___; |
| 381 |
mov %r10,%r11 |
417 |
movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 |
| 382 |
shr \$`log($N/8)/log(2)`,%r10 |
418 |
movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 |
| 383 |
and \$`$N/8-1`,%r11 |
419 |
lea 32-112(%rsp,$num,8),%r10# place the mask after tp[num+4] (+ICache optimization) |
| 384 |
not %r10 |
420 |
|
| 385 |
lea .Lmagic_masks(%rip),%rax |
421 |
pshufd \$0,%xmm5,%xmm5 # broadcast index |
| 386 |
and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" |
422 |
movdqa %xmm1,%xmm4 |
| 387 |
lea 96($bp,%r11,8),$bp # pointer within 1st cache line |
423 |
.byte 0x67,0x67 |
| 388 |
movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which |
424 |
movdqa %xmm1,%xmm2 |
| 389 |
movq 8(%rax,%r10,8),%xmm5 # cache line contains element |
425 |
___ |
| 390 |
movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument |
426 |
######################################################################## |
| 391 |
movq 24(%rax,%r10,8),%xmm7 |
427 |
# calculate mask by comparing 0..31 to index and save result to stack |
| 392 |
|
428 |
# |
| 393 |
movq `0*$STRIDE/4-96`($bp),%xmm0 |
429 |
$code.=<<___; |
| 394 |
movq `1*$STRIDE/4-96`($bp),%xmm1 |
430 |
paddd %xmm0,%xmm1 |
| 395 |
pand %xmm4,%xmm0 |
431 |
pcmpeqd %xmm5,%xmm0 # compare to 1,0 |
| 396 |
movq `2*$STRIDE/4-96`($bp),%xmm2 |
432 |
.byte 0x67 |
| 397 |
pand %xmm5,%xmm1 |
433 |
movdqa %xmm4,%xmm3 |
| 398 |
movq `3*$STRIDE/4-96`($bp),%xmm3 |
434 |
___ |
| 399 |
pand %xmm6,%xmm2 |
435 |
for($k=0;$k<$STRIDE/16-4;$k+=4) { |
| 400 |
por %xmm1,%xmm0 |
436 |
$code.=<<___; |
| 401 |
pand %xmm7,%xmm3 |
437 |
paddd %xmm1,%xmm2 |
|
|
438 |
pcmpeqd %xmm5,%xmm1 # compare to 3,2 |
| 439 |
movdqa %xmm0,`16*($k+0)+112`(%r10) |
| 440 |
movdqa %xmm4,%xmm0 |
| 441 |
|
| 442 |
paddd %xmm2,%xmm3 |
| 443 |
pcmpeqd %xmm5,%xmm2 # compare to 5,4 |
| 444 |
movdqa %xmm1,`16*($k+1)+112`(%r10) |
| 445 |
movdqa %xmm4,%xmm1 |
| 446 |
|
| 447 |
paddd %xmm3,%xmm0 |
| 448 |
pcmpeqd %xmm5,%xmm3 # compare to 7,6 |
| 449 |
movdqa %xmm2,`16*($k+2)+112`(%r10) |
| 450 |
movdqa %xmm4,%xmm2 |
| 451 |
|
| 452 |
paddd %xmm0,%xmm1 |
| 453 |
pcmpeqd %xmm5,%xmm0 |
| 454 |
movdqa %xmm3,`16*($k+3)+112`(%r10) |
| 455 |
movdqa %xmm4,%xmm3 |
| 456 |
___ |
| 457 |
} |
| 458 |
$code.=<<___; # last iteration can be optimized |
| 459 |
paddd %xmm1,%xmm2 |
| 460 |
pcmpeqd %xmm5,%xmm1 |
| 461 |
movdqa %xmm0,`16*($k+0)+112`(%r10) |
| 462 |
|
| 463 |
paddd %xmm2,%xmm3 |
| 464 |
.byte 0x67 |
| 465 |
pcmpeqd %xmm5,%xmm2 |
| 466 |
movdqa %xmm1,`16*($k+1)+112`(%r10) |
| 467 |
|
| 468 |
pcmpeqd %xmm5,%xmm3 |
| 469 |
movdqa %xmm2,`16*($k+2)+112`(%r10) |
| 470 |
pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register |
| 471 |
|
| 472 |
pand `16*($k+1)-128`($bp),%xmm1 |
| 473 |
pand `16*($k+2)-128`($bp),%xmm2 |
| 474 |
movdqa %xmm3,`16*($k+3)+112`(%r10) |
| 475 |
pand `16*($k+3)-128`($bp),%xmm3 |
| 476 |
por %xmm2,%xmm0 |
| 477 |
por %xmm3,%xmm1 |
| 478 |
___ |
| 479 |
for($k=0;$k<$STRIDE/16-4;$k+=4) { |
| 480 |
$code.=<<___; |
| 481 |
movdqa `16*($k+0)-128`($bp),%xmm4 |
| 482 |
movdqa `16*($k+1)-128`($bp),%xmm5 |
| 483 |
movdqa `16*($k+2)-128`($bp),%xmm2 |
| 484 |
pand `16*($k+0)+112`(%r10),%xmm4 |
| 485 |
movdqa `16*($k+3)-128`($bp),%xmm3 |
| 486 |
pand `16*($k+1)+112`(%r10),%xmm5 |
| 487 |
por %xmm4,%xmm0 |
| 488 |
pand `16*($k+2)+112`(%r10),%xmm2 |
| 489 |
por %xmm5,%xmm1 |
| 490 |
pand `16*($k+3)+112`(%r10),%xmm3 |
| 402 |
por %xmm2,%xmm0 |
491 |
por %xmm2,%xmm0 |
|
|
492 |
por %xmm3,%xmm1 |
| 493 |
___ |
| 494 |
} |
| 495 |
$code.=<<___; |
| 496 |
por %xmm1,%xmm0 |
| 497 |
pshufd \$0x4e,%xmm0,%xmm1 |
| 498 |
por %xmm1,%xmm0 |
| 403 |
lea $STRIDE($bp),$bp |
499 |
lea $STRIDE($bp),$bp |
| 404 |
por %xmm3,%xmm0 |
|
|
| 405 |
|
| 406 |
movq %xmm0,$m0 # m0=bp[0] |
500 |
movq %xmm0,$m0 # m0=bp[0] |
|
|
501 |
|
| 407 |
mov ($n0),$n0 # pull n0[0] value |
502 |
mov ($n0),$n0 # pull n0[0] value |
| 408 |
mov ($ap),%rax |
503 |
mov ($ap),%rax |
| 409 |
|
504 |
|
| 410 |
xor $i,$i # i=0 |
505 |
xor $i,$i # i=0 |
| 411 |
xor $j,$j # j=0 |
506 |
xor $j,$j # j=0 |
| 412 |
|
507 |
|
| 413 |
movq `0*$STRIDE/4-96`($bp),%xmm0 |
|
|
| 414 |
movq `1*$STRIDE/4-96`($bp),%xmm1 |
| 415 |
pand %xmm4,%xmm0 |
| 416 |
movq `2*$STRIDE/4-96`($bp),%xmm2 |
| 417 |
pand %xmm5,%xmm1 |
| 418 |
|
| 419 |
mov $n0,$m1 |
508 |
mov $n0,$m1 |
| 420 |
mulq $m0 # ap[0]*bp[0] |
509 |
mulq $m0 # ap[0]*bp[0] |
| 421 |
mov %rax,$A[0] |
510 |
mov %rax,$A[0] |
| 422 |
mov ($np),%rax |
511 |
mov ($np),%rax |
| 423 |
|
512 |
|
| 424 |
movq `3*$STRIDE/4-96`($bp),%xmm3 |
|
|
| 425 |
pand %xmm6,%xmm2 |
| 426 |
por %xmm1,%xmm0 |
| 427 |
pand %xmm7,%xmm3 |
| 428 |
|
| 429 |
imulq $A[0],$m1 # "tp[0]"*n0 |
513 |
imulq $A[0],$m1 # "tp[0]"*n0 |
| 430 |
mov %rdx,$A[1] |
514 |
mov %rdx,$A[1] |
| 431 |
|
515 |
|
| 432 |
por %xmm2,%xmm0 |
|
|
| 433 |
lea $STRIDE($bp),$bp |
| 434 |
por %xmm3,%xmm0 |
| 435 |
|
| 436 |
mulq $m1 # np[0]*m1 |
516 |
mulq $m1 # np[0]*m1 |
| 437 |
add %rax,$A[0] # discarded |
517 |
add %rax,$A[0] # discarded |
| 438 |
mov 8($ap),%rax |
518 |
mov 8($ap),%rax |
|
Lines 550-557
$code.=<<___;
Link Here
|
| 550 |
mov $N[1],-16(%rsp,$j,8) # tp[j-1] |
630 |
mov $N[1],-16(%rsp,$j,8) # tp[j-1] |
| 551 |
mov %rdx,$N[0] |
631 |
mov %rdx,$N[0] |
| 552 |
|
632 |
|
| 553 |
movq %xmm0,$m0 # bp[1] |
|
|
| 554 |
|
| 555 |
xor $N[1],$N[1] |
633 |
xor $N[1],$N[1] |
| 556 |
add $A[0],$N[0] |
634 |
add $A[0],$N[0] |
| 557 |
adc \$0,$N[1] |
635 |
adc \$0,$N[1] |
|
Lines 561-572
$code.=<<___;
Link Here
|
| 561 |
lea 1($i),$i # i++ |
639 |
lea 1($i),$i # i++ |
| 562 |
.align 4 |
640 |
.align 4 |
| 563 |
.Louter4x: |
641 |
.Louter4x: |
|
|
642 |
lea 32+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) |
| 643 |
pxor %xmm4,%xmm4 |
| 644 |
pxor %xmm5,%xmm5 |
| 645 |
___ |
| 646 |
for($k=0;$k<$STRIDE/16;$k+=4) { |
| 647 |
$code.=<<___; |
| 648 |
movdqa `16*($k+0)-128`($bp),%xmm0 |
| 649 |
movdqa `16*($k+1)-128`($bp),%xmm1 |
| 650 |
movdqa `16*($k+2)-128`($bp),%xmm2 |
| 651 |
movdqa `16*($k+3)-128`($bp),%xmm3 |
| 652 |
pand `16*($k+0)-128`(%rdx),%xmm0 |
| 653 |
pand `16*($k+1)-128`(%rdx),%xmm1 |
| 654 |
por %xmm0,%xmm4 |
| 655 |
pand `16*($k+2)-128`(%rdx),%xmm2 |
| 656 |
por %xmm1,%xmm5 |
| 657 |
pand `16*($k+3)-128`(%rdx),%xmm3 |
| 658 |
por %xmm2,%xmm4 |
| 659 |
por %xmm3,%xmm5 |
| 660 |
___ |
| 661 |
} |
| 662 |
$code.=<<___; |
| 663 |
por %xmm5,%xmm4 |
| 664 |
pshufd \$0x4e,%xmm4,%xmm0 |
| 665 |
por %xmm4,%xmm0 |
| 666 |
lea $STRIDE($bp),$bp |
| 667 |
movq %xmm0,$m0 # m0=bp[i] |
| 668 |
|
| 564 |
xor $j,$j # j=0 |
669 |
xor $j,$j # j=0 |
| 565 |
movq `0*$STRIDE/4-96`($bp),%xmm0 |
|
|
| 566 |
movq `1*$STRIDE/4-96`($bp),%xmm1 |
| 567 |
pand %xmm4,%xmm0 |
| 568 |
movq `2*$STRIDE/4-96`($bp),%xmm2 |
| 569 |
pand %xmm5,%xmm1 |
| 570 |
|
670 |
|
| 571 |
mov (%rsp),$A[0] |
671 |
mov (%rsp),$A[0] |
| 572 |
mov $n0,$m1 |
672 |
mov $n0,$m1 |
|
Lines 575-592
$code.=<<___;
Link Here
|
| 575 |
mov ($np),%rax |
675 |
mov ($np),%rax |
| 576 |
adc \$0,%rdx |
676 |
adc \$0,%rdx |
| 577 |
|
677 |
|
| 578 |
movq `3*$STRIDE/4-96`($bp),%xmm3 |
|
|
| 579 |
pand %xmm6,%xmm2 |
| 580 |
por %xmm1,%xmm0 |
| 581 |
pand %xmm7,%xmm3 |
| 582 |
|
| 583 |
imulq $A[0],$m1 # tp[0]*n0 |
678 |
imulq $A[0],$m1 # tp[0]*n0 |
| 584 |
mov %rdx,$A[1] |
679 |
mov %rdx,$A[1] |
| 585 |
|
680 |
|
| 586 |
por %xmm2,%xmm0 |
|
|
| 587 |
lea $STRIDE($bp),$bp |
| 588 |
por %xmm3,%xmm0 |
| 589 |
|
| 590 |
mulq $m1 # np[0]*m1 |
681 |
mulq $m1 # np[0]*m1 |
| 591 |
add %rax,$A[0] # "$N[0]", discarded |
682 |
add %rax,$A[0] # "$N[0]", discarded |
| 592 |
mov 8($ap),%rax |
683 |
mov 8($ap),%rax |
|
Lines 718-724
$code.=<<___;
Link Here
|
| 718 |
mov $N[0],-24(%rsp,$j,8) # tp[j-1] |
809 |
mov $N[0],-24(%rsp,$j,8) # tp[j-1] |
| 719 |
mov %rdx,$N[0] |
810 |
mov %rdx,$N[0] |
| 720 |
|
811 |
|
| 721 |
movq %xmm0,$m0 # bp[i+1] |
|
|
| 722 |
mov $N[1],-16(%rsp,$j,8) # tp[j-1] |
812 |
mov $N[1],-16(%rsp,$j,8) # tp[j-1] |
| 723 |
|
813 |
|
| 724 |
xor $N[1],$N[1] |
814 |
xor $N[1],$N[1] |
|
Lines 809-821
___
Link Here
|
| 809 |
$code.=<<___; |
899 |
$code.=<<___; |
| 810 |
mov 8(%rsp,$num,8),%rsi # restore %rsp |
900 |
mov 8(%rsp,$num,8),%rsi # restore %rsp |
| 811 |
mov \$1,%rax |
901 |
mov \$1,%rax |
| 812 |
___ |
902 |
|
| 813 |
$code.=<<___ if ($win64); |
|
|
| 814 |
movaps (%rsi),%xmm6 |
| 815 |
movaps 0x10(%rsi),%xmm7 |
| 816 |
lea 0x28(%rsi),%rsi |
| 817 |
___ |
| 818 |
$code.=<<___; |
| 819 |
mov (%rsi),%r15 |
903 |
mov (%rsi),%r15 |
| 820 |
mov 8(%rsi),%r14 |
904 |
mov 8(%rsi),%r14 |
| 821 |
mov 16(%rsi),%r13 |
905 |
mov 16(%rsi),%r13 |
|
Lines 830-837
___
Link Here
|
| 830 |
}}} |
914 |
}}} |
| 831 |
|
915 |
|
| 832 |
{ |
916 |
{ |
| 833 |
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order |
917 |
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") : # Win64 order |
| 834 |
("%rdi","%rsi","%rdx","%rcx"); # Unix order |
918 |
("%rdi","%rsi","%rdx","%ecx"); # Unix order |
| 835 |
my $out=$inp; |
919 |
my $out=$inp; |
| 836 |
my $STRIDE=2**5*8; |
920 |
my $STRIDE=2**5*8; |
| 837 |
my $N=$STRIDE/4; |
921 |
my $N=$STRIDE/4; |
|
Lines 859-911
bn_scatter5:
Link Here
|
| 859 |
.type bn_gather5,\@abi-omnipotent |
943 |
.type bn_gather5,\@abi-omnipotent |
| 860 |
.align 16 |
944 |
.align 16 |
| 861 |
bn_gather5: |
945 |
bn_gather5: |
| 862 |
___ |
946 |
.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases |
| 863 |
$code.=<<___ if ($win64); |
|
|
| 864 |
.LSEH_begin_bn_gather5: |
| 865 |
# I can't trust assembler to use specific encoding:-( |
947 |
# I can't trust assembler to use specific encoding:-( |
| 866 |
.byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp |
948 |
.byte 0x4c,0x8d,0x14,0x24 # lea (%rsp),%r10 |
| 867 |
.byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) |
949 |
.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 # sub $0x108,%rsp |
| 868 |
.byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) |
950 |
lea .Linc(%rip),%rax |
|
|
951 |
and \$-16,%rsp # shouldn't be formally required |
| 952 |
|
| 953 |
movd $idx,%xmm5 |
| 954 |
movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 |
| 955 |
movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 |
| 956 |
lea 128($tbl),%r11 # size optimization |
| 957 |
lea 128(%rsp),%rax # size optimization |
| 958 |
|
| 959 |
pshufd \$0,%xmm5,%xmm5 # broadcast $idx |
| 960 |
movdqa %xmm1,%xmm4 |
| 961 |
movdqa %xmm1,%xmm2 |
| 869 |
___ |
962 |
___ |
|
|
963 |
######################################################################## |
| 964 |
# calculate mask by comparing 0..31 to $idx and save result to stack |
| 965 |
# |
| 966 |
for($i=0;$i<$STRIDE/16;$i+=4) { |
| 967 |
$code.=<<___; |
| 968 |
paddd %xmm0,%xmm1 |
| 969 |
pcmpeqd %xmm5,%xmm0 # compare to 1,0 |
| 970 |
___ |
| 971 |
$code.=<<___ if ($i); |
| 972 |
movdqa %xmm3,`16*($i-1)-128`(%rax) |
| 973 |
___ |
| 974 |
$code.=<<___; |
| 975 |
movdqa %xmm4,%xmm3 |
| 976 |
|
| 977 |
paddd %xmm1,%xmm2 |
| 978 |
pcmpeqd %xmm5,%xmm1 # compare to 3,2 |
| 979 |
movdqa %xmm0,`16*($i+0)-128`(%rax) |
| 980 |
movdqa %xmm4,%xmm0 |
| 981 |
|
| 982 |
paddd %xmm2,%xmm3 |
| 983 |
pcmpeqd %xmm5,%xmm2 # compare to 5,4 |
| 984 |
movdqa %xmm1,`16*($i+1)-128`(%rax) |
| 985 |
movdqa %xmm4,%xmm1 |
| 986 |
|
| 987 |
paddd %xmm3,%xmm0 |
| 988 |
pcmpeqd %xmm5,%xmm3 # compare to 7,6 |
| 989 |
movdqa %xmm2,`16*($i+2)-128`(%rax) |
| 990 |
movdqa %xmm4,%xmm2 |
| 991 |
___ |
| 992 |
} |
| 870 |
$code.=<<___; |
993 |
$code.=<<___; |
| 871 |
mov $idx,%r11 |
994 |
movdqa %xmm3,`16*($i-1)-128`(%rax) |
| 872 |
shr \$`log($N/8)/log(2)`,$idx |
|
|
| 873 |
and \$`$N/8-1`,%r11 |
| 874 |
not $idx |
| 875 |
lea .Lmagic_masks(%rip),%rax |
| 876 |
and \$`2**5/($N/8)-1`,$idx # 5 is "window size" |
| 877 |
lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line |
| 878 |
movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which |
| 879 |
movq 8(%rax,$idx,8),%xmm5 # cache line contains element |
| 880 |
movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument |
| 881 |
movq 24(%rax,$idx,8),%xmm7 |
| 882 |
jmp .Lgather |
995 |
jmp .Lgather |
| 883 |
.align 16 |
|
|
| 884 |
.Lgather: |
| 885 |
movq `0*$STRIDE/4-96`($tbl),%xmm0 |
| 886 |
movq `1*$STRIDE/4-96`($tbl),%xmm1 |
| 887 |
pand %xmm4,%xmm0 |
| 888 |
movq `2*$STRIDE/4-96`($tbl),%xmm2 |
| 889 |
pand %xmm5,%xmm1 |
| 890 |
movq `3*$STRIDE/4-96`($tbl),%xmm3 |
| 891 |
pand %xmm6,%xmm2 |
| 892 |
por %xmm1,%xmm0 |
| 893 |
pand %xmm7,%xmm3 |
| 894 |
por %xmm2,%xmm0 |
| 895 |
lea $STRIDE($tbl),$tbl |
| 896 |
por %xmm3,%xmm0 |
| 897 |
|
996 |
|
|
|
997 |
.align 32 |
| 998 |
.Lgather: |
| 999 |
pxor %xmm4,%xmm4 |
| 1000 |
pxor %xmm5,%xmm5 |
| 1001 |
___ |
| 1002 |
for($i=0;$i<$STRIDE/16;$i+=4) { |
| 1003 |
$code.=<<___; |
| 1004 |
movdqa `16*($i+0)-128`(%r11),%xmm0 |
| 1005 |
movdqa `16*($i+1)-128`(%r11),%xmm1 |
| 1006 |
movdqa `16*($i+2)-128`(%r11),%xmm2 |
| 1007 |
pand `16*($i+0)-128`(%rax),%xmm0 |
| 1008 |
movdqa `16*($i+3)-128`(%r11),%xmm3 |
| 1009 |
pand `16*($i+1)-128`(%rax),%xmm1 |
| 1010 |
por %xmm0,%xmm4 |
| 1011 |
pand `16*($i+2)-128`(%rax),%xmm2 |
| 1012 |
por %xmm1,%xmm5 |
| 1013 |
pand `16*($i+3)-128`(%rax),%xmm3 |
| 1014 |
por %xmm2,%xmm4 |
| 1015 |
por %xmm3,%xmm5 |
| 1016 |
___ |
| 1017 |
} |
| 1018 |
$code.=<<___; |
| 1019 |
por %xmm5,%xmm4 |
| 1020 |
lea $STRIDE(%r11),%r11 |
| 1021 |
pshufd \$0x4e,%xmm4,%xmm0 |
| 1022 |
por %xmm4,%xmm0 |
| 898 |
movq %xmm0,($out) # m0=bp[0] |
1023 |
movq %xmm0,($out) # m0=bp[0] |
| 899 |
lea 8($out),$out |
1024 |
lea 8($out),$out |
| 900 |
sub \$1,$num |
1025 |
sub \$1,$num |
| 901 |
jnz .Lgather |
1026 |
jnz .Lgather |
| 902 |
___ |
1027 |
|
| 903 |
$code.=<<___ if ($win64); |
1028 |
lea (%r10),%rsp |
| 904 |
movaps (%rsp),%xmm6 |
|
|
| 905 |
movaps 0x10(%rsp),%xmm7 |
| 906 |
lea 0x28(%rsp),%rsp |
| 907 |
___ |
| 908 |
$code.=<<___; |
| 909 |
ret |
1029 |
ret |
| 910 |
.LSEH_end_bn_gather5: |
1030 |
.LSEH_end_bn_gather5: |
| 911 |
.size bn_gather5,.-bn_gather5 |
1031 |
.size bn_gather5,.-bn_gather5 |
|
Lines 913-921
___
Link Here
|
| 913 |
} |
1033 |
} |
| 914 |
$code.=<<___; |
1034 |
$code.=<<___; |
| 915 |
.align 64 |
1035 |
.align 64 |
| 916 |
.Lmagic_masks: |
1036 |
.Linc: |
| 917 |
.long 0,0, 0,0, 0,0, -1,-1 |
1037 |
.long 0,0, 1,1 |
| 918 |
.long 0,0, 0,0, 0,0, 0,0 |
1038 |
.long 2,2, 2,2 |
| 919 |
.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
1039 |
.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
| 920 |
___ |
1040 |
___ |
| 921 |
|
1041 |
|
|
Lines 954-960
mul_handler:
Link Here
|
| 954 |
cmp %r10,%rbx # context->Rip<end of prologue label |
1074 |
cmp %r10,%rbx # context->Rip<end of prologue label |
| 955 |
jb .Lcommon_seh_tail |
1075 |
jb .Lcommon_seh_tail |
| 956 |
|
1076 |
|
| 957 |
lea `40+48`(%rax),%rax |
1077 |
lea 48(%rax),%rax |
| 958 |
|
1078 |
|
| 959 |
mov 4(%r11),%r10d # HandlerData[1] |
1079 |
mov 4(%r11),%r10d # HandlerData[1] |
| 960 |
lea (%rsi,%r10),%r10 # end of alloca label |
1080 |
lea (%rsi,%r10),%r10 # end of alloca label |
|
Lines 971-979
mul_handler:
Link Here
|
| 971 |
mov 192($context),%r10 # pull $num |
1091 |
mov 192($context),%r10 # pull $num |
| 972 |
mov 8(%rax,%r10,8),%rax # pull saved stack pointer |
1092 |
mov 8(%rax,%r10,8),%rax # pull saved stack pointer |
| 973 |
|
1093 |
|
| 974 |
movaps (%rax),%xmm0 |
1094 |
lea 48(%rax),%rax |
| 975 |
movaps 16(%rax),%xmm1 |
|
|
| 976 |
lea `40+48`(%rax),%rax |
| 977 |
|
1095 |
|
| 978 |
mov -8(%rax),%rbx |
1096 |
mov -8(%rax),%rbx |
| 979 |
mov -16(%rax),%rbp |
1097 |
mov -16(%rax),%rbp |
|
Lines 987-994
mul_handler:
Link Here
|
| 987 |
mov %r13,224($context) # restore context->R13 |
1105 |
mov %r13,224($context) # restore context->R13 |
| 988 |
mov %r14,232($context) # restore context->R14 |
1106 |
mov %r14,232($context) # restore context->R14 |
| 989 |
mov %r15,240($context) # restore context->R15 |
1107 |
mov %r15,240($context) # restore context->R15 |
| 990 |
movups %xmm0,512($context) # restore context->Xmm6 |
|
|
| 991 |
movups %xmm1,528($context) # restore context->Xmm7 |
| 992 |
|
1108 |
|
| 993 |
.Lcommon_seh_tail: |
1109 |
.Lcommon_seh_tail: |
| 994 |
mov 8(%rax),%rdi |
1110 |
mov 8(%rax),%rdi |
|
Lines 1057-1066
mul_handler:
Link Here
|
| 1057 |
.rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] |
1173 |
.rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] |
| 1058 |
.align 8 |
1174 |
.align 8 |
| 1059 |
.LSEH_info_bn_gather5: |
1175 |
.LSEH_info_bn_gather5: |
| 1060 |
.byte 0x01,0x0d,0x05,0x00 |
1176 |
.byte 0x01,0x0b,0x03,0x0a |
| 1061 |
.byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 |
1177 |
.byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 |
| 1062 |
.byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 |
1178 |
.byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp), set_frame r10 |
| 1063 |
.byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 |
|
|
| 1064 |
.align 8 |
1179 |
.align 8 |
| 1065 |
___ |
1180 |
___ |
| 1066 |
} |
1181 |
} |