Univention Bugzilla – Attachment 7513 Details for
Bug 40189
openssl: Denial of service (3.2)
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Requests
|
Help
|
New Account
|
Log In
[x]
|
Forgot Password
Login:
[x]
[patch]
CVE-2016-0702.patch
CVE-2016-0702.patch (text/plain), 23.01 KB, created by
Arvid Requate
on 2016-03-01 16:54 CET
(
hide
)
Description:
CVE-2016-0702.patch
Filename:
MIME Type:
Creator:
Arvid Requate
Created:
2016-03-01 16:54 CET
Size:
23.01 KB
patch
obsolete
>diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl >index dae0fe2..2359791 100755 >--- a/crypto/bn/asm/x86_64-mont5.pl >+++ b/crypto/bn/asm/x86_64-mont5.pl >@@ -66,60 +66,113 @@ bn_mul_mont_gather5: > .align 16 > .Lmul_enter: > mov ${num}d,${num}d >- mov `($win64?56:8)`(%rsp),%r10d # load 7th argument >+ movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument >+ lea .Linc(%rip),%r10 > push %rbx > push %rbp > push %r12 > push %r13 > push %r14 > push %r15 >-___ >-$code.=<<___ if ($win64); >- lea -0x28(%rsp),%rsp >- movaps %xmm6,(%rsp) >- movaps %xmm7,0x10(%rsp) >+ > .Lmul_alloca: >-___ >-$code.=<<___; > mov %rsp,%rax > lea 2($num),%r11 > neg %r11 >- lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) >+ lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8) > and \$-1024,%rsp # minimize TLB usage > > mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp > .Lmul_body: >- mov $bp,%r12 # reassign $bp >+ lea 128($bp),%r12 # reassign $bp (+size optimization) > ___ > $bp="%r12"; > $STRIDE=2**5*8; # 5 is "window size" > $N=$STRIDE/4; # should match cache line size > $code.=<<___; >- mov %r10,%r11 >- shr \$`log($N/8)/log(2)`,%r10 >- and \$`$N/8-1`,%r11 >- not %r10 >- lea .Lmagic_masks(%rip),%rax >- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" >- lea 96($bp,%r11,8),$bp # pointer within 1st cache line >- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which >- movq 8(%rax,%r10,8),%xmm5 # cache line contains element >- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument >- movq 24(%rax,%r10,8),%xmm7 >- >- movq `0*$STRIDE/4-96`($bp),%xmm0 >- movq `1*$STRIDE/4-96`($bp),%xmm1 >- pand %xmm4,%xmm0 >- movq `2*$STRIDE/4-96`($bp),%xmm2 >- pand %xmm5,%xmm1 >- movq `3*$STRIDE/4-96`($bp),%xmm3 >- pand %xmm6,%xmm2 >- por %xmm1,%xmm0 >- pand %xmm7,%xmm3 >+ movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 >+ movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 >+ lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) >+ and \$-16,%r10 >+ >+ pshufd \$0,%xmm5,%xmm5 # broadcast index >+ movdqa %xmm1,%xmm4 >+ movdqa %xmm1,%xmm2 >+___ >+######################################################################## >+# calculate mask by comparing 0..31 to index and save result to stack >+# >+$code.=<<___; >+ paddd %xmm0,%xmm1 >+ pcmpeqd %xmm5,%xmm0 # compare to 1,0 >+ .byte 0x67 >+ movdqa %xmm4,%xmm3 >+___ >+for($k=0;$k<$STRIDE/16-4;$k+=4) { >+$code.=<<___; >+ paddd %xmm1,%xmm2 >+ pcmpeqd %xmm5,%xmm1 # compare to 3,2 >+ movdqa %xmm0,`16*($k+0)+112`(%r10) >+ movdqa %xmm4,%xmm0 >+ >+ paddd %xmm2,%xmm3 >+ pcmpeqd %xmm5,%xmm2 # compare to 5,4 >+ movdqa %xmm1,`16*($k+1)+112`(%r10) >+ movdqa %xmm4,%xmm1 >+ >+ paddd %xmm3,%xmm0 >+ pcmpeqd %xmm5,%xmm3 # compare to 7,6 >+ movdqa %xmm2,`16*($k+2)+112`(%r10) >+ movdqa %xmm4,%xmm2 >+ >+ paddd %xmm0,%xmm1 >+ pcmpeqd %xmm5,%xmm0 >+ movdqa %xmm3,`16*($k+3)+112`(%r10) >+ movdqa %xmm4,%xmm3 >+___ >+} >+$code.=<<___; # last iteration can be optimized >+ paddd %xmm1,%xmm2 >+ pcmpeqd %xmm5,%xmm1 >+ movdqa %xmm0,`16*($k+0)+112`(%r10) >+ >+ paddd %xmm2,%xmm3 >+ .byte 0x67 >+ pcmpeqd %xmm5,%xmm2 >+ movdqa %xmm1,`16*($k+1)+112`(%r10) >+ >+ pcmpeqd %xmm5,%xmm3 >+ movdqa %xmm2,`16*($k+2)+112`(%r10) >+ pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register >+ >+ pand `16*($k+1)-128`($bp),%xmm1 >+ pand `16*($k+2)-128`($bp),%xmm2 >+ movdqa %xmm3,`16*($k+3)+112`(%r10) >+ pand `16*($k+3)-128`($bp),%xmm3 >+ por %xmm2,%xmm0 >+ por %xmm3,%xmm1 >+___ >+for($k=0;$k<$STRIDE/16-4;$k+=4) { >+$code.=<<___; >+ movdqa `16*($k+0)-128`($bp),%xmm4 >+ movdqa `16*($k+1)-128`($bp),%xmm5 >+ movdqa `16*($k+2)-128`($bp),%xmm2 >+ pand `16*($k+0)+112`(%r10),%xmm4 >+ movdqa `16*($k+3)-128`($bp),%xmm3 >+ pand `16*($k+1)+112`(%r10),%xmm5 >+ por %xmm4,%xmm0 >+ pand `16*($k+2)+112`(%r10),%xmm2 >+ por %xmm5,%xmm1 >+ pand `16*($k+3)+112`(%r10),%xmm3 > por %xmm2,%xmm0 >+ por %xmm3,%xmm1 >+___ >+} >+$code.=<<___; >+ por %xmm1,%xmm0 >+ pshufd \$0x4e,%xmm0,%xmm1 >+ por %xmm1,%xmm0 > lea $STRIDE($bp),$bp >- por %xmm3,%xmm0 >- > movq %xmm0,$m0 # m0=bp[0] > > mov ($n0),$n0 # pull n0[0] value >@@ -128,29 +181,14 @@ $code.=<<___; > xor $i,$i # i=0 > xor $j,$j # j=0 > >- movq `0*$STRIDE/4-96`($bp),%xmm0 >- movq `1*$STRIDE/4-96`($bp),%xmm1 >- pand %xmm4,%xmm0 >- movq `2*$STRIDE/4-96`($bp),%xmm2 >- pand %xmm5,%xmm1 >- > mov $n0,$m1 > mulq $m0 # ap[0]*bp[0] > mov %rax,$lo0 > mov ($np),%rax > >- movq `3*$STRIDE/4-96`($bp),%xmm3 >- pand %xmm6,%xmm2 >- por %xmm1,%xmm0 >- pand %xmm7,%xmm3 >- > imulq $lo0,$m1 # "tp[0]"*n0 > mov %rdx,$hi0 > >- por %xmm2,%xmm0 >- lea $STRIDE($bp),$bp >- por %xmm3,%xmm0 >- > mulq $m1 # np[0]*m1 > add %rax,$lo0 # discarded > mov 8($ap),%rax >@@ -183,8 +221,6 @@ $code.=<<___; > cmp $num,$j > jne .L1st > >- movq %xmm0,$m0 # bp[1] >- > add %rax,$hi1 > mov ($ap),%rax # ap[0] > adc \$0,%rdx >@@ -204,33 +240,46 @@ $code.=<<___; > jmp .Louter > .align 16 > .Louter: >+ lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) >+ and \$-16,%rdx >+ pxor %xmm4,%xmm4 >+ pxor %xmm5,%xmm5 >+___ >+for($k=0;$k<$STRIDE/16;$k+=4) { >+$code.=<<___; >+ movdqa `16*($k+0)-128`($bp),%xmm0 >+ movdqa `16*($k+1)-128`($bp),%xmm1 >+ movdqa `16*($k+2)-128`($bp),%xmm2 >+ movdqa `16*($k+3)-128`($bp),%xmm3 >+ pand `16*($k+0)-128`(%rdx),%xmm0 >+ pand `16*($k+1)-128`(%rdx),%xmm1 >+ por %xmm0,%xmm4 >+ pand `16*($k+2)-128`(%rdx),%xmm2 >+ por %xmm1,%xmm5 >+ pand `16*($k+3)-128`(%rdx),%xmm3 >+ por %xmm2,%xmm4 >+ por %xmm3,%xmm5 >+___ >+} >+$code.=<<___; >+ por %xmm5,%xmm4 >+ pshufd \$0x4e,%xmm4,%xmm0 >+ por %xmm4,%xmm0 >+ lea $STRIDE($bp),$bp >+ movq %xmm0,$m0 # m0=bp[i] >+ > xor $j,$j # j=0 > mov $n0,$m1 > mov (%rsp),$lo0 > >- movq `0*$STRIDE/4-96`($bp),%xmm0 >- movq `1*$STRIDE/4-96`($bp),%xmm1 >- pand %xmm4,%xmm0 >- movq `2*$STRIDE/4-96`($bp),%xmm2 >- pand %xmm5,%xmm1 >- > mulq $m0 # ap[0]*bp[i] > add %rax,$lo0 # ap[0]*bp[i]+tp[0] > mov ($np),%rax > adc \$0,%rdx > >- movq `3*$STRIDE/4-96`($bp),%xmm3 >- pand %xmm6,%xmm2 >- por %xmm1,%xmm0 >- pand %xmm7,%xmm3 >- > imulq $lo0,$m1 # tp[0]*n0 > mov %rdx,$hi0 > >- por %xmm2,%xmm0 >- lea $STRIDE($bp),$bp >- por %xmm3,%xmm0 >- > mulq $m1 # np[0]*m1 > add %rax,$lo0 # discarded > mov 8($ap),%rax >@@ -266,8 +315,6 @@ $code.=<<___; > cmp $num,$j > jne .Linner > >- movq %xmm0,$m0 # bp[i+1] >- > add %rax,$hi1 > mov ($ap),%rax # ap[0] > adc \$0,%rdx >@@ -321,13 +368,7 @@ $code.=<<___; > > mov 8(%rsp,$num,8),%rsi # restore %rsp > mov \$1,%rax >-___ >-$code.=<<___ if ($win64); >- movaps (%rsi),%xmm6 >- movaps 0x10(%rsi),%xmm7 >- lea 0x28(%rsi),%rsi >-___ >-$code.=<<___; >+ > mov (%rsi),%r15 > mov 8(%rsi),%r14 > mov 16(%rsi),%r13 >@@ -348,91 +389,130 @@ $code.=<<___; > bn_mul4x_mont_gather5: > .Lmul4x_enter: > mov ${num}d,${num}d >- mov `($win64?56:8)`(%rsp),%r10d # load 7th argument >+ movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument >+ lea .Linc(%rip),%r10 > push %rbx > push %rbp > push %r12 > push %r13 > push %r14 > push %r15 >-___ >-$code.=<<___ if ($win64); >- lea -0x28(%rsp),%rsp >- movaps %xmm6,(%rsp) >- movaps %xmm7,0x10(%rsp) >+ > .Lmul4x_alloca: >-___ >-$code.=<<___; > mov %rsp,%rax > lea 4($num),%r11 > neg %r11 >- lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) >+ lea -256(%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)+256) > and \$-1024,%rsp # minimize TLB usage > > mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp > .Lmul4x_body: > mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp >- mov %rdx,%r12 # reassign $bp >+ lea 128(%rdx),%r12 # reassign $bp (+size optimization) > ___ > $bp="%r12"; > $STRIDE=2**5*8; # 5 is "window size" > $N=$STRIDE/4; # should match cache line size > $code.=<<___; >- mov %r10,%r11 >- shr \$`log($N/8)/log(2)`,%r10 >- and \$`$N/8-1`,%r11 >- not %r10 >- lea .Lmagic_masks(%rip),%rax >- and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" >- lea 96($bp,%r11,8),$bp # pointer within 1st cache line >- movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which >- movq 8(%rax,%r10,8),%xmm5 # cache line contains element >- movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument >- movq 24(%rax,%r10,8),%xmm7 >- >- movq `0*$STRIDE/4-96`($bp),%xmm0 >- movq `1*$STRIDE/4-96`($bp),%xmm1 >- pand %xmm4,%xmm0 >- movq `2*$STRIDE/4-96`($bp),%xmm2 >- pand %xmm5,%xmm1 >- movq `3*$STRIDE/4-96`($bp),%xmm3 >- pand %xmm6,%xmm2 >- por %xmm1,%xmm0 >- pand %xmm7,%xmm3 >+ movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 >+ movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 >+ lea 32-112(%rsp,$num,8),%r10# place the mask after tp[num+4] (+ICache optimization) >+ >+ pshufd \$0,%xmm5,%xmm5 # broadcast index >+ movdqa %xmm1,%xmm4 >+ .byte 0x67,0x67 >+ movdqa %xmm1,%xmm2 >+___ >+######################################################################## >+# calculate mask by comparing 0..31 to index and save result to stack >+# >+$code.=<<___; >+ paddd %xmm0,%xmm1 >+ pcmpeqd %xmm5,%xmm0 # compare to 1,0 >+ .byte 0x67 >+ movdqa %xmm4,%xmm3 >+___ >+for($k=0;$k<$STRIDE/16-4;$k+=4) { >+$code.=<<___; >+ paddd %xmm1,%xmm2 >+ pcmpeqd %xmm5,%xmm1 # compare to 3,2 >+ movdqa %xmm0,`16*($k+0)+112`(%r10) >+ movdqa %xmm4,%xmm0 >+ >+ paddd %xmm2,%xmm3 >+ pcmpeqd %xmm5,%xmm2 # compare to 5,4 >+ movdqa %xmm1,`16*($k+1)+112`(%r10) >+ movdqa %xmm4,%xmm1 >+ >+ paddd %xmm3,%xmm0 >+ pcmpeqd %xmm5,%xmm3 # compare to 7,6 >+ movdqa %xmm2,`16*($k+2)+112`(%r10) >+ movdqa %xmm4,%xmm2 >+ >+ paddd %xmm0,%xmm1 >+ pcmpeqd %xmm5,%xmm0 >+ movdqa %xmm3,`16*($k+3)+112`(%r10) >+ movdqa %xmm4,%xmm3 >+___ >+} >+$code.=<<___; # last iteration can be optimized >+ paddd %xmm1,%xmm2 >+ pcmpeqd %xmm5,%xmm1 >+ movdqa %xmm0,`16*($k+0)+112`(%r10) >+ >+ paddd %xmm2,%xmm3 >+ .byte 0x67 >+ pcmpeqd %xmm5,%xmm2 >+ movdqa %xmm1,`16*($k+1)+112`(%r10) >+ >+ pcmpeqd %xmm5,%xmm3 >+ movdqa %xmm2,`16*($k+2)+112`(%r10) >+ pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register >+ >+ pand `16*($k+1)-128`($bp),%xmm1 >+ pand `16*($k+2)-128`($bp),%xmm2 >+ movdqa %xmm3,`16*($k+3)+112`(%r10) >+ pand `16*($k+3)-128`($bp),%xmm3 >+ por %xmm2,%xmm0 >+ por %xmm3,%xmm1 >+___ >+for($k=0;$k<$STRIDE/16-4;$k+=4) { >+$code.=<<___; >+ movdqa `16*($k+0)-128`($bp),%xmm4 >+ movdqa `16*($k+1)-128`($bp),%xmm5 >+ movdqa `16*($k+2)-128`($bp),%xmm2 >+ pand `16*($k+0)+112`(%r10),%xmm4 >+ movdqa `16*($k+3)-128`($bp),%xmm3 >+ pand `16*($k+1)+112`(%r10),%xmm5 >+ por %xmm4,%xmm0 >+ pand `16*($k+2)+112`(%r10),%xmm2 >+ por %xmm5,%xmm1 >+ pand `16*($k+3)+112`(%r10),%xmm3 > por %xmm2,%xmm0 >+ por %xmm3,%xmm1 >+___ >+} >+$code.=<<___; >+ por %xmm1,%xmm0 >+ pshufd \$0x4e,%xmm0,%xmm1 >+ por %xmm1,%xmm0 > lea $STRIDE($bp),$bp >- por %xmm3,%xmm0 >- > movq %xmm0,$m0 # m0=bp[0] >+ > mov ($n0),$n0 # pull n0[0] value > mov ($ap),%rax > > xor $i,$i # i=0 > xor $j,$j # j=0 > >- movq `0*$STRIDE/4-96`($bp),%xmm0 >- movq `1*$STRIDE/4-96`($bp),%xmm1 >- pand %xmm4,%xmm0 >- movq `2*$STRIDE/4-96`($bp),%xmm2 >- pand %xmm5,%xmm1 >- > mov $n0,$m1 > mulq $m0 # ap[0]*bp[0] > mov %rax,$A[0] > mov ($np),%rax > >- movq `3*$STRIDE/4-96`($bp),%xmm3 >- pand %xmm6,%xmm2 >- por %xmm1,%xmm0 >- pand %xmm7,%xmm3 >- > imulq $A[0],$m1 # "tp[0]"*n0 > mov %rdx,$A[1] > >- por %xmm2,%xmm0 >- lea $STRIDE($bp),$bp >- por %xmm3,%xmm0 >- > mulq $m1 # np[0]*m1 > add %rax,$A[0] # discarded > mov 8($ap),%rax >@@ -550,8 +630,6 @@ $code.=<<___; > mov $N[1],-16(%rsp,$j,8) # tp[j-1] > mov %rdx,$N[0] > >- movq %xmm0,$m0 # bp[1] >- > xor $N[1],$N[1] > add $A[0],$N[0] > adc \$0,$N[1] >@@ -561,12 +639,34 @@ $code.=<<___; > lea 1($i),$i # i++ > .align 4 > .Louter4x: >+ lea 32+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) >+ pxor %xmm4,%xmm4 >+ pxor %xmm5,%xmm5 >+___ >+for($k=0;$k<$STRIDE/16;$k+=4) { >+$code.=<<___; >+ movdqa `16*($k+0)-128`($bp),%xmm0 >+ movdqa `16*($k+1)-128`($bp),%xmm1 >+ movdqa `16*($k+2)-128`($bp),%xmm2 >+ movdqa `16*($k+3)-128`($bp),%xmm3 >+ pand `16*($k+0)-128`(%rdx),%xmm0 >+ pand `16*($k+1)-128`(%rdx),%xmm1 >+ por %xmm0,%xmm4 >+ pand `16*($k+2)-128`(%rdx),%xmm2 >+ por %xmm1,%xmm5 >+ pand `16*($k+3)-128`(%rdx),%xmm3 >+ por %xmm2,%xmm4 >+ por %xmm3,%xmm5 >+___ >+} >+$code.=<<___; >+ por %xmm5,%xmm4 >+ pshufd \$0x4e,%xmm4,%xmm0 >+ por %xmm4,%xmm0 >+ lea $STRIDE($bp),$bp >+ movq %xmm0,$m0 # m0=bp[i] >+ > xor $j,$j # j=0 >- movq `0*$STRIDE/4-96`($bp),%xmm0 >- movq `1*$STRIDE/4-96`($bp),%xmm1 >- pand %xmm4,%xmm0 >- movq `2*$STRIDE/4-96`($bp),%xmm2 >- pand %xmm5,%xmm1 > > mov (%rsp),$A[0] > mov $n0,$m1 >@@ -575,18 +675,9 @@ $code.=<<___; > mov ($np),%rax > adc \$0,%rdx > >- movq `3*$STRIDE/4-96`($bp),%xmm3 >- pand %xmm6,%xmm2 >- por %xmm1,%xmm0 >- pand %xmm7,%xmm3 >- > imulq $A[0],$m1 # tp[0]*n0 > mov %rdx,$A[1] > >- por %xmm2,%xmm0 >- lea $STRIDE($bp),$bp >- por %xmm3,%xmm0 >- > mulq $m1 # np[0]*m1 > add %rax,$A[0] # "$N[0]", discarded > mov 8($ap),%rax >@@ -718,7 +809,6 @@ $code.=<<___; > mov $N[0],-24(%rsp,$j,8) # tp[j-1] > mov %rdx,$N[0] > >- movq %xmm0,$m0 # bp[i+1] > mov $N[1],-16(%rsp,$j,8) # tp[j-1] > > xor $N[1],$N[1] >@@ -809,13 +899,7 @@ ___ > $code.=<<___; > mov 8(%rsp,$num,8),%rsi # restore %rsp > mov \$1,%rax >-___ >-$code.=<<___ if ($win64); >- movaps (%rsi),%xmm6 >- movaps 0x10(%rsi),%xmm7 >- lea 0x28(%rsi),%rsi >-___ >-$code.=<<___; >+ > mov (%rsi),%r15 > mov 8(%rsi),%r14 > mov 16(%rsi),%r13 >@@ -830,8 +914,8 @@ ___ > }}} > > { >-my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order >- ("%rdi","%rsi","%rdx","%rcx"); # Unix order >+my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") : # Win64 order >+ ("%rdi","%rsi","%rdx","%ecx"); # Unix order > my $out=$inp; > my $STRIDE=2**5*8; > my $N=$STRIDE/4; >@@ -859,53 +943,89 @@ bn_scatter5: > .type bn_gather5,\@abi-omnipotent > .align 16 > bn_gather5: >-___ >-$code.=<<___ if ($win64); >-.LSEH_begin_bn_gather5: >+.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases > # I can't trust assembler to use specific encoding:-( >- .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp >- .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) >- .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) >+ .byte 0x4c,0x8d,0x14,0x24 # lea (%rsp),%r10 >+ .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 # sub $0x108,%rsp >+ lea .Linc(%rip),%rax >+ and \$-16,%rsp # shouldn't be formally required >+ >+ movd $idx,%xmm5 >+ movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 >+ movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 >+ lea 128($tbl),%r11 # size optimization >+ lea 128(%rsp),%rax # size optimization >+ >+ pshufd \$0,%xmm5,%xmm5 # broadcast $idx >+ movdqa %xmm1,%xmm4 >+ movdqa %xmm1,%xmm2 > ___ >+######################################################################## >+# calculate mask by comparing 0..31 to $idx and save result to stack >+# >+for($i=0;$i<$STRIDE/16;$i+=4) { >+$code.=<<___; >+ paddd %xmm0,%xmm1 >+ pcmpeqd %xmm5,%xmm0 # compare to 1,0 >+___ >+$code.=<<___ if ($i); >+ movdqa %xmm3,`16*($i-1)-128`(%rax) >+___ >+$code.=<<___; >+ movdqa %xmm4,%xmm3 >+ >+ paddd %xmm1,%xmm2 >+ pcmpeqd %xmm5,%xmm1 # compare to 3,2 >+ movdqa %xmm0,`16*($i+0)-128`(%rax) >+ movdqa %xmm4,%xmm0 >+ >+ paddd %xmm2,%xmm3 >+ pcmpeqd %xmm5,%xmm2 # compare to 5,4 >+ movdqa %xmm1,`16*($i+1)-128`(%rax) >+ movdqa %xmm4,%xmm1 >+ >+ paddd %xmm3,%xmm0 >+ pcmpeqd %xmm5,%xmm3 # compare to 7,6 >+ movdqa %xmm2,`16*($i+2)-128`(%rax) >+ movdqa %xmm4,%xmm2 >+___ >+} > $code.=<<___; >- mov $idx,%r11 >- shr \$`log($N/8)/log(2)`,$idx >- and \$`$N/8-1`,%r11 >- not $idx >- lea .Lmagic_masks(%rip),%rax >- and \$`2**5/($N/8)-1`,$idx # 5 is "window size" >- lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line >- movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which >- movq 8(%rax,$idx,8),%xmm5 # cache line contains element >- movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument >- movq 24(%rax,$idx,8),%xmm7 >+ movdqa %xmm3,`16*($i-1)-128`(%rax) > jmp .Lgather >-.align 16 >-.Lgather: >- movq `0*$STRIDE/4-96`($tbl),%xmm0 >- movq `1*$STRIDE/4-96`($tbl),%xmm1 >- pand %xmm4,%xmm0 >- movq `2*$STRIDE/4-96`($tbl),%xmm2 >- pand %xmm5,%xmm1 >- movq `3*$STRIDE/4-96`($tbl),%xmm3 >- pand %xmm6,%xmm2 >- por %xmm1,%xmm0 >- pand %xmm7,%xmm3 >- por %xmm2,%xmm0 >- lea $STRIDE($tbl),$tbl >- por %xmm3,%xmm0 > >+.align 32 >+.Lgather: >+ pxor %xmm4,%xmm4 >+ pxor %xmm5,%xmm5 >+___ >+for($i=0;$i<$STRIDE/16;$i+=4) { >+$code.=<<___; >+ movdqa `16*($i+0)-128`(%r11),%xmm0 >+ movdqa `16*($i+1)-128`(%r11),%xmm1 >+ movdqa `16*($i+2)-128`(%r11),%xmm2 >+ pand `16*($i+0)-128`(%rax),%xmm0 >+ movdqa `16*($i+3)-128`(%r11),%xmm3 >+ pand `16*($i+1)-128`(%rax),%xmm1 >+ por %xmm0,%xmm4 >+ pand `16*($i+2)-128`(%rax),%xmm2 >+ por %xmm1,%xmm5 >+ pand `16*($i+3)-128`(%rax),%xmm3 >+ por %xmm2,%xmm4 >+ por %xmm3,%xmm5 >+___ >+} >+$code.=<<___; >+ por %xmm5,%xmm4 >+ lea $STRIDE(%r11),%r11 >+ pshufd \$0x4e,%xmm4,%xmm0 >+ por %xmm4,%xmm0 > movq %xmm0,($out) # m0=bp[0] > lea 8($out),$out > sub \$1,$num > jnz .Lgather >-___ >-$code.=<<___ if ($win64); >- movaps (%rsp),%xmm6 >- movaps 0x10(%rsp),%xmm7 >- lea 0x28(%rsp),%rsp >-___ >-$code.=<<___; >+ >+ lea (%r10),%rsp > ret > .LSEH_end_bn_gather5: > .size bn_gather5,.-bn_gather5 >@@ -913,9 +1033,9 @@ ___ > } > $code.=<<___; > .align 64 >-.Lmagic_masks: >- .long 0,0, 0,0, 0,0, -1,-1 >- .long 0,0, 0,0, 0,0, 0,0 >+.Linc: >+ .long 0,0, 1,1 >+ .long 2,2, 2,2 > .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" > ___ > >@@ -954,7 +1074,7 @@ mul_handler: > cmp %r10,%rbx # context->Rip<end of prologue label > jb .Lcommon_seh_tail > >- lea `40+48`(%rax),%rax >+ lea 48(%rax),%rax > > mov 4(%r11),%r10d # HandlerData[1] > lea (%rsi,%r10),%r10 # end of alloca label >@@ -971,9 +1091,7 @@ mul_handler: > mov 192($context),%r10 # pull $num > mov 8(%rax,%r10,8),%rax # pull saved stack pointer > >- movaps (%rax),%xmm0 >- movaps 16(%rax),%xmm1 >- lea `40+48`(%rax),%rax >+ lea 48(%rax),%rax > > mov -8(%rax),%rbx > mov -16(%rax),%rbp >@@ -987,8 +1105,6 @@ mul_handler: > mov %r13,224($context) # restore context->R13 > mov %r14,232($context) # restore context->R14 > mov %r15,240($context) # restore context->R15 >- movups %xmm0,512($context) # restore context->Xmm6 >- movups %xmm1,528($context) # restore context->Xmm7 > > .Lcommon_seh_tail: > mov 8(%rax),%rdi >@@ -1057,10 +1173,9 @@ mul_handler: > .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] > .align 8 > .LSEH_info_bn_gather5: >- .byte 0x01,0x0d,0x05,0x00 >- .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 >- .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 >- .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 >+ .byte 0x01,0x0b,0x03,0x0a >+ .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 >+ .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp), set_frame r10 > .align 8 > ___ > } >diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c >index 4839625..c095ac4 100644 >--- a/crypto/bn/bn_exp.c >+++ b/crypto/bn/bn_exp.c >@@ -110,6 +110,7 @@ > */ > > #include "cryptlib.h" >+#include "constant_time_locl.h" > #include "bn_lcl.h" > > #include <stdlib.h> >@@ -546,15 +547,17 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, > > static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, > unsigned char *buf, int idx, >- int width) >+ int window) > { >- size_t i, j; >+ int i, j; >+ int width = 1 << window; >+ BN_ULONG *table = (BN_ULONG *)buf; > > if (top > b->top) > top = b->top; /* this works because 'buf' is explicitly > * zeroed */ >- for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) { >- buf[j] = ((unsigned char *)b->d)[i]; >+ for (i = 0, j = idx; i < top; i++, j += width) { >+ table[j] = b->d[i]; > } > > return 1; >@@ -562,15 +565,51 @@ static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, > > static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, > unsigned char *buf, int idx, >- int width) >+ int window) > { >- size_t i, j; >+ int i, j; >+ int width = 1 << window; >+ volatile BN_ULONG *table = (volatile BN_ULONG *)buf; > > if (bn_wexpand(b, top) == NULL) > return 0; > >- for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) { >- ((unsigned char *)b->d)[i] = buf[j]; >+ if (window <= 3) { >+ for (i = 0; i < top; i++, table += width) { >+ BN_ULONG acc = 0; >+ >+ for (j = 0; j < width; j++) { >+ acc |= table[j] & >+ ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1)); >+ } >+ >+ b->d[i] = acc; >+ } >+ } else { >+ int xstride = 1 << (window - 2); >+ BN_ULONG y0, y1, y2, y3; >+ >+ i = idx >> (window - 2); /* equivalent of idx / xstride */ >+ idx &= xstride - 1; /* equivalent of idx % xstride */ >+ >+ y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1); >+ y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1); >+ y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1); >+ y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1); >+ >+ for (i = 0; i < top; i++, table += width) { >+ BN_ULONG acc = 0; >+ >+ for (j = 0; j < xstride; j++) { >+ acc |= ( (table[j + 0 * xstride] & y0) | >+ (table[j + 1 * xstride] & y1) | >+ (table[j + 2 * xstride] & y2) | >+ (table[j + 3 * xstride] & y3) ) >+ & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1)); >+ } >+ >+ b->d[i] = acc; >+ } > } > > b->top = top; >@@ -800,9 +839,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, > } else > #endif > { >- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) >+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, window)) > goto err; >- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers)) >+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, window)) > goto err; > > /* >@@ -814,15 +853,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, > if (window > 1) { > if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx)) > goto err; >- if (!MOD_EXP_CTIME_COPY_TO_PREBUF >- (&tmp, top, powerbuf, 2, numPowers)) >+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, >+ window)) > goto err; > for (i = 3; i < numPowers; i++) { > /* Calculate a^i = a^(i-1) * a */ > if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx)) > goto err; >- if (!MOD_EXP_CTIME_COPY_TO_PREBUF >- (&tmp, top, powerbuf, i, numPowers)) >+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, >+ window)) > goto err; > } > } >@@ -830,8 +869,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, > bits--; > for (wvalue = 0, i = bits % window; i >= 0; i--, bits--) > wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); >- if (!MOD_EXP_CTIME_COPY_FROM_PREBUF >- (&tmp, top, powerbuf, wvalue, numPowers)) >+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf, wvalue, >+ window)) > goto err; > > /* >@@ -851,8 +890,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, > /* > * Fetch the appropriate pre-computed value from the pre-buf > */ >- if (!MOD_EXP_CTIME_COPY_FROM_PREBUF >- (&am, top, powerbuf, wvalue, numPowers)) >+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, >+ window)) > goto err; > > /* Multiply the result into the intermediate result */
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
Actions:
View
|
Diff
Attachments on
bug 40189
:
7509
|
7510
|
7511
|
7512
| 7513