View | Details | Raw Unified | Return to bug 40189 | Differences between
and this patch

Collapse All | Expand All

(-)a/crypto/bn/asm/x86_64-mont5.pl (-199 / +314 lines)
 Lines 66-125   bn_mul_mont_gather5: Link Here 
66
.align	16
66
.align	16
67
.Lmul_enter:
67
.Lmul_enter:
68
	mov	${num}d,${num}d
68
	mov	${num}d,${num}d
69
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
69
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
70
	lea	.Linc(%rip),%r10
70
	push	%rbx
71
	push	%rbx
71
	push	%rbp
72
	push	%rbp
72
	push	%r12
73
	push	%r12
73
	push	%r13
74
	push	%r13
74
	push	%r14
75
	push	%r14
75
	push	%r15
76
	push	%r15
76
___
77
77
$code.=<<___ if ($win64);
78
	lea	-0x28(%rsp),%rsp
79
	movaps	%xmm6,(%rsp)
80
	movaps	%xmm7,0x10(%rsp)
81
.Lmul_alloca:
78
.Lmul_alloca:
82
___
83
$code.=<<___;
84
	mov	%rsp,%rax
79
	mov	%rsp,%rax
85
	lea	2($num),%r11
80
	lea	2($num),%r11
86
	neg	%r11
81
	neg	%r11
87
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
82
	lea	-264(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2)+256+8)
88
	and	\$-1024,%rsp		# minimize TLB usage
83
	and	\$-1024,%rsp		# minimize TLB usage
89
84
90
	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
85
	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
91
.Lmul_body:
86
.Lmul_body:
92
	mov	$bp,%r12		# reassign $bp
87
	lea	128($bp),%r12		# reassign $bp (+size optimization)
93
___
88
___
94
		$bp="%r12";
89
		$bp="%r12";
95
		$STRIDE=2**5*8;		# 5 is "window size"
90
		$STRIDE=2**5*8;		# 5 is "window size"
96
		$N=$STRIDE/4;		# should match cache line size
91
		$N=$STRIDE/4;		# should match cache line size
97
$code.=<<___;
92
$code.=<<___;
98
	mov	%r10,%r11
93
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
99
	shr	\$`log($N/8)/log(2)`,%r10
94
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
100
	and	\$`$N/8-1`,%r11
95
	lea	24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
101
	not	%r10
96
	and	\$-16,%r10
102
	lea	.Lmagic_masks(%rip),%rax
97
103
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
98
	pshufd	\$0,%xmm5,%xmm5		# broadcast index
104
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
99
	movdqa	%xmm1,%xmm4
105
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
100
	movdqa	%xmm1,%xmm2
106
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
101
___
107
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
102
########################################################################
108
	movq	24(%rax,%r10,8),%xmm7
103
# calculate mask by comparing 0..31 to index and save result to stack
109
104
#
110
	movq	`0*$STRIDE/4-96`($bp),%xmm0
105
$code.=<<___;
111
	movq	`1*$STRIDE/4-96`($bp),%xmm1
106
	paddd	%xmm0,%xmm1
112
	pand	%xmm4,%xmm0
107
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
113
	movq	`2*$STRIDE/4-96`($bp),%xmm2
108
	.byte	0x67
114
	pand	%xmm5,%xmm1
109
	movdqa	%xmm4,%xmm3
115
	movq	`3*$STRIDE/4-96`($bp),%xmm3
110
___
116
	pand	%xmm6,%xmm2
111
for($k=0;$k<$STRIDE/16-4;$k+=4) {
117
	por	%xmm1,%xmm0
112
$code.=<<___;
118
	pand	%xmm7,%xmm3
113
	paddd	%xmm1,%xmm2
114
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
115
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
116
	movdqa	%xmm4,%xmm0
117
118
	paddd	%xmm2,%xmm3
119
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
120
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
121
	movdqa	%xmm4,%xmm1
122
123
	paddd	%xmm3,%xmm0
124
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
125
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
126
	movdqa	%xmm4,%xmm2
127
128
	paddd	%xmm0,%xmm1
129
	pcmpeqd	%xmm5,%xmm0
130
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
131
	movdqa	%xmm4,%xmm3
132
___
133
}
134
$code.=<<___;				# last iteration can be optimized
135
	paddd	%xmm1,%xmm2
136
	pcmpeqd	%xmm5,%xmm1
137
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
138
139
	paddd	%xmm2,%xmm3
140
	.byte	0x67
141
	pcmpeqd	%xmm5,%xmm2
142
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
143
144
	pcmpeqd	%xmm5,%xmm3
145
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
146
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register
147
148
	pand	`16*($k+1)-128`($bp),%xmm1
149
	pand	`16*($k+2)-128`($bp),%xmm2
150
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
151
	pand	`16*($k+3)-128`($bp),%xmm3
152
	por	%xmm2,%xmm0
153
	por	%xmm3,%xmm1
154
___
155
for($k=0;$k<$STRIDE/16-4;$k+=4) {
156
$code.=<<___;
157
	movdqa	`16*($k+0)-128`($bp),%xmm4
158
	movdqa	`16*($k+1)-128`($bp),%xmm5
159
	movdqa	`16*($k+2)-128`($bp),%xmm2
160
	pand	`16*($k+0)+112`(%r10),%xmm4
161
	movdqa	`16*($k+3)-128`($bp),%xmm3
162
	pand	`16*($k+1)+112`(%r10),%xmm5
163
	por	%xmm4,%xmm0
164
	pand	`16*($k+2)+112`(%r10),%xmm2
165
	por	%xmm5,%xmm1
166
	pand	`16*($k+3)+112`(%r10),%xmm3
119
	por	%xmm2,%xmm0
167
	por	%xmm2,%xmm0
168
	por	%xmm3,%xmm1
169
___
170
}
171
$code.=<<___;
172
	por	%xmm1,%xmm0
173
	pshufd	\$0x4e,%xmm0,%xmm1
174
	por	%xmm1,%xmm0
120
	lea	$STRIDE($bp),$bp
175
	lea	$STRIDE($bp),$bp
121
	por	%xmm3,%xmm0
122
123
	movq	%xmm0,$m0		# m0=bp[0]
176
	movq	%xmm0,$m0		# m0=bp[0]
124
177
125
	mov	($n0),$n0		# pull n0[0] value
178
	mov	($n0),$n0		# pull n0[0] value
 Lines 128-156   $code.=<<___; Link Here 
128
	xor	$i,$i			# i=0
181
	xor	$i,$i			# i=0
129
	xor	$j,$j			# j=0
182
	xor	$j,$j			# j=0
130
183
131
	movq	`0*$STRIDE/4-96`($bp),%xmm0
132
	movq	`1*$STRIDE/4-96`($bp),%xmm1
133
	pand	%xmm4,%xmm0
134
	movq	`2*$STRIDE/4-96`($bp),%xmm2
135
	pand	%xmm5,%xmm1
136
137
	mov	$n0,$m1
184
	mov	$n0,$m1
138
	mulq	$m0			# ap[0]*bp[0]
185
	mulq	$m0			# ap[0]*bp[0]
139
	mov	%rax,$lo0
186
	mov	%rax,$lo0
140
	mov	($np),%rax
187
	mov	($np),%rax
141
188
142
	movq	`3*$STRIDE/4-96`($bp),%xmm3
143
	pand	%xmm6,%xmm2
144
	por	%xmm1,%xmm0
145
	pand	%xmm7,%xmm3
146
147
	imulq	$lo0,$m1		# "tp[0]"*n0
189
	imulq	$lo0,$m1		# "tp[0]"*n0
148
	mov	%rdx,$hi0
190
	mov	%rdx,$hi0
149
191
150
	por	%xmm2,%xmm0
151
	lea	$STRIDE($bp),$bp
152
	por	%xmm3,%xmm0
153
154
	mulq	$m1			# np[0]*m1
192
	mulq	$m1			# np[0]*m1
155
	add	%rax,$lo0		# discarded
193
	add	%rax,$lo0		# discarded
156
	mov	8($ap),%rax
194
	mov	8($ap),%rax
 Lines 183-190   $code.=<<___; Link Here 
183
	cmp	$num,$j
221
	cmp	$num,$j
184
	jne	.L1st
222
	jne	.L1st
185
223
186
	movq	%xmm0,$m0		# bp[1]
187
188
	add	%rax,$hi1
224
	add	%rax,$hi1
189
	mov	($ap),%rax		# ap[0]
225
	mov	($ap),%rax		# ap[0]
190
	adc	\$0,%rdx
226
	adc	\$0,%rdx
 Lines 204-236   $code.=<<___; Link Here 
204
	jmp	.Louter
240
	jmp	.Louter
205
.align	16
241
.align	16
206
.Louter:
242
.Louter:
243
	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
244
	and	\$-16,%rdx
245
	pxor	%xmm4,%xmm4
246
	pxor	%xmm5,%xmm5
247
___
248
for($k=0;$k<$STRIDE/16;$k+=4) {
249
$code.=<<___;
250
	movdqa	`16*($k+0)-128`($bp),%xmm0
251
	movdqa	`16*($k+1)-128`($bp),%xmm1
252
	movdqa	`16*($k+2)-128`($bp),%xmm2
253
	movdqa	`16*($k+3)-128`($bp),%xmm3
254
	pand	`16*($k+0)-128`(%rdx),%xmm0
255
	pand	`16*($k+1)-128`(%rdx),%xmm1
256
	por	%xmm0,%xmm4
257
	pand	`16*($k+2)-128`(%rdx),%xmm2
258
	por	%xmm1,%xmm5
259
	pand	`16*($k+3)-128`(%rdx),%xmm3
260
	por	%xmm2,%xmm4
261
	por	%xmm3,%xmm5
262
___
263
}
264
$code.=<<___;
265
	por	%xmm5,%xmm4
266
	pshufd	\$0x4e,%xmm4,%xmm0
267
	por	%xmm4,%xmm0
268
	lea	$STRIDE($bp),$bp
269
	movq	%xmm0,$m0		# m0=bp[i]
270
207
	xor	$j,$j			# j=0
271
	xor	$j,$j			# j=0
208
	mov	$n0,$m1
272
	mov	$n0,$m1
209
	mov	(%rsp),$lo0
273
	mov	(%rsp),$lo0
210
274
211
	movq	`0*$STRIDE/4-96`($bp),%xmm0
212
	movq	`1*$STRIDE/4-96`($bp),%xmm1
213
	pand	%xmm4,%xmm0
214
	movq	`2*$STRIDE/4-96`($bp),%xmm2
215
	pand	%xmm5,%xmm1
216
217
	mulq	$m0			# ap[0]*bp[i]
275
	mulq	$m0			# ap[0]*bp[i]
218
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
276
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
219
	mov	($np),%rax
277
	mov	($np),%rax
220
	adc	\$0,%rdx
278
	adc	\$0,%rdx
221
279
222
	movq	`3*$STRIDE/4-96`($bp),%xmm3
223
	pand	%xmm6,%xmm2
224
	por	%xmm1,%xmm0
225
	pand	%xmm7,%xmm3
226
227
	imulq	$lo0,$m1		# tp[0]*n0
280
	imulq	$lo0,$m1		# tp[0]*n0
228
	mov	%rdx,$hi0
281
	mov	%rdx,$hi0
229
282
230
	por	%xmm2,%xmm0
231
	lea	$STRIDE($bp),$bp
232
	por	%xmm3,%xmm0
233
234
	mulq	$m1			# np[0]*m1
283
	mulq	$m1			# np[0]*m1
235
	add	%rax,$lo0		# discarded
284
	add	%rax,$lo0		# discarded
236
	mov	8($ap),%rax
285
	mov	8($ap),%rax
 Lines 266-273   $code.=<<___; Link Here 
266
	cmp	$num,$j
315
	cmp	$num,$j
267
	jne	.Linner
316
	jne	.Linner
268
317
269
	movq	%xmm0,$m0		# bp[i+1]
270
271
	add	%rax,$hi1
318
	add	%rax,$hi1
272
	mov	($ap),%rax		# ap[0]
319
	mov	($ap),%rax		# ap[0]
273
	adc	\$0,%rdx
320
	adc	\$0,%rdx
 Lines 321-333   $code.=<<___; Link Here 
321
368
322
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
369
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
323
	mov	\$1,%rax
370
	mov	\$1,%rax
324
___
371
325
$code.=<<___ if ($win64);
326
	movaps	(%rsi),%xmm6
327
	movaps	0x10(%rsi),%xmm7
328
	lea	0x28(%rsi),%rsi
329
___
330
$code.=<<___;
331
	mov	(%rsi),%r15
372
	mov	(%rsi),%r15
332
	mov	8(%rsi),%r14
373
	mov	8(%rsi),%r14
333
	mov	16(%rsi),%r13
374
	mov	16(%rsi),%r13
 Lines 348-438   $code.=<<___; Link Here 
348
bn_mul4x_mont_gather5:
389
bn_mul4x_mont_gather5:
349
.Lmul4x_enter:
390
.Lmul4x_enter:
350
	mov	${num}d,${num}d
391
	mov	${num}d,${num}d
351
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
392
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
393
	lea	.Linc(%rip),%r10
352
	push	%rbx
394
	push	%rbx
353
	push	%rbp
395
	push	%rbp
354
	push	%r12
396
	push	%r12
355
	push	%r13
397
	push	%r13
356
	push	%r14
398
	push	%r14
357
	push	%r15
399
	push	%r15
358
___
400
359
$code.=<<___ if ($win64);
360
	lea	-0x28(%rsp),%rsp
361
	movaps	%xmm6,(%rsp)
362
	movaps	%xmm7,0x10(%rsp)
363
.Lmul4x_alloca:
401
.Lmul4x_alloca:
364
___
365
$code.=<<___;
366
	mov	%rsp,%rax
402
	mov	%rsp,%rax
367
	lea	4($num),%r11
403
	lea	4($num),%r11
368
	neg	%r11
404
	neg	%r11
369
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4))
405
	lea	-256(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4)+256)
370
	and	\$-1024,%rsp		# minimize TLB usage
406
	and	\$-1024,%rsp		# minimize TLB usage
371
407
372
	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
408
	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
373
.Lmul4x_body:
409
.Lmul4x_body:
374
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
410
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
375
	mov	%rdx,%r12		# reassign $bp
411
	lea	128(%rdx),%r12		# reassign $bp (+size optimization)
376
___
412
___
377
		$bp="%r12";
413
		$bp="%r12";
378
		$STRIDE=2**5*8;		# 5 is "window size"
414
		$STRIDE=2**5*8;		# 5 is "window size"
379
		$N=$STRIDE/4;		# should match cache line size
415
		$N=$STRIDE/4;		# should match cache line size
380
$code.=<<___;
416
$code.=<<___;
381
	mov	%r10,%r11
417
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
382
	shr	\$`log($N/8)/log(2)`,%r10
418
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
383
	and	\$`$N/8-1`,%r11
419
	lea	32-112(%rsp,$num,8),%r10# place the mask after tp[num+4] (+ICache optimization)
384
	not	%r10
420
385
	lea	.Lmagic_masks(%rip),%rax
421
	pshufd	\$0,%xmm5,%xmm5		# broadcast index
386
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
422
	movdqa	%xmm1,%xmm4
387
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
423
	.byte	0x67,0x67
388
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
424
	movdqa	%xmm1,%xmm2
389
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
425
___
390
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
426
########################################################################
391
	movq	24(%rax,%r10,8),%xmm7
427
# calculate mask by comparing 0..31 to index and save result to stack
392
428
#
393
	movq	`0*$STRIDE/4-96`($bp),%xmm0
429
$code.=<<___;
394
	movq	`1*$STRIDE/4-96`($bp),%xmm1
430
	paddd	%xmm0,%xmm1
395
	pand	%xmm4,%xmm0
431
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
396
	movq	`2*$STRIDE/4-96`($bp),%xmm2
432
	.byte	0x67
397
	pand	%xmm5,%xmm1
433
	movdqa	%xmm4,%xmm3
398
	movq	`3*$STRIDE/4-96`($bp),%xmm3
434
___
399
	pand	%xmm6,%xmm2
435
for($k=0;$k<$STRIDE/16-4;$k+=4) {
400
	por	%xmm1,%xmm0
436
$code.=<<___;
401
	pand	%xmm7,%xmm3
437
	paddd	%xmm1,%xmm2
438
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
439
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
440
	movdqa	%xmm4,%xmm0
441
442
	paddd	%xmm2,%xmm3
443
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
444
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
445
	movdqa	%xmm4,%xmm1
446
447
	paddd	%xmm3,%xmm0
448
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
449
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
450
	movdqa	%xmm4,%xmm2
451
452
	paddd	%xmm0,%xmm1
453
	pcmpeqd	%xmm5,%xmm0
454
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
455
	movdqa	%xmm4,%xmm3
456
___
457
}
458
$code.=<<___;				# last iteration can be optimized
459
	paddd	%xmm1,%xmm2
460
	pcmpeqd	%xmm5,%xmm1
461
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
462
463
	paddd	%xmm2,%xmm3
464
	.byte	0x67
465
	pcmpeqd	%xmm5,%xmm2
466
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
467
468
	pcmpeqd	%xmm5,%xmm3
469
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
470
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register
471
472
	pand	`16*($k+1)-128`($bp),%xmm1
473
	pand	`16*($k+2)-128`($bp),%xmm2
474
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
475
	pand	`16*($k+3)-128`($bp),%xmm3
476
	por	%xmm2,%xmm0
477
	por	%xmm3,%xmm1
478
___
479
for($k=0;$k<$STRIDE/16-4;$k+=4) {
480
$code.=<<___;
481
	movdqa	`16*($k+0)-128`($bp),%xmm4
482
	movdqa	`16*($k+1)-128`($bp),%xmm5
483
	movdqa	`16*($k+2)-128`($bp),%xmm2
484
	pand	`16*($k+0)+112`(%r10),%xmm4
485
	movdqa	`16*($k+3)-128`($bp),%xmm3
486
	pand	`16*($k+1)+112`(%r10),%xmm5
487
	por	%xmm4,%xmm0
488
	pand	`16*($k+2)+112`(%r10),%xmm2
489
	por	%xmm5,%xmm1
490
	pand	`16*($k+3)+112`(%r10),%xmm3
402
	por	%xmm2,%xmm0
491
	por	%xmm2,%xmm0
492
	por	%xmm3,%xmm1
493
___
494
}
495
$code.=<<___;
496
	por	%xmm1,%xmm0
497
	pshufd	\$0x4e,%xmm0,%xmm1
498
	por	%xmm1,%xmm0
403
	lea	$STRIDE($bp),$bp
499
	lea	$STRIDE($bp),$bp
404
	por	%xmm3,%xmm0
405
406
	movq	%xmm0,$m0		# m0=bp[0]
500
	movq	%xmm0,$m0		# m0=bp[0]
501
407
	mov	($n0),$n0		# pull n0[0] value
502
	mov	($n0),$n0		# pull n0[0] value
408
	mov	($ap),%rax
503
	mov	($ap),%rax
409
504
410
	xor	$i,$i			# i=0
505
	xor	$i,$i			# i=0
411
	xor	$j,$j			# j=0
506
	xor	$j,$j			# j=0
412
507
413
	movq	`0*$STRIDE/4-96`($bp),%xmm0
414
	movq	`1*$STRIDE/4-96`($bp),%xmm1
415
	pand	%xmm4,%xmm0
416
	movq	`2*$STRIDE/4-96`($bp),%xmm2
417
	pand	%xmm5,%xmm1
418
419
	mov	$n0,$m1
508
	mov	$n0,$m1
420
	mulq	$m0			# ap[0]*bp[0]
509
	mulq	$m0			# ap[0]*bp[0]
421
	mov	%rax,$A[0]
510
	mov	%rax,$A[0]
422
	mov	($np),%rax
511
	mov	($np),%rax
423
512
424
	movq	`3*$STRIDE/4-96`($bp),%xmm3
425
	pand	%xmm6,%xmm2
426
	por	%xmm1,%xmm0
427
	pand	%xmm7,%xmm3
428
429
	imulq	$A[0],$m1		# "tp[0]"*n0
513
	imulq	$A[0],$m1		# "tp[0]"*n0
430
	mov	%rdx,$A[1]
514
	mov	%rdx,$A[1]
431
515
432
	por	%xmm2,%xmm0
433
	lea	$STRIDE($bp),$bp
434
	por	%xmm3,%xmm0
435
436
	mulq	$m1			# np[0]*m1
516
	mulq	$m1			# np[0]*m1
437
	add	%rax,$A[0]		# discarded
517
	add	%rax,$A[0]		# discarded
438
	mov	8($ap),%rax
518
	mov	8($ap),%rax
 Lines 550-557   $code.=<<___; Link Here 
550
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
630
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
551
	mov	%rdx,$N[0]
631
	mov	%rdx,$N[0]
552
632
553
	movq	%xmm0,$m0		# bp[1]
554
555
	xor	$N[1],$N[1]
633
	xor	$N[1],$N[1]
556
	add	$A[0],$N[0]
634
	add	$A[0],$N[0]
557
	adc	\$0,$N[1]
635
	adc	\$0,$N[1]
 Lines 561-572   $code.=<<___; Link Here 
561
	lea	1($i),$i		# i++
639
	lea	1($i),$i		# i++
562
.align	4
640
.align	4
563
.Louter4x:
641
.Louter4x:
642
	lea	32+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
643
	pxor	%xmm4,%xmm4
644
	pxor	%xmm5,%xmm5
645
___
646
for($k=0;$k<$STRIDE/16;$k+=4) {
647
$code.=<<___;
648
	movdqa	`16*($k+0)-128`($bp),%xmm0
649
	movdqa	`16*($k+1)-128`($bp),%xmm1
650
	movdqa	`16*($k+2)-128`($bp),%xmm2
651
	movdqa	`16*($k+3)-128`($bp),%xmm3
652
	pand	`16*($k+0)-128`(%rdx),%xmm0
653
	pand	`16*($k+1)-128`(%rdx),%xmm1
654
	por	%xmm0,%xmm4
655
	pand	`16*($k+2)-128`(%rdx),%xmm2
656
	por	%xmm1,%xmm5
657
	pand	`16*($k+3)-128`(%rdx),%xmm3
658
	por	%xmm2,%xmm4
659
	por	%xmm3,%xmm5
660
___
661
}
662
$code.=<<___;
663
	por	%xmm5,%xmm4
664
	pshufd	\$0x4e,%xmm4,%xmm0
665
	por	%xmm4,%xmm0
666
	lea	$STRIDE($bp),$bp
667
	movq	%xmm0,$m0		# m0=bp[i]
668
564
	xor	$j,$j			# j=0
669
	xor	$j,$j			# j=0
565
	movq	`0*$STRIDE/4-96`($bp),%xmm0
566
	movq	`1*$STRIDE/4-96`($bp),%xmm1
567
	pand	%xmm4,%xmm0
568
	movq	`2*$STRIDE/4-96`($bp),%xmm2
569
	pand	%xmm5,%xmm1
570
670
571
	mov	(%rsp),$A[0]
671
	mov	(%rsp),$A[0]
572
	mov	$n0,$m1
672
	mov	$n0,$m1
 Lines 575-592   $code.=<<___; Link Here 
575
	mov	($np),%rax
675
	mov	($np),%rax
576
	adc	\$0,%rdx
676
	adc	\$0,%rdx
577
677
578
	movq	`3*$STRIDE/4-96`($bp),%xmm3
579
	pand	%xmm6,%xmm2
580
	por	%xmm1,%xmm0
581
	pand	%xmm7,%xmm3
582
583
	imulq	$A[0],$m1		# tp[0]*n0
678
	imulq	$A[0],$m1		# tp[0]*n0
584
	mov	%rdx,$A[1]
679
	mov	%rdx,$A[1]
585
680
586
	por	%xmm2,%xmm0
587
	lea	$STRIDE($bp),$bp
588
	por	%xmm3,%xmm0
589
590
	mulq	$m1			# np[0]*m1
681
	mulq	$m1			# np[0]*m1
591
	add	%rax,$A[0]		# "$N[0]", discarded
682
	add	%rax,$A[0]		# "$N[0]", discarded
592
	mov	8($ap),%rax
683
	mov	8($ap),%rax
 Lines 718-724   $code.=<<___; Link Here 
718
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
809
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
719
	mov	%rdx,$N[0]
810
	mov	%rdx,$N[0]
720
811
721
	movq	%xmm0,$m0		# bp[i+1]
722
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
812
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
723
813
724
	xor	$N[1],$N[1]
814
	xor	$N[1],$N[1]
 Lines 809-821   ___ Link Here 
809
$code.=<<___;
899
$code.=<<___;
810
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
900
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
811
	mov	\$1,%rax
901
	mov	\$1,%rax
812
___
902
813
$code.=<<___ if ($win64);
814
	movaps	(%rsi),%xmm6
815
	movaps	0x10(%rsi),%xmm7
816
	lea	0x28(%rsi),%rsi
817
___
818
$code.=<<___;
819
	mov	(%rsi),%r15
903
	mov	(%rsi),%r15
820
	mov	8(%rsi),%r14
904
	mov	8(%rsi),%r14
821
	mov	16(%rsi),%r13
905
	mov	16(%rsi),%r13
 Lines 830-837   ___ Link Here 
830
}}}
914
}}}
831
915
832
{
916
{
833
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
917
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") : # Win64 order
834
				("%rdi","%rsi","%rdx","%rcx"); # Unix order
918
				("%rdi","%rsi","%rdx","%ecx"); # Unix order
835
my $out=$inp;
919
my $out=$inp;
836
my $STRIDE=2**5*8;
920
my $STRIDE=2**5*8;
837
my $N=$STRIDE/4;
921
my $N=$STRIDE/4;
 Lines 859-911   bn_scatter5: Link Here 
859
.type	bn_gather5,\@abi-omnipotent
943
.type	bn_gather5,\@abi-omnipotent
860
.align	16
944
.align	16
861
bn_gather5:
945
bn_gather5:
862
___
946
.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
863
$code.=<<___ if ($win64);
864
.LSEH_begin_bn_gather5:
865
	# I can't trust assembler to use specific encoding:-(
947
	# I can't trust assembler to use specific encoding:-(
866
	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
948
	.byte	0x4c,0x8d,0x14,0x24			# lea    (%rsp),%r10
867
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
949
	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# sub	$0x108,%rsp
868
	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
950
	lea	.Linc(%rip),%rax
951
	and	\$-16,%rsp		# shouldn't be formally required
952
953
	movd	$idx,%xmm5
954
	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
955
	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
956
	lea	128($tbl),%r11		# size optimization
957
	lea	128(%rsp),%rax		# size optimization
958
959
	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
960
	movdqa	%xmm1,%xmm4
961
	movdqa	%xmm1,%xmm2
869
___
962
___
963
########################################################################
964
# calculate mask by comparing 0..31 to $idx and save result to stack
965
#
966
for($i=0;$i<$STRIDE/16;$i+=4) {
967
$code.=<<___;
968
	paddd	%xmm0,%xmm1
969
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
970
___
971
$code.=<<___	if ($i);
972
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
973
___
974
$code.=<<___;
975
	movdqa	%xmm4,%xmm3
976
977
	paddd	%xmm1,%xmm2
978
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
979
	movdqa	%xmm0,`16*($i+0)-128`(%rax)
980
	movdqa	%xmm4,%xmm0
981
982
	paddd	%xmm2,%xmm3
983
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
984
	movdqa	%xmm1,`16*($i+1)-128`(%rax)
985
	movdqa	%xmm4,%xmm1
986
987
	paddd	%xmm3,%xmm0
988
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
989
	movdqa	%xmm2,`16*($i+2)-128`(%rax)
990
	movdqa	%xmm4,%xmm2
991
___
992
}
870
$code.=<<___;
993
$code.=<<___;
871
	mov	$idx,%r11
994
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
872
	shr	\$`log($N/8)/log(2)`,$idx
873
	and	\$`$N/8-1`,%r11
874
	not	$idx
875
	lea	.Lmagic_masks(%rip),%rax
876
	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
877
	lea	96($tbl,%r11,8),$tbl	# pointer within 1st cache line
878
	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
879
	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
880
	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
881
	movq	24(%rax,$idx,8),%xmm7
882
	jmp	.Lgather
995
	jmp	.Lgather
883
.align	16
884
.Lgather:
885
	movq	`0*$STRIDE/4-96`($tbl),%xmm0
886
	movq	`1*$STRIDE/4-96`($tbl),%xmm1
887
	pand	%xmm4,%xmm0
888
	movq	`2*$STRIDE/4-96`($tbl),%xmm2
889
	pand	%xmm5,%xmm1
890
	movq	`3*$STRIDE/4-96`($tbl),%xmm3
891
	pand	%xmm6,%xmm2
892
	por	%xmm1,%xmm0
893
	pand	%xmm7,%xmm3
894
	por	%xmm2,%xmm0
895
	lea	$STRIDE($tbl),$tbl
896
	por	%xmm3,%xmm0
897
996
997
.align	32
998
.Lgather:
999
	pxor	%xmm4,%xmm4
1000
	pxor	%xmm5,%xmm5
1001
___
1002
for($i=0;$i<$STRIDE/16;$i+=4) {
1003
$code.=<<___;
1004
	movdqa	`16*($i+0)-128`(%r11),%xmm0
1005
	movdqa	`16*($i+1)-128`(%r11),%xmm1
1006
	movdqa	`16*($i+2)-128`(%r11),%xmm2
1007
	pand	`16*($i+0)-128`(%rax),%xmm0
1008
	movdqa	`16*($i+3)-128`(%r11),%xmm3
1009
	pand	`16*($i+1)-128`(%rax),%xmm1
1010
	por	%xmm0,%xmm4
1011
	pand	`16*($i+2)-128`(%rax),%xmm2
1012
	por	%xmm1,%xmm5
1013
	pand	`16*($i+3)-128`(%rax),%xmm3
1014
	por	%xmm2,%xmm4
1015
	por	%xmm3,%xmm5
1016
___
1017
}
1018
$code.=<<___;
1019
	por	%xmm5,%xmm4
1020
	lea	$STRIDE(%r11),%r11
1021
	pshufd	\$0x4e,%xmm4,%xmm0
1022
	por	%xmm4,%xmm0
898
	movq	%xmm0,($out)		# m0=bp[0]
1023
	movq	%xmm0,($out)		# m0=bp[0]
899
	lea	8($out),$out
1024
	lea	8($out),$out
900
	sub	\$1,$num
1025
	sub	\$1,$num
901
	jnz	.Lgather
1026
	jnz	.Lgather
902
___
1027
903
$code.=<<___ if ($win64);
1028
	lea	(%r10),%rsp
904
	movaps	(%rsp),%xmm6
905
	movaps	0x10(%rsp),%xmm7
906
	lea	0x28(%rsp),%rsp
907
___
908
$code.=<<___;
909
	ret
1029
	ret
910
.LSEH_end_bn_gather5:
1030
.LSEH_end_bn_gather5:
911
.size	bn_gather5,.-bn_gather5
1031
.size	bn_gather5,.-bn_gather5
 Lines 913-921   ___ Link Here 
913
}
1033
}
914
$code.=<<___;
1034
$code.=<<___;
915
.align	64
1035
.align	64
916
.Lmagic_masks:
1036
.Linc:
917
	.long	0,0, 0,0, 0,0, -1,-1
1037
	.long	0,0, 1,1
918
	.long	0,0, 0,0, 0,0,  0,0
1038
	.long	2,2, 2,2
919
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1039
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
920
___
1040
___
921
1041
 Lines 954-960   mul_handler: Link Here 
954
	cmp	%r10,%rbx		# context->Rip<end of prologue label
1074
	cmp	%r10,%rbx		# context->Rip<end of prologue label
955
	jb	.Lcommon_seh_tail
1075
	jb	.Lcommon_seh_tail
956
1076
957
	lea	`40+48`(%rax),%rax
1077
	lea	48(%rax),%rax
958
1078
959
	mov	4(%r11),%r10d		# HandlerData[1]
1079
	mov	4(%r11),%r10d		# HandlerData[1]
960
	lea	(%rsi,%r10),%r10	# end of alloca label
1080
	lea	(%rsi,%r10),%r10	# end of alloca label
 Lines 971-979   mul_handler: Link Here 
971
	mov	192($context),%r10	# pull $num
1091
	mov	192($context),%r10	# pull $num
972
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
1092
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
973
1093
974
	movaps	(%rax),%xmm0
1094
	lea	48(%rax),%rax
975
	movaps	16(%rax),%xmm1
976
	lea	`40+48`(%rax),%rax
977
1095
978
	mov	-8(%rax),%rbx
1096
	mov	-8(%rax),%rbx
979
	mov	-16(%rax),%rbp
1097
	mov	-16(%rax),%rbp
 Lines 987-994   mul_handler: Link Here 
987
	mov	%r13,224($context)	# restore context->R13
1105
	mov	%r13,224($context)	# restore context->R13
988
	mov	%r14,232($context)	# restore context->R14
1106
	mov	%r14,232($context)	# restore context->R14
989
	mov	%r15,240($context)	# restore context->R15
1107
	mov	%r15,240($context)	# restore context->R15
990
	movups	%xmm0,512($context)	# restore context->Xmm6
991
	movups	%xmm1,528($context)	# restore context->Xmm7
992
1108
993
.Lcommon_seh_tail:
1109
.Lcommon_seh_tail:
994
	mov	8(%rax),%rdi
1110
	mov	8(%rax),%rdi
 Lines 1057-1066   mul_handler: Link Here 
1057
	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
1173
	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
1058
.align	8
1174
.align	8
1059
.LSEH_info_bn_gather5:
1175
.LSEH_info_bn_gather5:
1060
        .byte   0x01,0x0d,0x05,0x00
1176
	.byte	0x01,0x0b,0x03,0x0a
1061
        .byte   0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
1177
	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
1062
        .byte   0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
1178
	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp), set_frame r10
1063
        .byte   0x04,0x42,0x00,0x00	#sub	rsp,0x28
1064
.align	8
1179
.align	8
1065
___
1180
___
1066
}
1181
}
(-)a/crypto/bn/bn_exp.c (-18 / +57 lines)
 Lines 110-115    Link Here 
110
 */
110
 */
111
111
112
#include "cryptlib.h"
112
#include "cryptlib.h"
113
#include "constant_time_locl.h"
113
#include "bn_lcl.h"
114
#include "bn_lcl.h"
114
115
115
#include <stdlib.h>
116
#include <stdlib.h>
 Lines 546-560   int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, Link Here 
546
547
547
static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top,
548
static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top,
548
                                        unsigned char *buf, int idx,
549
                                        unsigned char *buf, int idx,
549
                                        int width)
550
                                        int window)
550
{
551
{
551
    size_t i, j;
552
    int i, j;
553
    int width = 1 << window;
554
    BN_ULONG *table = (BN_ULONG *)buf;
552
555
553
    if (top > b->top)
556
    if (top > b->top)
554
        top = b->top;           /* this works because 'buf' is explicitly
557
        top = b->top;           /* this works because 'buf' is explicitly
555
                                 * zeroed */
558
                                 * zeroed */
556
    for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
559
    for (i = 0, j = idx; i < top; i++, j += width) {
557
        buf[j] = ((unsigned char *)b->d)[i];
560
        table[j] = b->d[i];
558
    }
561
    }
559
562
560
    return 1;
563
    return 1;
 Lines 562-576   static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, Link Here 
562
565
563
static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top,
566
static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top,
564
                                          unsigned char *buf, int idx,
567
                                          unsigned char *buf, int idx,
565
                                          int width)
568
                                          int window)
566
{
569
{
567
    size_t i, j;
570
    int i, j;
571
    int width = 1 << window;
572
    volatile BN_ULONG *table = (volatile BN_ULONG *)buf;
568
573
569
    if (bn_wexpand(b, top) == NULL)
574
    if (bn_wexpand(b, top) == NULL)
570
        return 0;
575
        return 0;
571
576
572
    for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
577
    if (window <= 3) {
573
        ((unsigned char *)b->d)[i] = buf[j];
578
        for (i = 0; i < top; i++, table += width) {
579
            BN_ULONG acc = 0;
580
581
            for (j = 0; j < width; j++) {
582
                acc |= table[j] &
583
                       ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
584
            }
585
586
            b->d[i] = acc;
587
        }
588
    } else {
589
        int xstride = 1 << (window - 2);
590
        BN_ULONG y0, y1, y2, y3;
591
592
        i = idx >> (window - 2);        /* equivalent of idx / xstride */
593
        idx &= xstride - 1;             /* equivalent of idx % xstride */
594
595
        y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1);
596
        y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1);
597
        y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1);
598
        y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1);
599
600
        for (i = 0; i < top; i++, table += width) {
601
            BN_ULONG acc = 0;
602
603
            for (j = 0; j < xstride; j++) {
604
                acc |= ( (table[j + 0 * xstride] & y0) |
605
                         (table[j + 1 * xstride] & y1) |
606
                         (table[j + 2 * xstride] & y2) |
607
                         (table[j + 3 * xstride] & y3) )
608
                       & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
609
            }
610
611
            b->d[i] = acc;
612
        }
574
    }
613
    }
575
614
576
    b->top = top;
615
    b->top = top;
 Lines 800-808   int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, Link Here 
800
    } else
839
    } else
801
#endif
840
#endif
802
    {
841
    {
803
        if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers))
842
        if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, window))
804
            goto err;
843
            goto err;
805
        if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers))
844
        if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, window))
806
            goto err;
845
            goto err;
807
846
808
        /*
847
        /*
 Lines 814-828   int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, Link Here 
814
        if (window > 1) {
853
        if (window > 1) {
815
            if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx))
854
            if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx))
816
                goto err;
855
                goto err;
817
            if (!MOD_EXP_CTIME_COPY_TO_PREBUF
856
            if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2,
818
                (&tmp, top, powerbuf, 2, numPowers))
857
                                              window))
819
                goto err;
858
                goto err;
820
            for (i = 3; i < numPowers; i++) {
859
            for (i = 3; i < numPowers; i++) {
821
                /* Calculate a^i = a^(i-1) * a */
860
                /* Calculate a^i = a^(i-1) * a */
822
                if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx))
861
                if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx))
823
                    goto err;
862
                    goto err;
824
                if (!MOD_EXP_CTIME_COPY_TO_PREBUF
863
                if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i,
825
                    (&tmp, top, powerbuf, i, numPowers))
864
                                                  window))
826
                    goto err;
865
                    goto err;
827
            }
866
            }
828
        }
867
        }
 Lines 830-837   int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, Link Here 
830
        bits--;
869
        bits--;
831
        for (wvalue = 0, i = bits % window; i >= 0; i--, bits--)
870
        for (wvalue = 0, i = bits % window; i >= 0; i--, bits--)
832
            wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
871
            wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
833
        if (!MOD_EXP_CTIME_COPY_FROM_PREBUF
872
        if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf, wvalue,
834
            (&tmp, top, powerbuf, wvalue, numPowers))
873
                                            window))
835
            goto err;
874
            goto err;
836
875
837
        /*
876
        /*
 Lines 851-858   int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, Link Here 
851
            /*
890
            /*
852
             * Fetch the appropriate pre-computed value from the pre-buf
891
             * Fetch the appropriate pre-computed value from the pre-buf
853
             */
892
             */
854
            if (!MOD_EXP_CTIME_COPY_FROM_PREBUF
893
            if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue,
855
                (&am, top, powerbuf, wvalue, numPowers))
894
                                                window))
856
                goto err;
895
                goto err;
857
896
858
            /* Multiply the result into the intermediate result */
897
            /* Multiply the result into the intermediate result */

Return to bug 40189