1 # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
2 # the result to a second limb vector.
3 #
4 # Copyright (C) 2000-2022 Free Software Foundation, Inc.
5 #
6 # This file is part of the GNU MP Library.
7 #
8 # The GNU MP Library is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU Lesser General Public License as published
10 # by the Free Software Foundation; either version 2.1 of the License, or (at
11 # your option) any later version.
12 #
13 # The GNU MP Library is distributed in the hope that it will be useful, but
14 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 # License for more details.
17 #
18 # You should have received a copy of the GNU Lesser General Public License
19 # along with the GNU MP Library. If not, see <https://www.gnu.org/licenses/>.
20
21 # INPUT PARAMETERS
22 # res_ptr $16
23 # s1_ptr $17
24 # size $18
25 # s2_limb $19
26 #
27 # This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
28 # exactly 3.625 cycles/limb on EV6...
29 #
30 # This code was written in close cooperation with ev6 pipeline expert
31 # Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
32 #
33 # Register usages for unrolled loop:
34 # 0-3 mul's
35 # 4-7 acc's
36 # 8-15 mul results
37 # 20,21 carry's
38 # 22,23 save for stores
39 #
40 # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
41 #
42 # The stores can issue a cycle late so we have paired no-op's to 'catch'
43 # them, so that further disturbance to the schedule is damped.
44 #
45 # We couldn't pair the loads, because the entangled schedule of the
46 # carry's has to happen on one side {0} of the machine. Note, the total
47 # use of U0, and the total use of L0 (after attending to the stores),
48 # which is part of the reason why....
49 #
50 # This is a great schedule for the d_cache, a poor schedule for the
51 # b_cache. The lockup on U0 means that any stall can't be recovered
52 # from. Consider a ldq in L1. say that load gets stalled because it
53 # collides with a fill from the b_Cache. On the next cycle, this load
54 # gets priority. It first looks at L0, and goes there. The instruction
55 # we intended for L0 gets to look at L1, which is NOT where we want
56 # it. It either stalls 1, because it can't go in L0, or goes there, and
57 # causes a further instruction to stall.
58 #
59 # So for b_cache, we're likely going to want to put one or more cycles
60 # back into the code! And, of course, put in prefetches. For the
61 # accumulator, lds, intent to modify. For the multiplier, you might
62 # want ldq, evict next, if you're not wanting to use it again soon. Use
63 # 256 ahead of present pointer value. At a place where we have an mt
64 # followed by a bookkeeping, put the bookkeeping in upper, and the
65 # prefetch into lower.
66 #
67 # Note, the usage of physical registers per cycle is smoothed off, as
68 # much as possible.
69 #
70 # Note, the ldq's and stq's are at the end of the quadpacks. note, we'd
71 # like not to have a ldq or stq to precede a conditional branch in a
72 # quadpack. The conditional branch moves the retire pointer one cycle
73 # later.
74 #
75 # Optimization notes:
76 # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
77 # Reserved regs: $29 $30 $31
78 # Free caller-saves regs in unrolled code: $24 $25 $28
79 # We should swap some of the callee-saves regs for some of the free
80 # caller-saves regs, saving some overhead cycles.
81 # Most importantly, we should write fast code for the 0-7 case.
82 # The code we use there is for the 21164, and runs at 7 cycles/limb
83 # on the 21264. Should not be hard, if we write specialized code for
84 # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
85 # need a jump table indexed by the low 3 bits of the count argument.
86
87 .set noreorder
88 .set noat
89 .text
90
# ----------------------------------------------------------------------
# __mpn_addmul_1
#
# Multiplies each limb of the vector at s1_ptr by s2_limb and adds the
# products into the vector at res_ptr, storing the sums back to res_ptr.
#
# In:   $16 = res_ptr, $17 = s1_ptr, $18 = size (limb count),
#       $19 = s2_limb
# Out:  $0  = final carry limb (set immediately before every ret)
#
# Structure:
#   size <  8 : simple one-limb-at-a-time loop ($Loop0), no stack frame.
#   size >= 8 : $Large saves $9-$15 on the stack, runs (size & 7) limbs
#               through a copy of the simple loop ($Loop1), then runs
#               (size >> 3) passes of the 8-limb software-pipelined
#               loop ($Loop), and finally drains the pipeline ($Lend).
#
# NOTE(review): the first limb is loaded and multiplied before the count
# is tested, so size == 0 is not handled here -- presumably callers
# guarantee size >= 1; confirm.
# NOTE(review): .frame declares a frame size of 0 although the $Large
# path lowers $30 by 240 bytes -- confirm whether unwind info matters.
# ----------------------------------------------------------------------
91 .globl __mpn_addmul_1
92 .ent __mpn_addmul_1
93__mpn_addmul_1:
94 .frame $30,0,$26,0
95 .prologue 0
96
# $1 = (size < 8); when it is zero (size >= 8) take the unrolled path.
97 cmpult $18, 8, $1
98 beq $1, $Large
99
# ---- Small case (size < 8): first limb, no incoming carry ----
100 ldq $2, 0($17) # $2 = s1_limb
101 addq $17, 8, $17 # s1_ptr++
102 subq $18, 1, $18 # size--
103 mulq $2, $19, $3 # $3 = prod_low
104 ldq $5, 0($16) # $5 = *res_ptr
105 umulh $2, $19, $0 # $0 = prod_high
106 beq $18, $Lend0b # jump if size was == 1
# More than one limb: fetch the next s1 limb, then finish limb 0.
# $4 receives the carry out of (*res_ptr + prod_low); $0 still holds
# prod_high, and the two are summed at the top of the next stage.
107 ldq $2, 0($17) # $2 = s1_limb
108 addq $17, 8, $17 # s1_ptr++
109 subq $18, 1, $18 # size--
110 addq $5, $3, $3
111 cmpult $3, $5, $4
112 stq $3, 0($16)
113 addq $16, 8, $16 # res_ptr++
114 beq $18, $Lend0a # jump if size was == 2
115
# Steady state: each pass multiplies the limb already in $2, loads the
# following limb for the next pass, and folds the two carries (from
# adding the carry limb and from adding *res_ptr) into $0.
116 .align 3
117$Loop0: mulq $2, $19, $3 # $3 = prod_low
118 ldq $5, 0($16) # $5 = *res_ptr
119 addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
120 subq $18, 1, $18 # size--
121 umulh $2, $19, $4 # $4 = cy_limb
122 ldq $2, 0($17) # $2 = s1_limb
123 addq $17, 8, $17 # s1_ptr++
124 addq $3, $0, $3 # $3 = cy_limb + prod_low
125 cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
126 addq $5, $3, $3
127 cmpult $3, $5, $5
128 stq $3, 0($16)
129 addq $16, 8, $16 # res_ptr++
130 addq $5, $0, $0 # combine carries
131 bne $18, $Loop0
# Last limb of the small case: same step as the loop body but with no
# further s1 load; the return value is prod_high plus the carries.
132$Lend0a:
133 mulq $2, $19, $3 # $3 = prod_low
134 ldq $5, 0($16) # $5 = *res_ptr
135 addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
136 umulh $2, $19, $4 # $4 = cy_limb
137 addq $3, $0, $3 # $3 = cy_limb + prod_low
138 cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
139 addq $5, $3, $3
140 cmpult $3, $5, $5
141 stq $3, 0($16)
142 addq $5, $0, $0 # combine carries
143 addq $4, $0, $0 # cy_limb = prod_high + cy
144 ret $31, ($26), 1
# size == 1: add the single low product into *res_ptr and return
# prod_high plus the carry from that addition.
145$Lend0b:
146 addq $5, $3, $3
147 cmpult $3, $5, $5
148 stq $3, 0($16)
149 addq $0, $5, $0
150 ret $31, ($26), 1
151
# ---- Large case (size >= 8) ----
# Lower the stack pointer by 240 bytes and save $9-$15, which the
# unrolled loop uses for multiply results; only offsets 8..56 of the
# frame are written here.
152$Large:
153 lda $30, -240($30)
154 stq $9, 8($30)
155 stq $10, 16($30)
156 stq $11, 24($30)
157 stq $12, 32($30)
158 stq $13, 40($30)
159 stq $14, 48($30)
160 stq $15, 56($30)
161
# Split the count: $20 = size mod 8 limbs for the simple loop below,
# $18 = size / 8 passes for the unrolled loop. $0 = 0 is the carry-in
# used when there are no leftover limbs.
162 and $18, 7, $20 # count for the first loop, 0-7
163 srl $18, 3, $18 # count for unrolled loop
164 bis $31, $31, $0
165 beq $20, $Lunroll
# Leftover limbs: same sequence as the small case, counting in $20
# instead of $18; the running carry limb ends up in $0.
166 ldq $2, 0($17) # $2 = s1_limb
167 addq $17, 8, $17 # s1_ptr++
168 subq $20, 1, $20 # size--
169 mulq $2, $19, $3 # $3 = prod_low
170 ldq $5, 0($16) # $5 = *res_ptr
171 umulh $2, $19, $0 # $0 = prod_high
172 beq $20, $Lend1b # jump if size was == 1
173 ldq $2, 0($17) # $2 = s1_limb
174 addq $17, 8, $17 # s1_ptr++
175 subq $20, 1, $20 # size--
176 addq $5, $3, $3
177 cmpult $3, $5, $4
178 stq $3, 0($16)
179 addq $16, 8, $16 # res_ptr++
180 beq $20, $Lend1a # jump if size was == 2
181
182 .align 3
183$Loop1: mulq $2, $19, $3 # $3 = prod_low
184 ldq $5, 0($16) # $5 = *res_ptr
185 addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
186 subq $20, 1, $20 # size--
187 umulh $2, $19, $4 # $4 = cy_limb
188 ldq $2, 0($17) # $2 = s1_limb
189 addq $17, 8, $17 # s1_ptr++
190 addq $3, $0, $3 # $3 = cy_limb + prod_low
191 cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
192 addq $5, $3, $3
193 cmpult $3, $5, $5
194 stq $3, 0($16)
195 addq $16, 8, $16 # res_ptr++
196 addq $5, $0, $0 # combine carries
197 bne $20, $Loop1
198
# Last leftover limb. Unlike $Lend0a this also advances res_ptr,
# because execution continues into the unrolled loop, not a return.
199$Lend1a:
200 mulq $2, $19, $3 # $3 = prod_low
201 ldq $5, 0($16) # $5 = *res_ptr
202 addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
203 umulh $2, $19, $4 # $4 = cy_limb
204 addq $3, $0, $3 # $3 = cy_limb + prod_low
205 cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
206 addq $5, $3, $3
207 cmpult $3, $5, $5
208 stq $3, 0($16)
209 addq $16, 8, $16 # res_ptr++
210 addq $5, $0, $0 # combine carries
211 addq $4, $0, $0 # cy_limb = prod_high + cy
212 br $31, $Lunroll
# Exactly one leftover limb: finish it and fall through to $Lunroll.
213$Lend1b:
214 addq $5, $3, $3
215 cmpult $3, $5, $5
216 stq $3, 0($16)
217 addq $16, 8, $16 # res_ptr++
218 addq $0, $5, $0
219
# Entry to the unrolled code. Both pointers are biased by -16, which
# the displacements of the loads and stores below account for, and the
# carry limb from the leftover stage is moved from $0 into $12.
220$Lunroll:
221 lda $17, -16($17) # L1 bookkeeping
222 lda $16, -16($16) # L1 bookkeeping
223 bis $0, $31, $12
224
# Pipeline fill: starts the loads and multiplies for the first limbs of
# the 8-limb group and stores the first two results. $18 is decremented
# once here; if no full pass remains, control goes straight to $Lend.
# The trailing U0/U1/L0/L1 tags are the original authors' intended
# issue-slot annotations and are kept untouched.
225 # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
226
227 ldq $2, 16($17) # L1
228 ldq $3, 24($17) # L1
229 lda $18, -1($18) # L1 bookkeeping
230 ldq $6, 16($16) # L1
231 ldq $7, 24($16) # L1
232 ldq $0, 32($17) # L1
233 mulq $19, $2, $13 # U1
234 ldq $1, 40($17) # L1
235 umulh $19, $2, $14 # U1
236 mulq $19, $3, $15 # U1
237 lda $17, 64($17) # L1 bookkeeping
238 ldq $4, 32($16) # L1
239 ldq $5, 40($16) # L1
240 umulh $19, $3, $8 # U1
241 ldq $2, -16($17) # L1
242 mulq $19, $0, $9 # U1
243 ldq $3, -8($17) # L1
244 umulh $19, $0, $10 # U1
245 addq $6, $13, $6 # L0 lo + acc
246 mulq $19, $1, $11 # U1
247 cmpult $6, $13, $20 # L0 lo add => carry
248 lda $16, 64($16) # L1 bookkeeping
249 addq $6, $12, $22 # U0 hi add => answer
250 cmpult $22, $12, $21 # L0 hi add => carry
251 addq $14, $20, $14 # U0 hi mul + carry
252 ldq $6, -16($16) # L1
253 addq $7, $15, $23 # L0 lo + acc
254 addq $14, $21, $14 # U0 hi mul + carry
255 ldq $7, -8($16) # L1
256 umulh $19, $1, $12 # U1
257 cmpult $23, $15, $20 # L0 lo add => carry
258 addq $23, $14, $23 # U0 hi add => answer
259 ldq $0, 0($17) # L1
260 mulq $19, $2, $13 # U1
261 cmpult $23, $14, $21 # L0 hi add => carry
262 addq $8, $20, $8 # U0 hi mul + carry
263 ldq $1, 8($17) # L1
264 umulh $19, $2, $14 # U1
265 addq $4, $9, $4 # L0 lo + acc
266 stq $22, -48($16) # L0
267 stq $23, -40($16) # L1
268 mulq $19, $3, $15 # U1
269 addq $8, $21, $8 # U0 hi mul + carry
270 cmpult $4, $9, $20 # L0 lo add => carry
271 addq $4, $8, $22 # U0 hi add => answer
272 ble $18, $Lend # U1 bookkeeping
273
# One pass issues 8 mulq/umulh pairs and 8 result stores, i.e. 8 limbs.
# Per pass, $17 and $16 each advance by 64 bytes and $18 drops by 1.
# "bis $31, $31, $31" writes only to $31 and serves as the no-op filler
# described in the file header ("mt" slots and "st slosh" pairs after
# the stores). Instruction order is part of the schedule -- do not
# reorder.
274 # ____ MAIN UNROLLED LOOP ____
275 .align 4
276$Loop:
277 bis $31, $31, $31 # U1 mt
278 cmpult $22, $8, $21 # L0 hi add => carry
279 addq $10, $20, $10 # U0 hi mul + carry
280 ldq $4, 0($16) # L1
281
282 bis $31, $31, $31 # U1 mt
283 addq $5, $11, $23 # L0 lo + acc
284 addq $10, $21, $10 # L0 hi mul + carry
285 ldq $5, 8($16) # L1
286
287 umulh $19, $3, $8 # U1
288 cmpult $23, $11, $20 # L0 lo add => carry
289 addq $23, $10, $23 # U0 hi add => answer
290 ldq $2, 16($17) # L1
291
292 mulq $19, $0, $9 # U1
293 cmpult $23, $10, $21 # L0 hi add => carry
294 addq $12, $20, $12 # U0 hi mul + carry
295 ldq $3, 24($17) # L1
296
297 umulh $19, $0, $10 # U1
298 addq $6, $13, $6 # L0 lo + acc
299 stq $22, -32($16) # L0
300 stq $23, -24($16) # L1
301
302 bis $31, $31, $31 # L0 st slosh
303 mulq $19, $1, $11 # U1
304 bis $31, $31, $31 # L1 st slosh
305 addq $12, $21, $12 # U0 hi mul + carry
306
307 cmpult $6, $13, $20 # L0 lo add => carry
308 bis $31, $31, $31 # U1 mt
309 lda $18, -1($18) # L1 bookkeeping
310 addq $6, $12, $22 # U0 hi add => answer
311
312 bis $31, $31, $31 # U1 mt
313 cmpult $22, $12, $21 # L0 hi add => carry
314 addq $14, $20, $14 # U0 hi mul + carry
315 ldq $6, 16($16) # L1
316
317 bis $31, $31, $31 # U1 mt
318 addq $7, $15, $23 # L0 lo + acc
319 addq $14, $21, $14 # U0 hi mul + carry
320 ldq $7, 24($16) # L1
321
322 umulh $19, $1, $12 # U1
323 cmpult $23, $15, $20 # L0 lo add => carry
324 addq $23, $14, $23 # U0 hi add => answer
325 ldq $0, 32($17) # L1
326
327 mulq $19, $2, $13 # U1
328 cmpult $23, $14, $21 # L0 hi add => carry
329 addq $8, $20, $8 # U0 hi mul + carry
330 ldq $1, 40($17) # L1
331
332 umulh $19, $2, $14 # U1
333 addq $4, $9, $4 # U0 lo + acc
334 stq $22, -16($16) # L0
335 stq $23, -8($16) # L1
336
337 bis $31, $31, $31 # L0 st slosh
338 mulq $19, $3, $15 # U1
339 bis $31, $31, $31 # L1 st slosh
340 addq $8, $21, $8 # L0 hi mul + carry
341
342 cmpult $4, $9, $20 # L0 lo add => carry
343 bis $31, $31, $31 # U1 mt
344 lda $17, 64($17) # L1 bookkeeping
345 addq $4, $8, $22 # U0 hi add => answer
346
347 bis $31, $31, $31 # U1 mt
348 cmpult $22, $8, $21 # L0 hi add => carry
349 addq $10, $20, $10 # U0 hi mul + carry
350 ldq $4, 32($16) # L1
351
352 bis $31, $31, $31 # U1 mt
353 addq $5, $11, $23 # L0 lo + acc
354 addq $10, $21, $10 # L0 hi mul + carry
355 ldq $5, 40($16) # L1
356
357 umulh $19, $3, $8 # U1
358 cmpult $23, $11, $20 # L0 lo add => carry
359 addq $23, $10, $23 # U0 hi add => answer
360 ldq $2, -16($17) # L1
361
362 mulq $19, $0, $9 # U1
363 cmpult $23, $10, $21 # L0 hi add => carry
364 addq $12, $20, $12 # U0 hi mul + carry
365 ldq $3, -8($17) # L1
366
367 umulh $19, $0, $10 # U1
368 addq $6, $13, $6 # L0 lo + acc
369 stq $22, 0($16) # L0
370 stq $23, 8($16) # L1
371
372 bis $31, $31, $31 # L0 st slosh
373 mulq $19, $1, $11 # U1
374 bis $31, $31, $31 # L1 st slosh
375 addq $12, $21, $12 # U0 hi mul + carry
376
377 cmpult $6, $13, $20 # L0 lo add => carry
378 bis $31, $31, $31 # U1 mt
379 lda $16, 64($16) # L1 bookkeeping
380 addq $6, $12, $22 # U0 hi add => answer
381
382 bis $31, $31, $31 # U1 mt
383 cmpult $22, $12, $21 # L0 hi add => carry
384 addq $14, $20, $14 # U0 hi mul + carry
385 ldq $6, -16($16) # L1
386
387 bis $31, $31, $31 # U1 mt
388 addq $7, $15, $23 # L0 lo + acc
389 addq $14, $21, $14 # U0 hi mul + carry
390 ldq $7, -8($16) # L1
391
392 umulh $19, $1, $12 # U1
393 cmpult $23, $15, $20 # L0 lo add => carry
394 addq $23, $14, $23 # U0 hi add => answer
395 ldq $0, 0($17) # L1
396
397 mulq $19, $2, $13 # U1
398 cmpult $23, $14, $21 # L0 hi add => carry
399 addq $8, $20, $8 # U0 hi mul + carry
400 ldq $1, 8($17) # L1
401
402 umulh $19, $2, $14 # U1
403 addq $4, $9, $4 # L0 lo + acc
404 stq $22, -48($16) # L0
405 stq $23, -40($16) # L1
406
407 bis $31, $31, $31 # L0 st slosh
408 mulq $19, $3, $15 # U1
409 bis $31, $31, $31 # L1 st slosh
410 addq $8, $21, $8 # U0 hi mul + carry
411
412 cmpult $4, $9, $20 # L0 lo add => carry
413 addq $4, $8, $22 # U0 hi add => answer
414 bis $31, $31, $31 # L1 mt
415 bgt $18, $Loop # U1 bookkeeping
416
# Pipeline drain: completes the limbs whose loads and multiplies are
# already in flight. No further s1 loads are issued; only the last two
# res_ptr limbs are still fetched. The final carry limb is formed in
# $0 by the last addq.
417# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
418$Lend:
419 cmpult $22, $8, $21 # L0 hi add => carry
420 addq $10, $20, $10 # U0 hi mul + carry
421 ldq $4, 0($16) # L1
422 addq $5, $11, $23 # L0 lo + acc
423 addq $10, $21, $10 # L0 hi mul + carry
424 ldq $5, 8($16) # L1
425 umulh $19, $3, $8 # U1
426 cmpult $23, $11, $20 # L0 lo add => carry
427 addq $23, $10, $23 # U0 hi add => answer
428 mulq $19, $0, $9 # U1
429 cmpult $23, $10, $21 # L0 hi add => carry
430 addq $12, $20, $12 # U0 hi mul + carry
431 umulh $19, $0, $10 # U1
432 addq $6, $13, $6 # L0 lo + acc
433 stq $22, -32($16) # L0
434 stq $23, -24($16) # L1
435 mulq $19, $1, $11 # U1
436 addq $12, $21, $12 # U0 hi mul + carry
437 cmpult $6, $13, $20 # L0 lo add => carry
438 addq $6, $12, $22 # U0 hi add => answer
439 cmpult $22, $12, $21 # L0 hi add => carry
440 addq $14, $20, $14 # U0 hi mul + carry
441 addq $7, $15, $23 # L0 lo + acc
442 addq $14, $21, $14 # U0 hi mul + carry
443 umulh $19, $1, $12 # U1
444 cmpult $23, $15, $20 # L0 lo add => carry
445 addq $23, $14, $23 # U0 hi add => answer
446 cmpult $23, $14, $21 # L0 hi add => carry
447 addq $8, $20, $8 # U0 hi mul + carry
448 addq $4, $9, $4 # U0 lo + acc
449 stq $22, -16($16) # L0
450 stq $23, -8($16) # L1
451 bis $31, $31, $31 # L0 st slosh
452 addq $8, $21, $8 # L0 hi mul + carry
453 cmpult $4, $9, $20 # L0 lo add => carry
454 addq $4, $8, $22 # U0 hi add => answer
455 cmpult $22, $8, $21 # L0 hi add => carry
456 addq $10, $20, $10 # U0 hi mul + carry
457 addq $5, $11, $23 # L0 lo + acc
458 addq $10, $21, $10 # L0 hi mul + carry
459 cmpult $23, $11, $20 # L0 lo add => carry
460 addq $23, $10, $23 # U0 hi add => answer
461 cmpult $23, $10, $21 # L0 hi add => carry
462 addq $12, $20, $12 # U0 hi mul + carry
463 stq $22, 0($16) # L0
464 stq $23, 8($16) # L1
465 addq $12, $21, $0 # U0 hi mul + carry
466
# Restore $9-$15 from the slots written at $Large, release the 240-byte
# frame and return with the carry limb in $0.
467 ldq $9, 8($30)
468 ldq $10, 16($30)
469 ldq $11, 24($30)
470 ldq $12, 32($30)
471 ldq $13, 40($30)
472 ldq $14, 48($30)
473 ldq $15, 56($30)
474 lda $30, 240($30)
475 ret $31, ($26), 1
476
477 .end __mpn_addmul_1
478

source code of glibc/sysdeps/alpha/alphaev6/addmul_1.S