1 | # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add |
2 | # the result to a second limb vector. |
3 | # |
4 | # Copyright (C) 2000-2022 Free Software Foundation, Inc. |
5 | # |
6 | # This file is part of the GNU MP Library. |
7 | # |
8 | # The GNU MP Library is free software; you can redistribute it and/or modify |
9 | # it under the terms of the GNU Lesser General Public License as published |
10 | # by the Free Software Foundation; either version 2.1 of the License, or (at |
11 | # your option) any later version. |
12 | # |
13 | # The GNU MP Library is distributed in the hope that it will be useful, but |
14 | # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
15 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
16 | # License for more details. |
17 | # |
18 | # You should have received a copy of the GNU Lesser General Public License |
19 | # along with the GNU MP Library. If not, see <https://www.gnu.org/licenses/>. |
20 | |
21 | # INPUT PARAMETERS |
22 | # res_ptr $16 |
23 | # s1_ptr $17 |
24 | # size $18 |
25 | # s2_limb $19 |
26 | # |
27 | # This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and |
28 | # exactly 3.625 cycles/limb on EV6... |
29 | # |
30 | # This code was written in close cooperation with ev6 pipeline expert |
31 | # Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though. |
32 | # |
33 | # Register usages for unrolled loop: |
34 | # 0-3 mul's |
35 | # 4-7 acc's |
36 | # 8-15 mul results |
37 | # 20,21 carry's |
38 | # 22,23 save for stores |
39 | # |
40 | # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop. |
41 | # |
42 | # The stores can issue a cycle late so we have paired no-op's to 'catch' |
43 | # them, so that further disturbance to the schedule is damped. |
44 | # |
45 | # We couldn't pair the loads, because the entangled schedule of the |
46 | # carries has to happen on one side {0} of the machine. Note the total |
47 | # use of U0, and the total use of L0 (after attending to the stores), |
48 | # which is part of the reason why. |
49 | # |
50 | # This is a great schedule for the d_cache, a poor schedule for the |
51 | # b_cache. The lockup on U0 means that any stall can't be recovered |
52 | # from. Consider a ldq in L1. Say that load gets stalled because it |
53 | # collides with a fill from the b_cache. On the next cycle, this load |
54 | # gets priority. It first looks at L0, and goes there. The instruction |
55 | # we intended for L0 gets to look at L1, which is NOT where we want |
56 | # it. It either stalls 1, because it can't go in L0, or goes there, and |
57 | # causes a further instruction to stall. |
58 | # |
59 | # So for b_cache, we're likely going to want to put one or more cycles |
60 | # back into the code! And, of course, put in prefetches. For the |
61 | # accumulator, lds, intent to modify. For the multiplier, you might |
62 | # want ldq, evict next, if you're not wanting to use it again soon. Use |
63 | # 256 ahead of present pointer value. At a place where we have an mt |
64 | # followed by a bookkeeping, put the bookkeeping in upper, and the |
65 | # prefetch into lower. |
66 | # |
67 | # Note, the usage of physical registers per cycle is smoothed off, as |
68 | # much as possible. |
69 | # |
70 | # Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd |
71 | # like not to have a ldq or stq precede a conditional branch in a |
72 | # quadpack. The conditional branch moves the retire pointer one cycle |
73 | # later. |
74 | # |
75 | # Optimization notes: |
76 | # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27? |
77 | # Reserved regs: $29 $30 $31 |
78 | # Free caller-saves regs in unrolled code: $24 $25 $28 |
79 | # We should swap some of the callee-saves regs for some of the free |
80 | # caller-saves regs, saving some overhead cycles. |
81 | # Most importantly, we should write fast code for the 0-7 case. |
82 | # The code we use there is for the 21164, and runs at 7 cycles/limb |
83 | # on the 21264. Should not be hard, if we write specialized code for |
84 | # 1-7 limbs (the one for 0 limbs should be straightforward). We then just |
85 | # need a jump table indexed by the low 3 bits of the count argument. |
86 | |
# Assembler setup: noreorder asks the assembler to keep the hand-written
# instruction order, noat tells it not to use its temporary register.
87 | .set noreorder |
88 | .set noat |
89 | .text |
90 | |
# __mpn_addmul_1
#
# Multiplies the limb vector at s1_ptr ($17), size ($18) limbs long, by
# s2_limb ($19), adds the products into the limb vector at res_ptr ($16),
# and leaves the final carry limb in $0.
#
# Structure:
#   size < 8   simple loop, one limb per iteration ($Loop0).
#   size >= 8  $Large: (size mod 8) limbs go through the same simple loop
#              ($Loop1), then the 8-way unrolled, software-pipelined loop
#              ($Loop) handles the remaining size/8 groups of 8 limbs.
#
# NOTE(review): size == 0 is not handled -- the small path loads a limb and
# decrements size before it is ever tested for zero. Presumably callers
# guarantee size >= 1; confirm.
91 | .globl __mpn_addmul_1 |
92 | .ent __mpn_addmul_1 |
93 | __mpn_addmul_1: |
# NOTE(review): .frame declares a frame size of 0, yet the $Large path
# lowers $30 by 240 bytes -- confirm this is acceptable for unwinders.
94 | .frame $30,0,$26,0 |
95 | .prologue 0 |
96 | |
# Dispatch: $1 = (size < 8); when that is false, go to the unrolled code.
97 | cmpult $18, 8, $1 |
98 | beq $1, $Large |
99 | |
# ____ SMALL CASE (size < 8) ____
# First limb: start both halves of the product; $0 (prod_high) becomes the
# running carry limb.
100 | ldq $2, 0($17) # $2 = s1_limb |
101 | addq $17, 8, $17 # s1_ptr++ |
102 | subq $18, 1, $18 # size-- |
103 | mulq $2, $19, $3 # $3 = prod_low |
104 | ldq $5, 0($16) # $5 = *res_ptr |
105 | umulh $2, $19, $0 # $0 = prod_high |
106 | beq $18, $Lend0b # jump if size was == 1 |
# Second limb is loaded here; the first result limb is completed and stored,
# with $4 holding the carry out of that addition.
107 | ldq $2, 0($17) # $2 = s1_limb |
108 | addq $17, 8, $17 # s1_ptr++ |
109 | subq $18, 1, $18 # size-- |
110 | addq $5, $3, $3 |
111 | cmpult $3, $5, $4 |
112 | stq $3, 0($16) |
113 | addq $16, 8, $16 # res_ptr++ |
114 | beq $18, $Lend0a # jump if size was == 2 |
115 | |
# Steady state: multiply the limb already in $2, load the next one, and
# propagate two carries per limb -- one from adding the incoming carry limb
# to prod_low, one from adding *res_ptr.
116 | .align 3 |
117 | $Loop0: mulq $2, $19, $3 # $3 = prod_low |
118 | ldq $5, 0($16) # $5 = *res_ptr |
119 | addq $4, $0, $0 # cy_limb = cy_limb + 'cy' |
120 | subq $18, 1, $18 # size-- |
121 | umulh $2, $19, $4 # $4 = cy_limb |
122 | ldq $2, 0($17) # $2 = s1_limb |
123 | addq $17, 8, $17 # s1_ptr++ |
124 | addq $3, $0, $3 # $3 = cy_limb + prod_low |
125 | cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) |
126 | addq $5, $3, $3 |
127 | cmpult $3, $5, $5 |
128 | stq $3, 0($16) |
129 | addq $16, 8, $16 # res_ptr++ |
130 | addq $5, $0, $0 # combine carries |
131 | bne $18, $Loop0 |
# Last limb of the small case: same arithmetic as one loop iteration but
# with no further load; the final high product plus carries is left in $0.
132 | $Lend0a: |
133 | mulq $2, $19, $3 # $3 = prod_low |
134 | ldq $5, 0($16) # $5 = *res_ptr |
135 | addq $4, $0, $0 # cy_limb = cy_limb + 'cy' |
136 | umulh $2, $19, $4 # $4 = cy_limb |
137 | addq $3, $0, $3 # $3 = cy_limb + prod_low |
138 | cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) |
139 | addq $5, $3, $3 |
140 | cmpult $3, $5, $5 |
141 | stq $3, 0($16) |
142 | addq $5, $0, $0 # combine carries |
143 | addq $4, $0, $0 # cy_limb = prod_high + cy |
144 | ret $31, ($26), 1 |
# size == 1: add prod_low to *res_ptr, store it, and leave prod_high plus
# the carry from that addition in $0.
145 | $Lend0b: |
146 | addq $5, $3, $3 |
147 | cmpult $3, $5, $5 |
148 | stq $3, 0($16) |
149 | addq $0, $5, $0 |
150 | ret $31, ($26), 1 |
151 | |
# ____ LARGE CASE (size >= 8) ____
# Lower the stack pointer by 240 bytes and save $9-$15, which the unrolled
# code overwrites. Only offsets 8..56 of that area are used here.
152 | $Large: |
153 | lda $30, -240($30) |
154 | stq $9, 8($30) |
155 | stq $10, 16($30) |
156 | stq $11, 24($30) |
157 | stq $12, 32($30) |
158 | stq $13, 40($30) |
159 | stq $14, 48($30) |
160 | stq $15, 56($30) |
161 | |
# Split the work: $20 = size mod 8 limbs for the simple loop below,
# $18 = size / 8 groups for the unrolled loop. $0 is cleared so that the
# carry limb is 0 when there are no leftover limbs.
162 | and $18, 7, $20 # count for the first loop, 0-7 |
163 | srl $18, 3, $18 # count for unrolled loop |
164 | bis $31, $31, $0 |
165 | beq $20, $Lunroll |
# Leftover limbs: same sequence as the small case, counted by $20 instead
# of $18.
166 | ldq $2, 0($17) # $2 = s1_limb |
167 | addq $17, 8, $17 # s1_ptr++ |
168 | subq $20, 1, $20 # size-- |
169 | mulq $2, $19, $3 # $3 = prod_low |
170 | ldq $5, 0($16) # $5 = *res_ptr |
171 | umulh $2, $19, $0 # $0 = prod_high |
172 | beq $20, $Lend1b # jump if size was == 1 |
173 | ldq $2, 0($17) # $2 = s1_limb |
174 | addq $17, 8, $17 # s1_ptr++ |
175 | subq $20, 1, $20 # size-- |
176 | addq $5, $3, $3 |
177 | cmpult $3, $5, $4 |
178 | stq $3, 0($16) |
179 | addq $16, 8, $16 # res_ptr++ |
180 | beq $20, $Lend1a # jump if size was == 2 |
181 | |
182 | .align 3 |
183 | $Loop1: mulq $2, $19, $3 # $3 = prod_low |
184 | ldq $5, 0($16) # $5 = *res_ptr |
185 | addq $4, $0, $0 # cy_limb = cy_limb + 'cy' |
186 | subq $20, 1, $20 # size-- |
187 | umulh $2, $19, $4 # $4 = cy_limb |
188 | ldq $2, 0($17) # $2 = s1_limb |
189 | addq $17, 8, $17 # s1_ptr++ |
190 | addq $3, $0, $3 # $3 = cy_limb + prod_low |
191 | cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) |
192 | addq $5, $3, $3 |
193 | cmpult $3, $5, $5 |
194 | stq $3, 0($16) |
195 | addq $16, 8, $16 # res_ptr++ |
196 | addq $5, $0, $0 # combine carries |
197 | bne $20, $Loop1 |
198 | |
# Last leftover limb. Unlike $Lend0a this also advances res_ptr, because
# the unrolled code continues from here with the carry limb in $0.
199 | $Lend1a: |
200 | mulq $2, $19, $3 # $3 = prod_low |
201 | ldq $5, 0($16) # $5 = *res_ptr |
202 | addq $4, $0, $0 # cy_limb = cy_limb + 'cy' |
203 | umulh $2, $19, $4 # $4 = cy_limb |
204 | addq $3, $0, $3 # $3 = cy_limb + prod_low |
205 | cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) |
206 | addq $5, $3, $3 |
207 | cmpult $3, $5, $5 |
208 | stq $3, 0($16) |
209 | addq $16, 8, $16 # res_ptr++ |
210 | addq $5, $0, $0 # combine carries |
211 | addq $4, $0, $0 # cy_limb = prod_high + cy |
212 | br $31, $Lunroll |
# Exactly one leftover limb: finish it, advance res_ptr, and fall through
# into $Lunroll with the carry limb in $0.
213 | $Lend1b: |
214 | addq $5, $3, $3 |
215 | cmpult $3, $5, $5 |
216 | stq $3, 0($16) |
217 | addq $16, 8, $16 # res_ptr++ |
218 | addq $0, $5, $0 |
219 | |
# Entry to the unrolled code. Both pointers are biased by -16 to suit the
# displacements used below, and the incoming carry limb moves from $0 to
# $12, since $0 is reused as a multiplier input.
220 | $Lunroll: |
221 | lda $17, -16($17) # L1 bookkeeping |
222 | lda $16, -16($16) # L1 bookkeeping |
223 | bis $0, $31, $12 |
224 | |
225 | # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____ |
# Loads all 8 s1 limbs of the first group, starts their multiplies, and
# stores the first two result limbs. $18 is decremented once for this
# group; if no further group remains, the loop is skipped via $Lend.
226 | |
227 | ldq $2, 16($17) # L1 |
228 | ldq $3, 24($17) # L1 |
229 | lda $18, -1($18) # L1 bookkeeping |
230 | ldq $6, 16($16) # L1 |
231 | ldq $7, 24($16) # L1 |
232 | ldq $0, 32($17) # L1 |
233 | mulq $19, $2, $13 # U1 |
234 | ldq $1, 40($17) # L1 |
235 | umulh $19, $2, $14 # U1 |
236 | mulq $19, $3, $15 # U1 |
237 | lda $17, 64($17) # L1 bookkeeping |
238 | ldq $4, 32($16) # L1 |
239 | ldq $5, 40($16) # L1 |
240 | umulh $19, $3, $8 # U1 |
241 | ldq $2, -16($17) # L1 |
242 | mulq $19, $0, $9 # U1 |
243 | ldq $3, -8($17) # L1 |
244 | umulh $19, $0, $10 # U1 |
245 | addq $6, $13, $6 # L0 lo + acc |
246 | mulq $19, $1, $11 # U1 |
247 | cmpult $6, $13, $20 # L0 lo add => carry |
248 | lda $16, 64($16) # L1 bookkeeping |
249 | addq $6, $12, $22 # U0 hi add => answer |
250 | cmpult $22, $12, $21 # L0 hi add => carry |
251 | addq $14, $20, $14 # U0 hi mul + carry |
252 | ldq $6, -16($16) # L1 |
253 | addq $7, $15, $23 # L0 lo + acc |
254 | addq $14, $21, $14 # U0 hi mul + carry |
255 | ldq $7, -8($16) # L1 |
256 | umulh $19, $1, $12 # U1 |
257 | cmpult $23, $15, $20 # L0 lo add => carry |
258 | addq $23, $14, $23 # U0 hi add => answer |
259 | ldq $0, 0($17) # L1 |
260 | mulq $19, $2, $13 # U1 |
261 | cmpult $23, $14, $21 # L0 hi add => carry |
262 | addq $8, $20, $8 # U0 hi mul + carry |
263 | ldq $1, 8($17) # L1 |
264 | umulh $19, $2, $14 # U1 |
265 | addq $4, $9, $4 # L0 lo + acc |
266 | stq $22, -48($16) # L0 |
267 | stq $23, -40($16) # L1 |
268 | mulq $19, $3, $15 # U1 |
269 | addq $8, $21, $8 # U0 hi mul + carry |
270 | cmpult $4, $9, $20 # L0 lo add => carry |
271 | addq $4, $8, $22 # U0 hi add => answer |
272 | ble $18, $Lend # U1 bookkeeping |
273 | |
274 | # ____ MAIN UNROLLED LOOP ____ |
# Each pass stores 8 result limbs: the remaining 6 of the current group and
# the first 2 of the next group, whose s1 limbs it also loads. $18 is
# decremented once per pass and both pointers advance by 64 bytes.
# "bis $31, $31, $31" writes the zero register, i.e. it is a no-op used
# only to fill issue slots.
275 | .align 4 |
276 | $Loop: |
277 | bis $31, $31, $31 # U1 mt |
278 | cmpult $22, $8, $21 # L0 hi add => carry |
279 | addq $10, $20, $10 # U0 hi mul + carry |
280 | ldq $4, 0($16) # L1 |
281 | |
282 | bis $31, $31, $31 # U1 mt |
283 | addq $5, $11, $23 # L0 lo + acc |
284 | addq $10, $21, $10 # L0 hi mul + carry |
285 | ldq $5, 8($16) # L1 |
286 | |
287 | umulh $19, $3, $8 # U1 |
288 | cmpult $23, $11, $20 # L0 lo add => carry |
289 | addq $23, $10, $23 # U0 hi add => answer |
290 | ldq $2, 16($17) # L1 |
291 | |
292 | mulq $19, $0, $9 # U1 |
293 | cmpult $23, $10, $21 # L0 hi add => carry |
294 | addq $12, $20, $12 # U0 hi mul + carry |
295 | ldq $3, 24($17) # L1 |
296 | |
297 | umulh $19, $0, $10 # U1 |
298 | addq $6, $13, $6 # L0 lo + acc |
299 | stq $22, -32($16) # L0 |
300 | stq $23, -24($16) # L1 |
301 | |
302 | bis $31, $31, $31 # L0 st slosh |
303 | mulq $19, $1, $11 # U1 |
304 | bis $31, $31, $31 # L1 st slosh |
305 | addq $12, $21, $12 # U0 hi mul + carry |
306 | |
307 | cmpult $6, $13, $20 # L0 lo add => carry |
308 | bis $31, $31, $31 # U1 mt |
309 | lda $18, -1($18) # L1 bookkeeping |
310 | addq $6, $12, $22 # U0 hi add => answer |
311 | |
312 | bis $31, $31, $31 # U1 mt |
313 | cmpult $22, $12, $21 # L0 hi add => carry |
314 | addq $14, $20, $14 # U0 hi mul + carry |
315 | ldq $6, 16($16) # L1 |
316 | |
317 | bis $31, $31, $31 # U1 mt |
318 | addq $7, $15, $23 # L0 lo + acc |
319 | addq $14, $21, $14 # U0 hi mul + carry |
320 | ldq $7, 24($16) # L1 |
321 | |
322 | umulh $19, $1, $12 # U1 |
323 | cmpult $23, $15, $20 # L0 lo add => carry |
324 | addq $23, $14, $23 # U0 hi add => answer |
325 | ldq $0, 32($17) # L1 |
326 | |
327 | mulq $19, $2, $13 # U1 |
328 | cmpult $23, $14, $21 # L0 hi add => carry |
329 | addq $8, $20, $8 # U0 hi mul + carry |
330 | ldq $1, 40($17) # L1 |
331 | |
332 | umulh $19, $2, $14 # U1 |
333 | addq $4, $9, $4 # U0 lo + acc |
334 | stq $22, -16($16) # L0 |
335 | stq $23, -8($16) # L1 |
336 | |
337 | bis $31, $31, $31 # L0 st slosh |
338 | mulq $19, $3, $15 # U1 |
339 | bis $31, $31, $31 # L1 st slosh |
340 | addq $8, $21, $8 # L0 hi mul + carry |
341 | |
342 | cmpult $4, $9, $20 # L0 lo add => carry |
343 | bis $31, $31, $31 # U1 mt |
344 | lda $17, 64($17) # L1 bookkeeping |
345 | addq $4, $8, $22 # U0 hi add => answer |
346 | |
347 | bis $31, $31, $31 # U1 mt |
348 | cmpult $22, $8, $21 # L0 hi add => carry |
349 | addq $10, $20, $10 # U0 hi mul + carry |
350 | ldq $4, 32($16) # L1 |
351 | |
352 | bis $31, $31, $31 # U1 mt |
353 | addq $5, $11, $23 # L0 lo + acc |
354 | addq $10, $21, $10 # L0 hi mul + carry |
355 | ldq $5, 40($16) # L1 |
356 | |
357 | umulh $19, $3, $8 # U1 |
358 | cmpult $23, $11, $20 # L0 lo add => carry |
359 | addq $23, $10, $23 # U0 hi add => answer |
360 | ldq $2, -16($17) # L1 |
361 | |
362 | mulq $19, $0, $9 # U1 |
363 | cmpult $23, $10, $21 # L0 hi add => carry |
364 | addq $12, $20, $12 # U0 hi mul + carry |
365 | ldq $3, -8($17) # L1 |
366 | |
367 | umulh $19, $0, $10 # U1 |
368 | addq $6, $13, $6 # L0 lo + acc |
369 | stq $22, 0($16) # L0 |
370 | stq $23, 8($16) # L1 |
371 | |
372 | bis $31, $31, $31 # L0 st slosh |
373 | mulq $19, $1, $11 # U1 |
374 | bis $31, $31, $31 # L1 st slosh |
375 | addq $12, $21, $12 # U0 hi mul + carry |
376 | |
377 | cmpult $6, $13, $20 # L0 lo add => carry |
378 | bis $31, $31, $31 # U1 mt |
379 | lda $16, 64($16) # L1 bookkeeping |
380 | addq $6, $12, $22 # U0 hi add => answer |
381 | |
382 | bis $31, $31, $31 # U1 mt |
383 | cmpult $22, $12, $21 # L0 hi add => carry |
384 | addq $14, $20, $14 # U0 hi mul + carry |
385 | ldq $6, -16($16) # L1 |
386 | |
387 | bis $31, $31, $31 # U1 mt |
388 | addq $7, $15, $23 # L0 lo + acc |
389 | addq $14, $21, $14 # U0 hi mul + carry |
390 | ldq $7, -8($16) # L1 |
391 | |
392 | umulh $19, $1, $12 # U1 |
393 | cmpult $23, $15, $20 # L0 lo add => carry |
394 | addq $23, $14, $23 # U0 hi add => answer |
395 | ldq $0, 0($17) # L1 |
396 | |
397 | mulq $19, $2, $13 # U1 |
398 | cmpult $23, $14, $21 # L0 hi add => carry |
399 | addq $8, $20, $8 # U0 hi mul + carry |
400 | ldq $1, 8($17) # L1 |
401 | |
402 | umulh $19, $2, $14 # U1 |
403 | addq $4, $9, $4 # L0 lo + acc |
404 | stq $22, -48($16) # L0 |
405 | stq $23, -40($16) # L1 |
406 | |
407 | bis $31, $31, $31 # L0 st slosh |
408 | mulq $19, $3, $15 # U1 |
409 | bis $31, $31, $31 # L1 st slosh |
410 | addq $8, $21, $8 # U0 hi mul + carry |
411 | |
412 | cmpult $4, $9, $20 # L0 lo add => carry |
413 | addq $4, $8, $22 # U0 hi add => answer |
414 | bis $31, $31, $31 # L1 mt |
415 | bgt $18, $Loop # U1 bookkeeping |
416 | |
417 | # ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____ |
# Drain: completes and stores the remaining 6 limbs of the last group. No
# further s1 limbs are loaded here (only the two outstanding res limbs),
# and the final carry limb is written to $0.
418 | $Lend: |
419 | cmpult $22, $8, $21 # L0 hi add => carry |
420 | addq $10, $20, $10 # U0 hi mul + carry |
421 | ldq $4, 0($16) # L1 |
422 | addq $5, $11, $23 # L0 lo + acc |
423 | addq $10, $21, $10 # L0 hi mul + carry |
424 | ldq $5, 8($16) # L1 |
425 | umulh $19, $3, $8 # U1 |
426 | cmpult $23, $11, $20 # L0 lo add => carry |
427 | addq $23, $10, $23 # U0 hi add => answer |
428 | mulq $19, $0, $9 # U1 |
429 | cmpult $23, $10, $21 # L0 hi add => carry |
430 | addq $12, $20, $12 # U0 hi mul + carry |
431 | umulh $19, $0, $10 # U1 |
432 | addq $6, $13, $6 # L0 lo + acc |
433 | stq $22, -32($16) # L0 |
434 | stq $23, -24($16) # L1 |
435 | mulq $19, $1, $11 # U1 |
436 | addq $12, $21, $12 # U0 hi mul + carry |
437 | cmpult $6, $13, $20 # L0 lo add => carry |
438 | addq $6, $12, $22 # U0 hi add => answer |
439 | cmpult $22, $12, $21 # L0 hi add => carry |
440 | addq $14, $20, $14 # U0 hi mul + carry |
441 | addq $7, $15, $23 # L0 lo + acc |
442 | addq $14, $21, $14 # U0 hi mul + carry |
443 | umulh $19, $1, $12 # U1 |
444 | cmpult $23, $15, $20 # L0 lo add => carry |
445 | addq $23, $14, $23 # U0 hi add => answer |
446 | cmpult $23, $14, $21 # L0 hi add => carry |
447 | addq $8, $20, $8 # U0 hi mul + carry |
448 | addq $4, $9, $4 # U0 lo + acc |
449 | stq $22, -16($16) # L0 |
450 | stq $23, -8($16) # L1 |
451 | bis $31, $31, $31 # L0 st slosh |
452 | addq $8, $21, $8 # L0 hi mul + carry |
453 | cmpult $4, $9, $20 # L0 lo add => carry |
454 | addq $4, $8, $22 # U0 hi add => answer |
455 | cmpult $22, $8, $21 # L0 hi add => carry |
456 | addq $10, $20, $10 # U0 hi mul + carry |
457 | addq $5, $11, $23 # L0 lo + acc |
458 | addq $10, $21, $10 # L0 hi mul + carry |
459 | cmpult $23, $11, $20 # L0 lo add => carry |
460 | addq $23, $10, $23 # U0 hi add => answer |
461 | cmpult $23, $10, $21 # L0 hi add => carry |
462 | addq $12, $20, $12 # U0 hi mul + carry |
463 | stq $22, 0($16) # L0 |
464 | stq $23, 8($16) # L1 |
465 | addq $12, $21, $0 # U0 hi mul + carry |
466 | |
# Epilogue: reload $9-$15 from the slots written at $Large, raise the stack
# pointer back by 240 bytes, and return through $26.
467 | ldq $9, 8($30) |
468 | ldq $10, 16($30) |
469 | ldq $11, 24($30) |
470 | ldq $12, 32($30) |
471 | ldq $13, 40($30) |
472 | ldq $14, 48($30) |
473 | ldq $15, 56($30) |
474 | lda $30, 240($30) |
475 | ret $31, ($26), 1 |
476 | |
477 | .end __mpn_addmul_1 |
478 | |