1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * Copyright (C) 2003-2013 Altera Corporation |
4 | * All rights reserved. |
5 | */ |
6 | |
7 | |
8 | #include <linux/linkage.h> |
9 | #include <asm/entry.h> |
10 | |
.set noat
.set nobreak

/*
 * The two directives above relax the assembler's register checks for the
 * code that follows:
 *
 *   .set noat     Explicitly allow the use of r1 (at, the assembler
 *                 temporary register).  This register is normally reserved
 *                 for the use of the assembler.
 *   .set nobreak  Explicitly allow the use of bt (r25) and ba (r30), which
 *                 are normally reserved for the debugger.
 *
 * Both checks are switched back on after the handler.
 */

/*
 * Layout of the scratch frame built by instruction_trap:
 *
 *   4*n(sp), n = 0..31   value of register rn (the "register window")
 *   INSN_EMU_ESTATUS(sp) saved estatus
 *
 * estatus deliberately lives outside the register window: operands are
 * fetched from, and the result is stored to, 4*regnum(sp), so anything kept
 * inside the window can be overwritten by an instruction that names the
 * corresponding register as its destination.
 */
#define INSN_EMU_ESTATUS	128
#define INSN_EMU_FRAME_SIZE	132

/*
 * instruction_trap - emulate a multiply or divide instruction in software.
 *
 * Entry: sp points at a saved-register frame that is read through the PT_*
 *        offsets and is PT_REGS_SIZE bytes long.  r16-r23 are not reloaded
 *        from that frame; they are saved exactly as found in the registers.
 * Exit:  eret.  Every register except et is reloaded from the scratch
 *        frame, so the only visible change is the destination register of
 *        the emulated instruction.
 *
 * The instruction at -4(ea) is decoded as muli when OP == 0x24.  Any other
 * OP is treated as R-type, and OPX & 0x02 picks multiply versus divide; the
 * code does not check that OP/OPX really is one of the instructions listed
 * below.
 */
ENTRY(instruction_trap)
	/*
	 * Reload the registers held in the PT_* frame so that they carry the
	 * values the trapping instruction saw, then pop that frame.
	 */
	ldw	r1, PT_R1(sp)
	ldw	r2, PT_R2(sp)
	ldw	r3, PT_R3(sp)
	ldw	r4, PT_R4(sp)
	ldw	r5, PT_R5(sp)
	ldw	r6, PT_R6(sp)
	ldw	r7, PT_R7(sp)
	ldw	r8, PT_R8(sp)
	ldw	r9, PT_R9(sp)
	ldw	r10, PT_R10(sp)
	ldw	r11, PT_R11(sp)
	ldw	r12, PT_R12(sp)
	ldw	r13, PT_R13(sp)
	ldw	r14, PT_R14(sp)
	ldw	r15, PT_R15(sp)
	ldw	ra, PT_RA(sp)
	ldw	fp, PT_FP(sp)
	ldw	gp, PT_GP(sp)
	ldw	et, PT_ESTATUS(sp)
	wrctl	estatus, et
	ldw	ea, PT_EA(sp)
	ldw	et, PT_SP(sp)		/* backup sp in et */

	addi	sp, sp, PT_REGS_SIZE

/* INSTRUCTION EMULATION
 * ---------------------
 *
 * Nios II processors generate exceptions for unimplemented instructions.
 * The routines below emulate these instructions.  Depending on the
 * processor core, the only instructions that might need to be emulated
 * are div, divu, mul, muli, mulxss, mulxsu, and mulxuu.
 *
 * The emulations match the instructions, except for the following
 * limitations:
 *
 * 1) The emulation routines do not emulate the use of the exception
 *    temporary register (et) as a source operand because the exception
 *    handler already has modified it.  Its slot is zeroed, so et reads
 *    as zero.
 *
 * 2) The routines do not emulate the use of the stack pointer (sp) or
 *    the exception return address register (ea) as a destination because
 *    modifying these registers crashes the exception handler or the
 *    interrupted routine.
 *
 * Detailed Design
 * ---------------
 *
 * The emulation routines expect the contents of integer registers r0-r31
 * to be on the stack at addresses sp, 4(sp), 8(sp), ... 124(sp).  The
 * routines retrieve source operands from the stack and modify the
 * destination register's value on the stack prior to the end of the
 * exception handler.  Then all registers except the destination register
 * are restored to their previous values.
 *
 * The instruction that causes the exception is found at address -4(ea).
 * The instruction's OP and OPX fields identify the operation to be
 * performed.
 *
 * One instruction, muli, is an I-type instruction that is identified by
 * an OP field of 0x24.
 *
 *	muli   AAAAA,BBBBB,IIIIIIIIIIIIIIII,-0x24-
 *	       27    22    6                0       <-- LSB of field
 *
 * The remaining emulated instructions are R-type and have an OP field
 * of 0x3a.  Their OPX fields identify them.
 *
 *	R-type AAAAA,BBBBB,CCCCC,XXXXXX,NNNNN,-0x3a-
 *	       27    22    17    11     6     0     <-- LSB of field
 *
 *
 * Opcode Encoding.  muli is identified by its OP value.  Then OPX & 0x02
 * is used to differentiate between the division opcodes and the
 * remaining multiplication opcodes.
 *
 *	Instruction   OP      OPX    OPX & 0x02
 *	-----------   ----    ----   ----------
 *	muli          0x24
 *	divu          0x3a    0x24         0
 *	div           0x3a    0x25         0
 *	mul           0x3a    0x27      != 0
 *	mulxuu        0x3a    0x07      != 0
 *	mulxsu        0x3a    0x17      != 0
 *	mulxss        0x3a    0x1f      != 0
 */


	/*
	 * Save everything on the stack to make it easy for the emulation
	 * routines to retrieve the source register operands.  Each register
	 * rn goes to 4*n(sp) so that a register field taken from the
	 * instruction can be used directly as a stack offset.
	 */

	addi	sp, sp, -INSN_EMU_FRAME_SIZE
	stw	zero, 0(sp)	/* Save zero on stack to avoid special case for r0. */
	stw	r1, 4(sp)
	stw	r2, 8(sp)
	stw	r3, 12(sp)
	stw	r4, 16(sp)
	stw	r5, 20(sp)
	stw	r6, 24(sp)
	stw	r7, 28(sp)
	stw	r8, 32(sp)
	stw	r9, 36(sp)
	stw	r10, 40(sp)
	stw	r11, 44(sp)
	stw	r12, 48(sp)
	stw	r13, 52(sp)
	stw	r14, 56(sp)
	stw	r15, 60(sp)
	stw	r16, 64(sp)
	stw	r17, 68(sp)
	stw	r18, 72(sp)
	stw	r19, 76(sp)
	stw	r20, 80(sp)
	stw	r21, 84(sp)
	stw	r22, 88(sp)
	stw	r23, 92(sp)
	/*
	 * et (r24) has already been changed, so its value cannot be saved.
	 * Zero the slot so that an instruction naming et as a source reads a
	 * defined value instead of stale stack contents.
	 */
	stw	zero, 96(sp)
	stw	bt, 100(sp)	/* r25 */
	stw	gp, 104(sp)	/* r26 */
	stw	et, 108(sp)	/* r27: et contains previous sp value. */
	stw	fp, 112(sp)	/* r28 */
	stw	ea, 116(sp)	/* r29 */
	stw	ba, 120(sp)	/* r30 */
	stw	ra, 124(sp)	/* r31 */

	/* r5 is already saved above, so it is free to carry estatus. */
	rdctl	r5, estatus
	stw	r5, INSN_EMU_ESTATUS(sp)


	/*
	 * Split the instruction into its fields.  We need 4*A, 4*B, and 4*C as
	 * offsets to the stack pointer for access to the stored register values.
	 */
	ldw	r2, -4(ea)	/* r2 = AAAAA,BBBBB,IIIIIIIIIIIIIIII,PPPPPP */
	roli	r3, r2, 7	/* r3 = BBB,IIIIIIIIIIIIIIII,PPPPPP,AAAAA,BB */
	roli	r4, r3, 3	/* r4 = IIIIIIIIIIIIIIII,PPPPPP,AAAAA,BBBBB */
	roli	r5, r4, 2	/* r5 = IIIIIIIIIIIIII,PPPPPP,AAAAA,BBBBB,II */
	srai	r4, r4, 16	/* r4 = (sign-extended) IMM16 */
	roli	r6, r5, 5	/* r6 = XXXX,NNNNN,PPPPPP,AAAAA,BBBBB,CCCCC,XX */
	andi	r2, r2, 0x3f	/* r2 = 00000000000000000000000000,PPPPPP */
	andi	r3, r3, 0x7c	/* r3 = 0000000000000000000000000,AAAAA,00 */
	andi	r5, r5, 0x7c	/* r5 = 0000000000000000000000000,BBBBB,00 */
	andi	r6, r6, 0x7c	/* r6 = 0000000000000000000000000,CCCCC,00 */

	/* Now
	 * r2 = OP
	 * r3 = 4*A
	 * r4 = IMM16 (sign extended)
	 * r5 = 4*B
	 * r6 = 4*C
	 */

	/*
	 * Get the operands.
	 *
	 * It is necessary to check for muli because it uses an I-type
	 * instruction format, while the other instructions have an R-type
	 * format.
	 *
	 * Prepare for either multiplication or division loop.
	 * They both loop 32 times.
	 */
	movi	r14, 32

	add	r3, r3, sp	/* r3 = address of A-operand. */
	ldw	r3, 0(r3)	/* r3 = A-operand. */
	movi	r7, 0x24	/* muli opcode (I-type instruction format) */
	beq	r2, r7, mul_immed /* muli doesn't use the B register as a source */

	add	r5, r5, sp	/* r5 = address of B-operand. */
	ldw	r5, 0(r5)	/* r5 = B-operand. */
				/* r4 = SSSSSSSSSSSSSSSS,-----IMM16------ */
				/* IMM16 not needed, align OPX portion */
				/* r4 = SSSSSSSSSSSSSSSS,CCCCC,-OPX--,00000 */
	srli	r4, r4, 5	/* r4 = 00000,SSSSSSSSSSSSSSSS,CCCCC,-OPX-- */
	andi	r4, r4, 0x3f	/* r4 = 00000000000000000000000000,-OPX-- */

	/* Now
	 * r2 = OP
	 * r3 = src1
	 * r5 = src2
	 * r4 = OPX (no longer can be muli)
	 * r6 = 4*C
	 */


	/*
	 * Multiply or Divide?
	 */
	andi	r7, r4, 0x02	/* For R-type multiply instructions,
				   OPX & 0x02 != 0 */
	bne	r7, zero, multiply


/* DIVISION
 *
 * Divide an unsigned dividend by an unsigned divisor using
 * a shift-and-subtract algorithm.  The example below shows
 * 43 div 7 = 6 for 8-bit integers.  This classic algorithm uses a
 * single register to store both the dividend and the quotient,
 * allowing both values to be shifted with a single instruction.
 *
 *                               remainder dividend:quotient
 *                               --------- -----------------
 *   initialize                   00000000     00101011:
 *   shift                        00000000     0101011:_
 *   remainder >= divisor? no     00000000     0101011:0
 *   shift                        00000000     101011:0_
 *   remainder >= divisor? no     00000000     101011:00
 *   shift                        00000001     01011:00_
 *   remainder >= divisor? no     00000001     01011:000
 *   shift                        00000010     1011:000_
 *   remainder >= divisor? no     00000010     1011:0000
 *   shift                        00000101     011:0000_
 *   remainder >= divisor? no     00000101     011:00000
 *   shift                        00001010     11:00000_
 *   remainder >= divisor? yes    00001010     11:000001
 *       remainder -= divisor   - 00000111
 *                              ----------
 *                                00000011     11:000001
 *   shift                        00000111     1:000001_
 *   remainder >= divisor? yes    00000111     1:0000011
 *       remainder -= divisor   - 00000111
 *                              ----------
 *                                00000000     1:0000011
 *   shift                        00000001     :0000011_
 *   remainder >= divisor? no     00000001     :00000110
 *
 * The quotient is 00000110.
 */

divide:
	/*
	 * Prepare for division by assuming the result
	 * is unsigned, and storing its "sign" as 0.
	 */
	movi	r17, 0


	/* Which division opcode? */
	xori	r7, r4, 0x25		/* OPX of div */
	bne	r7, zero, unsigned_division


	/*
	 * OPX is div.  Determine and store the sign of the quotient.
	 * Then take the absolute value of both operands.
	 */
	xor	r17, r3, r5		/* MSB contains sign of quotient */
	bge	r3, zero, dividend_is_nonnegative
	sub	r3, zero, r3		/* -r3 */
dividend_is_nonnegative:
	bge	r5, zero, divisor_is_nonnegative
	sub	r5, zero, r5		/* -r5 */
divisor_is_nonnegative:


unsigned_division:
	/* Initialize the unsigned-division loop. */
	movi	r13, 0			/* remainder = 0 */

	/* Now
	 * r3 = dividend : quotient
	 * r4 = 0x25 for div, 0x24 for divu
	 * r5 = divisor
	 * r13 = remainder
	 * r14 = loop counter (already initialized to 32)
	 * r17 = MSB contains sign of quotient
	 */


	/*
	 *   for (count = 32; count > 0; --count)
	 *   {
	 */
divide_loop:

	/*
	 *       Division:
	 *
	 *       (remainder:dividend:quotient) <<= 1;
	 */
	slli	r13, r13, 1
	cmplt	r7, r3, zero		/* r7 = MSB of r3 */
	or	r13, r13, r7
	slli	r3, r3, 1


	/*
	 *       if (remainder >= divisor)
	 *       {
	 *           set LSB of quotient
	 *           remainder -= divisor;
	 *       }
	 */
	bltu	r13, r5, div_skip
	ori	r3, r3, 1
	sub	r13, r13, r5
div_skip:

	/*
	 *   }
	 */
	subi	r14, r14, 1
	bne	r14, zero, divide_loop


	/* Now
	 * r3 = quotient
	 * r4 = 0x25 for div, 0x24 for divu
	 * r6 = 4*C
	 * r17 = MSB contains sign of quotient
	 */


	/*
	 * Conditionally negate signed quotient.  If quotient is unsigned,
	 * the sign already is initialized to 0.
	 */
	bge	r17, zero, quotient_is_nonnegative
	sub	r3, zero, r3		/* -r3 */
quotient_is_nonnegative:


	/*
	 * Final quotient is in r3.
	 */
	add	r6, r6, sp
	stw	r3, 0(r6)		/* write quotient to stack */
	br	restore_registers




/* MULTIPLICATION
 *
 * A "product" is the number that one gets by summing a "multiplicand"
 * several times.  The "multiplier" specifies the number of copies of the
 * multiplicand that are summed.
 *
 * Actual multiplication algorithms don't use repeated addition, however.
 * Shift-and-add algorithms get the same answer as repeated addition, and
 * they are faster.  To compute the lower half of a product (pppp below)
 * one shifts the product left before adding in each of the partial
 * products (a * mmmm) through (d * mmmm).
 *
 * To compute the upper half of a product (PPPP below), one adds in the
 * partial products (d * mmmm) through (a * mmmm), each time following
 * the add by a right shift of the product.
 *
 *        mmmm
 *      * abcd
 *      ------
 *        ####  = d * mmmm
 *       ####   = c * mmmm
 *      ####    = b * mmmm
 *     ####     = a * mmmm
 *    --------
 *    PPPPpppp
 *
 * The example above shows 4 partial products.  Computing actual Nios II
 * products requires 32 partials.
 *
 * It is possible to compute the result of mulxsu from the result of
 * mulxuu because the only difference between the results of these two
 * opcodes is the value of the partial product associated with the sign
 * bit of rA.
 *
 *   mulxsu = mulxuu - ((rA < 0) ? rB : 0);
 *
 * It is possible to compute the result of mulxss from the result of
 * mulxsu because the only difference between the results of these two
 * opcodes is the value of the partial product associated with the sign
 * bit of rB.
 *
 *   mulxss = mulxsu - ((rB < 0) ? rA : 0);
 *
 */

mul_immed:
	/* Opcode is muli.  Change it into mul for remainder of algorithm. */
	mov	r6, r5		/* Field B is dest register, not field C. */
	mov	r5, r4		/* Field IMM16 is src2, not field B. */
	movi	r4, 0x27	/* OPX of mul is 0x27 */

multiply:
	/* Initialize the multiplication loop. */
	movi	r9, 0		/* mul_product    = 0 */
	movi	r10, 0		/* mulxuu_product = 0 */
	mov	r11, r5		/* save original multiplier for mulxsu and mulxss */
	mov	r12, r5		/* mulxuu_multiplier (will be shifted) */
	movi	r16, 1		/* used to create "rori B,A,1" from "ror B,A,r16" */

	/* Now
	 * r3 = multiplicand
	 * r5 = mul_multiplier
	 * r6 = 4 * dest_register (used later as offset to sp)
	 * r7 = temp
	 * r9 = mul_product
	 * r10 = mulxuu_product
	 * r11 = original multiplier
	 * r12 = mulxuu_multiplier
	 * r14 = loop counter (already initialized)
	 * r16 = 1
	 */


	/*
	 *   for (count = 32; count > 0; --count)
	 *   {
	 */
multiply_loop:

	/*
	 *       mul_product <<= 1;
	 *       lsb = multiplier & 1;
	 */
	slli	r9, r9, 1
	andi	r7, r12, 1

	/*
	 *       if (lsb == 1)
	 *       {
	 *           mulxuu_product += multiplicand;
	 *       }
	 *
	 * When lsb == 0 the add is skipped and r7 is left at 0, so the
	 * "carry" ORed in further down is 0 as well.
	 */
	beq	r7, zero, mulx_skip
	add	r10, r10, r3
	cmpltu	r7, r10, r3	/* Save the carry from the MSB of mulxuu_product. */
	ror	r7, r7, r16	/* r7 = 0x80000000 on carry, or else 0x00000000 */
mulx_skip:

	/*
	 *       if (MSB of mul_multiplier == 1)
	 *       {
	 *           mul_product += multiplicand;
	 *       }
	 */
	bge	r5, zero, mul_skip
	add	r9, r9, r3
mul_skip:

	/*
	 *       mulxuu_product >>= 1;           logical shift
	 *       mul_multiplier <<= 1;           done with MSB
	 *       mulx_multiplier >>= 1;          done with LSB
	 */
	srli	r10, r10, 1
	or	r10, r10, r7		/* OR in the saved carry bit. */
	slli	r5, r5, 1
	srli	r12, r12, 1


	/*
	 *   }
	 */
	subi	r14, r14, 1
	bne	r14, zero, multiply_loop


	/*
	 *  Multiply emulation loop done.
	 */

	/* Now
	 * r3 = multiplicand
	 * r4 = OPX
	 * r6 = 4 * dest_register (used later as offset to sp)
	 * r7 = temp
	 * r9 = mul_product
	 * r10 = mulxuu_product
	 * r11 = original multiplier
	 */


	/* Calculate address for result from 4 * dest_register */
	add	r6, r6, sp


	/*
	 * Select/compute the result based on OPX.
	 */


	/* OPX == mul?  Then store. */
	xori	r7, r4, 0x27
	beq	r7, zero, store_product

	/* It's one of the mulx.. opcodes.  Move over the result. */
	mov	r9, r10

	/* OPX == mulxuu?  Then store. */
	xori	r7, r4, 0x07
	beq	r7, zero, store_product

	/* Compute mulxsu
	 *
	 * mulxsu = mulxuu - ((rA < 0) ? rB : 0);
	 */
	bge	r3, zero, mulxsu_skip
	sub	r9, r9, r11
mulxsu_skip:

	/* OPX == mulxsu?  Then store. */
	xori	r7, r4, 0x17
	beq	r7, zero, store_product

	/* Compute mulxss
	 *
	 * mulxss = mulxsu - ((rB < 0) ? rA : 0);
	 */
	bge	r11, zero, mulxss_skip
	sub	r9, r9, r3
mulxss_skip:
	/* At this point, assume that OPX is mulxss, so store */


store_product:
	stw	r9, 0(r6)


restore_registers:
	/*
	 * Reload every register from its window slot; the destination slot
	 * now holds the emulated result.  No need to restore r0.
	 *
	 * estatus goes first because it needs r5 as a temporary, and r5 is
	 * reloaded from its own slot right afterwards.
	 */
	ldw	r5, INSN_EMU_ESTATUS(sp)
	wrctl	estatus, r5

	ldw	r1, 4(sp)
	ldw	r2, 8(sp)
	ldw	r3, 12(sp)
	ldw	r4, 16(sp)
	ldw	r5, 20(sp)
	ldw	r6, 24(sp)
	ldw	r7, 28(sp)
	ldw	r8, 32(sp)
	ldw	r9, 36(sp)
	ldw	r10, 40(sp)
	ldw	r11, 44(sp)
	ldw	r12, 48(sp)
	ldw	r13, 52(sp)
	ldw	r14, 56(sp)
	ldw	r15, 60(sp)
	ldw	r16, 64(sp)
	ldw	r17, 68(sp)
	ldw	r18, 72(sp)
	ldw	r19, 76(sp)
	ldw	r20, 80(sp)
	ldw	r21, 84(sp)
	ldw	r22, 88(sp)
	ldw	r23, 92(sp)
	/* Does not need to restore et */
	ldw	bt, 100(sp)
	ldw	gp, 104(sp)

	ldw	fp, 112(sp)
	ldw	ea, 116(sp)
	ldw	ba, 120(sp)
	ldw	ra, 124(sp)
	ldw	sp, 108(sp)	/* last restore sp: every other load is sp-relative */
	eret

.set at
.set break
581 | |