insnemu.S source code [linux/arch/nios2/kernel/insnemu.S]

1	/* SPDX-License-Identifier: GPL-2.0-or-later */
2	/*
3	* Copyright (C) 2003-2013 Altera Corporation
4	* All rights reserved.
5	*/
6
7
8	#include <linux/linkage.h>
9	#include <asm/entry.h>
10
11	.set noat
12	.set nobreak
13
14	/*
15	* Explicitly allow the use of r1 (the assembler temporary register)
16	* within this code. This register is normally reserved for the use of
17	* the compiler.
18	*/
19
/*
 * instruction_trap: entry point of the instruction-emulation code.
 *
 * Reloads r1-r15, ra, fp, gp, estatus and ea from the register frame
 * that sp points at (PT_* offsets), keeps the frame's saved sp value in
 * et, and then pops the PT_REGS_SIZE-byte frame so the emulation code
 * below can build its own 128-byte operand frame.
 *
 * NOTE(review): the frame is presumably the one built by the exception
 * entry path before it branches here -- confirm against the caller.
 */
ENTRY(instruction_trap)
	ldw	r1, PT_R1(sp)		/* Restore registers */
	ldw	r2, PT_R2(sp)
	ldw	r3, PT_R3(sp)
	ldw	r4, PT_R4(sp)
	ldw	r5, PT_R5(sp)
	ldw	r6, PT_R6(sp)
	ldw	r7, PT_R7(sp)
	ldw	r8, PT_R8(sp)
	ldw	r9, PT_R9(sp)
	ldw	r10, PT_R10(sp)
	ldw	r11, PT_R11(sp)
	ldw	r12, PT_R12(sp)
	ldw	r13, PT_R13(sp)
	ldw	r14, PT_R14(sp)
	ldw	r15, PT_R15(sp)
	ldw	ra, PT_RA(sp)
	ldw	fp, PT_FP(sp)
	ldw	gp, PT_GP(sp)
	ldw	et, PT_ESTATUS(sp)	/* et is only a scratch register here */
	wrctl	estatus, et
	ldw	ea, PT_EA(sp)
	ldw	et, PT_SP(sp)		/* backup sp in et */

	addi	sp, sp, PT_REGS_SIZE	/* drop the saved-register frame */
45
46	/* INSTRUCTION EMULATION
47	* ---------------------
48	*
49	* Nios II processors generate exceptions for unimplemented instructions.
50	* The routines below emulate these instructions. Depending on the
51	* processor core, the only instructions that might need to be emulated
52	* are div, divu, mul, muli, mulxss, mulxsu, and mulxuu.
53	*
54	* The emulations match the instructions, except for the following
55	* limitations:
56	*
57	* 1) The emulation routines do not emulate the use of the exception
58	* temporary register (et) as a source operand because the exception
59	* handler already has modified it.
60	*
61	* 2) The routines do not emulate the use of the stack pointer (sp) or
62	* the exception return address register (ea) as a destination because
63	* modifying these registers crashes the exception handler or the
64	* interrupted routine.
65	*
66	* Detailed Design
67	* ---------------
68	*
69	* The emulation routines expect the contents of integer registers r0-r31
70	* to be on the stack at addresses sp, 4(sp), 8(sp), ... 124(sp). The
71	* routines retrieve source operands from the stack and modify the
72	* destination register's value on the stack prior to the end of the
73	* exception handler. Then all registers except the destination register
74	* are restored to their previous values.
75	*
76	* The instruction that causes the exception is found at address -4(ea).
77	* The instruction's OP and OPX fields identify the operation to be
78	* performed.
79	*
80	* One instruction, muli, is an I-type instruction that is identified by
81	* an OP field of 0x24.
82	*
83	* muli   AAAAA,BBBBB,IIIIIIIIIIIIIIII,-0x24-
84	*           27    22                6      0  <-- LSB of field
85	*
86	* The remaining emulated instructions are R-type and have an OP field
87	* of 0x3a. Their OPX fields identify them.
88	*
89	* R-type AAAAA,BBBBB,CCCCC,XXXXXX,NNNNN,-0x3a-
90	*           27    22    17     11     6      0  <-- LSB of field
91	*
92	*
93	* Opcode Encoding. muli is identified by its OP value. Then OPX & 0x02
94	* is used to differentiate between the division opcodes and the
95	* remaining multiplication opcodes.
96	*
97	*   Instruction   OP      OPX    OPX & 0x02
98	*   -----------   ----    ----   ----------
99	*   muli          0x24
100	*   divu          0x3a    0x24         0
101	*   div           0x3a    0x25         0
102	*   mul           0x3a    0x27      != 0
103	*   mulxuu        0x3a    0x07      != 0
104	*   mulxsu        0x3a    0x17      != 0
105	*   mulxss        0x3a    0x1f      != 0
106	*/
107
108
/*
 * Save everything on the stack to make it easy for the emulation
 * routines to retrieve the source register operands.
 *
 * The frame is 128 bytes. The decode step below turns a 5-bit register
 * field N into the byte offset 4*N, so register rN must live at
 * 4*N(sp). Two slots are used differently:
 *   100(sp)  holds estatus (reloaded by restore_registers)
 *   120(sp)  holds ra      (reloaded by restore_registers)
 */

	addi	sp, sp, -128
	stw	zero, 0(sp)	/* Save zero on stack to avoid special case for r0. */
	stw	r1, 4(sp)
	stw	r2, 8(sp)
	stw	r3, 12(sp)
	stw	r4, 16(sp)
	stw	r5, 20(sp)
	stw	r6, 24(sp)
	stw	r7, 28(sp)
	stw	r8, 32(sp)
	stw	r9, 36(sp)
	stw	r10, 40(sp)
	stw	r11, 44(sp)
	stw	r12, 48(sp)
	stw	r13, 52(sp)
	stw	r14, 56(sp)
	stw	r15, 60(sp)
	stw	r16, 64(sp)
	stw	r17, 68(sp)
	stw	r18, 72(sp)
	stw	r19, 76(sp)
	stw	r20, 80(sp)
	stw	r21, 84(sp)
	stw	r22, 88(sp)
	stw	r23, 92(sp)
	/* Don't bother to save et. It's already been changed. */
	rdctl	r5, estatus	/* r5 was already saved at 20(sp) above */
	stw	r5, 100(sp)

	stw	gp, 104(sp)
	stw	et, 108(sp)	/* et contains previous sp value. */
	stw	fp, 112(sp)
	stw	ea, 116(sp)
	stw	ra, 120(sp)	/* slot that restore_registers reloads ra from */
	/*
	 * ra is r31, so an emulated instruction that names ra as a source
	 * operand fetches it from 4*31 = 124(sp). Store it there as well;
	 * otherwise that operand would be read from an uninitialised slot.
	 */
	stw	ra, 124(sp)
148
149
/*
 * Split the instruction into its fields. We need 4*A, 4*B, and 4*C as
 * offsets to the stack pointer for access to the stored register values.
 *
 * The faulting instruction word is read from -4(ea). Each register
 * field is rotated into bits 6..2 and masked with 0x7c, which yields
 * the register number already multiplied by 4.
 */
	ldw	r2, -4(ea)	/* r2 = AAAAA,BBBBB,IIIIIIIIIIIIIIII,PPPPPP */
	roli	r3, r2, 7	/* r3 = BBB,IIIIIIIIIIIIIIII,PPPPPP,AAAAA,BB */
	roli	r4, r3, 3	/* r4 = IIIIIIIIIIIIIIII,PPPPPP,AAAAA,BBBBB */
	roli	r5, r4, 2	/* r5 = IIIIIIIIIIIIII,PPPPPP,AAAAA,BBBBB,II */
	srai	r4, r4, 16	/* r4 = (sign-extended) IMM16 */
	roli	r6, r5, 5	/* r6 = XXXX,NNNNN,PPPPPP,AAAAA,BBBBB,CCCCC,XX */
	andi	r2, r2, 0x3f	/* r2 = 00000000000000000000000000,PPPPPP */
	andi	r3, r3, 0x7c	/* r3 = 0000000000000000000000000,AAAAA,00 */
	andi	r5, r5, 0x7c	/* r5 = 0000000000000000000000000,BBBBB,00 */
	andi	r6, r6, 0x7c	/* r6 = 0000000000000000000000000,CCCCC,00 */

	/* Now
	 * r2 = OP
	 * r3 = 4*A
	 * r4 = IMM16 (sign extended)
	 * r5 = 4*B
	 * r6 = 4*C
	 */

	/*
	 * Get the operands.
	 *
	 * It is necessary to check for muli because it uses an I-type
	 * instruction format, while the other instructions have an R-type
	 * format.
	 *
	 * Prepare for either multiplication or division loop.
	 * They both loop 32 times.
	 */
	movi	r14, 32

	add	r3, r3, sp	/* r3 = address of A-operand. */
	ldw	r3, 0(r3)	/* r3 = A-operand. */
	movi	r7, 0x24	/* muli opcode (I-type instruction format) */
	beq	r2, r7, mul_immed /* muli doesn't use the B register as a source */

	add	r5, r5, sp	/* r5 = address of B-operand. */
	ldw	r5, 0(r5)	/* r5 = B-operand. */
				/* r4 = SSSSSSSSSSSSSSSS,-----IMM16------ */
				/* IMM16 not needed, align OPX portion */
				/* r4 = SSSSSSSSSSSSSSSS,CCCCC,-OPX--,00000 */
	srli	r4, r4, 5	/* r4 = 00000,SSSSSSSSSSSSSSSS,CCCCC,-OPX-- */
	andi	r4, r4, 0x3f	/* r4 = 00000000000000000000000000,-OPX-- */

	/* Now
	 * r2 = OP
	 * r3 = src1
	 * r5 = src2
	 * r4 = OPX (no longer can be muli)
	 * r6 = 4*C
	 */

	/*
	 * Multiply or Divide?
	 */
	andi	r7, r4, 0x02	/* For R-type multiply instructions,
				   OPX & 0x02 != 0 */
	bne	r7, zero, multiply
	/* OPX & 0x02 == 0: fall through into the division code. */
213
214
215	/* DIVISION
216	*
217	* Divide an unsigned dividend by an unsigned divisor using
218	* a shift-and-subtract algorithm. The example below shows
219	* 43 div 7 = 6 for 8-bit integers. This classic algorithm uses a
220	* single register to store both the dividend and the quotient,
221	* allowing both values to be shifted with a single instruction.
222	*
223	*                              remainder dividend:quotient
224	*                              --------- -----------------
225	*   initialize                  00000000     00101011:
226	*   shift                       00000000     0101011:_
227	*   remainder >= divisor? no    00000000     0101011:0
228	*   shift                       00000000     101011:0_
229	*   remainder >= divisor? no    00000000     101011:00
230	*   shift                       00000001     01011:00_
231	*   remainder >= divisor? no    00000001     01011:000
232	*   shift                       00000010     1011:000_
233	*   remainder >= divisor? no    00000010     1011:0000
234	*   shift                       00000101     011:0000_
235	*   remainder >= divisor? no    00000101     011:00000
236	*   shift                       00001010     11:00000_
237	*   remainder >= divisor? yes   00001010     11:000001
238	*       remainder -= divisor  - 00000111
239	*                             ----------
240	*                               00000011     11:000001
241	*   shift                       00000111     1:000001_
242	*   remainder >= divisor? yes   00000111     1:0000011
243	*       remainder -= divisor  - 00000111
244	*                             ----------
245	*                               00000000     1:0000011
246	*   shift                       00000001     :0000011_
247	*   remainder >= divisor? no    00000001     :00000110
248	*
249	* The quotient is 00000110.
250	*/
251
divide:
	/*
	 * Emulate div / divu with a 32-step shift-and-subtract loop.
	 *
	 * In:   r3 = dividend (src1), r5 = divisor (src2), r4 = OPX,
	 *       r6 = 4*C (destination slot offset), r14 = 32
	 * Out:  quotient stored at r6(sp), then branches to
	 *       restore_registers
	 * Uses: r7 (temp), r13 (remainder), r17 (sign of quotient)
	 */

	/*
	 * Prepare for division by assuming the result
	 * is unsigned, and storing its "sign" as 0.
	 */
	movi	r17, 0

	/* Which division opcode? */
	xori	r7, r4, 0x25		/* OPX of div */
	bne	r7, zero, unsigned_division

	/*
	 * OPX is div. Determine and store the sign of the quotient.
	 * Then take the absolute value of both operands.
	 */
	xor	r17, r3, r5		/* MSB contains sign of quotient */
	bge	r3, zero, dividend_is_nonnegative
	sub	r3, zero, r3		/* -r3 */
dividend_is_nonnegative:
	bge	r5, zero, divisor_is_nonnegative
	sub	r5, zero, r5		/* -r5 */
divisor_is_nonnegative:

unsigned_division:
	/* Initialize the unsigned-division loop. */
	movi	r13, 0			/* remainder = 0 */

	/* Now
	 * r3 = dividend : quotient
	 * r4 = 0x25 for div, 0x24 for divu
	 * r5 = divisor
	 * r13 = remainder
	 * r14 = loop counter (already initialized to 32)
	 * r17 = MSB contains sign of quotient
	 */

	/*
	 * for (count = 32; count > 0; --count)
	 * {
	 */
divide_loop:

	/*
	 * Division:
	 *
	 * (remainder:dividend:quotient) <<= 1;
	 */
	slli	r13, r13, 1
	cmplt	r7, r3, zero		/* r7 = MSB of r3 */
	or	r13, r13, r7
	slli	r3, r3, 1

	/*
	 * if (remainder >= divisor)
	 * {
	 *     set LSB of quotient
	 *     remainder -= divisor;
	 * }
	 */
	bltu	r13, r5, div_skip	/* unsigned compare */
	ori	r3, r3, 1
	sub	r13, r13, r5
div_skip:

	/*
	 * }
	 */
	subi	r14, r14, 1
	bne	r14, zero, divide_loop

	/* Now
	 * r3 = quotient
	 * r4 = 0x25 for div, 0x24 for divu
	 * r6 = 4*C
	 * r17 = MSB contains sign of quotient
	 */

	/*
	 * Conditionally negate signed quotient. If quotient is unsigned,
	 * the sign already is initialized to 0.
	 */
	bge	r17, zero, quotient_is_nonnegative
	sub	r3, zero, r3		/* -r3 */
quotient_is_nonnegative:

	/*
	 * Final quotient is in r3.
	 */
	add	r6, r6, sp
	stw	r3, 0(r6)		/* write quotient to stack */
	br	restore_registers
351
352
353
354
355	/* MULTIPLICATION
356	*
357	* A "product" is the number that one gets by summing a "multiplicand"
358	* several times. The "multiplier" specifies the number of copies of the
359	* multiplicand that are summed.
360	*
361	* Actual multiplication algorithms don't use repeated addition, however.
362	* Shift-and-add algorithms get the same answer as repeated addition, and
363	* they are faster. To compute the lower half of a product (pppp below)
364	* one shifts the product left before adding in each of the partial
365	* products (a * mmmm) through (d * mmmm).
366	*
367	* To compute the upper half of a product (PPPP below), one adds in the
368	* partial products (d * mmmm) through (a * mmmm), each time following
369	* the add by a right shift of the product.
370	*
371	*        mmmm
372	*      * abcd
373	*      ------
374	*        #### = d * mmmm
375	*       ####  = c * mmmm
376	*      ####   = b * mmmm
377	*     ####    = a * mmmm
378	*     --------
379	*     PPPPpppp
380	*
381	* The example above shows 4 partial products. Computing actual Nios II
382	* products requires 32 partials.
383	*
384	* It is possible to compute the result of mulxsu from the result of
385	* mulxuu because the only difference between the results of these two
386	* opcodes is the value of the partial product associated with the sign
387	* bit of rA.
388	*
389	* mulxsu = mulxuu - ((rA < 0) ? rB : 0);
390	*
391	* It is possible to compute the result of mulxss from the result of
392	* mulxsu because the only difference between the results of these two
393	* opcodes is the value of the partial product associated with the sign
394	* bit of rB.
395	*
396	* mulxss = mulxsu - ((rB < 0) ? rA : 0);
397	*
398	*/
399
mul_immed:
	/*
	 * Emulate muli / mul / mulxuu / mulxsu / mulxss.
	 *
	 * In (mul_immed): r3 = src1, r4 = sign-extended IMM16, r5 = 4*B,
	 *                 r14 = 32
	 * In (multiply):  r3 = src1, r5 = src2, r4 = OPX, r6 = 4*C,
	 *                 r14 = 32
	 * Out: result stored at r6(sp); falls through to the code that
	 *      follows store_product.
	 * Uses: r7, r9, r10, r11, r12, r16
	 */

	/* Opcode is muli. Change it into mul for remainder of algorithm. */
	mov	r6, r5		/* Field B is dest register, not field C. */
	mov	r5, r4		/* Field IMM16 is src2, not field B. */
	movi	r4, 0x27	/* OPX of mul is 0x27 */

multiply:
	/* Initialize the multiplication loop. */
	movi	r9, 0		/* mul_product = 0 */
	movi	r10, 0		/* mulxuu_product = 0 */
	mov	r11, r5		/* save original multiplier for mulxsu and mulxss */
	mov	r12, r5		/* mulxuu_multiplier (will be shifted) */
	movi	r16, 1		/* used to create "rori B,A,1" from "ror B,A,r16" */

	/* Now
	 * r3 = multiplicand
	 * r5 = mul_multiplier
	 * r6 = 4 * dest_register (used later as offset to sp)
	 * r7 = temp
	 * r9 = mul_product
	 * r10 = mulxuu_product
	 * r11 = original multiplier
	 * r12 = mulxuu_multiplier
	 * r14 = loop counter (already initialized)
	 * r16 = 1
	 */

	/*
	 * for (count = 32; count > 0; --count)
	 * {
	 */
multiply_loop:

	/*
	 * mul_product <<= 1;
	 * lsb = mulxuu_multiplier & 1;
	 */
	slli	r9, r9, 1
	andi	r7, r12, 1

	/*
	 * if (lsb == 1)
	 * {
	 *     mulxuu_product += multiplicand;
	 * }
	 *
	 * When the add is skipped r7 is 0 here, so no carry is ORed in below.
	 */
	beq	r7, zero, mulx_skip
	add	r10, r10, r3
	cmpltu	r7, r10, r3	/* Save the carry from the MSB of mulxuu_product. */
	ror	r7, r7, r16	/* r7 = 0x80000000 on carry, or else 0x00000000 */
mulx_skip:

	/*
	 * if (MSB of mul_multiplier == 1)
	 * {
	 *     mul_product += multiplicand;
	 * }
	 */
	bge	r5, zero, mul_skip
	add	r9, r9, r3
mul_skip:

	/*
	 * mulxuu_product >>= 1;     logical shift
	 * mul_multiplier <<= 1;     done with MSB
	 * mulxuu_multiplier >>= 1;  done with LSB
	 */
	srli	r10, r10, 1
	or	r10, r10, r7	/* OR in the saved carry bit. */
	slli	r5, r5, 1
	srli	r12, r12, 1

	/*
	 * }
	 */
	subi	r14, r14, 1
	bne	r14, zero, multiply_loop

	/*
	 * Multiply emulation loop done.
	 */

	/* Now
	 * r3 = multiplicand
	 * r4 = OPX
	 * r6 = 4 * dest_register (used later as offset to sp)
	 * r7 = temp
	 * r9 = mul_product
	 * r10 = mulxuu_product
	 * r11 = original multiplier
	 */

	/* Calculate address for result from 4 * dest_register */
	add	r6, r6, sp

	/*
	 * Select/compute the result based on OPX.
	 */

	/* OPX == mul? Then store. */
	xori	r7, r4, 0x27
	beq	r7, zero, store_product

	/* It's one of the mulx.. opcodes. Move over the result. */
	mov	r9, r10

	/* OPX == mulxuu? Then store. */
	xori	r7, r4, 0x07
	beq	r7, zero, store_product

	/* Compute mulxsu
	 *
	 * mulxsu = mulxuu - ((rA < 0) ? rB : 0);
	 */
	bge	r3, zero, mulxsu_skip
	sub	r9, r9, r11
mulxsu_skip:

	/* OPX == mulxsu? Then store. */
	xori	r7, r4, 0x17
	beq	r7, zero, store_product

	/* Compute mulxss
	 *
	 * mulxss = mulxsu - ((rB < 0) ? rA : 0);
	 */
	bge	r11, zero, mulxss_skip
	sub	r9, r9, r3
mulxss_skip:
	/* At this point, assume that OPX is mulxss, so store */

store_product:
	stw	r9, 0(r6)
540
541
restore_registers:
	/*
	 * Reload every register from the 128-byte operand frame and return
	 * from the exception. The destination register's slot was
	 * overwritten with the emulated result, so reloading it delivers
	 * that result.
	 *
	 * NOTE(review): ra is reloaded from 120(sp), whereas results are
	 * written at 4*regnum(sp); a result for destination r31 (124(sp))
	 * would not be picked up here -- confirm whether that is intended.
	 */

	/* No need to restore r0. */
	ldw	r5, 100(sp)	/* estatus was saved in this slot */
	wrctl	estatus, r5

	ldw	r1, 4(sp)
	ldw	r2, 8(sp)
	ldw	r3, 12(sp)
	ldw	r4, 16(sp)
	ldw	r5, 20(sp)
	ldw	r6, 24(sp)
	ldw	r7, 28(sp)
	ldw	r8, 32(sp)
	ldw	r9, 36(sp)
	ldw	r10, 40(sp)
	ldw	r11, 44(sp)
	ldw	r12, 48(sp)
	ldw	r13, 52(sp)
	ldw	r14, 56(sp)
	ldw	r15, 60(sp)
	ldw	r16, 64(sp)
	ldw	r17, 68(sp)
	ldw	r18, 72(sp)
	ldw	r19, 76(sp)
	ldw	r20, 80(sp)
	ldw	r21, 84(sp)
	ldw	r22, 88(sp)
	ldw	r23, 92(sp)
	/* Does not need to restore et */
	ldw	gp, 104(sp)

	ldw	fp, 112(sp)
	ldw	ea, 116(sp)
	ldw	ra, 120(sp)
	ldw	sp, 108(sp)	/* last restore sp */
	eret

	.set at
	.set break
581

source code of linux/arch/nios2/kernel/insnemu.S