remq.S source code [glibc/sysdeps/alpha/remq.S]

1	/ Copyright (C) 2004-2022 Free Software Foundation, Inc.*
2	This file is part of the GNU C Library.
3
4	The GNU C Library is free software; you can redistribute it and/or
5	modify it under the terms of the GNU Lesser General Public
6	License as published by the Free Software Foundation; either
7	version 2.1 of the License, or (at your option) any later version.
8
9	The GNU C Library is distributed in the hope that it will be useful,
10	but WITHOUT ANY WARRANTY; without even the implied warranty of
11	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12	Lesser General Public License for more details.
13
14	You should have received a copy of the GNU Lesser General Public
15	License along with the GNU C Library. If not, see
16	<https://www.gnu.org/licenses/>. /*
17
18	#include "div_libc.h"
19
20
21	/ 64-bit signed long remainder. These are not normal C functions. Argument*
22	registers are t10 and t11, the result goes in t12. Only t12 and AT may
23	be clobbered.
24
25	Theory of operation here is that we can use the FPU divider for virtually
26	all operands that we see: all dividend values between -253 and 253-1
27	can be computed directly. Note that divisor values need not be checked
28	against that range because the rounded fp value will be close enough such
29	that the quotient is < 1, which will properly be truncated to zero when we
30	convert back to integer.
31
32	When the dividend is outside the range for which we can compute exact
33	results, we use the fp quotent as an estimate from which we begin refining
34	an exact integral value. This reduces the number of iterations in the
35	shift-and-subtract loop significantly.
36
37	The FPCR save/restore is due to the fact that the EV6 _will_ set FPCR_INE
38	for cvttq/c even without /sui being set. It will not, however, properly
39	raise the exception, so we don't have to worry about FPCR_INED being clear
40	and so dying by SIGFPE. /*
41
42	.text
43	.align `4`
44	.globl __remq
45	.type __remq, @funcnoplt
46	.usepv __remq, no
47
48	cfi_startproc
49	cfi_return_column (RA)
50	__remq:
51	lda sp, -FRAME(sp)
52	cfi_def_cfa_offset (FRAME)
53	CALL_MCOUNT
54
55	/ Get the fp divide insn issued as quickly as possible. After*
56	that's done, we have at least 22 cycles until its results are
57	ready -- all the time in the world to figure out how we're
58	going to use the results. /*
59	stt $f0, `0`(sp)
60	excb
61	beq Y, DIVBYZERO
62
63	stt $f1, `8`(sp)
64	stt $f3, `48`(sp)
65	cfi_rel_offset ($f0, `0`)
66	cfi_rel_offset ($f1, `8`)
67	cfi_rel_offset ($f3, `48`)
68	mf_fpcr $f3
69
70	_ITOFT2 X, $f0, `16`, Y, $f1, `24`
71	cvtqt $f0, $f0
72	cvtqt $f1, $f1
73	divt/c $f0, $f1, $f0
74
75	/ Check to see if X fit in the double as an exact value. /
76	sll X, (`64`-`53`), AT
77	ldt $f1, `8`(sp)
78	sra AT, (`64`-`53`), AT
79	cmpeq X, AT, AT
80	beq AT, $x_big
81
82	/ If we get here, we're expecting exact results from the division.*
83	Do nothing else besides convert, compute remainder, clean up. /*
84	cvttq/c $f0, $f0
85	excb
86	mt_fpcr $f3
87	_FTOIT $f0, AT, `16`
88	mulq AT, Y, AT
89	ldt $f0, `0`(sp)
90	ldt $f3, `48`(sp)
91	cfi_restore ($f1)
92	cfi_remember_state
93	cfi_restore ($f0)
94	cfi_restore ($f3)
95	cfi_def_cfa_offset (`0`)
96	lda sp, FRAME(sp)
97	subq X, AT, RV
98	ret $`31`, (RA), `1`
99
100	.align `4`
101	cfi_restore_state
102	$x_big:
103	/ If we get here, X is large enough that we don't expect exact*
104	results, and neither X nor Y got mis-translated for the fp
105	division. Our task is to take the fp result, figure out how
106	far it's off from the correct result and compute a fixup. /*
107	stq t0, `16`(sp)
108	stq t1, `24`(sp)
109	stq t2, `32`(sp)
110	stq t5, `40`(sp)
111	cfi_rel_offset (t0, `16`)
112	cfi_rel_offset (t1, `24`)
113	cfi_rel_offset (t2, `32`)
114	cfi_rel_offset (t5, `40`)
115
116	#define Q t0 /* quotient */
117	#define R RV /* remainder */
118	#define SY t1 /* scaled Y */
119	#define S t2 /* scalar */
120	#define QY t3 /* QY /
121
122	/ The fixup code below can only handle unsigned values. /
123	or X, Y, AT
124	mov $`31`, t5
125	blt AT, $fix_sign_in
126	$fix_sign_in_ret1:
127	cvttq/c $f0, $f0
128
129	_FTOIT $f0, Q, `8`
130	.align `3`
131	$fix_sign_in_ret2:
132	ldt $f0, `0`(sp)
133	stq t3, `0`(sp)
134	cfi_restore ($f0)
135	cfi_rel_offset (t3, `0`)
136
137	mulq Q, Y, QY
138	excb
139	stq t4, `8`(sp)
140	mt_fpcr $f3
141	cfi_rel_offset (t4, `8`)
142
143	subq QY, X, R
144	mov Y, SY
145	mov `1`, S
146	bgt R, $q_high
147
148	$q_high_ret:
149	subq X, QY, R
150	mov Y, SY
151	mov `1`, S
152	bgt R, $q_low
153
154	$q_low_ret:
155	ldq t0, `16`(sp)
156	ldq t1, `24`(sp)
157	ldq t2, `32`(sp)
158	bne t5, $fix_sign_out
159
160	$fix_sign_out_ret:
161	ldq t3, `0`(sp)
162	ldq t4, `8`(sp)
163	ldq t5, `40`(sp)
164	ldt $f3, `48`(sp)
165	lda sp, FRAME(sp)
166	cfi_remember_state
167	cfi_restore (t0)
168	cfi_restore (t1)
169	cfi_restore (t2)
170	cfi_restore (t3)
171	cfi_restore (t4)
172	cfi_restore (t5)
173	cfi_restore ($f3)
174	cfi_def_cfa_offset (`0`)
175	ret $`31`, (RA), `1`
176
177	.align `4`
178	cfi_restore_state
179	/ The quotient that we computed was too large. We need to reduce*
180	it by S such that YS >= R. Obviously the closer we get to the*
181	correct value the better, but overshooting high is ok, as we'll
182	fix that up later. /*
183	`0`:
184	addq SY, SY, SY
185	addq S, S, S
186	$q_high:
187	cmpult SY, R, AT
188	bne AT, `0b`
189
190	subq Q, S, Q
191	unop
192	subq QY, SY, QY
193	br $q_high_ret
194
195	.align `4`
196	/ The quotient that we computed was too small. Divide Y by the*
197	current remainder (R) and add that to the existing quotient (Q).
198	The expectation, of course, is that R is much smaller than X. /*
199	/ Begin with a shift-up loop. Compute S such that YS >= R. We
200	already have a copy of Y in SY and the value 1 in S. /*
201	`0`:
202	addq SY, SY, SY
203	addq S, S, S
204	$q_low:
205	cmpult SY, R, AT
206	bne AT, `0b`
207
208	/ Shift-down and subtract loop. Each iteration compares our scaled*
209	Y (SY) with the remainder (R); if SY <= R then X is divisible by
210	Y's scalar (S) so add it to the quotient (Q). /*
211	`2`: addq Q, S, t3
212	srl S, `1`, S
213	cmpule SY, R, AT
214	subq R, SY, t4
215
216	cmovne AT, t3, Q
217	cmovne AT, t4, R
218	srl SY, `1`, SY
219	bne S, `2b`
220
221	br $q_low_ret
222
223	.align `4`
224	$fix_sign_in:
225	/ If we got here, then X\|Y is negative. Need to adjust everything*
226	such that we're doing unsigned division in the fixup loop. /*
227	/ T5 records the changes we had to make:*
228	bit 0: set if X was negated. Note that the sign of the
229	remainder follows the sign of the divisor.
230	bit 2: set if Y was negated.
231	*/
232	xor X, Y, t1
233	cmplt X, `0`, t5
234	negq X, t0
235	cmovne t5, t0, X
236
237	cmplt Y, `0`, AT
238	negq Y, t0
239	s4addq AT, t5, t5
240	cmovne AT, t0, Y
241
242	bge t1, $fix_sign_in_ret1
243	cvttq/c $f0, $f0
244	_FTOIT $f0, Q, `8`
245	.align `3`
246	negq Q, Q
247	br $fix_sign_in_ret2
248
249	.align `4`
250	$fix_sign_out:
251	/ Now we get to undo what we did above. /
252	/ ??? Is this really faster than just increasing the size of*
253	the stack frame and storing X and Y in memory? /*
254	and t5, `4`, AT
255	negq Y, t4
256	cmovne AT, t4, Y
257
258	negq X, t4
259	cmovlbs t5, t4, X
260	negq RV, t4
261	cmovlbs t5, t4, RV
262
263	br $fix_sign_out_ret
264
265	cfi_endproc
266	.size __remq, .-__remq
267
268	DO_DIVBYZERO
269

source code of glibc/sysdeps/alpha/remq.S