/* Pentium optimized __mpn_rshift --
   Copyright (C) 1992-2024 Free Software Foundation, Inc.
   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 2.1 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library; see the file COPYING.LIB.  If not,
   see <https://www.gnu.org/licenses/>.  */

#include "sysdep.h"
#include "asm-syntax.h"

#define PARMS	4+16		/* return address (4) plus 4 saved regs (16) */
#define RES	PARMS
#define S	RES+4
#define SIZE	S+4
#define CNT	SIZE+4
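
/* The offsets above match a function of the usual GMP form (a sketch;
   the mp_* typedef names come from GMP, not from this file):

     mp_limb_t __mpn_rshift (mp_ptr res_ptr, mp_srcptr s_ptr,
                             mp_size_t size, unsigned int cnt);

   Shift {s_ptr, size} right by cnt bits (0 < cnt < 32), store the
   result at {res_ptr, size}, and return the bits shifted out of the
   least significant limb, left-justified in the returned limb.  */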

	.text
ENTRY (__mpn_rshift)

	pushl	%edi
	cfi_adjust_cfa_offset (4)
	pushl	%esi
	cfi_adjust_cfa_offset (4)
	pushl	%ebp
	cfi_adjust_cfa_offset (4)
	cfi_rel_offset (ebp, 0)
	pushl	%ebx
	cfi_adjust_cfa_offset (4)

	movl	RES(%esp),%edi
	cfi_rel_offset (edi, 12)
	movl	S(%esp),%esi
	cfi_rel_offset (esi, 8)
	movl	SIZE(%esp),%ebx
	cfi_rel_offset (ebx, 0)
	movl	CNT(%esp),%ecx
/* A shift count of 1 can use faster rcr-based code below, provided the
   overlap of source and destination permits working from the most
   significant end.  */
	cmp	$1,%ecx
	jne	L(normal)
	leal	4(%edi),%eax
	cmpl	%esi,%eax
	jnc	L(special)	/* jump if res_ptr + 1 >= s_ptr */
	leal	(%edi,%ebx,4),%eax
	cmpl	%eax,%esi
	jnc	L(special)	/* jump if s_ptr >= res_ptr + size */
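
/* An illustrative C sketch of the dispatch above (res_ptr, s_ptr, size
   and cnt as in the prototype sketch; `special' and `normal' are the
   labels below):

     if (cnt == 1
         && (res_ptr + 1 >= s_ptr          // res_ptr >= s_ptr - 1
             || s_ptr >= res_ptr + size))  // operands don't overlap
       goto special;                       // rcr-based shift-by-1 loop
     goto normal;                          // generic shrdl loop
*/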

L(normal):
	movl	(%esi),%edx
	addl	$4,%esi
	xorl	%eax,%eax
	shrdl	%cl,%edx,%eax		/* compute carry limb */
	pushl	%eax			/* push carry limb onto stack */
	cfi_adjust_cfa_offset (4)
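
/* %eax now holds s_ptr[0] << (32 - cnt), the bits shifted out of the
   least significant limb.  This is the function's return value; it
   stays on the stack until it is popped back into %eax at the end.  */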

	decl	%ebx
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	shrl	$3,%ebx
	jz	L(end)

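/* The unrolled loop below handles eight limbs per iteration.  Each
   result limb follows the recurrence (a C sketch, same names as above):

     res_ptr[i] = (s_ptr[i] >> cnt) | (s_ptr[i + 1] << (32 - cnt));

   which is exactly what each shrdl computes, with the next limb carried
   across iterations in a register; the load from 28(%edi) at the loop
   top only touches the destination cache line early, so the eight
   stores hit a line that is already present.  */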
	movl	(%edi),%eax		/* fetch destination cache line */

	ALIGN (2)
L(oop):	movl	28(%edi),%eax		/* fetch destination cache line */
	movl	%edx,%ebp

	movl	(%esi),%eax
	movl	4(%esi),%edx
	shrdl	%cl,%eax,%ebp
	shrdl	%cl,%edx,%eax
	movl	%ebp,(%edi)
	movl	%eax,4(%edi)

	movl	8(%esi),%ebp
	movl	12(%esi),%eax
	shrdl	%cl,%ebp,%edx
	shrdl	%cl,%eax,%ebp
	movl	%edx,8(%edi)
	movl	%ebp,12(%edi)

	movl	16(%esi),%edx
	movl	20(%esi),%ebp
	shrdl	%cl,%edx,%eax
	shrdl	%cl,%ebp,%edx
	movl	%eax,16(%edi)
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	shrdl	%cl,%eax,%ebp
	shrdl	%cl,%edx,%eax
	movl	%ebp,24(%edi)
	movl	%eax,28(%edi)

	addl	$32,%esi
	addl	$32,%edi
	decl	%ebx
	jnz	L(oop)

L(end):	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	andl	$7,%ebx
	jz	L(end2)
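/* Handle the remaining (size - 1) mod 8 limbs one at a time, with the
   same per-limb recurrence as the unrolled loop above.  */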
L(oop2):
	movl	(%esi),%eax
	shrdl	%cl,%eax,%edx		/* compute result limb */
	movl	%edx,(%edi)
	movl	%eax,%edx
	addl	$4,%esi
	addl	$4,%edi
	decl	%ebx
	jnz	L(oop2)

L(end2):
	shrl	%cl,%edx		/* compute most significant limb */
	movl	%edx,(%edi)		/* store it */

	popl	%eax			/* pop carry limb */
	cfi_adjust_cfa_offset (-4)

	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
	popl	%ebp
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebp)
	popl	%esi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (esi)
	popl	%edi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (edi)

	ret

/* The shift-by-1 code below works from the most significant end of the
   arrays, which is only permissible for the overlap cases selected by
   the checks above; since the function is documented to work for
   overlapping source and destination, all other cases must take the
   normal loop.  */
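
/* A C sketch of this shift-by-1 path (illustrative only; same names as
   in the prototype sketch above):

     mp_limb_t limb = s_ptr[size - 1];
     mp_limb_t carry = limb & 1;
     res_ptr[size - 1] = limb >> 1;
     for (mp_size_t i = size - 2; i >= 0; i--)
       {
         limb = s_ptr[i];
         res_ptr[i] = (limb >> 1) | (carry << 31);
         carry = limb & 1;
       }
     return carry << 31;        // bit shifted out of s_ptr[0]

   The assembly keeps `carry' in the CPU carry flag via rcrl (hence the
   leal pointer updates, which leave flags alone) and delays each store
   by one limb, which is what makes the res_ptr + 1 == s_ptr boundary
   case accepted by the checks at the top safe.  */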

	cfi_adjust_cfa_offset (16)
	cfi_rel_offset (edi, 12)
	cfi_rel_offset (esi, 8)
	cfi_rel_offset (ebp, 4)
	cfi_rel_offset (ebx, 0)
L(special):
	leal	-4(%edi,%ebx,4),%edi
	leal	-4(%esi,%ebx,4),%esi

	movl	(%esi),%edx
	subl	$4,%esi

	decl	%ebx
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	shrl	$3,%ebx

	shrl	$1,%edx			/* most significant result limb; CF = bit shifted out */
	incl	%ebx
	decl	%ebx			/* test %ebx for zero without touching CF */
	jz	L(Lend)

	movl	(%edi),%eax		/* fetch destination cache line */

	ALIGN (2)
L(Loop):
	movl	-28(%edi),%eax		/* fetch destination cache line */
	movl	%edx,%ebp

	movl	(%esi),%eax
	movl	-4(%esi),%edx
	rcrl	$1,%eax
	movl	%ebp,(%edi)
	rcrl	$1,%edx
	movl	%eax,-4(%edi)

	movl	-8(%esi),%ebp
	movl	-12(%esi),%eax
	rcrl	$1,%ebp
	movl	%edx,-8(%edi)
	rcrl	$1,%eax
	movl	%ebp,-12(%edi)

	movl	-16(%esi),%edx
	movl	-20(%esi),%ebp
	rcrl	$1,%edx
	movl	%eax,-16(%edi)
	rcrl	$1,%ebp
	movl	%edx,-20(%edi)

	movl	-24(%esi),%eax
	movl	-28(%esi),%edx
	rcrl	$1,%eax
	movl	%ebp,-24(%edi)
	rcrl	$1,%edx
	movl	%eax,-28(%edi)

	leal	-32(%esi),%esi		/* use leal not to clobber carry */
	leal	-32(%edi),%edi
	decl	%ebx
	jnz	L(Loop)

L(Lend):
	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	sbbl	%eax,%eax		/* save carry in %eax */
	andl	$7,%ebx
	jz	L(Lend2)
	addl	%eax,%eax		/* restore carry from eax */
L(Loop2):
	movl	%edx,%ebp
	movl	(%esi),%edx
	rcrl	$1,%edx
	movl	%ebp,(%edi)

	leal	-4(%esi),%esi		/* use leal not to clobber carry */
	leal	-4(%edi),%edi
	decl	%ebx
	jnz	L(Loop2)

	jmp	L(L1)
L(Lend2):
	addl	%eax,%eax		/* restore carry from eax */
L(L1):	movl	%edx,(%edi)		/* store last limb */

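/* Form the return limb: rotate the saved carry, the bit shifted out of
   the least significant limb, into bit 31 of %eax.  */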
	movl	$0,%eax
	rcrl	$1,%eax

	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
	popl	%ebp
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebp)
	popl	%esi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (esi)
	popl	%edi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (edi)

	ret
END (__mpn_rshift)