/* Pentium optimized __mpn_rshift --
   Copyright (C) 1992-2024 Free Software Foundation, Inc.
   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 2.1 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library; see the file COPYING.LIB.  If not,
   see <https://www.gnu.org/licenses/>.  */

#include "sysdep.h"
#include "asm-syntax.h"

#define PARMS	4+16		/* return address (4) plus 4 saved regs (16) */
#define RES	PARMS
#define S	RES+4
#define SIZE	S+4
#define CNT	SIZE+4
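
/* The offsets above match a function of the usual GMP form (a sketch;
   the mp_* typedef names come from GMP, not from this file):

     mp_limb_t __mpn_rshift (mp_ptr res_ptr, mp_srcptr s_ptr,
                             mp_size_t size, unsigned int cnt);

   Shift {s_ptr, size} right by cnt bits (0 < cnt < 32), store the
   result at {res_ptr, size}, and return the bits shifted out of the
   least significant limb, left-justified in the returned limb.  */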

	.text
ENTRY (__mpn_rshift)

	pushl	%edi
	cfi_adjust_cfa_offset (4)
	pushl	%esi
	cfi_adjust_cfa_offset (4)
	pushl	%ebp
	cfi_adjust_cfa_offset (4)
	cfi_rel_offset (ebp, 0)
	pushl	%ebx
	cfi_adjust_cfa_offset (4)

	movl	RES(%esp),%edi
	cfi_rel_offset (edi, 12)
	movl	S(%esp),%esi
	cfi_rel_offset (esi, 8)
	movl	SIZE(%esp),%ebx
	cfi_rel_offset (ebx, 0)
	movl	CNT(%esp),%ecx
/* A shift count of 1 can use faster rcr-based code below, provided the
   overlap of source and destination permits working from the most
   significant end.  */
	cmp	$1,%ecx
	jne	L(normal)
	leal	4(%edi),%eax
	cmpl	%esi,%eax
	jnc	L(special)	/* jump if res_ptr + 1 >= s_ptr */
	leal	(%edi,%ebx,4),%eax
	cmpl	%eax,%esi
	jnc	L(special)	/* jump if s_ptr >= res_ptr + size */
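
/* An illustrative C sketch of the dispatch above (res_ptr, s_ptr, size
   and cnt as in the prototype sketch; `special' and `normal' are the
   labels below):

     if (cnt == 1
         && (res_ptr + 1 >= s_ptr          // res_ptr >= s_ptr - 1
             || s_ptr >= res_ptr + size))  // operands don't overlap
       goto special;                       // rcr-based shift-by-1 loop
     goto normal;                          // generic shrdl loop
*/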

L(normal):
	movl	(%esi),%edx
	addl	$4,%esi
	xorl	%eax,%eax
	shrdl	%cl,%edx,%eax		/* compute carry limb */
	pushl	%eax			/* push carry limb onto stack */
	cfi_adjust_cfa_offset (4)
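
/* %eax now holds s_ptr[0] << (32 - cnt), the bits shifted out of the
   least significant limb.  This is the function's return value; it
   stays on the stack until it is popped back into %eax at the end.  */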

	decl	%ebx
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	shrl	$3,%ebx
	jz	L(end)

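/* The unrolled loop below handles eight limbs per iteration.  Each
   result limb follows the recurrence (a C sketch, same names as above):

     res_ptr[i] = (s_ptr[i] >> cnt) | (s_ptr[i + 1] << (32 - cnt));

   which is exactly what each shrdl computes, with the next limb carried
   across iterations in a register; the load from 28(%edi) at the loop
   top only touches the destination cache line early, so the eight
   stores hit a line that is already present.  */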
	movl	(%edi),%eax		/* fetch destination cache line */

	ALIGN (2)
L(oop):	movl	28(%edi),%eax		/* fetch destination cache line */
	movl	%edx,%ebp

	movl	(%esi),%eax
	movl	4(%esi),%edx
	shrdl	%cl,%eax,%ebp
	shrdl	%cl,%edx,%eax
	movl	%ebp,(%edi)
	movl	%eax,4(%edi)

	movl	8(%esi),%ebp
	movl	12(%esi),%eax
	shrdl	%cl,%ebp,%edx
	shrdl	%cl,%eax,%ebp
	movl	%edx,8(%edi)
	movl	%ebp,12(%edi)

	movl	16(%esi),%edx
	movl	20(%esi),%ebp
	shrdl	%cl,%edx,%eax
	shrdl	%cl,%ebp,%edx
	movl	%eax,16(%edi)
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	shrdl	%cl,%eax,%ebp
	shrdl	%cl,%edx,%eax
	movl	%ebp,24(%edi)
	movl	%eax,28(%edi)

	addl	$32,%esi
	addl	$32,%edi
	decl	%ebx
	jnz	L(oop)

L(end):	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	andl	$7,%ebx
	jz	L(end2)
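/* Handle the remaining (size - 1) mod 8 limbs one at a time, with the
   same per-limb recurrence as the unrolled loop above.  */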
L(oop2):
	movl	(%esi),%eax
	shrdl	%cl,%eax,%edx		/* compute result limb */
	movl	%edx,(%edi)
	movl	%eax,%edx
	addl	$4,%esi
	addl	$4,%edi
	decl	%ebx
	jnz	L(oop2)

L(end2):
	shrl	%cl,%edx		/* compute most significant limb */
	movl	%edx,(%edi)		/* store it */

	popl	%eax			/* pop carry limb */
	cfi_adjust_cfa_offset (-4)

	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
	popl	%ebp
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebp)
	popl	%esi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (esi)
	popl	%edi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (edi)

	ret

/* The shift-by-1 code below works from the most significant end of the
   arrays, which is only permissible for the overlap cases selected by
   the checks above; since the function is documented to work for
   overlapping source and destination, all other cases must take the
   normal loop.  */
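
/* A C sketch of this shift-by-1 path (illustrative only; same names as
   in the prototype sketch above):

     mp_limb_t limb = s_ptr[size - 1];
     mp_limb_t carry = limb & 1;
     res_ptr[size - 1] = limb >> 1;
     for (mp_size_t i = size - 2; i >= 0; i--)
       {
         limb = s_ptr[i];
         res_ptr[i] = (limb >> 1) | (carry << 31);
         carry = limb & 1;
       }
     return carry << 31;        // bit shifted out of s_ptr[0]

   The assembly keeps `carry' in the CPU carry flag via rcrl (hence the
   leal pointer updates, which leave flags alone) and delays each store
   by one limb, which is what makes the res_ptr + 1 == s_ptr boundary
   case accepted by the checks at the top safe.  */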

	cfi_adjust_cfa_offset (16)
	cfi_rel_offset (edi, 12)
	cfi_rel_offset (esi, 8)
	cfi_rel_offset (ebp, 4)
	cfi_rel_offset (ebx, 0)
L(special):
	leal	-4(%edi,%ebx,4),%edi
	leal	-4(%esi,%ebx,4),%esi

	movl	(%esi),%edx
	subl	$4,%esi

	decl	%ebx
	pushl	%ebx
	cfi_adjust_cfa_offset (4)
	shrl	$3,%ebx

	shrl	$1,%edx			/* most significant result limb; CF = bit shifted out */
	incl	%ebx
	decl	%ebx			/* test %ebx for zero without touching CF */
	jz	L(Lend)

	movl	(%edi),%eax		/* fetch destination cache line */

	ALIGN (2)
L(Loop):
	movl	-28(%edi),%eax		/* fetch destination cache line */
	movl	%edx,%ebp

	movl	(%esi),%eax
	movl	-4(%esi),%edx
	rcrl	$1,%eax
	movl	%ebp,(%edi)
	rcrl	$1,%edx
	movl	%eax,-4(%edi)

	movl	-8(%esi),%ebp
	movl	-12(%esi),%eax
	rcrl	$1,%ebp
	movl	%edx,-8(%edi)
	rcrl	$1,%eax
	movl	%ebp,-12(%edi)

	movl	-16(%esi),%edx
	movl	-20(%esi),%ebp
	rcrl	$1,%edx
	movl	%eax,-16(%edi)
	rcrl	$1,%ebp
	movl	%edx,-20(%edi)

	movl	-24(%esi),%eax
	movl	-28(%esi),%edx
	rcrl	$1,%eax
	movl	%ebp,-24(%edi)
	rcrl	$1,%edx
	movl	%eax,-28(%edi)

	leal	-32(%esi),%esi		/* use leal not to clobber carry */
	leal	-32(%edi),%edi
	decl	%ebx
	jnz	L(Loop)

L(Lend):
	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	sbbl	%eax,%eax		/* save carry in %eax */
	andl	$7,%ebx
	jz	L(Lend2)
	addl	%eax,%eax		/* restore carry from eax */
L(Loop2):
	movl	%edx,%ebp
	movl	(%esi),%edx
	rcrl	$1,%edx
	movl	%ebp,(%edi)

	leal	-4(%esi),%esi		/* use leal not to clobber carry */
	leal	-4(%edi),%edi
	decl	%ebx
	jnz	L(Loop2)

	jmp	L(L1)
L(Lend2):
	addl	%eax,%eax		/* restore carry from eax */
L(L1):	movl	%edx,(%edi)		/* store last limb */

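/* Form the return limb: rotate the saved carry, the bit shifted out of
   the least significant limb, into bit 31 of %eax.  */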
	movl	$0,%eax
	rcrl	$1,%eax

	popl	%ebx
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebx)
	popl	%ebp
	cfi_adjust_cfa_offset (-4)
	cfi_restore (ebp)
	popl	%esi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (esi)
	popl	%edi
	cfi_adjust_cfa_offset (-4)
	cfi_restore (edi)

	ret
END (__mpn_rshift)