/* Pentium optimized __mpn_lshift --
   Copyright (C) 1992-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include "sysdep.h"
#include "asm-syntax.h"

#define PARMS   4+16            /* space for 4 saved regs */
#define RES     PARMS
#define S       RES+4
#define SIZE    S+4
#define CNT     SIZE+4
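
/* INPUT PARAMETERS
     res_ptr    (sp + 4)
     s_ptr      (sp + 8)
     size       (sp + 12)
     cnt        (sp + 16)

   Shifts {s_ptr, size} left by cnt bits (1 <= cnt < 32), stores the
   result at {res_ptr, size}, and returns the bits shifted out at the
   high end.  A C sketch of the semantics, assuming 32-bit limbs; it
   illustrates the result only, not the scheduling used below:

     mp_limb_t
     mpn_lshift (mp_limb_t *res_ptr, const mp_limb_t *s_ptr,
                 mp_size_t size, unsigned int cnt)
     {
       mp_limb_t retval = s_ptr[size - 1] >> (32 - cnt);
       for (mp_size_t i = size - 1; i > 0; i--)
         res_ptr[i] = (s_ptr[i] << cnt) | (s_ptr[i - 1] >> (32 - cnt));
       res_ptr[0] = s_ptr[0] << cnt;
       return retval;
     }
*/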

        .text
ENTRY (__mpn_lshift)

        pushl   %edi
        cfi_adjust_cfa_offset (4)
        pushl   %esi
        cfi_adjust_cfa_offset (4)
        pushl   %ebp
        cfi_adjust_cfa_offset (4)
        cfi_rel_offset (ebp, 0)
        pushl   %ebx
        cfi_adjust_cfa_offset (4)

        movl    RES(%esp),%edi
        cfi_rel_offset (edi, 12)
        movl    S(%esp),%esi
        cfi_rel_offset (esi, 8)
        movl    SIZE(%esp),%ebx
        cfi_rel_offset (ebx, 0)
        movl    CNT(%esp),%ecx
/* We can use faster add-with-carry code for shift-by-1, but it walks
   the operands from the least significant end, so it is only safe when
   res_ptr <= s_ptr + 1 or res_ptr >= s_ptr + size (in limbs).  */
        cmp     $1,%ecx
        jne     L(normal)
        leal    4(%esi),%eax
        cmpl    %edi,%eax
        jnc     L(special)      /* jump if s_ptr + 1 >= res_ptr */
        leal    (%esi,%ebx,4),%eax
        cmpl    %eax,%edi
        jnc     L(special)      /* jump if res_ptr >= s_ptr + size */
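/* Generic path: shift from the most significant limb downward, which is
   safe both for disjoint operands and for the overlap the mpn interface
   permits (res_ptr >= s_ptr).  */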
L(normal):
        leal    -4(%edi,%ebx,4),%edi
        leal    -4(%esi,%ebx,4),%esi

        movl    (%esi),%edx
        subl    $4,%esi
        xorl    %eax,%eax
        shldl   %cl,%edx,%eax   /* compute carry limb */
        pushl   %eax            /* push carry limb onto stack */
        cfi_adjust_cfa_offset (4)

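/* %ebx counts the remaining size-1 limbs: the quotient by 8 drives the
   unrolled loop below, and the full count (saved on the stack) yields
   the remainder handled one limb at a time in L(oop2).  */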
        decl    %ebx
        pushl   %ebx
        cfi_adjust_cfa_offset (4)
        shrl    $3,%ebx
        jz      L(end)

        movl    (%edi),%eax     /* fetch destination cache line */

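/* Main loop: eight limbs per iteration, walking from high addresses to
   low.  On entry %edx holds the limb to shift next; each shldl shifts
   a limb left by %cl, pulling its low bits from the limb below it.  */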
        ALIGN (2)
L(oop): movl    -28(%edi),%eax  /* fetch destination cache line */
        movl    %edx,%ebp

        movl    (%esi),%eax
        movl    -4(%esi),%edx
        shldl   %cl,%eax,%ebp
        shldl   %cl,%edx,%eax
        movl    %ebp,(%edi)
        movl    %eax,-4(%edi)

        movl    -8(%esi),%ebp
        movl    -12(%esi),%eax
        shldl   %cl,%ebp,%edx
        shldl   %cl,%eax,%ebp
        movl    %edx,-8(%edi)
        movl    %ebp,-12(%edi)

        movl    -16(%esi),%edx
        movl    -20(%esi),%ebp
        shldl   %cl,%edx,%eax
        shldl   %cl,%ebp,%edx
        movl    %eax,-16(%edi)
        movl    %edx,-20(%edi)

        movl    -24(%esi),%eax
        movl    -28(%esi),%edx
        shldl   %cl,%eax,%ebp
        shldl   %cl,%edx,%eax
        movl    %ebp,-24(%edi)
        movl    %eax,-28(%edi)

        subl    $32,%esi
        subl    $32,%edi
        decl    %ebx
        jnz     L(oop)

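/* Handle the remaining (size-1) mod 8 limbs one at a time.  */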
L(end): popl    %ebx
        cfi_adjust_cfa_offset (-4)
        andl    $7,%ebx
        jz      L(end2)
L(oop2):
        movl    (%esi),%eax
        shldl   %cl,%eax,%edx
        movl    %edx,(%edi)
        movl    %eax,%edx
        subl    $4,%esi
        subl    $4,%edi
        decl    %ebx
        jnz     L(oop2)

L(end2):
        shll    %cl,%edx        /* compute least significant limb */
        movl    %edx,(%edi)     /* store it */

        popl    %eax            /* pop carry limb */
        cfi_adjust_cfa_offset (-4)
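/* The carry limb in %eax is the return value: the bits shifted out at
   the high end.  */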

        popl    %ebx
        cfi_adjust_cfa_offset (-4)
        cfi_restore (ebx)
        popl    %ebp
        cfi_adjust_cfa_offset (-4)
        cfi_restore (ebp)
        popl    %esi
        cfi_adjust_cfa_offset (-4)
        cfi_restore (esi)
        popl    %edi
        cfi_adjust_cfa_offset (-4)
        cfi_restore (edi)

        ret

/* Shift-by-1 code.  It loops from the least significant end of the
   arrays, which is only permissible when source and destination do not
   overlap in that direction (res_ptr <= s_ptr + 1 or
   res_ptr >= s_ptr + size, as checked above).  The general code above
   handles the other cases, since the function is documented to work for
   overlapping source and destination.  */
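/* This entry is reached with the four registers from the prologue still
   pushed, so re-establish the matching CFI state.  */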
        cfi_adjust_cfa_offset (16)
        cfi_rel_offset (edi, 12)
        cfi_rel_offset (esi, 8)
        cfi_rel_offset (ebp, 4)
        cfi_rel_offset (ebx, 0)
L(special):
        movl    (%esi),%edx
        addl    $4,%esi

        decl    %ebx
        pushl   %ebx
        cfi_adjust_cfa_offset (4)
        shrl    $3,%ebx

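/* Shifting by one is done as x + x, so the out-shifted bit chains
   through the carry flag via adcl.  incl and decl do not touch CF,
   which lets us test %ebx for zero without losing that carry.  */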
        addl    %edx,%edx
        incl    %ebx
        decl    %ebx
        jz      L(Lend)

        movl    (%edi),%eax     /* fetch destination cache line */

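/* Main shift-by-1 loop: eight limbs per iteration, low addresses to
   high; loads and stores are interleaved with the adcl chain so they
   can pair in the Pentium pipelines.  */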
        ALIGN (2)
L(Loop):
        movl    28(%edi),%eax   /* fetch destination cache line */
        movl    %edx,%ebp

        movl    (%esi),%eax
        movl    4(%esi),%edx
        adcl    %eax,%eax
        movl    %ebp,(%edi)
        adcl    %edx,%edx
        movl    %eax,4(%edi)

        movl    8(%esi),%ebp
        movl    12(%esi),%eax
        adcl    %ebp,%ebp
        movl    %edx,8(%edi)
        adcl    %eax,%eax
        movl    %ebp,12(%edi)

        movl    16(%esi),%edx
        movl    20(%esi),%ebp
        adcl    %edx,%edx
        movl    %eax,16(%edi)
        adcl    %ebp,%ebp
        movl    %edx,20(%edi)

        movl    24(%esi),%eax
        movl    28(%esi),%edx
        adcl    %eax,%eax
        movl    %ebp,24(%edi)
        adcl    %edx,%edx
        movl    %eax,28(%edi)

        leal    32(%esi),%esi   /* use leal not to clobber carry */
        leal    32(%edi),%edi
        decl    %ebx
        jnz     L(Loop)

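/* andl clobbers CF, so save the pending shift carry in %eax with sbbl
   (0 or -1) and recreate it afterwards with addl %eax,%eax.  */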
L(Lend):
        popl    %ebx
        cfi_adjust_cfa_offset (-4)
        sbbl    %eax,%eax       /* save carry in %eax */
        andl    $7,%ebx
        jz      L(Lend2)
        addl    %eax,%eax       /* restore carry from eax */
L(Loop2):
        movl    %edx,%ebp
        movl    (%esi),%edx
        adcl    %edx,%edx
        movl    %ebp,(%edi)

        leal    4(%esi),%esi    /* use leal not to clobber carry */
        leal    4(%edi),%edi
        decl    %ebx
        jnz     L(Loop2)

        jmp     L(L1)
L(Lend2):
        addl    %eax,%eax       /* restore carry from eax */
L(L1):  movl    %edx,(%edi)     /* store last limb */

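/* Turn the final carry flag into the return value: sbbl yields 0 or -1,
   negl makes that the 0 or 1 bit shifted out at the high end.  */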
        sbbl    %eax,%eax
        negl    %eax

        popl    %ebx
        cfi_adjust_cfa_offset (-4)
        cfi_restore (ebx)
        popl    %ebp
        cfi_adjust_cfa_offset (-4)
        cfi_restore (ebp)
        popl    %esi
        cfi_adjust_cfa_offset (-4)
        cfi_restore (esi)
        popl    %edi
        cfi_adjust_cfa_offset (-4)
        cfi_restore (edi)

        ret
END (__mpn_lshift)
