/* Optimized memcmp implementation for POWER8/PowerPC64.
2 | Copyright (C) 2010-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | /* int [r3] memcmp (const char *s1 [r3], |
22 | const char *s2 [r4], |
23 | size_t size [r5]) */ |
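/* As a reference for the semantics implemented below, a minimal C
   sketch (illustrative only, not part of the build): bytes compare
   as unsigned values and the sign of the first difference decides
   the result. The byte loop below returns the actual byte
   difference; the word paths return -1/0/1. Both satisfy the
   memcmp contract.

     static int
     memcmp_ref (const unsigned char *s1, const unsigned char *s2,
                 size_t n)
     {
       for (size_t i = 0; i < n; i++)
         if (s1[i] != s2[i])
           return s1[i] < s2[i] ? -1 : 1;
       return 0;
     }
*/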
24 | |
25 | #ifndef MEMCMP |
26 | # define MEMCMP memcmp |
27 | #endif |
28 | .machine power8 |
29 | ENTRY_TOCLESS (MEMCMP, 4) |
30 | CALL_MCOUNT 3 |
31 | |
32 | #define rRTN r3 |
33 | #define rSTR1 r3 /* First string arg. */ |
34 | #define rSTR2 r4 /* Second string arg. */ |
35 | #define rN r5 /* Max string length. */ |
36 | #define rWORD1 r6 /* Current word in s1. */ |
37 | #define rWORD2 r7 /* Current word in s2. */ |
38 | #define rWORD3 r8 /* Next word in s1. */ |
39 | #define rWORD4 r9 /* Next word in s2. */ |
40 | #define rWORD5 r10 /* Next word in s1. */ |
41 | #define rWORD6 r11 /* Next word in s2. */ |
42 | |
43 | #define rOFF8 r20 /* 8 bytes offset. */ |
44 | #define rOFF16 r21 /* 16 bytes offset. */ |
45 | #define rOFF24 r22 /* 24 bytes offset. */ |
#define rOFF32 r23 /* 32 bytes offset. */
47 | #define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */ |
48 | #define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */ |
49 | #define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */ |
50 | #define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */ |
51 | #define rSHR r28 /* Unaligned shift right count. */ |
52 | #define rSHL r29 /* Unaligned shift left count. */ |
53 | #define rWORD7 r30 /* Next word in s1. */ |
54 | #define rWORD8 r31 /* Next word in s2. */ |
55 | |
56 | #define rWORD8SAVE (-8) |
57 | #define rWORD7SAVE (-16) |
58 | #define rOFF8SAVE (-24) |
59 | #define rOFF16SAVE (-32) |
60 | #define rOFF24SAVE (-40) |
61 | #define rOFF32SAVE (-48) |
62 | #define rSHRSAVE (-56) |
63 | #define rSHLSAVE (-64) |
64 | #define rWORD8SHIFTSAVE (-72) |
65 | #define rWORD2SHIFTSAVE (-80) |
66 | #define rWORD4SHIFTSAVE (-88) |
67 | #define rWORD6SHIFTSAVE (-96) |
68 | |
69 | #ifdef __LITTLE_ENDIAN__ |
70 | # define LD ldbrx |
71 | #else |
72 | # define LD ldx |
73 | #endif |
74 | |
75 | xor r10, rSTR2, rSTR1 |
76 | cmpldi cr6, rN, 0 |
77 | cmpldi cr1, rN, 8 |
78 | clrldi. r0, r10, 61 |
79 | clrldi r12, rSTR1, 61 |
80 | cmpldi cr5, r12, 0 |
81 | beq- cr6, L(zeroLength) |
82 | dcbt 0, rSTR1 |
83 | dcbt 0, rSTR2 |
/* If less than 8 bytes, use the byte compare loop; if the strings
   do not share the same DW alignment, take the unaligned path. */
86 | blt cr1, L(bytealigned) |
87 | bne L(unalignedqw) |
88 | /* At this point we know both strings have the same alignment and the |
89 | compare length is at least 8 bytes. r12 contains the low order |
90 | 3 bits of rSTR1 and cr5 contains the result of the logical compare |
91 | of r12 to 0. If r12 == 0 then we are already double word |
92 | aligned and can perform the DW aligned loop. */ |
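/* Illustratively, the dispatch above amounts to this C sketch (not
   part of the build):

     if (n == 0)
       return 0;                              // L(zeroLength)
     if (n < 8)
       goto byte_loop;                        // L(bytealigned)
     if ((((uintptr_t) s1 ^ (uintptr_t) s2) & 7) != 0)
       goto unalignedqw;                      // different DW offsets
     // otherwise both strings share the same DW offset
*/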
93 | |
94 | .align 4 |
95 | L(samealignment): |
96 | or r11, rSTR2, rSTR1 |
97 | clrldi. r11, r11, 60 |
98 | beq L(qw_align) |
/* Try to align to QW, else proceed to the DW loop. */
100 | clrldi. r10, r10, 60 |
101 | bne L(DW) |
102 | /* For the difference to reach QW alignment, load as DW. */ |
103 | clrrdi rSTR1, rSTR1, 3 |
104 | clrrdi rSTR2, rSTR2, 3 |
105 | subfic r10, r12, 8 |
106 | LD rWORD1, 0, rSTR1 |
107 | LD rWORD2, 0, rSTR2 |
108 | sldi r9, r10, 3 |
109 | subfic r9, r9, 64 |
110 | sld rWORD1, rWORD1, r9 |
111 | sld rWORD2, rWORD2, r9 |
112 | cmpld cr6, rWORD1, rWORD2 |
113 | addi rSTR1, rSTR1, 8 |
114 | addi rSTR2, rSTR2, 8 |
115 | bne cr6, L(ret_diff) |
116 | subf rN, r10, rN |
117 | |
118 | cmpld cr6, r11, r12 |
119 | bgt cr6, L(qw_align) |
120 | LD rWORD1, 0, rSTR1 |
121 | LD rWORD2, 0, rSTR2 |
122 | cmpld cr6, rWORD1, rWORD2 |
123 | addi rSTR1, rSTR1, 8 |
124 | addi rSTR2, rSTR2, 8 |
125 | bne cr6, L(different) |
126 | cmpldi cr6, rN, 8 |
127 | ble cr6, L(zeroLength) |
128 | addi rN, rN, -8 |
129 | /* Now both rSTR1 and rSTR2 are aligned to QW. */ |
130 | .align 4 |
131 | L(qw_align): |
132 | vspltisb v0, 0 |
133 | srdi. r6, rN, 6 |
134 | li r8, 16 |
135 | li r10, 32 |
136 | li r11, 48 |
137 | ble cr0, L(lessthan64) |
138 | mtctr r6 |
139 | vspltisb v8, 0 |
140 | vspltisb v6, 0 |
141 | /* Aligned vector loop. */ |
142 | .align 4 |
143 | L(aligned_loop): |
144 | lvx v4, 0, rSTR1 |
145 | lvx v5, 0, rSTR2 |
146 | vcmpequb. v7, v6, v8 |
147 | bnl cr6, L(different3) |
148 | lvx v6, rSTR1, r8 |
149 | lvx v8, rSTR2, r8 |
150 | vcmpequb. v7, v5, v4 |
151 | bnl cr6, L(different2) |
152 | lvx v4, rSTR1, r10 |
153 | lvx v5, rSTR2, r10 |
154 | vcmpequb. v7, v6, v8 |
155 | bnl cr6, L(different3) |
156 | lvx v6, rSTR1, r11 |
157 | lvx v8, rSTR2, r11 |
158 | vcmpequb. v7, v5, v4 |
159 | bnl cr6, L(different2) |
160 | addi rSTR1, rSTR1, 64 |
161 | addi rSTR2, rSTR2, 64 |
162 | bdnz L(aligned_loop) |
163 | vcmpequb. v7, v6, v8 |
164 | bnl cr6, L(different3) |
165 | clrldi rN, rN, 58 |
166 | /* Handle remainder for aligned loop. */ |
167 | .align 4 |
168 | L(lessthan64): |
169 | mr r9, rSTR1 |
170 | cmpdi cr6, rN, 0 |
171 | li rSTR1, 0 |
172 | blelr cr6 |
173 | lvx v4, 0, r9 |
174 | lvx v5, 0, rSTR2 |
175 | vcmpequb. v7, v5, v4 |
176 | bnl cr6, L(different1) |
177 | addi rN, rN, -16 |
178 | |
179 | cmpdi cr6, rN, 0 |
180 | blelr cr6 |
181 | lvx v4, r9, r8 |
182 | lvx v5, rSTR2, r8 |
183 | vcmpequb. v7, v5, v4 |
184 | bnl cr6, L(different1) |
185 | addi rN, rN, -16 |
186 | |
187 | cmpdi cr6, rN, 0 |
188 | blelr cr6 |
189 | lvx v4, r9, r10 |
190 | lvx v5, rSTR2, r10 |
191 | vcmpequb. v7, v5, v4 |
192 | bnl cr6, L(different1) |
193 | addi rN, rN, -16 |
194 | |
195 | cmpdi cr6, rN, 0 |
196 | blelr cr6 |
197 | lvx v4, r9, r11 |
198 | lvx v5, rSTR2, r11 |
199 | vcmpequb. v7, v5, v4 |
200 | bnl cr6, L(different1) |
201 | blr |
202 | |
203 | /* Calculate and return the difference. */ |
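/* The L(different*) blocks reduce a mismatching 16-byte chunk to the
   memcmp sign by comparing its two 64-bit halves as unsigned values
   in big-endian byte order (most significant, i.e. lowest-addressed,
   doubleword first). A hedged C sketch of the idea, where hi1/hi2
   hold the 8 lower-addressed bytes and lo1/lo2 the 8 higher-addressed
   bytes, and a byte difference is known to exist:

     static int
     diff16 (uint64_t hi1, uint64_t lo1, uint64_t hi2, uint64_t lo2)
     {
       if (hi1 != hi2)
         return hi1 < hi2 ? -1 : 1;
       return lo1 < lo2 ? -1 : 1;   // caller guarantees lo1 != lo2
     }
*/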
204 | .align 4 |
205 | L(different1): |
206 | cmpdi cr6, rN, 16 |
207 | bge cr6, L(different2) |
208 | /* Discard unwanted bytes. */ |
209 | #ifdef __LITTLE_ENDIAN__ |
210 | lvsr v1, 0, rN |
211 | vperm v4, v4, v0, v1 |
212 | vperm v5, v5, v0, v1 |
213 | #else |
214 | lvsl v1, 0, rN |
215 | vperm v4, v0, v4, v1 |
216 | vperm v5, v0, v5, v1 |
217 | #endif |
218 | vcmpequb. v7, v4, v5 |
219 | li rRTN, 0 |
220 | bltlr cr6 |
221 | .align 4 |
222 | L(different2): |
223 | #ifdef __LITTLE_ENDIAN__ |
224 | /* Reverse bytes for direct comparison. */ |
225 | lvsl v10, r0, r0 |
226 | vspltisb v8, 15 |
227 | vsububm v9, v8, v10 |
228 | vperm v4, v4, v0, v9 |
229 | vperm v5, v5, v0, v9 |
230 | #endif |
231 | mfvrd r7, v4 |
232 | mfvrd r9, v5 |
233 | cmpld cr6, r7, r9 |
234 | bne cr6, L(ret_diff) |
235 | /* Difference in second DW. */ |
236 | vsldoi v4, v4, v4, 8 |
237 | vsldoi v5, v5, v5, 8 |
238 | mfvrd r7, v4 |
239 | mfvrd r9, v5 |
240 | cmpld cr6, r7, r9 |
241 | L(ret_diff): |
242 | li rRTN, 1 |
243 | bgtlr cr6 |
244 | li rRTN, -1 |
245 | blr |
246 | .align 4 |
247 | L(different3): |
248 | #ifdef __LITTLE_ENDIAN__ |
249 | /* Reverse bytes for direct comparison. */ |
250 | vspltisb v9, 15 |
251 | lvsl v10, r0, r0 |
252 | vsububm v9, v9, v10 |
253 | vperm v6, v6, v0, v9 |
254 | vperm v8, v8, v0, v9 |
255 | #endif |
256 | mfvrd r7, v6 |
257 | mfvrd r9, v8 |
258 | cmpld cr6, r7, r9 |
259 | bne cr6, L(ret_diff) |
260 | /* Difference in second DW. */ |
261 | vsldoi v6, v6, v6, 8 |
262 | vsldoi v8, v8, v8, 8 |
263 | mfvrd r7, v6 |
264 | mfvrd r9, v8 |
265 | cmpld cr6, r7, r9 |
266 | li rRTN, 1 |
267 | bgtlr cr6 |
268 | li rRTN, -1 |
269 | blr |
270 | |
271 | .align 4 |
272 | L(different): |
273 | cmpldi cr7, rN, 8 |
274 | bgt cr7, L(end) |
275 | /* Skip unwanted bytes. */ |
276 | sldi r8, rN, 3 |
277 | subfic r8, r8, 64 |
278 | srd rWORD1, rWORD1, r8 |
279 | srd rWORD2, rWORD2, r8 |
280 | cmpld cr6, rWORD1, rWORD2 |
281 | li rRTN, 0 |
282 | beqlr cr6 |
283 | L(end): |
284 | li rRTN, 1 |
285 | bgtlr cr6 |
286 | li rRTN, -1 |
287 | blr |
288 | |
289 | .align 4 |
290 | L(unalignedqw): |
/* Proceed to the DW unaligned loop if there is a chance of a page
   cross. */
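/* The check is a conservative guard; in C terms (a sketch, assuming
   a minimum page size of 4096 bytes):

     if ((((uintptr_t) s1 & 0xfff) + n > 4096 - 16)
         || (((uintptr_t) s2 & 0xfff) + n > 4096 - 16))
       goto unaligned;   // 16-byte vector loads might touch a new page
*/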
292 | rldicl r9, rSTR1, 0, 52 |
293 | add r9, r9, rN |
294 | cmpldi cr0, r9, 4096-16 |
295 | bgt cr0, L(unaligned) |
296 | rldicl r9, rSTR2, 0, 52 |
297 | add r9, r9, rN |
298 | cmpldi cr0, r9, 4096-16 |
299 | bgt cr0, L(unaligned) |
300 | li r0, 0 |
301 | li r8, 16 |
302 | vspltisb v0, 0 |
303 | /* Check if rSTR1 is aligned to QW. */ |
304 | andi. r11, rSTR1, 0xF |
305 | beq L(s1_align) |
306 | |
307 | /* Compare 16B and align S1 to QW. */ |
308 | #ifdef __LITTLE_ENDIAN__ |
309 | lvsr v10, 0, rSTR1 /* Compute mask. */ |
310 | lvsr v6, 0, rSTR2 /* Compute mask. */ |
311 | #else |
312 | lvsl v10, 0, rSTR1 /* Compute mask. */ |
313 | lvsl v6, 0, rSTR2 /* Compute mask. */ |
314 | #endif |
315 | lvx v5, 0, rSTR2 |
316 | lvx v9, rSTR2, r8 |
317 | #ifdef __LITTLE_ENDIAN__ |
318 | vperm v5, v9, v5, v6 |
319 | #else |
320 | vperm v5, v5, v9, v6 |
321 | #endif |
322 | lvx v4, 0, rSTR1 |
323 | lvx v9, rSTR1, r8 |
324 | #ifdef __LITTLE_ENDIAN__ |
325 | vperm v4, v9, v4, v10 |
326 | #else |
327 | vperm v4, v4, v9, v10 |
328 | #endif |
329 | vcmpequb. v7, v5, v4 |
330 | bnl cr6, L(different1) |
331 | cmpldi cr6, rN, 16 |
332 | ble cr6, L(zeroLength) |
333 | subfic r11, r11, 16 |
334 | subf rN, r11, rN |
335 | add rSTR1, rSTR1, r11 |
336 | add rSTR2, rSTR2, r11 |
337 | |
/* As s1 is QW aligned, prepare for the unaligned loop. */
339 | .align 4 |
340 | L(s1_align): |
341 | #ifdef __LITTLE_ENDIAN__ |
342 | lvsr v6, 0, rSTR2 |
343 | #else |
344 | lvsl v6, 0, rSTR2 |
345 | #endif |
346 | lvx v5, 0, rSTR2 |
347 | srdi. r6, rN, 6 |
348 | li r10, 32 |
349 | li r11, 48 |
350 | ble cr0, L(lessthan64_unalign) |
351 | mtctr r6 |
352 | li r9, 64 |
353 | /* Unaligned vector loop. */ |
354 | .align 4 |
355 | L(unalign_qwloop): |
356 | lvx v4, 0, rSTR1 |
357 | lvx v10, rSTR2, r8 |
358 | #ifdef __LITTLE_ENDIAN__ |
359 | vperm v5, v10, v5, v6 |
360 | #else |
361 | vperm v5, v5, v10, v6 |
362 | #endif |
363 | vcmpequb. v7, v5, v4 |
364 | bnl cr6, L(different2) |
365 | vor v5, v10, v10 |
366 | lvx v4, rSTR1, r8 |
367 | lvx v10, rSTR2, r10 |
368 | #ifdef __LITTLE_ENDIAN__ |
369 | vperm v5, v10, v5, v6 |
370 | #else |
371 | vperm v5, v5, v10, v6 |
372 | #endif |
373 | vcmpequb. v7, v5, v4 |
374 | bnl cr6, L(different2) |
375 | vor v5, v10, v10 |
376 | lvx v4, rSTR1, r10 |
377 | lvx v10, rSTR2, r11 |
378 | #ifdef __LITTLE_ENDIAN__ |
379 | vperm v5, v10, v5, v6 |
380 | #else |
381 | vperm v5, v5, v10, v6 |
382 | #endif |
383 | vcmpequb. v7, v5, v4 |
384 | bnl cr6, L(different2) |
385 | vor v5, v10, v10 |
386 | lvx v4, rSTR1, r11 |
387 | lvx v10, rSTR2, r9 |
388 | #ifdef __LITTLE_ENDIAN__ |
389 | vperm v5, v10, v5, v6 |
390 | #else |
391 | vperm v5, v5, v10, v6 |
392 | #endif |
393 | vcmpequb. v7, v5, v4 |
394 | bnl cr6, L(different2) |
395 | vor v5, v10, v10 |
396 | addi rSTR1, rSTR1, 64 |
397 | addi rSTR2, rSTR2, 64 |
398 | bdnz L(unalign_qwloop) |
399 | clrldi rN, rN, 58 |
400 | /* Handle remainder for unaligned loop. */ |
401 | .align 4 |
402 | L(lessthan64_unalign): |
403 | mr r9, rSTR1 |
404 | cmpdi cr6, rN, 0 |
405 | li rSTR1, 0 |
406 | blelr cr6 |
407 | lvx v4, 0, r9 |
408 | lvx v10, rSTR2, r8 |
409 | #ifdef __LITTLE_ENDIAN__ |
410 | vperm v5, v10, v5, v6 |
411 | #else |
412 | vperm v5, v5, v10, v6 |
413 | #endif |
414 | vcmpequb. v7, v5, v4 |
415 | bnl cr6, L(different1) |
416 | vor v5, v10, v10 |
417 | addi rN, rN, -16 |
418 | |
419 | cmpdi cr6, rN, 0 |
420 | blelr cr6 |
421 | lvx v4, r9, r8 |
422 | lvx v10, rSTR2, r10 |
423 | #ifdef __LITTLE_ENDIAN__ |
424 | vperm v5, v10, v5, v6 |
425 | #else |
426 | vperm v5, v5, v10, v6 |
427 | #endif |
428 | vcmpequb. v7, v5, v4 |
429 | bnl cr6, L(different1) |
430 | vor v5, v10, v10 |
431 | addi rN, rN, -16 |
432 | |
433 | cmpdi cr6, rN, 0 |
434 | blelr cr6 |
435 | lvx v4, r9, r10 |
436 | lvx v10, rSTR2, r11 |
437 | #ifdef __LITTLE_ENDIAN__ |
438 | vperm v5, v10, v5, v6 |
439 | #else |
440 | vperm v5, v5, v10, v6 |
441 | #endif |
442 | vcmpequb. v7, v5, v4 |
443 | bnl cr6, L(different1) |
444 | vor v5, v10, v10 |
445 | addi rN, rN, -16 |
446 | |
447 | cmpdi cr6, rN, 0 |
448 | blelr cr6 |
449 | lvx v4, r9, r11 |
450 | addi r11, r11, 16 |
451 | lvx v10, rSTR2, r11 |
452 | #ifdef __LITTLE_ENDIAN__ |
453 | vperm v5, v10, v5, v6 |
454 | #else |
455 | vperm v5, v5, v10, v6 |
456 | #endif |
457 | vcmpequb. v7, v5, v4 |
458 | bnl cr6, L(different1) |
459 | blr |
460 | |
461 | /* Otherwise we know the two strings have the same alignment (but not |
462 | yet DW). So we force the string addresses to the next lower DW |
463 | boundary and special case this first DW using shift left to |
464 | eliminate bits preceding the first byte. Since we want to join the |
465 | normal (DW aligned) compare loop, starting at the second double word, |
466 | we need to adjust the length (rN) and special case the loop |
467 | versioning for the first DW. This ensures that the loop count is |
468 | correct and the first DW (shifted) is in the expected register pair. */ |
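/* A C sketch of this first-DW special case (illustrative only;
   `load8' stands for the 8-byte load in LD byte order, where the
   lowest-addressed byte is the most significant):

     unsigned off = s1 & 7;                 // r12, shared by s2 here
     uint64_t w1 = load8 (s1 & ~7UL) << (off * 8);
     uint64_t w2 = load8 (s2 & ~7UL) << (off * 8);
     // the left shift discards the bytes preceding the first real
     // byte; rN is increased by `off' so the loop count stays right.
*/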
469 | .align 4 |
470 | L(DW): |
471 | std rWORD8, rWORD8SAVE(r1) |
472 | std rWORD7, rWORD7SAVE(r1) |
473 | std rOFF8, rOFF8SAVE(r1) |
474 | std rOFF16, rOFF16SAVE(r1) |
475 | std rOFF24, rOFF24SAVE(r1) |
476 | std rOFF32, rOFF32SAVE(r1) |
477 | cfi_offset(rWORD8, rWORD8SAVE) |
478 | cfi_offset(rWORD7, rWORD7SAVE) |
479 | cfi_offset(rOFF8, rOFF8SAVE) |
480 | cfi_offset(rOFF16, rOFF16SAVE) |
481 | cfi_offset(rOFF24, rOFF24SAVE) |
482 | cfi_offset(rOFF32, rOFF32SAVE) |
483 | |
484 | li rOFF8,8 |
485 | li rOFF16,16 |
486 | li rOFF24,24 |
487 | li rOFF32,32 |
488 | clrrdi rSTR1, rSTR1, 3 |
489 | clrrdi rSTR2, rSTR2, 3 |
490 | beq cr5, L(DWaligned) |
491 | add rN, rN, r12 |
492 | sldi rWORD6, r12, 3 |
493 | srdi r0, rN, 5 /* Divide by 32. */ |
494 | andi. r12, rN, 24 /* Get the DW remainder. */ |
495 | LD rWORD1, 0, rSTR1 |
496 | LD rWORD2, 0, rSTR2 |
497 | cmpldi cr1, r12, 16 |
498 | cmpldi cr7, rN, 32 |
499 | clrldi rN, rN, 61 |
500 | beq L(dPs4) |
501 | mtctr r0 |
502 | bgt cr1, L(dPs3) |
503 | beq cr1, L(dPs2) |
504 | |
505 | /* Remainder is 8. */ |
506 | .align 3 |
507 | L(dsP1): |
508 | sld rWORD5, rWORD1, rWORD6 |
509 | sld rWORD6, rWORD2, rWORD6 |
510 | cmpld cr5, rWORD5, rWORD6 |
511 | blt cr7, L(dP1x) |
512 | /* Do something useful in this cycle since we have to branch anyway. */ |
513 | LD rWORD1, rOFF8, rSTR1 |
514 | LD rWORD2, rOFF8, rSTR2 |
515 | cmpld cr7, rWORD1, rWORD2 |
516 | b L(dP1e) |
517 | /* Remainder is 16. */ |
518 | .align 4 |
519 | L(dPs2): |
520 | sld rWORD5, rWORD1, rWORD6 |
521 | sld rWORD6, rWORD2, rWORD6 |
522 | cmpld cr6, rWORD5, rWORD6 |
523 | blt cr7, L(dP2x) |
524 | /* Do something useful in this cycle since we have to branch anyway. */ |
525 | LD rWORD7, rOFF8, rSTR1 |
526 | LD rWORD8, rOFF8, rSTR2 |
527 | cmpld cr5, rWORD7, rWORD8 |
528 | b L(dP2e) |
529 | /* Remainder is 24. */ |
530 | .align 4 |
531 | L(dPs3): |
532 | sld rWORD3, rWORD1, rWORD6 |
533 | sld rWORD4, rWORD2, rWORD6 |
534 | cmpld cr1, rWORD3, rWORD4 |
535 | b L(dP3e) |
536 | /* Count is a multiple of 32, remainder is 0. */ |
537 | .align 4 |
538 | L(dPs4): |
539 | mtctr r0 |
540 | sld rWORD1, rWORD1, rWORD6 |
541 | sld rWORD2, rWORD2, rWORD6 |
542 | cmpld cr7, rWORD1, rWORD2 |
543 | b L(dP4e) |
544 | |
545 | /* At this point we know both strings are double word aligned and the |
546 | compare length is at least 8 bytes. */ |
547 | .align 4 |
548 | L(DWaligned): |
549 | andi. r12, rN, 24 /* Get the DW remainder. */ |
550 | srdi r0, rN, 5 /* Divide by 32. */ |
551 | cmpldi cr1, r12, 16 |
552 | cmpldi cr7, rN, 32 |
553 | clrldi rN, rN, 61 |
554 | beq L(dP4) |
555 | bgt cr1, L(dP3) |
556 | beq cr1, L(dP2) |
557 | |
558 | /* Remainder is 8. */ |
559 | .align 4 |
560 | L(dP1): |
561 | mtctr r0 |
562 | /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early |
563 | (8-15 byte compare), we want to use only volatile registers. This |
564 | means we can avoid restoring non-volatile registers since we did not |
565 | change any on the early exit path. The key here is the non-early |
566 | exit path only cares about the condition code (cr5), not about which |
567 | register pair was used. */ |
568 | LD rWORD5, 0, rSTR1 |
569 | LD rWORD6, 0, rSTR2 |
570 | cmpld cr5, rWORD5, rWORD6 |
571 | blt cr7, L(dP1x) |
572 | LD rWORD1, rOFF8, rSTR1 |
573 | LD rWORD2, rOFF8, rSTR2 |
574 | cmpld cr7, rWORD1, rWORD2 |
575 | L(dP1e): |
576 | LD rWORD3, rOFF16, rSTR1 |
577 | LD rWORD4, rOFF16, rSTR2 |
578 | cmpld cr1, rWORD3, rWORD4 |
579 | LD rWORD5, rOFF24, rSTR1 |
580 | LD rWORD6, rOFF24, rSTR2 |
581 | cmpld cr6, rWORD5, rWORD6 |
582 | bne cr5, L(dLcr5x) |
583 | bne cr7, L(dLcr7x) |
584 | |
585 | LD rWORD7, rOFF32, rSTR1 |
586 | LD rWORD8, rOFF32, rSTR2 |
587 | addi rSTR1, rSTR1, 32 |
588 | addi rSTR2, rSTR2, 32 |
589 | bne cr1, L(dLcr1) |
590 | cmpld cr5, rWORD7, rWORD8 |
591 | bdnz L(dLoop) |
592 | bne cr6, L(dLcr6) |
593 | ld rWORD8, rWORD8SAVE(r1) |
594 | ld rWORD7, rWORD7SAVE(r1) |
595 | .align 3 |
596 | L(dP1x): |
597 | sldi. r12, rN, 3 |
598 | bne cr5, L(dLcr5x) |
599 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ |
600 | bne L(d00) |
601 | ld rOFF8, rOFF8SAVE(r1) |
602 | ld rOFF16, rOFF16SAVE(r1) |
603 | ld rOFF24, rOFF24SAVE(r1) |
604 | ld rOFF32, rOFF32SAVE(r1) |
605 | li rRTN, 0 |
606 | blr |
607 | |
608 | /* Remainder is 16. */ |
609 | .align 4 |
610 | L(dP2): |
611 | mtctr r0 |
612 | LD rWORD5, 0, rSTR1 |
613 | LD rWORD6, 0, rSTR2 |
614 | cmpld cr6, rWORD5, rWORD6 |
615 | blt cr7, L(dP2x) |
616 | LD rWORD7, rOFF8, rSTR1 |
617 | LD rWORD8, rOFF8, rSTR2 |
618 | cmpld cr5, rWORD7, rWORD8 |
619 | L(dP2e): |
620 | LD rWORD1, rOFF16, rSTR1 |
621 | LD rWORD2, rOFF16, rSTR2 |
622 | cmpld cr7, rWORD1, rWORD2 |
623 | LD rWORD3, rOFF24, rSTR1 |
624 | LD rWORD4, rOFF24, rSTR2 |
625 | cmpld cr1, rWORD3, rWORD4 |
626 | addi rSTR1, rSTR1, 8 |
627 | addi rSTR2, rSTR2, 8 |
628 | bne cr6, L(dLcr6) |
629 | bne cr5, L(dLcr5) |
630 | b L(dLoop2) |
631 | .align 4 |
632 | L(dP2x): |
633 | LD rWORD3, rOFF8, rSTR1 |
634 | LD rWORD4, rOFF8, rSTR2 |
635 | cmpld cr1, rWORD3, rWORD4 |
636 | sldi. r12, rN, 3 |
637 | bne cr6, L(dLcr6x) |
638 | addi rSTR1, rSTR1, 8 |
639 | addi rSTR2, rSTR2, 8 |
640 | bne cr1, L(dLcr1x) |
641 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ |
642 | bne L(d00) |
643 | ld rOFF8, rOFF8SAVE(r1) |
644 | ld rOFF16, rOFF16SAVE(r1) |
645 | ld rOFF24, rOFF24SAVE(r1) |
646 | ld rOFF32, rOFF32SAVE(r1) |
647 | li rRTN, 0 |
648 | blr |
649 | |
650 | /* Remainder is 24. */ |
651 | .align 4 |
652 | L(dP3): |
653 | mtctr r0 |
654 | LD rWORD3, 0, rSTR1 |
655 | LD rWORD4, 0, rSTR2 |
656 | cmpld cr1, rWORD3, rWORD4 |
657 | L(dP3e): |
658 | LD rWORD5, rOFF8, rSTR1 |
659 | LD rWORD6, rOFF8, rSTR2 |
660 | cmpld cr6, rWORD5, rWORD6 |
661 | blt cr7, L(dP3x) |
662 | LD rWORD7, rOFF16, rSTR1 |
663 | LD rWORD8, rOFF16, rSTR2 |
664 | cmpld cr5, rWORD7, rWORD8 |
665 | LD rWORD1, rOFF24, rSTR1 |
666 | LD rWORD2, rOFF24, rSTR2 |
667 | cmpld cr7, rWORD1, rWORD2 |
668 | addi rSTR1, rSTR1, 16 |
669 | addi rSTR2, rSTR2, 16 |
670 | bne cr1, L(dLcr1) |
671 | bne cr6, L(dLcr6) |
672 | b L(dLoop1) |
/* Again we are on an early exit path (24-31 byte compare); we want
   to use only volatile registers and avoid restoring non-volatile
   registers. */
676 | .align 4 |
677 | L(dP3x): |
678 | LD rWORD1, rOFF16, rSTR1 |
679 | LD rWORD2, rOFF16, rSTR2 |
680 | cmpld cr7, rWORD1, rWORD2 |
681 | sldi. r12, rN, 3 |
682 | bne cr1, L(dLcr1x) |
683 | addi rSTR1, rSTR1, 16 |
684 | addi rSTR2, rSTR2, 16 |
685 | bne cr6, L(dLcr6x) |
686 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ |
687 | bne cr7, L(dLcr7x) |
688 | bne L(d00) |
689 | ld rOFF8, rOFF8SAVE(r1) |
690 | ld rOFF16, rOFF16SAVE(r1) |
691 | ld rOFF24, rOFF24SAVE(r1) |
692 | ld rOFF32, rOFF32SAVE(r1) |
693 | li rRTN, 0 |
694 | blr |
695 | |
696 | /* Count is a multiple of 32, remainder is 0. */ |
697 | .align 4 |
698 | L(dP4): |
699 | mtctr r0 |
700 | LD rWORD1, 0, rSTR1 |
701 | LD rWORD2, 0, rSTR2 |
702 | cmpld cr7, rWORD1, rWORD2 |
703 | L(dP4e): |
704 | LD rWORD3, rOFF8, rSTR1 |
705 | LD rWORD4, rOFF8, rSTR2 |
706 | cmpld cr1, rWORD3, rWORD4 |
707 | LD rWORD5, rOFF16, rSTR1 |
708 | LD rWORD6, rOFF16, rSTR2 |
709 | cmpld cr6, rWORD5, rWORD6 |
710 | LD rWORD7, rOFF24, rSTR1 |
711 | LD rWORD8, rOFF24, rSTR2 |
712 | addi rSTR1, rSTR1, 24 |
713 | addi rSTR2, rSTR2, 24 |
714 | cmpld cr5, rWORD7, rWORD8 |
715 | bne cr7, L(dLcr7) |
716 | bne cr1, L(dLcr1) |
717 | bdz- L(d24) /* Adjust CTR as we start with +4. */ |
718 | /* This is the primary loop. */ |
719 | .align 4 |
720 | L(dLoop): |
721 | LD rWORD1, rOFF8, rSTR1 |
722 | LD rWORD2, rOFF8, rSTR2 |
723 | cmpld cr1, rWORD3, rWORD4 |
724 | bne cr6, L(dLcr6) |
725 | L(dLoop1): |
726 | LD rWORD3, rOFF16, rSTR1 |
727 | LD rWORD4, rOFF16, rSTR2 |
728 | cmpld cr6, rWORD5, rWORD6 |
729 | bne cr5, L(dLcr5) |
730 | L(dLoop2): |
731 | LD rWORD5, rOFF24, rSTR1 |
732 | LD rWORD6, rOFF24, rSTR2 |
733 | cmpld cr5, rWORD7, rWORD8 |
734 | bne cr7, L(dLcr7) |
735 | L(dLoop3): |
736 | LD rWORD7, rOFF32, rSTR1 |
737 | LD rWORD8, rOFF32, rSTR2 |
738 | addi rSTR1, rSTR1, 32 |
739 | addi rSTR2, rSTR2, 32 |
740 | bne cr1, L(dLcr1) |
741 | cmpld cr7, rWORD1, rWORD2 |
742 | bdnz L(dLoop) |
743 | |
744 | L(dL4): |
745 | cmpld cr1, rWORD3, rWORD4 |
746 | bne cr6, L(dLcr6) |
747 | cmpld cr6, rWORD5, rWORD6 |
748 | bne cr5, L(dLcr5) |
749 | cmpld cr5, rWORD7, rWORD8 |
750 | L(d44): |
751 | bne cr7, L(dLcr7) |
752 | L(d34): |
753 | bne cr1, L(dLcr1) |
754 | L(d24): |
755 | bne cr6, L(dLcr6) |
756 | L(d14): |
757 | sldi. r12, rN, 3 |
758 | bne cr5, L(dLcr5) |
759 | L(d04): |
760 | ld rWORD8, rWORD8SAVE(r1) |
761 | ld rWORD7, rWORD7SAVE(r1) |
762 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ |
763 | beq L(duzeroLength) |
764 | /* At this point we have a remainder of 1 to 7 bytes to compare. Since |
765 | we are aligned it is safe to load the whole double word, and use |
766 | shift right double to eliminate bits beyond the compare length. */ |
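/* In C terms (a sketch; `load8' as above, `rem' is the 1-7 byte
   remainder):

     uint64_t w1 = load8 (p1) >> (64 - rem * 8);
     uint64_t w2 = load8 (p2) >> (64 - rem * 8);
     return w1 == w2 ? 0 : (w1 < w2 ? -1 : 1);
     // safe: both pointers are DW aligned, so the full-DW load
     // cannot cross into an unmapped page.
*/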
767 | L(d00): |
768 | LD rWORD1, rOFF8, rSTR1 |
769 | LD rWORD2, rOFF8, rSTR2 |
770 | srd rWORD1, rWORD1, rN |
771 | srd rWORD2, rWORD2, rN |
772 | cmpld cr7, rWORD1, rWORD2 |
773 | bne cr7, L(dLcr7x) |
774 | ld rOFF8, rOFF8SAVE(r1) |
775 | ld rOFF16, rOFF16SAVE(r1) |
776 | ld rOFF24, rOFF24SAVE(r1) |
777 | ld rOFF32, rOFF32SAVE(r1) |
778 | li rRTN, 0 |
779 | blr |
780 | |
781 | .align 4 |
782 | L(dLcr7): |
783 | ld rWORD8, rWORD8SAVE(r1) |
784 | ld rWORD7, rWORD7SAVE(r1) |
785 | L(dLcr7x): |
786 | ld rOFF8, rOFF8SAVE(r1) |
787 | ld rOFF16, rOFF16SAVE(r1) |
788 | ld rOFF24, rOFF24SAVE(r1) |
789 | ld rOFF32, rOFF32SAVE(r1) |
790 | li rRTN, 1 |
791 | bgtlr cr7 |
792 | li rRTN, -1 |
793 | blr |
794 | .align 4 |
795 | L(dLcr1): |
796 | ld rWORD8, rWORD8SAVE(r1) |
797 | ld rWORD7, rWORD7SAVE(r1) |
798 | L(dLcr1x): |
799 | ld rOFF8, rOFF8SAVE(r1) |
800 | ld rOFF16, rOFF16SAVE(r1) |
801 | ld rOFF24, rOFF24SAVE(r1) |
802 | ld rOFF32, rOFF32SAVE(r1) |
803 | li rRTN, 1 |
804 | bgtlr cr1 |
805 | li rRTN, -1 |
806 | blr |
807 | .align 4 |
808 | L(dLcr6): |
809 | ld rWORD8, rWORD8SAVE(r1) |
810 | ld rWORD7, rWORD7SAVE(r1) |
811 | L(dLcr6x): |
812 | ld rOFF8, rOFF8SAVE(r1) |
813 | ld rOFF16, rOFF16SAVE(r1) |
814 | ld rOFF24, rOFF24SAVE(r1) |
815 | ld rOFF32, rOFF32SAVE(r1) |
816 | li rRTN, 1 |
817 | bgtlr cr6 |
818 | li rRTN, -1 |
819 | blr |
820 | .align 4 |
821 | L(dLcr5): |
822 | ld rWORD8, rWORD8SAVE(r1) |
823 | ld rWORD7, rWORD7SAVE(r1) |
824 | L(dLcr5x): |
825 | ld rOFF8, rOFF8SAVE(r1) |
826 | ld rOFF16, rOFF16SAVE(r1) |
827 | ld rOFF24, rOFF24SAVE(r1) |
828 | ld rOFF32, rOFF32SAVE(r1) |
829 | li rRTN, 1 |
830 | bgtlr cr5 |
831 | li rRTN, -1 |
832 | blr |
833 | |
834 | .align 4 |
835 | L(bytealigned): |
836 | mtctr rN |
837 | |
/* We need to prime this loop. This loop is swing modulo scheduled
   to avoid pipe delays. The dependent instruction latency (load to
   compare to conditional branch) is 2 to 3 cycles. In this loop each
   dispatch group ends in a branch and takes 1 cycle. Effectively
   the first iteration of the loop only serves to load operands, and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration. */
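/* A C sketch of the pipelining idea (illustrative; the real loop
   keeps three pairs in flight rather than two, and requires n >= 1):

     static int
     bytecmp (const unsigned char *s1, const unsigned char *s2,
              size_t n)
     {
       unsigned char a = *s1++, b = *s2++;       // prime the pipeline
       while (--n)
         {
           unsigned char a2 = *s1++, b2 = *s2++; // load next pair...
           if (a != b)                           // ...test previous
             return a - b;
           a = a2; b = b2;
         }
       return a - b;
     }
*/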
847 | |
848 | lbz rWORD1, 0(rSTR1) |
849 | lbz rWORD2, 0(rSTR2) |
850 | bdz L(b11) |
851 | cmpld cr7, rWORD1, rWORD2 |
852 | lbz rWORD3, 1(rSTR1) |
853 | lbz rWORD4, 1(rSTR2) |
854 | bdz L(b12) |
855 | cmpld cr1, rWORD3, rWORD4 |
856 | lbzu rWORD5, 2(rSTR1) |
857 | lbzu rWORD6, 2(rSTR2) |
858 | bdz L(b13) |
859 | .align 4 |
860 | L(bLoop): |
861 | lbzu rWORD1, 1(rSTR1) |
862 | lbzu rWORD2, 1(rSTR2) |
863 | bne cr7, L(bLcr7) |
864 | |
865 | cmpld cr6, rWORD5, rWORD6 |
866 | bdz L(b3i) |
867 | |
868 | lbzu rWORD3, 1(rSTR1) |
869 | lbzu rWORD4, 1(rSTR2) |
870 | bne cr1, L(bLcr1) |
871 | |
872 | cmpld cr7, rWORD1, rWORD2 |
873 | bdz L(b2i) |
874 | |
875 | lbzu rWORD5, 1(rSTR1) |
876 | lbzu rWORD6, 1(rSTR2) |
877 | bne cr6, L(bLcr6) |
878 | |
879 | cmpld cr1, rWORD3, rWORD4 |
880 | bdnz L(bLoop) |
881 | |
/* We speculatively load bytes before we have tested the previous
   bytes. But we must avoid overrunning the length (in the CTR) to
   prevent these speculative loads from causing a segfault. In that
   case the loop will exit early (before all pending bytes are
   tested), and we must complete the pending compares before
   returning. */
888 | L(b1i): |
889 | bne cr7, L(bLcr7) |
890 | bne cr1, L(bLcr1) |
891 | b L(bx56) |
892 | .align 4 |
893 | L(b2i): |
894 | bne cr6, L(bLcr6) |
895 | bne cr7, L(bLcr7) |
896 | b L(bx34) |
897 | .align 4 |
898 | L(b3i): |
899 | bne cr1, L(bLcr1) |
900 | bne cr6, L(bLcr6) |
901 | b L(bx12) |
902 | .align 4 |
903 | L(bLcr7): |
904 | li rRTN, 1 |
905 | bgtlr cr7 |
906 | li rRTN, -1 |
907 | blr |
908 | L(bLcr1): |
909 | li rRTN, 1 |
910 | bgtlr cr1 |
911 | li rRTN, -1 |
912 | blr |
913 | L(bLcr6): |
914 | li rRTN, 1 |
915 | bgtlr cr6 |
916 | li rRTN, -1 |
917 | blr |
918 | |
919 | L(b13): |
920 | bne cr7, L(bx12) |
921 | bne cr1, L(bx34) |
922 | L(bx56): |
923 | sub rRTN, rWORD5, rWORD6 |
924 | blr |
925 | nop |
926 | L(b12): |
927 | bne cr7, L(bx12) |
928 | L(bx34): |
929 | sub rRTN, rWORD3, rWORD4 |
930 | blr |
931 | L(b11): |
932 | L(bx12): |
933 | sub rRTN, rWORD1, rWORD2 |
934 | blr |
935 | |
936 | .align 4 |
937 | L(zeroLength): |
938 | li rRTN, 0 |
939 | blr |
940 | |
941 | .align 4 |
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes. r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0. If r12 == 0 then rSTR1 is double word
   aligned and we can use the DWunaligned loop.

   Otherwise we know that rSTR1 is not yet DW aligned.
   So we can force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte. Since we want to join the
   normal (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW. This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair. */
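/* The unaligned loops below rebuild each aligned doubleword of rSTR2
   from two adjacent aligned loads. A C sketch of the shift-merge
   (illustrative; `load8' as above):

     shl = (s2 & 7) * 8;                      // rSHL
     shr = 64 - shl;                          // rSHR
     w = (cur >> shr) | (prev << shl);        // srd/sld/or triplet
     // `prev << shl' is carried between iterations in the
     // rWORDn_SHIFT registers, so each merge costs one new load.
*/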
956 | L(unaligned): |
957 | std rWORD8, rWORD8SAVE(r1) |
958 | std rWORD7, rWORD7SAVE(r1) |
959 | std rOFF8, rOFF8SAVE(r1) |
960 | std rOFF16, rOFF16SAVE(r1) |
961 | std rOFF24, rOFF24SAVE(r1) |
962 | std rOFF32, rOFF32SAVE(r1) |
963 | cfi_offset(rWORD8, rWORD8SAVE) |
964 | cfi_offset(rWORD7, rWORD7SAVE) |
965 | cfi_offset(rOFF8, rOFF8SAVE) |
966 | cfi_offset(rOFF16, rOFF16SAVE) |
967 | cfi_offset(rOFF24, rOFF24SAVE) |
968 | cfi_offset(rOFF32, rOFF32SAVE) |
969 | li rOFF8,8 |
970 | li rOFF16,16 |
971 | li rOFF24,24 |
972 | li rOFF32,32 |
973 | std rSHL, rSHLSAVE(r1) |
974 | cfi_offset(rSHL, rSHLSAVE) |
975 | clrldi rSHL, rSTR2, 61 |
976 | beq cr6, L(duzeroLength) |
977 | std rSHR, rSHRSAVE(r1) |
978 | cfi_offset(rSHR, rSHRSAVE) |
979 | beq cr5, L(DWunaligned) |
980 | std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) |
981 | cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) |
982 | /* Adjust the logical start of rSTR2 to compensate for the extra bits |
983 | in the 1st rSTR1 DW. */ |
984 | sub rWORD8_SHIFT, rSTR2, r12 |
985 | /* But do not attempt to address the DW before that DW that contains |
986 | the actual start of rSTR2. */ |
987 | clrrdi rSTR2, rSTR2, 3 |
988 | std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) |
989 | /* Compute the left/right shift counts for the unaligned rSTR2, |
990 | compensating for the logical (DW aligned) start of rSTR1. */ |
991 | clrldi rSHL, rWORD8_SHIFT, 61 |
992 | clrrdi rSTR1, rSTR1, 3 |
993 | std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) |
994 | sldi rSHL, rSHL, 3 |
995 | cmpld cr5, rWORD8_SHIFT, rSTR2 |
996 | add rN, rN, r12 |
997 | sldi rWORD6, r12, 3 |
998 | std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) |
999 | cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) |
1000 | cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) |
1001 | cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) |
1002 | subfic rSHR, rSHL, 64 |
1003 | srdi r0, rN, 5 /* Divide by 32. */ |
1004 | andi. r12, rN, 24 /* Get the DW remainder. */ |
1005 | /* We normally need to load 2 DWs to start the unaligned rSTR2, but in |
1006 | this special case those bits may be discarded anyway. Also we |
1007 | must avoid loading a DW where none of the bits are part of rSTR2 as |
1008 | this may cross a page boundary and cause a page fault. */ |
1009 | li rWORD8, 0 |
1010 | blt cr5, L(dus0) |
1011 | LD rWORD8, 0, rSTR2 |
1012 | addi rSTR2, rSTR2, 8 |
1013 | sld rWORD8, rWORD8, rSHL |
1014 | |
1015 | L(dus0): |
1016 | LD rWORD1, 0, rSTR1 |
1017 | LD rWORD2, 0, rSTR2 |
1018 | cmpldi cr1, r12, 16 |
1019 | cmpldi cr7, rN, 32 |
1020 | srd r12, rWORD2, rSHR |
1021 | clrldi rN, rN, 61 |
1022 | beq L(duPs4) |
1023 | mtctr r0 |
1024 | or rWORD8, r12, rWORD8 |
1025 | bgt cr1, L(duPs3) |
1026 | beq cr1, L(duPs2) |
1027 | |
1028 | /* Remainder is 8. */ |
1029 | .align 4 |
1030 | L(dusP1): |
1031 | sld rWORD8_SHIFT, rWORD2, rSHL |
1032 | sld rWORD7, rWORD1, rWORD6 |
1033 | sld rWORD8, rWORD8, rWORD6 |
1034 | bge cr7, L(duP1e) |
1035 | /* At this point we exit early with the first double word compare |
1036 | complete and remainder of 0 to 7 bytes. See L(du14) for details on |
1037 | how we handle the remaining bytes. */ |
1038 | cmpld cr5, rWORD7, rWORD8 |
1039 | sldi. rN, rN, 3 |
1040 | bne cr5, L(duLcr5) |
1041 | cmpld cr7, rN, rSHR |
1042 | beq L(duZeroReturn) |
1043 | li r0, 0 |
1044 | ble cr7, L(dutrim) |
1045 | LD rWORD2, rOFF8, rSTR2 |
1046 | srd r0, rWORD2, rSHR |
1047 | b L(dutrim) |
1048 | /* Remainder is 16. */ |
1049 | .align 4 |
1050 | L(duPs2): |
1051 | sld rWORD6_SHIFT, rWORD2, rSHL |
1052 | sld rWORD5, rWORD1, rWORD6 |
1053 | sld rWORD6, rWORD8, rWORD6 |
1054 | b L(duP2e) |
1055 | /* Remainder is 24. */ |
1056 | .align 4 |
1057 | L(duPs3): |
1058 | sld rWORD4_SHIFT, rWORD2, rSHL |
1059 | sld rWORD3, rWORD1, rWORD6 |
1060 | sld rWORD4, rWORD8, rWORD6 |
1061 | b L(duP3e) |
1062 | /* Count is a multiple of 32, remainder is 0. */ |
1063 | .align 4 |
1064 | L(duPs4): |
1065 | mtctr r0 |
1066 | or rWORD8, r12, rWORD8 |
1067 | sld rWORD2_SHIFT, rWORD2, rSHL |
1068 | sld rWORD1, rWORD1, rWORD6 |
1069 | sld rWORD2, rWORD8, rWORD6 |
1070 | b L(duP4e) |
1071 | |
1072 | /* At this point we know rSTR1 is double word aligned and the |
1073 | compare length is at least 8 bytes. */ |
1074 | .align 4 |
1075 | L(DWunaligned): |
1076 | std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) |
1077 | clrrdi rSTR2, rSTR2, 3 |
1078 | std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) |
1079 | srdi r0, rN, 5 /* Divide by 32. */ |
1080 | std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) |
1081 | andi. r12, rN, 24 /* Get the DW remainder. */ |
1082 | std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) |
1083 | cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) |
1084 | cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) |
1085 | cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) |
1086 | cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) |
1087 | sldi rSHL, rSHL, 3 |
1088 | LD rWORD6, 0, rSTR2 |
1089 | LD rWORD8, rOFF8, rSTR2 |
1090 | addi rSTR2, rSTR2, 8 |
1091 | cmpldi cr1, r12, 16 |
1092 | cmpldi cr7, rN, 32 |
1093 | clrldi rN, rN, 61 |
1094 | subfic rSHR, rSHL, 64 |
1095 | sld rWORD6_SHIFT, rWORD6, rSHL |
1096 | beq L(duP4) |
1097 | mtctr r0 |
1098 | bgt cr1, L(duP3) |
1099 | beq cr1, L(duP2) |
1100 | |
1101 | /* Remainder is 8. */ |
1102 | .align 4 |
1103 | L(duP1): |
1104 | srd r12, rWORD8, rSHR |
1105 | LD rWORD7, 0, rSTR1 |
1106 | sld rWORD8_SHIFT, rWORD8, rSHL |
1107 | or rWORD8, r12, rWORD6_SHIFT |
1108 | blt cr7, L(duP1x) |
1109 | L(duP1e): |
1110 | LD rWORD1, rOFF8, rSTR1 |
1111 | LD rWORD2, rOFF8, rSTR2 |
1112 | cmpld cr5, rWORD7, rWORD8 |
1113 | srd r0, rWORD2, rSHR |
1114 | sld rWORD2_SHIFT, rWORD2, rSHL |
1115 | or rWORD2, r0, rWORD8_SHIFT |
1116 | LD rWORD3, rOFF16, rSTR1 |
1117 | LD rWORD4, rOFF16, rSTR2 |
1118 | cmpld cr7, rWORD1, rWORD2 |
1119 | srd r12, rWORD4, rSHR |
1120 | sld rWORD4_SHIFT, rWORD4, rSHL |
1121 | bne cr5, L(duLcr5) |
1122 | or rWORD4, r12, rWORD2_SHIFT |
1123 | LD rWORD5, rOFF24, rSTR1 |
1124 | LD rWORD6, rOFF24, rSTR2 |
1125 | cmpld cr1, rWORD3, rWORD4 |
1126 | srd r0, rWORD6, rSHR |
1127 | sld rWORD6_SHIFT, rWORD6, rSHL |
1128 | bne cr7, L(duLcr7) |
1129 | or rWORD6, r0, rWORD4_SHIFT |
1130 | cmpld cr6, rWORD5, rWORD6 |
1131 | b L(duLoop3) |
1132 | .align 4 |
1133 | /* At this point we exit early with the first double word compare |
1134 | complete and remainder of 0 to 7 bytes. See L(du14) for details on |
1135 | how we handle the remaining bytes. */ |
1136 | L(duP1x): |
1137 | cmpld cr5, rWORD7, rWORD8 |
1138 | sldi. rN, rN, 3 |
1139 | bne cr5, L(duLcr5) |
1140 | cmpld cr7, rN, rSHR |
1141 | beq L(duZeroReturn) |
1142 | li r0, 0 |
1143 | ble cr7, L(dutrim) |
1144 | LD rWORD2, rOFF8, rSTR2 |
1145 | srd r0, rWORD2, rSHR |
1146 | b L(dutrim) |
1147 | /* Remainder is 16. */ |
1148 | .align 4 |
1149 | L(duP2): |
1150 | srd r0, rWORD8, rSHR |
1151 | LD rWORD5, 0, rSTR1 |
1152 | or rWORD6, r0, rWORD6_SHIFT |
1153 | sld rWORD6_SHIFT, rWORD8, rSHL |
1154 | L(duP2e): |
1155 | LD rWORD7, rOFF8, rSTR1 |
1156 | LD rWORD8, rOFF8, rSTR2 |
1157 | cmpld cr6, rWORD5, rWORD6 |
1158 | srd r12, rWORD8, rSHR |
1159 | sld rWORD8_SHIFT, rWORD8, rSHL |
1160 | or rWORD8, r12, rWORD6_SHIFT |
1161 | blt cr7, L(duP2x) |
1162 | LD rWORD1, rOFF16, rSTR1 |
1163 | LD rWORD2, rOFF16, rSTR2 |
1164 | cmpld cr5, rWORD7, rWORD8 |
1165 | bne cr6, L(duLcr6) |
1166 | srd r0, rWORD2, rSHR |
1167 | sld rWORD2_SHIFT, rWORD2, rSHL |
1168 | or rWORD2, r0, rWORD8_SHIFT |
1169 | LD rWORD3, rOFF24, rSTR1 |
1170 | LD rWORD4, rOFF24, rSTR2 |
1171 | cmpld cr7, rWORD1, rWORD2 |
1172 | bne cr5, L(duLcr5) |
1173 | srd r12, rWORD4, rSHR |
1174 | sld rWORD4_SHIFT, rWORD4, rSHL |
1175 | or rWORD4, r12, rWORD2_SHIFT |
1176 | addi rSTR1, rSTR1, 8 |
1177 | addi rSTR2, rSTR2, 8 |
1178 | cmpld cr1, rWORD3, rWORD4 |
1179 | b L(duLoop2) |
1180 | .align 4 |
1181 | L(duP2x): |
1182 | cmpld cr5, rWORD7, rWORD8 |
1183 | addi rSTR1, rSTR1, 8 |
1184 | addi rSTR2, rSTR2, 8 |
1185 | bne cr6, L(duLcr6) |
1186 | sldi. rN, rN, 3 |
1187 | bne cr5, L(duLcr5) |
1188 | cmpld cr7, rN, rSHR |
1189 | beq L(duZeroReturn) |
1190 | li r0, 0 |
1191 | ble cr7, L(dutrim) |
1192 | LD rWORD2, rOFF8, rSTR2 |
1193 | srd r0, rWORD2, rSHR |
1194 | b L(dutrim) |
1195 | |
1196 | /* Remainder is 24. */ |
1197 | .align 4 |
1198 | L(duP3): |
1199 | srd r12, rWORD8, rSHR |
1200 | LD rWORD3, 0, rSTR1 |
1201 | sld rWORD4_SHIFT, rWORD8, rSHL |
1202 | or rWORD4, r12, rWORD6_SHIFT |
1203 | L(duP3e): |
1204 | LD rWORD5, rOFF8, rSTR1 |
1205 | LD rWORD6, rOFF8, rSTR2 |
1206 | cmpld cr1, rWORD3, rWORD4 |
1207 | srd r0, rWORD6, rSHR |
1208 | sld rWORD6_SHIFT, rWORD6, rSHL |
1209 | or rWORD6, r0, rWORD4_SHIFT |
1210 | LD rWORD7, rOFF16, rSTR1 |
1211 | LD rWORD8, rOFF16, rSTR2 |
1212 | cmpld cr6, rWORD5, rWORD6 |
1213 | bne cr1, L(duLcr1) |
1214 | srd r12, rWORD8, rSHR |
1215 | sld rWORD8_SHIFT, rWORD8, rSHL |
1216 | or rWORD8, r12, rWORD6_SHIFT |
1217 | blt cr7, L(duP3x) |
1218 | LD rWORD1, rOFF24, rSTR1 |
1219 | LD rWORD2, rOFF24, rSTR2 |
1220 | cmpld cr5, rWORD7, rWORD8 |
1221 | bne cr6, L(duLcr6) |
1222 | srd r0, rWORD2, rSHR |
1223 | sld rWORD2_SHIFT, rWORD2, rSHL |
1224 | or rWORD2, r0, rWORD8_SHIFT |
1225 | addi rSTR1, rSTR1, 16 |
1226 | addi rSTR2, rSTR2, 16 |
1227 | cmpld cr7, rWORD1, rWORD2 |
1228 | b L(duLoop1) |
1229 | .align 4 |
1230 | L(duP3x): |
1231 | addi rSTR1, rSTR1, 16 |
1232 | addi rSTR2, rSTR2, 16 |
1233 | cmpld cr5, rWORD7, rWORD8 |
1234 | bne cr6, L(duLcr6) |
1235 | sldi. rN, rN, 3 |
1236 | bne cr5, L(duLcr5) |
1237 | cmpld cr7, rN, rSHR |
1238 | beq L(duZeroReturn) |
1239 | li r0, 0 |
1240 | ble cr7, L(dutrim) |
1241 | LD rWORD2, rOFF8, rSTR2 |
1242 | srd r0, rWORD2, rSHR |
1243 | b L(dutrim) |
1244 | |
1245 | /* Count is a multiple of 32, remainder is 0. */ |
1246 | .align 4 |
1247 | L(duP4): |
1248 | mtctr r0 |
1249 | srd r0, rWORD8, rSHR |
1250 | LD rWORD1, 0, rSTR1 |
1251 | sld rWORD2_SHIFT, rWORD8, rSHL |
1252 | or rWORD2, r0, rWORD6_SHIFT |
1253 | L(duP4e): |
1254 | LD rWORD3, rOFF8, rSTR1 |
1255 | LD rWORD4, rOFF8, rSTR2 |
1256 | cmpld cr7, rWORD1, rWORD2 |
1257 | srd r12, rWORD4, rSHR |
1258 | sld rWORD4_SHIFT, rWORD4, rSHL |
1259 | or rWORD4, r12, rWORD2_SHIFT |
1260 | LD rWORD5, rOFF16, rSTR1 |
1261 | LD rWORD6, rOFF16, rSTR2 |
1262 | cmpld cr1, rWORD3, rWORD4 |
1263 | bne cr7, L(duLcr7) |
1264 | srd r0, rWORD6, rSHR |
1265 | sld rWORD6_SHIFT, rWORD6, rSHL |
1266 | or rWORD6, r0, rWORD4_SHIFT |
1267 | LD rWORD7, rOFF24, rSTR1 |
1268 | LD rWORD8, rOFF24, rSTR2 |
1269 | addi rSTR1, rSTR1, 24 |
1270 | addi rSTR2, rSTR2, 24 |
1271 | cmpld cr6, rWORD5, rWORD6 |
1272 | bne cr1, L(duLcr1) |
1273 | srd r12, rWORD8, rSHR |
1274 | sld rWORD8_SHIFT, rWORD8, rSHL |
1275 | or rWORD8, r12, rWORD6_SHIFT |
1276 | cmpld cr5, rWORD7, rWORD8 |
1277 | bdz L(du24) /* Adjust CTR as we start with +4. */ |
1278 | /* This is the primary loop. */ |
1279 | .align 4 |
1280 | L(duLoop): |
1281 | LD rWORD1, rOFF8, rSTR1 |
1282 | LD rWORD2, rOFF8, rSTR2 |
1283 | cmpld cr1, rWORD3, rWORD4 |
1284 | bne cr6, L(duLcr6) |
1285 | srd r0, rWORD2, rSHR |
1286 | sld rWORD2_SHIFT, rWORD2, rSHL |
1287 | or rWORD2, r0, rWORD8_SHIFT |
1288 | L(duLoop1): |
1289 | LD rWORD3, rOFF16, rSTR1 |
1290 | LD rWORD4, rOFF16, rSTR2 |
1291 | cmpld cr6, rWORD5, rWORD6 |
1292 | bne cr5, L(duLcr5) |
1293 | srd r12, rWORD4, rSHR |
1294 | sld rWORD4_SHIFT, rWORD4, rSHL |
1295 | or rWORD4, r12, rWORD2_SHIFT |
1296 | L(duLoop2): |
1297 | LD rWORD5, rOFF24, rSTR1 |
1298 | LD rWORD6, rOFF24, rSTR2 |
1299 | cmpld cr5, rWORD7, rWORD8 |
1300 | bne cr7, L(duLcr7) |
1301 | srd r0, rWORD6, rSHR |
1302 | sld rWORD6_SHIFT, rWORD6, rSHL |
1303 | or rWORD6, r0, rWORD4_SHIFT |
1304 | L(duLoop3): |
1305 | LD rWORD7, rOFF32, rSTR1 |
1306 | LD rWORD8, rOFF32, rSTR2 |
1307 | addi rSTR1, rSTR1, 32 |
1308 | addi rSTR2, rSTR2, 32 |
1309 | cmpld cr7, rWORD1, rWORD2 |
1310 | bne cr1, L(duLcr1) |
1311 | srd r12, rWORD8, rSHR |
1312 | sld rWORD8_SHIFT, rWORD8, rSHL |
1313 | or rWORD8, r12, rWORD6_SHIFT |
1314 | bdnz L(duLoop) |
1315 | |
1316 | L(duL4): |
1317 | cmpld cr1, rWORD3, rWORD4 |
1318 | bne cr6, L(duLcr6) |
1319 | cmpld cr6, rWORD5, rWORD6 |
1320 | bne cr5, L(duLcr5) |
1321 | cmpld cr5, rWORD7, rWORD8 |
1322 | L(du44): |
1323 | bne cr7, L(duLcr7) |
1324 | L(du34): |
1325 | bne cr1, L(duLcr1) |
1326 | L(du24): |
1327 | bne cr6, L(duLcr6) |
1328 | L(du14): |
1329 | sldi. rN, rN, 3 |
1330 | bne cr5, L(duLcr5) |
/* At this point we have a remainder of 1 to 7 bytes to compare. We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2, which may be beyond the
   string length. So we compare the bit length of the remainder to
   the right shift count (rSHR). If the bit count is less than or
   equal to it, we do not need to load rWORD2 (all significant bits
   are already in rWORD8_SHIFT). */
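/* In C terms, the guard that follows (a sketch; `rem_bits' is the
   remainder in bits, `shr' is rSHR):

     if (rem_bits == 0)
       return 0;                              // L(duZeroReturn)
     hi = 0;
     if (rem_bits > shr)                      // need bits from the
       hi = load8 (p2 + 8) >> shr;            // next DW of rSTR2
     // otherwise the saved `prev << shl' already contains every
     // significant bit, and no further load touches memory.
*/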
1339 | cmpld cr7, rN, rSHR |
1340 | beq L(duZeroReturn) |
1341 | li r0, 0 |
1342 | ble cr7, L(dutrim) |
1343 | LD rWORD2, rOFF8, rSTR2 |
1344 | srd r0, rWORD2, rSHR |
1345 | .align 4 |
1346 | L(dutrim): |
1347 | LD rWORD1, rOFF8, rSTR1 |
ld rWORD8, rWORD8SAVE(r1)
1349 | subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */ |
1350 | or rWORD2, r0, rWORD8_SHIFT |
1351 | ld rWORD7, rWORD7SAVE(r1) |
1352 | ld rSHL, rSHLSAVE(r1) |
1353 | srd rWORD1, rWORD1, rN |
1354 | srd rWORD2, rWORD2, rN |
1355 | ld rSHR, rSHRSAVE(r1) |
1356 | ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) |
1357 | li rRTN, 0 |
1358 | cmpld cr7, rWORD1, rWORD2 |
1359 | ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) |
1360 | ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) |
1361 | beq cr7, L(dureturn24) |
1362 | li rRTN, 1 |
1363 | ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) |
1364 | ld rOFF8, rOFF8SAVE(r1) |
1365 | ld rOFF16, rOFF16SAVE(r1) |
1366 | ld rOFF24, rOFF24SAVE(r1) |
1367 | ld rOFF32, rOFF32SAVE(r1) |
1368 | bgtlr cr7 |
1369 | li rRTN, -1 |
1370 | blr |
1371 | .align 4 |
1372 | L(duLcr7): |
1373 | ld rWORD8, rWORD8SAVE(r1) |
1374 | ld rWORD7, rWORD7SAVE(r1) |
1375 | li rRTN, 1 |
1376 | bgt cr7, L(dureturn29) |
1377 | ld rSHL, rSHLSAVE(r1) |
1378 | ld rSHR, rSHRSAVE(r1) |
1379 | li rRTN, -1 |
1380 | b L(dureturn27) |
1381 | .align 4 |
1382 | L(duLcr1): |
1383 | ld rWORD8, rWORD8SAVE(r1) |
1384 | ld rWORD7, rWORD7SAVE(r1) |
1385 | li rRTN, 1 |
1386 | bgt cr1, L(dureturn29) |
1387 | ld rSHL, rSHLSAVE(r1) |
1388 | ld rSHR, rSHRSAVE(r1) |
1389 | li rRTN, -1 |
1390 | b L(dureturn27) |
1391 | .align 4 |
1392 | L(duLcr6): |
1393 | ld rWORD8, rWORD8SAVE(r1) |
1394 | ld rWORD7, rWORD7SAVE(r1) |
1395 | li rRTN, 1 |
1396 | bgt cr6, L(dureturn29) |
1397 | ld rSHL, rSHLSAVE(r1) |
1398 | ld rSHR, rSHRSAVE(r1) |
1399 | li rRTN, -1 |
1400 | b L(dureturn27) |
1401 | .align 4 |
1402 | L(duLcr5): |
1403 | ld rWORD8, rWORD8SAVE(r1) |
1404 | ld rWORD7, rWORD7SAVE(r1) |
1405 | li rRTN, 1 |
1406 | bgt cr5, L(dureturn29) |
1407 | ld rSHL, rSHLSAVE(r1) |
1408 | ld rSHR, rSHRSAVE(r1) |
1409 | li rRTN, -1 |
1410 | b L(dureturn27) |
1411 | |
1412 | .align 3 |
1413 | L(duZeroReturn): |
1414 | li rRTN, 0 |
1415 | .align 4 |
1416 | L(dureturn): |
1417 | ld rWORD8, rWORD8SAVE(r1) |
1418 | ld rWORD7, rWORD7SAVE(r1) |
1419 | L(dureturn29): |
1420 | ld rSHL, rSHLSAVE(r1) |
1421 | ld rSHR, rSHRSAVE(r1) |
1422 | L(dureturn27): |
1423 | ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) |
1424 | ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) |
1425 | ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) |
1426 | L(dureturn24): |
1427 | ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) |
1428 | ld rOFF8, rOFF8SAVE(r1) |
1429 | ld rOFF16, rOFF16SAVE(r1) |
1430 | ld rOFF24, rOFF24SAVE(r1) |
1431 | ld rOFF32, rOFF32SAVE(r1) |
1432 | blr |
1433 | |
1434 | L(duzeroLength): |
1435 | ld rOFF8, rOFF8SAVE(r1) |
1436 | ld rOFF16, rOFF16SAVE(r1) |
1437 | ld rOFF24, rOFF24SAVE(r1) |
1438 | ld rOFF32, rOFF32SAVE(r1) |
1439 | li rRTN, 0 |
1440 | blr |
1441 | |
1442 | END (MEMCMP) |
1443 | libc_hidden_builtin_def (memcmp) |
1444 | weak_alias (memcmp, bcmp) |
1445 | strong_alias (memcmp, __memcmpeq) |
1446 | libc_hidden_def (__memcmpeq) |
1447 | |