1 | /* Optimized strcmp implementation for Power7 using 'cmpb' instruction |
2 | Copyright (C) 2014-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
/* The optimization is achieved here through the cmpb instruction.
   Strings that are both 8-byte aligned are compared one doubleword at
   a time; unaligned strings are compared byte by byte in an unrolled
   loop.  */
23 | |
24 | #include <sysdep.h> |
25 | |
26 | #ifndef STRCMP |
27 | # define STRCMP strcmp |
28 | #endif |
29 | |
/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])

   Returns zero when the strings are equal; otherwise the difference
   between the first pair of differing bytes, both taken as unsigned
   char.

   Register roles:
     r3, r4    s1 / s2 cursors (r3 also receives the result)
     r5        constant 0, the NUL pattern handed to cmpb
     r8, r10   doubleword most recently loaded from s1 / s2
     r9, r10   byte most recently loaded from s1 / s2 (unaligned path)
     r7, r9    scratch for cmpb results, masks and shift counts
   The instructions below touch only r3-r10 and cr0, cr6, cr7.  */

	.machine	power7
ENTRY_TOCLESS (STRCMP, 4)
	CALL_MCOUNT 2

	or	r9, r3, r4
	rldicl.	r10, r9, 0, 61	/* r10 = (s1 | s2) & 7: both 8-byte aligned?  */
	bne	cr0, L(process_unaligned_bytes)
	li	r5, 0		/* cmpb against zero flags NUL bytes.  */

	.align	4
/* Both pointers are doubleword aligned: compare 8 bytes per step, four
   steps per iteration.  In each step r9 receives 0xff in every byte
   lane where the s1 doubleword holds a NUL.  The terminator check comes
   before the equality check so that bytes beyond the end of the string
   are never compared unmasked.  */
L(unrollDword):
	ld	r8, 0(r3)
	ld	r10, 0(r4)
	cmpb	r9, r8, r5
	cmpdi	cr7, r9, 0
	bne	cr7, L(null_found)
	cmpld	cr7, r8, r10
	bne	cr7, L(different)

	ld	r8, 8(r3)
	ld	r10, 8(r4)
	cmpb	r9, r8, r5
	cmpdi	cr7, r9, 0
	bne	cr7, L(null_found)
	cmpld	cr7, r8, r10
	bne	cr7, L(different)

	ld	r8, 16(r3)
	ld	r10, 16(r4)
	cmpb	r9, r8, r5
	cmpdi	cr7, r9, 0
	bne	cr7, L(null_found)
	cmpld	cr7, r8, r10
	bne	cr7, L(different)

	ld	r8, 24(r3)
	ld	r10, 24(r4)
	cmpb	r9, r8, r5
	cmpdi	cr7, r9, 0
	bne	cr7, L(null_found)
	cmpld	cr7, r8, r10
	bne	cr7, L(different)

	addi	r3, r3, 32
	addi	r4, r4, 32
	/* Reaching this point means all four doublewords matched and none
	   held a NUL, so the loop back is unconditional.  Falling through
	   into L(null_found) with r9 == 0 would build a bogus mask.  */
	b	L(unrollDword)

	.align	4
/* The s1 doubleword in r8 contains the terminator and r9 holds its cmpb
   NUL map.  Force every byte that follows the first NUL (in memory
   order) to 0xff in both r8 and r10, so that data past the end of the
   string always compares equal, then fall through to L(different).  */
L(null_found):
#ifdef __LITTLE_ENDIAN__
	neg	r7, r9
	and	r9, r9, r7	/* Isolate the lowest set bit: first NUL.  */
	li	r7, -1
	cntlzd	r9, r9
	subfic	r9, r9, 71	/* 8 * (index of first NUL + 1).  */
	sld	r9, r7, r9	/* Ones above the NUL byte; 0 if it is last.  */
#else
	cntlzd	r9, r9		/* 8 * index of first NUL.  */
	li	r7, -1
	addi	r9, r9, 8
	srd	r9, r7, r9	/* Ones below the NUL byte; 0 if it is last.  */
#endif
	or	r8, r8, r9
	or	r10, r10, r9

/* r8 and r10 hold the doublewords to rank.  Locate the first byte (in
   memory order) at which they differ, shift that byte of each operand
   down to the low 8 bits and return the difference.  If every byte
   matches (only possible when arriving from L(null_found)), the shift
   count computed below is negative; its low seven bits are then 64 or
   more, srd yields zero for both operands and the result is 0.  */
L(different):
	cmpb	r9, r8, r10	/* 0xff in every byte lane that matches.  */
#ifdef __LITTLE_ENDIAN__
	addi	r7, r9, 1
	andc	r9, r7, r9	/* Lowest clear bit: first mismatch.  */
	cntlzd	r9, r9
	subfic	r9, r9, 63	/* Bit offset of the mismatching byte.  */
#else
	not	r9, r9
	cntlzd	r9, r9		/* 8 * index of first mismatch.  */
	subfic	r9, r9, 56	/* Shift that brings it to the low byte.  */
#endif
	srd	r3, r8, r9
	srd	r10, r10, r9
	rldicl	r10, r10, 0, 56	/* Keep only the low byte.  */
	rldicl	r3, r3, 0, 56
	subf	r3, r10, r3	/* (unsigned char) *s1 - (unsigned char) *s2.  */
	blr

	.align	4
/* At least one pointer is not doubleword aligned: compare one byte at a
   time, four bytes per iteration.  Each step leaves the loop when s1
   ends (r9 == 0) or when the two bytes differ.  */
L(process_unaligned_bytes):
	lbz	r9, 0(r3)	/* r9 = s1[0]  */
	lbz	r10, 0(r4)	/* r10 = s2[0]  */
	cmpdi	cr7, r9, 0	/* End of s1?  */
	beq	cr7, L(ComputeDiff)
	cmplw	cr7, r9, r10	/* Bytes differ?  */
	bne	cr7, L(ComputeDiff)

	lbz	r9, 1(r3)	/* Second byte.  */
	lbz	r10, 1(r4)
	cmpdi	cr7, r9, 0
	beq	cr7, L(ComputeDiff)
	cmplw	cr7, r9, r10
	bne	cr7, L(ComputeDiff)

	lbz	r9, 2(r3)	/* Third byte.  */
	lbz	r10, 2(r4)
	cmpdi	cr7, r9, 0
	beq	cr7, L(ComputeDiff)
	cmplw	cr7, r9, r10
	bne	cr7, L(ComputeDiff)

	lbz	r9, 3(r3)	/* Fourth byte.  */
	lbz	r10, 3(r4)
	addi	r3, r3, 4	/* Advance s1 by the unroll factor.  */
	cmpdi	cr7, r9, 0
	cmplw	cr6, r9, r10
	beq	cr7, L(ComputeDiff)
	addi	r4, r4, 4	/* Advance s2 by the unroll factor.  */
	beq	cr6, L(process_unaligned_bytes)
	/* Mismatch on the fourth byte: fall through.  */

	.align	4
/* r9 and r10 hold the bytes from s1 and s2 that end the comparison.
   lbz zero-extends, so both are in [0, 255] and the 64-bit difference
   is already the correctly signed result; no sign extension is needed.
   This also covers the end of s1 (r9 == 0), where the result is the
   negated s2 byte.  */
L(ComputeDiff):
	subf	r3, r10, r9	/* return *s1 - *s2  */
	blr

END (STRCMP)
libc_hidden_builtin_def (strcmp)
169 | |