/* strcmp with unaligned loads
   Copyright (C) 2013-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

#include "sysdep.h"

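/* Strategy: when neither pointer is within 64 bytes of the end of a
   4 KiB page, the first 64 bytes are compared with unaligned 16-byte
   loads.  For each 16-byte block, pcmpeqb/pminub followed by a compare
   against zero yields a bitmask whose set bits mark bytes that either
   differ or are NUL in both strings.  The main loop then handles 64
   bytes per iteration with s1 read through aligned loads, counting
   down the iterations left before s2 crosses a page; page crossings
   are handled with a shifted-mask fixup, and strings that start near a
   page boundary fall back to a byte-by-byte loop.  */
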
ENTRY ( __strcmp_sse2_unaligned)
        /* Take the byte-wise path if either string starts within the
           last 64 bytes of a 4 KiB page, where a 64-byte read could
           fault.  %rdx is the byte index for that path; %xmm7 stays
           zero throughout.  */
        movl    %edi, %eax
        xorl    %edx, %edx
        pxor    %xmm7, %xmm7
        orl     %esi, %eax
        andl    $4095, %eax
        cmpl    $4032, %eax
        jg      L(cross_page)
        /* Compare the first 16 bytes.  After this sequence %eax has a
           bit set for every byte that differs or is NUL in both
           strings.  */
        movdqu  (%rdi), %xmm1
        movdqu  (%rsi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pminub  %xmm1, %xmm0
        pxor    %xmm1, %xmm1
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        testq   %rax, %rax
        je      L(next_48_bytes)
L(return):
        /* The lowest set bit of %rax indexes the deciding byte; return
           the difference of the two bytes there.  */
        bsfq    %rax, %rdx
        movzbl  (%rdi, %rdx), %eax
        movzbl  (%rsi, %rdx), %edx
        subl    %edx, %eax
        ret

        .p2align 4
L(next_48_bytes):
        /* Same test for bytes 16..63: build three more 16-bit masks,
           shift them to bit positions 16, 32 and 48, and merge them
           into the 64-bit mask in %rax.  */
        movdqu  16(%rdi), %xmm6
        movdqu  16(%rsi), %xmm3
        movdqu  32(%rdi), %xmm5
        pcmpeqb %xmm6, %xmm3
        movdqu  32(%rsi), %xmm2
        pminub  %xmm6, %xmm3
        pcmpeqb %xmm1, %xmm3
        movdqu  48(%rdi), %xmm4
        pcmpeqb %xmm5, %xmm2
        pmovmskb %xmm3, %edx
        movdqu  48(%rsi), %xmm0
        pminub  %xmm5, %xmm2
        pcmpeqb %xmm1, %xmm2
        pcmpeqb %xmm4, %xmm0
        pmovmskb %xmm2, %eax
        salq    $16, %rdx
        pminub  %xmm4, %xmm0
        pcmpeqb %xmm1, %xmm0
        salq    $32, %rax
        orq     %rdx, %rax
        pmovmskb %xmm0, %ecx
        movq    %rcx, %rdx
        salq    $48, %rdx
        orq     %rdx, %rax
        jne     L(return)
L(main_loop_header):
        /* Round %rdi up to the next 64-byte boundary into %rax and
           advance %rsi by the same distance into %rdx, so s1 is read
           with aligned loads.  %rsi becomes the count of 64-byte
           iterations until %rdx next reaches a 4 KiB boundary.  */
        leaq    64(%rdi), %rdx
        movl    $4096, %ecx
        pxor    %xmm9, %xmm9
        andq    $-64, %rdx
        subq    %rdi, %rdx
        leaq    (%rdi, %rdx), %rax
        addq    %rsi, %rdx
        movq    %rdx, %rsi
        andl    $4095, %esi
        subq    %rsi, %rcx
        shrq    $6, %rcx
        movq    %rcx, %rsi
        jmp     L(loop_start)

        .p2align 4
L(loop):
        addq    $64, %rax
        addq    $64, %rdx
L(loop_start):
        /* When the countdown hits zero, s2 is about to cross a page.  */
        testq   %rsi, %rsi
        leaq    -1(%rsi), %rsi
        je      L(loop_cross_page)
L(back_to_loop):
        /* Main loop: 64 bytes per iteration, aligned loads from s1
           (%rax), unaligned loads from s2 (%rdx).  pminub folds the
           four per-block results together so a single pcmpeqb/
           pmovmskb decides whether any of the 64 bytes differs or
           terminates both strings.  */
        movdqu  (%rdx), %xmm0
        movdqu  16(%rdx), %xmm1
        movdqa  (%rax), %xmm2
        movdqa  16(%rax), %xmm3
        pcmpeqb %xmm2, %xmm0
        movdqu  32(%rdx), %xmm5
        pcmpeqb %xmm3, %xmm1
        pminub  %xmm2, %xmm0
        movdqu  48(%rdx), %xmm6
        pminub  %xmm3, %xmm1
        movdqa  32(%rax), %xmm2
        pminub  %xmm1, %xmm0
        movdqa  48(%rax), %xmm3
        pcmpeqb %xmm2, %xmm5
        pcmpeqb %xmm3, %xmm6
        pminub  %xmm2, %xmm5
        pminub  %xmm3, %xmm6
        pminub  %xmm5, %xmm0
        pminub  %xmm6, %xmm0
        pcmpeqb %xmm7, %xmm0
        pmovmskb %xmm0, %ecx
        testl   %ecx, %ecx
        je      L(loop)
        /* Something was found: recompute the per-block masks (block 0
           must be reloaded, since %xmm0 was folded away) and combine
           them into a 64-bit mask to locate the deciding byte.  */
        pcmpeqb %xmm7, %xmm5
        movdqu  (%rdx), %xmm0
        pcmpeqb %xmm7, %xmm1
        movdqa  (%rax), %xmm2
        pcmpeqb %xmm2, %xmm0
        pminub  %xmm2, %xmm0
        pcmpeqb %xmm7, %xmm6
        pcmpeqb %xmm7, %xmm0
        pmovmskb %xmm1, %ecx
        pmovmskb %xmm5, %r8d
        pmovmskb %xmm0, %edi
        salq    $16, %rcx
        salq    $32, %r8
        pmovmskb %xmm6, %esi
        orq     %r8, %rcx
        orq     %rdi, %rcx
        salq    $48, %rsi
        orq     %rsi, %rcx
        bsfq    %rcx, %rcx
        movzbl  (%rax, %rcx), %eax
        movzbl  (%rdx, %rcx), %edx
        subl    %edx, %eax
        ret

        .p2align 4
L(loop_cross_page):
        /* s2 (%rdx) is about to cross a page.  Step back to its
           previous 64-byte boundary (%r10 = -(%rdx & 63)) so the loads
           from s2 are aligned and cannot fault, compare those 64
           bytes, then shift the mask right by %r9 = %rdx & 63 to
           discard the bytes that precede %rdx.  */
        xor     %r10, %r10
        movq    %rdx, %r9
        and     $63, %r9
        subq    %r9, %r10

        movdqa  (%rdx, %r10), %xmm0
        movdqa  16(%rdx, %r10), %xmm1
        movdqu  (%rax, %r10), %xmm2
        movdqu  16(%rax, %r10), %xmm3
        pcmpeqb %xmm2, %xmm0
        movdqa  32(%rdx, %r10), %xmm5
        pcmpeqb %xmm3, %xmm1
        pminub  %xmm2, %xmm0
        movdqa  48(%rdx, %r10), %xmm6
        pminub  %xmm3, %xmm1
        movdqu  32(%rax, %r10), %xmm2
        movdqu  48(%rax, %r10), %xmm3
        pcmpeqb %xmm2, %xmm5
        pcmpeqb %xmm3, %xmm6
        pminub  %xmm2, %xmm5
        pminub  %xmm3, %xmm6

        pcmpeqb %xmm7, %xmm0
        pcmpeqb %xmm7, %xmm1
        pcmpeqb %xmm7, %xmm5
        pcmpeqb %xmm7, %xmm6

        pmovmskb %xmm1, %ecx
        pmovmskb %xmm5, %r8d
        pmovmskb %xmm0, %edi
        salq    $16, %rcx
        salq    $32, %r8
        pmovmskb %xmm6, %esi
        orq     %r8, %rdi
        orq     %rcx, %rdi
        salq    $48, %rsi
        orq     %rsi, %rdi
        movq    %r9, %rcx
        /* Reload the countdown: the next page boundary is a full 64
           iterations (4 KiB) away.  */
        movq    $63, %rsi
        shrq    %cl, %rdi
        test    %rdi, %rdi
        je      L(back_to_loop)
        bsfq    %rdi, %rcx
        movzbl  (%rax, %rcx), %eax
        movzbl  (%rdx, %rcx), %edx
        subl    %edx, %eax
        ret

        .p2align 4
L(cross_page_loop):
        /* Byte-wise comparison, used while within 64 bytes of a page
           boundary.  Once 64 bytes have been handled, it is safe to
           set up the vectorized main loop.  */
        cmpb    %cl, %al
        jne     L(different)
        addq    $1, %rdx
        cmpq    $64, %rdx
        je      L(main_loop_header)
L(cross_page):
        movzbl  (%rdi, %rdx), %eax
        movzbl  (%rsi, %rdx), %ecx
        testb   %al, %al
        jne     L(cross_page_loop)
        xorl    %eax, %eax
L(different):
        subl    %ecx, %eax
        ret
END (__strcmp_sse2_unaligned)

#endif

Source: glibc, sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
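
As a companion to the listing, here is a minimal C model of the core
trick, assuming GCC or Clang with SSE2 (<emmintrin.h> intrinsics and
__builtin_ctz).  The names diff_or_nul_mask and strcmp_model are
invented for illustration; this is a sketch of the pcmpeqb/pminub/
pcmpeqb-against-zero sequence, not the glibc implementation, and it
deliberately omits the page-boundary guard (the andl $4095 / cmpl
$4032 check), so unlike the assembly it may read past the end of a
string.

    #include <emmintrin.h>   /* SSE2 intrinsics */
    #include <stdio.h>

    /* Bit i of the result is set when byte i of A and B differs, or
       when both bytes are the terminating NUL.  Mirrors pcmpeqb/pminub
       followed by pcmpeqb against zero and pmovmskb.  */
    static unsigned
    diff_or_nul_mask (__m128i a, __m128i b)
    {
      __m128i eq = _mm_cmpeq_epi8 (a, b);     /* 0xFF where bytes match */
      __m128i t = _mm_min_epu8 (eq, a);       /* 0 on mismatch or NUL   */
      __m128i z = _mm_cmpeq_epi8 (t, _mm_setzero_si128 ());
      return (unsigned) _mm_movemask_epi8 (z);
    }

    /* Walk both strings 16 bytes at a time; the first nonzero mask
       marks the deciding byte.  A model only: it can over-read.  */
    static int
    strcmp_model (const char *s1, const char *s2)
    {
      const unsigned char *a = (const unsigned char *) s1;
      const unsigned char *b = (const unsigned char *) s2;
      for (unsigned long off = 0; ; off += 16)
        {
          __m128i va = _mm_loadu_si128 ((const __m128i *) (a + off));
          __m128i vb = _mm_loadu_si128 ((const __m128i *) (b + off));
          unsigned m = diff_or_nul_mask (va, vb);
          if (m != 0)
            {
              unsigned i = __builtin_ctz (m);   /* same role as bsfq */
              return a[off + i] - b[off + i];
            }
        }
    }

    int
    main (void)
    {
      printf ("%d\n", strcmp_model ("strcmp", "strcat"));  /* positive */
      printf ("%d\n", strcmp_model ("same", "same"));      /* zero */
      return 0;
    }

The pminub step is what lets a single compare catch both terminating
conditions at once: a mismatched byte zeroes its lane through the
equality mask, and a NUL present in both strings zeroes it through the
minimum with the source byte.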