/* strcspn with SSE4.2 intrinsics
   Copyright (C) 2009-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19#include <nmmintrin.h>
20#include <string.h>
21#include "varshift.h"
22
/* We use 0x2:
        _SIDD_SBYTE_OPS
        | _SIDD_CMP_EQUAL_ANY
        | _SIDD_POSITIVE_POLARITY
        | _SIDD_LEAST_SIGNIFICANT
   on pcmpistri to compare xmm/mem128

   0 1 2 3 4 5 6 7 8 9 A B C D E F
   X X X X X X X X X X X X X X X X

   against xmm

   0 1 2 3 4 5 6 7 8 9 A B C D E F
   A A A A A A A A A A A A A A A A

   to find out if the first 16-byte data element has any byte A and
   the offset of the first such byte.  There are 3 cases:

   1. The first 16-byte data element has the byte A at the offset X.
   2. The first 16-byte data element has EOS and doesn't have the byte A.
   3. The first 16-byte data element is valid and doesn't have the byte A.

   Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:

   case   ECX   CFlag   ZFlag   SFlag
    1      X      1      0/1      0
    2     16      0       1       0
    3     16      0       0       0

   We exit from the loop for cases 1 and 2 with jbe which branches
   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
   X for case 1.  */
54
/* Default symbol names for the two implementations.  A file that wants
   different names can define them before this point; because only
   STRCSPN_SSE2 is tested, such a file must define STRCSPN_SSE42 as well.  */
#ifndef STRCSPN_SSE2
# define STRCSPN_SSE2 __strcspn_sse2
# define STRCSPN_SSE42 __strcspn_sse42
#endif
59
/* RETURN (val1, val2) returns from the enclosing function with the result
   that suits the function being built: VAL1 (a pointer) when building
   strpbrk, VAL2 (a length) otherwise.  The unused argument is discarded by
   the preprocessor and never evaluated.  */
#ifdef USE_AS_STRPBRK
# define RETURN(val1, val2) return (val1)
#else
# define RETURN(val1, val2) return (val2)
#endif
65
66extern
67#ifdef USE_AS_STRPBRK
68char *
69#else
70size_t
71#endif
72STRCSPN_SSE2 (const char *, const char *) attribute_hidden;
73
74
75#ifdef USE_AS_STRPBRK
76char *
77#else
78size_t
79#endif
80__attribute__ ((section (".text.sse4.2")))
81STRCSPN_SSE42 (const char *s, const char *a)
82{
83 if (*a == 0)
84 RETURN (NULL, strlen (s));
85
86 const char *aligned;
87 __m128i mask, maskz, zero;
88 unsigned int maskz_bits;
89 unsigned int offset = (unsigned int) ((size_t) a & 15);
90 zero = _mm_set1_epi8 (b: 0);
91 if (offset != 0)
92 {
93 /* Load masks. */
94 aligned = (const char *) ((size_t) a & -16L);
95 __m128i mask0 = _mm_load_si128 (p: (__m128i *) aligned);
96 maskz = _mm_cmpeq_epi8 (a: mask0, b: zero);
97
98 /* Find where the NULL terminator is. */
99 maskz_bits = _mm_movemask_epi8 (a: maskz) >> offset;
100 if (maskz_bits != 0)
101 {
102 mask = __m128i_shift_right (value: mask0, offset);
103 offset = (unsigned int) ((size_t) s & 15);
104 if (offset)
105 goto start_unaligned;
106
107 aligned = s;
108 goto start_loop;
109 }
110 }
111
112 /* A is aligned. */
113 mask = _mm_loadu_si128 (p: (__m128i *) a);
114 /* Find where the NULL terminator is. */
115 maskz = _mm_cmpeq_epi8 (a: mask, b: zero);
116 maskz_bits = _mm_movemask_epi8 (a: maskz);
117 if (maskz_bits == 0)
118 {
119 /* There is no NULL terminator. Don't use SSE4.2 if the length
120 of A > 16. */
121 if (a[16] != 0)
122 return STRCSPN_SSE2 (s, a);
123 }
124
125 aligned = s;
126 offset = (unsigned int) ((size_t) s & 15);
127 if (offset != 0)
128 {
129 start_unaligned:
130 /* Check partial string. */
131 aligned = (const char *) ((size_t) s & -16L);
132 __m128i value = _mm_load_si128 (p: (__m128i *) aligned);
133
134 value = __m128i_shift_right (value, offset);
135
136 unsigned int length = _mm_cmpistri (mask, value, 0x2);
137 /* No need to check ZFlag since ZFlag is always 1. */
138 unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
139 if (cflag)
140 RETURN ((char *) (s + length), length);
141 /* Find where the NULL terminator is. */
142 unsigned int index = _mm_cmpistri (value, value, 0x3a);
143 if (index < 16 - offset)
144 RETURN (NULL, index);
145 aligned += 16;
146 }
147
148start_loop:
149 while (1)
150 {
151 __m128i value = _mm_load_si128 (p: (__m128i *) aligned);
152 unsigned int index = _mm_cmpistri (mask, value, 0x2);
153 unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
154 unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
155 if (cflag)
156 RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
157 if (zflag)
158 RETURN (NULL,
159 /* Find where the NULL terminator is. */
160 (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
161 aligned += 16;
162 }
163}
164

/* Source: glibc/sysdeps/x86_64/multiarch/strcspn-c.c  */