1 | /* Copyright (C) 2010-2022 Free Software Foundation, Inc. |
2 | This file is part of the GNU C Library. |
3 | |
4 | The GNU C Library is free software; you can redistribute it and/or |
5 | modify it under the terms of the GNU Lesser General Public |
6 | License as published by the Free Software Foundation; either |
7 | version 2.1 of the License, or (at your option) any later version. |
8 | |
9 | The GNU C Library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with the GNU C Library. If not, see |
16 | <https://www.gnu.org/licenses/>. */ |
17 | |
18 | #include <string.h> |
19 | |
20 | typedef unsigned long word; |
21 | |
22 | static inline word |
23 | ldq_u(const void *s) |
24 | { |
25 | return *(const word *)((word)s & -8); |
26 | } |
27 | |
28 | #define unlikely(X) __builtin_expect ((X), 0) |
29 | #define prefetch(X) __builtin_prefetch ((void *)(X), 0) |
30 | |
31 | #define cmpbeq0(X) __builtin_alpha_cmpbge(0, (X)) |
32 | #define find(X, Y) cmpbeq0 ((X) ^ (Y)) |
33 | |
34 | /* Search no more than N bytes of S for C. */ |
35 | |
36 | void * |
37 | __memchr (const void *s, int xc, size_t n) |
38 | { |
39 | const word *s_align; |
40 | word t, current, found, mask, offset; |
41 | |
42 | if (unlikely (n == 0)) |
43 | return 0; |
44 | |
45 | current = ldq_u (s); |
46 | |
47 | /* Replicate low byte of XC into all bytes of C. */ |
48 | t = xc & 0xff; /* 0000000c */ |
49 | t = (t << 8) | t; /* 000000cc */ |
50 | t = (t << 16) | t; /* 0000cccc */ |
51 | const word c = (t << 32) | t; /* cccccccc */ |
52 | |
53 | /* Align the source, and decrement the count by the number |
54 | of bytes searched in the first word. */ |
55 | s_align = (const word *)((word)s & -8); |
56 | { |
57 | size_t inc = n + ((word)s & 7); |
58 | n = inc | -(inc < n); |
59 | } |
60 | |
61 | /* Deal with misalignment in the first word for the comparison. */ |
62 | mask = (1ul << ((word)s & 7)) - 1; |
63 | |
64 | /* If the entire string fits within one word, we may need masking |
65 | at both the front and the back of the string. */ |
66 | if (unlikely (n <= 8)) |
67 | { |
68 | mask |= -1ul << n; |
69 | goto last_quad; |
70 | } |
71 | |
72 | found = find (current, c) & ~mask; |
73 | if (unlikely (found)) |
74 | goto found_it; |
75 | |
76 | s_align++; |
77 | n -= 8; |
78 | |
79 | /* If the block is sufficiently large, align to cacheline and prefetch. */ |
80 | if (unlikely (n >= 256)) |
81 | { |
82 | /* Prefetch 3 cache lines beyond the one we're working on. */ |
83 | prefetch (s_align + 8); |
84 | prefetch (s_align + 16); |
85 | prefetch (s_align + 24); |
86 | |
87 | while ((word)s_align & 63) |
88 | { |
89 | current = *s_align; |
90 | found = find (current, c); |
91 | if (found) |
92 | goto found_it; |
93 | s_align++; |
94 | n -= 8; |
95 | } |
96 | |
97 | /* Within each cacheline, advance the load for the next word |
98 | before the test for the previous word is complete. This |
99 | allows us to hide the 3 cycle L1 cache load latency. We |
100 | only perform this advance load within a cacheline to prevent |
101 | reading across page boundary. */ |
102 | #define CACHELINE_LOOP \ |
103 | do { \ |
104 | word i, next = s_align[0]; \ |
105 | for (i = 0; i < 7; ++i) \ |
106 | { \ |
107 | current = next; \ |
108 | next = s_align[1]; \ |
109 | found = find (current, c); \ |
110 | if (unlikely (found)) \ |
111 | goto found_it; \ |
112 | s_align++; \ |
113 | } \ |
114 | current = next; \ |
115 | found = find (current, c); \ |
116 | if (unlikely (found)) \ |
117 | goto found_it; \ |
118 | s_align++; \ |
119 | n -= 64; \ |
120 | } while (0) |
121 | |
122 | /* While there's still lots more data to potentially be read, |
123 | continue issuing prefetches for the 4th cacheline out. */ |
124 | while (n >= 256) |
125 | { |
126 | prefetch (s_align + 24); |
127 | CACHELINE_LOOP; |
128 | } |
129 | |
130 | /* Up to 3 cache lines remaining. Continue issuing advanced |
131 | loads, but stop prefetching. */ |
132 | while (n >= 64) |
133 | CACHELINE_LOOP; |
134 | |
135 | /* We may have exhausted the buffer. */ |
136 | if (n == 0) |
137 | return NULL; |
138 | } |
139 | |
140 | /* Quadword aligned loop. */ |
141 | current = *s_align; |
142 | while (n > 8) |
143 | { |
144 | found = find (current, c); |
145 | if (unlikely (found)) |
146 | goto found_it; |
147 | current = *++s_align; |
148 | n -= 8; |
149 | } |
150 | |
151 | /* The last word may need masking at the tail of the compare. */ |
152 | mask = -1ul << n; |
153 | last_quad: |
154 | found = find (current, c) & ~mask; |
155 | if (found == 0) |
156 | return NULL; |
157 | |
158 | found_it: |
159 | #ifdef __alpha_cix__ |
160 | offset = __builtin_alpha_cttz (found); |
161 | #else |
162 | /* Extract LSB. */ |
163 | found &= -found; |
164 | |
165 | /* Binary search for the LSB. */ |
166 | offset = (found & 0x0f ? 0 : 4); |
167 | offset += (found & 0x33 ? 0 : 2); |
168 | offset += (found & 0x55 ? 0 : 1); |
169 | #endif |
170 | |
171 | return (void *)((word)s_align + offset); |
172 | } |
173 | |
174 | #ifdef weak_alias |
175 | weak_alias (__memchr, memchr) |
176 | #endif |
177 | libc_hidden_builtin_def (memchr) |
178 | |