1 | /* Copyright (C) 2011-2019 Free Software Foundation, Inc. |
---|---|

2 | Contributed by Intel Corporation. |

3 | This file is part of the GNU C Library. |

4 | |

5 | The GNU C Library is free software; you can redistribute it and/or |

6 | modify it under the terms of the GNU Lesser General Public |

7 | License as published by the Free Software Foundation; either |

8 | version 2.1 of the License, or (at your option) any later version. |

9 | |

10 | The GNU C Library is distributed in the hope that it will be useful, |

11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |

12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

13 | Lesser General Public License for more details. |

14 | |

15 | You should have received a copy of the GNU Lesser General Public |

16 | License along with the GNU C Library; if not, see |

17 | <http://www.gnu.org/licenses/>. */ |

18 | |

19 | #include <sysdep.h> |

20 | /* Pick the entry-point name and the packed-compare instruction: the wmemchr build compares 32-bit elements (pcmpeqd), the memchr build compares bytes (pcmpeqb). */ |

21 | #ifdef USE_AS_WMEMCHR |

22 | # define MEMCHR wmemchr |

23 | # define PCMPEQ pcmpeqd |

24 | #else |

25 | # define MEMCHR memchr |

26 | # define PCMPEQ pcmpeqb |

27 | #endif |

28 | |

29 | /* Fast SSE2 version using pmaxub and a 64-byte loop. */ |

30 | /* MEMCHR (s in %rdi, c in %esi, n in %rdx). Returns in %rax the address of the first element equal to c within the first n elements of s, or 0 when there is none. Elements are bytes for memchr and 4-byte units for wmemchr. */ |

31 | .text |

32 | ENTRY(MEMCHR) |

33 | movd %esi, %xmm1 /* Low lane of xmm1 = value to search for. */ |

34 | mov %edi, %ecx /* ecx = low pointer bits, used below for alignment tests. */ |

35 | |

36 | #ifdef USE_AS_WMEMCHR |

37 | test %RDX_LP, %RDX_LP |

38 | jz L(return_null) /* n == 0: nothing to scan. */ |

39 | shl $2, %RDX_LP /* Element count -> byte count. NOTE(review): the two highest bits are shifted out, so a huge count wraps -- confirm callers cannot pass one. */ |

40 | #else |

41 | # ifdef __ILP32__ |

42 | /* Clear the upper 32 bits. */ |

43 | movl %edx, %edx |

44 | # endif |

45 | punpcklbw %xmm1, %xmm1 /* Low word = two copies of the byte. */ |

46 | test %RDX_LP, %RDX_LP |

47 | jz L(return_null) /* n == 0: nothing to scan. */ |

48 | punpcklbw %xmm1, %xmm1 /* Low dword = four copies of the byte. */ |

49 | #endif |

50 | |

51 | and $63, %ecx /* ecx = offset of s inside its 64-byte block. */ |

52 | pshufd $0, %xmm1, %xmm1 /* Broadcast the low dword to all four dwords of xmm1. */ |

53 | |

54 | cmp $48, %ecx |

55 | ja L(crosscache) /* Offset > 48: a 16-byte load at s would run past the 64-byte block, so use the aligned path. */ |

56 | |

57 | movdqu (%rdi), %xmm0 /* Unaligned load of the first 16 bytes. */ |

58 | PCMPEQ %xmm1, %xmm0 |

59 | pmovmskb %xmm0, %eax /* eax = one bit per byte that compared equal. */ |

60 | test %eax, %eax |

61 | |

62 | jnz L(matches_1) /* Hit; matches_1 still verifies it lies inside n. */ |

63 | sub $16, %rdx |

64 | jbe L(return_null) /* n <= 16 and no hit. */ |

65 | add $16, %rdi |

66 | and $15, %ecx /* ecx = s mod 16. */ |

67 | and $-16, %rdi /* rdi = next 16-byte aligned address after s. */ |

68 | add %rcx, %rdx /* Count again the (s mod 16) bytes that will be rescanned. */ |

69 | sub $64, %rdx /* From here on rdx = bytes left from rdi, minus 64. */ |

70 | jbe L(exit_loop) /* At most 64 bytes left: finish in the tail code. */ |

71 | jmp L(loop_prolog) |

72 | /* Reached when s lies in the last 15 bytes of a 64-byte block: load the enclosing aligned 16 bytes and drop the match bits that precede s. */ |

73 | .p2align 4 |

74 | L(crosscache): |

75 | and $15, %ecx /* ecx = s mod 16. */ |

76 | and $-16, %rdi /* Round rdi down to a 16-byte boundary. */ |

77 | movdqa (%rdi), %xmm0 |

78 | |

79 | PCMPEQ %xmm1, %xmm0 |

80 | /* Check if there is a match. */ |

81 | pmovmskb %xmm0, %eax |

82 | /* Remove the leading bytes. */ |

83 | sar %cl, %eax |

84 | test %eax, %eax |

85 | je L(unaligned_no_match) |

86 | /* Check which byte is a match. */ |

87 | bsf %eax, %eax /* eax = index of the hit relative to s. */ |

88 | |

89 | sub %rax, %rdx |

90 | jbe L(return_null) /* Hit index is not below n: out of range. */ |

91 | add %rdi, %rax |

92 | add %rcx, %rax /* rax = aligned base + (s mod 16) + index. */ |

93 | ret |

94 | /* No hit in the first aligned chunk: account for the bytes of it that belonged to the buffer, then continue at the next 16-byte chunk. */ |

95 | .p2align 4 |

96 | L(unaligned_no_match): |

97 | /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using |

98 | "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid |

99 | possible addition overflow. */ |

100 | neg %rcx |

101 | add $16, %rcx /* rcx = 16 - (s mod 16) = bytes already examined. */ |

102 | sub %rcx, %rdx |

103 | jbe L(return_null) /* The buffer ended inside the first chunk. */ |

104 | add $16, %rdi |

105 | sub $64, %rdx /* From here on rdx = bytes left from rdi, minus 64. */ |

106 | jbe L(exit_loop) /* At most 64 bytes left; otherwise fall into loop_prolog. */ |

107 | /* Scan 64 bytes as four aligned 16-byte chunks. On entry rdx (bytes left minus 64) is > 0, so all 64 bytes are inside the buffer and hits need no range check. */ |

108 | .p2align 4 |

109 | L(loop_prolog): |

110 | movdqa (%rdi), %xmm0 |

111 | PCMPEQ %xmm1, %xmm0 |

112 | pmovmskb %xmm0, %eax |

113 | test %eax, %eax |

114 | jnz L(matches) |

115 | |

116 | movdqa 16(%rdi), %xmm2 |

117 | PCMPEQ %xmm1, %xmm2 |

118 | pmovmskb %xmm2, %eax |

119 | test %eax, %eax |

120 | jnz L(matches16) |

121 | |

122 | movdqa 32(%rdi), %xmm3 |

123 | PCMPEQ %xmm1, %xmm3 |

124 | pmovmskb %xmm3, %eax |

125 | test %eax, %eax |

126 | jnz L(matches32) |

127 | |

128 | movdqa 48(%rdi), %xmm4 |

129 | PCMPEQ %xmm1, %xmm4 |

130 | add $64, %rdi /* Advance before testing; matches0 compensates with a -16 displacement. */ |

131 | pmovmskb %xmm4, %eax |

132 | test %eax, %eax |

133 | jnz L(matches0) |

134 | |

135 | test $0x3f, %rdi |

136 | jz L(align64_loop) /* rdi is already 64-byte aligned: go straight to the main loop. */ |

137 | |

138 | sub $64, %rdx |

139 | jbe L(exit_loop) /* At most 64 bytes left: finish in the tail code. */ |

140 | /* Second group of 64 bytes, again fully inside the buffer. */ |

141 | movdqa (%rdi), %xmm0 |

142 | PCMPEQ %xmm1, %xmm0 |

143 | pmovmskb %xmm0, %eax |

144 | test %eax, %eax |

145 | jnz L(matches) |

146 | |

147 | movdqa 16(%rdi), %xmm2 |

148 | PCMPEQ %xmm1, %xmm2 |

149 | pmovmskb %xmm2, %eax |

150 | test %eax, %eax |

151 | jnz L(matches16) |

152 | |

153 | movdqa 32(%rdi), %xmm3 |

154 | PCMPEQ %xmm1, %xmm3 |

155 | pmovmskb %xmm3, %eax |

156 | test %eax, %eax |

157 | jnz L(matches32) |

158 | |

159 | movdqa 48(%rdi), %xmm3 |

160 | PCMPEQ %xmm1, %xmm3 |

161 | pmovmskb %xmm3, %eax |

162 | |

163 | add $64, %rdi /* Advance before testing; matches0 compensates with a -16 displacement. */ |

164 | test %eax, %eax |

165 | jnz L(matches0) |

166 | /* Still no hit: round rdi down to a 64-byte boundary and credit rdx with the bytes stepped back over, then fall into the main loop. */ |

167 | mov %rdi, %rcx |

168 | and $-64, %rdi |

169 | and $63, %ecx /* ecx = distance stepped back. */ |

170 | add %rcx, %rdx |

171 | /* Main loop: rdi is 64-byte aligned and 64 bytes are checked per iteration. The four compare results are merged with pmaxub so that a single mask test covers all of them. */ |

172 | .p2align 4 |

173 | L(align64_loop): |

174 | sub $64, %rdx |

175 | jbe L(exit_loop) /* At most 64 bytes left: finish in the tail code. */ |

176 | movdqa (%rdi), %xmm0 |

177 | movdqa 16(%rdi), %xmm2 |

178 | movdqa 32(%rdi), %xmm3 |

179 | movdqa 48(%rdi), %xmm4 |

180 | |

181 | PCMPEQ %xmm1, %xmm0 |

182 | PCMPEQ %xmm1, %xmm2 |

183 | PCMPEQ %xmm1, %xmm3 |

184 | PCMPEQ %xmm1, %xmm4 |

185 | /* Compare results are 0x00/0xff per byte, so the byte-wise maximum is nonzero exactly where any chunk hit. xmm3 and xmm4 are overwritten here. */ |

186 | pmaxub %xmm0, %xmm3 |

187 | pmaxub %xmm2, %xmm4 |

188 | pmaxub %xmm3, %xmm4 |

189 | pmovmskb %xmm4, %eax |

190 | |

191 | add $64, %rdi |

192 | |

193 | test %eax, %eax |

194 | jz L(align64_loop) |

195 | /* A hit is somewhere in the 64 bytes just scanned: step back and find the chunk. */ |

196 | sub $64, %rdi |

197 | |

198 | pmovmskb %xmm0, %eax |

199 | test %eax, %eax |

200 | jnz L(matches) |

201 | |

202 | pmovmskb %xmm2, %eax |

203 | test %eax, %eax |

204 | jnz L(matches16) |

205 | /* xmm3 no longer holds the plain compare result for chunk 2 (pmaxub merged xmm0 into it), so redo that compare. */ |

206 | movdqa 32(%rdi), %xmm3 |

207 | PCMPEQ %xmm1, %xmm3 |

208 | |

209 | PCMPEQ 48(%rdi), %xmm1 /* Overwrites the search pattern in xmm1; this path returns without using it again. */ |

210 | pmovmskb %xmm3, %eax |

211 | test %eax, %eax |

212 | jnz L(matches32) |

213 | /* The first three chunks had no hit, so it is in the last one. */ |

214 | pmovmskb %xmm1, %eax |

215 | bsf %eax, %eax |

216 | lea 48(%rdi, %rax), %rax |

217 | ret |

218 | /* Tail for the last 1..64 bytes. Every jump here follows "sub $64, %rdx; jbe", so rdx holds (bytes left - 64), a value in (-64, 0]; only its low 32 bits are used from here on. */ |

219 | .p2align 4 |

220 | L(exit_loop): |

221 | add $32, %edx /* edx = bytes left - 32. */ |

222 | jle L(exit_loop_32) /* At most 32 bytes left. */ |

223 | /* More than 32 bytes left: the first two chunks are fully inside the buffer. */ |

224 | movdqa (%rdi), %xmm0 |

225 | PCMPEQ %xmm1, %xmm0 |

226 | pmovmskb %xmm0, %eax |

227 | test %eax, %eax |

228 | jnz L(matches) |

229 | |

230 | movdqa 16(%rdi), %xmm2 |

231 | PCMPEQ %xmm1, %xmm2 |

232 | pmovmskb %xmm2, %eax |

233 | test %eax, %eax |

234 | jnz L(matches16) |

235 | |

236 | movdqa 32(%rdi), %xmm3 |

237 | PCMPEQ %xmm1, %xmm3 |

238 | pmovmskb %xmm3, %eax |

239 | test %eax, %eax |

240 | jnz L(matches32_1) /* Third chunk may be partial: range-checked against rdx = bytes left - 32. */ |

241 | sub $16, %edx /* edx = bytes left - 48. */ |

242 | jle L(return_null) /* The buffer ended within the third chunk. */ |

243 | |

244 | PCMPEQ 48(%rdi), %xmm1 /* Overwrites the search pattern in xmm1; it is not used again on this path. */ |

245 | pmovmskb %xmm1, %eax |

246 | test %eax, %eax |

247 | jnz L(matches48_1) /* Range-checked against rdx = bytes left - 48. */ |

248 | xor %eax, %eax /* No hit: return 0. */ |

249 | ret |

250 | /* Tail for the last 1..32 bytes. Entered from exit_loop with edx = bytes left - 32 (<= 0). */ |

251 | .p2align 4 |

252 | L(exit_loop_32): |

253 | add $32, %edx /* edx = bytes left. */ |

254 | movdqa (%rdi), %xmm0 |

255 | PCMPEQ %xmm1, %xmm0 |

256 | pmovmskb %xmm0, %eax |

257 | test %eax, %eax |

258 | jnz L(matches_1) /* Range-checked against rdx = bytes left. */ |

259 | sub $16, %edx /* edx = bytes left - 16. */ |

260 | jbe L(return_null) /* The buffer ended within the first chunk. */ |

261 | |

262 | PCMPEQ 16(%rdi), %xmm1 /* Overwrites the search pattern in xmm1; it is not used again on this path. */ |

263 | pmovmskb %xmm1, %eax |

264 | test %eax, %eax |

265 | jnz L(matches16_1) /* Range-checked against rdx = bytes left - 16. */ |

266 | xor %eax, %eax /* No hit: return 0. */ |

267 | ret |

268 | /* Hit in the chunk that was at offset 48, reported after rdi was already advanced by 64; no range check. */ |

269 | .p2align 4 |

270 | L(matches0): |

271 | bsf %eax, %eax /* eax = index of the lowest set mask bit. */ |

272 | lea -16(%rax, %rdi), %rax /* 48 - 64 = -16 relative to the advanced rdi. */ |

273 | ret |

274 | /* Hit in the chunk at (%rdi); no range check. */ |

275 | .p2align 4 |

276 | L(matches): |

277 | bsf %eax, %eax /* eax = index of the lowest set mask bit. */ |

278 | add %rdi, %rax /* rax = address of the hit. */ |

279 | ret |

280 | /* Hit in the chunk at 16(%rdi); no range check. */ |

281 | .p2align 4 |

282 | L(matches16): |

283 | bsf %eax, %eax /* eax = index of the lowest set mask bit. */ |

284 | lea 16(%rax, %rdi), %rax /* rax = address of the hit. */ |

285 | ret |

286 | /* Hit in the chunk at 32(%rdi); no range check. */ |

287 | .p2align 4 |

288 | L(matches32): |

289 | bsf %eax, %eax /* eax = index of the lowest set mask bit. */ |

290 | lea 32(%rax, %rdi), %rax /* rax = address of the hit. */ |

291 | ret |

292 | /* Hit in the chunk at (%rdi), with range check: rdx holds the number of bytes of this chunk that belong to the buffer. */ |

293 | .p2align 4 |

294 | L(matches_1): |

295 | bsf %eax, %eax /* eax = index of the lowest set mask bit. */ |

296 | sub %rax, %rdx |

297 | jbe L(return_null) /* Index is not below rdx: the hit is past the end. */ |

298 | add %rdi, %rax /* rax = address of the hit. */ |

299 | ret |

300 | /* Hit in the chunk at 16(%rdi), with range check: rdx holds the number of bytes of this chunk that belong to the buffer. */ |

301 | .p2align 4 |

302 | L(matches16_1): |

303 | bsf %eax, %eax /* eax = index of the lowest set mask bit. */ |

304 | sub %rax, %rdx |

305 | jbe L(return_null) /* Index is not below rdx: the hit is past the end. */ |

306 | lea 16(%rdi, %rax), %rax /* rax = address of the hit. */ |

307 | ret |

308 | /* Hit in the chunk at 32(%rdi), with range check: rdx holds the number of bytes of this chunk that belong to the buffer. */ |

309 | .p2align 4 |

310 | L(matches32_1): |

311 | bsf %eax, %eax /* eax = index of the lowest set mask bit. */ |

312 | sub %rax, %rdx |

313 | jbe L(return_null) /* Index is not below rdx: the hit is past the end. */ |

314 | lea 32(%rdi, %rax), %rax /* rax = address of the hit. */ |

315 | ret |

316 | /* Hit in the chunk at 48(%rdi), with range check: rdx holds the number of bytes of this chunk that belong to the buffer. */ |

317 | .p2align 4 |

318 | L(matches48_1): |

319 | bsf %eax, %eax /* eax = index of the lowest set mask bit. */ |

320 | sub %rax, %rdx |

321 | jbe L(return_null) /* Index is not below rdx: the hit is past the end. */ |

322 | lea 48(%rdi, %rax), %rax /* rax = address of the hit. */ |

323 | ret |

324 | /* Shared "not found" exit: return 0 in rax. */ |

325 | .p2align 4 |

326 | L(return_null): |

327 | xor %eax, %eax /* Writing eax also clears the upper half of rax. */ |

328 | ret |

329 | END(MEMCHR) |

330 | /* memchr build only. NOTE(review): strong_alias and libc_hidden_builtin_def are defined outside this file; presumably they export the routine under the additional name __memchr and emit the hidden internal definition -- confirm against their definitions. */ |

331 | #ifndef USE_AS_WMEMCHR |

332 | strong_alias (memchr, __memchr) |

333 | libc_hidden_builtin_def(memchr) |

334 | #endif |

335 |