1 | /* Fast SSE2 memrchr using a 64-byte loop and the pmaxub instruction. |
---|---|

2 | |

3 | Copyright (C) 2011-2019 Free Software Foundation, Inc. |

4 | Contributed by Intel Corporation. |

5 | This file is part of the GNU C Library. |

6 | |

7 | The GNU C Library is free software; you can redistribute it and/or |

8 | modify it under the terms of the GNU Lesser General Public |

9 | License as published by the Free Software Foundation; either |

10 | version 2.1 of the License, or (at your option) any later version. |

11 | |

12 | The GNU C Library is distributed in the hope that it will be useful, |

13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |

14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

15 | Lesser General Public License for more details. |

16 | |

17 | You should have received a copy of the GNU Lesser General Public |

18 | License along with the GNU C Library; if not, see |

19 | <http://www.gnu.org/licenses/>. */ |

20 | |

21 | #include <sysdep.h> |

22 | |

23 | .text |
/* __memrchr: return the address of the LAST byte in a buffer that equals a
   given byte value, or 0 if there is none.  SSE2 implementation.
   Register use in the code below: %rdi = buffer start (s), low byte of
   %esi = byte searched for (c), %rdx = length in bytes (n).  These are
   presumably the SysV AMD64 arguments of memrchr (s, c, n) -- confirm
   against the C prototype.
   Result: %rax.  Written registers: %rax, %rcx, %rdx, %rsi, %rdi,
   %xmm0-%xmm4 and the flags; the stack is not touched.
   RDX_LP and RDI_LP are macros not defined in this file (presumably from
   <sysdep.h>, choosing the pointer-width register name) -- TODO confirm.  */
24 | ENTRY (__memrchr) |
/* Low dword of xmm1 = c; it is broadcast to all 16 byte lanes further down. */
25 | movd %esi, %xmm1 |

26 | |
/* rdx = n - 16.  Buffers of 16 bytes or fewer take the short path. */
27 | sub $16, %RDX_LP |

28 | jbe L(length_less16) |

29 | |
/* Two self-interleaves copy byte 0 into the whole low dword; the pshufd $0 below then copies that dword to all four dwords. */
30 | punpcklbw %xmm1, %xmm1 |

31 | punpcklbw %xmm1, %xmm1 |

32 | |
/* rdi = s + n - 16: address of the last 16 bytes of the buffer. */
33 | add %RDX_LP, %RDI_LP |

34 | pshufd $0, %xmm1, %xmm1 |

35 | |
/* Unaligned load of that tail block; pcmpeqb turns every lane equal to c into 0xff. */
36 | movdqu (%rdi), %xmm0 |

37 | pcmpeqb %xmm1, %xmm0 |

38 | |

39 | /* Check if there is a match. */ |

40 | pmovmskb %xmm0, %eax |

41 | test %eax, %eax |

42 | jnz L(matches0) |

43 | |
/* No match in the tail.  Make [rdi, rdi+64) the 64 bytes just below it and take ecx = rdi & 15. */
44 | sub $64, %rdi |

45 | mov %edi, %ecx |

46 | and $15, %ecx |

47 | jz L(loop_prolog) |

48 | |
/* rdi is misaligned: round it up to the next 16-byte boundary (rdi += 16 - ecx) and add the
   same amount to rdx, so that rdx == (rdi + 64) - s still holds.  The window now overlaps
   part of the tail block that was already searched.  */
49 | add $16, %rdi |

50 | add $16, %rdx |

51 | and $-16, %rdi |

52 | sub %rcx, %rdx |

53 | |

54 | .p2align 4 |

55 | L(loop_prolog): |
/* Here rdx == (rdi + 64) - s.  If that is 64 or less, the window may reach below s: finish in exit_loop. */
56 | sub $64, %rdx |

57 | jbe L(exit_loop) |

58 | |
/* The whole window lies inside the buffer.  Test its four aligned 16-byte blocks from the highest address down. */
59 | movdqa 48(%rdi), %xmm0 |

60 | pcmpeqb %xmm1, %xmm0 |

61 | pmovmskb %xmm0, %eax |

62 | test %eax, %eax |

63 | jnz L(matches48) |

64 | |

65 | movdqa 32(%rdi), %xmm2 |

66 | pcmpeqb %xmm1, %xmm2 |

67 | pmovmskb %xmm2, %eax |

68 | test %eax, %eax |

69 | jnz L(matches32) |

70 | |

71 | movdqa 16(%rdi), %xmm3 |

72 | pcmpeqb %xmm1, %xmm3 |

73 | pmovmskb %xmm3, %eax |

74 | test %eax, %eax |

75 | jnz L(matches16) |

76 | |

77 | movdqa (%rdi), %xmm4 |

78 | pcmpeqb %xmm1, %xmm4 |

79 | pmovmskb %xmm4, %eax |

80 | test %eax, %eax |

81 | jnz L(matches0) |

82 | |
/* Second unrolled window: move down 64 bytes and repeat the same four tests. */
83 | sub $64, %rdi |

84 | sub $64, %rdx |

85 | jbe L(exit_loop) |

86 | |

87 | movdqa 48(%rdi), %xmm0 |

88 | pcmpeqb %xmm1, %xmm0 |

89 | pmovmskb %xmm0, %eax |

90 | test %eax, %eax |

91 | jnz L(matches48) |

92 | |

93 | movdqa 32(%rdi), %xmm2 |

94 | pcmpeqb %xmm1, %xmm2 |

95 | pmovmskb %xmm2, %eax |

96 | test %eax, %eax |

97 | jnz L(matches32) |

98 | |

99 | movdqa 16(%rdi), %xmm3 |

100 | pcmpeqb %xmm1, %xmm3 |

101 | pmovmskb %xmm3, %eax |

102 | test %eax, %eax |

103 | jnz L(matches16) |

104 | |

105 | movdqa (%rdi), %xmm3 |

106 | pcmpeqb %xmm1, %xmm3 |

107 | pmovmskb %xmm3, %eax |

108 | test %eax, %eax |

109 | jnz L(matches0) |

110 | |
/* Prepare the main loop: ecx = rdi & 63.  At this point rdx == rdi - s. */
111 | mov %edi, %ecx |

112 | and $63, %ecx |

113 | jz L(align64_loop) |

114 | |
/* Round rdi up to the next 64-byte boundary (rdi += 64 - ecx) and adjust rdx by the same amount,
   keeping rdx == rdi - s.  The first loop window then overlaps bytes already searched.  */
115 | add $64, %rdi |

116 | add $64, %rdx |

117 | and $-64, %rdi |

118 | sub %rcx, %rdx |

119 | |

120 | .p2align 4 |

121 | L(align64_loop): |
/* Step the 64-byte-aligned window down; rdx == rdi - s afterwards.  Leave once the window starts at or below s. */
122 | sub $64, %rdi |

123 | sub $64, %rdx |

124 | jbe L(exit_loop) |

125 | |

126 | movdqa (%rdi), %xmm0 |

127 | movdqa 16(%rdi), %xmm2 |

128 | movdqa 32(%rdi), %xmm3 |

129 | movdqa 48(%rdi), %xmm4 |

130 | |

131 | pcmpeqb %xmm1, %xmm0 |

132 | pcmpeqb %xmm1, %xmm2 |

133 | pcmpeqb %xmm1, %xmm3 |

134 | pcmpeqb %xmm1, %xmm4 |

135 | |
/* Lanes are 0x00 or 0xff, so the unsigned byte maximum acts as a bitwise OR: xmm2 ends up non-zero iff any of the 64 bytes matched. */
136 | pmaxub %xmm3, %xmm0 |

137 | pmaxub %xmm4, %xmm2 |

138 | pmaxub %xmm0, %xmm2 |

139 | pmovmskb %xmm2, %eax |

140 | |

141 | test %eax, %eax |

142 | jz L(align64_loop) |

143 | |
/* A match exists in this window.  xmm4 (offset 48) and xmm3 (offset 32) still hold their own compare results. */
144 | pmovmskb %xmm4, %eax |

145 | test %eax, %eax |

146 | jnz L(matches48) |

147 | |

148 | pmovmskb %xmm3, %eax |

149 | test %eax, %eax |

150 | jnz L(matches32) |

151 | |
/* xmm0 and xmm2 were overwritten by the merge, so redo the compares for offsets 16 and 0. */
152 | movdqa 16(%rdi), %xmm2 |

153 | |
/* The second pcmpeqb overwrites the broadcast value in xmm1; every path from here returns without using it again. */
154 | pcmpeqb %xmm1, %xmm2 |

155 | pcmpeqb (%rdi), %xmm1 |

156 | |

157 | pmovmskb %xmm2, %eax |

158 | test %eax, %eax |

159 | jnz L(matches16) |

160 | |
/* Not at offsets 48, 32 or 16, so the match seen by the merged test is in the block at rdi. */
161 | pmovmskb %xmm1, %eax |

162 | bsr %eax, %eax |

163 | |

164 | add %rdi, %rax |

165 | ret |

166 | |

167 | .p2align 4 |

168 | L(exit_loop): |
/* Final window.  edx = rdx + 64 = number of buffer bytes in [rdi, rdi+64), counted down from
   rdi + 64, i.e. s == rdi + 64 - edx.  Blocks wholly below s are skipped; a block that may
   straddle s goes to a matchesNN_1 handler, which rejects hits below s.  */
169 | add $64, %edx |

170 | cmp $32, %edx |

171 | jbe L(exit_loop_32) |

172 | |
/* More than 32 bytes remain: the blocks at offsets 48 and 32 are entirely inside the buffer. */
173 | movdqa 48(%rdi), %xmm0 |

174 | pcmpeqb %xmm1, %xmm0 |

175 | pmovmskb %xmm0, %eax |

176 | test %eax, %eax |

177 | jnz L(matches48) |

178 | |

179 | movdqa 32(%rdi), %xmm2 |

180 | pcmpeqb %xmm1, %xmm2 |

181 | pmovmskb %xmm2, %eax |

182 | test %eax, %eax |

183 | jnz L(matches32) |

184 | |
/* The block at offset 16 may begin below s (when edx < 48), hence the bounds-checked handler. */
185 | movdqa 16(%rdi), %xmm3 |

186 | pcmpeqb %xmm1, %xmm3 |

187 | pmovmskb %xmm3, %eax |

188 | test %eax, %eax |

189 | jnz L(matches16_1) |
/* With 48 bytes or fewer remaining, the block at rdi lies wholly below s: no match. */
190 | cmp $48, %edx |

191 | jbe L(return_null) |

192 | |

193 | pcmpeqb (%rdi), %xmm1 |

194 | pmovmskb %xmm1, %eax |

195 | test %eax, %eax |

196 | jnz L(matches0_1) |

197 | xor %eax, %eax |

198 | ret |

199 | |

200 | .p2align 4 |

201 | L(exit_loop_32): |
/* At most 32 bytes remain, so even the block at offset 48 may begin below s. */
202 | movdqa 48(%rdi), %xmm0 |

203 | pcmpeqb %xmm1, %xmm0 |

204 | pmovmskb %xmm0, %eax |

205 | test %eax, %eax |

206 | jnz L(matches48_1) |
/* With 16 bytes or fewer remaining, the block at offset 32 lies wholly below s: no match. */
207 | cmp $16, %edx |

208 | jbe L(return_null) |

209 | |

210 | pcmpeqb 32(%rdi), %xmm1 |

211 | pmovmskb %xmm1, %eax |

212 | test %eax, %eax |

213 | jnz L(matches32_1) |

214 | xor %eax, %eax |

215 | ret |

216 | |

217 | .p2align 4 |
/* matches0/16/32/48: eax is a non-zero match mask for the block at rdi + 0/16/32/48.  bsr gives the index of its highest set bit, i.e. the last matching byte; return that byte's address. */
218 | L(matches0): |

219 | bsr %eax, %eax |

220 | add %rdi, %rax |

221 | ret |

222 | |

223 | .p2align 4 |

224 | L(matches16): |

225 | bsr %eax, %eax |

226 | lea 16(%rax, %rdi), %rax |

227 | ret |

228 | |

229 | .p2align 4 |

230 | L(matches32): |

231 | bsr %eax, %eax |

232 | lea 32(%rax, %rdi), %rax |

233 | ret |

234 | |

235 | .p2align 4 |

236 | L(matches48): |

237 | bsr %eax, %eax |

238 | lea 48(%rax, %rdi), %rax |

239 | ret |

240 | |

241 | .p2align 4 |
/* matches0_1/16_1/32_1/48_1: as above, but the block may begin below s.  On entry rdx holds the
   remaining byte count set up in exit_loop.  rdx - (64 - block offset) + index is negative
   exactly when the last match in the block lies below s, in which case 0 is returned.  */
242 | L(matches0_1): |

243 | bsr %eax, %eax |

244 | sub $64, %rdx |

245 | add %rax, %rdx |

246 | jl L(return_null) |

247 | add %rdi, %rax |

248 | ret |

249 | |

250 | .p2align 4 |

251 | L(matches16_1): |

252 | bsr %eax, %eax |

253 | sub $48, %rdx |

254 | add %rax, %rdx |

255 | jl L(return_null) |

256 | lea 16(%rdi, %rax), %rax |

257 | ret |

258 | |

259 | .p2align 4 |

260 | L(matches32_1): |

261 | bsr %eax, %eax |

262 | sub $32, %rdx |

263 | add %rax, %rdx |

264 | jl L(return_null) |

265 | lea 32(%rdi, %rax), %rax |

266 | ret |

267 | |

268 | .p2align 4 |

269 | L(matches48_1): |

270 | bsr %eax, %eax |

271 | sub $16, %rdx |

272 | add %rax, %rdx |

273 | jl L(return_null) |

274 | lea 48(%rdi, %rax), %rax |

275 | ret |

276 | |

277 | .p2align 4 |
/* No match: return 0. */
278 | L(return_null): |

279 | xor %eax, %eax |

280 | ret |

281 | |

282 | .p2align 4 |
/* Short path, s 16-byte aligned: edx = n (0..16), xmm1 = broadcast c. */
283 | L(length_less16_offset0): |
/* n == 0: nothing to search. */
284 | test %edx, %edx |

285 | jz L(return_null) |

286 | |
/* Compare the aligned 16 bytes at s; cl = n for the mask shift below. */
287 | mov %dl, %cl |

288 | pcmpeqb (%rdi), %xmm1 |

289 | |
/* edx = (1 << n) - 1: keeps only the mask bits of the first n bytes. */
290 | mov $1, %edx |

291 | sal %cl, %edx |

292 | sub $1, %edx |

293 | |

294 | pmovmskb %xmm1, %eax |

295 | |

296 | and %edx, %eax |

297 | test %eax, %eax |

298 | jz L(return_null) |

299 | |

300 | bsr %eax, %eax |

301 | add %rdi, %rax |

302 | ret |

303 | |

304 | .p2align 4 |
/* Short path, n <= 16.  rdx holds n - 16 on entry; finish the broadcast of c and restore edx = n. */
305 | L(length_less16): |

306 | punpcklbw %xmm1, %xmm1 |

307 | punpcklbw %xmm1, %xmm1 |

308 | |

309 | add $16, %edx |

310 | |

311 | pshufd $0, %xmm1, %xmm1 |

312 | |
/* ecx = s & 15, the offset of s inside its aligned 16-byte block. */
313 | mov %edi, %ecx |

314 | and $15, %ecx |

315 | jz L(length_less16_offset0) |

316 | |
/* dh = offset + n (dl keeps n), esi = offset saved for later, rdi = s rounded down to 16 bytes. */
317 | mov %cl, %dh |

318 | mov %ecx, %esi |

319 | add %dl, %dh |

320 | and $-16, %rdi |

321 | |
/* dh = offset + n - 16.  Above zero means the buffer spills into the next aligned block. */
322 | sub $16, %dh |

323 | ja L(length_less16_part2) |

324 | |
/* The buffer fits in one aligned block: compare it all, then keep only the bits of the buffer bytes. */
325 | pcmpeqb (%rdi), %xmm1 |

326 | pmovmskb %xmm1, %eax |

327 | |
/* Shift right by the offset so that bit 0 corresponds to s[0]; then cl = n. */
328 | sar %cl, %eax |

329 | mov %dl, %cl |

330 | |
/* edx = (1 << n) - 1 drops the bits of bytes at or beyond s + n. */
331 | mov $1, %edx |

332 | sal %cl, %edx |

333 | sub $1, %edx |

334 | |

335 | and %edx, %eax |

336 | test %eax, %eax |

337 | jz L(return_null) |

338 | |
/* Result = aligned base + offset + index of the last match. */
339 | bsr %eax, %eax |

340 | add %rdi, %rax |

341 | add %rsi, %rax |

342 | ret |

343 | |

344 | .p2align 4 |
/* The buffer spans two aligned blocks.  dh = number of buffer bytes in the upper block at rdi + 16. */
345 | L(length_less16_part2): |
/* Search the upper block first, since the last match is wanted. */
346 | movdqa 16(%rdi), %xmm2 |

347 | pcmpeqb %xmm1, %xmm2 |

348 | pmovmskb %xmm2, %eax |

349 | |
/* edx = (1 << dh) - 1: keeps only the bits of bytes below s + n. */
350 | mov %dh, %cl |

351 | mov $1, %edx |

352 | sal %cl, %edx |

353 | sub $1, %edx |

354 | |

355 | and %edx, %eax |

356 | |

357 | test %eax, %eax |

358 | jnz L(length_less16_part2_return) |

359 | |
/* Lower block: shifting right by the saved offset discards the bytes before s.  Everything from s to the end of this block is in the buffer, so no upper mask is needed. */
360 | pcmpeqb (%rdi), %xmm1 |

361 | pmovmskb %xmm1, %eax |

362 | |

363 | mov %esi, %ecx |

364 | sar %cl, %eax |

365 | test %eax, %eax |

366 | jz L(return_null) |

367 | |

368 | bsr %eax, %eax |

369 | add %rdi, %rax |

370 | add %rsi, %rax |

371 | ret |

372 | |

373 | .p2align 4 |
/* Match in the upper block: result = rdi + 16 + index of the last match. */
374 | L(length_less16_part2_return): |

375 | bsr %eax, %eax |

376 | lea 16(%rax, %rdi), %rax |

377 | ret |

378 | |

379 | END (__memrchr) |
/* weak_alias is a macro defined outside this file; presumably it also exports the routine under the name memrchr as a weak symbol -- TODO confirm. */
380 | weak_alias (__memrchr, memrchr) |

381 |