/* memcmp with SSE2
   Copyright (C) 2009-2019 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

        .text
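/* int memcmp (const void *s1 [%rdi], const void *s2 [%rsi],
   size_t n [%rdx]), per the x86-64 SysV calling convention.  */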

ENTRY (memcmp)
#ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
#endif
        test %RDX_LP, %RDX_LP
        jz L(finz)
        cmpq $1, %rdx
        jbe L(finr1b)
        /* From here on %rsi holds the difference src2 - src1, so that
           (%rdi, %rsi) addresses the second buffer while only %rdi
           needs to be advanced.  */
        subq %rdi, %rsi
        movq %rdx, %r10
        cmpq $32, %r10
        jae L(gt32)
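        /* An illustrative C sketch of L(small) below (not part of the
           original source): each low bit of the remaining count selects
           one fixed-size compare.

               if (n & 1) { compare 1 byte;  return if done or unequal; }
               if (n & 2) { compare 2 bytes; return if done or unequal; }
               if (n & 4) { compare 4 bytes; return if done or unequal; }
               if (n & 8) { compare 8 bytes; return if done or unequal; }
               compare the final 16 bytes with one unaligned SSE load.  */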

        /* Handle small chunks and last block of less than 32 bytes.  */
L(small):
        testq $1, %r10
        jz L(s2b)
        movzbl (%rdi), %eax
        movzbl (%rdi, %rsi), %edx
        subq $1, %r10
        je L(finz1)
        addq $1, %rdi
        subl %edx, %eax
        jnz L(exit)
L(s2b):
        testq $2, %r10
        jz L(s4b)
        movzwl (%rdi), %eax
        movzwl (%rdi, %rsi), %edx
        subq $2, %r10
        je L(fin2_7)
        addq $2, %rdi
        cmpl %edx, %eax
        jnz L(fin2_7)
L(s4b):
        testq $4, %r10
        jz L(s8b)
        movl (%rdi), %eax
        movl (%rdi, %rsi), %edx
        subq $4, %r10
        je L(fin2_7)
        addq $4, %rdi
        cmpl %edx, %eax
        jnz L(fin2_7)
L(s8b):
        testq $8, %r10
        jz L(s16b)
        movq (%rdi), %rax
        movq (%rdi, %rsi), %rdx
        subq $8, %r10
        je L(fin2_7)
        addq $8, %rdi
        cmpq %rdx, %rax
        jnz L(fin2_7)
L(s16b):
        movdqu (%rdi), %xmm1
        movdqu (%rdi, %rsi), %xmm0
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        xorl %eax, %eax
        /* pmovmskb sets one mask bit per byte, so the mask is 0xffff
           iff all 16 bytes compared equal.  */
        subl $0xffff, %edx
        jz L(finz)
        bsfl %edx, %ecx
        leaq (%rdi, %rcx), %rcx
        movzbl (%rcx), %eax
        movzbl (%rsi, %rcx), %edx	/* %rsi is still src2 - src1.  */
        jmp L(finz1)

        .p2align 4,, 4
L(finr1b):
        movzbl (%rdi), %eax
        movzbl (%rsi), %edx
L(finz1):
        subl %edx, %eax
L(exit):
        ret
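
/* L(fin2_7) below receives two equal-sized 2..8-byte chunks in %rax
   and %rdx; if they are equal it returns 0, otherwise the difference
   of the first unequal byte pair.  A rough C equivalent (an
   illustrative sketch, not part of the original source):

       if (a == b) return 0;
       uint64_t d = a - b;   a - b and a ^ b share the same lowest set
                             bit, which falls inside the first (least
                             significant, i.e. lowest-addressed on
                             little-endian) differing byte
       unsigned sh = (bsf (d) / 8) * 8;
       return ((a >> sh) & 0xff) - ((b >> sh) & 0xff);  */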

        .p2align 4,, 4
L(fin2_7):
        cmpq %rdx, %rax
        jz L(finz)
        movq %rax, %r11
        subq %rdx, %r11
        bsfq %r11, %rcx	/* Lowest set bit of the difference lies in
                           the first differing byte.  */
        sarq $3, %rcx	/* Round the bit index down ...  */
        salq $3, %rcx	/* ... to a byte boundary.  */
        sarq %cl, %rax
        movzbl %al, %eax
        sarq %cl, %rdx
        movzbl %dl, %edx
        subl %edx, %eax
        ret

        .p2align 4,, 4
L(finz):
        xorl %eax, %eax
        ret

        /* For blocks bigger than 32 bytes:
           1. Advance one of the address pointers to be 16-byte aligned.
           2. Treat the case of both address pointers being 16-byte
              aligned separately, to avoid movdqu.
           3. Handle blocks of more than 64 consecutive bytes with an
              unrolled loop, to reduce branches.
           4. At least one address pointer is 16-byte aligned, so the
              memory operand form of pcmpeqb can be used.  */
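
        /* Rough shape of the code below as a C sketch (illustrative
           only; the labels are the ones defined further down):

               end = s1 + n;                             gt32
               if (s1 & 15) { one unaligned 16B compare; align s1 up; }
               if (((s2 - s1) & 15) == 0)                ATR: movdqa path
                   16B step, 64B loop, 32B loop, tail
               else                                      A32/A64: movdqu path
                   16B step, 64B loop, 32B loop, tail
               remaining = end - s1; goto small;         mt16  */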

        .p2align 4,, 4
L(gt32):
        movq %rdx, %r11
        addq %rdi, %r11	/* %r11 marks the end of the first buffer.  */
        movq %rdi, %r8

        andq $15, %r8
        jz L(16am)
        /* Both pointers may be misaligned.  */
        movdqu (%rdi), %xmm1
        movdqu (%rdi, %rsi), %xmm0
        pcmpeqb %xmm0, %xmm1
        pmovmskb %xmm1, %edx
        subl $0xffff, %edx
        jnz L(neq)
        neg %r8
        leaq 16(%rdi, %r8), %rdi	/* Round %rdi up to the next
                                           16-byte boundary.  */
L(16am):
        /* Handle two 16B aligned pointers separately.  */
        testq $15, %rsi
        jz L(ATR)
        testq $16, %rdi
        jz L(A32)
        movdqu (%rdi, %rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi
L(A32):
        movq %r11, %r10
        andq $-32, %r10
        cmpq %r10, %rdi
        jae L(mt16)
        /* Pre-unroll to be ready for unrolled 64B loop.  */
        testq $32, %rdi
        jz L(A64)
        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

L(A64):
        movq %r11, %r10
        andq $-64, %r10
        cmpq %r10, %rdi
        jae L(mt32)

L(A64main):
        /* Unrolled 64-byte loop: four 16-byte compares per iteration.  */
        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        cmpq %rdi, %r10
        jne L(A64main)

L(mt32):
        movq %r11, %r10
        andq $-32, %r10
        cmpq %r10, %rdi
        jae L(mt16)

L(A32main):
        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqu (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        cmpq %rdi, %r10
        jne L(A32main)
L(mt16):
        /* Fewer than 32 bytes remain; finish with the small-block
           code.  */
        subq %rdi, %r11
        je L(finz)
        movq %r11, %r10
        jmp L(small)

        .p2align 4,, 4
L(neq):
        bsfl %edx, %ecx	/* Index of the first differing byte.  */
        movzbl (%rdi, %rcx), %eax
        addq %rdi, %rsi	/* Turn the src2 - src1 difference back into
                           a pointer.  */
        movzbl (%rsi,%rcx), %edx
        jmp L(finz1)

        .p2align 4,, 4
L(ATR):
        /* %rdi and the src2 - src1 difference are both 16-byte aligned
           here, so aligned loads (movdqa) are safe on both buffers.  */
        movq %r11, %r10
        andq $-32, %r10
        cmpq %r10, %rdi
        jae L(mt16)
        testq $16, %rdi
        jz L(ATR32)

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi
        cmpq %rdi, %r10
        je L(mt16)

L(ATR32):
        movq %r11, %r10
        andq $-64, %r10
        testq $32, %rdi
        jz L(ATR64)

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

L(ATR64):
        cmpq %rdi, %r10
        je L(mt32)

L(ATR64main):
        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi
        cmpq %rdi, %r10
        jne L(ATR64main)

        movq %r11, %r10
        andq $-32, %r10
        cmpq %r10, %rdi
        jae L(mt16)

L(ATR32res):
        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        movdqa (%rdi,%rsi), %xmm0
        pcmpeqb (%rdi), %xmm0
        pmovmskb %xmm0, %edx
        subl $0xffff, %edx
        jnz L(neq)
        addq $16, %rdi

        cmpq %r10, %rdi
        jne L(ATR32res)

        subq %rdi, %r11
        je L(finz)
        movq %r11, %r10
        jmp L(small)
        /* Align to 16 bytes to improve instruction fetch.  */
        .p2align 4,, 4
END(memcmp)
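
/* bcmp only has to report zero/non-zero, which memcmp's return value
   already satisfies, so memcmp can serve directly as bcmp.  */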

#undef bcmp
weak_alias (memcmp, bcmp)
libc_hidden_builtin_def (memcmp)