/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
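/* A rough C-level sketch of the dispatch implemented below, following
   the numbered steps above.  Illustrative only; the helper names are
   descriptive placeholders, not symbols defined in this file:

     void *memmove_sketch (void *dst, const void *src, size_t n)
     {
       if (n <= 2 * VEC_SIZE)
         copy_head_and_tail (dst, src, n);        // steps 1-2
       else if (n <= 8 * VEC_SIZE)
         load_all_then_store_all (dst, src, n);   // step 3
       else if (dst > src)
         backward_4x_vec_loop (dst, src, n);      // step 4
       else
         forward_4x_vec_loop (dst, src, n);       // step 5
       // Step 6: inside the two loops, aligned stores are replaced by
       // non-temporal stores when n >= __x86_shared_non_temporal_threshold
       // and the ranges do not overlap.
       return dst;
     }  */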

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up a REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since a larger register size
   can move more data with a single load and store, the threshold is
   higher with a larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
#endif
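/* With the default definition above, the threshold works out to 2 KB
   for the SSE2 version (VEC_SIZE == 16), 4 KB for AVX (VEC_SIZE == 32)
   and 8 KB for AVX-512 (VEC_SIZE == 64).  */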

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
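/* For example, with VEC_SIZE == 32 (so PREFETCHED_LOAD_SIZE == 128 ==
   2 * PREFETCH_SIZE), PREFETCH_ONE_SET (1, (%rsi), 256) is equivalent
   to two prefetches covering one 4 * VEC block ahead of the loads:

	prefetcht0 256(%rsi)
	prefetcht0 320(%rsi)  */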

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
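/* __mempcpy_chk verifies that the destination buffer (its size is
   passed in %rcx) is large enough for the copy of %rdx bytes and
   otherwise jumps to __chk_fail; on success it falls through into
   __mempcpy below.  The other *_chk entries in this file follow the
   same pattern.  */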

ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
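	/* Both loads below are issued before either store, and the
	   second load/store pair is anchored at the end of the buffer,
	   so the copy is correct even when source and destination
	   overlap; the two stores simply overlap in the middle when
	   size < 2 * VEC_SIZE.  */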

	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
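	/* The direction flag must be clear again before returning; the
	   ABI requires DF == 0 on function entry and exit.  */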

	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
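	/* The REP MOVSB path, only reached when the size exceeds
	   REP_MOVSB_THRESHOLD.  If the size is also at or above the
	   non-temporal threshold, switch to the 8x VEC code instead,
	   which may use non-temporal stores.  */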

	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
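	/* The preprocessor check above guarantees REP_MOVSB_THRESHOLD >
	   8 * VEC_SIZE, so any size reaching this point is also large
	   enough for the backward 8x VEC path taken below.  */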

	jb	L(more_8x_vec_backward)
1:
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
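	/* All eight vectors below are loaded before any of them is
	   stored, so the block copy is correct even when the source and
	   destination ranges overlap (step 3 of the algorithm above).  */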

	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
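	/* For example, with VEC_SIZE == 32 and a destination 5 bytes past
	   a 32-byte boundary, %r8 == 5 - 32 == -27: source and destination
	   advance by 27 bytes (to the next aligned address) and the length
	   shrinks by 27.  The bytes skipped here are covered by the
	   unaligned store of %VEC(4) after the loop.  */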

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
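	/* For example, with VEC_SIZE == 32 and the last destination VEC
	   starting 7 bytes past a 32-byte boundary, %r8 == 7: the source
	   and destination end pointers move back by 7 bytes and the
	   length shrinks by 7.  The trailing bytes not written by the
	   aligned loop are covered by the unaligned store of %VEC(8)
	   after the loop.  */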

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
	sfence
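	/* The sfence above makes the weakly ordered non-temporal stores
	   globally visible before the ordinary stores of the saved head
	   and tail below.  */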

	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
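	/* As in the forward path, order the non-temporal stores before
	   the ordinary stores of the saved head and tail below.  */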

	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))