/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping loads and stores to avoid branches.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, copy backward
      4 * VEC_SIZE at a time with unaligned loads and aligned stores.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, copy forward 4 * VEC_SIZE at a time with unaligned
      loads and aligned stores.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores.  */

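/* Illustration only (not part of the build): the overlapping
   load/store trick of steps 1-3 in C, with 8-byte chunks standing in
   for VEC_SIZE registers; this mirrors L(between_8_15) below.
   Because both loads complete before either store, the two possibly
   overlapping moves are safe even when destination and source alias:

     #include <string.h>

     // Copy N bytes, 8 <= N <= 16, with two overlapping 8-byte moves.
     static void
     copy_8_to_16 (unsigned char *dst, const unsigned char *src, size_t n)
     {
       unsigned long long head, tail;
       memcpy (&head, src, 8);         // load head
       memcpy (&tail, src + n - 8, 8); // load tail, may overlap head
       memcpy (dst, &head, 8);         // store head
       memcpy (dst + n - 8, &tail, 8); // store tail, may overlap head
     }  */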

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2 KB is the approximate
   value above which REP MOVSB becomes faster than the SSE2
   optimization on processors with Enhanced REP MOVSB.  Since a larger
   register can move more data with a single load and store, the
   threshold is higher with a larger register size.  */
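/* With the default definition below, the threshold is 2 KB for
   16-byte registers, 4 KB for 32-byte registers and 8 KB for 64-byte
   registers.  */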

#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
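
/* PREFETCH_ONE_SET prefetches one set of PREFETCHED_LOAD_SIZE bytes,
   one prefetcht0 per PREFETCH_SIZE-byte cache line; DIR is 1 in the
   forward copy loops and -1 in the backward copy loops.  */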

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	/* mempcpy returns DST + N; compute the return value up front,
	   then share the copy code via L(start).  */
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
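
/* The _chk variants above and below only check that the destination
   buffer (its size is passed in %rcx) can hold the full copy and then
   fall through into the corresponding unchecked entry point;
   __chk_fail does not return.  */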

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC_SIZE to 2 * VEC_SIZE bytes: two possibly
	   overlapping moves.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	testq	%rdx, %rdx
	jz	2f
	addq	%rdx, %rax
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	testq	%rdx, %rdx
	jz	2f
L(start_movsb):
	movq	%rdx, %rcx
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	leaq	(%rsi,%rcx), %rdx
	cmpq	%rdx, %rdi
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
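	/* L(movsb_backward) below is used only when SRC < DST <
	   SRC + LEN; a forward rep movsb would then overwrite source
	   bytes before reading them.  */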

L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	/* Set the direction flag to copy from the last byte down.
	   The ABI requires DF to be clear, so restore it with cld
	   before returning.  */
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC_SIZE to 2 * VEC_SIZE bytes: two possibly
	   overlapping moves.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  The backward fallback
	   assumes size > 8 * VEC_SIZE, which holds because this path
	   is only reached when size > REP_MOVSB_THRESHOLD.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
	jb	L(more_8x_vec_backward)
1:
	movq	%rdx, %rcx
	rep movsb
L(nop):
	ret
#endif
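
/* Dispatch summary for the unaligned_erms entry points: sizes below
   VEC_SIZE go to L(less_vec); from VEC_SIZE to 2 * VEC_SIZE, two
   overlapping moves at L(last_2x_vec); up to REP_MOVSB_THRESHOLD, the
   vector paths at L(more_2x_vec); above that, rep movsb at L(movsb),
   unless the size is at least __x86_shared_non_temporal_threshold
   (L(more_8x_vec)) or the destination overlaps the source from above
   (L(more_8x_vec_backward)).  */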

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	/* Copy one byte when size == 1, nothing when size == 0.  */
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusive.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of the offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination; it is now aligned.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
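	/* For example, with VEC_SIZE == 32 and a destination
	   misaligned by 5 bytes, %r8 == 5 - 32 == -27: source and
	   destination advance by 27 bytes and the length shrinks by
	   27.  The skipped head bytes are covered by %VEC(4), stored
	   at the saved start (%r11) after the loop.  */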

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC_SIZE bytes at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC, which covers the tail (at most
	   4 * VEC_SIZE bytes remain here).  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret
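
/* Illustration only: a rough C model of the forward path above, with
   8-byte chunks standing in for VEC_SIZE registers and memcpy through
   locals standing in for the register loads and stores (the function
   and CHUNK are hypothetical, not glibc interfaces):

     #include <stdint.h>
     #include <string.h>

     enum { CHUNK = 8 };	// plays the role of VEC_SIZE

     static void
     forward_copy (unsigned char *d, const unsigned char *s, size_t n)
     {
       // Entered only when n > 8 * CHUNK and d < s.
       unsigned char head[CHUNK], tail[4 * CHUNK];
       memcpy (head, s, CHUNK);                     // first VEC
       memcpy (tail, s + n - 4 * CHUNK, 4 * CHUNK); // last 4 * VEC
       unsigned char *start = d, *end = d + n;
       size_t adjust = CHUNK - ((uintptr_t) d & (CHUNK - 1));
       s += adjust, d += adjust, n -= adjust;       // d now aligned
       while (n > 4 * CHUNK)
         {
           unsigned char t[4 * CHUNK];
           memcpy (t, s, 4 * CHUNK);                // all loads before
           memcpy (d, t, 4 * CHUNK);                // any stores
           s += 4 * CHUNK, d += 4 * CHUNK, n -= 4 * CHUNK;
         }
       memcpy (end - 4 * CHUNK, tail, 4 * CHUNK);   // remaining tail
       memcpy (start, head, CHUNK);                 // skipped head bytes
     }  */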

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save the stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align the destination end for aligned stores in the loop.
	   Compute how much the destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination; it is now aligned.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC_SIZE bytes at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC, which covers any remaining head
	   bytes.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal stores if there is overlap between
	   destination and source since the destination may be in
	   cache when the source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC_SIZE bytes at a time forward with
	   non-temporal stores, which bypass the cache.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
	/* Non-temporal stores are weakly ordered; sfence makes sure
	   they complete before the ordinary stores below.  */
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal stores if there is overlap between
	   destination and source since the destination may be in
	   cache when the source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC_SIZE bytes at a time backward with
	   non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	/* Non-temporal stores are weakly ordered; sfence makes sure
	   they complete before the ordinary stores below.  */
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))