/* SSE2 version of strlen/wcslen.
   Copyright (C) 2012-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>


21 | #ifdef AS_WCSLEN |

22 | # define PMINU pminud |

23 | # define PCMPEQ pcmpeqd |

24 | # define SHIFT_RETURN shrq $2, %rax |

25 | #else |

26 | # define PMINU pminub |

27 | # define PCMPEQ pcmpeqb |

28 | # define SHIFT_RETURN |

29 | #endif |

30 | |

31 | /* Long lived register in strlen(s), strnlen(s, n) are: |

32 | |

33 | %xmm3 - zero |

34 | %rdi - s |

35 | %r10 (s+n) & (~(64-1)) |

36 | %r11 s+n |

37 | */ |

38 | |

39 | |

40 | .text |

41 | ENTRY(strlen) |

42 | |

43 | /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ |

44 | #define FIND_ZERO \ |

45 | PCMPEQ (%rax), %xmm0; \ |

46 | PCMPEQ 16(%rax), %xmm1; \ |

47 | PCMPEQ 32(%rax), %xmm2; \ |

48 | PCMPEQ 48(%rax), %xmm3; \ |

49 | pmovmskb %xmm0, %esi; \ |

50 | pmovmskb %xmm1, %edx; \ |

51 | pmovmskb %xmm2, %r8d; \ |

52 | pmovmskb %xmm3, %ecx; \ |

53 | salq $16, %rdx; \ |

54 | salq $16, %rcx; \ |

55 | orq %rsi, %rdx; \ |

56 | orq %r8, %rcx; \ |

57 | salq $32, %rcx; \ |

58 | orq %rcx, %rdx; |

59 | |

60 | #ifdef AS_STRNLEN |

61 | /* Do not read anything when n==0. */ |

62 | test %RSI_LP, %RSI_LP |

63 | jne L(n_nonzero) |

64 | xor %rax, %rax |

65 | ret |

66 | L(n_nonzero): |

67 | # ifdef AS_WCSLEN |

68 | shl $2, %RSI_LP |

69 | # endif |

70 | |

71 | /* Initialize long lived registers. */ |

72 | |

73 | add %RDI_LP, %RSI_LP |

74 | mov %RSI_LP, %R10_LP |

75 | and $-64, %R10_LP |

76 | mov %RSI_LP, %R11_LP |

77 | #endif |

78 | |

79 | pxor %xmm0, %xmm0 |

80 | pxor %xmm1, %xmm1 |

81 | pxor %xmm2, %xmm2 |

82 | pxor %xmm3, %xmm3 |

83 | movq %rdi, %rax |

84 | movq %rdi, %rcx |

85 | andq $4095, %rcx |

86 | /* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ |

87 | cmpq $4047, %rcx |

88 | /* We cannot unify this branching as it would be ~6 cycles slower. */ |

89 | ja L(cross_page) |

90 | |

91 | #ifdef AS_STRNLEN |

92 | /* Test if end is among first 64 bytes. */ |

93 | # define STRNLEN_PROLOG \ |

94 | mov %r11, %rsi; \ |

95 | subq %rax, %rsi; \ |

96 | andq $-64, %rax; \ |

97 | testq $-64, %rsi; \ |

98 | je L(strnlen_ret) |

99 | #else |

100 | # define STRNLEN_PROLOG andq $-64, %rax; |

101 | #endif |

102 | |

103 | /* Ignore bits in mask that come before start of string. */ |

104 | #define PROLOG(lab) \ |

105 | movq %rdi, %rcx; \ |

106 | xorq %rax, %rcx; \ |

107 | STRNLEN_PROLOG; \ |

108 | sarq %cl, %rdx; \ |

109 | test %rdx, %rdx; \ |

110 | je L(lab); \ |

111 | bsfq %rdx, %rax; \ |

112 | SHIFT_RETURN; \ |

113 | ret |

114 | |

115 | #ifdef AS_STRNLEN |

116 | andq $-16, %rax |

117 | FIND_ZERO |

118 | #else |

119 | /* Test first 16 bytes unaligned. */ |

120 | movdqu (%rax), %xmm4 |

121 | PCMPEQ %xmm0, %xmm4 |

122 | pmovmskb %xmm4, %edx |

123 | test %edx, %edx |

124 | je L(next48_bytes) |

125 | bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ |

126 | SHIFT_RETURN |

127 | ret |

128 | |

129 | L(next48_bytes): |

130 | /* Same as FIND_ZERO except we do not check first 16 bytes. */ |

131 | andq $-16, %rax |

132 | PCMPEQ 16(%rax), %xmm1 |

133 | PCMPEQ 32(%rax), %xmm2 |

134 | PCMPEQ 48(%rax), %xmm3 |

135 | pmovmskb %xmm1, %edx |

136 | pmovmskb %xmm2, %r8d |

137 | pmovmskb %xmm3, %ecx |

138 | salq $16, %rdx |

139 | salq $16, %rcx |

140 | orq %r8, %rcx |

141 | salq $32, %rcx |

142 | orq %rcx, %rdx |

143 | #endif |

144 | |

145 | /* When no zero byte is found xmm1-3 are zero so we do not have to |

146 | zero them. */ |

147 | PROLOG(loop) |

148 | |

149 | .p2align 4 |

150 | L(cross_page): |

151 | andq $-64, %rax |

152 | FIND_ZERO |

153 | PROLOG(loop_init) |

154 | |

155 | #ifdef AS_STRNLEN |

156 | /* We must do this check to correctly handle strnlen (s, -1). */ |

157 | L(strnlen_ret): |

158 | bts %rsi, %rdx |

159 | sarq %cl, %rdx |

160 | test %rdx, %rdx |

161 | je L(loop_init) |

162 | bsfq %rdx, %rax |

163 | SHIFT_RETURN |

164 | ret |

165 | #endif |

166 | .p2align 4 |

167 | L(loop_init): |

168 | pxor %xmm1, %xmm1 |

169 | pxor %xmm2, %xmm2 |

170 | pxor %xmm3, %xmm3 |

171 | #ifdef AS_STRNLEN |

172 | .p2align 4 |

173 | L(loop): |

174 | |

175 | addq $64, %rax |

176 | cmpq %rax, %r10 |

177 | je L(exit_end) |

178 | |

179 | movdqa (%rax), %xmm0 |

180 | PMINU 16(%rax), %xmm0 |

181 | PMINU 32(%rax), %xmm0 |

182 | PMINU 48(%rax), %xmm0 |

183 | PCMPEQ %xmm3, %xmm0 |

184 | pmovmskb %xmm0, %edx |

185 | testl %edx, %edx |

186 | jne L(exit) |

187 | jmp L(loop) |

188 | |

189 | .p2align 4 |

190 | L(exit_end): |

191 | cmp %rax, %r11 |

192 | je L(first) /* Do not read when end is at page boundary. */ |

193 | pxor %xmm0, %xmm0 |

194 | FIND_ZERO |

195 | |

196 | L(first): |

197 | bts %r11, %rdx |

198 | bsfq %rdx, %rdx |

199 | addq %rdx, %rax |

200 | subq %rdi, %rax |

201 | SHIFT_RETURN |

202 | ret |

203 | |

204 | .p2align 4 |

205 | L(exit): |

206 | pxor %xmm0, %xmm0 |

207 | FIND_ZERO |

208 | |

209 | bsfq %rdx, %rdx |

210 | addq %rdx, %rax |

211 | subq %rdi, %rax |

212 | SHIFT_RETURN |

213 | ret |

214 | |

215 | #else |

216 | |

217 | /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ |

218 | .p2align 4 |

219 | L(loop): |

220 | |

221 | movdqa 64(%rax), %xmm0 |

222 | PMINU 80(%rax), %xmm0 |

223 | PMINU 96(%rax), %xmm0 |

224 | PMINU 112(%rax), %xmm0 |

225 | PCMPEQ %xmm3, %xmm0 |

226 | pmovmskb %xmm0, %edx |

227 | testl %edx, %edx |

228 | jne L(exit64) |

229 | |

230 | subq $-128, %rax |

231 | |

232 | movdqa (%rax), %xmm0 |

233 | PMINU 16(%rax), %xmm0 |

234 | PMINU 32(%rax), %xmm0 |

235 | PMINU 48(%rax), %xmm0 |

236 | PCMPEQ %xmm3, %xmm0 |

237 | pmovmskb %xmm0, %edx |

238 | testl %edx, %edx |

239 | jne L(exit0) |

240 | jmp L(loop) |

241 | |

242 | .p2align 4 |

243 | L(exit64): |

244 | addq $64, %rax |

245 | L(exit0): |

246 | pxor %xmm0, %xmm0 |

247 | FIND_ZERO |

248 | |

249 | bsfq %rdx, %rdx |

250 | addq %rdx, %rax |

251 | subq %rdi, %rax |

252 | SHIFT_RETURN |

253 | ret |

254 | |

255 | #endif |

256 | |

257 | END(strlen) |

258 | libc_hidden_builtin_def (strlen) |

259 |