1 | /* Copyright (C) 2012-2022 Free Software Foundation, Inc. |
2 | |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library. If not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | /* Assumptions: |
22 | * |
23 | * ARMv8-a, AArch64, Advanced SIMD. |
24 | * MTE compatible. |
25 | */ |
26 | |
27 | #ifndef STRLEN /* Only supply a name if none was defined before this point.  */ |
28 | # define STRLEN __strlen /* Default symbol name for the entry point below.  */ |
29 | #endif |
30 | |
31 | #define srcin x0 /* In: pointer to the first byte of the string.  */ |
32 | #define result x0 /* Out: computed length; shares x0 with srcin.  */ |
33 | |
34 | #define src x1 /* Address of the 16-byte chunk currently being examined.  */ |
35 | #define synd x2 /* 64-bit syndrome moved out of the vector register.  */ |
36 | #define tmp x3 /* Scratch: leading-zero count on the loop exit path.  */ |
37 | #define wtmp w3 /* 32-bit view of tmp; holds the 0xf00f mask seed.  */ |
38 | #define shift x4 /* srcin * 4, the shift count applied to the first chunk's syndrome.  */ |
39 | |
40 | #define data q0 /* 128-bit view of the loaded chunk (same register as vdata).  */ |
41 | #define vdata v0 /* Vector view of the loaded chunk.  */ |
42 | #define vhas_nul v1 /* Per-byte result of comparing the chunk with zero.  */ |
43 | #define vrepmask v2 /* 0xf00f replicated into every halfword.  */ |
44 | #define vend v3 /* Pairwise-reduced compare result.  */ |
45 | #define dend d3 /* Low 64 bits of vend.  */ |
46 | |
47 | /* Core algorithm: |
48 | |
49 | For each 16-byte chunk we calculate a 64-bit syndrome value with four bits |
50 | per byte. For even bytes, bits 0-3 are set if the relevant byte is NUL; |
51 | bits 4-7 must be zero. For odd bytes, bits 4-7 are set likewise and bits |
52 | 0-3 must be zero, so that adjacent bytes can be merged. Since the |
53 | bits in the syndrome reflect the order in which things occur in the original |
54 | string, counting trailing zeros identifies exactly which byte matched. */ |
55 | |
56 | ENTRY (STRLEN) /* String length. In: x0 = pointer to the string. Out: x0 = number of bytes before the first NUL. Scratch: x1-x4, v0-v3. The body makes no calls and does not touch the stack.  */ |
57 | PTR_ARG (0) /* Macro from sysdep.h applied to argument 0; presumably pointer normalisation for ILP32 -- confirm there.  */ |
58 | bic src, srcin, 15 /* src = srcin rounded down to a 16-byte boundary.  */ |
59 | mov wtmp, 0xf00f /* Mask seed: low byte 0x0f, high byte 0xf0.  */ |
60 | ld1 {vdata.16b}, [src] /* Load the aligned 16-byte chunk that contains the start of the string.  */ |
61 | dup vrepmask.8h, wtmp /* Every halfword = 0xf00f: even bytes get 0x0f, odd bytes get 0xf0.  */ |
62 | cmeq vhas_nul.16b, vdata.16b, 0 /* 0xff in each byte lane that holds NUL, else 0x00.  */ |
63 | lsl shift, srcin, 2 /* 4 syndrome bits per byte; the register shift below uses only the low 6 bits, i.e. (srcin mod 16) * 4.  */ |
64 | and vhas_nul.16b, vhas_nul.16b, vrepmask.16b /* Keep the low nibble of even lanes and the high nibble of odd lanes.  */ |
65 | addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64: adding adjacent byte pairs packs one nibble per input byte.  */ |
66 | fmov synd, dend /* Move the 64-bit syndrome to a general register.  */ |
67 | lsr synd, synd, shift /* Drop the nibbles of bytes that lie before srcin.  */ |
68 | cbz synd, L(loop) /* No NUL at or after srcin in this chunk: scan the following chunks.  */ |
69 | |
70 | rbit synd, synd /* rbit + clz together count trailing zero bits.  */ |
71 | clz result, synd /* Bit position of the first set nibble.  */ |
72 | lsr result, result, 2 /* Divide by 4 bits per byte: the byte offset from srcin, i.e. the length.  */ |
73 | ret |
74 | |
75 | .p2align 5 /* Align the loop entry to a 32-byte boundary.  */ |
76 | L(loop): |
77 | ldr data, [src, 16]! /* Pre-index: advance src by 16, then load that aligned chunk.  */ |
78 | cmeq vhas_nul.16b, vdata.16b, 0 /* 0xff in each byte lane that holds NUL.  */ |
79 | umaxp vend.16b, vhas_nul.16b, vhas_nul.16b /* Pairwise max: the low 64 bits are nonzero iff some byte was NUL.  */ |
80 | fmov synd, dend |
81 | cbz synd, L(loop) /* No NUL in this chunk: keep scanning.  */ |
82 | |
83 | and vhas_nul.16b, vhas_nul.16b, vrepmask.16b /* A NUL was seen: now build the nibble syndrome as for the first chunk.  */ |
84 | addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ |
85 | sub result, src, srcin /* Bytes from the start of the string to the start of this chunk.  */ |
86 | fmov synd, dend |
87 | #ifndef __AARCH64EB__ /* Little-endian only: reverse the bits so clz counts from the first byte. On big-endian clz is applied directly; presumably the ldr Q-register load above already puts the lowest-addressed byte at the most significant end -- confirm.  */ |
88 | rbit synd, synd |
89 | #endif |
90 | clz tmp, synd /* Bit position of the first NUL's nibble.  */ |
91 | add result, result, tmp, lsr 2 /* Add the byte index within the chunk (bit position / 4).  */ |
92 | ret |
93 | |
94 | END (STRLEN) |
95 | weak_alias (STRLEN, strlen) /* Presumably publishes strlen as a weak alias of STRLEN; the macro is defined outside this file -- confirm.  */ |
96 | libc_hidden_builtin_def (strlen) /* Macro defined outside this file. NOTE(review): appears to set up the hidden internal definition of strlen -- confirm.  */ |
97 | |