1// SPDX-License-Identifier: GPL-2.0
2
3#include <linux/version.h>
4#include <linux/ptrace.h>
5#include <uapi/linux/bpf.h>
6#include <bpf/bpf_helpers.h>
7
/*
 * The CPU number, cstate number and pstate number are based
 * on 96boards Hikey with octa CA53 CPUs.
 *
 * Every CPU has three idle states for cstate:
 *   WFI, CPU_OFF, CLUSTER_OFF
 *
 * Every CPU has 5 operating points:
 *   208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
 *
 * This code is based on these assumptions; other platforms
 * need to adjust these definitions.
 */
/* Platform limits; they size the maps declared in this file. */
#define MAX_CPU			8	/* number of CPUs tracked */
#define MAX_PSTATE_ENTRIES	5	/* operating points per CPU */
#define MAX_CSTATE_ENTRIES	3	/* idle states per CPU */

/*
 * Operating-point table, lowest frequency first. The position of a
 * frequency in this table is its pstate index; 208000 is the 208MHz OPP.
 */
static int cpu_opps[] = {
	208000,
	432000,
	729000,
	960000,
	1200000,
};
26
/*
 * my_map structure is used to record the cstate and pstate index and
 * timestamp (Idx, Ts); when a new event comes in we need to update
 * the combination with the new state index and timestamp (Idx`, Ts`).
 *
 * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
 * interval for the previous state: Duration(Idx) = Ts` - Ts.
 *
 * Every CPU has one array as below for recording state index and
 * timestamp, recorded separately for cstate and pstate:
 *
 *	+--------------------------+
 *	| cstate timestamp         |
 *	+--------------------------+
 *	| cstate index             |
 *	+--------------------------+
 *	| pstate timestamp         |
 *	+--------------------------+
 *	| pstate index             |
 *	+--------------------------+
 */
/* Slot offsets inside one CPU's group of my_map entries. */
#define MAP_OFF_CSTATE_TIME	0	/* timestamp of the last cstate event */
#define MAP_OFF_CSTATE_IDX	1	/* last recorded cstate value */
#define MAP_OFF_PSTATE_TIME	2	/* timestamp of the last pstate event */
#define MAP_OFF_PSTATE_IDX	3	/* last recorded pstate value */
#define MAP_OFF_NUM		4	/* number of slots per CPU */
53
54struct {
55 __uint(type, BPF_MAP_TYPE_ARRAY);
56 __type(key, u32);
57 __type(value, u64);
58 __uint(max_entries, MAX_CPU * MAP_OFF_NUM);
59} my_map SEC(".maps");
60
61/* cstate_duration records duration time for every idle state per CPU */
62struct {
63 __uint(type, BPF_MAP_TYPE_ARRAY);
64 __type(key, u32);
65 __type(value, u64);
66 __uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES);
67} cstate_duration SEC(".maps");
68
69/* pstate_duration records duration time for every operating point per CPU */
70struct {
71 __uint(type, BPF_MAP_TYPE_ARRAY);
72 __type(key, u32);
73 __type(value, u64);
74 __uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES);
75} pstate_duration SEC(".maps");
76
/*
 * The trace events for cpu_idle and cpu_frequency are taken from:
 * /sys/kernel/tracing/events/power/cpu_idle/format
 * /sys/kernel/tracing/events/power/cpu_frequency/format
 *
 * These two events have the same format, so define one common structure.
 */
84struct cpu_args {
85 u64 pad;
86 u32 state;
87 u32 cpu_id;
88};
89
90/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */
91static u32 find_cpu_pstate_idx(u32 frequency)
92{
93 u32 i;
94
95 for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) {
96 if (frequency == cpu_opps[i])
97 return i;
98 }
99
100 return i;
101}
102
103SEC("tracepoint/power/cpu_idle")
104int bpf_prog1(struct cpu_args *ctx)
105{
106 u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
107 u32 key, cpu, pstate_idx;
108 u64 *val;
109
110 if (ctx->cpu_id > MAX_CPU)
111 return 0;
112
113 cpu = ctx->cpu_id;
114
115 key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
116 cts = bpf_map_lookup_elem(&my_map, &key);
117 if (!cts)
118 return 0;
119
120 key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
121 cstate = bpf_map_lookup_elem(&my_map, &key);
122 if (!cstate)
123 return 0;
124
125 key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
126 pts = bpf_map_lookup_elem(&my_map, &key);
127 if (!pts)
128 return 0;
129
130 key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
131 pstate = bpf_map_lookup_elem(&my_map, &key);
132 if (!pstate)
133 return 0;
134
135 prev_state = *cstate;
136 *cstate = ctx->state;
137
138 if (!*cts) {
139 *cts = bpf_ktime_get_ns();
140 return 0;
141 }
142
143 cur_ts = bpf_ktime_get_ns();
144 delta = cur_ts - *cts;
145 *cts = cur_ts;
146
147 /*
148 * When state doesn't equal to (u32)-1, the cpu will enter
149 * one idle state; for this case we need to record interval
150 * for the pstate.
151 *
152 * OPP2
153 * +---------------------+
154 * OPP1 | |
155 * ---------+ |
156 * | Idle state
157 * +---------------
158 *
159 * |<- pstate duration ->|
160 * ^ ^
161 * pts cur_ts
162 */
163 if (ctx->state != (u32)-1) {
164
165 /* record pstate after have first cpu_frequency event */
166 if (!*pts)
167 return 0;
168
169 delta = cur_ts - *pts;
170
171 pstate_idx = find_cpu_pstate_idx(*pstate);
172 if (pstate_idx >= MAX_PSTATE_ENTRIES)
173 return 0;
174
175 key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
176 val = bpf_map_lookup_elem(&pstate_duration, &key);
177 if (val)
178 __sync_fetch_and_add((long *)val, delta);
179
180 /*
181 * When state equal to (u32)-1, the cpu just exits from one
182 * specific idle state; for this case we need to record
183 * interval for the pstate.
184 *
185 * OPP2
186 * -----------+
187 * | OPP1
188 * | +-----------
189 * | Idle state |
190 * +---------------------+
191 *
192 * |<- cstate duration ->|
193 * ^ ^
194 * cts cur_ts
195 */
196 } else {
197
198 key = cpu * MAX_CSTATE_ENTRIES + prev_state;
199 val = bpf_map_lookup_elem(&cstate_duration, &key);
200 if (val)
201 __sync_fetch_and_add((long *)val, delta);
202 }
203
204 /* Update timestamp for pstate as new start time */
205 if (*pts)
206 *pts = cur_ts;
207
208 return 0;
209}
210
211SEC("tracepoint/power/cpu_frequency")
212int bpf_prog2(struct cpu_args *ctx)
213{
214 u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
215 u32 key, cpu, pstate_idx;
216 u64 *val;
217
218 cpu = ctx->cpu_id;
219
220 key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
221 pts = bpf_map_lookup_elem(&my_map, &key);
222 if (!pts)
223 return 0;
224
225 key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
226 pstate = bpf_map_lookup_elem(&my_map, &key);
227 if (!pstate)
228 return 0;
229
230 key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
231 cstate = bpf_map_lookup_elem(&my_map, &key);
232 if (!cstate)
233 return 0;
234
235 prev_state = *pstate;
236 *pstate = ctx->state;
237
238 if (!*pts) {
239 *pts = bpf_ktime_get_ns();
240 return 0;
241 }
242
243 cur_ts = bpf_ktime_get_ns();
244 delta = cur_ts - *pts;
245 *pts = cur_ts;
246
247 /* When CPU is in idle, bail out to skip pstate statistics */
248 if (*cstate != (u32)(-1))
249 return 0;
250
251 /*
252 * The cpu changes to another different OPP (in below diagram
253 * change frequency from OPP3 to OPP1), need recording interval
254 * for previous frequency OPP3 and update timestamp as start
255 * time for new frequency OPP1.
256 *
257 * OPP3
258 * +---------------------+
259 * OPP2 | |
260 * ---------+ |
261 * | OPP1
262 * +---------------
263 *
264 * |<- pstate duration ->|
265 * ^ ^
266 * pts cur_ts
267 */
268 pstate_idx = find_cpu_pstate_idx(*pstate);
269 if (pstate_idx >= MAX_PSTATE_ENTRIES)
270 return 0;
271
272 key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
273 val = bpf_map_lookup_elem(&pstate_duration, &key);
274 if (val)
275 __sync_fetch_and_add((long *)val, delta);
276
277 return 0;
278}
279
280char _license[] SEC("license") = "GPL";
281u32 _version SEC("version") = LINUX_VERSION_CODE;
282

/* Source: linux/samples/bpf/cpustat_kern.c */