1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _LINUX_PSI_TYPES_H |
3 | #define _LINUX_PSI_TYPES_H |
4 | |
5 | #include <linux/kthread.h> |
6 | #include <linux/seqlock.h> |
7 | #include <linux/types.h> |
8 | #include <linux/kref.h> |
9 | #include <linux/wait.h> |
10 | |
11 | #ifdef CONFIG_PSI |
12 | |
13 | /* Tracked task states; each value indexes tasks[] in struct psi_group_cpu and is the bit number of its TSK_* mask */ |
14 | enum psi_task_count { |
15 | NR_IOWAIT, |
16 | NR_MEMSTALL, |
17 | NR_RUNNING, |
18 | /* |
19 | * For IO and CPU stalls the presence of running/oncpu tasks |
20 | * in the domain means a partial rather than a full stall. |
21 | * For memory it's not so simple because of page reclaimers: |
22 | * they are running/oncpu while representing a stall. To tell |
23 | * whether a domain has productivity left or not, we need to |
24 | * distinguish between regular running (i.e. productive) |
25 | * threads and memstall ones. |
26 | */ |
27 | NR_MEMSTALL_RUNNING, |
28 | NR_PSI_TASK_COUNTS = 4, /* number of counters above; also the bit number used by TSK_ONCPU */ |
29 | }; |
30 | |
31 | /* Task state bitmasks: one bit per enum psi_task_count entry */ |
32 | #define TSK_IOWAIT (1 << NR_IOWAIT) |
33 | #define TSK_MEMSTALL (1 << NR_MEMSTALL) |
34 | #define TSK_RUNNING (1 << NR_RUNNING) |
35 | #define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING) |
36 | |
37 | /* Only one task can be scheduled, so there is no corresponding task count; the bit sits just past the counted states */ |
38 | #define TSK_ONCPU (1 << NR_PSI_TASK_COUNTS) |
39 | |
40 | /* Resources that workloads could be stalled on; PSI_IRQ exists only with CONFIG_IRQ_TIME_ACCOUNTING */ |
41 | enum psi_res { |
42 | PSI_IO, |
43 | PSI_MEM, |
44 | PSI_CPU, |
45 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
46 | PSI_IRQ, |
47 | #endif |
48 | NR_PSI_RESOURCES, /* number of resources above, so it varies with the #ifdef */ |
49 | }; |
50 | |
51 | /* |
52 | * Pressure states for each resource (IRQ, when configured, only has a FULL state): |
53 | * |
54 | * SOME: Stalled tasks & working tasks |
55 | * FULL: Stalled tasks & no working tasks |
56 | */ |
57 | enum psi_states { |
58 | PSI_IO_SOME, |
59 | PSI_IO_FULL, |
60 | PSI_MEM_SOME, |
61 | PSI_MEM_FULL, |
62 | PSI_CPU_SOME, |
63 | PSI_CPU_FULL, |
64 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
65 | PSI_IRQ_FULL, |
66 | #endif |
67 | /* Only per-CPU, to weigh the CPU in the global average: */ |
68 | PSI_NONIDLE, |
69 | NR_PSI_STATES, /* number of states above, PSI_NONIDLE included */ |
70 | }; |
71 | |
72 | /* Use one bit in the state mask to track TSK_ONCPU (bit NR_PSI_STATES, one past the last enum psi_states value) */ |
73 | #define PSI_ONCPU (1 << NR_PSI_STATES) |
74 | |
75 | /* Flag whether to re-arm avgs_work, see details in get_recent_times(); uses the bit right after PSI_ONCPU */ |
76 | #define PSI_STATE_RESCHEDULE (1 << (NR_PSI_STATES + 1)) |
77 | |
78 | enum psi_aggregators { /* Aggregator kinds; also the first index of times_prev[][] and total[][] */ |
79 | PSI_AVGS = 0, /* aggregator type of unprivileged triggers */ |
80 | PSI_POLL, /* aggregator type of RT polling triggers */ |
81 | NR_PSI_AGGREGATORS, /* number of aggregators above */ |
82 | }; |
83 | |
84 | struct psi_group_cpu { /* Per-CPU part of a psi_group, reached through psi_group.pcpu */ |
85 | /* 1st cacheline updated by the scheduler */ |
86 | |
87 | /* Aggregator needs to know of concurrent changes */ |
88 | seqcount_t seq ____cacheline_aligned_in_smp; |
89 | |
90 | /* States of the tasks belonging to this group, one counter per enum psi_task_count entry */ |
91 | unsigned int tasks[NR_PSI_TASK_COUNTS]; |
92 | |
93 | /* Aggregate pressure state derived from the tasks; PSI_ONCPU is tracked in this mask too */ |
94 | u32 state_mask; |
95 | |
96 | /* Period time sampling buckets for each state of interest (ns), indexed by enum psi_states */ |
97 | u32 times[NR_PSI_STATES]; |
98 | |
99 | /* Time of last task change in this group (rq_clock) */ |
100 | u64 state_start; |
101 | |
102 | /* 2nd cacheline updated by the aggregator */ |
103 | |
104 | /* Delta detection against the sampling buckets, kept separately per enum psi_aggregators entry */ |
105 | u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES] |
106 | ____cacheline_aligned_in_smp; |
107 | }; |
108 | |
109 | /* PSI growth tracking window; struct psi_trigger embeds one as its win member */ |
110 | struct psi_window { |
111 | /* Window size in ns */ |
112 | u64 size; |
113 | |
114 | /* Start time of the current window in ns */ |
115 | u64 start_time; |
116 | |
117 | /* Value at the start of the current window */ |
118 | u64 start_value; |
119 | |
120 | /* Value growth over the previous window */ |
121 | u64 prev_growth; |
122 | }; |
123 | |
124 | struct psi_trigger { |
125 | /* PSI state being monitored by the trigger */ |
126 | enum psi_states state; |
127 | |
128 | /* User-specified threshold in ns */ |
129 | u64 threshold; |
130 | |
131 | /* List node inside triggers list */ |
132 | struct list_head node; |
133 | |
134 | /* Backpointer to the owning group, needed during trigger destruction */ |
135 | struct psi_group *group; |
136 | |
137 | /* Wait queue for polling */ |
138 | wait_queue_head_t event_wait; |
139 | |
140 | /* Kernfs file for cgroup triggers */ |
141 | struct kernfs_open_file *of; |
142 | |
143 | /* Pending event flag */ |
144 | int event; |
145 | |
146 | /* Tracking window */ |
147 | struct psi_window win; |
148 | |
149 | /* |
150 | * Time last event was generated. Used for rate-limiting |
151 | * events to one per window |
152 | */ |
153 | u64 last_event_time; |
154 | |
155 | /* Deferred event(s) from previous ratelimit window */ |
156 | bool pending_event; |
157 | |
158 | /* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */ |
159 | enum psi_aggregators aggregator; |
160 | }; |
161 | |
162 | struct psi_group { /* Arrays sized NR_PSI_STATES - 1 below have no slot for PSI_NONIDLE, the last state */ |
163 | struct psi_group *parent; |
164 | bool enabled; |
165 | |
166 | /* Protects data used by the aggregator */ |
167 | struct mutex avgs_lock; |
168 | |
169 | /* Per-cpu task state & time tracking */ |
170 | struct psi_group_cpu __percpu *pcpu; |
171 | |
172 | /* Running pressure averages */ |
173 | u64 avg_total[NR_PSI_STATES - 1]; |
174 | u64 avg_last_update; |
175 | u64 avg_next_update; |
176 | |
177 | /* Aggregator work control */ |
178 | struct delayed_work avgs_work; |
179 | |
180 | /* Unprivileged triggers against N*PSI_FREQ windows */ |
181 | struct list_head avg_triggers; |
182 | u32 avg_nr_triggers[NR_PSI_STATES - 1]; |
183 | |
184 | /* Total stall times (one row per enum psi_aggregators entry) and sampled pressure averages */ |
185 | u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1]; |
186 | unsigned long avg[NR_PSI_STATES - 1][3]; /* NOTE(review): 3 is presumably the number of averaging windows - confirm against users */ |
187 | |
188 | /* Monitor RT polling work control */ |
189 | struct task_struct __rcu *rtpoll_task; |
190 | struct timer_list rtpoll_timer; |
191 | wait_queue_head_t rtpoll_wait; |
192 | atomic_t rtpoll_wakeup; |
193 | atomic_t rtpoll_scheduled; |
194 | |
195 | /* Protects data used by the monitor */ |
196 | struct mutex rtpoll_trigger_lock; |
197 | |
198 | /* Configured RT polling triggers */ |
199 | struct list_head rtpoll_triggers; |
200 | u32 rtpoll_nr_triggers[NR_PSI_STATES - 1]; |
201 | u32 rtpoll_states; |
202 | u64 rtpoll_min_period; |
203 | |
204 | /* Total stall times at the start of RT polling monitor activation */ |
205 | u64 rtpoll_total[NR_PSI_STATES - 1]; |
206 | u64 rtpoll_next_update; |
207 | u64 rtpoll_until; |
208 | }; |
209 | |
210 | #else /* CONFIG_PSI */ |
211 | |
212 | #define NR_PSI_RESOURCES 0 /* no PSI resources when CONFIG_PSI is off */ |
213 | |
214 | struct psi_group { }; /* empty stand-in so the type still exists when CONFIG_PSI is off */ |
215 | |
216 | #endif /* CONFIG_PSI */ |
217 | |
218 | #endif /* _LINUX_PSI_TYPES_H */ |
219 | |