// SPDX-License-Identifier: GPL-2.0

#include "blk-rq-qos.h"

/*
 * Increment 'v' if 'v' is below 'below'. Returns true if we succeeded,
 * false if 'v' + 1 would be bigger than 'below'.
 */
static bool atomic_inc_below(atomic_t *v, unsigned int below)
{
	unsigned int cur = atomic_read(v);

	do {
		if (cur >= below)
			return false;
	} while (!atomic_try_cmpxchg(v, &cur, cur + 1));

	return true;
}

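/*
 * Account one more in-flight request against @rq_wait, but only if the
 * result stays at or below @limit. Returns false without incrementing
 * when the limit has already been reached.
 */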
bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
{
	return atomic_inc_below(&rq_wait->inflight, limit);
}

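/*
 * The __rq_qos_*() helpers below walk the queue's rq_qos chain and invoke
 * the corresponding callback on every policy that implements it. Callers
 * are expected to have checked q->rq_qos for NULL first (the inline
 * rq_qos_*() wrappers in blk-rq-qos.h do that), so the chain is assumed
 * to be non-empty here.
 */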
void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio)
{
	do {
		if (rqos->ops->cleanup)
			rqos->ops->cleanup(rqos, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_done(struct rq_qos *rqos, struct request *rq)
{
	do {
		if (rqos->ops->done)
			rqos->ops->done(rqos, rq);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_issue(struct rq_qos *rqos, struct request *rq)
{
	do {
		if (rqos->ops->issue)
			rqos->ops->issue(rqos, rq);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq)
{
	do {
		if (rqos->ops->requeue)
			rqos->ops->requeue(rqos, rq);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio)
{
	do {
		if (rqos->ops->throttle)
			rqos->ops->throttle(rqos, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
	do {
		if (rqos->ops->track)
			rqos->ops->track(rqos, rq, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
	do {
		if (rqos->ops->merge)
			rqos->ops->merge(rqos, rq, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
{
	do {
		if (rqos->ops->done_bio)
			rqos->ops->done_bio(rqos, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_queue_depth_changed(struct rq_qos *rqos)
{
	do {
		if (rqos->ops->queue_depth_changed)
			rqos->ops->queue_depth_changed(rqos);
		rqos = rqos->next;
	} while (rqos);
}

/*
 * Return true if we can't increase the depth further by scaling.
 */
bool rq_depth_calc_max_depth(struct rq_depth *rqd)
{
	unsigned int depth;
	bool ret = false;

	/*
	 * For QD=1 devices, this is a special case. It's important for those
	 * to have one request ready when one completes, so force a depth of
	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
	 * since the device can't have more than that in flight. If we're
	 * scaling down, then keep a setting of 1/1/1.
	 */
	if (rqd->queue_depth == 1) {
		if (rqd->scale_step > 0)
			rqd->max_depth = 1;
		else {
			rqd->max_depth = 2;
			ret = true;
		}
	} else {
		/*
		 * scale_step == 0 is our default state. If we have suffered
		 * latency spikes, step will be > 0, and we shrink the
		 * allowed write depths. If step is < 0, we're only doing
		 * writes, and we allow a temporarily higher depth to
		 * increase performance.
		 */
		depth = min_t(unsigned int, rqd->default_depth,
			      rqd->queue_depth);
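		/*
		 * Each positive scale step roughly halves the allowed depth;
		 * the shift count is clamped to 31 so a runaway scale_step
		 * can't trigger an undefined shift. Negative steps double
		 * the depth per step, capped at 3/4 of the hardware queue
		 * depth below.
		 */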
		if (rqd->scale_step > 0)
			depth = 1 + ((depth - 1) >> min(31, rqd->scale_step));
		else if (rqd->scale_step < 0) {
			unsigned int maxd = 3 * rqd->queue_depth / 4;

			depth = 1 + ((depth - 1) << -rqd->scale_step);
			if (depth > maxd) {
				depth = maxd;
				ret = true;
			}
		}

		rqd->max_depth = depth;
	}

	return ret;
}

/* Returns true on success and false if scaling up wasn't possible */
bool rq_depth_scale_up(struct rq_depth *rqd)
{
	/*
	 * Hit max in previous round, stop here
	 */
	if (rqd->scaled_max)
		return false;

	rqd->scale_step--;

	rqd->scaled_max = rq_depth_calc_max_depth(rqd);
	return true;
}

/*
 * Scale the allowed depth down. If 'hard_throttle' is set, do it quicker,
 * since we had a latency violation. Returns true on success and false if
 * scaling down wasn't possible.
 */
bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
{
	/*
	 * Stop scaling down when we've hit the limit. This also prevents
	 * ->scale_step from going to crazy values, if the device can't
	 * keep up.
	 */
	if (rqd->max_depth == 1)
		return false;

	if (rqd->scale_step < 0 && hard_throttle)
		rqd->scale_step = 0;
	else
		rqd->scale_step++;

	rqd->scaled_max = false;
	rq_depth_calc_max_depth(rqd);
	return true;
}

struct rq_qos_wait_data {
	struct wait_queue_entry wq;
	struct task_struct *task;
	struct rq_wait *rqw;
	acquire_inflight_cb_t *cb;
	void *private_data;
	bool got_token;
};

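/*
 * Custom wake function for the rq_wait wait queue. Instead of blindly
 * waking the sleeper, first try to claim an inflight slot on its behalf
 * via the acquire callback; only if that succeeds is the task removed
 * from the wait queue and woken with a token in hand.
 */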
static int rq_qos_wake_function(struct wait_queue_entry *curr,
				unsigned int mode, int wake_flags, void *key)
{
	struct rq_qos_wait_data *data = container_of(curr,
						     struct rq_qos_wait_data,
						     wq);

	/*
	 * If we fail to get a budget, return -1 to interrupt the wake up loop
	 * in __wake_up_common.
	 */
	if (!data->cb(data->rqw, data->private_data))
		return -1;

	data->got_token = true;
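	/*
	 * Make the ->got_token store visible before we drop off the wait
	 * queue; pairs with the smp_rmb() in rq_qos_wait().
	 */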
	smp_wmb();
	list_del_init(&curr->entry);
	wake_up_process(data->task);
	return 1;
}

/**
 * rq_qos_wait - throttle on a rqw if we need to
 * @rqw: rqw to throttle on
 * @private_data: caller provided specific data
 * @acquire_inflight_cb: inc the rqw->inflight counter if we can
 * @cleanup_cb: the callback to cleanup in case we race with a waker
 *
 * This provides a uniform place for the rq_qos users to do their throttling.
 * Since you can end up with a lot of things sleeping at once, this manages the
 * waking up based on the resources available. The acquire_inflight_cb should
 * inc the rqw->inflight if we have the ability to do so, or return false if
 * not, and then we will sleep until room becomes available.
 *
 * cleanup_cb is in case that we race with a waker and need to cleanup the
 * inflight count accordingly.
 */
void rq_qos_wait(struct rq_wait *rqw, void *private_data,
		 acquire_inflight_cb_t *acquire_inflight_cb,
		 cleanup_cb_t *cleanup_cb)
{
	struct rq_qos_wait_data data = {
		.wq = {
			.func = rq_qos_wake_function,
			.entry = LIST_HEAD_INIT(data.wq.entry),
		},
		.task = current,
		.rqw = rqw,
		.cb = acquire_inflight_cb,
		.private_data = private_data,
	};
	bool has_sleeper;

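	/*
	 * Fast path: if nobody is already waiting on the rqw, try to grab an
	 * inflight slot without touching the wait queue. Skipping this when
	 * there are sleepers keeps the waiters roughly FIFO.
	 */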
	has_sleeper = wq_has_sleeper(&rqw->wait);
	if (!has_sleeper && acquire_inflight_cb(rqw, private_data))
		return;

	has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq,
						 TASK_UNINTERRUPTIBLE);
	do {
		/* The memory barrier in set_task_state saves us here. */
		if (data.got_token)
			break;
		if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
			finish_wait(&rqw->wait, &data.wq);

			/*
			 * We raced with rq_qos_wake_function() getting a token,
			 * which means we now have two. Put our local token
			 * and wake anyone else potentially waiting for one.
			 */
			smp_rmb();
			if (data.got_token)
				cleanup_cb(rqw, private_data);
			break;
		}
		io_schedule();
		has_sleeper = true;
		set_current_state(TASK_UNINTERRUPTIBLE);
	} while (1);
	finish_wait(&rqw->wait, &data.wq);
}

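/*
 * Tear down the whole rq_qos chain: detach each policy from the queue and
 * call its ->exit() callback, under the rq_qos_mutex.
 */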
void rq_qos_exit(struct request_queue *q)
{
	mutex_lock(&q->rq_qos_mutex);
	while (q->rq_qos) {
		struct rq_qos *rqos = q->rq_qos;

		q->rq_qos = rqos->next;
		rqos->ops->exit(rqos);
	}
	mutex_unlock(&q->rq_qos_mutex);
}

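/*
 * Attach @rqos to @disk's queue as policy @id. The queue is frozen while
 * the chain is updated, so no I/O can observe a half-linked policy.
 * Returns -EBUSY if a policy with the same id is already attached.
 */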
int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
	       const struct rq_qos_ops *ops)
{
	struct request_queue *q = disk->queue;

	lockdep_assert_held(&q->rq_qos_mutex);

	rqos->disk = disk;
	rqos->id = id;
	rqos->ops = ops;

	/*
	 * No IO can be in-flight when adding rqos, so freeze queue, which
	 * is fine since we only support rq_qos for blk-mq queues.
	 */
	blk_mq_freeze_queue(q);

	if (rq_qos_id(q, rqos->id))
		goto ebusy;
	rqos->next = q->rq_qos;
	q->rq_qos = rqos;

	blk_mq_unfreeze_queue(q);

	if (rqos->ops->debugfs_attrs) {
		mutex_lock(&q->debugfs_mutex);
		blk_mq_debugfs_register_rqos(rqos);
		mutex_unlock(&q->debugfs_mutex);
	}

	return 0;
ebusy:
	blk_mq_unfreeze_queue(q);
	return -EBUSY;
}

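/*
 * Unlink @rqos from its queue's rq_qos chain (with the queue frozen) and
 * remove its debugfs entries. The caller must hold rq_qos_mutex.
 */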
void rq_qos_del(struct rq_qos *rqos)
{
	struct request_queue *q = rqos->disk->queue;
	struct rq_qos **cur;

	lockdep_assert_held(&q->rq_qos_mutex);

	blk_mq_freeze_queue(q);
	for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
		if (*cur == rqos) {
			*cur = rqos->next;
			break;
		}
	}
	blk_mq_unfreeze_queue(q);

	mutex_lock(&q->debugfs_mutex);
	blk_mq_debugfs_unregister_rqos(rqos);
	mutex_unlock(&q->debugfs_mutex);
}