// SPDX-License-Identifier: GPL-2.0
/*
 * buffered writeback throttling. loosely based on CoDel. We can't drop
 * packets for IO scheduling, so the logic is something like this:
 *
 * - Monitor latencies in a defined window of time.
 * - If the minimum latency in the above window exceeds some target, increment
 *   scaling step and scale down queue depth by a factor of 2x. The monitoring
 *   window is then shrunk to 100 / sqrt(scaling step + 1).
 * - For any window where we don't have solid data on what the latencies
 *   look like, retain status quo.
 * - If latencies look good, decrement scaling step.
 * - If we're only doing writes, allow the scaling step to go negative. This
 *   will temporarily boost write performance, snapping back to a stable
 *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
 *   positive scaling steps where we shrink the monitoring window, a negative
 *   scaling step retains the default step==0 window size.
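 *
 * As a rough illustration of the window shrink (example arithmetic, not a
 * measurement): with the default 100msec window, a scaling step of 3 gives
 * a monitoring window of 100 / sqrt(3 + 1) = 50msec, while steps <= 0 keep
 * the full 100msec window.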
 *
 * Copyright (C) 2016 Jens Axboe
 *
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>

#include "blk-stat.h"
#include "blk-wbt.h"
#include "blk-rq-qos.h"
#include "elevator.h"
#include "blk.h"

#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>

enum wbt_flags {
	WBT_TRACKED		= 1,	/* write, tracked for throttling */
	WBT_READ		= 2,	/* read */
	WBT_KSWAPD		= 4,	/* write, from kswapd */
	WBT_DISCARD		= 8,	/* discard */

	WBT_NR_BITS		= 4,	/* number of bits */
};
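
/*
 * The flags above are combined per request: for example, a throttled write
 * issued by kswapd is tagged WBT_TRACKED | WBT_KSWAPD in bio_to_wbt_flags(),
 * and a throttled discard is tagged WBT_TRACKED | WBT_DISCARD.
 */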

enum {
	WBT_RWQ_BG	= 0,
	WBT_RWQ_KSWAPD,
	WBT_RWQ_DISCARD,
	WBT_NUM_RWQ,
};

/*
 * If the current state is WBT_STATE_ON/OFF_DEFAULT, it can be changed to any
 * other state; if the current state is WBT_STATE_ON/OFF_MANUAL, it can only
 * be changed to WBT_STATE_OFF/ON_MANUAL.
 */
enum {
	WBT_STATE_ON_DEFAULT	= 1,	/* on by default */
	WBT_STATE_ON_MANUAL	= 2,	/* on manually by sysfs */
	WBT_STATE_OFF_DEFAULT	= 3,	/* off by default */
	WBT_STATE_OFF_MANUAL	= 4,	/* off manually by sysfs */
};
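
/*
 * For example (assuming the usual per-queue sysfs latency knob is what ends
 * up calling wbt_set_min_lat()): writing 0 to it moves us to
 * WBT_STATE_OFF_MANUAL, after which wbt_enable_default() will no longer turn
 * throttling back on, since it only flips WBT_STATE_OFF_DEFAULT.
 */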

struct rq_wb {
	/*
	 * Settings that govern how we throttle
	 */
	unsigned int wb_background;		/* background writeback */
	unsigned int wb_normal;			/* normal writeback */

	short enable_state;			/* WBT_STATE_* */

	/*
	 * Number of consecutive periods where we don't have enough
	 * information to make a firm scale up/down decision.
	 */
	unsigned int unknown_cnt;

	u64 win_nsec;				/* default window size */
	u64 cur_win_nsec;			/* current window size */

	struct blk_stat_callback *cb;

	u64 sync_issue;
	void *sync_cookie;

	unsigned long last_issue;		/* last non-throttled issue */
	unsigned long last_comp;		/* last non-throttled comp */
	unsigned long min_lat_nsec;
	struct rq_qos rqos;
	struct rq_wait rq_wait[WBT_NUM_RWQ];
	struct rq_depth rq_depth;
};

static inline struct rq_wb *RQWB(struct rq_qos *rqos)
{
	return container_of(rqos, struct rq_wb, rqos);
}

static inline void wbt_clear_state(struct request *rq)
{
	rq->wbt_flags = 0;
}

static inline enum wbt_flags wbt_flags(struct request *rq)
{
	return rq->wbt_flags;
}

static inline bool wbt_is_tracked(struct request *rq)
{
	return rq->wbt_flags & WBT_TRACKED;
}

static inline bool wbt_is_read(struct request *rq)
{
	return rq->wbt_flags & WBT_READ;
}

enum {
	/*
	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
	 * from here depending on device stats
	 */
	RWB_DEF_DEPTH	= 16,

	/*
	 * 100msec window
	 */
	RWB_WINDOW_NSEC		= 100 * 1000 * 1000ULL,

	/*
	 * Disregard stats, if we don't meet this minimum
	 */
	RWB_MIN_WRITE_SAMPLES	= 3,

	/*
	 * If we have this number of consecutive windows without enough
	 * information to scale up or down, step back toward a scaling
	 * step of 0.
	 */
	RWB_UNKNOWN_BUMP	= 5,
};
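
/*
 * With the defaults above (illustrative arithmetic, not a measurement):
 * RWB_UNKNOWN_BUMP == 5 inconclusive windows of RWB_WINDOW_NSEC == 100msec
 * each means we sit on an unknown latency picture for roughly half a second
 * before nudging the scaling step back toward 0.
 */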

static inline bool rwb_enabled(struct rq_wb *rwb)
{
	return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT &&
		      rwb->enable_state != WBT_STATE_OFF_MANUAL;
}

static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
{
	if (rwb_enabled(rwb)) {
		const unsigned long cur = jiffies;

		if (cur != *var)
			*var = cur;
	}
}

/*
 * If a task was rate throttled in balance_dirty_pages() within the last
 * second or so, use that to indicate a higher cleaning rate.
 */
static bool wb_recent_wait(struct rq_wb *rwb)
{
	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;

	return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}

static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
					  enum wbt_flags wb_acct)
{
	if (wb_acct & WBT_KSWAPD)
		return &rwb->rq_wait[WBT_RWQ_KSWAPD];
	else if (wb_acct & WBT_DISCARD)
		return &rwb->rq_wait[WBT_RWQ_DISCARD];

	return &rwb->rq_wait[WBT_RWQ_BG];
}

static void rwb_wake_all(struct rq_wb *rwb)
{
	int i;

	for (i = 0; i < WBT_NUM_RWQ; i++) {
		struct rq_wait *rqw = &rwb->rq_wait[i];

		if (wq_has_sleeper(&rqw->wait))
			wake_up_all(&rqw->wait);
	}
}

static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
			 enum wbt_flags wb_acct)
{
	int inflight, limit;

	inflight = atomic_dec_return(&rqw->inflight);

	/*
	 * For discards, our limit is always the background. For writes, if
	 * the device does write back caching, drop further down before we
	 * wake people up.
	 */
	if (wb_acct & WBT_DISCARD)
		limit = rwb->wb_background;
	else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) &&
		 !wb_recent_wait(rwb))
		limit = 0;
	else
		limit = rwb->wb_normal;

	/*
	 * Don't wake anyone up if we are above the normal limit.
	 */
	if (inflight && inflight >= limit)
		return;

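	/*
	 * Wake waiters only once a meaningful amount of room has opened up:
	 * either nothing is in flight any more, or the gap between the limit
	 * and inflight has reached half the background depth (for example,
	 * with wb_background == 4, a gap of 2).
	 */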
	if (wq_has_sleeper(&rqw->wait)) {
		int diff = limit - inflight;

		if (!inflight || diff >= rwb->wb_background / 2)
			wake_up_all(&rqw->wait);
	}
}

static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
{
	struct rq_wb *rwb = RQWB(rqos);
	struct rq_wait *rqw;

	if (!(wb_acct & WBT_TRACKED))
		return;

	rqw = get_rq_wait(rwb, wb_acct);
	wbt_rqw_done(rwb, rqw, wb_acct);
}

/*
 * Called on completion of a request. Note that it's also called when
 * a request is merged, at which point the request gets freed.
 */
static void wbt_done(struct rq_qos *rqos, struct request *rq)
{
	struct rq_wb *rwb = RQWB(rqos);

	if (!wbt_is_tracked(rq)) {
		if (rwb->sync_cookie == rq) {
			rwb->sync_issue = 0;
			rwb->sync_cookie = NULL;
		}

		if (wbt_is_read(rq))
			wb_timestamp(rwb, &rwb->last_comp);
	} else {
		WARN_ON_ONCE(rq == rwb->sync_cookie);
		__wbt_done(rqos, wbt_flags(rq));
	}
	wbt_clear_state(rq);
}

static inline bool stat_sample_valid(struct blk_rq_stat *stat)
{
	/*
	 * We need at least one read sample, and a minimum of
	 * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
	 * that it's writes impacting us, and not just some sole read on
	 * a device that is in a lower power state.
	 */
	return (stat[READ].nr_samples >= 1 &&
		stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}

static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
	u64 issue = READ_ONCE(rwb->sync_issue);

	if (!issue || !rwb->sync_cookie)
		return 0;

	return blk_time_get_ns() - issue;
}

static inline unsigned int wbt_inflight(struct rq_wb *rwb)
{
	unsigned int i, ret = 0;

	for (i = 0; i < WBT_NUM_RWQ; i++)
		ret += atomic_read(&rwb->rq_wait[i].inflight);

	return ret;
}

enum {
	LAT_OK = 1,
	LAT_UNKNOWN,
	LAT_UNKNOWN_WRITES,
	LAT_EXCEEDED,
};

static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
	struct rq_depth *rqd = &rwb->rq_depth;
	u64 thislat;

	/*
	 * If our stored sync issue exceeds the window size, or it
	 * exceeds our min target AND we haven't logged any entries,
	 * flag the latency as exceeded. wbt works off completion latencies,
	 * but for a flooded device, a single sync IO can take a long time
	 * to complete after being issued. If this time exceeds our
	 * monitoring window AND we didn't see any other completions in that
	 * window, then count that sync IO as a violation of the latency.
	 */
	thislat = rwb_sync_issue_lat(rwb);
	if (thislat > rwb->cur_win_nsec ||
	    (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
		trace_wbt_lat(bdi, thislat);
		return LAT_EXCEEDED;
	}

	/*
	 * No read/write mix, if stat isn't valid
	 */
	if (!stat_sample_valid(stat)) {
		/*
		 * If we had writes in this stat window and the window is
		 * current, we're only doing writes. If a task recently
		 * waited or still has writes in flight, consider us doing
		 * just writes as well.
		 */
		if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
		    wbt_inflight(rwb))
			return LAT_UNKNOWN_WRITES;
		return LAT_UNKNOWN;
	}

	/*
	 * If the 'min' latency exceeds our target, step down.
	 */
	if (stat[READ].min > rwb->min_lat_nsec) {
		trace_wbt_lat(bdi, stat[READ].min);
		trace_wbt_stat(bdi, stat);
		return LAT_EXCEEDED;
	}

	if (rqd->scale_step)
		trace_wbt_stat(bdi, stat);

	return LAT_OK;
}

static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
	struct rq_depth *rqd = &rwb->rq_depth;

	trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
			rwb->wb_background, rwb->wb_normal, rqd->max_depth);
}

static void calc_wb_limits(struct rq_wb *rwb)
{
	if (rwb->min_lat_nsec == 0) {
		rwb->wb_normal = rwb->wb_background = 0;
	} else if (rwb->rq_depth.max_depth <= 2) {
		rwb->wb_normal = rwb->rq_depth.max_depth;
		rwb->wb_background = 1;
	} else {
		rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
		rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
	}
}
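
/*
 * Worked example of the limits above (illustrative numbers, not derived from
 * any particular device): with rq_depth.max_depth == 16, wb_normal becomes
 * (16 + 1) / 2 == 8 and wb_background becomes (16 + 3) / 4 == 4, so normal
 * writeback may use half of the scaled depth and background writeback a
 * quarter of it.
 */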

static void scale_up(struct rq_wb *rwb)
{
	if (!rq_depth_scale_up(&rwb->rq_depth))
		return;
	calc_wb_limits(rwb);
	rwb->unknown_cnt = 0;
	rwb_wake_all(rwb);
	rwb_trace_step(rwb, tracepoint_string("scale up"));
}

static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
	if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle))
		return;
	calc_wb_limits(rwb);
	rwb->unknown_cnt = 0;
	rwb_trace_step(rwb, tracepoint_string("scale down"));
}

static void rwb_arm_timer(struct rq_wb *rwb)
{
	struct rq_depth *rqd = &rwb->rq_depth;

	if (rqd->scale_step > 0) {
		/*
		 * We should speed this up, using some variant of a fast
		 * integer inverse square root calculation. Since we only do
		 * this for every window expiration, it's not a huge deal,
		 * though.
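		 *
		 * The shifts keep the math in integer space:
		 * (win_nsec << 4) / int_sqrt((step + 1) << 8) is simply
		 * win_nsec / sqrt(step + 1). As an illustrative example, a
		 * 100msec window at scale_step == 3 gives
		 * 1600msec / int_sqrt(1024) = 1600 / 32 = 50msec.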
		 */
		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
					int_sqrt((rqd->scale_step + 1) << 8));
	} else {
		/*
		 * For step < 0, we don't want to increase/decrease the
		 * window size.
		 */
		rwb->cur_win_nsec = rwb->win_nsec;
	}

	blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
}

static void wb_timer_fn(struct blk_stat_callback *cb)
{
	struct rq_wb *rwb = cb->data;
	struct rq_depth *rqd = &rwb->rq_depth;
	unsigned int inflight = wbt_inflight(rwb);
	int status;

	if (!rwb->rqos.disk)
		return;

	status = latency_exceeded(rwb, cb->stat);

	trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight);

	/*
	 * If we exceeded the latency target, step down. If we did not,
	 * step one level up. If we don't know enough to say either exceeded
	 * or ok, then don't do anything.
	 */
	switch (status) {
	case LAT_EXCEEDED:
		scale_down(rwb, true);
		break;
	case LAT_OK:
		scale_up(rwb);
		break;
	case LAT_UNKNOWN_WRITES:
		/*
		 * We started at the center step, but don't have a valid
		 * read/write sample, though we do have writes going on.
		 * Allow the step to go negative, to increase write
		 * performance.
		 */
		scale_up(rwb);
		break;
	case LAT_UNKNOWN:
		if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
			break;
		/*
		 * We get here when we previously changed the scaling step,
		 * and we currently don't have a valid read/write sample.
		 * For that case, slowly return to the center state
		 * (step == 0).
		 */
		if (rqd->scale_step > 0)
			scale_up(rwb);
		else if (rqd->scale_step < 0)
			scale_down(rwb, false);
		break;
	default:
		break;
	}

	/*
	 * Re-arm timer, if we have IO in flight
	 */
	if (rqd->scale_step || inflight)
		rwb_arm_timer(rwb);
}

static void wbt_update_limits(struct rq_wb *rwb)
{
	struct rq_depth *rqd = &rwb->rq_depth;

	rqd->scale_step = 0;
	rqd->scaled_max = false;

	rq_depth_calc_max_depth(rqd);
	calc_wb_limits(rwb);

	rwb_wake_all(rwb);
}

bool wbt_disabled(struct request_queue *q)
{
	struct rq_qos *rqos = wbt_rq_qos(q);

	return !rqos || !rwb_enabled(RQWB(rqos));
}

u64 wbt_get_min_lat(struct request_queue *q)
{
	struct rq_qos *rqos = wbt_rq_qos(q);
	if (!rqos)
		return 0;
	return RQWB(rqos)->min_lat_nsec;
}

void wbt_set_min_lat(struct request_queue *q, u64 val)
{
	struct rq_qos *rqos = wbt_rq_qos(q);
	if (!rqos)
		return;

	RQWB(rqos)->min_lat_nsec = val;
	if (val)
		RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
	else
		RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL;

	wbt_update_limits(RQWB(rqos));
}

static bool close_io(struct rq_wb *rwb)
{
	const unsigned long now = jiffies;

	return time_before(now, rwb->last_issue + HZ / 10) ||
	       time_before(now, rwb->last_comp + HZ / 10);
}

#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)

static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
{
	unsigned int limit;

	if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD)
		return rwb->wb_background;

	/*
	 * At this point we know it's a buffered write. If this is
	 * kswapd trying to free memory, or REQ_SYNC is set, then
	 * it's WB_SYNC_ALL writeback, and we'll use the max limit for
	 * that. If the write is marked as a background write, then use
	 * the idle limit, or go to normal if we haven't had competing
	 * IO for a bit.
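	 *
	 * Using the example numbers from the calc_wb_limits() comment
	 * (max_depth == 16, wb_normal == 8, wb_background == 4), the
	 * branches below allow 16, 4 or 8 requests in flight, respectively.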
	 */
	if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
		limit = rwb->rq_depth.max_depth;
	else if ((opf & REQ_BACKGROUND) || close_io(rwb)) {
		/*
		 * If less than 100ms since we completed unrelated IO,
		 * limit background writeback to the background depth.
		 */
		limit = rwb->wb_background;
	} else
		limit = rwb->wb_normal;

	return limit;
}

struct wbt_wait_data {
	struct rq_wb *rwb;
	enum wbt_flags wb_acct;
	blk_opf_t opf;
};

static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
{
	struct wbt_wait_data *data = private_data;
	return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf));
}

static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
{
	struct wbt_wait_data *data = private_data;
	wbt_rqw_done(data->rwb, rqw, data->wb_acct);
}

/*
 * Block if we will exceed our limit, or if we are currently waiting for
 * the timer to kick off queuing again.
 */
static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
		       blk_opf_t opf)
{
	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
	struct wbt_wait_data data = {
		.rwb = rwb,
		.wb_acct = wb_acct,
		.opf = opf,
	};

	rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
}

static inline bool wbt_should_throttle(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_WRITE:
		/*
		 * Don't throttle WRITE_ODIRECT
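		 * (that is, writes carrying both REQ_SYNC and REQ_IDLE,
		 * which is what the check below looks for)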
		 */
		if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
		    (REQ_SYNC | REQ_IDLE))
			return false;
		fallthrough;
	case REQ_OP_DISCARD:
		return true;
	default:
		return false;
	}
}

static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
{
	enum wbt_flags flags = 0;

	if (!rwb_enabled(rwb))
		return 0;

	if (bio_op(bio) == REQ_OP_READ) {
		flags = WBT_READ;
	} else if (wbt_should_throttle(bio)) {
		if (current_is_kswapd())
			flags |= WBT_KSWAPD;
		if (bio_op(bio) == REQ_OP_DISCARD)
			flags |= WBT_DISCARD;
		flags |= WBT_TRACKED;
	}
	return flags;
}

static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
{
	struct rq_wb *rwb = RQWB(rqos);
	enum wbt_flags flags = bio_to_wbt_flags(rwb, bio);
	__wbt_done(rqos, flags);
}

/*
 * May sleep, if we have exceeded the writeback limits. Caller can pass
 * in an irq held spinlock, if it holds one when calling this function.
 * If we do sleep, we'll release and re-grab it.
 */
static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
{
	struct rq_wb *rwb = RQWB(rqos);
	enum wbt_flags flags;

	flags = bio_to_wbt_flags(rwb, bio);
	if (!(flags & WBT_TRACKED)) {
		if (flags & WBT_READ)
			wb_timestamp(rwb, &rwb->last_issue);
		return;
	}

	__wbt_wait(rwb, flags, bio->bi_opf);

	if (!blk_stat_is_active(rwb->cb))
		rwb_arm_timer(rwb);
}

static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
	struct rq_wb *rwb = RQWB(rqos);
	rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
}

static void wbt_issue(struct rq_qos *rqos, struct request *rq)
{
	struct rq_wb *rwb = RQWB(rqos);

	if (!rwb_enabled(rwb))
		return;

	/*
	 * Track the issue time of a sync IO, so that we can react quicker
	 * if it ends up taking a long time to complete. Note
	 * that this is just a hint. The request can go away when it completes,
	 * so it's important we never dereference it. We only use the address
	 * to compare with, which is why we store the sync_issue time locally.
	 */
	if (wbt_is_read(rq) && !rwb->sync_issue) {
		rwb->sync_cookie = rq;
		rwb->sync_issue = rq->io_start_time_ns;
	}
}

static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
{
	struct rq_wb *rwb = RQWB(rqos);
	if (!rwb_enabled(rwb))
		return;
	if (rq == rwb->sync_cookie) {
		rwb->sync_issue = 0;
		rwb->sync_cookie = NULL;
	}
}

/*
 * Enable wbt if defaults are configured that way
 */
void wbt_enable_default(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct rq_qos *rqos;
	bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);

	if (q->elevator &&
	    test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags))
		enable = false;

	/* Throttling already enabled? */
	rqos = wbt_rq_qos(q);
	if (rqos) {
		if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
			RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
		return;
	}

	/* Queue not registered? Maybe shutting down... */
	if (!blk_queue_registered(q))
		return;

	if (queue_is_mq(q) && enable)
		wbt_init(disk);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);

u64 wbt_default_latency_nsec(struct request_queue *q)
{
	/*
	 * We default to 2msec for non-rotational storage, and 75msec
	 * for rotational storage.
	 */
	if (blk_queue_nonrot(q))
		return 2000000ULL;
	else
		return 75000000ULL;
}

static int wbt_data_dir(const struct request *rq)
{
	const enum req_op op = req_op(rq);

	if (op == REQ_OP_READ)
		return READ;
	else if (op_is_write(op))
		return WRITE;

	/* don't account */
	return -1;
}

static void wbt_queue_depth_changed(struct rq_qos *rqos)
{
	RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue);
	wbt_update_limits(RQWB(rqos));
}

static void wbt_exit(struct rq_qos *rqos)
{
	struct rq_wb *rwb = RQWB(rqos);

	blk_stat_remove_callback(rqos->disk->queue, rwb->cb);
	blk_stat_free_callback(rwb->cb);
	kfree(rwb);
}

/*
 * Disable wbt, if enabled by default.
 */
void wbt_disable_default(struct gendisk *disk)
{
	struct rq_qos *rqos = wbt_rq_qos(disk->queue);
	struct rq_wb *rwb;
	if (!rqos)
		return;
	rwb = RQWB(rqos);
	if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
		blk_stat_deactivate(rwb->cb);
		rwb->enable_state = WBT_STATE_OFF_DEFAULT;
	}
}
EXPORT_SYMBOL_GPL(wbt_disable_default);

#ifdef CONFIG_BLK_DEBUG_FS
static int wbt_curr_win_nsec_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%llu\n", rwb->cur_win_nsec);
	return 0;
}

static int wbt_enabled_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%d\n", rwb->enable_state);
	return 0;
}

static int wbt_id_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;

	seq_printf(m, "%u\n", rqos->id);
	return 0;
}

static int wbt_inflight_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);
	int i;

	for (i = 0; i < WBT_NUM_RWQ; i++)
		seq_printf(m, "%d: inflight %d\n", i,
			   atomic_read(&rwb->rq_wait[i].inflight));
	return 0;
}

static int wbt_min_lat_nsec_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%lu\n", rwb->min_lat_nsec);
	return 0;
}

static int wbt_unknown_cnt_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%u\n", rwb->unknown_cnt);
	return 0;
}

static int wbt_normal_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%u\n", rwb->wb_normal);
	return 0;
}

static int wbt_background_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%u\n", rwb->wb_background);
	return 0;
}

static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = {
	{"curr_win_nsec", 0400, wbt_curr_win_nsec_show},
	{"enabled", 0400, wbt_enabled_show},
	{"id", 0400, wbt_id_show},
	{"inflight", 0400, wbt_inflight_show},
	{"min_lat_nsec", 0400, wbt_min_lat_nsec_show},
	{"unknown_cnt", 0400, wbt_unknown_cnt_show},
	{"wb_normal", 0400, wbt_normal_show},
	{"wb_background", 0400, wbt_background_show},
	{},
};
#endif

static const struct rq_qos_ops wbt_rqos_ops = {
	.throttle = wbt_wait,
	.issue = wbt_issue,
	.track = wbt_track,
	.requeue = wbt_requeue,
	.done = wbt_done,
	.cleanup = wbt_cleanup,
	.queue_depth_changed = wbt_queue_depth_changed,
	.exit = wbt_exit,
#ifdef CONFIG_BLK_DEBUG_FS
	.debugfs_attrs = wbt_debugfs_attrs,
#endif
};

int wbt_init(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct rq_wb *rwb;
	int i;
	int ret;

	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
	if (!rwb)
		return -ENOMEM;

	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
	if (!rwb->cb) {
		kfree(rwb);
		return -ENOMEM;
	}

	for (i = 0; i < WBT_NUM_RWQ; i++)
		rq_wait_init(&rwb->rq_wait[i]);

	rwb->last_comp = rwb->last_issue = jiffies;
	rwb->win_nsec = RWB_WINDOW_NSEC;
	rwb->enable_state = WBT_STATE_ON_DEFAULT;
	rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
	rwb->rq_depth.queue_depth = blk_queue_depth(q);
	wbt_update_limits(rwb);

	/*
	 * Assign rwb and add the stats callback.
	 */
	mutex_lock(&q->rq_qos_mutex);
	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
	mutex_unlock(&q->rq_qos_mutex);
	if (ret)
		goto err_free;

	blk_stat_add_callback(q, rwb->cb);

	return 0;

err_free:
	blk_stat_free_callback(rwb->cb);
	kfree(rwb);
	return ret;
}