1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF) |
3 | * |
4 | * Copyright (C) 2013 Terry Lam <vtlam@google.com> |
5 | * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> |
6 | */ |
7 | |
8 | #include <linux/jiffies.h> |
9 | #include <linux/module.h> |
10 | #include <linux/skbuff.h> |
11 | #include <linux/vmalloc.h> |
12 | #include <linux/siphash.h> |
13 | #include <net/pkt_sched.h> |
14 | #include <net/sock.h> |
15 | |
16 | /* Heavy-Hitter Filter (HHF) |
17 | * |
18 | * Principles : |
19 | * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter |
20 | * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified |
21 | * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. |
22 | * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, |
23 | * in which the heavy-hitter bucket is served with less weight. |
24 | * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) |
25 | * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have |
 * a higher share of bandwidth.
27 | * |
28 | * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the |
29 | * following paper: |
30 | * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and |
31 | * Accounting", in ACM SIGCOMM, 2002. |
32 | * |
33 | * Conceptually, a multi-stage filter comprises k independent hash functions |
34 | * and k counter arrays. Packets are indexed into k counter arrays by k hash |
35 | * functions, respectively. The counters are then increased by the packet sizes. |
36 | * Therefore, |
37 | * - For a heavy-hitter flow: *all* of its k array counters must be large. |
38 | * - For a non-heavy-hitter flow: some of its k array counters can be large |
39 | * due to hash collision with other small flows; however, with high |
40 | * probability, not *all* k counters are large. |
41 | * |
42 | * By the design of the multi-stage filter algorithm, the false negative rate |
43 | * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is |
44 | * susceptible to false positives (non-heavy-hitters mistakenly classified as |
45 | * heavy-hitters). |
46 | * Therefore, we also implement the following optimizations to reduce false |
47 | * positives by avoiding unnecessary increment of the counter values: |
48 | * - Optimization O1: once a heavy-hitter is identified, its bytes are not |
49 | * accounted in the array counters. This technique is called "shielding" |
50 | * in Section 3.3.1 of [EV02]. |
51 | * - Optimization O2: conservative update of counters |
52 | * (Section 3.3.2 of [EV02]), |
53 | * New counter value = max {old counter value, |
54 | * smallest counter value + packet bytes} |
55 | * |
56 | * Finally, we refresh the counters periodically since otherwise the counter |
57 | * values will keep accumulating. |
58 | * |
59 | * Once a flow is classified as heavy-hitter, we also save its per-flow state |
60 | * in an exact-matching flow table so that its subsequent packets can be |
61 | * dispatched to the heavy-hitter bucket accordingly. |
62 | * |
63 | * |
64 | * At a high level, this qdisc works as follows: |
65 | * Given a packet p: |
66 | * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching |
67 | * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter |
68 | * bucket. |
69 | * - Otherwise, forward p to the multi-stage filter, denoted filter F |
70 | * + If F decides that p belongs to a non-heavy-hitter flow, then send p |
71 | * to the non-heavy-hitter bucket. |
72 | * + Otherwise, if F decides that p belongs to a new heavy-hitter flow, |
73 | * then set up a new flow entry for the flow-id of p in the table T and |
74 | * send p to the heavy-hitter bucket. |
75 | * |
76 | * In this implementation: |
77 | * - T is a fixed-size hash-table with 1024 entries. Hash collision is |
78 | * resolved by linked-list chaining. |
79 | * - F has four counter arrays, each array containing 1024 32-bit counters. |
80 | * That means 4 * 1024 * 32 bits = 16KB of memory. |
81 | * - Since each array in F contains 1024 counters, 10 bits are sufficient to |
82 | * index into each array. |
 *   Hence, instead of having four hash functions, we chop the 32-bit
 *   skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is
 *   computed as the XOR sum of those three chunks (see the worked example
 *   below).
86 | * - We need to clear the counter arrays periodically; however, directly |
87 | * memsetting 16KB of memory can lead to cache eviction and unwanted delay. |
 *   So by representing each counter with a shadow valid bit, we only need
 *   to reset 4K valid bits (i.e. 512 bytes) instead of 16KB of memory.
90 | * - The Deficit Round Robin engine is taken from fq_codel implementation |
91 | * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to |
92 | * fq_codel_flow in fq_codel implementation. |
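 *   Worked example of the chunking above: a 32-bit skb-hash of 0x12345678
 *   splits into the chunks 0x278, 0x115 and 0x123 (its two leftover top
 *   bits are zero), so the fourth array index is
 *   0x278 ^ 0x115 ^ 0x123 = 0x24E.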
93 | * |
94 | */ |
95 | |
96 | /* Non-configurable parameters */ |
97 | #define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */ |
98 | #define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */ |
99 | #define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */ |
100 | #define HHF_BIT_MASK_LEN 10 /* masking 10 bits */ |
101 | #define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */ |
102 | |
103 | #define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */ |
104 | enum wdrr_bucket_idx { |
105 | WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */ |
106 | WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */ |
107 | }; |
108 | |
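/* Wrap-safe time comparison for u32 jiffies values: true if time @a is
 * strictly before time @b, even across wraparound (the same
 * signed-subtraction idiom as time_before()).
 */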
109 | #define hhf_time_before(a, b) \ |
110 | (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) |
111 | |
112 | /* Heavy-hitter per-flow state */ |
113 | struct hh_flow_state { |
114 | u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */ |
115 | u32 hit_timestamp; /* last time heavy-hitter was seen */ |
116 | struct list_head flowchain; /* chaining under hash collision */ |
117 | }; |
118 | |
119 | /* Weighted Deficit Round Robin (WDRR) scheduler */ |
120 | struct wdrr_bucket { |
121 | struct sk_buff *head; |
122 | struct sk_buff *tail; |
123 | struct list_head bucketchain; |
124 | int deficit; |
125 | }; |
126 | |
127 | struct hhf_sched_data { |
128 | struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; |
129 | siphash_key_t perturbation; /* hash perturbation */ |
130 | u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ |
131 | u32 drop_overlimit; /* number of times max qdisc packet |
132 | * limit was hit |
133 | */ |
134 | struct list_head *hh_flows; /* table T (currently active HHs) */ |
135 | u32 hh_flows_limit; /* max active HH allocs */ |
136 | u32 hh_flows_overlimit; /* num of disallowed HH allocs */ |
137 | u32 hh_flows_total_cnt; /* total admitted HHs */ |
138 | u32 hh_flows_current_cnt; /* total current HHs */ |
139 | u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ |
140 | u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays |
141 | * was reset |
142 | */ |
143 | unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits |
144 | * of hhf_arrays |
145 | */ |
146 | /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */ |
147 | struct list_head new_buckets; /* list of new buckets */ |
148 | struct list_head old_buckets; /* list of old buckets */ |
149 | |
150 | /* Configurable HHF parameters */ |
151 | u32 hhf_reset_timeout; /* interval to reset counter |
152 | * arrays in filter F |
153 | * (default 40ms) |
154 | */ |
155 | u32 hhf_admit_bytes; /* counter thresh to classify as |
156 | * HH (default 128KB). |
157 | * With these default values, |
158 | * 128KB / 40ms = 25 Mbps |
159 | * i.e., we expect to capture HHs |
160 | * sending > 25 Mbps. |
161 | */ |
162 | u32 hhf_evict_timeout; /* aging threshold to evict idle |
163 | * HHs out of table T. This should |
164 | * be large enough to avoid |
165 | * reordering during HH eviction. |
166 | * (default 1s) |
167 | */ |
168 | u32 hhf_non_hh_weight; /* WDRR weight for non-HHs |
169 | * (default 2, |
170 | * i.e., non-HH : HH = 2 : 1) |
171 | */ |
172 | }; |
173 | |
174 | static u32 hhf_time_stamp(void) |
175 | { |
176 | return jiffies; |
177 | } |
178 | |
179 | /* Looks up a heavy-hitter flow in a chaining list of table T. */ |
180 | static struct hh_flow_state *seek_list(const u32 hash, |
181 | struct list_head *head, |
182 | struct hhf_sched_data *q) |
183 | { |
184 | struct hh_flow_state *flow, *next; |
185 | u32 now = hhf_time_stamp(); |
186 | |
187 | if (list_empty(head)) |
188 | return NULL; |
189 | |
190 | list_for_each_entry_safe(flow, next, head, flowchain) { |
191 | u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; |
192 | |
193 | if (hhf_time_before(prev, now)) { |
194 | /* Delete expired heavy-hitters, but preserve one entry |
195 | * to avoid kzalloc() when next time this slot is hit. |
196 | */ |
			if (list_is_last(&flow->flowchain, head))
				return NULL;
			list_del(&flow->flowchain);
			kfree(flow);
201 | q->hh_flows_current_cnt--; |
202 | } else if (flow->hash_id == hash) { |
203 | return flow; |
204 | } |
205 | } |
206 | return NULL; |
207 | } |
208 | |
/* Returns a flow state entry for a new heavy-hitter.  Either reuses an
 * expired entry or dynamically allocates a new entry.
 */
212 | static struct hh_flow_state *alloc_new_hh(struct list_head *head, |
213 | struct hhf_sched_data *q) |
214 | { |
215 | struct hh_flow_state *flow; |
216 | u32 now = hhf_time_stamp(); |
217 | |
218 | if (!list_empty(head)) { |
219 | /* Find an expired heavy-hitter flow entry. */ |
220 | list_for_each_entry(flow, head, flowchain) { |
221 | u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; |
222 | |
223 | if (hhf_time_before(prev, now)) |
224 | return flow; |
225 | } |
226 | } |
227 | |
228 | if (q->hh_flows_current_cnt >= q->hh_flows_limit) { |
229 | q->hh_flows_overlimit++; |
230 | return NULL; |
231 | } |
232 | /* Create new entry. */ |
	flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC);
234 | if (!flow) |
235 | return NULL; |
236 | |
237 | q->hh_flows_current_cnt++; |
	INIT_LIST_HEAD(&flow->flowchain);
	list_add_tail(&flow->flowchain, head);
240 | |
241 | return flow; |
242 | } |
243 | |
244 | /* Assigns packets to WDRR buckets. Implements a multi-stage filter to |
245 | * classify heavy-hitters. |
246 | */ |
247 | static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) |
248 | { |
249 | struct hhf_sched_data *q = qdisc_priv(sch); |
250 | u32 tmp_hash, hash; |
251 | u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; |
252 | struct hh_flow_state *flow; |
253 | u32 pkt_len, min_hhf_val; |
254 | int i; |
255 | u32 prev; |
256 | u32 now = hhf_time_stamp(); |
257 | |
258 | /* Reset the HHF counter arrays if this is the right time. */ |
259 | prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; |
260 | if (hhf_time_before(prev, now)) { |
261 | for (i = 0; i < HHF_ARRAYS_CNT; i++) |
			bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN);
263 | q->hhf_arrays_reset_timestamp = now; |
264 | } |
265 | |
266 | /* Get hashed flow-id of the skb. */ |
	hash = skb_get_hash_perturb(skb, &q->perturbation);
268 | |
269 | /* Check if this packet belongs to an already established HH flow. */ |
270 | flow_pos = hash & HHF_BIT_MASK; |
	flow = seek_list(hash, &q->hh_flows[flow_pos], q);
272 | if (flow) { /* found its HH flow */ |
273 | flow->hit_timestamp = now; |
274 | return WDRR_BUCKET_FOR_HH; |
275 | } |
276 | |
277 | /* Now pass the packet through the multi-stage filter. */ |
278 | tmp_hash = hash; |
279 | xorsum = 0; |
280 | for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { |
281 | /* Split the skb_hash into three 10-bit chunks. */ |
282 | filter_pos[i] = tmp_hash & HHF_BIT_MASK; |
283 | xorsum ^= filter_pos[i]; |
284 | tmp_hash >>= HHF_BIT_MASK_LEN; |
285 | } |
286 | /* The last chunk is computed as XOR sum of other chunks. */ |
287 | filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; |
288 | |
289 | pkt_len = qdisc_pkt_len(skb); |
290 | min_hhf_val = ~0U; |
291 | for (i = 0; i < HHF_ARRAYS_CNT; i++) { |
292 | u32 val; |
293 | |
294 | if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { |
295 | q->hhf_arrays[i][filter_pos[i]] = 0; |
296 | __set_bit(filter_pos[i], q->hhf_valid_bits[i]); |
297 | } |
298 | |
299 | val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; |
300 | if (min_hhf_val > val) |
301 | min_hhf_val = val; |
302 | } |
303 | |
304 | /* Found a new HH iff all counter values > HH admit threshold. */ |
305 | if (min_hhf_val > q->hhf_admit_bytes) { |
306 | /* Just captured a new heavy-hitter. */ |
		flow = alloc_new_hh(&q->hh_flows[flow_pos], q);
308 | if (!flow) /* memory alloc problem */ |
309 | return WDRR_BUCKET_FOR_NON_HH; |
310 | flow->hash_id = hash; |
311 | flow->hit_timestamp = now; |
312 | q->hh_flows_total_cnt++; |
313 | |
314 | /* By returning without updating counters in q->hhf_arrays, |
315 | * we implicitly implement "shielding" (see Optimization O1). |
316 | */ |
317 | return WDRR_BUCKET_FOR_HH; |
318 | } |
319 | |
320 | /* Conservative update of HHF arrays (see Optimization O2). */ |
321 | for (i = 0; i < HHF_ARRAYS_CNT; i++) { |
322 | if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) |
323 | q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; |
324 | } |
325 | return WDRR_BUCKET_FOR_NON_HH; |
326 | } |
327 | |
328 | /* Removes one skb from head of bucket. */ |
329 | static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) |
330 | { |
331 | struct sk_buff *skb = bucket->head; |
332 | |
333 | bucket->head = skb->next; |
334 | skb_mark_not_on_list(skb); |
335 | return skb; |
336 | } |
337 | |
338 | /* Tail-adds skb to bucket. */ |
339 | static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) |
340 | { |
341 | if (bucket->head == NULL) |
342 | bucket->head = skb; |
343 | else |
344 | bucket->tail->next = skb; |
345 | bucket->tail = skb; |
346 | skb->next = NULL; |
347 | } |
348 | |
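/* Drops one packet from the heavy-hitter bucket if possible, otherwise from
 * the non-heavy-hitter bucket; returns the index of the bucket that was
 * targeted.
 */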
349 | static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free) |
350 | { |
351 | struct hhf_sched_data *q = qdisc_priv(sch); |
352 | struct wdrr_bucket *bucket; |
353 | |
354 | /* Always try to drop from heavy-hitters first. */ |
355 | bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; |
356 | if (!bucket->head) |
357 | bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; |
358 | |
359 | if (bucket->head) { |
360 | struct sk_buff *skb = dequeue_head(bucket); |
361 | |
362 | sch->q.qlen--; |
363 | qdisc_qstats_backlog_dec(sch, skb); |
364 | qdisc_drop(skb, sch, to_free); |
365 | } |
366 | |
367 | /* Return id of the bucket from which the packet was dropped. */ |
368 | return bucket - q->buckets; |
369 | } |
370 | |
371 | static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, |
372 | struct sk_buff **to_free) |
373 | { |
374 | struct hhf_sched_data *q = qdisc_priv(sch); |
375 | enum wdrr_bucket_idx idx; |
376 | struct wdrr_bucket *bucket; |
377 | unsigned int prev_backlog; |
378 | |
379 | idx = hhf_classify(skb, sch); |
380 | |
381 | bucket = &q->buckets[idx]; |
382 | bucket_add(bucket, skb); |
383 | qdisc_qstats_backlog_inc(sch, skb); |
384 | |
	if (list_empty(&bucket->bucketchain)) {
386 | unsigned int weight; |
387 | |
388 | /* The logic of new_buckets vs. old_buckets is the same as |
389 | * new_flows vs. old_flows in the implementation of fq_codel, |
390 | * i.e., short bursts of non-HHs should have strict priority. |
391 | */ |
392 | if (idx == WDRR_BUCKET_FOR_HH) { |
393 | /* Always move heavy-hitters to old bucket. */ |
394 | weight = 1; |
			list_add_tail(&bucket->bucketchain, &q->old_buckets);
396 | } else { |
397 | weight = q->hhf_non_hh_weight; |
			list_add_tail(&bucket->bucketchain, &q->new_buckets);
399 | } |
400 | bucket->deficit = weight * q->quantum; |
401 | } |
402 | if (++sch->q.qlen <= sch->limit) |
403 | return NET_XMIT_SUCCESS; |
404 | |
405 | prev_backlog = sch->qstats.backlog; |
406 | q->drop_overlimit++; |
407 | /* Return Congestion Notification only if we dropped a packet from this |
408 | * bucket. |
409 | */ |
410 | if (hhf_drop(sch, to_free) == idx) |
411 | return NET_XMIT_CN; |
412 | |
413 | /* As we dropped a packet, better let upper stack know this. */ |
	qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);
415 | return NET_XMIT_SUCCESS; |
416 | } |
417 | |
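/* WDRR dequeue: buckets on new_buckets are served before those on
 * old_buckets (so short bursts of non-heavy-hitters get priority), and a
 * bucket with an exhausted deficit is replenished by weight * quantum and
 * moved to the tail of old_buckets.
 */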
418 | static struct sk_buff *hhf_dequeue(struct Qdisc *sch) |
419 | { |
420 | struct hhf_sched_data *q = qdisc_priv(sch); |
421 | struct sk_buff *skb = NULL; |
422 | struct wdrr_bucket *bucket; |
423 | struct list_head *head; |
424 | |
425 | begin: |
426 | head = &q->new_buckets; |
427 | if (list_empty(head)) { |
428 | head = &q->old_buckets; |
429 | if (list_empty(head)) |
430 | return NULL; |
431 | } |
432 | bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); |
433 | |
434 | if (bucket->deficit <= 0) { |
435 | int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? |
436 | 1 : q->hhf_non_hh_weight; |
437 | |
438 | bucket->deficit += weight * q->quantum; |
		list_move_tail(&bucket->bucketchain, &q->old_buckets);
440 | goto begin; |
441 | } |
442 | |
443 | if (bucket->head) { |
444 | skb = dequeue_head(bucket); |
445 | sch->q.qlen--; |
446 | qdisc_qstats_backlog_dec(sch, skb); |
447 | } |
448 | |
449 | if (!skb) { |
450 | /* Force a pass through old_buckets to prevent starvation. */ |
		if ((head == &q->new_buckets) && !list_empty(&q->old_buckets))
			list_move_tail(&bucket->bucketchain, &q->old_buckets);
		else
			list_del_init(&bucket->bucketchain);
455 | goto begin; |
456 | } |
457 | qdisc_bstats_update(sch, skb); |
458 | bucket->deficit -= qdisc_pkt_len(skb); |
459 | |
460 | return skb; |
461 | } |
462 | |
463 | static void hhf_reset(struct Qdisc *sch) |
464 | { |
465 | struct sk_buff *skb; |
466 | |
467 | while ((skb = hhf_dequeue(sch)) != NULL) |
		rtnl_kfree_skbs(skb, skb);
469 | } |
470 | |
471 | static void hhf_destroy(struct Qdisc *sch) |
472 | { |
473 | int i; |
474 | struct hhf_sched_data *q = qdisc_priv(sch); |
475 | |
476 | for (i = 0; i < HHF_ARRAYS_CNT; i++) { |
		kvfree(q->hhf_arrays[i]);
		kvfree(q->hhf_valid_bits[i]);
479 | } |
480 | |
481 | if (!q->hh_flows) |
482 | return; |
483 | |
484 | for (i = 0; i < HH_FLOWS_CNT; i++) { |
485 | struct hh_flow_state *flow, *next; |
486 | struct list_head *head = &q->hh_flows[i]; |
487 | |
488 | if (list_empty(head)) |
489 | continue; |
490 | list_for_each_entry_safe(flow, next, head, flowchain) { |
			list_del(&flow->flowchain);
			kfree(flow);
493 | } |
494 | } |
	kvfree(q->hh_flows);
496 | } |
497 | |
498 | static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { |
499 | [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 }, |
500 | [TCA_HHF_QUANTUM] = { .type = NLA_U32 }, |
501 | [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, |
502 | [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 }, |
503 | [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 }, |
504 | [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 }, |
505 | [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 }, |
506 | }; |
507 | |
508 | static int hhf_change(struct Qdisc *sch, struct nlattr *opt, |
509 | struct netlink_ext_ack *extack) |
510 | { |
511 | struct hhf_sched_data *q = qdisc_priv(sch); |
512 | struct nlattr *tb[TCA_HHF_MAX + 1]; |
513 | unsigned int qlen, prev_backlog; |
514 | int err; |
515 | u64 non_hh_quantum; |
516 | u32 new_quantum = q->quantum; |
517 | u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; |
518 | |
	err = nla_parse_nested_deprecated(tb, TCA_HHF_MAX, opt, hhf_policy,
					  NULL);
521 | if (err < 0) |
522 | return err; |
523 | |
524 | if (tb[TCA_HHF_QUANTUM]) |
		new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]);
526 | |
527 | if (tb[TCA_HHF_NON_HH_WEIGHT]) |
		new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);
529 | |
530 | non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; |
531 | if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX) |
532 | return -EINVAL; |
533 | |
	sch_tree_lock(sch);
535 | |
536 | if (tb[TCA_HHF_BACKLOG_LIMIT]) |
		sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]);
538 | |
539 | q->quantum = new_quantum; |
540 | q->hhf_non_hh_weight = new_hhf_non_hh_weight; |
541 | |
542 | if (tb[TCA_HHF_HH_FLOWS_LIMIT]) |
		q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]);
544 | |
545 | if (tb[TCA_HHF_RESET_TIMEOUT]) { |
		u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]);

		q->hhf_reset_timeout = usecs_to_jiffies(us);
549 | } |
550 | |
551 | if (tb[TCA_HHF_ADMIT_BYTES]) |
		q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]);
553 | |
554 | if (tb[TCA_HHF_EVICT_TIMEOUT]) { |
		u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]);

		q->hhf_evict_timeout = usecs_to_jiffies(us);
558 | } |
559 | |
560 | qlen = sch->q.qlen; |
561 | prev_backlog = sch->qstats.backlog; |
562 | while (sch->q.qlen > sch->limit) { |
563 | struct sk_buff *skb = hhf_dequeue(sch); |
564 | |
		rtnl_kfree_skbs(skb, skb);
566 | } |
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen,
				  prev_backlog - sch->qstats.backlog);
569 | |
	sch_tree_unlock(sch);
571 | return 0; |
572 | } |
573 | |
574 | static int hhf_init(struct Qdisc *sch, struct nlattr *opt, |
575 | struct netlink_ext_ack *extack) |
576 | { |
577 | struct hhf_sched_data *q = qdisc_priv(sch); |
578 | int i; |
579 | |
580 | sch->limit = 1000; |
	q->quantum = psched_mtu(qdisc_dev(sch));
	get_random_bytes(&q->perturbation, sizeof(q->perturbation));
	INIT_LIST_HEAD(&q->new_buckets);
	INIT_LIST_HEAD(&q->old_buckets);
585 | |
586 | /* Configurable HHF parameters */ |
587 | q->hhf_reset_timeout = HZ / 25; /* 40 ms */ |
588 | q->hhf_admit_bytes = 131072; /* 128 KB */ |
589 | q->hhf_evict_timeout = HZ; /* 1 sec */ |
590 | q->hhf_non_hh_weight = 2; |
591 | |
592 | if (opt) { |
593 | int err = hhf_change(sch, opt, extack); |
594 | |
595 | if (err) |
596 | return err; |
597 | } |
598 | |
599 | if (!q->hh_flows) { |
600 | /* Initialize heavy-hitter flow table. */ |
		q->hh_flows = kvcalloc(HH_FLOWS_CNT, sizeof(struct list_head),
				       GFP_KERNEL);
603 | if (!q->hh_flows) |
604 | return -ENOMEM; |
605 | for (i = 0; i < HH_FLOWS_CNT; i++) |
			INIT_LIST_HEAD(&q->hh_flows[i]);
607 | |
608 | /* Cap max active HHs at twice len of hh_flows table. */ |
609 | q->hh_flows_limit = 2 * HH_FLOWS_CNT; |
610 | q->hh_flows_overlimit = 0; |
611 | q->hh_flows_total_cnt = 0; |
612 | q->hh_flows_current_cnt = 0; |
613 | |
614 | /* Initialize heavy-hitter filter arrays. */ |
615 | for (i = 0; i < HHF_ARRAYS_CNT; i++) { |
			q->hhf_arrays[i] = kvcalloc(HHF_ARRAYS_LEN,
						    sizeof(u32),
						    GFP_KERNEL);
619 | if (!q->hhf_arrays[i]) { |
620 | /* Note: hhf_destroy() will be called |
621 | * by our caller. |
622 | */ |
623 | return -ENOMEM; |
624 | } |
625 | } |
626 | q->hhf_arrays_reset_timestamp = hhf_time_stamp(); |
627 | |
628 | /* Initialize valid bits of heavy-hitter filter arrays. */ |
629 | for (i = 0; i < HHF_ARRAYS_CNT; i++) { |
630 | q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN / |
631 | BITS_PER_BYTE, GFP_KERNEL); |
632 | if (!q->hhf_valid_bits[i]) { |
633 | /* Note: hhf_destroy() will be called |
634 | * by our caller. |
635 | */ |
636 | return -ENOMEM; |
637 | } |
638 | } |
639 | |
640 | /* Initialize Weighted DRR buckets. */ |
641 | for (i = 0; i < WDRR_BUCKET_CNT; i++) { |
642 | struct wdrr_bucket *bucket = q->buckets + i; |
643 | |
			INIT_LIST_HEAD(&bucket->bucketchain);
645 | } |
646 | } |
647 | |
648 | return 0; |
649 | } |
650 | |
651 | static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) |
652 | { |
653 | struct hhf_sched_data *q = qdisc_priv(sch); |
654 | struct nlattr *opts; |
655 | |
	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
657 | if (opts == NULL) |
658 | goto nla_put_failure; |
659 | |
	if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) ||
	    nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) ||
	    nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) ||
	    nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT,
			jiffies_to_usecs(q->hhf_reset_timeout)) ||
	    nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) ||
	    nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT,
			jiffies_to_usecs(q->hhf_evict_timeout)) ||
	    nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight))
669 | goto nla_put_failure; |
670 | |
	return nla_nest_end(skb, opts);
672 | |
673 | nla_put_failure: |
674 | return -1; |
675 | } |
676 | |
677 | static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) |
678 | { |
679 | struct hhf_sched_data *q = qdisc_priv(sch); |
680 | struct tc_hhf_xstats st = { |
681 | .drop_overlimit = q->drop_overlimit, |
682 | .hh_overlimit = q->hh_flows_overlimit, |
683 | .hh_tot_count = q->hh_flows_total_cnt, |
684 | .hh_cur_count = q->hh_flows_current_cnt, |
685 | }; |
686 | |
	return gnet_stats_copy_app(d, &st, sizeof(st));
688 | } |
689 | |
690 | static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { |
	.id		= "hhf",
692 | .priv_size = sizeof(struct hhf_sched_data), |
693 | |
694 | .enqueue = hhf_enqueue, |
695 | .dequeue = hhf_dequeue, |
696 | .peek = qdisc_peek_dequeued, |
697 | .init = hhf_init, |
698 | .reset = hhf_reset, |
699 | .destroy = hhf_destroy, |
700 | .change = hhf_change, |
701 | .dump = hhf_dump, |
702 | .dump_stats = hhf_dump_stats, |
703 | .owner = THIS_MODULE, |
704 | }; |
705 | |
706 | static int __init hhf_module_init(void) |
707 | { |
	return register_qdisc(&hhf_qdisc_ops);
709 | } |
710 | |
711 | static void __exit hhf_module_exit(void) |
712 | { |
	unregister_qdisc(&hhf_qdisc_ops);
714 | } |
715 | |
716 | module_init(hhf_module_init) |
717 | module_exit(hhf_module_exit) |
MODULE_AUTHOR("Terry Lam");
MODULE_AUTHOR("Nandita Dukkipati");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Heavy-Hitter Filter (HHF)");
722 | |