// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * inet fragments management
 *
 * Authors:     Pavel Emelyanov <xemul@openvz.org>
 *              Started as consolidation of ipv4/ip_fragment.c,
 *              ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/rhashtable.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>

/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
        union {
                struct inet_skb_parm h4;
                struct inet6_skb_parm h6;
        };
        struct sk_buff *next_frag;
        int frag_run_len;
};

#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
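
/* Illustration (not part of the original source): if fragments covering
 * [0, 1200), [1200, 2400) and [2400, 3600) arrive in order, they collapse
 * into a single rb-tree node (one "run"): the first skb is the run head
 * with frag_run_len == 3600, and the later skbs hang off it via next_frag.
 * Fragments that arrive out of order, or that leave a gap, start a new run
 * and hence a new tree node.
 */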

static void fragcb_clear(struct sk_buff *skb)
{
        RB_CLEAR_NODE(&skb->rbnode);
        FRAG_CB(skb)->next_frag = NULL;
        FRAG_CB(skb)->frag_run_len = skb->len;
}

/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
                                   struct sk_buff *skb)
{
        fragcb_clear(skb);

        FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
        FRAG_CB(q->fragments_tail)->next_frag = skb;
        q->fragments_tail = skb;
}

/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
        BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
        fragcb_clear(skb);

        if (q->last_run_head)
                rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
                             &q->last_run_head->rbnode.rb_right);
        else
                rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
        rb_insert_color(&skb->rbnode, &q->rb_fragments);

        q->fragments_tail = skb;
        q->last_run_head = skb;
}

/* Given the OR of the IPFRAG_ECN_* values of all fragments, apply the
 * RFC 3168 section 5.3 requirements.
 * Value: 0xff if the frame should be dropped;
 *        otherwise 0 or INET_ECN_CE, to be ORed into the final iph->tos field.
 */
const u8 ip_frag_ecn_table[16] = {
        /* at least one fragment had CE, and others ECT_0 or ECT_1 */
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
        [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,

        /* invalid combinations: drop frame */
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
        [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);
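
/* Usage sketch (illustrative, not part of this file): a reassembler keeps the
 * OR of the IPFRAG_ECN_* values seen so far (ip4_frag_ecn()/ip6_frag_ecn()
 * derive them from the ToS / Traffic Class field) and consults the table once
 * the datagram is complete, roughly:
 *
 *      u8 ecn = ip_frag_ecn_table[qp->ecn];
 *
 *      if (unlikely(ecn == 0xff))
 *              goto drop;              // invalid ECN combination
 *      iph->tos |= ecn;                // otherwise OR into the rebuilt header
 */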

int inet_frags_init(struct inet_frags *f)
{
        f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
                                            NULL);
        if (!f->frags_cachep)
                return -ENOMEM;

        refcount_set(&f->refcnt, 1);
        init_completion(&f->completion);
        return 0;
}
EXPORT_SYMBOL(inet_frags_init);

void inet_frags_fini(struct inet_frags *f)
{
        /* Drop the initial reference and wait until every fqdir created
         * against this inet_frags (each holds a reference, see fqdir_init())
         * has been freed by fqdir_free_fn().
         */
        if (refcount_dec_and_test(&f->refcnt))
                complete(&f->completion);

        wait_for_completion(&f->completion);

        kmem_cache_destroy(f->frags_cachep);
        f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);
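
/* Registration sketch (illustrative; the field values are roughly what ipv4's
 * ip_fragment.c does, not a verbatim copy): a reassembly protocol describes
 * itself with an inet_frags, registers it once at boot, and unregisters it
 * with inet_frags_fini() on teardown:
 *
 *      static struct inet_frags ip4_frags;
 *
 *      ip4_frags.constructor      = ip4_frag_init;
 *      ip4_frags.destructor       = ip4_frag_free;
 *      ip4_frags.qsize            = sizeof(struct ipq);
 *      ip4_frags.frag_expire      = ip_expire;
 *      ip4_frags.frags_cache_name = "ip4-frags";
 *      ip4_frags.rhash_params     = ip4_rhash_params;
 *      if (inet_frags_init(&ip4_frags))
 *              panic("IP: failed to allocate ip4_frags cache\n");
 */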

/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
static void inet_frags_free_cb(void *ptr, void *arg)
{
        struct inet_frag_queue *fq = ptr;
        int count;

        count = del_timer_sync(&fq->timer) ? 1 : 0;

        spin_lock_bh(&fq->lock);
        fq->flags |= INET_FRAG_DROP;
        if (!(fq->flags & INET_FRAG_COMPLETE)) {
                fq->flags |= INET_FRAG_COMPLETE;
                count++;
        } else if (fq->flags & INET_FRAG_HASH_DEAD) {
                count++;
        }
        spin_unlock_bh(&fq->lock);

        if (refcount_sub_and_test(count, &fq->refcnt))
                inet_frag_destroy(fq);
}

static LLIST_HEAD(fqdir_free_list);

static void fqdir_free_fn(struct work_struct *work)
{
        struct llist_node *kill_list;
        struct fqdir *fqdir, *tmp;
        struct inet_frags *f;

        /* Atomically snapshot the list of fqdirs to free */
        kill_list = llist_del_all(&fqdir_free_list);

        /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
         * have completed, since they need to dereference fqdir.
         * Would it not be nice to have kfree_rcu_barrier() ? :)
         */
        rcu_barrier();

        llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) {
                f = fqdir->f;
                if (refcount_dec_and_test(&f->refcnt))
                        complete(&f->completion);

                kfree(fqdir);
        }
}

static DECLARE_WORK(fqdir_free_work, fqdir_free_fn);

static void fqdir_work_fn(struct work_struct *work)
{
        struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);

        rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);

        if (llist_add(&fqdir->free_list, &fqdir_free_list))
                queue_work(system_wq, &fqdir_free_work);
}

int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
{
        struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
        int res;

        if (!fqdir)
                return -ENOMEM;
        fqdir->f = f;
        fqdir->net = net;
        res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
        if (res < 0) {
                kfree(fqdir);
                return res;
        }
        refcount_inc(&f->refcnt);
        *fqdirp = fqdir;
        return 0;
}
EXPORT_SYMBOL(fqdir_init);
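
/* Per-netns sketch (illustrative): each namespace gets its own fqdir from the
 * protocol's pernet init, and the thresholds/timeout live in that fqdir,
 * roughly:
 *
 *      res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net);
 *      if (res < 0)
 *              return res;
 *      net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024;
 *      net->ipv4.fqdir->low_thresh  = 3 * 1024 * 1024;
 *      net->ipv4.fqdir->timeout     = IP_FRAG_TIME;
 */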

static struct workqueue_struct *inet_frag_wq;

static int __init inet_frag_wq_init(void)
{
        inet_frag_wq = create_workqueue("inet_frag_wq");
        if (!inet_frag_wq)
                panic("Could not create inet frag workq");
        return 0;
}

pure_initcall(inet_frag_wq_init);

void fqdir_exit(struct fqdir *fqdir)
{
        INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
        queue_work(inet_frag_wq, &fqdir->destroy_work);
}
EXPORT_SYMBOL(fqdir_exit);
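
/* Teardown note: namespace dismantle is two-phased.  fqdir_pre_exit() (an
 * inline in include/net/inet_frag.h) first sets fqdir->high_thresh to 0 and
 * marks fqdir->dead, so no new queues are created and inet_frag_kill() stops
 * touching the hash table; only then is fqdir_exit() called, which defers the
 * rhashtable destruction and the final kfree() of the fqdir to workqueues
 * (see fqdir_work_fn() and fqdir_free_fn() above).
 */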

void inet_frag_kill(struct inet_frag_queue *fq)
{
        if (del_timer(&fq->timer))
                refcount_dec(&fq->refcnt);

        if (!(fq->flags & INET_FRAG_COMPLETE)) {
                struct fqdir *fqdir = fq->fqdir;

                fq->flags |= INET_FRAG_COMPLETE;
                rcu_read_lock();
                /* The RCU read lock provides a memory barrier
                 * guaranteeing that if fqdir->dead is false then
                 * the hash table destruction will not start until
                 * after we unlock. Paired with fqdir_pre_exit().
                 */
                if (!READ_ONCE(fqdir->dead)) {
                        rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
                                               fqdir->f->rhash_params);
                        refcount_dec(&fq->refcnt);
                } else {
                        fq->flags |= INET_FRAG_HASH_DEAD;
                }
                rcu_read_unlock();
        }
}
EXPORT_SYMBOL(inet_frag_kill);

static void inet_frag_destroy_rcu(struct rcu_head *head)
{
        struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
                                                 rcu);
        struct inet_frags *f = q->fqdir->f;

        if (f->destructor)
                f->destructor(q);
        kmem_cache_free(f->frags_cachep, q);
}

unsigned int inet_frag_rbtree_purge(struct rb_root *root,
                                    enum skb_drop_reason reason)
{
        struct rb_node *p = rb_first(root);
        unsigned int sum = 0;

        while (p) {
                struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

                p = rb_next(p);
                rb_erase(&skb->rbnode, root);
                while (skb) {
                        struct sk_buff *next = FRAG_CB(skb)->next_frag;

                        sum += skb->truesize;
                        kfree_skb_reason(skb, reason);
                        skb = next;
                }
        }
        return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);

void inet_frag_destroy(struct inet_frag_queue *q)
{
        unsigned int sum, sum_truesize = 0;
        enum skb_drop_reason reason;
        struct inet_frags *f;
        struct fqdir *fqdir;

        WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
        reason = (q->flags & INET_FRAG_DROP) ?
                        SKB_DROP_REASON_FRAG_REASM_TIMEOUT :
                        SKB_CONSUMED;
        WARN_ON(del_timer(&q->timer) != 0);

        /* Release all fragment data. */
        fqdir = q->fqdir;
        f = fqdir->f;
        sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason);
        sum = sum_truesize + f->qsize;

        call_rcu(&q->rcu, inet_frag_destroy_rcu);

        sub_frag_mem_limit(fqdir, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);

static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
                                               struct inet_frags *f,
                                               void *arg)
{
        struct inet_frag_queue *q;

        q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
        if (!q)
                return NULL;

        q->fqdir = fqdir;
        f->constructor(q, arg);
        add_frag_mem_limit(fqdir, f->qsize);

        timer_setup(&q->timer, f->frag_expire, 0);
        spin_lock_init(&q->lock);
        /* One reference for the expiration timer, one for the hash table,
         * one for the caller of inet_frag_create().
         */
        refcount_set(&q->refcnt, 3);

        return q;
}

static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
                                                void *arg,
                                                struct inet_frag_queue **prev)
{
        struct inet_frags *f = fqdir->f;
        struct inet_frag_queue *q;

        q = inet_frag_alloc(fqdir, f, arg);
        if (!q) {
                *prev = ERR_PTR(-ENOMEM);
                return NULL;
        }
        mod_timer(&q->timer, jiffies + fqdir->timeout);

        *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
                                                 &q->node, f->rhash_params);
        if (*prev) {
                q->flags |= INET_FRAG_COMPLETE;
                inet_frag_kill(q);
                inet_frag_destroy(q);
                return NULL;
        }
        return q;
}

/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
        /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
        long high_thresh = READ_ONCE(fqdir->high_thresh);
        struct inet_frag_queue *fq = NULL, *prev;

        if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
                return NULL;

        rcu_read_lock();

        prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
        if (!prev)
                fq = inet_frag_create(fqdir, key, &prev);
        if (!IS_ERR_OR_NULL(prev)) {
                fq = prev;
                if (!refcount_inc_not_zero(&fq->refcnt))
                        fq = NULL;
        }
        rcu_read_unlock();
        return fq;
}
EXPORT_SYMBOL(inet_frag_find);
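
/* Caller pattern (illustrative sketch, not lifted from a specific protocol):
 * the queue returned by inet_frag_find() is reference-counted, so a typical
 * defrag path looks roughly like:
 *
 *      q = inet_frag_find(net->ipv4.fqdir, &key);
 *      if (!q)
 *              return -ENOMEM;
 *      spin_lock(&q->lock);
 *      // ... inet_frag_queue_insert() / reassembly work ...
 *      spin_unlock(&q->lock);
 *      inet_frag_put(q);       // drops the reference taken by the lookup
 */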

int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
                           int offset, int end)
{
        struct sk_buff *last = q->fragments_tail;

        /* RFC5722, Section 4, amended by Errata ID : 3089
         * When reassembling an IPv6 datagram, if
         * one or more of its constituent fragments is determined to be an
         * overlapping fragment, the entire datagram (and any constituent
         * fragments) MUST be silently discarded.
         *
         * Duplicates, however, should be ignored (i.e. skb dropped, but the
         * queue/fragments kept for later reassembly).
         */
        if (!last)
                fragrun_create(q, skb);  /* First fragment. */
        else if (last->ip_defrag_offset + last->len < end) {
                /* This is the common case: skb goes to the end. */
                /* Detect and discard overlaps. */
                if (offset < last->ip_defrag_offset + last->len)
                        return IPFRAG_OVERLAP;
                if (offset == last->ip_defrag_offset + last->len)
                        fragrun_append_to_last(q, skb);
                else
                        fragrun_create(q, skb);
        } else {
                /* Binary search. Note that skb can become the first fragment,
                 * but not the last (covered above).
                 */
                struct rb_node **rbn, *parent;

                rbn = &q->rb_fragments.rb_node;
                do {
                        struct sk_buff *curr;
                        int curr_run_end;

                        parent = *rbn;
                        curr = rb_to_skb(parent);
                        curr_run_end = curr->ip_defrag_offset +
                                       FRAG_CB(curr)->frag_run_len;
                        if (end <= curr->ip_defrag_offset)
                                rbn = &parent->rb_left;
                        else if (offset >= curr_run_end)
                                rbn = &parent->rb_right;
                        else if (offset >= curr->ip_defrag_offset &&
                                 end <= curr_run_end)
                                return IPFRAG_DUP;
                        else
                                return IPFRAG_OVERLAP;
                } while (*rbn);
                /* Here we have parent properly set, and rbn pointing to
                 * one of its NULL left/right children. Insert skb.
                 */
                fragcb_clear(skb);
                rb_link_node(&skb->rbnode, parent, rbn);
                rb_insert_color(&skb->rbnode, &q->rb_fragments);
        }

        skb->ip_defrag_offset = offset;

        return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);
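
/* Worked example (illustrative): suppose the queue already holds two runs
 * covering [0, 1200) and [2400, 3600).  Then:
 *  - a fragment covering [1200, 2400) does not extend past the tail run, so
 *    it goes through the binary search and is linked as a new node: IPFRAG_OK;
 *  - a second copy of [0, 1200) lands entirely inside an existing run and is
 *    reported as IPFRAG_DUP (caller drops the skb, the queue is kept);
 *  - a fragment covering [1100, 2300) partially overlaps [0, 1200) and is
 *    reported as IPFRAG_OVERLAP (caller discards the whole queue per RFC 5722).
 */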

void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
                              struct sk_buff *parent)
{
        struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
        struct sk_buff **nextp;
        int delta;

        if (head != skb) {
                fp = skb_clone(skb, GFP_ATOMIC);
                if (!fp)
                        return NULL;
                FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
                if (RB_EMPTY_NODE(&skb->rbnode))
                        FRAG_CB(parent)->next_frag = fp;
                else
                        rb_replace_node(&skb->rbnode, &fp->rbnode,
                                        &q->rb_fragments);
                if (q->fragments_tail == skb)
                        q->fragments_tail = fp;
                skb_morph(skb, head);
                FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
                rb_replace_node(&head->rbnode, &skb->rbnode,
                                &q->rb_fragments);
                consume_skb(head);
                head = skb;
        }
        WARN_ON(head->ip_defrag_offset != 0);

        delta = -head->truesize;

        /* Head of list must not be cloned. */
        if (skb_unclone(head, GFP_ATOMIC))
                return NULL;

        delta += head->truesize;
        if (delta)
                add_frag_mem_limit(q->fqdir, delta);

        /* If the first fragment is fragmented itself, we split
         * it to two chunks: the first with data and paged part
         * and the second, holding only fragments.
         */
        if (skb_has_frag_list(head)) {
                struct sk_buff *clone;
                int i, plen = 0;

                clone = alloc_skb(0, GFP_ATOMIC);
                if (!clone)
                        return NULL;
                skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
                skb_frag_list_init(head);
                for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
                        plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
                clone->data_len = head->data_len - plen;
                clone->len = clone->data_len;
                head->truesize += clone->truesize;
                clone->csum = 0;
                clone->ip_summed = head->ip_summed;
                add_frag_mem_limit(q->fqdir, clone->truesize);
                skb_shinfo(head)->frag_list = clone;
                nextp = &clone->next;
        } else {
                nextp = &skb_shinfo(head)->frag_list;
        }

        return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);
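
/* Note: the void * returned above is really a struct sk_buff ** pointing at
 * the location where the reassembled datagram's frag_list chain continues
 * (either &skb_shinfo(head)->frag_list, or &clone->next when the head had to
 * be split).  It is meant to be handed straight to inet_frag_reasm_finish()
 * as its reasm_data argument; on NULL the caller must abort the reassembly.
 */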

void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
                            void *reasm_data, bool try_coalesce)
{
        struct sk_buff **nextp = reasm_data;
        struct rb_node *rbn;
        struct sk_buff *fp;
        int sum_truesize;

        skb_push(head, head->data - skb_network_header(head));

        /* Traverse the tree in order, to build frag_list. */
        fp = FRAG_CB(head)->next_frag;
        rbn = rb_next(&head->rbnode);
        rb_erase(&head->rbnode, &q->rb_fragments);

        sum_truesize = head->truesize;
        while (rbn || fp) {
                /* fp points to the next sk_buff in the current run;
                 * rbn points to the next run.
                 */
                /* Go through the current run. */
                while (fp) {
                        struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
                        bool stolen;
                        int delta;

                        sum_truesize += fp->truesize;
                        if (head->ip_summed != fp->ip_summed)
                                head->ip_summed = CHECKSUM_NONE;
                        else if (head->ip_summed == CHECKSUM_COMPLETE)
                                head->csum = csum_add(head->csum, fp->csum);

                        if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
                                                             &delta)) {
                                kfree_skb_partial(fp, stolen);
                        } else {
                                fp->prev = NULL;
                                memset(&fp->rbnode, 0, sizeof(fp->rbnode));
                                fp->sk = NULL;

                                head->data_len += fp->len;
                                head->len += fp->len;
                                head->truesize += fp->truesize;

                                *nextp = fp;
                                nextp = &fp->next;
                        }

                        fp = next_frag;
                }
                /* Move to the next run. */
                if (rbn) {
                        struct rb_node *rbnext = rb_next(rbn);

                        fp = rb_to_skb(rbn);
                        rb_erase(rbn, &q->rb_fragments);
                        rbn = rbnext;
                }
        }
        sub_frag_mem_limit(q->fqdir, sum_truesize);

        *nextp = NULL;
        skb_mark_not_on_list(head);
        head->prev = NULL;
        head->tstamp = q->stamp;
        head->mono_delivery_time = q->mono_delivery_time;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);
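
/* End-to-end sketch (illustrative; error handling and protocol specifics
 * omitted): the three helpers above are used together roughly like this,
 * with prev_tail being q->fragments_tail saved before the insert:
 *
 *      err = inet_frag_queue_insert(q, skb, offset, end);
 *      if (err)
 *              goto discard;           // IPFRAG_DUP or IPFRAG_OVERLAP
 *      // ... once all bytes of the datagram have arrived ...
 *      reasm_data = inet_frag_reasm_prepare(q, skb, prev_tail);
 *      if (!reasm_data)
 *              goto out_oom;
 *      inet_frag_reasm_finish(q, skb, reasm_data, true);
 *      // skb is now the full datagram; fix up the protocol header and deliver
 */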

struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
        struct sk_buff *head, *skb;

        head = skb_rb_first(&q->rb_fragments);
        if (!head)
                return NULL;
        skb = FRAG_CB(head)->next_frag;
        if (skb)
                rb_replace_node(&head->rbnode, &skb->rbnode,
                                &q->rb_fragments);
        else
                rb_erase(&head->rbnode, &q->rb_fragments);
        memset(&head->rbnode, 0, sizeof(head->rbnode));
        barrier();

        if (head == q->fragments_tail)
                q->fragments_tail = NULL;

        sub_frag_mem_limit(q->fqdir, head->truesize);

        return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);
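
/* inet_frag_pull_head() serves the expiration path: the timer handler
 * detaches the first fragment from the dying queue so an ICMP "fragment
 * reassembly time exceeded" message can be built from it (roughly what
 * ipv4's ip_expire() does with the skb it gets back).
 */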

source code of linux/net/ipv4/inet_fragment.c