1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Routines having to do with the 'struct sk_buff' memory handlers. |
4 | * |
5 | * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> |
6 | * Florian La Roche <rzsfl@rz.uni-sb.de> |
7 | * |
8 | * Fixes: |
9 | * Alan Cox : Fixed the worst of the load |
10 | * balancer bugs. |
11 | * Dave Platt : Interrupt stacking fix. |
12 | * Richard Kooijman : Timestamp fixes. |
13 | * Alan Cox : Changed buffer format. |
14 | * Alan Cox : destructor hook for AF_UNIX etc. |
15 | * Linus Torvalds : Better skb_clone. |
16 | * Alan Cox : Added skb_copy. |
17 | * Alan Cox : Added all the changed routines Linus |
18 | * only put in the headers |
19 | * Ray VanTassle : Fixed --skb->lock in free |
20 | * Alan Cox : skb_copy copy arp field |
21 | * Andi Kleen : slabified it. |
22 | * Robert Olsson : Removed skb_head_pool |
23 | * |
24 | * NOTE: |
25 | * The __skb_ routines should be called with interrupts |
26 | * disabled, or you better be *real* sure that the operation is atomic |
27 | * with respect to whatever list is being frobbed (e.g. via lock_sock() |
28 | * or via disabling bottom half handlers, etc). |
29 | */ |
30 | |
31 | /* |
32 | * The functions in this file will not compile correctly with gcc 2.4.x |
33 | */ |
34 | |
35 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
36 | |
37 | #include <linux/module.h> |
38 | #include <linux/types.h> |
39 | #include <linux/kernel.h> |
40 | #include <linux/mm.h> |
41 | #include <linux/interrupt.h> |
42 | #include <linux/in.h> |
43 | #include <linux/inet.h> |
44 | #include <linux/slab.h> |
45 | #include <linux/tcp.h> |
46 | #include <linux/udp.h> |
47 | #include <linux/sctp.h> |
48 | #include <linux/netdevice.h> |
49 | #ifdef CONFIG_NET_CLS_ACT |
50 | #include <net/pkt_sched.h> |
51 | #endif |
52 | #include <linux/string.h> |
53 | #include <linux/skbuff.h> |
54 | #include <linux/splice.h> |
55 | #include <linux/cache.h> |
56 | #include <linux/rtnetlink.h> |
57 | #include <linux/init.h> |
58 | #include <linux/scatterlist.h> |
59 | #include <linux/errqueue.h> |
60 | #include <linux/prefetch.h> |
61 | #include <linux/bitfield.h> |
62 | #include <linux/if_vlan.h> |
63 | #include <linux/mpls.h> |
64 | #include <linux/kcov.h> |
65 | #include <linux/iov_iter.h> |
66 | |
67 | #include <net/protocol.h> |
68 | #include <net/dst.h> |
69 | #include <net/sock.h> |
70 | #include <net/checksum.h> |
71 | #include <net/gso.h> |
72 | #include <net/ip6_checksum.h> |
73 | #include <net/xfrm.h> |
74 | #include <net/mpls.h> |
75 | #include <net/mptcp.h> |
76 | #include <net/mctp.h> |
77 | #include <net/page_pool/helpers.h> |
78 | #include <net/dropreason.h> |
79 | |
80 | #include <linux/uaccess.h> |
81 | #include <trace/events/skb.h> |
82 | #include <linux/highmem.h> |
83 | #include <linux/capability.h> |
84 | #include <linux/user_namespace.h> |
85 | #include <linux/indirect_call_wrapper.h> |
86 | #include <linux/textsearch.h> |
87 | |
88 | #include "dev.h" |
89 | #include "sock_destructor.h" |
90 | |
91 | struct kmem_cache *skbuff_cache __ro_after_init; |
92 | static struct kmem_cache *skbuff_fclone_cache __ro_after_init; |
93 | #ifdef CONFIG_SKB_EXTENSIONS |
94 | static struct kmem_cache *skbuff_ext_cache __ro_after_init; |
95 | #endif |
96 | |
97 | |
98 | static struct kmem_cache *skb_small_head_cache __ro_after_init; |
99 | |
100 | #define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) |
101 | |
102 | /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. |
103 | * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique |
104 | * size, and we can differentiate heads from skb_small_head_cache |
105 | * vs system slabs by looking at their size (skb_end_offset()). |
106 | */ |
107 | #define SKB_SMALL_HEAD_CACHE_SIZE \ |
108 | (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \ |
109 | (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \ |
110 | SKB_SMALL_HEAD_SIZE) |
111 | |
112 | #define SKB_SMALL_HEAD_HEADROOM \ |
113 | SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) |
114 | |
115 | int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; |
116 | EXPORT_SYMBOL(sysctl_max_skb_frags); |
117 | |
118 | #undef FN |
119 | #define FN(reason) [SKB_DROP_REASON_##reason] = #reason, |
120 | static const char * const drop_reasons[] = { |
121 | [SKB_CONSUMED] = "CONSUMED" , |
122 | DEFINE_DROP_REASON(FN, FN) |
123 | }; |
124 | |
125 | static const struct drop_reason_list drop_reasons_core = { |
126 | .reasons = drop_reasons, |
127 | .n_reasons = ARRAY_SIZE(drop_reasons), |
128 | }; |
129 | |
130 | const struct drop_reason_list __rcu * |
131 | drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = { |
132 | [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core), |
133 | }; |
134 | EXPORT_SYMBOL(drop_reasons_by_subsys); |
135 | |
136 | /** |
137 | * drop_reasons_register_subsys - register another drop reason subsystem |
138 | * @subsys: the subsystem to register, must not be the core |
139 | * @list: the list of drop reasons within the subsystem, must point to |
140 | * a statically initialized list |
141 | */ |
142 | void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys, |
143 | const struct drop_reason_list *list) |
144 | { |
145 | if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || |
146 | subsys >= ARRAY_SIZE(drop_reasons_by_subsys), |
147 | "invalid subsystem %d\n" , subsys)) |
148 | return; |
149 | |
150 | /* must point to statically allocated memory, so INIT is OK */ |
151 | RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list); |
152 | } |
153 | EXPORT_SYMBOL_GPL(drop_reasons_register_subsys); |
154 | |
155 | /** |
156 | * drop_reasons_unregister_subsys - unregister a drop reason subsystem |
157 | * @subsys: the subsystem to remove, must not be the core |
158 | * |
159 | * Note: This will synchronize_rcu() to ensure no users when it returns. |
160 | */ |
161 | void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys) |
162 | { |
163 | if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || |
164 | subsys >= ARRAY_SIZE(drop_reasons_by_subsys), |
165 | "invalid subsystem %d\n" , subsys)) |
166 | return; |
167 | |
168 | RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL); |
169 | |
170 | synchronize_rcu(); |
171 | } |
172 | EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys); |
173 | |
174 | /** |
175 | * skb_panic - private function for out-of-line support |
176 | * @skb: buffer |
177 | * @sz: size |
178 | * @addr: address |
179 | * @msg: skb_over_panic or skb_under_panic |
180 | * |
181 | * Out-of-line support for skb_put() and skb_push(). |
182 | * Called via the wrapper skb_over_panic() or skb_under_panic(). |
183 | * Keep out of line to prevent kernel bloat. |
184 | * __builtin_return_address is not used because it is not always reliable. |
185 | */ |
186 | static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, |
187 | const char msg[]) |
188 | { |
189 | pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n" , |
190 | msg, addr, skb->len, sz, skb->head, skb->data, |
191 | (unsigned long)skb->tail, (unsigned long)skb->end, |
192 | skb->dev ? skb->dev->name : "<NULL>" ); |
193 | BUG(); |
194 | } |
195 | |
/* Report a buffer overrun via skb_panic(). __func__ is evaluated here so
 * the message is tagged with this wrapper's name.
 */
static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}
200 | |
/* Report a buffer underrun via skb_panic(). __func__ is evaluated here so
 * the message is tagged with this wrapper's name.
 */
static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}
205 | |
/* Capacity of the per-CPU skb_cache[] array in struct napi_alloc_cache */
#define NAPI_SKB_CACHE_SIZE	64
/* Number of heads requested from the slab in one bulk refill */
#define NAPI_SKB_CACHE_BULK	16
#define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)
209 | |
210 | #if PAGE_SIZE == SZ_4K |
211 | |
212 | #define NAPI_HAS_SMALL_PAGE_FRAG 1 |
213 | #define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) |
214 | |
215 | /* specialized page frag allocator using a single order 0 page |
216 | * and slicing it into 1K sized fragment. Constrained to systems |
217 | * with a very limited amount of 1K fragments fitting a single |
218 | * page - to avoid excessive truesize underestimation |
219 | */ |
220 | |
221 | struct page_frag_1k { |
222 | void *va; |
223 | u16 offset; |
224 | bool pfmemalloc; |
225 | }; |
226 | |
227 | static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) |
228 | { |
229 | struct page *page; |
230 | int offset; |
231 | |
232 | offset = nc->offset - SZ_1K; |
233 | if (likely(offset >= 0)) |
234 | goto use_frag; |
235 | |
236 | page = alloc_pages_node(NUMA_NO_NODE, gfp_mask: gfp, order: 0); |
237 | if (!page) |
238 | return NULL; |
239 | |
240 | nc->va = page_address(page); |
241 | nc->pfmemalloc = page_is_pfmemalloc(page); |
242 | offset = PAGE_SIZE - SZ_1K; |
243 | page_ref_add(page, nr: offset / SZ_1K); |
244 | |
245 | use_frag: |
246 | nc->offset = offset; |
247 | return nc->va + offset; |
248 | } |
249 | #else |
250 | |
251 | /* the small page is actually unused in this build; add dummy helpers |
252 | * to please the compiler and avoid later preprocessor's conditionals |
253 | */ |
254 | #define NAPI_HAS_SMALL_PAGE_FRAG 0 |
255 | #define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false |
256 | |
257 | struct page_frag_1k { |
258 | }; |
259 | |
260 | static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) |
261 | { |
262 | return NULL; |
263 | } |
264 | |
265 | #endif |
266 | |
267 | struct napi_alloc_cache { |
268 | struct page_frag_cache page; |
269 | struct page_frag_1k page_small; |
270 | unsigned int skb_count; |
271 | void *skb_cache[NAPI_SKB_CACHE_SIZE]; |
272 | }; |
273 | |
274 | static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); |
275 | static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); |
276 | |
277 | /* Double check that napi_get_frags() allocates skbs with |
278 | * skb->head being backed by slab, not a page fragment. |
279 | * This is to make sure bug fixed in 3226b158e67c |
280 | * ("net: avoid 32 x truesize under-estimation for tiny skbs") |
281 | * does not accidentally come back. |
282 | */ |
283 | void napi_get_frags_check(struct napi_struct *napi) |
284 | { |
285 | struct sk_buff *skb; |
286 | |
287 | local_bh_disable(); |
288 | skb = napi_get_frags(napi); |
289 | WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); |
290 | napi_free_frags(napi); |
291 | local_bh_enable(); |
292 | } |
293 | |
294 | void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) |
295 | { |
296 | struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); |
297 | |
298 | fragsz = SKB_DATA_ALIGN(fragsz); |
299 | |
300 | return page_frag_alloc_align(nc: &nc->page, fragsz, GFP_ATOMIC, align_mask); |
301 | } |
302 | EXPORT_SYMBOL(__napi_alloc_frag_align); |
303 | |
304 | void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) |
305 | { |
306 | void *data; |
307 | |
308 | fragsz = SKB_DATA_ALIGN(fragsz); |
309 | if (in_hardirq() || irqs_disabled()) { |
310 | struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); |
311 | |
312 | data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); |
313 | } else { |
314 | struct napi_alloc_cache *nc; |
315 | |
316 | local_bh_disable(); |
317 | nc = this_cpu_ptr(&napi_alloc_cache); |
318 | data = page_frag_alloc_align(nc: &nc->page, fragsz, GFP_ATOMIC, align_mask); |
319 | local_bh_enable(); |
320 | } |
321 | return data; |
322 | } |
323 | EXPORT_SYMBOL(__netdev_alloc_frag_align); |
324 | |
325 | static struct sk_buff *napi_skb_cache_get(void) |
326 | { |
327 | struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); |
328 | struct sk_buff *skb; |
329 | |
330 | if (unlikely(!nc->skb_count)) { |
331 | nc->skb_count = kmem_cache_alloc_bulk(s: skbuff_cache, |
332 | GFP_ATOMIC, |
333 | NAPI_SKB_CACHE_BULK, |
334 | p: nc->skb_cache); |
335 | if (unlikely(!nc->skb_count)) |
336 | return NULL; |
337 | } |
338 | |
339 | skb = nc->skb_cache[--nc->skb_count]; |
340 | kasan_unpoison_object_data(cache: skbuff_cache, object: skb); |
341 | |
342 | return skb; |
343 | } |
344 | |
345 | static inline void __finalize_skb_around(struct sk_buff *skb, void *data, |
346 | unsigned int size) |
347 | { |
348 | struct skb_shared_info *shinfo; |
349 | |
350 | size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); |
351 | |
352 | /* Assumes caller memset cleared SKB */ |
353 | skb->truesize = SKB_TRUESIZE(size); |
354 | refcount_set(r: &skb->users, n: 1); |
355 | skb->head = data; |
356 | skb->data = data; |
357 | skb_reset_tail_pointer(skb); |
358 | skb_set_end_offset(skb, offset: size); |
359 | skb->mac_header = (typeof(skb->mac_header))~0U; |
360 | skb->transport_header = (typeof(skb->transport_header))~0U; |
361 | skb->alloc_cpu = raw_smp_processor_id(); |
362 | /* make sure we initialize shinfo sequentially */ |
363 | shinfo = skb_shinfo(skb); |
364 | memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); |
365 | atomic_set(v: &shinfo->dataref, i: 1); |
366 | |
367 | skb_set_kcov_handle(skb, kcov_handle: kcov_common_handle()); |
368 | } |
369 | |
370 | static inline void *__slab_build_skb(struct sk_buff *skb, void *data, |
371 | unsigned int *size) |
372 | { |
373 | void *resized; |
374 | |
375 | /* Must find the allocation size (and grow it to match). */ |
376 | *size = ksize(objp: data); |
377 | /* krealloc() will immediately return "data" when |
378 | * "ksize(data)" is requested: it is the existing upper |
379 | * bounds. As a result, GFP_ATOMIC will be ignored. Note |
380 | * that this "new" pointer needs to be passed back to the |
381 | * caller for use so the __alloc_size hinting will be |
382 | * tracked correctly. |
383 | */ |
384 | resized = krealloc(objp: data, new_size: *size, GFP_ATOMIC); |
385 | WARN_ON_ONCE(resized != data); |
386 | return resized; |
387 | } |
388 | |
389 | /* build_skb() variant which can operate on slab buffers. |
390 | * Note that this should be used sparingly as slab buffers |
391 | * cannot be combined efficiently by GRO! |
392 | */ |
393 | struct sk_buff *slab_build_skb(void *data) |
394 | { |
395 | struct sk_buff *skb; |
396 | unsigned int size; |
397 | |
398 | skb = kmem_cache_alloc(cachep: skbuff_cache, GFP_ATOMIC); |
399 | if (unlikely(!skb)) |
400 | return NULL; |
401 | |
402 | memset(skb, 0, offsetof(struct sk_buff, tail)); |
403 | data = __slab_build_skb(skb, data, size: &size); |
404 | __finalize_skb_around(skb, data, size); |
405 | |
406 | return skb; |
407 | } |
408 | EXPORT_SYMBOL(slab_build_skb); |
409 | |
/* Initialise @skb around the buffer @data of @frag_size bytes.
 * Caller must provide SKB that is memset cleared.
 */
static void __build_skb_around(struct sk_buff *skb, void *data,
			       unsigned int frag_size)
{
	unsigned int size = frag_size;

	/* frag_size == 0 is considered deprecated now. Callers
	 * using slab buffer should use slab_build_skb() instead.
	 * It still works: warn once and take the size from the slab.
	 */
	if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
		data = __slab_build_skb(skb, data, &size);

	__finalize_skb_around(skb, data, size);
}
424 | |
425 | /** |
426 | * __build_skb - build a network buffer |
427 | * @data: data buffer provided by caller |
428 | * @frag_size: size of data (must not be 0) |
429 | * |
430 | * Allocate a new &sk_buff. Caller provides space holding head and |
431 | * skb_shared_info. @data must have been allocated from the page |
432 | * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc() |
433 | * allocation is deprecated, and callers should use slab_build_skb() |
434 | * instead.) |
435 | * The return is the new skb buffer. |
436 | * On a failure the return is %NULL, and @data is not freed. |
437 | * Notes : |
438 | * Before IO, driver allocates only data buffer where NIC put incoming frame |
439 | * Driver should add room at head (NET_SKB_PAD) and |
440 | * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) |
441 | * After IO, driver calls build_skb(), to allocate sk_buff and populate it |
442 | * before giving packet to stack. |
443 | * RX rings only contains data buffers, not full skbs. |
444 | */ |
445 | struct sk_buff *__build_skb(void *data, unsigned int frag_size) |
446 | { |
447 | struct sk_buff *skb; |
448 | |
449 | skb = kmem_cache_alloc(cachep: skbuff_cache, GFP_ATOMIC); |
450 | if (unlikely(!skb)) |
451 | return NULL; |
452 | |
453 | memset(skb, 0, offsetof(struct sk_buff, tail)); |
454 | __build_skb_around(skb, data, frag_size); |
455 | |
456 | return skb; |
457 | } |
458 | |
459 | /* build_skb() is wrapper over __build_skb(), that specifically |
460 | * takes care of skb->head and skb->pfmemalloc |
461 | */ |
462 | struct sk_buff *build_skb(void *data, unsigned int frag_size) |
463 | { |
464 | struct sk_buff *skb = __build_skb(data, frag_size); |
465 | |
466 | if (likely(skb && frag_size)) { |
467 | skb->head_frag = 1; |
468 | skb_propagate_pfmemalloc(page: virt_to_head_page(x: data), skb); |
469 | } |
470 | return skb; |
471 | } |
472 | EXPORT_SYMBOL(build_skb); |
473 | |
474 | /** |
475 | * build_skb_around - build a network buffer around provided skb |
476 | * @skb: sk_buff provide by caller, must be memset cleared |
477 | * @data: data buffer provided by caller |
478 | * @frag_size: size of data |
479 | */ |
480 | struct sk_buff *build_skb_around(struct sk_buff *skb, |
481 | void *data, unsigned int frag_size) |
482 | { |
483 | if (unlikely(!skb)) |
484 | return NULL; |
485 | |
486 | __build_skb_around(skb, data, frag_size); |
487 | |
488 | if (frag_size) { |
489 | skb->head_frag = 1; |
490 | skb_propagate_pfmemalloc(page: virt_to_head_page(x: data), skb); |
491 | } |
492 | return skb; |
493 | } |
494 | EXPORT_SYMBOL(build_skb_around); |
495 | |
496 | /** |
497 | * __napi_build_skb - build a network buffer |
498 | * @data: data buffer provided by caller |
499 | * @frag_size: size of data |
500 | * |
501 | * Version of __build_skb() that uses NAPI percpu caches to obtain |
502 | * skbuff_head instead of inplace allocation. |
503 | * |
504 | * Returns a new &sk_buff on success, %NULL on allocation failure. |
505 | */ |
506 | static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) |
507 | { |
508 | struct sk_buff *skb; |
509 | |
510 | skb = napi_skb_cache_get(); |
511 | if (unlikely(!skb)) |
512 | return NULL; |
513 | |
514 | memset(skb, 0, offsetof(struct sk_buff, tail)); |
515 | __build_skb_around(skb, data, frag_size); |
516 | |
517 | return skb; |
518 | } |
519 | |
520 | /** |
521 | * napi_build_skb - build a network buffer |
522 | * @data: data buffer provided by caller |
523 | * @frag_size: size of data |
524 | * |
525 | * Version of __napi_build_skb() that takes care of skb->head_frag |
526 | * and skb->pfmemalloc when the data is a page or page fragment. |
527 | * |
528 | * Returns a new &sk_buff on success, %NULL on allocation failure. |
529 | */ |
530 | struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) |
531 | { |
532 | struct sk_buff *skb = __napi_build_skb(data, frag_size); |
533 | |
534 | if (likely(skb) && frag_size) { |
535 | skb->head_frag = 1; |
536 | skb_propagate_pfmemalloc(page: virt_to_head_page(x: data), skb); |
537 | } |
538 | |
539 | return skb; |
540 | } |
541 | EXPORT_SYMBOL(napi_build_skb); |
542 | |
543 | /* |
544 | * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells |
545 | * the caller if emergency pfmemalloc reserves are being used. If it is and |
546 | * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves |
547 | * may be used. Otherwise, the packet data may be discarded until enough |
548 | * memory is free |
549 | */ |
550 | static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, |
551 | bool *pfmemalloc) |
552 | { |
553 | bool ret_pfmemalloc = false; |
554 | size_t obj_size; |
555 | void *obj; |
556 | |
557 | obj_size = SKB_HEAD_ALIGN(*size); |
558 | if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && |
559 | !(flags & KMALLOC_NOT_NORMAL_BITS)) { |
560 | obj = kmem_cache_alloc_node(s: skb_small_head_cache, |
561 | flags: flags | __GFP_NOMEMALLOC | __GFP_NOWARN, |
562 | node); |
563 | *size = SKB_SMALL_HEAD_CACHE_SIZE; |
564 | if (obj || !(gfp_pfmemalloc_allowed(gfp_mask: flags))) |
565 | goto out; |
566 | /* Try again but now we are using pfmemalloc reserves */ |
567 | ret_pfmemalloc = true; |
568 | obj = kmem_cache_alloc_node(s: skb_small_head_cache, flags, node); |
569 | goto out; |
570 | } |
571 | |
572 | obj_size = kmalloc_size_roundup(size: obj_size); |
573 | /* The following cast might truncate high-order bits of obj_size, this |
574 | * is harmless because kmalloc(obj_size >= 2^32) will fail anyway. |
575 | */ |
576 | *size = (unsigned int)obj_size; |
577 | |
578 | /* |
579 | * Try a regular allocation, when that fails and we're not entitled |
580 | * to the reserves, fail. |
581 | */ |
582 | obj = kmalloc_node_track_caller(obj_size, |
583 | flags | __GFP_NOMEMALLOC | __GFP_NOWARN, |
584 | node); |
585 | if (obj || !(gfp_pfmemalloc_allowed(gfp_mask: flags))) |
586 | goto out; |
587 | |
588 | /* Try again but now we are using pfmemalloc reserves */ |
589 | ret_pfmemalloc = true; |
590 | obj = kmalloc_node_track_caller(obj_size, flags, node); |
591 | |
592 | out: |
593 | if (pfmemalloc) |
594 | *pfmemalloc = ret_pfmemalloc; |
595 | |
596 | return obj; |
597 | } |
598 | |
599 | /* Allocate a new skbuff. We do this ourselves so we can fill in a few |
600 | * 'private' fields and also do memory statistics to find all the |
601 | * [BEEP] leaks. |
602 | * |
603 | */ |
604 | |
605 | /** |
606 | * __alloc_skb - allocate a network buffer |
607 | * @size: size to allocate |
608 | * @gfp_mask: allocation mask |
609 | * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache |
610 | * instead of head cache and allocate a cloned (child) skb. |
611 | * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for |
612 | * allocations in case the data is required for writeback |
613 | * @node: numa node to allocate memory on |
614 | * |
615 | * Allocate a new &sk_buff. The returned buffer has no headroom and a |
616 | * tail room of at least size bytes. The object has a reference count |
617 | * of one. The return is the buffer. On a failure the return is %NULL. |
618 | * |
619 | * Buffers may only be allocated from interrupts using a @gfp_mask of |
620 | * %GFP_ATOMIC. |
621 | */ |
622 | struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, |
623 | int flags, int node) |
624 | { |
625 | struct kmem_cache *cache; |
626 | struct sk_buff *skb; |
627 | bool pfmemalloc; |
628 | u8 *data; |
629 | |
630 | cache = (flags & SKB_ALLOC_FCLONE) |
631 | ? skbuff_fclone_cache : skbuff_cache; |
632 | |
633 | if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) |
634 | gfp_mask |= __GFP_MEMALLOC; |
635 | |
636 | /* Get the HEAD */ |
637 | if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && |
638 | likely(node == NUMA_NO_NODE || node == numa_mem_id())) |
639 | skb = napi_skb_cache_get(); |
640 | else |
641 | skb = kmem_cache_alloc_node(s: cache, flags: gfp_mask & ~GFP_DMA, node); |
642 | if (unlikely(!skb)) |
643 | return NULL; |
644 | prefetchw(x: skb); |
645 | |
646 | /* We do our best to align skb_shared_info on a separate cache |
647 | * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives |
648 | * aligned memory blocks, unless SLUB/SLAB debug is enabled. |
649 | * Both skb->head and skb_shared_info are cache line aligned. |
650 | */ |
651 | data = kmalloc_reserve(size: &size, flags: gfp_mask, node, pfmemalloc: &pfmemalloc); |
652 | if (unlikely(!data)) |
653 | goto nodata; |
654 | /* kmalloc_size_roundup() might give us more room than requested. |
655 | * Put skb_shared_info exactly at the end of allocated zone, |
656 | * to allow max possible filling before reallocation. |
657 | */ |
658 | prefetchw(x: data + SKB_WITH_OVERHEAD(size)); |
659 | |
660 | /* |
661 | * Only clear those fields we need to clear, not those that we will |
662 | * actually initialise below. Hence, don't put any more fields after |
663 | * the tail pointer in struct sk_buff! |
664 | */ |
665 | memset(skb, 0, offsetof(struct sk_buff, tail)); |
666 | __build_skb_around(skb, data, frag_size: size); |
667 | skb->pfmemalloc = pfmemalloc; |
668 | |
669 | if (flags & SKB_ALLOC_FCLONE) { |
670 | struct sk_buff_fclones *fclones; |
671 | |
672 | fclones = container_of(skb, struct sk_buff_fclones, skb1); |
673 | |
674 | skb->fclone = SKB_FCLONE_ORIG; |
675 | refcount_set(r: &fclones->fclone_ref, n: 1); |
676 | } |
677 | |
678 | return skb; |
679 | |
680 | nodata: |
681 | kmem_cache_free(s: cache, objp: skb); |
682 | return NULL; |
683 | } |
684 | EXPORT_SYMBOL(__alloc_skb); |
685 | |
686 | /** |
687 | * __netdev_alloc_skb - allocate an skbuff for rx on a specific device |
688 | * @dev: network device to receive on |
689 | * @len: length to allocate |
690 | * @gfp_mask: get_free_pages mask, passed to alloc_skb |
691 | * |
692 | * Allocate a new &sk_buff and assign it a usage count of one. The |
693 | * buffer has NET_SKB_PAD headroom built in. Users should allocate |
694 | * the headroom they think they need without accounting for the |
695 | * built in space. The built in space is used for optimisations. |
696 | * |
697 | * %NULL is returned if there is no free memory. |
698 | */ |
699 | struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, |
700 | gfp_t gfp_mask) |
701 | { |
702 | struct page_frag_cache *nc; |
703 | struct sk_buff *skb; |
704 | bool pfmemalloc; |
705 | void *data; |
706 | |
707 | len += NET_SKB_PAD; |
708 | |
709 | /* If requested length is either too small or too big, |
710 | * we use kmalloc() for skb->head allocation. |
711 | */ |
712 | if (len <= SKB_WITH_OVERHEAD(1024) || |
713 | len > SKB_WITH_OVERHEAD(PAGE_SIZE) || |
714 | (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { |
715 | skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); |
716 | if (!skb) |
717 | goto skb_fail; |
718 | goto skb_success; |
719 | } |
720 | |
721 | len = SKB_HEAD_ALIGN(len); |
722 | |
723 | if (sk_memalloc_socks()) |
724 | gfp_mask |= __GFP_MEMALLOC; |
725 | |
726 | if (in_hardirq() || irqs_disabled()) { |
727 | nc = this_cpu_ptr(&netdev_alloc_cache); |
728 | data = page_frag_alloc(nc, fragsz: len, gfp_mask); |
729 | pfmemalloc = nc->pfmemalloc; |
730 | } else { |
731 | local_bh_disable(); |
732 | nc = this_cpu_ptr(&napi_alloc_cache.page); |
733 | data = page_frag_alloc(nc, fragsz: len, gfp_mask); |
734 | pfmemalloc = nc->pfmemalloc; |
735 | local_bh_enable(); |
736 | } |
737 | |
738 | if (unlikely(!data)) |
739 | return NULL; |
740 | |
741 | skb = __build_skb(data, frag_size: len); |
742 | if (unlikely(!skb)) { |
743 | skb_free_frag(addr: data); |
744 | return NULL; |
745 | } |
746 | |
747 | if (pfmemalloc) |
748 | skb->pfmemalloc = 1; |
749 | skb->head_frag = 1; |
750 | |
751 | skb_success: |
752 | skb_reserve(skb, NET_SKB_PAD); |
753 | skb->dev = dev; |
754 | |
755 | skb_fail: |
756 | return skb; |
757 | } |
758 | EXPORT_SYMBOL(__netdev_alloc_skb); |
759 | |
760 | /** |
761 | * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance |
762 | * @napi: napi instance this buffer was allocated for |
763 | * @len: length to allocate |
764 | * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages |
765 | * |
766 | * Allocate a new sk_buff for use in NAPI receive. This buffer will |
767 | * attempt to allocate the head from a special reserved region used |
768 | * only for NAPI Rx allocation. By doing this we can save several |
769 | * CPU cycles by avoiding having to disable and re-enable IRQs. |
770 | * |
771 | * %NULL is returned if there is no free memory. |
772 | */ |
773 | struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, |
774 | gfp_t gfp_mask) |
775 | { |
776 | struct napi_alloc_cache *nc; |
777 | struct sk_buff *skb; |
778 | bool pfmemalloc; |
779 | void *data; |
780 | |
781 | DEBUG_NET_WARN_ON_ONCE(!in_softirq()); |
782 | len += NET_SKB_PAD + NET_IP_ALIGN; |
783 | |
784 | /* If requested length is either too small or too big, |
785 | * we use kmalloc() for skb->head allocation. |
786 | * When the small frag allocator is available, prefer it over kmalloc |
787 | * for small fragments |
788 | */ |
789 | if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || |
790 | len > SKB_WITH_OVERHEAD(PAGE_SIZE) || |
791 | (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { |
792 | skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, |
793 | NUMA_NO_NODE); |
794 | if (!skb) |
795 | goto skb_fail; |
796 | goto skb_success; |
797 | } |
798 | |
799 | nc = this_cpu_ptr(&napi_alloc_cache); |
800 | |
801 | if (sk_memalloc_socks()) |
802 | gfp_mask |= __GFP_MEMALLOC; |
803 | |
804 | if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { |
805 | /* we are artificially inflating the allocation size, but |
806 | * that is not as bad as it may look like, as: |
807 | * - 'len' less than GRO_MAX_HEAD makes little sense |
808 | * - On most systems, larger 'len' values lead to fragment |
809 | * size above 512 bytes |
810 | * - kmalloc would use the kmalloc-1k slab for such values |
811 | * - Builds with smaller GRO_MAX_HEAD will very likely do |
812 | * little networking, as that implies no WiFi and no |
813 | * tunnels support, and 32 bits arches. |
814 | */ |
815 | len = SZ_1K; |
816 | |
817 | data = page_frag_alloc_1k(nc: &nc->page_small, gfp: gfp_mask); |
818 | pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); |
819 | } else { |
820 | len = SKB_HEAD_ALIGN(len); |
821 | |
822 | data = page_frag_alloc(nc: &nc->page, fragsz: len, gfp_mask); |
823 | pfmemalloc = nc->page.pfmemalloc; |
824 | } |
825 | |
826 | if (unlikely(!data)) |
827 | return NULL; |
828 | |
829 | skb = __napi_build_skb(data, frag_size: len); |
830 | if (unlikely(!skb)) { |
831 | skb_free_frag(addr: data); |
832 | return NULL; |
833 | } |
834 | |
835 | if (pfmemalloc) |
836 | skb->pfmemalloc = 1; |
837 | skb->head_frag = 1; |
838 | |
839 | skb_success: |
840 | skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); |
841 | skb->dev = napi->dev; |
842 | |
843 | skb_fail: |
844 | return skb; |
845 | } |
846 | EXPORT_SYMBOL(__napi_alloc_skb); |
847 | |
848 | void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, |
849 | int size, unsigned int truesize) |
850 | { |
851 | DEBUG_NET_WARN_ON_ONCE(size > truesize); |
852 | |
853 | skb_fill_page_desc(skb, i, page, off, size); |
854 | skb->len += size; |
855 | skb->data_len += size; |
856 | skb->truesize += truesize; |
857 | } |
858 | EXPORT_SYMBOL(skb_add_rx_frag); |
859 | |
860 | void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, |
861 | unsigned int truesize) |
862 | { |
863 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
864 | |
865 | DEBUG_NET_WARN_ON_ONCE(size > truesize); |
866 | |
867 | skb_frag_size_add(frag, delta: size); |
868 | skb->len += size; |
869 | skb->data_len += size; |
870 | skb->truesize += truesize; |
871 | } |
872 | EXPORT_SYMBOL(skb_coalesce_rx_frag); |
873 | |
874 | static void skb_drop_list(struct sk_buff **listp) |
875 | { |
876 | kfree_skb_list(segs: *listp); |
877 | *listp = NULL; |
878 | } |
879 | |
880 | static inline void skb_drop_fraglist(struct sk_buff *skb) |
881 | { |
882 | skb_drop_list(listp: &skb_shinfo(skb)->frag_list); |
883 | } |
884 | |
885 | static void skb_clone_fraglist(struct sk_buff *skb) |
886 | { |
887 | struct sk_buff *list; |
888 | |
889 | skb_walk_frags(skb, list) |
890 | skb_get(skb: list); |
891 | } |
892 | |
893 | #if IS_ENABLED(CONFIG_PAGE_POOL) |
894 | bool napi_pp_put_page(struct page *page, bool napi_safe) |
895 | { |
896 | bool allow_direct = false; |
897 | struct page_pool *pp; |
898 | |
899 | page = compound_head(page); |
900 | |
901 | /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation |
902 | * in order to preserve any existing bits, such as bit 0 for the |
903 | * head page of compound page and bit 1 for pfmemalloc page, so |
904 | * mask those bits for freeing side when doing below checking, |
905 | * and page_is_pfmemalloc() is checked in __page_pool_put_page() |
906 | * to avoid recycling the pfmemalloc page. |
907 | */ |
908 | if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) |
909 | return false; |
910 | |
911 | pp = page->pp; |
912 | |
913 | /* Allow direct recycle if we have reasons to believe that we are |
914 | * in the same context as the consumer would run, so there's |
915 | * no possible race. |
916 | * __page_pool_put_page() makes sure we're not in hardirq context |
917 | * and interrupts are enabled prior to accessing the cache. |
918 | */ |
919 | if (napi_safe || in_softirq()) { |
920 | const struct napi_struct *napi = READ_ONCE(pp->p.napi); |
921 | |
922 | allow_direct = napi && |
923 | READ_ONCE(napi->list_owner) == smp_processor_id(); |
924 | } |
925 | |
926 | /* Driver set this to memory recycling info. Reset it on recycle. |
927 | * This will *not* work for NIC using a split-page memory model. |
928 | * The page will be returned to the pool here regardless of the |
929 | * 'flipped' fragment being in use or not. |
930 | */ |
931 | page_pool_put_full_page(pool: pp, page, allow_direct); |
932 | |
933 | return true; |
934 | } |
935 | EXPORT_SYMBOL(napi_pp_put_page); |
936 | #endif |
937 | |
938 | static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) |
939 | { |
940 | if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) |
941 | return false; |
942 | return napi_pp_put_page(virt_to_page(data), napi_safe); |
943 | } |
944 | |
945 | static void skb_kfree_head(void *head, unsigned int end_offset) |
946 | { |
947 | if (end_offset == SKB_SMALL_HEAD_HEADROOM) |
948 | kmem_cache_free(s: skb_small_head_cache, objp: head); |
949 | else |
950 | kfree(objp: head); |
951 | } |
952 | |
953 | static void skb_free_head(struct sk_buff *skb, bool napi_safe) |
954 | { |
955 | unsigned char *head = skb->head; |
956 | |
957 | if (skb->head_frag) { |
958 | if (skb_pp_recycle(skb, data: head, napi_safe)) |
959 | return; |
960 | skb_free_frag(addr: head); |
961 | } else { |
962 | skb_kfree_head(head, end_offset: skb_end_offset(skb)); |
963 | } |
964 | } |
965 | |
966 | static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, |
967 | bool napi_safe) |
968 | { |
969 | struct skb_shared_info *shinfo = skb_shinfo(skb); |
970 | int i; |
971 | |
972 | if (skb->cloned && |
973 | atomic_sub_return(i: skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, |
974 | v: &shinfo->dataref)) |
975 | goto exit; |
976 | |
977 | if (skb_zcopy(skb)) { |
978 | bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS; |
979 | |
980 | skb_zcopy_clear(skb, zerocopy_success: true); |
981 | if (skip_unref) |
982 | goto free_head; |
983 | } |
984 | |
985 | for (i = 0; i < shinfo->nr_frags; i++) |
986 | napi_frag_unref(frag: &shinfo->frags[i], recycle: skb->pp_recycle, napi_safe); |
987 | |
988 | free_head: |
989 | if (shinfo->frag_list) |
990 | kfree_skb_list_reason(segs: shinfo->frag_list, reason); |
991 | |
992 | skb_free_head(skb, napi_safe); |
993 | exit: |
994 | /* When we clone an SKB we copy the reycling bit. The pp_recycle |
995 | * bit is only set on the head though, so in order to avoid races |
996 | * while trying to recycle fragments on __skb_frag_unref() we need |
997 | * to make one SKB responsible for triggering the recycle path. |
998 | * So disable the recycling bit if an SKB is cloned and we have |
999 | * additional references to the fragmented part of the SKB. |
1000 | * Eventually the last SKB will have the recycling bit set and it's |
1001 | * dataref set to 0, which will trigger the recycling |
1002 | */ |
1003 | skb->pp_recycle = 0; |
1004 | } |
1005 | |
1006 | /* |
1007 | * Free an skbuff by memory without cleaning the state. |
1008 | */ |
1009 | static void kfree_skbmem(struct sk_buff *skb) |
1010 | { |
1011 | struct sk_buff_fclones *fclones; |
1012 | |
1013 | switch (skb->fclone) { |
1014 | case SKB_FCLONE_UNAVAILABLE: |
1015 | kmem_cache_free(s: skbuff_cache, objp: skb); |
1016 | return; |
1017 | |
1018 | case SKB_FCLONE_ORIG: |
1019 | fclones = container_of(skb, struct sk_buff_fclones, skb1); |
1020 | |
1021 | /* We usually free the clone (TX completion) before original skb |
1022 | * This test would have no chance to be true for the clone, |
1023 | * while here, branch prediction will be good. |
1024 | */ |
1025 | if (refcount_read(r: &fclones->fclone_ref) == 1) |
1026 | goto fastpath; |
1027 | break; |
1028 | |
1029 | default: /* SKB_FCLONE_CLONE */ |
1030 | fclones = container_of(skb, struct sk_buff_fclones, skb2); |
1031 | break; |
1032 | } |
1033 | if (!refcount_dec_and_test(r: &fclones->fclone_ref)) |
1034 | return; |
1035 | fastpath: |
1036 | kmem_cache_free(s: skbuff_fclone_cache, objp: fclones); |
1037 | } |
1038 | |
1039 | void skb_release_head_state(struct sk_buff *skb) |
1040 | { |
1041 | skb_dst_drop(skb); |
1042 | if (skb->destructor) { |
1043 | DEBUG_NET_WARN_ON_ONCE(in_hardirq()); |
1044 | skb->destructor(skb); |
1045 | } |
1046 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
1047 | nf_conntrack_put(nfct: skb_nfct(skb)); |
1048 | #endif |
1049 | skb_ext_put(skb); |
1050 | } |
1051 | |
1052 | /* Free everything but the sk_buff shell. */ |
1053 | static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, |
1054 | bool napi_safe) |
1055 | { |
1056 | skb_release_head_state(skb); |
1057 | if (likely(skb->head)) |
1058 | skb_release_data(skb, reason, napi_safe); |
1059 | } |
1060 | |
1061 | /** |
1062 | * __kfree_skb - private function |
1063 | * @skb: buffer |
1064 | * |
1065 | * Free an sk_buff. Release anything attached to the buffer. |
1066 | * Clean the state. This is an internal helper function. Users should |
1067 | * always call kfree_skb |
1068 | */ |
1069 | |
1070 | void __kfree_skb(struct sk_buff *skb) |
1071 | { |
1072 | skb_release_all(skb, reason: SKB_DROP_REASON_NOT_SPECIFIED, napi_safe: false); |
1073 | kfree_skbmem(skb); |
1074 | } |
1075 | EXPORT_SYMBOL(__kfree_skb); |
1076 | |
1077 | static __always_inline |
1078 | bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) |
1079 | { |
1080 | if (unlikely(!skb_unref(skb))) |
1081 | return false; |
1082 | |
1083 | DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET || |
1084 | u32_get_bits(reason, |
1085 | SKB_DROP_REASON_SUBSYS_MASK) >= |
1086 | SKB_DROP_REASON_SUBSYS_NUM); |
1087 | |
1088 | if (reason == SKB_CONSUMED) |
1089 | trace_consume_skb(skb, location: __builtin_return_address(0)); |
1090 | else |
1091 | trace_kfree_skb(skb, location: __builtin_return_address(0), reason); |
1092 | return true; |
1093 | } |
1094 | |
1095 | /** |
1096 | * kfree_skb_reason - free an sk_buff with special reason |
1097 | * @skb: buffer to free |
1098 | * @reason: reason why this skb is dropped |
1099 | * |
1100 | * Drop a reference to the buffer and free it if the usage count has |
1101 | * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' |
1102 | * tracepoint. |
1103 | */ |
1104 | void __fix_address |
1105 | kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) |
1106 | { |
1107 | if (__kfree_skb_reason(skb, reason)) |
1108 | __kfree_skb(skb); |
1109 | } |
1110 | EXPORT_SYMBOL(kfree_skb_reason); |
1111 | |
/* Number of sk_buff pointers collected before a bulk free is issued. */
#define KFREE_SKB_BULK_SIZE 16

/* Batch of sk_buff structures waiting to be returned to skbuff_cache. */
struct skb_free_array {
	unsigned int skb_count;	/* number of valid entries in skb_array */
	void *skb_array[KFREE_SKB_BULK_SIZE];
};
1118 | |
1119 | static void kfree_skb_add_bulk(struct sk_buff *skb, |
1120 | struct skb_free_array *sa, |
1121 | enum skb_drop_reason reason) |
1122 | { |
1123 | /* if SKB is a clone, don't handle this case */ |
1124 | if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { |
1125 | __kfree_skb(skb); |
1126 | return; |
1127 | } |
1128 | |
1129 | skb_release_all(skb, reason, napi_safe: false); |
1130 | sa->skb_array[sa->skb_count++] = skb; |
1131 | |
1132 | if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { |
1133 | kmem_cache_free_bulk(s: skbuff_cache, KFREE_SKB_BULK_SIZE, |
1134 | p: sa->skb_array); |
1135 | sa->skb_count = 0; |
1136 | } |
1137 | } |
1138 | |
1139 | void __fix_address |
1140 | kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) |
1141 | { |
1142 | struct skb_free_array sa; |
1143 | |
1144 | sa.skb_count = 0; |
1145 | |
1146 | while (segs) { |
1147 | struct sk_buff *next = segs->next; |
1148 | |
1149 | if (__kfree_skb_reason(skb: segs, reason)) { |
1150 | skb_poison_list(skb: segs); |
1151 | kfree_skb_add_bulk(skb: segs, sa: &sa, reason); |
1152 | } |
1153 | |
1154 | segs = next; |
1155 | } |
1156 | |
1157 | if (sa.skb_count) |
1158 | kmem_cache_free_bulk(s: skbuff_cache, size: sa.skb_count, p: sa.skb_array); |
1159 | } |
1160 | EXPORT_SYMBOL(kfree_skb_list_reason); |
1161 | |
1162 | /* Dump skb information and contents. |
1163 | * |
1164 | * Must only be called from net_ratelimit()-ed paths. |
1165 | * |
1166 | * Dumps whole packets if full_pkt, only headers otherwise. |
1167 | */ |
1168 | void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) |
1169 | { |
1170 | struct skb_shared_info *sh = skb_shinfo(skb); |
1171 | struct net_device *dev = skb->dev; |
1172 | struct sock *sk = skb->sk; |
1173 | struct sk_buff *list_skb; |
1174 | bool has_mac, has_trans; |
1175 | int headroom, tailroom; |
1176 | int i, len, seg_len; |
1177 | |
1178 | if (full_pkt) |
1179 | len = skb->len; |
1180 | else |
1181 | len = min_t(int, skb->len, MAX_HEADER + 128); |
1182 | |
1183 | headroom = skb_headroom(skb); |
1184 | tailroom = skb_tailroom(skb); |
1185 | |
1186 | has_mac = skb_mac_header_was_set(skb); |
1187 | has_trans = skb_transport_header_was_set(skb); |
1188 | |
1189 | printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" |
1190 | "mac=(%d,%d) net=(%d,%d) trans=%d\n" |
1191 | "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" |
1192 | "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" |
1193 | "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n" , |
1194 | level, skb->len, headroom, skb_headlen(skb), tailroom, |
1195 | has_mac ? skb->mac_header : -1, |
1196 | has_mac ? skb_mac_header_len(skb) : -1, |
1197 | skb->network_header, |
1198 | has_trans ? skb_network_header_len(skb) : -1, |
1199 | has_trans ? skb->transport_header : -1, |
1200 | sh->tx_flags, sh->nr_frags, |
1201 | sh->gso_size, sh->gso_type, sh->gso_segs, |
1202 | skb->csum, skb->ip_summed, skb->csum_complete_sw, |
1203 | skb->csum_valid, skb->csum_level, |
1204 | skb->hash, skb->sw_hash, skb->l4_hash, |
1205 | ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); |
1206 | |
1207 | if (dev) |
1208 | printk("%sdev name=%s feat=%pNF\n" , |
1209 | level, dev->name, &dev->features); |
1210 | if (sk) |
1211 | printk("%ssk family=%hu type=%u proto=%u\n" , |
1212 | level, sk->sk_family, sk->sk_type, sk->sk_protocol); |
1213 | |
1214 | if (full_pkt && headroom) |
1215 | print_hex_dump(level, prefix_str: "skb headroom: " , prefix_type: DUMP_PREFIX_OFFSET, |
1216 | rowsize: 16, groupsize: 1, buf: skb->head, len: headroom, ascii: false); |
1217 | |
1218 | seg_len = min_t(int, skb_headlen(skb), len); |
1219 | if (seg_len) |
1220 | print_hex_dump(level, prefix_str: "skb linear: " , prefix_type: DUMP_PREFIX_OFFSET, |
1221 | rowsize: 16, groupsize: 1, buf: skb->data, len: seg_len, ascii: false); |
1222 | len -= seg_len; |
1223 | |
1224 | if (full_pkt && tailroom) |
1225 | print_hex_dump(level, prefix_str: "skb tailroom: " , prefix_type: DUMP_PREFIX_OFFSET, |
1226 | rowsize: 16, groupsize: 1, buf: skb_tail_pointer(skb), len: tailroom, ascii: false); |
1227 | |
1228 | for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { |
1229 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
1230 | u32 p_off, p_len, copied; |
1231 | struct page *p; |
1232 | u8 *vaddr; |
1233 | |
1234 | skb_frag_foreach_page(frag, skb_frag_off(frag), |
1235 | skb_frag_size(frag), p, p_off, p_len, |
1236 | copied) { |
1237 | seg_len = min_t(int, p_len, len); |
1238 | vaddr = kmap_atomic(page: p); |
1239 | print_hex_dump(level, prefix_str: "skb frag: " , |
1240 | prefix_type: DUMP_PREFIX_OFFSET, |
1241 | rowsize: 16, groupsize: 1, buf: vaddr + p_off, len: seg_len, ascii: false); |
1242 | kunmap_atomic(vaddr); |
1243 | len -= seg_len; |
1244 | if (!len) |
1245 | break; |
1246 | } |
1247 | } |
1248 | |
1249 | if (full_pkt && skb_has_frag_list(skb)) { |
1250 | printk("skb fraglist:\n" ); |
1251 | skb_walk_frags(skb, list_skb) |
1252 | skb_dump(level, skb: list_skb, full_pkt: true); |
1253 | } |
1254 | } |
1255 | EXPORT_SYMBOL(skb_dump); |
1256 | |
1257 | /** |
1258 | * skb_tx_error - report an sk_buff xmit error |
1259 | * @skb: buffer that triggered an error |
1260 | * |
1261 | * Report xmit error if a device callback is tracking this skb. |
1262 | * skb must be freed afterwards. |
1263 | */ |
1264 | void skb_tx_error(struct sk_buff *skb) |
1265 | { |
1266 | if (skb) { |
1267 | skb_zcopy_downgrade_managed(skb); |
1268 | skb_zcopy_clear(skb, zerocopy_success: true); |
1269 | } |
1270 | } |
1271 | EXPORT_SYMBOL(skb_tx_error); |
1272 | |
#ifdef CONFIG_TRACEPOINTS
/**
 *	consume_skb - free an skbuff
 *	@skb: buffer to free
 *
 *	Drop a ref to the buffer and free it if the usage count has hit zero
 *	Functions identically to kfree_skb, but kfree_skb assumes that the frame
 *	is being dropped after a failure and notes that
 *
 *	Only built when CONFIG_TRACEPOINTS is enabled.
 */
void consume_skb(struct sk_buff *skb)
{
	if (!skb_unref(skb))
		return;

	trace_consume_skb(skb, __builtin_return_address(0));
	__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
#endif
1292 | |
1293 | /** |
1294 | * __consume_stateless_skb - free an skbuff, assuming it is stateless |
1295 | * @skb: buffer to free |
1296 | * |
1297 | * Alike consume_skb(), but this variant assumes that this is the last |
1298 | * skb reference and all the head states have been already dropped |
1299 | */ |
1300 | void __consume_stateless_skb(struct sk_buff *skb) |
1301 | { |
1302 | trace_consume_skb(skb, location: __builtin_return_address(0)); |
1303 | skb_release_data(skb, reason: SKB_CONSUMED, napi_safe: false); |
1304 | kfree_skbmem(skb); |
1305 | } |
1306 | |
1307 | static void napi_skb_cache_put(struct sk_buff *skb) |
1308 | { |
1309 | struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); |
1310 | u32 i; |
1311 | |
1312 | kasan_poison_object_data(cache: skbuff_cache, object: skb); |
1313 | nc->skb_cache[nc->skb_count++] = skb; |
1314 | |
1315 | if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { |
1316 | for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) |
1317 | kasan_unpoison_object_data(cache: skbuff_cache, |
1318 | object: nc->skb_cache[i]); |
1319 | |
1320 | kmem_cache_free_bulk(s: skbuff_cache, NAPI_SKB_CACHE_HALF, |
1321 | p: nc->skb_cache + NAPI_SKB_CACHE_HALF); |
1322 | nc->skb_count = NAPI_SKB_CACHE_HALF; |
1323 | } |
1324 | } |
1325 | |
1326 | void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) |
1327 | { |
1328 | skb_release_all(skb, reason, napi_safe: true); |
1329 | napi_skb_cache_put(skb); |
1330 | } |
1331 | |
1332 | void napi_skb_free_stolen_head(struct sk_buff *skb) |
1333 | { |
1334 | if (unlikely(skb->slow_gro)) { |
1335 | nf_reset_ct(skb); |
1336 | skb_dst_drop(skb); |
1337 | skb_ext_put(skb); |
1338 | skb_orphan(skb); |
1339 | skb->slow_gro = 0; |
1340 | } |
1341 | napi_skb_cache_put(skb); |
1342 | } |
1343 | |
1344 | void napi_consume_skb(struct sk_buff *skb, int budget) |
1345 | { |
1346 | /* Zero budget indicate non-NAPI context called us, like netpoll */ |
1347 | if (unlikely(!budget)) { |
1348 | dev_consume_skb_any(skb); |
1349 | return; |
1350 | } |
1351 | |
1352 | DEBUG_NET_WARN_ON_ONCE(!in_softirq()); |
1353 | |
1354 | if (!skb_unref(skb)) |
1355 | return; |
1356 | |
1357 | /* if reaching here SKB is ready to free */ |
1358 | trace_consume_skb(skb, location: __builtin_return_address(0)); |
1359 | |
1360 | /* if SKB is a clone, don't handle this case */ |
1361 | if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { |
1362 | __kfree_skb(skb); |
1363 | return; |
1364 | } |
1365 | |
1366 | skb_release_all(skb, reason: SKB_CONSUMED, napi_safe: !!budget); |
1367 | napi_skb_cache_put(skb); |
1368 | } |
1369 | EXPORT_SYMBOL(napi_consume_skb); |
1370 | |
/* Make sure a field is contained by headers group.
 *
 * Build-time check that @field and headers.@field sit at the same offset in
 * struct sk_buff. Expands to a single BUILD_BUG_ON() expression; the caller
 * supplies the terminating semicolon.
 */
#define CHECK_SKB_FIELD(field) \
	BUILD_BUG_ON(offsetof(struct sk_buff, field) !=		\
		     offsetof(struct sk_buff, headers.field))
1375 | |
1376 | static void (struct sk_buff *new, const struct sk_buff *old) |
1377 | { |
1378 | new->tstamp = old->tstamp; |
1379 | /* We do not copy old->sk */ |
1380 | new->dev = old->dev; |
1381 | memcpy(new->cb, old->cb, sizeof(old->cb)); |
1382 | skb_dst_copy(nskb: new, oskb: old); |
1383 | __skb_ext_copy(dst: new, src: old); |
1384 | __nf_copy(dst: new, src: old, copy: false); |
1385 | |
1386 | /* Note : this field could be in the headers group. |
1387 | * It is not yet because we do not want to have a 16 bit hole |
1388 | */ |
1389 | new->queue_mapping = old->queue_mapping; |
1390 | |
1391 | memcpy(&new->headers, &old->headers, sizeof(new->headers)); |
1392 | CHECK_SKB_FIELD(protocol); |
1393 | CHECK_SKB_FIELD(csum); |
1394 | CHECK_SKB_FIELD(hash); |
1395 | CHECK_SKB_FIELD(priority); |
1396 | CHECK_SKB_FIELD(skb_iif); |
1397 | CHECK_SKB_FIELD(vlan_proto); |
1398 | CHECK_SKB_FIELD(vlan_tci); |
1399 | CHECK_SKB_FIELD(transport_header); |
1400 | CHECK_SKB_FIELD(network_header); |
1401 | CHECK_SKB_FIELD(mac_header); |
1402 | CHECK_SKB_FIELD(inner_protocol); |
1403 | CHECK_SKB_FIELD(inner_transport_header); |
1404 | CHECK_SKB_FIELD(inner_network_header); |
1405 | CHECK_SKB_FIELD(inner_mac_header); |
1406 | CHECK_SKB_FIELD(mark); |
1407 | #ifdef CONFIG_NETWORK_SECMARK |
1408 | CHECK_SKB_FIELD(secmark); |
1409 | #endif |
1410 | #ifdef CONFIG_NET_RX_BUSY_POLL |
1411 | CHECK_SKB_FIELD(napi_id); |
1412 | #endif |
1413 | CHECK_SKB_FIELD(alloc_cpu); |
1414 | #ifdef CONFIG_XPS |
1415 | CHECK_SKB_FIELD(sender_cpu); |
1416 | #endif |
1417 | #ifdef CONFIG_NET_SCHED |
1418 | CHECK_SKB_FIELD(tc_index); |
1419 | #endif |
1420 | |
1421 | } |
1422 | |
1423 | /* |
1424 | * You should not add any new code to this function. Add it to |
1425 | * __copy_skb_header above instead. |
1426 | */ |
1427 | static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) |
1428 | { |
1429 | #define C(x) n->x = skb->x |
1430 | |
1431 | n->next = n->prev = NULL; |
1432 | n->sk = NULL; |
1433 | __copy_skb_header(new: n, old: skb); |
1434 | |
1435 | C(len); |
1436 | C(data_len); |
1437 | C(mac_len); |
1438 | n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; |
1439 | n->cloned = 1; |
1440 | n->nohdr = 0; |
1441 | n->peeked = 0; |
1442 | C(pfmemalloc); |
1443 | C(pp_recycle); |
1444 | n->destructor = NULL; |
1445 | C(tail); |
1446 | C(end); |
1447 | C(head); |
1448 | C(head_frag); |
1449 | C(data); |
1450 | C(truesize); |
1451 | refcount_set(r: &n->users, n: 1); |
1452 | |
1453 | atomic_inc(v: &(skb_shinfo(skb)->dataref)); |
1454 | skb->cloned = 1; |
1455 | |
1456 | return n; |
1457 | #undef C |
1458 | } |
1459 | |
1460 | /** |
1461 | * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg |
1462 | * @first: first sk_buff of the msg |
1463 | */ |
1464 | struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) |
1465 | { |
1466 | struct sk_buff *n; |
1467 | |
1468 | n = alloc_skb(size: 0, GFP_ATOMIC); |
1469 | if (!n) |
1470 | return NULL; |
1471 | |
1472 | n->len = first->len; |
1473 | n->data_len = first->len; |
1474 | n->truesize = first->truesize; |
1475 | |
1476 | skb_shinfo(n)->frag_list = first; |
1477 | |
1478 | __copy_skb_header(new: n, old: first); |
1479 | n->destructor = NULL; |
1480 | |
1481 | return n; |
1482 | } |
1483 | EXPORT_SYMBOL_GPL(alloc_skb_for_msg); |
1484 | |
1485 | /** |
1486 | * skb_morph - morph one skb into another |
1487 | * @dst: the skb to receive the contents |
1488 | * @src: the skb to supply the contents |
1489 | * |
1490 | * This is identical to skb_clone except that the target skb is |
1491 | * supplied by the user. |
1492 | * |
1493 | * The target skb is returned upon exit. |
1494 | */ |
1495 | struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) |
1496 | { |
1497 | skb_release_all(skb: dst, reason: SKB_CONSUMED, napi_safe: false); |
1498 | return __skb_clone(n: dst, skb: src); |
1499 | } |
1500 | EXPORT_SYMBOL_GPL(skb_morph); |
1501 | |
1502 | int mm_account_pinned_pages(struct mmpin *mmp, size_t size) |
1503 | { |
1504 | unsigned long max_pg, num_pg, new_pg, old_pg, rlim; |
1505 | struct user_struct *user; |
1506 | |
1507 | if (capable(CAP_IPC_LOCK) || !size) |
1508 | return 0; |
1509 | |
1510 | rlim = rlimit(RLIMIT_MEMLOCK); |
1511 | if (rlim == RLIM_INFINITY) |
1512 | return 0; |
1513 | |
1514 | num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ |
1515 | max_pg = rlim >> PAGE_SHIFT; |
1516 | user = mmp->user ? : current_user(); |
1517 | |
1518 | old_pg = atomic_long_read(v: &user->locked_vm); |
1519 | do { |
1520 | new_pg = old_pg + num_pg; |
1521 | if (new_pg > max_pg) |
1522 | return -ENOBUFS; |
1523 | } while (!atomic_long_try_cmpxchg(v: &user->locked_vm, old: &old_pg, new: new_pg)); |
1524 | |
1525 | if (!mmp->user) { |
1526 | mmp->user = get_uid(u: user); |
1527 | mmp->num_pg = num_pg; |
1528 | } else { |
1529 | mmp->num_pg += num_pg; |
1530 | } |
1531 | |
1532 | return 0; |
1533 | } |
1534 | EXPORT_SYMBOL_GPL(mm_account_pinned_pages); |
1535 | |
1536 | void mm_unaccount_pinned_pages(struct mmpin *mmp) |
1537 | { |
1538 | if (mmp->user) { |
1539 | atomic_long_sub(i: mmp->num_pg, v: &mmp->user->locked_vm); |
1540 | free_uid(mmp->user); |
1541 | } |
1542 | } |
1543 | EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); |
1544 | |
1545 | static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) |
1546 | { |
1547 | struct ubuf_info_msgzc *uarg; |
1548 | struct sk_buff *skb; |
1549 | |
1550 | WARN_ON_ONCE(!in_task()); |
1551 | |
1552 | skb = sock_omalloc(sk, size: 0, GFP_KERNEL); |
1553 | if (!skb) |
1554 | return NULL; |
1555 | |
1556 | BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); |
1557 | uarg = (void *)skb->cb; |
1558 | uarg->mmp.user = NULL; |
1559 | |
1560 | if (mm_account_pinned_pages(&uarg->mmp, size)) { |
1561 | kfree_skb(skb); |
1562 | return NULL; |
1563 | } |
1564 | |
1565 | uarg->ubuf.callback = msg_zerocopy_callback; |
1566 | uarg->id = ((u32)atomic_inc_return(v: &sk->sk_zckey)) - 1; |
1567 | uarg->len = 1; |
1568 | uarg->bytelen = size; |
1569 | uarg->zerocopy = 1; |
1570 | uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; |
1571 | refcount_set(r: &uarg->ubuf.refcnt, n: 1); |
1572 | sock_hold(sk); |
1573 | |
1574 | return &uarg->ubuf; |
1575 | } |
1576 | |
1577 | static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) |
1578 | { |
1579 | return container_of((void *)uarg, struct sk_buff, cb); |
1580 | } |
1581 | |
1582 | struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, |
1583 | struct ubuf_info *uarg) |
1584 | { |
1585 | if (uarg) { |
1586 | struct ubuf_info_msgzc *uarg_zc; |
1587 | const u32 byte_limit = 1 << 19; /* limit to a few TSO */ |
1588 | u32 bytelen, next; |
1589 | |
1590 | /* there might be non MSG_ZEROCOPY users */ |
1591 | if (uarg->callback != msg_zerocopy_callback) |
1592 | return NULL; |
1593 | |
1594 | /* realloc only when socket is locked (TCP, UDP cork), |
1595 | * so uarg->len and sk_zckey access is serialized |
1596 | */ |
1597 | if (!sock_owned_by_user(sk)) { |
1598 | WARN_ON_ONCE(1); |
1599 | return NULL; |
1600 | } |
1601 | |
1602 | uarg_zc = uarg_to_msgzc(uarg); |
1603 | bytelen = uarg_zc->bytelen + size; |
1604 | if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) { |
1605 | /* TCP can create new skb to attach new uarg */ |
1606 | if (sk->sk_type == SOCK_STREAM) |
1607 | goto new_alloc; |
1608 | return NULL; |
1609 | } |
1610 | |
1611 | next = (u32)atomic_read(v: &sk->sk_zckey); |
1612 | if ((u32)(uarg_zc->id + uarg_zc->len) == next) { |
1613 | if (mm_account_pinned_pages(&uarg_zc->mmp, size)) |
1614 | return NULL; |
1615 | uarg_zc->len++; |
1616 | uarg_zc->bytelen = bytelen; |
1617 | atomic_set(v: &sk->sk_zckey, i: ++next); |
1618 | |
1619 | /* no extra ref when appending to datagram (MSG_MORE) */ |
1620 | if (sk->sk_type == SOCK_STREAM) |
1621 | net_zcopy_get(uarg); |
1622 | |
1623 | return uarg; |
1624 | } |
1625 | } |
1626 | |
1627 | new_alloc: |
1628 | return msg_zerocopy_alloc(sk, size); |
1629 | } |
1630 | EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); |
1631 | |
1632 | static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) |
1633 | { |
1634 | struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); |
1635 | u32 old_lo, old_hi; |
1636 | u64 sum_len; |
1637 | |
1638 | old_lo = serr->ee.ee_info; |
1639 | old_hi = serr->ee.ee_data; |
1640 | sum_len = old_hi - old_lo + 1ULL + len; |
1641 | |
1642 | if (sum_len >= (1ULL << 32)) |
1643 | return false; |
1644 | |
1645 | if (lo != old_hi + 1) |
1646 | return false; |
1647 | |
1648 | serr->ee.ee_data += len; |
1649 | return true; |
1650 | } |
1651 | |
1652 | static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg) |
1653 | { |
1654 | struct sk_buff *tail, *skb = skb_from_uarg(uarg); |
1655 | struct sock_exterr_skb *serr; |
1656 | struct sock *sk = skb->sk; |
1657 | struct sk_buff_head *q; |
1658 | unsigned long flags; |
1659 | bool is_zerocopy; |
1660 | u32 lo, hi; |
1661 | u16 len; |
1662 | |
1663 | mm_unaccount_pinned_pages(&uarg->mmp); |
1664 | |
1665 | /* if !len, there was only 1 call, and it was aborted |
1666 | * so do not queue a completion notification |
1667 | */ |
1668 | if (!uarg->len || sock_flag(sk, flag: SOCK_DEAD)) |
1669 | goto release; |
1670 | |
1671 | len = uarg->len; |
1672 | lo = uarg->id; |
1673 | hi = uarg->id + len - 1; |
1674 | is_zerocopy = uarg->zerocopy; |
1675 | |
1676 | serr = SKB_EXT_ERR(skb); |
1677 | memset(serr, 0, sizeof(*serr)); |
1678 | serr->ee.ee_errno = 0; |
1679 | serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; |
1680 | serr->ee.ee_data = hi; |
1681 | serr->ee.ee_info = lo; |
1682 | if (!is_zerocopy) |
1683 | serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; |
1684 | |
1685 | q = &sk->sk_error_queue; |
1686 | spin_lock_irqsave(&q->lock, flags); |
1687 | tail = skb_peek_tail(list_: q); |
1688 | if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || |
1689 | !skb_zerocopy_notify_extend(skb: tail, lo, len)) { |
1690 | __skb_queue_tail(list: q, newsk: skb); |
1691 | skb = NULL; |
1692 | } |
1693 | spin_unlock_irqrestore(lock: &q->lock, flags); |
1694 | |
1695 | sk_error_report(sk); |
1696 | |
1697 | release: |
1698 | consume_skb(skb); |
1699 | sock_put(sk); |
1700 | } |
1701 | |
1702 | void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, |
1703 | bool success) |
1704 | { |
1705 | struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); |
1706 | |
1707 | uarg_zc->zerocopy = uarg_zc->zerocopy & success; |
1708 | |
1709 | if (refcount_dec_and_test(r: &uarg->refcnt)) |
1710 | __msg_zerocopy_callback(uarg: uarg_zc); |
1711 | } |
1712 | EXPORT_SYMBOL_GPL(msg_zerocopy_callback); |
1713 | |
1714 | void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) |
1715 | { |
1716 | struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk; |
1717 | |
1718 | atomic_dec(v: &sk->sk_zckey); |
1719 | uarg_to_msgzc(uarg)->len--; |
1720 | |
1721 | if (have_uref) |
1722 | msg_zerocopy_callback(NULL, uarg, true); |
1723 | } |
1724 | EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); |
1725 | |
1726 | int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, |
1727 | struct msghdr *msg, int len, |
1728 | struct ubuf_info *uarg) |
1729 | { |
1730 | struct ubuf_info *orig_uarg = skb_zcopy(skb); |
1731 | int err, orig_len = skb->len; |
1732 | |
1733 | /* An skb can only point to one uarg. This edge case happens when |
1734 | * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. |
1735 | */ |
1736 | if (orig_uarg && uarg != orig_uarg) |
1737 | return -EEXIST; |
1738 | |
1739 | err = __zerocopy_sg_from_iter(msg, sk, skb, from: &msg->msg_iter, length: len); |
1740 | if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { |
1741 | struct sock *save_sk = skb->sk; |
1742 | |
1743 | /* Streams do not free skb on error. Reset to prev state. */ |
1744 | iov_iter_revert(i: &msg->msg_iter, bytes: skb->len - orig_len); |
1745 | skb->sk = sk; |
1746 | ___pskb_trim(skb, len: orig_len); |
1747 | skb->sk = save_sk; |
1748 | return err; |
1749 | } |
1750 | |
1751 | skb_zcopy_set(skb, uarg, NULL); |
1752 | return skb->len - orig_len; |
1753 | } |
1754 | EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); |
1755 | |
1756 | void __skb_zcopy_downgrade_managed(struct sk_buff *skb) |
1757 | { |
1758 | int i; |
1759 | |
1760 | skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; |
1761 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) |
1762 | skb_frag_ref(skb, f: i); |
1763 | } |
1764 | EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); |
1765 | |
1766 | static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, |
1767 | gfp_t gfp_mask) |
1768 | { |
1769 | if (skb_zcopy(skb: orig)) { |
1770 | if (skb_zcopy(skb: nskb)) { |
1771 | /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ |
1772 | if (!gfp_mask) { |
1773 | WARN_ON_ONCE(1); |
1774 | return -ENOMEM; |
1775 | } |
1776 | if (skb_uarg(nskb) == skb_uarg(orig)) |
1777 | return 0; |
1778 | if (skb_copy_ubufs(skb: nskb, GFP_ATOMIC)) |
1779 | return -EIO; |
1780 | } |
1781 | skb_zcopy_set(skb: nskb, skb_uarg(orig), NULL); |
1782 | } |
1783 | return 0; |
1784 | } |
1785 | |
1786 | /** |
1787 | * skb_copy_ubufs - copy userspace skb frags buffers to kernel |
1788 | * @skb: the skb to modify |
1789 | * @gfp_mask: allocation priority |
1790 | * |
1791 | * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. |
1792 | * It will copy all frags into kernel and drop the reference |
1793 | * to userspace pages. |
1794 | * |
1795 | * If this function is called from an interrupt gfp_mask() must be |
1796 | * %GFP_ATOMIC. |
1797 | * |
1798 | * Returns 0 on success or a negative error code on failure |
1799 | * to allocate kernel memory to copy to. |
1800 | */ |
1801 | int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) |
1802 | { |
1803 | int num_frags = skb_shinfo(skb)->nr_frags; |
1804 | struct page *page, *head = NULL; |
1805 | int i, order, psize, new_frags; |
1806 | u32 d_off; |
1807 | |
1808 | if (skb_shared(skb) || skb_unclone(skb, pri: gfp_mask)) |
1809 | return -EINVAL; |
1810 | |
1811 | if (!num_frags) |
1812 | goto release; |
1813 | |
1814 | /* We might have to allocate high order pages, so compute what minimum |
1815 | * page order is needed. |
1816 | */ |
1817 | order = 0; |
1818 | while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb)) |
1819 | order++; |
1820 | psize = (PAGE_SIZE << order); |
1821 | |
1822 | new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order); |
1823 | for (i = 0; i < new_frags; i++) { |
1824 | page = alloc_pages(gfp: gfp_mask | __GFP_COMP, order); |
1825 | if (!page) { |
1826 | while (head) { |
1827 | struct page *next = (struct page *)page_private(head); |
1828 | put_page(page: head); |
1829 | head = next; |
1830 | } |
1831 | return -ENOMEM; |
1832 | } |
1833 | set_page_private(page, private: (unsigned long)head); |
1834 | head = page; |
1835 | } |
1836 | |
1837 | page = head; |
1838 | d_off = 0; |
1839 | for (i = 0; i < num_frags; i++) { |
1840 | skb_frag_t *f = &skb_shinfo(skb)->frags[i]; |
1841 | u32 p_off, p_len, copied; |
1842 | struct page *p; |
1843 | u8 *vaddr; |
1844 | |
1845 | skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), |
1846 | p, p_off, p_len, copied) { |
1847 | u32 copy, done = 0; |
1848 | vaddr = kmap_atomic(page: p); |
1849 | |
1850 | while (done < p_len) { |
1851 | if (d_off == psize) { |
1852 | d_off = 0; |
1853 | page = (struct page *)page_private(page); |
1854 | } |
1855 | copy = min_t(u32, psize - d_off, p_len - done); |
1856 | memcpy(page_address(page) + d_off, |
1857 | vaddr + p_off + done, copy); |
1858 | done += copy; |
1859 | d_off += copy; |
1860 | } |
1861 | kunmap_atomic(vaddr); |
1862 | } |
1863 | } |
1864 | |
1865 | /* skb frags release userspace buffers */ |
1866 | for (i = 0; i < num_frags; i++) |
1867 | skb_frag_unref(skb, f: i); |
1868 | |
1869 | /* skb frags point to kernel buffers */ |
1870 | for (i = 0; i < new_frags - 1; i++) { |
1871 | __skb_fill_page_desc(skb, i, page: head, off: 0, size: psize); |
1872 | head = (struct page *)page_private(head); |
1873 | } |
1874 | __skb_fill_page_desc(skb, i: new_frags - 1, page: head, off: 0, size: d_off); |
1875 | skb_shinfo(skb)->nr_frags = new_frags; |
1876 | |
1877 | release: |
1878 | skb_zcopy_clear(skb, zerocopy_success: false); |
1879 | return 0; |
1880 | } |
1881 | EXPORT_SYMBOL_GPL(skb_copy_ubufs); |
1882 | |
1883 | /** |
1884 | * skb_clone - duplicate an sk_buff |
1885 | * @skb: buffer to clone |
1886 | * @gfp_mask: allocation priority |
1887 | * |
1888 | * Duplicate an &sk_buff. The new one is not owned by a socket. Both |
1889 | * copies share the same packet data but not structure. The new |
1890 | * buffer has a reference count of 1. If the allocation fails the |
1891 | * function returns %NULL otherwise the new buffer is returned. |
1892 | * |
1893 | * If this function is called from an interrupt gfp_mask() must be |
1894 | * %GFP_ATOMIC. |
1895 | */ |
1896 | |
1897 | struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) |
1898 | { |
1899 | struct sk_buff_fclones *fclones = container_of(skb, |
1900 | struct sk_buff_fclones, |
1901 | skb1); |
1902 | struct sk_buff *n; |
1903 | |
1904 | if (skb_orphan_frags(skb, gfp_mask)) |
1905 | return NULL; |
1906 | |
1907 | if (skb->fclone == SKB_FCLONE_ORIG && |
1908 | refcount_read(r: &fclones->fclone_ref) == 1) { |
1909 | n = &fclones->skb2; |
1910 | refcount_set(r: &fclones->fclone_ref, n: 2); |
1911 | n->fclone = SKB_FCLONE_CLONE; |
1912 | } else { |
1913 | if (skb_pfmemalloc(skb)) |
1914 | gfp_mask |= __GFP_MEMALLOC; |
1915 | |
1916 | n = kmem_cache_alloc(cachep: skbuff_cache, flags: gfp_mask); |
1917 | if (!n) |
1918 | return NULL; |
1919 | |
1920 | n->fclone = SKB_FCLONE_UNAVAILABLE; |
1921 | } |
1922 | |
1923 | return __skb_clone(n, skb); |
1924 | } |
1925 | EXPORT_SYMBOL(skb_clone); |
1926 | |
1927 | void (struct sk_buff *skb, int off) |
1928 | { |
1929 | /* Only adjust this if it actually is csum_start rather than csum */ |
1930 | if (skb->ip_summed == CHECKSUM_PARTIAL) |
1931 | skb->csum_start += off; |
1932 | /* {transport,network,mac}_header and tail are relative to skb->head */ |
1933 | skb->transport_header += off; |
1934 | skb->network_header += off; |
1935 | if (skb_mac_header_was_set(skb)) |
1936 | skb->mac_header += off; |
1937 | skb->inner_transport_header += off; |
1938 | skb->inner_network_header += off; |
1939 | skb->inner_mac_header += off; |
1940 | } |
1941 | EXPORT_SYMBOL(skb_headers_offset_update); |
1942 | |
1943 | void (struct sk_buff *new, const struct sk_buff *old) |
1944 | { |
1945 | __copy_skb_header(new, old); |
1946 | |
1947 | skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; |
1948 | skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; |
1949 | skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; |
1950 | } |
1951 | EXPORT_SYMBOL(skb_copy_header); |
1952 | |
1953 | static inline int skb_alloc_rx_flag(const struct sk_buff *skb) |
1954 | { |
1955 | if (skb_pfmemalloc(skb)) |
1956 | return SKB_ALLOC_RX; |
1957 | return 0; |
1958 | } |
1959 | |
1960 | /** |
1961 | * skb_copy - create private copy of an sk_buff |
1962 | * @skb: buffer to copy |
1963 | * @gfp_mask: allocation priority |
1964 | * |
1965 | * Make a copy of both an &sk_buff and its data. This is used when the |
1966 | * caller wishes to modify the data and needs a private copy of the |
1967 | * data to alter. Returns %NULL on failure or the pointer to the buffer |
1968 | * on success. The returned buffer has a reference count of 1. |
1969 | * |
1970 | * As by-product this function converts non-linear &sk_buff to linear |
1971 | * one, so that &sk_buff becomes completely private and caller is allowed |
1972 | * to modify all the data of returned buffer. This means that this |
1973 | * function is not recommended for use in circumstances when only |
1974 | * header is going to be modified. Use pskb_copy() instead. |
1975 | */ |
1976 | |
1977 | struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) |
1978 | { |
1979 | int = skb_headroom(skb); |
1980 | unsigned int size = skb_end_offset(skb) + skb->data_len; |
1981 | struct sk_buff *n = __alloc_skb(size, gfp_mask, |
1982 | skb_alloc_rx_flag(skb), NUMA_NO_NODE); |
1983 | |
1984 | if (!n) |
1985 | return NULL; |
1986 | |
1987 | /* Set the data pointer */ |
1988 | skb_reserve(skb: n, len: headerlen); |
1989 | /* Set the tail pointer and length */ |
1990 | skb_put(skb: n, len: skb->len); |
1991 | |
1992 | BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); |
1993 | |
1994 | skb_copy_header(n, skb); |
1995 | return n; |
1996 | } |
1997 | EXPORT_SYMBOL(skb_copy); |
1998 | |
1999 | /** |
2000 | * __pskb_copy_fclone - create copy of an sk_buff with private head. |
2001 | * @skb: buffer to copy |
2002 | * @headroom: headroom of new skb |
2003 | * @gfp_mask: allocation priority |
2004 | * @fclone: if true allocate the copy of the skb from the fclone |
2005 | * cache instead of the head cache; it is recommended to set this |
2006 | * to true for the cases where the copy will likely be cloned |
2007 | * |
2008 | * Make a copy of both an &sk_buff and part of its data, located |
2009 | * in header. Fragmented data remain shared. This is used when |
2010 | * the caller wishes to modify only header of &sk_buff and needs |
2011 | * private copy of the header to alter. Returns %NULL on failure |
2012 | * or the pointer to the buffer on success. |
2013 | * The returned buffer has a reference count of 1. |
2014 | */ |
2015 | |
2016 | struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, |
2017 | gfp_t gfp_mask, bool fclone) |
2018 | { |
2019 | unsigned int size = skb_headlen(skb) + headroom; |
2020 | int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); |
2021 | struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); |
2022 | |
2023 | if (!n) |
2024 | goto out; |
2025 | |
2026 | /* Set the data pointer */ |
2027 | skb_reserve(skb: n, len: headroom); |
2028 | /* Set the tail pointer and length */ |
2029 | skb_put(skb: n, len: skb_headlen(skb)); |
2030 | /* Copy the bytes */ |
2031 | skb_copy_from_linear_data(skb, to: n->data, len: n->len); |
2032 | |
2033 | n->truesize += skb->data_len; |
2034 | n->data_len = skb->data_len; |
2035 | n->len = skb->len; |
2036 | |
2037 | if (skb_shinfo(skb)->nr_frags) { |
2038 | int i; |
2039 | |
2040 | if (skb_orphan_frags(skb, gfp_mask) || |
2041 | skb_zerocopy_clone(nskb: n, orig: skb, gfp_mask)) { |
2042 | kfree_skb(skb: n); |
2043 | n = NULL; |
2044 | goto out; |
2045 | } |
2046 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
2047 | skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; |
2048 | skb_frag_ref(skb, f: i); |
2049 | } |
2050 | skb_shinfo(n)->nr_frags = i; |
2051 | } |
2052 | |
2053 | if (skb_has_frag_list(skb)) { |
2054 | skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; |
2055 | skb_clone_fraglist(skb: n); |
2056 | } |
2057 | |
2058 | skb_copy_header(n, skb); |
2059 | out: |
2060 | return n; |
2061 | } |
2062 | EXPORT_SYMBOL(__pskb_copy_fclone); |
2063 | |
2064 | /** |
2065 | * pskb_expand_head - reallocate header of &sk_buff |
2066 | * @skb: buffer to reallocate |
2067 | * @nhead: room to add at head |
2068 | * @ntail: room to add at tail |
2069 | * @gfp_mask: allocation priority |
2070 | * |
2071 | * Expands (or creates identical copy, if @nhead and @ntail are zero) |
2072 | * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have |
2073 | * reference count of 1. Returns zero in the case of success or error, |
2074 | * if expansion failed. In the last case, &sk_buff is not changed. |
2075 | * |
2076 | * All the pointers pointing into skb header may change and must be |
2077 | * reloaded after call to this function. |
2078 | */ |
2079 | |
2080 | int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, |
2081 | gfp_t gfp_mask) |
2082 | { |
2083 | unsigned int osize = skb_end_offset(skb); |
2084 | unsigned int size = osize + nhead + ntail; |
2085 | long off; |
2086 | u8 *data; |
2087 | int i; |
2088 | |
2089 | BUG_ON(nhead < 0); |
2090 | |
2091 | BUG_ON(skb_shared(skb)); |
2092 | |
2093 | skb_zcopy_downgrade_managed(skb); |
2094 | |
2095 | if (skb_pfmemalloc(skb)) |
2096 | gfp_mask |= __GFP_MEMALLOC; |
2097 | |
2098 | data = kmalloc_reserve(size: &size, flags: gfp_mask, NUMA_NO_NODE, NULL); |
2099 | if (!data) |
2100 | goto nodata; |
2101 | size = SKB_WITH_OVERHEAD(size); |
2102 | |
2103 | /* Copy only real data... and, alas, header. This should be |
2104 | * optimized for the cases when header is void. |
2105 | */ |
2106 | memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); |
2107 | |
2108 | memcpy((struct skb_shared_info *)(data + size), |
2109 | skb_shinfo(skb), |
2110 | offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); |
2111 | |
2112 | /* |
2113 | * if shinfo is shared we must drop the old head gracefully, but if it |
2114 | * is not we can just drop the old head and let the existing refcount |
2115 | * be since all we did is relocate the values |
2116 | */ |
2117 | if (skb_cloned(skb)) { |
2118 | if (skb_orphan_frags(skb, gfp_mask)) |
2119 | goto nofrags; |
2120 | if (skb_zcopy(skb)) |
2121 | refcount_inc(r: &skb_uarg(skb)->refcnt); |
2122 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) |
2123 | skb_frag_ref(skb, f: i); |
2124 | |
2125 | if (skb_has_frag_list(skb)) |
2126 | skb_clone_fraglist(skb); |
2127 | |
2128 | skb_release_data(skb, reason: SKB_CONSUMED, napi_safe: false); |
2129 | } else { |
2130 | skb_free_head(skb, napi_safe: false); |
2131 | } |
2132 | off = (data + nhead) - skb->head; |
2133 | |
2134 | skb->head = data; |
2135 | skb->head_frag = 0; |
2136 | skb->data += off; |
2137 | |
2138 | skb_set_end_offset(skb, offset: size); |
2139 | #ifdef NET_SKBUFF_DATA_USES_OFFSET |
2140 | off = nhead; |
2141 | #endif |
2142 | skb->tail += off; |
2143 | skb_headers_offset_update(skb, nhead); |
2144 | skb->cloned = 0; |
2145 | skb->hdr_len = 0; |
2146 | skb->nohdr = 0; |
2147 | atomic_set(v: &skb_shinfo(skb)->dataref, i: 1); |
2148 | |
2149 | skb_metadata_clear(skb); |
2150 | |
2151 | /* It is not generally safe to change skb->truesize. |
2152 | * For the moment, we really care of rx path, or |
2153 | * when skb is orphaned (not attached to a socket). |
2154 | */ |
2155 | if (!skb->sk || skb->destructor == sock_edemux) |
2156 | skb->truesize += size - osize; |
2157 | |
2158 | return 0; |
2159 | |
2160 | nofrags: |
2161 | skb_kfree_head(head: data, end_offset: size); |
2162 | nodata: |
2163 | return -ENOMEM; |
2164 | } |
2165 | EXPORT_SYMBOL(pskb_expand_head); |
2166 | |
2167 | /* Make private copy of skb with writable head and some headroom */ |
2168 | |
2169 | struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) |
2170 | { |
2171 | struct sk_buff *skb2; |
2172 | int delta = headroom - skb_headroom(skb); |
2173 | |
2174 | if (delta <= 0) |
2175 | skb2 = pskb_copy(skb, GFP_ATOMIC); |
2176 | else { |
2177 | skb2 = skb_clone(skb, GFP_ATOMIC); |
2178 | if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, |
2179 | GFP_ATOMIC)) { |
2180 | kfree_skb(skb: skb2); |
2181 | skb2 = NULL; |
2182 | } |
2183 | } |
2184 | return skb2; |
2185 | } |
2186 | EXPORT_SYMBOL(skb_realloc_headroom); |
2187 | |
2188 | /* Note: We plan to rework this in linux-6.4 */ |
2189 | int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) |
2190 | { |
2191 | unsigned int saved_end_offset, saved_truesize; |
2192 | struct skb_shared_info *shinfo; |
2193 | int res; |
2194 | |
2195 | saved_end_offset = skb_end_offset(skb); |
2196 | saved_truesize = skb->truesize; |
2197 | |
2198 | res = pskb_expand_head(skb, 0, 0, pri); |
2199 | if (res) |
2200 | return res; |
2201 | |
2202 | skb->truesize = saved_truesize; |
2203 | |
2204 | if (likely(skb_end_offset(skb) == saved_end_offset)) |
2205 | return 0; |
2206 | |
2207 | /* We can not change skb->end if the original or new value |
2208 | * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head(). |
2209 | */ |
2210 | if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM || |
2211 | skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) { |
2212 | /* We think this path should not be taken. |
2213 | * Add a temporary trace to warn us just in case. |
2214 | */ |
2215 | pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n" , |
2216 | saved_end_offset, skb_end_offset(skb)); |
2217 | WARN_ON_ONCE(1); |
2218 | return 0; |
2219 | } |
2220 | |
2221 | shinfo = skb_shinfo(skb); |
2222 | |
2223 | /* We are about to change back skb->end, |
2224 | * we need to move skb_shinfo() to its new location. |
2225 | */ |
2226 | memmove(skb->head + saved_end_offset, |
2227 | shinfo, |
2228 | offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); |
2229 | |
2230 | skb_set_end_offset(skb, offset: saved_end_offset); |
2231 | |
2232 | return 0; |
2233 | } |
2234 | |
2235 | /** |
2236 | * skb_expand_head - reallocate header of &sk_buff |
2237 | * @skb: buffer to reallocate |
2238 | * @headroom: needed headroom |
2239 | * |
2240 | * Unlike skb_realloc_headroom, this one does not allocate a new skb |
2241 | * if possible; copies skb->sk to new skb as needed |
2242 | * and frees original skb in case of failures. |
2243 | * |
 *	It expects increased headroom and generates a warning otherwise.
2245 | */ |
2246 | |
2247 | struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) |
2248 | { |
2249 | int delta = headroom - skb_headroom(skb); |
2250 | int osize = skb_end_offset(skb); |
2251 | struct sock *sk = skb->sk; |
2252 | |
2253 | if (WARN_ONCE(delta <= 0, |
2254 | "%s is expecting an increase in the headroom" , __func__)) |
2255 | return skb; |
2256 | |
2257 | delta = SKB_DATA_ALIGN(delta); |
2258 | /* pskb_expand_head() might crash, if skb is shared. */ |
2259 | if (skb_shared(skb) || !is_skb_wmem(skb)) { |
2260 | struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); |
2261 | |
2262 | if (unlikely(!nskb)) |
2263 | goto fail; |
2264 | |
2265 | if (sk) |
2266 | skb_set_owner_w(skb: nskb, sk); |
2267 | consume_skb(skb); |
2268 | skb = nskb; |
2269 | } |
2270 | if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) |
2271 | goto fail; |
2272 | |
2273 | if (sk && is_skb_wmem(skb)) { |
2274 | delta = skb_end_offset(skb) - osize; |
2275 | refcount_add(i: delta, r: &sk->sk_wmem_alloc); |
2276 | skb->truesize += delta; |
2277 | } |
2278 | return skb; |
2279 | |
2280 | fail: |
2281 | kfree_skb(skb); |
2282 | return NULL; |
2283 | } |
2284 | EXPORT_SYMBOL(skb_expand_head); |
2285 | |
2286 | /** |
2287 | * skb_copy_expand - copy and expand sk_buff |
2288 | * @skb: buffer to copy |
2289 | * @newheadroom: new free bytes at head |
2290 | * @newtailroom: new free bytes at tail |
2291 | * @gfp_mask: allocation priority |
2292 | * |
2293 | * Make a copy of both an &sk_buff and its data and while doing so |
2294 | * allocate additional space. |
2295 | * |
2296 | * This is used when the caller wishes to modify the data and needs a |
2297 | * private copy of the data to alter as well as more space for new fields. |
2298 | * Returns %NULL on failure or the pointer to the buffer |
2299 | * on success. The returned buffer has a reference count of 1. |
2300 | * |
2301 | * You must pass %GFP_ATOMIC as the allocation priority if this function |
2302 | * is called from an interrupt. |
2303 | */ |
2304 | struct sk_buff *skb_copy_expand(const struct sk_buff *skb, |
2305 | int newheadroom, int newtailroom, |
2306 | gfp_t gfp_mask) |
2307 | { |
2308 | /* |
2309 | * Allocate the copy buffer |
2310 | */ |
2311 | struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, |
2312 | gfp_mask, skb_alloc_rx_flag(skb), |
2313 | NUMA_NO_NODE); |
2314 | int oldheadroom = skb_headroom(skb); |
2315 | int head_copy_len, head_copy_off; |
2316 | |
2317 | if (!n) |
2318 | return NULL; |
2319 | |
2320 | skb_reserve(skb: n, len: newheadroom); |
2321 | |
2322 | /* Set the tail pointer and length */ |
2323 | skb_put(skb: n, len: skb->len); |
2324 | |
2325 | head_copy_len = oldheadroom; |
2326 | head_copy_off = 0; |
2327 | if (newheadroom <= head_copy_len) |
2328 | head_copy_len = newheadroom; |
2329 | else |
2330 | head_copy_off = newheadroom - head_copy_len; |
2331 | |
2332 | /* Copy the linear header and data. */ |
2333 | BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, |
2334 | skb->len + head_copy_len)); |
2335 | |
2336 | skb_copy_header(n, skb); |
2337 | |
2338 | skb_headers_offset_update(n, newheadroom - oldheadroom); |
2339 | |
2340 | return n; |
2341 | } |
2342 | EXPORT_SYMBOL(skb_copy_expand); |
2343 | |
2344 | /** |
2345 | * __skb_pad - zero pad the tail of an skb |
2346 | * @skb: buffer to pad |
2347 | * @pad: space to pad |
2348 | * @free_on_error: free buffer on error |
2349 | * |
2350 | * Ensure that a buffer is followed by a padding area that is zero |
2351 | * filled. Used by network drivers which may DMA or transfer data |
2352 | * beyond the buffer end onto the wire. |
2353 | * |
2354 | * May return error in out of memory cases. The skb is freed on error |
2355 | * if @free_on_error is true. |
2356 | */ |
2357 | |
2358 | int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) |
2359 | { |
2360 | int err; |
2361 | int ntail; |
2362 | |
2363 | /* If the skbuff is non linear tailroom is always zero.. */ |
2364 | if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { |
2365 | memset(skb->data+skb->len, 0, pad); |
2366 | return 0; |
2367 | } |
2368 | |
2369 | ntail = skb->data_len + pad - (skb->end - skb->tail); |
2370 | if (likely(skb_cloned(skb) || ntail > 0)) { |
2371 | err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); |
2372 | if (unlikely(err)) |
2373 | goto free_skb; |
2374 | } |
2375 | |
2376 | /* FIXME: The use of this function with non-linear skb's really needs |
2377 | * to be audited. |
2378 | */ |
2379 | err = skb_linearize(skb); |
2380 | if (unlikely(err)) |
2381 | goto free_skb; |
2382 | |
2383 | memset(skb->data + skb->len, 0, pad); |
2384 | return 0; |
2385 | |
2386 | free_skb: |
2387 | if (free_on_error) |
2388 | kfree_skb(skb); |
2389 | return err; |
2390 | } |
2391 | EXPORT_SYMBOL(__skb_pad); |
2392 | |
2393 | /** |
2394 | * pskb_put - add data to the tail of a potentially fragmented buffer |
2395 | * @skb: start of the buffer to use |
2396 | * @tail: tail fragment of the buffer to use |
2397 | * @len: amount of data to add |
2398 | * |
2399 | * This function extends the used data area of the potentially |
2400 | * fragmented buffer. @tail must be the last fragment of @skb -- or |
2401 | * @skb itself. If this would exceed the total buffer size the kernel |
2402 | * will panic. A pointer to the first byte of the extra data is |
2403 | * returned. |
2404 | */ |
2405 | |
2406 | void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) |
2407 | { |
2408 | if (tail != skb) { |
2409 | skb->data_len += len; |
2410 | skb->len += len; |
2411 | } |
2412 | return skb_put(skb: tail, len); |
2413 | } |
2414 | EXPORT_SYMBOL_GPL(pskb_put); |
2415 | |
2416 | /** |
2417 | * skb_put - add data to a buffer |
2418 | * @skb: buffer to use |
2419 | * @len: amount of data to add |
2420 | * |
2421 | * This function extends the used data area of the buffer. If this would |
2422 | * exceed the total buffer size the kernel will panic. A pointer to the |
2423 | * first byte of the extra data is returned. |
2424 | */ |
2425 | void *skb_put(struct sk_buff *skb, unsigned int len) |
2426 | { |
2427 | void *tmp = skb_tail_pointer(skb); |
2428 | SKB_LINEAR_ASSERT(skb); |
2429 | skb->tail += len; |
2430 | skb->len += len; |
2431 | if (unlikely(skb->tail > skb->end)) |
2432 | skb_over_panic(skb, sz: len, addr: __builtin_return_address(0)); |
2433 | return tmp; |
2434 | } |
2435 | EXPORT_SYMBOL(skb_put); |
2436 | |
2437 | /** |
2438 | * skb_push - add data to the start of a buffer |
2439 | * @skb: buffer to use |
2440 | * @len: amount of data to add |
2441 | * |
2442 | * This function extends the used data area of the buffer at the buffer |
2443 | * start. If this would exceed the total buffer headroom the kernel will |
2444 | * panic. A pointer to the first byte of the extra data is returned. |
2445 | */ |
2446 | void *skb_push(struct sk_buff *skb, unsigned int len) |
2447 | { |
2448 | skb->data -= len; |
2449 | skb->len += len; |
2450 | if (unlikely(skb->data < skb->head)) |
2451 | skb_under_panic(skb, sz: len, addr: __builtin_return_address(0)); |
2452 | return skb->data; |
2453 | } |
2454 | EXPORT_SYMBOL(skb_push); |
2455 | |
2456 | /** |
2457 | * skb_pull - remove data from the start of a buffer |
2458 | * @skb: buffer to use |
2459 | * @len: amount of data to remove |
2460 | * |
2461 | * This function removes data from the start of a buffer, returning |
2462 | * the memory to the headroom. A pointer to the next data in the buffer |
2463 | * is returned. Once the data has been pulled future pushes will overwrite |
2464 | * the old data. |
2465 | */ |
void *skb_pull(struct sk_buff *skb, unsigned int len)
{
	/* Out-of-line entry point; the work is done by skb_pull_inline(). */
	void *next_data = skb_pull_inline(skb, len);

	return next_data;
}
2470 | EXPORT_SYMBOL(skb_pull); |
2471 | |
2472 | /** |
2473 | * skb_pull_data - remove data from the start of a buffer returning its |
2474 | * original position. |
2475 | * @skb: buffer to use |
2476 | * @len: amount of data to remove |
2477 | * |
2478 | * This function removes data from the start of a buffer, returning |
2479 | * the memory to the headroom. A pointer to the original data in the buffer |
2480 | * is returned after checking if there is enough data to pull. Once the |
2481 | * data has been pulled future pushes will overwrite the old data. |
2482 | */ |
2483 | void *skb_pull_data(struct sk_buff *skb, size_t len) |
2484 | { |
2485 | void *data = skb->data; |
2486 | |
2487 | if (skb->len < len) |
2488 | return NULL; |
2489 | |
2490 | skb_pull(skb, len); |
2491 | |
2492 | return data; |
2493 | } |
2494 | EXPORT_SYMBOL(skb_pull_data); |
2495 | |
2496 | /** |
2497 | * skb_trim - remove end from a buffer |
2498 | * @skb: buffer to alter |
2499 | * @len: new length |
2500 | * |
2501 | * Cut the length of a buffer down by removing data from the tail. If |
2502 | * the buffer is already under the length specified it is not modified. |
2503 | * The skb must be linear. |
2504 | */ |
2505 | void skb_trim(struct sk_buff *skb, unsigned int len) |
2506 | { |
2507 | if (skb->len > len) |
2508 | __skb_trim(skb, len); |
2509 | } |
2510 | EXPORT_SYMBOL(skb_trim); |
2511 | |
2512 | /* Trims skb to length len. It can change skb pointers. |
2513 | */ |
2514 | |
2515 | int ___pskb_trim(struct sk_buff *skb, unsigned int len) |
2516 | { |
2517 | struct sk_buff **fragp; |
2518 | struct sk_buff *frag; |
2519 | int offset = skb_headlen(skb); |
2520 | int nfrags = skb_shinfo(skb)->nr_frags; |
2521 | int i; |
2522 | int err; |
2523 | |
2524 | if (skb_cloned(skb) && |
2525 | unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) |
2526 | return err; |
2527 | |
2528 | i = 0; |
2529 | if (offset >= len) |
2530 | goto drop_pages; |
2531 | |
2532 | for (; i < nfrags; i++) { |
2533 | int end = offset + skb_frag_size(frag: &skb_shinfo(skb)->frags[i]); |
2534 | |
2535 | if (end < len) { |
2536 | offset = end; |
2537 | continue; |
2538 | } |
2539 | |
2540 | skb_frag_size_set(frag: &skb_shinfo(skb)->frags[i++], size: len - offset); |
2541 | |
2542 | drop_pages: |
2543 | skb_shinfo(skb)->nr_frags = i; |
2544 | |
2545 | for (; i < nfrags; i++) |
2546 | skb_frag_unref(skb, f: i); |
2547 | |
2548 | if (skb_has_frag_list(skb)) |
2549 | skb_drop_fraglist(skb); |
2550 | goto done; |
2551 | } |
2552 | |
2553 | for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); |
2554 | fragp = &frag->next) { |
2555 | int end = offset + frag->len; |
2556 | |
2557 | if (skb_shared(skb: frag)) { |
2558 | struct sk_buff *nfrag; |
2559 | |
2560 | nfrag = skb_clone(frag, GFP_ATOMIC); |
2561 | if (unlikely(!nfrag)) |
2562 | return -ENOMEM; |
2563 | |
2564 | nfrag->next = frag->next; |
2565 | consume_skb(frag); |
2566 | frag = nfrag; |
2567 | *fragp = frag; |
2568 | } |
2569 | |
2570 | if (end < len) { |
2571 | offset = end; |
2572 | continue; |
2573 | } |
2574 | |
2575 | if (end > len && |
2576 | unlikely((err = pskb_trim(frag, len - offset)))) |
2577 | return err; |
2578 | |
2579 | if (frag->next) |
2580 | skb_drop_list(listp: &frag->next); |
2581 | break; |
2582 | } |
2583 | |
2584 | done: |
2585 | if (len > skb_headlen(skb)) { |
2586 | skb->data_len -= skb->len - len; |
2587 | skb->len = len; |
2588 | } else { |
2589 | skb->len = len; |
2590 | skb->data_len = 0; |
2591 | skb_set_tail_pointer(skb, offset: len); |
2592 | } |
2593 | |
2594 | if (!skb->sk || skb->destructor == sock_edemux) |
2595 | skb_condense(skb); |
2596 | return 0; |
2597 | } |
2598 | EXPORT_SYMBOL(___pskb_trim); |
2599 | |
2600 | /* Note : use pskb_trim_rcsum() instead of calling this directly |
2601 | */ |
2602 | int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) |
2603 | { |
2604 | if (skb->ip_summed == CHECKSUM_COMPLETE) { |
2605 | int delta = skb->len - len; |
2606 | |
2607 | skb->csum = csum_block_sub(csum: skb->csum, |
2608 | csum2: skb_checksum(skb, offset: len, len: delta, csum: 0), |
2609 | offset: len); |
2610 | } else if (skb->ip_summed == CHECKSUM_PARTIAL) { |
2611 | int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; |
2612 | int offset = skb_checksum_start_offset(skb) + skb->csum_offset; |
2613 | |
2614 | if (offset + sizeof(__sum16) > hdlen) |
2615 | return -EINVAL; |
2616 | } |
2617 | return __pskb_trim(skb, len); |
2618 | } |
2619 | EXPORT_SYMBOL(pskb_trim_rcsum_slow); |
2620 | |
2621 | /** |
2622 | * __pskb_pull_tail - advance tail of skb header |
2623 | * @skb: buffer to reallocate |
2624 | * @delta: number of bytes to advance tail |
2625 | * |
 *	The function makes sense only on a fragmented &sk_buff,
2627 | * it expands header moving its tail forward and copying necessary |
2628 | * data from fragmented part. |
2629 | * |
2630 | * &sk_buff MUST have reference count of 1. |
2631 | * |
2632 | * Returns %NULL (and &sk_buff does not change) if pull failed |
2633 | * or value of new tail of skb in the case of success. |
2634 | * |
2635 | * All the pointers pointing into skb header may change and must be |
2636 | * reloaded after call to this function. |
2637 | */ |
2638 | |
2639 | /* Moves tail of skb head forward, copying data from fragmented part, |
2640 | * when it is necessary. |
2641 | * 1. It may fail due to malloc failure. |
2642 | * 2. It may change skb pointers. |
2643 | * |
2644 | * It is pretty complicated. Luckily, it is called only in exceptional cases. |
2645 | */ |
2646 | void *__pskb_pull_tail(struct sk_buff *skb, int delta) |
2647 | { |
2648 | /* If skb has not enough free space at tail, get new one |
2649 | * plus 128 bytes for future expansions. If we have enough |
2650 | * room at tail, reallocate without expansion only if skb is cloned. |
2651 | */ |
2652 | int i, k, eat = (skb->tail + delta) - skb->end; |
2653 | |
2654 | if (eat > 0 || skb_cloned(skb)) { |
2655 | if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, |
2656 | GFP_ATOMIC)) |
2657 | return NULL; |
2658 | } |
2659 | |
2660 | BUG_ON(skb_copy_bits(skb, skb_headlen(skb), |
2661 | skb_tail_pointer(skb), delta)); |
2662 | |
2663 | /* Optimization: no fragments, no reasons to preestimate |
2664 | * size of pulled pages. Superb. |
2665 | */ |
2666 | if (!skb_has_frag_list(skb)) |
2667 | goto pull_pages; |
2668 | |
2669 | /* Estimate size of pulled pages. */ |
2670 | eat = delta; |
2671 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
2672 | int size = skb_frag_size(frag: &skb_shinfo(skb)->frags[i]); |
2673 | |
2674 | if (size >= eat) |
2675 | goto pull_pages; |
2676 | eat -= size; |
2677 | } |
2678 | |
2679 | /* If we need update frag list, we are in troubles. |
2680 | * Certainly, it is possible to add an offset to skb data, |
2681 | * but taking into account that pulling is expected to |
2682 | * be very rare operation, it is worth to fight against |
2683 | * further bloating skb head and crucify ourselves here instead. |
2684 | * Pure masohism, indeed. 8)8) |
2685 | */ |
2686 | if (eat) { |
2687 | struct sk_buff *list = skb_shinfo(skb)->frag_list; |
2688 | struct sk_buff *clone = NULL; |
2689 | struct sk_buff *insp = NULL; |
2690 | |
2691 | do { |
2692 | if (list->len <= eat) { |
2693 | /* Eaten as whole. */ |
2694 | eat -= list->len; |
2695 | list = list->next; |
2696 | insp = list; |
2697 | } else { |
2698 | /* Eaten partially. */ |
2699 | if (skb_is_gso(skb) && !list->head_frag && |
2700 | skb_headlen(skb: list)) |
2701 | skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; |
2702 | |
2703 | if (skb_shared(skb: list)) { |
2704 | /* Sucks! We need to fork list. :-( */ |
2705 | clone = skb_clone(list, GFP_ATOMIC); |
2706 | if (!clone) |
2707 | return NULL; |
2708 | insp = list->next; |
2709 | list = clone; |
2710 | } else { |
2711 | /* This may be pulled without |
2712 | * problems. */ |
2713 | insp = list; |
2714 | } |
2715 | if (!pskb_pull(skb: list, len: eat)) { |
2716 | kfree_skb(skb: clone); |
2717 | return NULL; |
2718 | } |
2719 | break; |
2720 | } |
2721 | } while (eat); |
2722 | |
2723 | /* Free pulled out fragments. */ |
2724 | while ((list = skb_shinfo(skb)->frag_list) != insp) { |
2725 | skb_shinfo(skb)->frag_list = list->next; |
2726 | consume_skb(list); |
2727 | } |
2728 | /* And insert new clone at head. */ |
2729 | if (clone) { |
2730 | clone->next = list; |
2731 | skb_shinfo(skb)->frag_list = clone; |
2732 | } |
2733 | } |
2734 | /* Success! Now we may commit changes to skb data. */ |
2735 | |
2736 | pull_pages: |
2737 | eat = delta; |
2738 | k = 0; |
2739 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
2740 | int size = skb_frag_size(frag: &skb_shinfo(skb)->frags[i]); |
2741 | |
2742 | if (size <= eat) { |
2743 | skb_frag_unref(skb, f: i); |
2744 | eat -= size; |
2745 | } else { |
2746 | skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; |
2747 | |
2748 | *frag = skb_shinfo(skb)->frags[i]; |
2749 | if (eat) { |
2750 | skb_frag_off_add(frag, delta: eat); |
2751 | skb_frag_size_sub(frag, delta: eat); |
2752 | if (!i) |
2753 | goto end; |
2754 | eat = 0; |
2755 | } |
2756 | k++; |
2757 | } |
2758 | } |
2759 | skb_shinfo(skb)->nr_frags = k; |
2760 | |
2761 | end: |
2762 | skb->tail += delta; |
2763 | skb->data_len -= delta; |
2764 | |
2765 | if (!skb->data_len) |
2766 | skb_zcopy_clear(skb, zerocopy_success: false); |
2767 | |
2768 | return skb_tail_pointer(skb); |
2769 | } |
2770 | EXPORT_SYMBOL(__pskb_pull_tail); |
2771 | |
2772 | /** |
2773 | * skb_copy_bits - copy bits from skb to kernel buffer |
2774 | * @skb: source skb |
2775 | * @offset: offset in source |
2776 | * @to: destination buffer |
2777 | * @len: number of bytes to copy |
2778 | * |
2779 | * Copy the specified number of bytes from the source skb to the |
2780 | * destination buffer. |
2781 | * |
2782 | * CAUTION ! : |
2783 | * If its prototype is ever changed, |
2784 | * check arch/{*}/net/{*}.S files, |
2785 | * since it is called from BPF assembly code. |
2786 | */ |
2787 | int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) |
2788 | { |
2789 | int start = skb_headlen(skb); |
2790 | struct sk_buff *frag_iter; |
2791 | int i, copy; |
2792 | |
2793 | if (offset > (int)skb->len - len) |
2794 | goto fault; |
2795 | |
2796 | /* Copy header. */ |
2797 | if ((copy = start - offset) > 0) { |
2798 | if (copy > len) |
2799 | copy = len; |
2800 | skb_copy_from_linear_data_offset(skb, offset, to, len: copy); |
2801 | if ((len -= copy) == 0) |
2802 | return 0; |
2803 | offset += copy; |
2804 | to += copy; |
2805 | } |
2806 | |
2807 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
2808 | int end; |
2809 | skb_frag_t *f = &skb_shinfo(skb)->frags[i]; |
2810 | |
2811 | WARN_ON(start > offset + len); |
2812 | |
2813 | end = start + skb_frag_size(frag: f); |
2814 | if ((copy = end - offset) > 0) { |
2815 | u32 p_off, p_len, copied; |
2816 | struct page *p; |
2817 | u8 *vaddr; |
2818 | |
2819 | if (copy > len) |
2820 | copy = len; |
2821 | |
2822 | skb_frag_foreach_page(f, |
2823 | skb_frag_off(f) + offset - start, |
2824 | copy, p, p_off, p_len, copied) { |
2825 | vaddr = kmap_atomic(page: p); |
2826 | memcpy(to + copied, vaddr + p_off, p_len); |
2827 | kunmap_atomic(vaddr); |
2828 | } |
2829 | |
2830 | if ((len -= copy) == 0) |
2831 | return 0; |
2832 | offset += copy; |
2833 | to += copy; |
2834 | } |
2835 | start = end; |
2836 | } |
2837 | |
2838 | skb_walk_frags(skb, frag_iter) { |
2839 | int end; |
2840 | |
2841 | WARN_ON(start > offset + len); |
2842 | |
2843 | end = start + frag_iter->len; |
2844 | if ((copy = end - offset) > 0) { |
2845 | if (copy > len) |
2846 | copy = len; |
2847 | if (skb_copy_bits(skb: frag_iter, offset: offset - start, to, len: copy)) |
2848 | goto fault; |
2849 | if ((len -= copy) == 0) |
2850 | return 0; |
2851 | offset += copy; |
2852 | to += copy; |
2853 | } |
2854 | start = end; |
2855 | } |
2856 | |
2857 | if (!len) |
2858 | return 0; |
2859 | |
2860 | fault: |
2861 | return -EFAULT; |
2862 | } |
2863 | EXPORT_SYMBOL(skb_copy_bits); |
2864 | |
2865 | /* |
2866 | * Callback from splice_to_pipe(), if we need to release some pages |
2867 | * at the end of the spd in case we error'ed out in filling the pipe. |
2868 | */ |
2869 | static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) |
2870 | { |
2871 | put_page(page: spd->pages[i]); |
2872 | } |
2873 | |
2874 | static struct page *linear_to_page(struct page *page, unsigned int *len, |
2875 | unsigned int *offset, |
2876 | struct sock *sk) |
2877 | { |
2878 | struct page_frag *pfrag = sk_page_frag(sk); |
2879 | |
2880 | if (!sk_page_frag_refill(sk, pfrag)) |
2881 | return NULL; |
2882 | |
2883 | *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); |
2884 | |
2885 | memcpy(page_address(pfrag->page) + pfrag->offset, |
2886 | page_address(page) + *offset, *len); |
2887 | *offset = pfrag->offset; |
2888 | pfrag->offset += *len; |
2889 | |
2890 | return pfrag->page; |
2891 | } |
2892 | |
2893 | static bool spd_can_coalesce(const struct splice_pipe_desc *spd, |
2894 | struct page *page, |
2895 | unsigned int offset) |
2896 | { |
2897 | return spd->nr_pages && |
2898 | spd->pages[spd->nr_pages - 1] == page && |
2899 | (spd->partial[spd->nr_pages - 1].offset + |
2900 | spd->partial[spd->nr_pages - 1].len == offset); |
2901 | } |
2902 | |
2903 | /* |
2904 | * Fill page/offset/length into spd, if it can hold more pages. |
2905 | */ |
2906 | static bool spd_fill_page(struct splice_pipe_desc *spd, |
2907 | struct pipe_inode_info *pipe, struct page *page, |
2908 | unsigned int *len, unsigned int offset, |
2909 | bool linear, |
2910 | struct sock *sk) |
2911 | { |
2912 | if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) |
2913 | return true; |
2914 | |
2915 | if (linear) { |
2916 | page = linear_to_page(page, len, offset: &offset, sk); |
2917 | if (!page) |
2918 | return true; |
2919 | } |
2920 | if (spd_can_coalesce(spd, page, offset)) { |
2921 | spd->partial[spd->nr_pages - 1].len += *len; |
2922 | return false; |
2923 | } |
2924 | get_page(page); |
2925 | spd->pages[spd->nr_pages] = page; |
2926 | spd->partial[spd->nr_pages].len = *len; |
2927 | spd->partial[spd->nr_pages].offset = offset; |
2928 | spd->nr_pages++; |
2929 | |
2930 | return false; |
2931 | } |
2932 | |
2933 | static bool __splice_segment(struct page *page, unsigned int poff, |
2934 | unsigned int plen, unsigned int *off, |
2935 | unsigned int *len, |
2936 | struct splice_pipe_desc *spd, bool linear, |
2937 | struct sock *sk, |
2938 | struct pipe_inode_info *pipe) |
2939 | { |
2940 | if (!*len) |
2941 | return true; |
2942 | |
2943 | /* skip this segment if already processed */ |
2944 | if (*off >= plen) { |
2945 | *off -= plen; |
2946 | return false; |
2947 | } |
2948 | |
2949 | /* ignore any bits we already processed */ |
2950 | poff += *off; |
2951 | plen -= *off; |
2952 | *off = 0; |
2953 | |
2954 | do { |
2955 | unsigned int flen = min(*len, plen); |
2956 | |
2957 | if (spd_fill_page(spd, pipe, page, len: &flen, offset: poff, |
2958 | linear, sk)) |
2959 | return true; |
2960 | poff += flen; |
2961 | plen -= flen; |
2962 | *len -= flen; |
2963 | } while (*len && plen); |
2964 | |
2965 | return false; |
2966 | } |
2967 | |
2968 | /* |
2969 | * Map linear and fragment data from the skb to spd. It reports true if the |
2970 | * pipe is full or if we already spliced the requested length. |
2971 | */ |
2972 | static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, |
2973 | unsigned int *offset, unsigned int *len, |
2974 | struct splice_pipe_desc *spd, struct sock *sk) |
2975 | { |
2976 | int seg; |
2977 | struct sk_buff *iter; |
2978 | |
2979 | /* map the linear part : |
2980 | * If skb->head_frag is set, this 'linear' part is backed by a |
2981 | * fragment, and if the head is not shared with any clones then |
2982 | * we can avoid a copy since we own the head portion of this page. |
2983 | */ |
2984 | if (__splice_segment(virt_to_page(skb->data), |
2985 | poff: (unsigned long) skb->data & (PAGE_SIZE - 1), |
2986 | plen: skb_headlen(skb), |
2987 | off: offset, len, spd, |
2988 | linear: skb_head_is_locked(skb), |
2989 | sk, pipe)) |
2990 | return true; |
2991 | |
2992 | /* |
2993 | * then map the fragments |
2994 | */ |
2995 | for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { |
2996 | const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; |
2997 | |
2998 | if (__splice_segment(page: skb_frag_page(frag: f), |
2999 | poff: skb_frag_off(frag: f), plen: skb_frag_size(frag: f), |
3000 | off: offset, len, spd, linear: false, sk, pipe)) |
3001 | return true; |
3002 | } |
3003 | |
3004 | skb_walk_frags(skb, iter) { |
3005 | if (*offset >= iter->len) { |
3006 | *offset -= iter->len; |
3007 | continue; |
3008 | } |
3009 | /* __skb_splice_bits() only fails if the output has no room |
3010 | * left, so no point in going over the frag_list for the error |
3011 | * case. |
3012 | */ |
3013 | if (__skb_splice_bits(skb: iter, pipe, offset, len, spd, sk)) |
3014 | return true; |
3015 | } |
3016 | |
3017 | return false; |
3018 | } |
3019 | |
3020 | /* |
3021 | * Map data from the skb to a pipe. Should handle both the linear part, |
3022 | * the fragments, and the frag list. |
3023 | */ |
3024 | int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, |
3025 | struct pipe_inode_info *pipe, unsigned int tlen, |
3026 | unsigned int flags) |
3027 | { |
3028 | struct partial_page partial[MAX_SKB_FRAGS]; |
3029 | struct page *pages[MAX_SKB_FRAGS]; |
3030 | struct splice_pipe_desc spd = { |
3031 | .pages = pages, |
3032 | .partial = partial, |
3033 | .nr_pages_max = MAX_SKB_FRAGS, |
3034 | .ops = &nosteal_pipe_buf_ops, |
3035 | .spd_release = sock_spd_release, |
3036 | }; |
3037 | int ret = 0; |
3038 | |
3039 | __skb_splice_bits(skb, pipe, offset: &offset, len: &tlen, spd: &spd, sk); |
3040 | |
3041 | if (spd.nr_pages) |
3042 | ret = splice_to_pipe(pipe, &spd); |
3043 | |
3044 | return ret; |
3045 | } |
3046 | EXPORT_SYMBOL_GPL(skb_splice_bits); |
3047 | |
3048 | static int sendmsg_locked(struct sock *sk, struct msghdr *msg) |
3049 | { |
3050 | struct socket *sock = sk->sk_socket; |
3051 | size_t size = msg_data_left(msg); |
3052 | |
3053 | if (!sock) |
3054 | return -EINVAL; |
3055 | |
3056 | if (!sock->ops->sendmsg_locked) |
3057 | return sock_no_sendmsg_locked(sk, msg, len: size); |
3058 | |
3059 | return sock->ops->sendmsg_locked(sk, msg, size); |
3060 | } |
3061 | |
3062 | static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) |
3063 | { |
3064 | struct socket *sock = sk->sk_socket; |
3065 | |
3066 | if (!sock) |
3067 | return -EINVAL; |
3068 | return sock_sendmsg(sock, msg); |
3069 | } |
3070 | |
3071 | typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); |
3072 | static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, |
3073 | int len, sendmsg_func sendmsg) |
3074 | { |
3075 | unsigned int orig_len = len; |
3076 | struct sk_buff *head = skb; |
3077 | unsigned short fragidx; |
3078 | int slen, ret; |
3079 | |
3080 | do_frag_list: |
3081 | |
3082 | /* Deal with head data */ |
3083 | while (offset < skb_headlen(skb) && len) { |
3084 | struct kvec kv; |
3085 | struct msghdr msg; |
3086 | |
3087 | slen = min_t(int, len, skb_headlen(skb) - offset); |
3088 | kv.iov_base = skb->data + offset; |
3089 | kv.iov_len = slen; |
3090 | memset(&msg, 0, sizeof(msg)); |
3091 | msg.msg_flags = MSG_DONTWAIT; |
3092 | |
3093 | iov_iter_kvec(i: &msg.msg_iter, ITER_SOURCE, kvec: &kv, nr_segs: 1, count: slen); |
3094 | ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, |
3095 | sendmsg_unlocked, sk, &msg); |
3096 | if (ret <= 0) |
3097 | goto error; |
3098 | |
3099 | offset += ret; |
3100 | len -= ret; |
3101 | } |
3102 | |
3103 | /* All the data was skb head? */ |
3104 | if (!len) |
3105 | goto out; |
3106 | |
3107 | /* Make offset relative to start of frags */ |
3108 | offset -= skb_headlen(skb); |
3109 | |
3110 | /* Find where we are in frag list */ |
3111 | for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { |
3112 | skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; |
3113 | |
3114 | if (offset < skb_frag_size(frag)) |
3115 | break; |
3116 | |
3117 | offset -= skb_frag_size(frag); |
3118 | } |
3119 | |
3120 | for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { |
3121 | skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; |
3122 | |
3123 | slen = min_t(size_t, len, skb_frag_size(frag) - offset); |
3124 | |
3125 | while (slen) { |
3126 | struct bio_vec bvec; |
3127 | struct msghdr msg = { |
3128 | .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT, |
3129 | }; |
3130 | |
3131 | bvec_set_page(bv: &bvec, page: skb_frag_page(frag), len: slen, |
3132 | offset: skb_frag_off(frag) + offset); |
3133 | iov_iter_bvec(i: &msg.msg_iter, ITER_SOURCE, bvec: &bvec, nr_segs: 1, |
3134 | count: slen); |
3135 | |
3136 | ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, |
3137 | sendmsg_unlocked, sk, &msg); |
3138 | if (ret <= 0) |
3139 | goto error; |
3140 | |
3141 | len -= ret; |
3142 | offset += ret; |
3143 | slen -= ret; |
3144 | } |
3145 | |
3146 | offset = 0; |
3147 | } |
3148 | |
3149 | if (len) { |
3150 | /* Process any frag lists */ |
3151 | |
3152 | if (skb == head) { |
3153 | if (skb_has_frag_list(skb)) { |
3154 | skb = skb_shinfo(skb)->frag_list; |
3155 | goto do_frag_list; |
3156 | } |
3157 | } else if (skb->next) { |
3158 | skb = skb->next; |
3159 | goto do_frag_list; |
3160 | } |
3161 | } |
3162 | |
3163 | out: |
3164 | return orig_len - len; |
3165 | |
3166 | error: |
3167 | return orig_len == len ? ret : orig_len - len; |
3168 | } |
3169 | |
3170 | /* Send skb data on a socket. Socket must be locked. */ |
3171 | int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, |
3172 | int len) |
3173 | { |
3174 | return __skb_send_sock(sk, skb, offset, len, sendmsg: sendmsg_locked); |
3175 | } |
3176 | EXPORT_SYMBOL_GPL(skb_send_sock_locked); |
3177 | |
3178 | /* Send skb data on a socket. Socket must be unlocked. */ |
3179 | int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) |
3180 | { |
3181 | return __skb_send_sock(sk, skb, offset, len, sendmsg: sendmsg_unlocked); |
3182 | } |
3183 | |
3184 | /** |
3185 | * skb_store_bits - store bits from kernel buffer to skb |
3186 | * @skb: destination buffer |
3187 | * @offset: offset in destination |
3188 | * @from: source buffer |
3189 | * @len: number of bytes to copy |
3190 | * |
3191 | * Copy the specified number of bytes from the source buffer to the |
3192 | * destination skb. This function handles all the messy bits of |
3193 | * traversing fragment lists and such. |
3194 | */ |
3195 | |
3196 | int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) |
3197 | { |
3198 | int start = skb_headlen(skb); |
3199 | struct sk_buff *frag_iter; |
3200 | int i, copy; |
3201 | |
3202 | if (offset > (int)skb->len - len) |
3203 | goto fault; |
3204 | |
3205 | if ((copy = start - offset) > 0) { |
3206 | if (copy > len) |
3207 | copy = len; |
3208 | skb_copy_to_linear_data_offset(skb, offset, from, len: copy); |
3209 | if ((len -= copy) == 0) |
3210 | return 0; |
3211 | offset += copy; |
3212 | from += copy; |
3213 | } |
3214 | |
3215 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
3216 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
3217 | int end; |
3218 | |
3219 | WARN_ON(start > offset + len); |
3220 | |
3221 | end = start + skb_frag_size(frag); |
3222 | if ((copy = end - offset) > 0) { |
3223 | u32 p_off, p_len, copied; |
3224 | struct page *p; |
3225 | u8 *vaddr; |
3226 | |
3227 | if (copy > len) |
3228 | copy = len; |
3229 | |
3230 | skb_frag_foreach_page(frag, |
3231 | skb_frag_off(frag) + offset - start, |
3232 | copy, p, p_off, p_len, copied) { |
3233 | vaddr = kmap_atomic(page: p); |
3234 | memcpy(vaddr + p_off, from + copied, p_len); |
3235 | kunmap_atomic(vaddr); |
3236 | } |
3237 | |
3238 | if ((len -= copy) == 0) |
3239 | return 0; |
3240 | offset += copy; |
3241 | from += copy; |
3242 | } |
3243 | start = end; |
3244 | } |
3245 | |
3246 | skb_walk_frags(skb, frag_iter) { |
3247 | int end; |
3248 | |
3249 | WARN_ON(start > offset + len); |
3250 | |
3251 | end = start + frag_iter->len; |
3252 | if ((copy = end - offset) > 0) { |
3253 | if (copy > len) |
3254 | copy = len; |
3255 | if (skb_store_bits(skb: frag_iter, offset: offset - start, |
3256 | from, len: copy)) |
3257 | goto fault; |
3258 | if ((len -= copy) == 0) |
3259 | return 0; |
3260 | offset += copy; |
3261 | from += copy; |
3262 | } |
3263 | start = end; |
3264 | } |
3265 | if (!len) |
3266 | return 0; |
3267 | |
3268 | fault: |
3269 | return -EFAULT; |
3270 | } |
3271 | EXPORT_SYMBOL(skb_store_bits); |
3272 | |
3273 | /* Checksum skb data. */ |
3274 | __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, |
3275 | __wsum csum, const struct skb_checksum_ops *ops) |
3276 | { |
3277 | int start = skb_headlen(skb); |
3278 | int i, copy = start - offset; |
3279 | struct sk_buff *frag_iter; |
3280 | int pos = 0; |
3281 | |
3282 | /* Checksum header. */ |
3283 | if (copy > 0) { |
3284 | if (copy > len) |
3285 | copy = len; |
3286 | csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, |
3287 | skb->data + offset, copy, csum); |
3288 | if ((len -= copy) == 0) |
3289 | return csum; |
3290 | offset += copy; |
3291 | pos = copy; |
3292 | } |
3293 | |
3294 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
3295 | int end; |
3296 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
3297 | |
3298 | WARN_ON(start > offset + len); |
3299 | |
3300 | end = start + skb_frag_size(frag); |
3301 | if ((copy = end - offset) > 0) { |
3302 | u32 p_off, p_len, copied; |
3303 | struct page *p; |
3304 | __wsum csum2; |
3305 | u8 *vaddr; |
3306 | |
3307 | if (copy > len) |
3308 | copy = len; |
3309 | |
3310 | skb_frag_foreach_page(frag, |
3311 | skb_frag_off(frag) + offset - start, |
3312 | copy, p, p_off, p_len, copied) { |
3313 | vaddr = kmap_atomic(page: p); |
3314 | csum2 = INDIRECT_CALL_1(ops->update, |
3315 | csum_partial_ext, |
3316 | vaddr + p_off, p_len, 0); |
3317 | kunmap_atomic(vaddr); |
3318 | csum = INDIRECT_CALL_1(ops->combine, |
3319 | csum_block_add_ext, csum, |
3320 | csum2, pos, p_len); |
3321 | pos += p_len; |
3322 | } |
3323 | |
3324 | if (!(len -= copy)) |
3325 | return csum; |
3326 | offset += copy; |
3327 | } |
3328 | start = end; |
3329 | } |
3330 | |
3331 | skb_walk_frags(skb, frag_iter) { |
3332 | int end; |
3333 | |
3334 | WARN_ON(start > offset + len); |
3335 | |
3336 | end = start + frag_iter->len; |
3337 | if ((copy = end - offset) > 0) { |
3338 | __wsum csum2; |
3339 | if (copy > len) |
3340 | copy = len; |
3341 | csum2 = __skb_checksum(skb: frag_iter, offset: offset - start, |
3342 | len: copy, csum: 0, ops); |
3343 | csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, |
3344 | csum, csum2, pos, copy); |
3345 | if ((len -= copy) == 0) |
3346 | return csum; |
3347 | offset += copy; |
3348 | pos += copy; |
3349 | } |
3350 | start = end; |
3351 | } |
3352 | BUG_ON(len); |
3353 | |
3354 | return csum; |
3355 | } |
3356 | EXPORT_SYMBOL(__skb_checksum); |
3357 | |
3358 | __wsum skb_checksum(const struct sk_buff *skb, int offset, |
3359 | int len, __wsum csum) |
3360 | { |
3361 | const struct skb_checksum_ops ops = { |
3362 | .update = csum_partial_ext, |
3363 | .combine = csum_block_add_ext, |
3364 | }; |
3365 | |
3366 | return __skb_checksum(skb, offset, len, csum, &ops); |
3367 | } |
3368 | EXPORT_SYMBOL(skb_checksum); |
3369 | |
3370 | /* Both of above in one bottle. */ |
3371 | |
3372 | __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, |
3373 | u8 *to, int len) |
3374 | { |
3375 | int start = skb_headlen(skb); |
3376 | int i, copy = start - offset; |
3377 | struct sk_buff *frag_iter; |
3378 | int pos = 0; |
3379 | __wsum csum = 0; |
3380 | |
3381 | /* Copy header. */ |
3382 | if (copy > 0) { |
3383 | if (copy > len) |
3384 | copy = len; |
3385 | csum = csum_partial_copy_nocheck(src: skb->data + offset, dst: to, |
3386 | len: copy); |
3387 | if ((len -= copy) == 0) |
3388 | return csum; |
3389 | offset += copy; |
3390 | to += copy; |
3391 | pos = copy; |
3392 | } |
3393 | |
3394 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
3395 | int end; |
3396 | |
3397 | WARN_ON(start > offset + len); |
3398 | |
3399 | end = start + skb_frag_size(frag: &skb_shinfo(skb)->frags[i]); |
3400 | if ((copy = end - offset) > 0) { |
3401 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
3402 | u32 p_off, p_len, copied; |
3403 | struct page *p; |
3404 | __wsum csum2; |
3405 | u8 *vaddr; |
3406 | |
3407 | if (copy > len) |
3408 | copy = len; |
3409 | |
3410 | skb_frag_foreach_page(frag, |
3411 | skb_frag_off(frag) + offset - start, |
3412 | copy, p, p_off, p_len, copied) { |
3413 | vaddr = kmap_atomic(page: p); |
3414 | csum2 = csum_partial_copy_nocheck(src: vaddr + p_off, |
3415 | dst: to + copied, |
3416 | len: p_len); |
3417 | kunmap_atomic(vaddr); |
3418 | csum = csum_block_add(csum, csum2, offset: pos); |
3419 | pos += p_len; |
3420 | } |
3421 | |
3422 | if (!(len -= copy)) |
3423 | return csum; |
3424 | offset += copy; |
3425 | to += copy; |
3426 | } |
3427 | start = end; |
3428 | } |
3429 | |
3430 | skb_walk_frags(skb, frag_iter) { |
3431 | __wsum csum2; |
3432 | int end; |
3433 | |
3434 | WARN_ON(start > offset + len); |
3435 | |
3436 | end = start + frag_iter->len; |
3437 | if ((copy = end - offset) > 0) { |
3438 | if (copy > len) |
3439 | copy = len; |
3440 | csum2 = skb_copy_and_csum_bits(skb: frag_iter, |
3441 | offset: offset - start, |
3442 | to, len: copy); |
3443 | csum = csum_block_add(csum, csum2, offset: pos); |
3444 | if ((len -= copy) == 0) |
3445 | return csum; |
3446 | offset += copy; |
3447 | to += copy; |
3448 | pos += copy; |
3449 | } |
3450 | start = end; |
3451 | } |
3452 | BUG_ON(len); |
3453 | return csum; |
3454 | } |
3455 | EXPORT_SYMBOL(skb_copy_and_csum_bits); |
3456 | |
3457 | __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) |
3458 | { |
3459 | __sum16 sum; |
3460 | |
3461 | sum = csum_fold(sum: skb_checksum(skb, 0, len, skb->csum)); |
3462 | /* See comments in __skb_checksum_complete(). */ |
3463 | if (likely(!sum)) { |
3464 | if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && |
3465 | !skb->csum_complete_sw) |
3466 | netdev_rx_csum_fault(dev: skb->dev, skb); |
3467 | } |
3468 | if (!skb_shared(skb)) |
3469 | skb->csum_valid = !sum; |
3470 | return sum; |
3471 | } |
3472 | EXPORT_SYMBOL(__skb_checksum_complete_head); |
3473 | |
3474 | /* This function assumes skb->csum already holds pseudo header's checksum, |
3475 | * which has been changed from the hardware checksum, for example, by |
3476 | * __skb_checksum_validate_complete(). And, the original skb->csum must |
3477 | * have been validated unsuccessfully for CHECKSUM_COMPLETE case. |
3478 | * |
3479 | * It returns non-zero if the recomputed checksum is still invalid, otherwise |
3480 | * zero. The new checksum is stored back into skb->csum unless the skb is |
3481 | * shared. |
3482 | */ |
3483 | __sum16 __skb_checksum_complete(struct sk_buff *skb) |
3484 | { |
3485 | __wsum csum; |
3486 | __sum16 sum; |
3487 | |
3488 | csum = skb_checksum(skb, 0, skb->len, 0); |
3489 | |
3490 | sum = csum_fold(sum: csum_add(csum: skb->csum, addend: csum)); |
3491 | /* This check is inverted, because we already knew the hardware |
3492 | * checksum is invalid before calling this function. So, if the |
3493 | * re-computed checksum is valid instead, then we have a mismatch |
3494 | * between the original skb->csum and skb_checksum(). This means either |
3495 | * the original hardware checksum is incorrect or we screw up skb->csum |
3496 | * when moving skb->data around. |
3497 | */ |
3498 | if (likely(!sum)) { |
3499 | if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && |
3500 | !skb->csum_complete_sw) |
3501 | netdev_rx_csum_fault(dev: skb->dev, skb); |
3502 | } |
3503 | |
3504 | if (!skb_shared(skb)) { |
3505 | /* Save full packet checksum */ |
3506 | skb->csum = csum; |
3507 | skb->ip_summed = CHECKSUM_COMPLETE; |
3508 | skb->csum_complete_sw = 1; |
3509 | skb->csum_valid = !sum; |
3510 | } |
3511 | |
3512 | return sum; |
3513 | } |
3514 | EXPORT_SYMBOL(__skb_checksum_complete); |
3515 | |
3516 | static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) |
3517 | { |
3518 | net_warn_ratelimited( |
3519 | "%s: attempt to compute crc32c without libcrc32c.ko\n" , |
3520 | __func__); |
3521 | return 0; |
3522 | } |
3523 | |
3524 | static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, |
3525 | int offset, int len) |
3526 | { |
3527 | net_warn_ratelimited( |
3528 | "%s: attempt to compute crc32c without libcrc32c.ko\n" , |
3529 | __func__); |
3530 | return 0; |
3531 | } |
3532 | |
3533 | static const struct skb_checksum_ops default_crc32c_ops = { |
3534 | .update = warn_crc32c_csum_update, |
3535 | .combine = warn_crc32c_csum_combine, |
3536 | }; |
3537 | |
3538 | const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = |
3539 | &default_crc32c_ops; |
3540 | EXPORT_SYMBOL(crc32c_csum_stub); |
3541 | |
3542 | /** |
3543 | * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() |
3544 | * @from: source buffer |
3545 | * |
3546 | * Calculates the amount of linear headroom needed in the 'to' skb passed |
3547 | * into skb_zerocopy(). |
3548 | */ |
3549 | unsigned int |
3550 | skb_zerocopy_headlen(const struct sk_buff *from) |
3551 | { |
3552 | unsigned int hlen = 0; |
3553 | |
3554 | if (!from->head_frag || |
3555 | skb_headlen(skb: from) < L1_CACHE_BYTES || |
3556 | skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { |
3557 | hlen = skb_headlen(skb: from); |
3558 | if (!hlen) |
3559 | hlen = from->len; |
3560 | } |
3561 | |
3562 | if (skb_has_frag_list(skb: from)) |
3563 | hlen = from->len; |
3564 | |
3565 | return hlen; |
3566 | } |
3567 | EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); |
3568 | |
3569 | /** |
3570 | * skb_zerocopy - Zero copy skb to skb |
3571 | * @to: destination buffer |
3572 | * @from: source buffer |
3573 | * @len: number of bytes to copy from source buffer |
3574 | * @hlen: size of linear headroom in destination buffer |
3575 | * |
3576 | * Copies up to `len` bytes from `from` to `to` by creating references |
3577 | * to the frags in the source buffer. |
3578 | * |
3579 | * The `hlen` as calculated by skb_zerocopy_headlen() specifies the |
3580 | * headroom in the `to` buffer. |
3581 | * |
3582 | * Return value: |
3583 | * 0: everything is OK |
3584 | * -ENOMEM: couldn't orphan frags of @from due to lack of memory |
3585 | * -EFAULT: skb_copy_bits() found some problem with skb geometry |
3586 | */ |
3587 | int |
3588 | skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) |
3589 | { |
3590 | int i, j = 0; |
3591 | int plen = 0; /* length of skb->head fragment */ |
3592 | int ret; |
3593 | struct page *page; |
3594 | unsigned int offset; |
3595 | |
3596 | BUG_ON(!from->head_frag && !hlen); |
3597 | |
3598 | /* dont bother with small payloads */ |
3599 | if (len <= skb_tailroom(skb: to)) |
3600 | return skb_copy_bits(from, 0, skb_put(to, len), len); |
3601 | |
3602 | if (hlen) { |
3603 | ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); |
3604 | if (unlikely(ret)) |
3605 | return ret; |
3606 | len -= hlen; |
3607 | } else { |
3608 | plen = min_t(int, skb_headlen(from), len); |
3609 | if (plen) { |
3610 | page = virt_to_head_page(x: from->head); |
3611 | offset = from->data - (unsigned char *)page_address(page); |
3612 | __skb_fill_page_desc(skb: to, i: 0, page, off: offset, size: plen); |
3613 | get_page(page); |
3614 | j = 1; |
3615 | len -= plen; |
3616 | } |
3617 | } |
3618 | |
3619 | skb_len_add(skb: to, delta: len + plen); |
3620 | |
3621 | if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { |
3622 | skb_tx_error(from); |
3623 | return -ENOMEM; |
3624 | } |
3625 | skb_zerocopy_clone(nskb: to, orig: from, GFP_ATOMIC); |
3626 | |
3627 | for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { |
3628 | int size; |
3629 | |
3630 | if (!len) |
3631 | break; |
3632 | skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; |
3633 | size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), |
3634 | len); |
3635 | skb_frag_size_set(frag: &skb_shinfo(to)->frags[j], size); |
3636 | len -= size; |
3637 | skb_frag_ref(skb: to, f: j); |
3638 | j++; |
3639 | } |
3640 | skb_shinfo(to)->nr_frags = j; |
3641 | |
3642 | return 0; |
3643 | } |
3644 | EXPORT_SYMBOL_GPL(skb_zerocopy); |
3645 | |
3646 | void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) |
3647 | { |
3648 | __wsum csum; |
3649 | long csstart; |
3650 | |
3651 | if (skb->ip_summed == CHECKSUM_PARTIAL) |
3652 | csstart = skb_checksum_start_offset(skb); |
3653 | else |
3654 | csstart = skb_headlen(skb); |
3655 | |
3656 | BUG_ON(csstart > skb_headlen(skb)); |
3657 | |
3658 | skb_copy_from_linear_data(skb, to, len: csstart); |
3659 | |
3660 | csum = 0; |
3661 | if (csstart != skb->len) |
3662 | csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, |
3663 | skb->len - csstart); |
3664 | |
3665 | if (skb->ip_summed == CHECKSUM_PARTIAL) { |
3666 | long csstuff = csstart + skb->csum_offset; |
3667 | |
3668 | *((__sum16 *)(to + csstuff)) = csum_fold(sum: csum); |
3669 | } |
3670 | } |
3671 | EXPORT_SYMBOL(skb_copy_and_csum_dev); |
3672 | |
3673 | /** |
3674 | * skb_dequeue - remove from the head of the queue |
3675 | * @list: list to dequeue from |
3676 | * |
3677 | * Remove the head of the list. The list lock is taken so the function |
3678 | * may be used safely with other locking list functions. The head item is |
3679 | * returned or %NULL if the list is empty. |
3680 | */ |
3681 | |
3682 | struct sk_buff *skb_dequeue(struct sk_buff_head *list) |
3683 | { |
3684 | unsigned long flags; |
3685 | struct sk_buff *result; |
3686 | |
3687 | spin_lock_irqsave(&list->lock, flags); |
3688 | result = __skb_dequeue(list); |
3689 | spin_unlock_irqrestore(lock: &list->lock, flags); |
3690 | return result; |
3691 | } |
3692 | EXPORT_SYMBOL(skb_dequeue); |
3693 | |
3694 | /** |
3695 | * skb_dequeue_tail - remove from the tail of the queue |
3696 | * @list: list to dequeue from |
3697 | * |
3698 | * Remove the tail of the list. The list lock is taken so the function |
3699 | * may be used safely with other locking list functions. The tail item is |
3700 | * returned or %NULL if the list is empty. |
3701 | */ |
3702 | struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) |
3703 | { |
3704 | unsigned long flags; |
3705 | struct sk_buff *result; |
3706 | |
3707 | spin_lock_irqsave(&list->lock, flags); |
3708 | result = __skb_dequeue_tail(list); |
3709 | spin_unlock_irqrestore(lock: &list->lock, flags); |
3710 | return result; |
3711 | } |
3712 | EXPORT_SYMBOL(skb_dequeue_tail); |
3713 | |
3714 | /** |
3715 | * skb_queue_purge_reason - empty a list |
3716 | * @list: list to empty |
3717 | * @reason: drop reason |
3718 | * |
3719 | * Delete all buffers on an &sk_buff list. Each buffer is removed from |
3720 | * the list and one reference dropped. This function takes the list |
3721 | * lock and is atomic with respect to other list locking functions. |
3722 | */ |
3723 | void skb_queue_purge_reason(struct sk_buff_head *list, |
3724 | enum skb_drop_reason reason) |
3725 | { |
3726 | struct sk_buff_head tmp; |
3727 | unsigned long flags; |
3728 | |
3729 | if (skb_queue_empty_lockless(list)) |
3730 | return; |
3731 | |
3732 | __skb_queue_head_init(list: &tmp); |
3733 | |
3734 | spin_lock_irqsave(&list->lock, flags); |
3735 | skb_queue_splice_init(list, head: &tmp); |
3736 | spin_unlock_irqrestore(lock: &list->lock, flags); |
3737 | |
3738 | __skb_queue_purge_reason(list: &tmp, reason); |
3739 | } |
3740 | EXPORT_SYMBOL(skb_queue_purge_reason); |
3741 | |
3742 | /** |
3743 | * skb_rbtree_purge - empty a skb rbtree |
3744 | * @root: root of the rbtree to empty |
3745 | * Return value: the sum of truesizes of all purged skbs. |
3746 | * |
3747 | * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from |
3748 | * the list and one reference dropped. This function does not take |
3749 | * any lock. Synchronization should be handled by the caller (e.g., TCP |
3750 | * out-of-order queue is protected by the socket lock). |
3751 | */ |
3752 | unsigned int skb_rbtree_purge(struct rb_root *root) |
3753 | { |
3754 | struct rb_node *p = rb_first(root); |
3755 | unsigned int sum = 0; |
3756 | |
3757 | while (p) { |
3758 | struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); |
3759 | |
3760 | p = rb_next(p); |
3761 | rb_erase(&skb->rbnode, root); |
3762 | sum += skb->truesize; |
3763 | kfree_skb(skb); |
3764 | } |
3765 | return sum; |
3766 | } |
3767 | |
3768 | void skb_errqueue_purge(struct sk_buff_head *list) |
3769 | { |
3770 | struct sk_buff *skb, *next; |
3771 | struct sk_buff_head kill; |
3772 | unsigned long flags; |
3773 | |
3774 | __skb_queue_head_init(list: &kill); |
3775 | |
3776 | spin_lock_irqsave(&list->lock, flags); |
3777 | skb_queue_walk_safe(list, skb, next) { |
3778 | if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY || |
3779 | SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) |
3780 | continue; |
3781 | __skb_unlink(skb, list); |
3782 | __skb_queue_tail(list: &kill, newsk: skb); |
3783 | } |
3784 | spin_unlock_irqrestore(lock: &list->lock, flags); |
3785 | __skb_queue_purge(list: &kill); |
3786 | } |
3787 | EXPORT_SYMBOL(skb_errqueue_purge); |
3788 | |
3789 | /** |
3790 | * skb_queue_head - queue a buffer at the list head |
3791 | * @list: list to use |
3792 | * @newsk: buffer to queue |
3793 | * |
3794 | * Queue a buffer at the start of the list. This function takes the |
3795 | * list lock and can be used safely with other locking &sk_buff functions |
3796 | * safely. |
3797 | * |
3798 | * A buffer cannot be placed on two lists at the same time. |
3799 | */ |
3800 | void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) |
3801 | { |
3802 | unsigned long flags; |
3803 | |
3804 | spin_lock_irqsave(&list->lock, flags); |
3805 | __skb_queue_head(list, newsk); |
3806 | spin_unlock_irqrestore(lock: &list->lock, flags); |
3807 | } |
3808 | EXPORT_SYMBOL(skb_queue_head); |
3809 | |
3810 | /** |
3811 | * skb_queue_tail - queue a buffer at the list tail |
3812 | * @list: list to use |
3813 | * @newsk: buffer to queue |
3814 | * |
3815 | * Queue a buffer at the tail of the list. This function takes the |
3816 | * list lock and can be used safely with other locking &sk_buff functions |
3817 | * safely. |
3818 | * |
3819 | * A buffer cannot be placed on two lists at the same time. |
3820 | */ |
3821 | void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) |
3822 | { |
3823 | unsigned long flags; |
3824 | |
3825 | spin_lock_irqsave(&list->lock, flags); |
3826 | __skb_queue_tail(list, newsk); |
3827 | spin_unlock_irqrestore(lock: &list->lock, flags); |
3828 | } |
3829 | EXPORT_SYMBOL(skb_queue_tail); |
3830 | |
3831 | /** |
3832 | * skb_unlink - remove a buffer from a list |
3833 | * @skb: buffer to remove |
3834 | * @list: list to use |
3835 | * |
3836 | * Remove a packet from a list. The list locks are taken and this |
3837 | * function is atomic with respect to other list locked calls |
3838 | * |
3839 | * You must know what list the SKB is on. |
3840 | */ |
3841 | void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) |
3842 | { |
3843 | unsigned long flags; |
3844 | |
3845 | spin_lock_irqsave(&list->lock, flags); |
3846 | __skb_unlink(skb, list); |
3847 | spin_unlock_irqrestore(lock: &list->lock, flags); |
3848 | } |
3849 | EXPORT_SYMBOL(skb_unlink); |
3850 | |
3851 | /** |
3852 | * skb_append - append a buffer |
3853 | * @old: buffer to insert after |
3854 | * @newsk: buffer to insert |
3855 | * @list: list to use |
3856 | * |
3857 | * Place a packet after a given packet in a list. The list locks are taken |
3858 | * and this function is atomic with respect to other list locked calls. |
3859 | * A buffer cannot be placed on two lists at the same time. |
3860 | */ |
3861 | void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) |
3862 | { |
3863 | unsigned long flags; |
3864 | |
3865 | spin_lock_irqsave(&list->lock, flags); |
3866 | __skb_queue_after(list, prev: old, newsk); |
3867 | spin_unlock_irqrestore(lock: &list->lock, flags); |
3868 | } |
3869 | EXPORT_SYMBOL(skb_append); |
3870 | |
3871 | static inline void (struct sk_buff *skb, |
3872 | struct sk_buff* skb1, |
3873 | const u32 len, const int pos) |
3874 | { |
3875 | int i; |
3876 | |
3877 | skb_copy_from_linear_data_offset(skb, offset: len, to: skb_put(skb1, pos - len), |
3878 | len: pos - len); |
3879 | /* And move data appendix as is. */ |
3880 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) |
3881 | skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; |
3882 | |
3883 | skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; |
3884 | skb_shinfo(skb)->nr_frags = 0; |
3885 | skb1->data_len = skb->data_len; |
3886 | skb1->len += skb1->data_len; |
3887 | skb->data_len = 0; |
3888 | skb->len = len; |
3889 | skb_set_tail_pointer(skb, offset: len); |
3890 | } |
3891 | |
3892 | static inline void (struct sk_buff *skb, |
3893 | struct sk_buff* skb1, |
3894 | const u32 len, int pos) |
3895 | { |
3896 | int i, k = 0; |
3897 | const int nfrags = skb_shinfo(skb)->nr_frags; |
3898 | |
3899 | skb_shinfo(skb)->nr_frags = 0; |
3900 | skb1->len = skb1->data_len = skb->len - len; |
3901 | skb->len = len; |
3902 | skb->data_len = len - pos; |
3903 | |
3904 | for (i = 0; i < nfrags; i++) { |
3905 | int size = skb_frag_size(frag: &skb_shinfo(skb)->frags[i]); |
3906 | |
3907 | if (pos + size > len) { |
3908 | skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; |
3909 | |
3910 | if (pos < len) { |
3911 | /* Split frag. |
3912 | * We have two variants in this case: |
3913 | * 1. Move all the frag to the second |
3914 | * part, if it is possible. F.e. |
3915 | * this approach is mandatory for TUX, |
3916 | * where splitting is expensive. |
3917 | * 2. Split is accurately. We make this. |
3918 | */ |
3919 | skb_frag_ref(skb, f: i); |
3920 | skb_frag_off_add(frag: &skb_shinfo(skb1)->frags[0], delta: len - pos); |
3921 | skb_frag_size_sub(frag: &skb_shinfo(skb1)->frags[0], delta: len - pos); |
3922 | skb_frag_size_set(frag: &skb_shinfo(skb)->frags[i], size: len - pos); |
3923 | skb_shinfo(skb)->nr_frags++; |
3924 | } |
3925 | k++; |
3926 | } else |
3927 | skb_shinfo(skb)->nr_frags++; |
3928 | pos += size; |
3929 | } |
3930 | skb_shinfo(skb1)->nr_frags = k; |
3931 | } |
3932 | |
3933 | /** |
3934 | * skb_split - Split fragmented skb to two parts at length len. |
3935 | * @skb: the buffer to split |
3936 | * @skb1: the buffer to receive the second part |
3937 | * @len: new length for skb |
3938 | */ |
3939 | void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) |
3940 | { |
3941 | int pos = skb_headlen(skb); |
3942 | const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; |
3943 | |
3944 | skb_zcopy_downgrade_managed(skb); |
3945 | |
3946 | skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; |
3947 | skb_zerocopy_clone(nskb: skb1, orig: skb, gfp_mask: 0); |
3948 | if (len < pos) /* Split line is inside header. */ |
3949 | skb_split_inside_header(skb, skb1, len, pos); |
3950 | else /* Second chunk has no header, nothing to copy. */ |
3951 | skb_split_no_header(skb, skb1, len, pos); |
3952 | } |
3953 | EXPORT_SYMBOL(skb_split); |
3954 | |
3955 | /* Shifting from/to a cloned skb is a no-go. |
3956 | * |
3957 | * Caller cannot keep skb_shinfo related pointers past calling here! |
3958 | */ |
3959 | static int skb_prepare_for_shift(struct sk_buff *skb) |
3960 | { |
3961 | return skb_unclone_keeptruesize(skb, GFP_ATOMIC); |
3962 | } |
3963 | |
3964 | /** |
3965 | * skb_shift - Shifts paged data partially from skb to another |
3966 | * @tgt: buffer into which tail data gets added |
3967 | * @skb: buffer from which the paged data comes from |
3968 | * @shiftlen: shift up to this many bytes |
3969 | * |
3970 | * Attempts to shift up to shiftlen worth of bytes, which may be less than |
3971 | * the length of the skb, from skb to tgt. Returns number bytes shifted. |
3972 | * It's up to caller to free skb if everything was shifted. |
3973 | * |
3974 | * If @tgt runs out of frags, the whole operation is aborted. |
3975 | * |
3976 | * Skb cannot include anything else but paged data while tgt is allowed |
3977 | * to have non-paged data as well. |
3978 | * |
3979 | * TODO: full sized shift could be optimized but that would need |
3980 | * specialized skb free'er to handle frags without up-to-date nr_frags. |
3981 | */ |
3982 | int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) |
3983 | { |
3984 | int from, to, merge, todo; |
3985 | skb_frag_t *fragfrom, *fragto; |
3986 | |
3987 | BUG_ON(shiftlen > skb->len); |
3988 | |
3989 | if (skb_headlen(skb)) |
3990 | return 0; |
3991 | if (skb_zcopy(skb: tgt) || skb_zcopy(skb)) |
3992 | return 0; |
3993 | |
3994 | todo = shiftlen; |
3995 | from = 0; |
3996 | to = skb_shinfo(tgt)->nr_frags; |
3997 | fragfrom = &skb_shinfo(skb)->frags[from]; |
3998 | |
3999 | /* Actual merge is delayed until the point when we know we can |
4000 | * commit all, so that we don't have to undo partial changes |
4001 | */ |
4002 | if (!to || |
4003 | !skb_can_coalesce(skb: tgt, i: to, page: skb_frag_page(frag: fragfrom), |
4004 | off: skb_frag_off(frag: fragfrom))) { |
4005 | merge = -1; |
4006 | } else { |
4007 | merge = to - 1; |
4008 | |
4009 | todo -= skb_frag_size(frag: fragfrom); |
4010 | if (todo < 0) { |
4011 | if (skb_prepare_for_shift(skb) || |
4012 | skb_prepare_for_shift(skb: tgt)) |
4013 | return 0; |
4014 | |
4015 | /* All previous frag pointers might be stale! */ |
4016 | fragfrom = &skb_shinfo(skb)->frags[from]; |
4017 | fragto = &skb_shinfo(tgt)->frags[merge]; |
4018 | |
4019 | skb_frag_size_add(frag: fragto, delta: shiftlen); |
4020 | skb_frag_size_sub(frag: fragfrom, delta: shiftlen); |
4021 | skb_frag_off_add(frag: fragfrom, delta: shiftlen); |
4022 | |
4023 | goto onlymerged; |
4024 | } |
4025 | |
4026 | from++; |
4027 | } |
4028 | |
4029 | /* Skip full, not-fitting skb to avoid expensive operations */ |
4030 | if ((shiftlen == skb->len) && |
4031 | (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) |
4032 | return 0; |
4033 | |
4034 | if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(skb: tgt)) |
4035 | return 0; |
4036 | |
4037 | while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { |
4038 | if (to == MAX_SKB_FRAGS) |
4039 | return 0; |
4040 | |
4041 | fragfrom = &skb_shinfo(skb)->frags[from]; |
4042 | fragto = &skb_shinfo(tgt)->frags[to]; |
4043 | |
4044 | if (todo >= skb_frag_size(frag: fragfrom)) { |
4045 | *fragto = *fragfrom; |
4046 | todo -= skb_frag_size(frag: fragfrom); |
4047 | from++; |
4048 | to++; |
4049 | |
4050 | } else { |
4051 | __skb_frag_ref(frag: fragfrom); |
4052 | skb_frag_page_copy(fragto, fragfrom); |
4053 | skb_frag_off_copy(fragto, fragfrom); |
4054 | skb_frag_size_set(frag: fragto, size: todo); |
4055 | |
4056 | skb_frag_off_add(frag: fragfrom, delta: todo); |
4057 | skb_frag_size_sub(frag: fragfrom, delta: todo); |
4058 | todo = 0; |
4059 | |
4060 | to++; |
4061 | break; |
4062 | } |
4063 | } |
4064 | |
4065 | /* Ready to "commit" this state change to tgt */ |
4066 | skb_shinfo(tgt)->nr_frags = to; |
4067 | |
4068 | if (merge >= 0) { |
4069 | fragfrom = &skb_shinfo(skb)->frags[0]; |
4070 | fragto = &skb_shinfo(tgt)->frags[merge]; |
4071 | |
4072 | skb_frag_size_add(frag: fragto, delta: skb_frag_size(frag: fragfrom)); |
4073 | __skb_frag_unref(frag: fragfrom, recycle: skb->pp_recycle); |
4074 | } |
4075 | |
4076 | /* Reposition in the original skb */ |
4077 | to = 0; |
4078 | while (from < skb_shinfo(skb)->nr_frags) |
4079 | skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; |
4080 | skb_shinfo(skb)->nr_frags = to; |
4081 | |
4082 | BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); |
4083 | |
4084 | onlymerged: |
4085 | /* Most likely the tgt won't ever need its checksum anymore, skb on |
4086 | * the other hand might need it if it needs to be resent |
4087 | */ |
4088 | tgt->ip_summed = CHECKSUM_PARTIAL; |
4089 | skb->ip_summed = CHECKSUM_PARTIAL; |
4090 | |
4091 | skb_len_add(skb, delta: -shiftlen); |
4092 | skb_len_add(skb: tgt, delta: shiftlen); |
4093 | |
4094 | return shiftlen; |
4095 | } |
4096 | |
4097 | /** |
4098 | * skb_prepare_seq_read - Prepare a sequential read of skb data |
4099 | * @skb: the buffer to read |
4100 | * @from: lower offset of data to be read |
4101 | * @to: upper offset of data to be read |
4102 | * @st: state variable |
4103 | * |
4104 | * Initializes the specified state variable. Must be called before |
4105 | * invoking skb_seq_read() for the first time. |
4106 | */ |
4107 | void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, |
4108 | unsigned int to, struct skb_seq_state *st) |
4109 | { |
4110 | st->lower_offset = from; |
4111 | st->upper_offset = to; |
4112 | st->root_skb = st->cur_skb = skb; |
4113 | st->frag_idx = st->stepped_offset = 0; |
4114 | st->frag_data = NULL; |
4115 | st->frag_off = 0; |
4116 | } |
4117 | EXPORT_SYMBOL(skb_prepare_seq_read); |
4118 | |
4119 | /** |
4120 | * skb_seq_read - Sequentially read skb data |
4121 | * @consumed: number of bytes consumed by the caller so far |
4122 | * @data: destination pointer for data to be returned |
4123 | * @st: state variable |
4124 | * |
4125 | * Reads a block of skb data at @consumed relative to the |
4126 | * lower offset specified to skb_prepare_seq_read(). Assigns |
4127 | * the head of the data block to @data and returns the length |
4128 | * of the block or 0 if the end of the skb data or the upper |
4129 | * offset has been reached. |
4130 | * |
4131 | * The caller is not required to consume all of the data |
4132 | * returned, i.e. @consumed is typically set to the number |
4133 | * of bytes already consumed and the next call to |
4134 | * skb_seq_read() will return the remaining part of the block. |
4135 | * |
4136 | * Note 1: The size of each block of data returned can be arbitrary, |
4137 | * this limitation is the cost for zerocopy sequential |
4138 | * reads of potentially non linear data. |
4139 | * |
4140 | * Note 2: Fragment lists within fragments are not implemented |
4141 | * at the moment, state->root_skb could be replaced with |
4142 | * a stack for this purpose. |
4143 | */ |
4144 | unsigned int skb_seq_read(unsigned int consumed, const u8 **data, |
4145 | struct skb_seq_state *st) |
4146 | { |
4147 | unsigned int block_limit, abs_offset = consumed + st->lower_offset; |
4148 | skb_frag_t *frag; |
4149 | |
4150 | if (unlikely(abs_offset >= st->upper_offset)) { |
4151 | if (st->frag_data) { |
4152 | kunmap_atomic(st->frag_data); |
4153 | st->frag_data = NULL; |
4154 | } |
4155 | return 0; |
4156 | } |
4157 | |
4158 | next_skb: |
4159 | block_limit = skb_headlen(skb: st->cur_skb) + st->stepped_offset; |
4160 | |
4161 | if (abs_offset < block_limit && !st->frag_data) { |
4162 | *data = st->cur_skb->data + (abs_offset - st->stepped_offset); |
4163 | return block_limit - abs_offset; |
4164 | } |
4165 | |
4166 | if (st->frag_idx == 0 && !st->frag_data) |
4167 | st->stepped_offset += skb_headlen(skb: st->cur_skb); |
4168 | |
4169 | while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { |
4170 | unsigned int pg_idx, pg_off, pg_sz; |
4171 | |
4172 | frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; |
4173 | |
4174 | pg_idx = 0; |
4175 | pg_off = skb_frag_off(frag); |
4176 | pg_sz = skb_frag_size(frag); |
4177 | |
4178 | if (skb_frag_must_loop(p: skb_frag_page(frag))) { |
4179 | pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; |
4180 | pg_off = offset_in_page(pg_off + st->frag_off); |
4181 | pg_sz = min_t(unsigned int, pg_sz - st->frag_off, |
4182 | PAGE_SIZE - pg_off); |
4183 | } |
4184 | |
4185 | block_limit = pg_sz + st->stepped_offset; |
4186 | if (abs_offset < block_limit) { |
4187 | if (!st->frag_data) |
4188 | st->frag_data = kmap_atomic(page: skb_frag_page(frag) + pg_idx); |
4189 | |
4190 | *data = (u8 *)st->frag_data + pg_off + |
4191 | (abs_offset - st->stepped_offset); |
4192 | |
4193 | return block_limit - abs_offset; |
4194 | } |
4195 | |
4196 | if (st->frag_data) { |
4197 | kunmap_atomic(st->frag_data); |
4198 | st->frag_data = NULL; |
4199 | } |
4200 | |
4201 | st->stepped_offset += pg_sz; |
4202 | st->frag_off += pg_sz; |
4203 | if (st->frag_off == skb_frag_size(frag)) { |
4204 | st->frag_off = 0; |
4205 | st->frag_idx++; |
4206 | } |
4207 | } |
4208 | |
4209 | if (st->frag_data) { |
4210 | kunmap_atomic(st->frag_data); |
4211 | st->frag_data = NULL; |
4212 | } |
4213 | |
4214 | if (st->root_skb == st->cur_skb && skb_has_frag_list(skb: st->root_skb)) { |
4215 | st->cur_skb = skb_shinfo(st->root_skb)->frag_list; |
4216 | st->frag_idx = 0; |
4217 | goto next_skb; |
4218 | } else if (st->cur_skb->next) { |
4219 | st->cur_skb = st->cur_skb->next; |
4220 | st->frag_idx = 0; |
4221 | goto next_skb; |
4222 | } |
4223 | |
4224 | return 0; |
4225 | } |
4226 | EXPORT_SYMBOL(skb_seq_read); |
4227 | |
4228 | /** |
4229 | * skb_abort_seq_read - Abort a sequential read of skb data |
4230 | * @st: state variable |
4231 | * |
4232 | * Must be called if skb_seq_read() was not called until it |
4233 | * returned 0. |
4234 | */ |
4235 | void skb_abort_seq_read(struct skb_seq_state *st) |
4236 | { |
4237 | if (st->frag_data) |
4238 | kunmap_atomic(st->frag_data); |
4239 | } |
4240 | EXPORT_SYMBOL(skb_abort_seq_read); |
4241 | |
/* View the cb area of a textsearch state as a struct skb_seq_state. */
#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
4243 | |
4244 | static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, |
4245 | struct ts_config *conf, |
4246 | struct ts_state *state) |
4247 | { |
4248 | return skb_seq_read(offset, text, TS_SKB_CB(state)); |
4249 | } |
4250 | |
/* Finish callback installed by skb_find_text(): abort the sequential
 * read whose state lives in @state's cb area. @conf is not used.
 */
static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
{
	struct skb_seq_state *st = TS_SKB_CB(state);

	skb_abort_seq_read(st);
}
4255 | |
4256 | /** |
4257 | * skb_find_text - Find a text pattern in skb data |
4258 | * @skb: the buffer to look in |
4259 | * @from: search offset |
4260 | * @to: search limit |
4261 | * @config: textsearch configuration |
4262 | * |
4263 | * Finds a pattern in the skb data according to the specified |
4264 | * textsearch configuration. Use textsearch_next() to retrieve |
4265 | * subsequent occurrences of the pattern. Returns the offset |
4266 | * to the first occurrence or UINT_MAX if no match was found. |
4267 | */ |
4268 | unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, |
4269 | unsigned int to, struct ts_config *config) |
4270 | { |
4271 | unsigned int patlen = config->ops->get_pattern_len(config); |
4272 | struct ts_state state; |
4273 | unsigned int ret; |
4274 | |
4275 | BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); |
4276 | |
4277 | config->get_next_block = skb_ts_get_next_block; |
4278 | config->finish = skb_ts_finish; |
4279 | |
4280 | skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); |
4281 | |
4282 | ret = textsearch_find(conf: config, state: &state); |
4283 | return (ret + patlen <= to - from ? ret : UINT_MAX); |
4284 | } |
4285 | EXPORT_SYMBOL(skb_find_text); |
4286 | |
4287 | int skb_append_pagefrags(struct sk_buff *skb, struct page *page, |
4288 | int offset, size_t size, size_t max_frags) |
4289 | { |
4290 | int i = skb_shinfo(skb)->nr_frags; |
4291 | |
4292 | if (skb_can_coalesce(skb, i, page, off: offset)) { |
4293 | skb_frag_size_add(frag: &skb_shinfo(skb)->frags[i - 1], delta: size); |
4294 | } else if (i < max_frags) { |
4295 | skb_zcopy_downgrade_managed(skb); |
4296 | get_page(page); |
4297 | skb_fill_page_desc_noacc(skb, i, page, off: offset, size); |
4298 | } else { |
4299 | return -EMSGSIZE; |
4300 | } |
4301 | |
4302 | return 0; |
4303 | } |
4304 | EXPORT_SYMBOL_GPL(skb_append_pagefrags); |
4305 | |
4306 | /** |
4307 | * skb_pull_rcsum - pull skb and update receive checksum |
4308 | * @skb: buffer to update |
4309 | * @len: length of data pulled |
4310 | * |
4311 | * This function performs an skb_pull on the packet and updates |
4312 | * the CHECKSUM_COMPLETE checksum. It should be used on |
4313 | * receive path processing instead of skb_pull unless you know |
4314 | * that the checksum difference is zero (e.g., a valid IP header) |
4315 | * or you are setting ip_summed to CHECKSUM_NONE. |
4316 | */ |
4317 | void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) |
4318 | { |
4319 | unsigned char *data = skb->data; |
4320 | |
4321 | BUG_ON(len > skb->len); |
4322 | __skb_pull(skb, len); |
4323 | skb_postpull_rcsum(skb, start: data, len); |
4324 | return skb->data; |
4325 | } |
4326 | EXPORT_SYMBOL_GPL(skb_pull_rcsum); |
4327 | |
4328 | static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) |
4329 | { |
4330 | skb_frag_t head_frag; |
4331 | struct page *page; |
4332 | |
4333 | page = virt_to_head_page(x: frag_skb->head); |
4334 | skb_frag_fill_page_desc(frag: &head_frag, page, off: frag_skb->data - |
4335 | (unsigned char *)page_address(page), |
4336 | size: skb_headlen(skb: frag_skb)); |
4337 | return head_frag; |
4338 | } |
4339 | |
4340 | struct sk_buff *skb_segment_list(struct sk_buff *skb, |
4341 | netdev_features_t features, |
4342 | unsigned int offset) |
4343 | { |
4344 | struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; |
4345 | unsigned int tnl_hlen = skb_tnl_header_len(inner_skb: skb); |
4346 | unsigned int delta_truesize = 0; |
4347 | unsigned int delta_len = 0; |
4348 | struct sk_buff *tail = NULL; |
4349 | struct sk_buff *nskb, *tmp; |
4350 | int len_diff, err; |
4351 | |
4352 | skb_push(skb, -skb_network_offset(skb) + offset); |
4353 | |
4354 | /* Ensure the head is writeable before touching the shared info */ |
4355 | err = skb_unclone(skb, GFP_ATOMIC); |
4356 | if (err) |
4357 | goto err_linearize; |
4358 | |
4359 | skb_shinfo(skb)->frag_list = NULL; |
4360 | |
4361 | while (list_skb) { |
4362 | nskb = list_skb; |
4363 | list_skb = list_skb->next; |
4364 | |
4365 | err = 0; |
4366 | delta_truesize += nskb->truesize; |
4367 | if (skb_shared(skb: nskb)) { |
4368 | tmp = skb_clone(nskb, GFP_ATOMIC); |
4369 | if (tmp) { |
4370 | consume_skb(nskb); |
4371 | nskb = tmp; |
4372 | err = skb_unclone(skb: nskb, GFP_ATOMIC); |
4373 | } else { |
4374 | err = -ENOMEM; |
4375 | } |
4376 | } |
4377 | |
4378 | if (!tail) |
4379 | skb->next = nskb; |
4380 | else |
4381 | tail->next = nskb; |
4382 | |
4383 | if (unlikely(err)) { |
4384 | nskb->next = list_skb; |
4385 | goto err_linearize; |
4386 | } |
4387 | |
4388 | tail = nskb; |
4389 | |
4390 | delta_len += nskb->len; |
4391 | |
4392 | skb_push(nskb, -skb_network_offset(skb: nskb) + offset); |
4393 | |
4394 | skb_release_head_state(skb: nskb); |
4395 | len_diff = skb_network_header_len(skb: nskb) - skb_network_header_len(skb); |
4396 | __copy_skb_header(new: nskb, old: skb); |
4397 | |
4398 | skb_headers_offset_update(nskb, skb_headroom(skb: nskb) - skb_headroom(skb)); |
4399 | nskb->transport_header += len_diff; |
4400 | skb_copy_from_linear_data_offset(skb, offset: -tnl_hlen, |
4401 | to: nskb->data - tnl_hlen, |
4402 | len: offset + tnl_hlen); |
4403 | |
4404 | if (skb_needs_linearize(skb: nskb, features) && |
4405 | __skb_linearize(skb: nskb)) |
4406 | goto err_linearize; |
4407 | } |
4408 | |
4409 | skb->truesize = skb->truesize - delta_truesize; |
4410 | skb->data_len = skb->data_len - delta_len; |
4411 | skb->len = skb->len - delta_len; |
4412 | |
4413 | skb_gso_reset(skb); |
4414 | |
4415 | skb->prev = tail; |
4416 | |
4417 | if (skb_needs_linearize(skb, features) && |
4418 | __skb_linearize(skb)) |
4419 | goto err_linearize; |
4420 | |
4421 | skb_get(skb); |
4422 | |
4423 | return skb; |
4424 | |
4425 | err_linearize: |
4426 | kfree_skb_list(segs: skb->next); |
4427 | skb->next = NULL; |
4428 | return ERR_PTR(error: -ENOMEM); |
4429 | } |
4430 | EXPORT_SYMBOL_GPL(skb_segment_list); |
4431 | |
4432 | /** |
4433 | * skb_segment - Perform protocol segmentation on skb. |
4434 | * @head_skb: buffer to segment |
4435 | * @features: features for the output path (see dev->features) |
4436 | * |
4437 | * This function performs segmentation on the given skb. It returns |
4438 | * a pointer to the first in a list of new skbs for the segments. |
4439 | * In case of error it returns ERR_PTR(err). |
4440 | */ |
4441 | struct sk_buff *skb_segment(struct sk_buff *head_skb, |
4442 | netdev_features_t features) |
4443 | { |
4444 | struct sk_buff *segs = NULL; |
4445 | struct sk_buff *tail = NULL; |
4446 | struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; |
4447 | unsigned int mss = skb_shinfo(head_skb)->gso_size; |
4448 | unsigned int doffset = head_skb->data - skb_mac_header(skb: head_skb); |
4449 | unsigned int offset = doffset; |
4450 | unsigned int tnl_hlen = skb_tnl_header_len(inner_skb: head_skb); |
4451 | unsigned int partial_segs = 0; |
4452 | unsigned int headroom; |
4453 | unsigned int len = head_skb->len; |
4454 | struct sk_buff *frag_skb; |
4455 | skb_frag_t *frag; |
4456 | __be16 proto; |
4457 | bool csum, sg; |
4458 | int err = -ENOMEM; |
4459 | int i = 0; |
4460 | int nfrags, pos; |
4461 | |
4462 | if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && |
4463 | mss != GSO_BY_FRAGS && mss != skb_headlen(skb: head_skb)) { |
4464 | struct sk_buff *check_skb; |
4465 | |
4466 | for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { |
4467 | if (skb_headlen(skb: check_skb) && !check_skb->head_frag) { |
4468 | /* gso_size is untrusted, and we have a frag_list with |
4469 | * a linear non head_frag item. |
4470 | * |
4471 | * If head_skb's headlen does not fit requested gso_size, |
4472 | * it means that the frag_list members do NOT terminate |
4473 | * on exact gso_size boundaries. Hence we cannot perform |
4474 | * skb_frag_t page sharing. Therefore we must fallback to |
4475 | * copying the frag_list skbs; we do so by disabling SG. |
4476 | */ |
4477 | features &= ~NETIF_F_SG; |
4478 | break; |
4479 | } |
4480 | } |
4481 | } |
4482 | |
4483 | __skb_push(skb: head_skb, len: doffset); |
4484 | proto = skb_network_protocol(skb: head_skb, NULL); |
4485 | if (unlikely(!proto)) |
4486 | return ERR_PTR(error: -EINVAL); |
4487 | |
4488 | sg = !!(features & NETIF_F_SG); |
4489 | csum = !!can_checksum_protocol(features, protocol: proto); |
4490 | |
4491 | if (sg && csum && (mss != GSO_BY_FRAGS)) { |
4492 | if (!(features & NETIF_F_GSO_PARTIAL)) { |
4493 | struct sk_buff *iter; |
4494 | unsigned int frag_len; |
4495 | |
4496 | if (!list_skb || |
4497 | !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) |
4498 | goto normal; |
4499 | |
4500 | /* If we get here then all the required |
4501 | * GSO features except frag_list are supported. |
4502 | * Try to split the SKB to multiple GSO SKBs |
4503 | * with no frag_list. |
4504 | * Currently we can do that only when the buffers don't |
4505 | * have a linear part and all the buffers except |
4506 | * the last are of the same length. |
4507 | */ |
4508 | frag_len = list_skb->len; |
4509 | skb_walk_frags(head_skb, iter) { |
4510 | if (frag_len != iter->len && iter->next) |
4511 | goto normal; |
4512 | if (skb_headlen(skb: iter) && !iter->head_frag) |
4513 | goto normal; |
4514 | |
4515 | len -= iter->len; |
4516 | } |
4517 | |
4518 | if (len != frag_len) |
4519 | goto normal; |
4520 | } |
4521 | |
4522 | /* GSO partial only requires that we trim off any excess that |
4523 | * doesn't fit into an MSS sized block, so take care of that |
4524 | * now. |
4525 | */ |
4526 | partial_segs = len / mss; |
4527 | if (partial_segs > 1) |
4528 | mss *= partial_segs; |
4529 | else |
4530 | partial_segs = 0; |
4531 | } |
4532 | |
4533 | normal: |
4534 | headroom = skb_headroom(skb: head_skb); |
4535 | pos = skb_headlen(skb: head_skb); |
4536 | |
4537 | if (skb_orphan_frags(skb: head_skb, GFP_ATOMIC)) |
4538 | return ERR_PTR(error: -ENOMEM); |
4539 | |
4540 | nfrags = skb_shinfo(head_skb)->nr_frags; |
4541 | frag = skb_shinfo(head_skb)->frags; |
4542 | frag_skb = head_skb; |
4543 | |
4544 | do { |
4545 | struct sk_buff *nskb; |
4546 | skb_frag_t *nskb_frag; |
4547 | int hsize; |
4548 | int size; |
4549 | |
4550 | if (unlikely(mss == GSO_BY_FRAGS)) { |
4551 | len = list_skb->len; |
4552 | } else { |
4553 | len = head_skb->len - offset; |
4554 | if (len > mss) |
4555 | len = mss; |
4556 | } |
4557 | |
4558 | hsize = skb_headlen(skb: head_skb) - offset; |
4559 | |
4560 | if (hsize <= 0 && i >= nfrags && skb_headlen(skb: list_skb) && |
4561 | (skb_headlen(skb: list_skb) == len || sg)) { |
4562 | BUG_ON(skb_headlen(list_skb) > len); |
4563 | |
4564 | nskb = skb_clone(list_skb, GFP_ATOMIC); |
4565 | if (unlikely(!nskb)) |
4566 | goto err; |
4567 | |
4568 | i = 0; |
4569 | nfrags = skb_shinfo(list_skb)->nr_frags; |
4570 | frag = skb_shinfo(list_skb)->frags; |
4571 | frag_skb = list_skb; |
4572 | pos += skb_headlen(skb: list_skb); |
4573 | |
4574 | while (pos < offset + len) { |
4575 | BUG_ON(i >= nfrags); |
4576 | |
4577 | size = skb_frag_size(frag); |
4578 | if (pos + size > offset + len) |
4579 | break; |
4580 | |
4581 | i++; |
4582 | pos += size; |
4583 | frag++; |
4584 | } |
4585 | |
4586 | list_skb = list_skb->next; |
4587 | |
4588 | if (unlikely(pskb_trim(nskb, len))) { |
4589 | kfree_skb(skb: nskb); |
4590 | goto err; |
4591 | } |
4592 | |
4593 | hsize = skb_end_offset(skb: nskb); |
4594 | if (skb_cow_head(skb: nskb, headroom: doffset + headroom)) { |
4595 | kfree_skb(skb: nskb); |
4596 | goto err; |
4597 | } |
4598 | |
4599 | nskb->truesize += skb_end_offset(skb: nskb) - hsize; |
4600 | skb_release_head_state(skb: nskb); |
4601 | __skb_push(skb: nskb, len: doffset); |
4602 | } else { |
4603 | if (hsize < 0) |
4604 | hsize = 0; |
4605 | if (hsize > len || !sg) |
4606 | hsize = len; |
4607 | |
4608 | nskb = __alloc_skb(hsize + doffset + headroom, |
4609 | GFP_ATOMIC, skb_alloc_rx_flag(skb: head_skb), |
4610 | NUMA_NO_NODE); |
4611 | |
4612 | if (unlikely(!nskb)) |
4613 | goto err; |
4614 | |
4615 | skb_reserve(skb: nskb, len: headroom); |
4616 | __skb_put(skb: nskb, len: doffset); |
4617 | } |
4618 | |
4619 | if (segs) |
4620 | tail->next = nskb; |
4621 | else |
4622 | segs = nskb; |
4623 | tail = nskb; |
4624 | |
4625 | __copy_skb_header(new: nskb, old: head_skb); |
4626 | |
4627 | skb_headers_offset_update(nskb, skb_headroom(skb: nskb) - headroom); |
4628 | skb_reset_mac_len(skb: nskb); |
4629 | |
4630 | skb_copy_from_linear_data_offset(skb: head_skb, offset: -tnl_hlen, |
4631 | to: nskb->data - tnl_hlen, |
4632 | len: doffset + tnl_hlen); |
4633 | |
4634 | if (nskb->len == len + doffset) |
4635 | goto perform_csum_check; |
4636 | |
4637 | if (!sg) { |
4638 | if (!csum) { |
4639 | if (!nskb->remcsum_offload) |
4640 | nskb->ip_summed = CHECKSUM_NONE; |
4641 | SKB_GSO_CB(nskb)->csum = |
4642 | skb_copy_and_csum_bits(head_skb, offset, |
4643 | skb_put(nskb, |
4644 | len), |
4645 | len); |
4646 | SKB_GSO_CB(nskb)->csum_start = |
4647 | skb_headroom(skb: nskb) + doffset; |
4648 | } else { |
4649 | if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) |
4650 | goto err; |
4651 | } |
4652 | continue; |
4653 | } |
4654 | |
4655 | nskb_frag = skb_shinfo(nskb)->frags; |
4656 | |
4657 | skb_copy_from_linear_data_offset(skb: head_skb, offset, |
4658 | to: skb_put(nskb, hsize), len: hsize); |
4659 | |
4660 | skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & |
4661 | SKBFL_SHARED_FRAG; |
4662 | |
4663 | if (skb_zerocopy_clone(nskb, orig: frag_skb, GFP_ATOMIC)) |
4664 | goto err; |
4665 | |
4666 | while (pos < offset + len) { |
4667 | if (i >= nfrags) { |
4668 | if (skb_orphan_frags(skb: list_skb, GFP_ATOMIC) || |
4669 | skb_zerocopy_clone(nskb, orig: list_skb, |
4670 | GFP_ATOMIC)) |
4671 | goto err; |
4672 | |
4673 | i = 0; |
4674 | nfrags = skb_shinfo(list_skb)->nr_frags; |
4675 | frag = skb_shinfo(list_skb)->frags; |
4676 | frag_skb = list_skb; |
4677 | if (!skb_headlen(skb: list_skb)) { |
4678 | BUG_ON(!nfrags); |
4679 | } else { |
4680 | BUG_ON(!list_skb->head_frag); |
4681 | |
4682 | /* to make room for head_frag. */ |
4683 | i--; |
4684 | frag--; |
4685 | } |
4686 | |
4687 | list_skb = list_skb->next; |
4688 | } |
4689 | |
4690 | if (unlikely(skb_shinfo(nskb)->nr_frags >= |
4691 | MAX_SKB_FRAGS)) { |
4692 | net_warn_ratelimited( |
4693 | "skb_segment: too many frags: %u %u\n" , |
4694 | pos, mss); |
4695 | err = -EINVAL; |
4696 | goto err; |
4697 | } |
4698 | |
4699 | *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; |
4700 | __skb_frag_ref(frag: nskb_frag); |
4701 | size = skb_frag_size(frag: nskb_frag); |
4702 | |
4703 | if (pos < offset) { |
4704 | skb_frag_off_add(frag: nskb_frag, delta: offset - pos); |
4705 | skb_frag_size_sub(frag: nskb_frag, delta: offset - pos); |
4706 | } |
4707 | |
4708 | skb_shinfo(nskb)->nr_frags++; |
4709 | |
4710 | if (pos + size <= offset + len) { |
4711 | i++; |
4712 | frag++; |
4713 | pos += size; |
4714 | } else { |
4715 | skb_frag_size_sub(frag: nskb_frag, delta: pos + size - (offset + len)); |
4716 | goto skip_fraglist; |
4717 | } |
4718 | |
4719 | nskb_frag++; |
4720 | } |
4721 | |
4722 | skip_fraglist: |
4723 | nskb->data_len = len - hsize; |
4724 | nskb->len += nskb->data_len; |
4725 | nskb->truesize += nskb->data_len; |
4726 | |
4727 | perform_csum_check: |
4728 | if (!csum) { |
4729 | if (skb_has_shared_frag(skb: nskb) && |
4730 | __skb_linearize(skb: nskb)) |
4731 | goto err; |
4732 | |
4733 | if (!nskb->remcsum_offload) |
4734 | nskb->ip_summed = CHECKSUM_NONE; |
4735 | SKB_GSO_CB(nskb)->csum = |
4736 | skb_checksum(nskb, doffset, |
4737 | nskb->len - doffset, 0); |
4738 | SKB_GSO_CB(nskb)->csum_start = |
4739 | skb_headroom(skb: nskb) + doffset; |
4740 | } |
4741 | } while ((offset += len) < head_skb->len); |
4742 | |
4743 | /* Some callers want to get the end of the list. |
4744 | * Put it in segs->prev to avoid walking the list. |
4745 | * (see validate_xmit_skb_list() for example) |
4746 | */ |
4747 | segs->prev = tail; |
4748 | |
4749 | if (partial_segs) { |
4750 | struct sk_buff *iter; |
4751 | int type = skb_shinfo(head_skb)->gso_type; |
4752 | unsigned short gso_size = skb_shinfo(head_skb)->gso_size; |
4753 | |
4754 | /* Update type to add partial and then remove dodgy if set */ |
4755 | type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; |
4756 | type &= ~SKB_GSO_DODGY; |
4757 | |
4758 | /* Update GSO info and prepare to start updating headers on |
4759 | * our way back down the stack of protocols. |
4760 | */ |
4761 | for (iter = segs; iter; iter = iter->next) { |
4762 | skb_shinfo(iter)->gso_size = gso_size; |
4763 | skb_shinfo(iter)->gso_segs = partial_segs; |
4764 | skb_shinfo(iter)->gso_type = type; |
4765 | SKB_GSO_CB(iter)->data_offset = skb_headroom(skb: iter) + doffset; |
4766 | } |
4767 | |
4768 | if (tail->len - doffset <= gso_size) |
4769 | skb_shinfo(tail)->gso_size = 0; |
4770 | else if (tail != segs) |
4771 | skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); |
4772 | } |
4773 | |
4774 | /* Following permits correct backpressure, for protocols |
4775 | * using skb_set_owner_w(). |
4776 | * Idea is to tranfert ownership from head_skb to last segment. |
4777 | */ |
4778 | if (head_skb->destructor == sock_wfree) { |
4779 | swap(tail->truesize, head_skb->truesize); |
4780 | swap(tail->destructor, head_skb->destructor); |
4781 | swap(tail->sk, head_skb->sk); |
4782 | } |
4783 | return segs; |
4784 | |
4785 | err: |
4786 | kfree_skb_list(segs); |
4787 | return ERR_PTR(error: err); |
4788 | } |
4789 | EXPORT_SYMBOL_GPL(skb_segment); |
4790 | |
4791 | #ifdef CONFIG_SKB_EXTENSIONS |
4792 | #define SKB_EXT_ALIGN_VALUE 8 |
4793 | #define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) |
4794 | |
/* Size of each skb extension payload, indexed by extension id and expressed
 * in SKB_EXT_ALIGN_VALUE-byte chunks (see SKB_EXT_CHUNKSIZEOF) so that every
 * entry fits in a u8.  An entry is only present when the corresponding
 * feature is enabled in the kernel configuration.
 */
static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	[SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
#endif
#ifdef CONFIG_XFRM
	[SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
	[TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
#endif
#if IS_ENABLED(CONFIG_MPTCP)
	[SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
#endif
#if IS_ENABLED(CONFIG_MCTP_FLOWS)
	[SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
#endif
};
4812 | |
4813 | static __always_inline unsigned int skb_ext_total_length(void) |
4814 | { |
4815 | unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext); |
4816 | int i; |
4817 | |
4818 | for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++) |
4819 | l += skb_ext_type_len[i]; |
4820 | |
4821 | return l; |
4822 | } |
4823 | |
4824 | static void skb_extensions_init(void) |
4825 | { |
4826 | BUILD_BUG_ON(SKB_EXT_NUM >= 8); |
4827 | BUILD_BUG_ON(skb_ext_total_length() > 255); |
4828 | |
4829 | skbuff_ext_cache = kmem_cache_create(name: "skbuff_ext_cache" , |
4830 | SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), |
4831 | align: 0, |
4832 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, |
4833 | NULL); |
4834 | } |
4835 | #else |
4836 | static void skb_extensions_init(void) {} |
4837 | #endif |
4838 | |
4839 | /* The SKB kmem_cache slab is critical for network performance. Never |
4840 | * merge/alias the slab with similar sized objects. This avoids fragmentation |
4841 | * that hurts performance of kmem_cache_{alloc,free}_bulk APIs. |
4842 | */ |
4843 | #ifndef CONFIG_SLUB_TINY |
4844 | #define FLAG_SKB_NO_MERGE SLAB_NO_MERGE |
4845 | #else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */ |
4846 | #define FLAG_SKB_NO_MERGE 0 |
4847 | #endif |
4848 | |
4849 | void __init skb_init(void) |
4850 | { |
4851 | skbuff_cache = kmem_cache_create_usercopy(name: "skbuff_head_cache" , |
4852 | size: sizeof(struct sk_buff), |
4853 | align: 0, |
4854 | SLAB_HWCACHE_ALIGN|SLAB_PANIC| |
4855 | FLAG_SKB_NO_MERGE, |
4856 | offsetof(struct sk_buff, cb), |
4857 | sizeof_field(struct sk_buff, cb), |
4858 | NULL); |
4859 | skbuff_fclone_cache = kmem_cache_create(name: "skbuff_fclone_cache" , |
4860 | size: sizeof(struct sk_buff_fclones), |
4861 | align: 0, |
4862 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, |
4863 | NULL); |
4864 | /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. |
4865 | * struct skb_shared_info is located at the end of skb->head, |
4866 | * and should not be copied to/from user. |
4867 | */ |
4868 | skb_small_head_cache = kmem_cache_create_usercopy(name: "skbuff_small_head" , |
4869 | SKB_SMALL_HEAD_CACHE_SIZE, |
4870 | align: 0, |
4871 | SLAB_HWCACHE_ALIGN | SLAB_PANIC, |
4872 | useroffset: 0, |
4873 | SKB_SMALL_HEAD_HEADROOM, |
4874 | NULL); |
4875 | skb_extensions_init(); |
4876 | } |
4877 | |
4878 | static int |
4879 | __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, |
4880 | unsigned int recursion_level) |
4881 | { |
4882 | int start = skb_headlen(skb); |
4883 | int i, copy = start - offset; |
4884 | struct sk_buff *frag_iter; |
4885 | int elt = 0; |
4886 | |
4887 | if (unlikely(recursion_level >= 24)) |
4888 | return -EMSGSIZE; |
4889 | |
4890 | if (copy > 0) { |
4891 | if (copy > len) |
4892 | copy = len; |
4893 | sg_set_buf(sg, buf: skb->data + offset, buflen: copy); |
4894 | elt++; |
4895 | if ((len -= copy) == 0) |
4896 | return elt; |
4897 | offset += copy; |
4898 | } |
4899 | |
4900 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
4901 | int end; |
4902 | |
4903 | WARN_ON(start > offset + len); |
4904 | |
4905 | end = start + skb_frag_size(frag: &skb_shinfo(skb)->frags[i]); |
4906 | if ((copy = end - offset) > 0) { |
4907 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
4908 | if (unlikely(elt && sg_is_last(&sg[elt - 1]))) |
4909 | return -EMSGSIZE; |
4910 | |
4911 | if (copy > len) |
4912 | copy = len; |
4913 | sg_set_page(sg: &sg[elt], page: skb_frag_page(frag), len: copy, |
4914 | offset: skb_frag_off(frag) + offset - start); |
4915 | elt++; |
4916 | if (!(len -= copy)) |
4917 | return elt; |
4918 | offset += copy; |
4919 | } |
4920 | start = end; |
4921 | } |
4922 | |
4923 | skb_walk_frags(skb, frag_iter) { |
4924 | int end, ret; |
4925 | |
4926 | WARN_ON(start > offset + len); |
4927 | |
4928 | end = start + frag_iter->len; |
4929 | if ((copy = end - offset) > 0) { |
4930 | if (unlikely(elt && sg_is_last(&sg[elt - 1]))) |
4931 | return -EMSGSIZE; |
4932 | |
4933 | if (copy > len) |
4934 | copy = len; |
4935 | ret = __skb_to_sgvec(skb: frag_iter, sg: sg+elt, offset: offset - start, |
4936 | len: copy, recursion_level: recursion_level + 1); |
4937 | if (unlikely(ret < 0)) |
4938 | return ret; |
4939 | elt += ret; |
4940 | if ((len -= copy) == 0) |
4941 | return elt; |
4942 | offset += copy; |
4943 | } |
4944 | start = end; |
4945 | } |
4946 | BUG_ON(len); |
4947 | return elt; |
4948 | } |
4949 | |
4950 | /** |
4951 | * skb_to_sgvec - Fill a scatter-gather list from a socket buffer |
4952 | * @skb: Socket buffer containing the buffers to be mapped |
4953 | * @sg: The scatter-gather list to map into |
4954 | * @offset: The offset into the buffer's contents to start mapping |
4955 | * @len: Length of buffer space to be mapped |
4956 | * |
4957 | * Fill the specified scatter-gather list with mappings/pointers into a |
4958 | * region of the buffer space attached to a socket buffer. Returns either |
4959 | * the number of scatterlist items used, or -EMSGSIZE if the contents |
4960 | * could not fit. |
4961 | */ |
4962 | int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) |
4963 | { |
4964 | int nsg = __skb_to_sgvec(skb, sg, offset, len, recursion_level: 0); |
4965 | |
4966 | if (nsg <= 0) |
4967 | return nsg; |
4968 | |
4969 | sg_mark_end(sg: &sg[nsg - 1]); |
4970 | |
4971 | return nsg; |
4972 | } |
4973 | EXPORT_SYMBOL_GPL(skb_to_sgvec); |
4974 | |
/* Unlike skb_to_sgvec(), skb_to_sgvec_nomark() only maps the skb into the
 * given sglist and does not mark the sg entry holding the last skb data as
 * the end.  The caller can therefore keep manipulating the sg list, e.g.
 * append new data after the first call, without calling sg_unmark_end()
 * to extend the list.
 *
 * Scenario to use skb_to_sgvec_nomark:
 * 1. sg_init_table
 * 2. skb_to_sgvec_nomark(payload1)
 * 3. skb_to_sgvec_nomark(payload2)
 *
 * This is equivalent to:
 * 1. sg_init_table
 * 2. skb_to_sgvec(payload1)
 * 3. sg_unmark_end
 * 4. skb_to_sgvec(payload2)
 *
 * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
 * is preferable.
 */
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
			int offset, int len)
{
	/* Same mapping as skb_to_sgvec(), but no end marker is set. */
	return __skb_to_sgvec(skb, sg, offset, len, 0);
}
4999 | EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); |
5000 | |
5001 | |
5002 | |
5003 | /** |
5004 | * skb_cow_data - Check that a socket buffer's data buffers are writable |
5005 | * @skb: The socket buffer to check. |
5006 | * @tailbits: Amount of trailing space to be added |
5007 | * @trailer: Returned pointer to the skb where the @tailbits space begins |
5008 | * |
5009 | * Make sure that the data buffers attached to a socket buffer are |
5010 | * writable. If they are not, private copies are made of the data buffers |
5011 | * and the socket buffer is set to use these instead. |
5012 | * |
5013 | * If @tailbits is given, make sure that there is space to write @tailbits |
5014 | * bytes of data beyond current end of socket buffer. @trailer will be |
5015 | * set to point to the skb in which this space begins. |
5016 | * |
5017 | * The number of scatterlist elements required to completely map the |
5018 | * COW'd and extended socket buffer will be returned. |
5019 | */ |
5020 | int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) |
5021 | { |
5022 | int copyflag; |
5023 | int elt; |
5024 | struct sk_buff *skb1, **skb_p; |
5025 | |
5026 | /* If skb is cloned or its head is paged, reallocate |
5027 | * head pulling out all the pages (pages are considered not writable |
5028 | * at the moment even if they are anonymous). |
5029 | */ |
5030 | if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && |
5031 | !__pskb_pull_tail(skb, __skb_pagelen(skb))) |
5032 | return -ENOMEM; |
5033 | |
5034 | /* Easy case. Most of packets will go this way. */ |
5035 | if (!skb_has_frag_list(skb)) { |
5036 | /* A little of trouble, not enough of space for trailer. |
5037 | * This should not happen, when stack is tuned to generate |
5038 | * good frames. OK, on miss we reallocate and reserve even more |
5039 | * space, 128 bytes is fair. */ |
5040 | |
5041 | if (skb_tailroom(skb) < tailbits && |
5042 | pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) |
5043 | return -ENOMEM; |
5044 | |
5045 | /* Voila! */ |
5046 | *trailer = skb; |
5047 | return 1; |
5048 | } |
5049 | |
5050 | /* Misery. We are in troubles, going to mincer fragments... */ |
5051 | |
5052 | elt = 1; |
5053 | skb_p = &skb_shinfo(skb)->frag_list; |
5054 | copyflag = 0; |
5055 | |
5056 | while ((skb1 = *skb_p) != NULL) { |
5057 | int ntail = 0; |
5058 | |
5059 | /* The fragment is partially pulled by someone, |
5060 | * this can happen on input. Copy it and everything |
5061 | * after it. */ |
5062 | |
5063 | if (skb_shared(skb: skb1)) |
5064 | copyflag = 1; |
5065 | |
5066 | /* If the skb is the last, worry about trailer. */ |
5067 | |
5068 | if (skb1->next == NULL && tailbits) { |
5069 | if (skb_shinfo(skb1)->nr_frags || |
5070 | skb_has_frag_list(skb: skb1) || |
5071 | skb_tailroom(skb: skb1) < tailbits) |
5072 | ntail = tailbits + 128; |
5073 | } |
5074 | |
5075 | if (copyflag || |
5076 | skb_cloned(skb: skb1) || |
5077 | ntail || |
5078 | skb_shinfo(skb1)->nr_frags || |
5079 | skb_has_frag_list(skb: skb1)) { |
5080 | struct sk_buff *skb2; |
5081 | |
5082 | /* Fuck, we are miserable poor guys... */ |
5083 | if (ntail == 0) |
5084 | skb2 = skb_copy(skb1, GFP_ATOMIC); |
5085 | else |
5086 | skb2 = skb_copy_expand(skb1, |
5087 | skb_headroom(skb: skb1), |
5088 | ntail, |
5089 | GFP_ATOMIC); |
5090 | if (unlikely(skb2 == NULL)) |
5091 | return -ENOMEM; |
5092 | |
5093 | if (skb1->sk) |
5094 | skb_set_owner_w(skb: skb2, sk: skb1->sk); |
5095 | |
5096 | /* Looking around. Are we still alive? |
5097 | * OK, link new skb, drop old one */ |
5098 | |
5099 | skb2->next = skb1->next; |
5100 | *skb_p = skb2; |
5101 | kfree_skb(skb: skb1); |
5102 | skb1 = skb2; |
5103 | } |
5104 | elt++; |
5105 | *trailer = skb1; |
5106 | skb_p = &skb1->next; |
5107 | } |
5108 | |
5109 | return elt; |
5110 | } |
5111 | EXPORT_SYMBOL_GPL(skb_cow_data); |
5112 | |
5113 | static void sock_rmem_free(struct sk_buff *skb) |
5114 | { |
5115 | struct sock *sk = skb->sk; |
5116 | |
5117 | atomic_sub(i: skb->truesize, v: &sk->sk_rmem_alloc); |
5118 | } |
5119 | |
5120 | static void skb_set_err_queue(struct sk_buff *skb) |
5121 | { |
5122 | /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. |
5123 | * So, it is safe to (mis)use it to mark skbs on the error queue. |
5124 | */ |
5125 | skb->pkt_type = PACKET_OUTGOING; |
5126 | BUILD_BUG_ON(PACKET_OUTGOING == 0); |
5127 | } |
5128 | |
/*
 * Note: We don't mem charge error packets (no sk_forward_alloc changes)
 */
5132 | int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) |
5133 | { |
5134 | if (atomic_read(v: &sk->sk_rmem_alloc) + skb->truesize >= |
5135 | (unsigned int)READ_ONCE(sk->sk_rcvbuf)) |
5136 | return -ENOMEM; |
5137 | |
5138 | skb_orphan(skb); |
5139 | skb->sk = sk; |
5140 | skb->destructor = sock_rmem_free; |
5141 | atomic_add(i: skb->truesize, v: &sk->sk_rmem_alloc); |
5142 | skb_set_err_queue(skb); |
5143 | |
5144 | /* before exiting rcu section, make sure dst is refcounted */ |
5145 | skb_dst_force(skb); |
5146 | |
5147 | skb_queue_tail(&sk->sk_error_queue, skb); |
5148 | if (!sock_flag(sk, flag: SOCK_DEAD)) |
5149 | sk_error_report(sk); |
5150 | return 0; |
5151 | } |
5152 | EXPORT_SYMBOL(sock_queue_err_skb); |
5153 | |
5154 | static bool is_icmp_err_skb(const struct sk_buff *skb) |
5155 | { |
5156 | return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || |
5157 | SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); |
5158 | } |
5159 | |
5160 | struct sk_buff *sock_dequeue_err_skb(struct sock *sk) |
5161 | { |
5162 | struct sk_buff_head *q = &sk->sk_error_queue; |
5163 | struct sk_buff *skb, *skb_next = NULL; |
5164 | bool icmp_next = false; |
5165 | unsigned long flags; |
5166 | |
5167 | if (skb_queue_empty_lockless(list: q)) |
5168 | return NULL; |
5169 | |
5170 | spin_lock_irqsave(&q->lock, flags); |
5171 | skb = __skb_dequeue(list: q); |
5172 | if (skb && (skb_next = skb_peek(list_: q))) { |
5173 | icmp_next = is_icmp_err_skb(skb: skb_next); |
5174 | if (icmp_next) |
5175 | sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; |
5176 | } |
5177 | spin_unlock_irqrestore(lock: &q->lock, flags); |
5178 | |
5179 | if (is_icmp_err_skb(skb) && !icmp_next) |
5180 | sk->sk_err = 0; |
5181 | |
5182 | if (skb_next) |
5183 | sk_error_report(sk); |
5184 | |
5185 | return skb; |
5186 | } |
5187 | EXPORT_SYMBOL(sock_dequeue_err_skb); |
5188 | |
5189 | /** |
5190 | * skb_clone_sk - create clone of skb, and take reference to socket |
5191 | * @skb: the skb to clone |
5192 | * |
5193 | * This function creates a clone of a buffer that holds a reference on |
5194 | * sk_refcnt. Buffers created via this function are meant to be |
5195 | * returned using sock_queue_err_skb, or free via kfree_skb. |
5196 | * |
5197 | * When passing buffers allocated with this function to sock_queue_err_skb |
5198 | * it is necessary to wrap the call with sock_hold/sock_put in order to |
5199 | * prevent the socket from being released prior to being enqueued on |
5200 | * the sk_error_queue. |
5201 | */ |
5202 | struct sk_buff *skb_clone_sk(struct sk_buff *skb) |
5203 | { |
5204 | struct sock *sk = skb->sk; |
5205 | struct sk_buff *clone; |
5206 | |
5207 | if (!sk || !refcount_inc_not_zero(r: &sk->sk_refcnt)) |
5208 | return NULL; |
5209 | |
5210 | clone = skb_clone(skb, GFP_ATOMIC); |
5211 | if (!clone) { |
5212 | sock_put(sk); |
5213 | return NULL; |
5214 | } |
5215 | |
5216 | clone->sk = sk; |
5217 | clone->destructor = sock_efree; |
5218 | |
5219 | return clone; |
5220 | } |
5221 | EXPORT_SYMBOL(skb_clone_sk); |
5222 | |
5223 | static void __skb_complete_tx_timestamp(struct sk_buff *skb, |
5224 | struct sock *sk, |
5225 | int tstype, |
5226 | bool opt_stats) |
5227 | { |
5228 | struct sock_exterr_skb *serr; |
5229 | int err; |
5230 | |
5231 | BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); |
5232 | |
5233 | serr = SKB_EXT_ERR(skb); |
5234 | memset(serr, 0, sizeof(*serr)); |
5235 | serr->ee.ee_errno = ENOMSG; |
5236 | serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; |
5237 | serr->ee.ee_info = tstype; |
5238 | serr->opt_stats = opt_stats; |
5239 | serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; |
5240 | if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { |
5241 | serr->ee.ee_data = skb_shinfo(skb)->tskey; |
5242 | if (sk_is_tcp(sk)) |
5243 | serr->ee.ee_data -= atomic_read(v: &sk->sk_tskey); |
5244 | } |
5245 | |
5246 | err = sock_queue_err_skb(sk, skb); |
5247 | |
5248 | if (err) |
5249 | kfree_skb(skb); |
5250 | } |
5251 | |
5252 | static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) |
5253 | { |
5254 | bool ret; |
5255 | |
5256 | if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) |
5257 | return true; |
5258 | |
5259 | read_lock_bh(&sk->sk_callback_lock); |
5260 | ret = sk->sk_socket && sk->sk_socket->file && |
5261 | file_ns_capable(file: sk->sk_socket->file, ns: &init_user_ns, CAP_NET_RAW); |
5262 | read_unlock_bh(&sk->sk_callback_lock); |
5263 | return ret; |
5264 | } |
5265 | |
5266 | void skb_complete_tx_timestamp(struct sk_buff *skb, |
5267 | struct skb_shared_hwtstamps *hwtstamps) |
5268 | { |
5269 | struct sock *sk = skb->sk; |
5270 | |
5271 | if (!skb_may_tx_timestamp(sk, tsonly: false)) |
5272 | goto err; |
5273 | |
5274 | /* Take a reference to prevent skb_orphan() from freeing the socket, |
5275 | * but only if the socket refcount is not zero. |
5276 | */ |
5277 | if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { |
5278 | *skb_hwtstamps(skb) = *hwtstamps; |
5279 | __skb_complete_tx_timestamp(skb, sk, tstype: SCM_TSTAMP_SND, opt_stats: false); |
5280 | sock_put(sk); |
5281 | return; |
5282 | } |
5283 | |
5284 | err: |
5285 | kfree_skb(skb); |
5286 | } |
5287 | EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); |
5288 | |
5289 | void __skb_tstamp_tx(struct sk_buff *orig_skb, |
5290 | const struct sk_buff *ack_skb, |
5291 | struct skb_shared_hwtstamps *hwtstamps, |
5292 | struct sock *sk, int tstype) |
5293 | { |
5294 | struct sk_buff *skb; |
5295 | bool tsonly, opt_stats = false; |
5296 | u32 tsflags; |
5297 | |
5298 | if (!sk) |
5299 | return; |
5300 | |
5301 | tsflags = READ_ONCE(sk->sk_tsflags); |
5302 | if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && |
5303 | skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) |
5304 | return; |
5305 | |
5306 | tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; |
5307 | if (!skb_may_tx_timestamp(sk, tsonly)) |
5308 | return; |
5309 | |
5310 | if (tsonly) { |
5311 | #ifdef CONFIG_INET |
5312 | if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && |
5313 | sk_is_tcp(sk)) { |
5314 | skb = tcp_get_timestamping_opt_stats(sk, orig_skb, |
5315 | ack_skb); |
5316 | opt_stats = true; |
5317 | } else |
5318 | #endif |
5319 | skb = alloc_skb(size: 0, GFP_ATOMIC); |
5320 | } else { |
5321 | skb = skb_clone(orig_skb, GFP_ATOMIC); |
5322 | |
5323 | if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { |
5324 | kfree_skb(skb); |
5325 | return; |
5326 | } |
5327 | } |
5328 | if (!skb) |
5329 | return; |
5330 | |
5331 | if (tsonly) { |
5332 | skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & |
5333 | SKBTX_ANY_TSTAMP; |
5334 | skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; |
5335 | } |
5336 | |
5337 | if (hwtstamps) |
5338 | *skb_hwtstamps(skb) = *hwtstamps; |
5339 | else |
5340 | __net_timestamp(skb); |
5341 | |
5342 | __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); |
5343 | } |
5344 | EXPORT_SYMBOL_GPL(__skb_tstamp_tx); |
5345 | |
5346 | void skb_tstamp_tx(struct sk_buff *orig_skb, |
5347 | struct skb_shared_hwtstamps *hwtstamps) |
5348 | { |
5349 | return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, |
5350 | SCM_TSTAMP_SND); |
5351 | } |
5352 | EXPORT_SYMBOL_GPL(skb_tstamp_tx); |
5353 | |
5354 | #ifdef CONFIG_WIRELESS |
5355 | void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) |
5356 | { |
5357 | struct sock *sk = skb->sk; |
5358 | struct sock_exterr_skb *serr; |
5359 | int err = 1; |
5360 | |
5361 | skb->wifi_acked_valid = 1; |
5362 | skb->wifi_acked = acked; |
5363 | |
5364 | serr = SKB_EXT_ERR(skb); |
5365 | memset(serr, 0, sizeof(*serr)); |
5366 | serr->ee.ee_errno = ENOMSG; |
5367 | serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; |
5368 | |
5369 | /* Take a reference to prevent skb_orphan() from freeing the socket, |
5370 | * but only if the socket refcount is not zero. |
5371 | */ |
5372 | if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { |
5373 | err = sock_queue_err_skb(sk, skb); |
5374 | sock_put(sk); |
5375 | } |
5376 | if (err) |
5377 | kfree_skb(skb); |
5378 | } |
5379 | EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); |
5380 | #endif /* CONFIG_WIRELESS */ |
5381 | |
5382 | /** |
5383 | * skb_partial_csum_set - set up and verify partial csum values for packet |
5384 | * @skb: the skb to set |
5385 | * @start: the number of bytes after skb->data to start checksumming. |
5386 | * @off: the offset from start to place the checksum. |
5387 | * |
5388 | * For untrusted partially-checksummed packets, we need to make sure the values |
5389 | * for skb->csum_start and skb->csum_offset are valid so we don't oops. |
5390 | * |
5391 | * This function checks and sets those values and skb->ip_summed: if this |
5392 | * returns false you should drop the packet. |
5393 | */ |
5394 | bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) |
5395 | { |
5396 | u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); |
5397 | u32 csum_start = skb_headroom(skb) + (u32)start; |
5398 | |
5399 | if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) { |
5400 | net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n" , |
5401 | start, off, skb_headroom(skb), skb_headlen(skb)); |
5402 | return false; |
5403 | } |
5404 | skb->ip_summed = CHECKSUM_PARTIAL; |
5405 | skb->csum_start = csum_start; |
5406 | skb->csum_offset = off; |
5407 | skb->transport_header = csum_start; |
5408 | return true; |
5409 | } |
5410 | EXPORT_SYMBOL_GPL(skb_partial_csum_set); |
5411 | |
5412 | static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, |
5413 | unsigned int max) |
5414 | { |
5415 | if (skb_headlen(skb) >= len) |
5416 | return 0; |
5417 | |
5418 | /* If we need to pullup then pullup to the max, so we |
5419 | * won't need to do it again. |
5420 | */ |
5421 | if (max > skb->len) |
5422 | max = skb->len; |
5423 | |
5424 | if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) |
5425 | return -ENOMEM; |
5426 | |
5427 | if (skb_headlen(skb) < len) |
5428 | return -EPROTO; |
5429 | |
5430 | return 0; |
5431 | } |
5432 | |
5433 | #define MAX_TCP_HDR_LEN (15 * 4) |
5434 | |
5435 | static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, |
5436 | typeof(IPPROTO_IP) proto, |
5437 | unsigned int off) |
5438 | { |
5439 | int err; |
5440 | |
5441 | switch (proto) { |
5442 | case IPPROTO_TCP: |
5443 | err = skb_maybe_pull_tail(skb, len: off + sizeof(struct tcphdr), |
5444 | max: off + MAX_TCP_HDR_LEN); |
5445 | if (!err && !skb_partial_csum_set(skb, off, |
5446 | offsetof(struct tcphdr, |
5447 | check))) |
5448 | err = -EPROTO; |
5449 | return err ? ERR_PTR(error: err) : &tcp_hdr(skb)->check; |
5450 | |
5451 | case IPPROTO_UDP: |
5452 | err = skb_maybe_pull_tail(skb, len: off + sizeof(struct udphdr), |
5453 | max: off + sizeof(struct udphdr)); |
5454 | if (!err && !skb_partial_csum_set(skb, off, |
5455 | offsetof(struct udphdr, |
5456 | check))) |
5457 | err = -EPROTO; |
5458 | return err ? ERR_PTR(error: err) : &udp_hdr(skb)->check; |
5459 | } |
5460 | |
5461 | return ERR_PTR(error: -EPROTO); |
5462 | } |
5463 | |
5464 | /* This value should be large enough to cover a tagged ethernet header plus |
5465 | * maximally sized IP and TCP or UDP headers. |
5466 | */ |
5467 | #define MAX_IP_HDR_LEN 128 |
5468 | |
5469 | static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) |
5470 | { |
5471 | unsigned int off; |
5472 | bool fragment; |
5473 | __sum16 *csum; |
5474 | int err; |
5475 | |
5476 | fragment = false; |
5477 | |
5478 | err = skb_maybe_pull_tail(skb, |
5479 | len: sizeof(struct iphdr), |
5480 | MAX_IP_HDR_LEN); |
5481 | if (err < 0) |
5482 | goto out; |
5483 | |
5484 | if (ip_is_fragment(iph: ip_hdr(skb))) |
5485 | fragment = true; |
5486 | |
5487 | off = ip_hdrlen(skb); |
5488 | |
5489 | err = -EPROTO; |
5490 | |
5491 | if (fragment) |
5492 | goto out; |
5493 | |
5494 | csum = skb_checksum_setup_ip(skb, proto: ip_hdr(skb)->protocol, off); |
5495 | if (IS_ERR(ptr: csum)) |
5496 | return PTR_ERR(ptr: csum); |
5497 | |
5498 | if (recalculate) |
5499 | *csum = ~csum_tcpudp_magic(saddr: ip_hdr(skb)->saddr, |
5500 | daddr: ip_hdr(skb)->daddr, |
5501 | len: skb->len - off, |
5502 | proto: ip_hdr(skb)->protocol, sum: 0); |
5503 | err = 0; |
5504 | |
5505 | out: |
5506 | return err; |
5507 | } |
5508 | |
5509 | /* This value should be large enough to cover a tagged ethernet header plus |
5510 | * an IPv6 header, all options, and a maximal TCP or UDP header. |
5511 | */ |
5512 | #define MAX_IPV6_HDR_LEN 256 |
5513 | |
5514 | #define OPT_HDR(type, skb, off) \ |
5515 | (type *)(skb_network_header(skb) + (off)) |
5516 | |
5517 | static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) |
5518 | { |
5519 | int err; |
5520 | u8 nexthdr; |
5521 | unsigned int off; |
5522 | unsigned int len; |
5523 | bool fragment; |
5524 | bool done; |
5525 | __sum16 *csum; |
5526 | |
5527 | fragment = false; |
5528 | done = false; |
5529 | |
5530 | off = sizeof(struct ipv6hdr); |
5531 | |
5532 | err = skb_maybe_pull_tail(skb, len: off, MAX_IPV6_HDR_LEN); |
5533 | if (err < 0) |
5534 | goto out; |
5535 | |
5536 | nexthdr = ipv6_hdr(skb)->nexthdr; |
5537 | |
5538 | len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); |
5539 | while (off <= len && !done) { |
5540 | switch (nexthdr) { |
5541 | case IPPROTO_DSTOPTS: |
5542 | case IPPROTO_HOPOPTS: |
5543 | case IPPROTO_ROUTING: { |
5544 | struct ipv6_opt_hdr *hp; |
5545 | |
5546 | err = skb_maybe_pull_tail(skb, |
5547 | len: off + |
5548 | sizeof(struct ipv6_opt_hdr), |
5549 | MAX_IPV6_HDR_LEN); |
5550 | if (err < 0) |
5551 | goto out; |
5552 | |
5553 | hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); |
5554 | nexthdr = hp->nexthdr; |
5555 | off += ipv6_optlen(hp); |
5556 | break; |
5557 | } |
5558 | case IPPROTO_AH: { |
5559 | struct ip_auth_hdr *hp; |
5560 | |
5561 | err = skb_maybe_pull_tail(skb, |
5562 | len: off + |
5563 | sizeof(struct ip_auth_hdr), |
5564 | MAX_IPV6_HDR_LEN); |
5565 | if (err < 0) |
5566 | goto out; |
5567 | |
5568 | hp = OPT_HDR(struct ip_auth_hdr, skb, off); |
5569 | nexthdr = hp->nexthdr; |
5570 | off += ipv6_authlen(hp); |
5571 | break; |
5572 | } |
5573 | case IPPROTO_FRAGMENT: { |
5574 | struct frag_hdr *hp; |
5575 | |
5576 | err = skb_maybe_pull_tail(skb, |
5577 | len: off + |
5578 | sizeof(struct frag_hdr), |
5579 | MAX_IPV6_HDR_LEN); |
5580 | if (err < 0) |
5581 | goto out; |
5582 | |
5583 | hp = OPT_HDR(struct frag_hdr, skb, off); |
5584 | |
5585 | if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) |
5586 | fragment = true; |
5587 | |
5588 | nexthdr = hp->nexthdr; |
5589 | off += sizeof(struct frag_hdr); |
5590 | break; |
5591 | } |
5592 | default: |
5593 | done = true; |
5594 | break; |
5595 | } |
5596 | } |
5597 | |
5598 | err = -EPROTO; |
5599 | |
5600 | if (!done || fragment) |
5601 | goto out; |
5602 | |
5603 | csum = skb_checksum_setup_ip(skb, proto: nexthdr, off); |
5604 | if (IS_ERR(ptr: csum)) |
5605 | return PTR_ERR(ptr: csum); |
5606 | |
5607 | if (recalculate) |
5608 | *csum = ~csum_ipv6_magic(saddr: &ipv6_hdr(skb)->saddr, |
5609 | daddr: &ipv6_hdr(skb)->daddr, |
5610 | len: skb->len - off, proto: nexthdr, sum: 0); |
5611 | err = 0; |
5612 | |
5613 | out: |
5614 | return err; |
5615 | } |
5616 | |
5617 | /** |
5618 | * skb_checksum_setup - set up partial checksum offset |
5619 | * @skb: the skb to set up |
5620 | * @recalculate: if true the pseudo-header checksum will be recalculated |
5621 | */ |
5622 | int skb_checksum_setup(struct sk_buff *skb, bool recalculate) |
5623 | { |
5624 | int err; |
5625 | |
5626 | switch (skb->protocol) { |
5627 | case htons(ETH_P_IP): |
5628 | err = skb_checksum_setup_ipv4(skb, recalculate); |
5629 | break; |
5630 | |
5631 | case htons(ETH_P_IPV6): |
5632 | err = skb_checksum_setup_ipv6(skb, recalculate); |
5633 | break; |
5634 | |
5635 | default: |
5636 | err = -EPROTO; |
5637 | break; |
5638 | } |
5639 | |
5640 | return err; |
5641 | } |
5642 | EXPORT_SYMBOL(skb_checksum_setup); |
5643 | |
5644 | /** |
5645 | * skb_checksum_maybe_trim - maybe trims the given skb |
5646 | * @skb: the skb to check |
5647 | * @transport_len: the data length beyond the network header |
5648 | * |
5649 | * Checks whether the given skb has data beyond the given transport length. |
5650 | * If so, returns a cloned skb trimmed to this transport length. |
5651 | * Otherwise returns the provided skb. Returns NULL in error cases |
5652 | * (e.g. transport_len exceeds skb length or out-of-memory). |
5653 | * |
5654 | * Caller needs to set the skb transport header and free any returned skb if it |
5655 | * differs from the provided skb. |
5656 | */ |
5657 | static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, |
5658 | unsigned int transport_len) |
5659 | { |
5660 | struct sk_buff *skb_chk; |
5661 | unsigned int len = skb_transport_offset(skb) + transport_len; |
5662 | int ret; |
5663 | |
5664 | if (skb->len < len) |
5665 | return NULL; |
5666 | else if (skb->len == len) |
5667 | return skb; |
5668 | |
5669 | skb_chk = skb_clone(skb, GFP_ATOMIC); |
5670 | if (!skb_chk) |
5671 | return NULL; |
5672 | |
5673 | ret = pskb_trim_rcsum(skb: skb_chk, len); |
5674 | if (ret) { |
5675 | kfree_skb(skb: skb_chk); |
5676 | return NULL; |
5677 | } |
5678 | |
5679 | return skb_chk; |
5680 | } |
5681 | |
5682 | /** |
5683 | * skb_checksum_trimmed - validate checksum of an skb |
5684 | * @skb: the skb to check |
5685 | * @transport_len: the data length beyond the network header |
5686 | * @skb_chkf: checksum function to use |
5687 | * |
5688 | * Applies the given checksum function skb_chkf to the provided skb. |
5689 | * Returns a checked and maybe trimmed skb. Returns NULL on error. |
5690 | * |
5691 | * If the skb has data beyond the given transport length, then a |
5692 | * trimmed & cloned skb is checked and returned. |
5693 | * |
5694 | * Caller needs to set the skb transport header and free any returned skb if it |
5695 | * differs from the provided skb. |
5696 | */ |
5697 | struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, |
5698 | unsigned int transport_len, |
5699 | __sum16(*skb_chkf)(struct sk_buff *skb)) |
5700 | { |
5701 | struct sk_buff *skb_chk; |
5702 | unsigned int offset = skb_transport_offset(skb); |
5703 | __sum16 ret; |
5704 | |
5705 | skb_chk = skb_checksum_maybe_trim(skb, transport_len); |
5706 | if (!skb_chk) |
5707 | goto err; |
5708 | |
5709 | if (!pskb_may_pull(skb: skb_chk, len: offset)) |
5710 | goto err; |
5711 | |
5712 | skb_pull_rcsum(skb_chk, offset); |
5713 | ret = skb_chkf(skb_chk); |
5714 | skb_push_rcsum(skb: skb_chk, len: offset); |
5715 | |
5716 | if (ret) |
5717 | goto err; |
5718 | |
5719 | return skb_chk; |
5720 | |
5721 | err: |
5722 | if (skb_chk && skb_chk != skb) |
5723 | kfree_skb(skb: skb_chk); |
5724 | |
5725 | return NULL; |
5726 | |
5727 | } |
5728 | EXPORT_SYMBOL(skb_checksum_trimmed); |
5729 | |
/* Emit a rate-limited warning, prefixed with the name of @skb's device,
 * that received packets cannot be forwarded while LRO is enabled.
 */
void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
	net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n" ,
			     skb->dev->name);
}
5735 | EXPORT_SYMBOL(__skb_warn_lro_forwarding); |
5736 | |
5737 | void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) |
5738 | { |
5739 | if (head_stolen) { |
5740 | skb_release_head_state(skb); |
5741 | kmem_cache_free(s: skbuff_cache, objp: skb); |
5742 | } else { |
5743 | __kfree_skb(skb); |
5744 | } |
5745 | } |
5746 | EXPORT_SYMBOL(kfree_skb_partial); |
5747 | |
5748 | /** |
5749 | * skb_try_coalesce - try to merge skb to prior one |
5750 | * @to: prior buffer |
5751 | * @from: buffer to add |
5752 | * @fragstolen: pointer to boolean |
5753 | * @delta_truesize: how much more was allocated than was requested |
5754 | */ |
5755 | bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, |
5756 | bool *fragstolen, int *delta_truesize) |
5757 | { |
5758 | struct skb_shared_info *to_shinfo, *from_shinfo; |
5759 | int i, delta, len = from->len; |
5760 | |
5761 | *fragstolen = false; |
5762 | |
5763 | if (skb_cloned(skb: to)) |
5764 | return false; |
5765 | |
5766 | /* In general, avoid mixing page_pool and non-page_pool allocated |
5767 | * pages within the same SKB. Additionally avoid dealing with clones |
5768 | * with page_pool pages, in case the SKB is using page_pool fragment |
5769 | * references (page_pool_alloc_frag()). Since we only take full page |
5770 | * references for cloned SKBs at the moment that would result in |
5771 | * inconsistent reference counts. |
5772 | * In theory we could take full references if @from is cloned and |
5773 | * !@to->pp_recycle but its tricky (due to potential race with |
5774 | * the clone disappearing) and rare, so not worth dealing with. |
5775 | */ |
5776 | if (to->pp_recycle != from->pp_recycle || |
5777 | (from->pp_recycle && skb_cloned(skb: from))) |
5778 | return false; |
5779 | |
5780 | if (len <= skb_tailroom(skb: to)) { |
5781 | if (len) |
5782 | BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); |
5783 | *delta_truesize = 0; |
5784 | return true; |
5785 | } |
5786 | |
5787 | to_shinfo = skb_shinfo(to); |
5788 | from_shinfo = skb_shinfo(from); |
5789 | if (to_shinfo->frag_list || from_shinfo->frag_list) |
5790 | return false; |
5791 | if (skb_zcopy(skb: to) || skb_zcopy(skb: from)) |
5792 | return false; |
5793 | |
5794 | if (skb_headlen(skb: from) != 0) { |
5795 | struct page *page; |
5796 | unsigned int offset; |
5797 | |
5798 | if (to_shinfo->nr_frags + |
5799 | from_shinfo->nr_frags >= MAX_SKB_FRAGS) |
5800 | return false; |
5801 | |
5802 | if (skb_head_is_locked(skb: from)) |
5803 | return false; |
5804 | |
5805 | delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); |
5806 | |
5807 | page = virt_to_head_page(x: from->head); |
5808 | offset = from->data - (unsigned char *)page_address(page); |
5809 | |
5810 | skb_fill_page_desc(skb: to, i: to_shinfo->nr_frags, |
5811 | page, off: offset, size: skb_headlen(skb: from)); |
5812 | *fragstolen = true; |
5813 | } else { |
5814 | if (to_shinfo->nr_frags + |
5815 | from_shinfo->nr_frags > MAX_SKB_FRAGS) |
5816 | return false; |
5817 | |
5818 | delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); |
5819 | } |
5820 | |
5821 | WARN_ON_ONCE(delta < len); |
5822 | |
5823 | memcpy(to_shinfo->frags + to_shinfo->nr_frags, |
5824 | from_shinfo->frags, |
5825 | from_shinfo->nr_frags * sizeof(skb_frag_t)); |
5826 | to_shinfo->nr_frags += from_shinfo->nr_frags; |
5827 | |
5828 | if (!skb_cloned(skb: from)) |
5829 | from_shinfo->nr_frags = 0; |
5830 | |
5831 | /* if the skb is not cloned this does nothing |
5832 | * since we set nr_frags to 0. |
5833 | */ |
5834 | for (i = 0; i < from_shinfo->nr_frags; i++) |
5835 | __skb_frag_ref(frag: &from_shinfo->frags[i]); |
5836 | |
5837 | to->truesize += delta; |
5838 | to->len += len; |
5839 | to->data_len += len; |
5840 | |
5841 | *delta_truesize = delta; |
5842 | return true; |
5843 | } |
5844 | EXPORT_SYMBOL(skb_try_coalesce); |
5845 | |
5846 | /** |
5847 | * skb_scrub_packet - scrub an skb |
5848 | * |
5849 | * @skb: buffer to clean |
5850 | * @xnet: packet is crossing netns |
5851 | * |
5852 | * skb_scrub_packet can be used after encapsulating or decapsulting a packet |
5853 | * into/from a tunnel. Some information have to be cleared during these |
5854 | * operations. |
5855 | * skb_scrub_packet can also be used to clean a skb before injecting it in |
5856 | * another namespace (@xnet == true). We have to clear all information in the |
5857 | * skb that could impact namespace isolation. |
5858 | */ |
5859 | void skb_scrub_packet(struct sk_buff *skb, bool xnet) |
5860 | { |
5861 | skb->pkt_type = PACKET_HOST; |
5862 | skb->skb_iif = 0; |
5863 | skb->ignore_df = 0; |
5864 | skb_dst_drop(skb); |
5865 | skb_ext_reset(skb); |
5866 | nf_reset_ct(skb); |
5867 | nf_reset_trace(skb); |
5868 | |
5869 | #ifdef CONFIG_NET_SWITCHDEV |
5870 | skb->offload_fwd_mark = 0; |
5871 | skb->offload_l3_fwd_mark = 0; |
5872 | #endif |
5873 | |
5874 | if (!xnet) |
5875 | return; |
5876 | |
5877 | ipvs_reset(skb); |
5878 | skb->mark = 0; |
5879 | skb_clear_tstamp(skb); |
5880 | } |
5881 | EXPORT_SYMBOL_GPL(skb_scrub_packet); |
5882 | |
5883 | static struct sk_buff *(struct sk_buff *skb) |
5884 | { |
5885 | int mac_len, meta_len; |
5886 | void *meta; |
5887 | |
5888 | if (skb_cow(skb, headroom: skb_headroom(skb)) < 0) { |
5889 | kfree_skb(skb); |
5890 | return NULL; |
5891 | } |
5892 | |
5893 | mac_len = skb->data - skb_mac_header(skb); |
5894 | if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { |
5895 | memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), |
5896 | mac_len - VLAN_HLEN - ETH_TLEN); |
5897 | } |
5898 | |
5899 | meta_len = skb_metadata_len(skb); |
5900 | if (meta_len) { |
5901 | meta = skb_metadata_end(skb) - meta_len; |
5902 | memmove(meta + VLAN_HLEN, meta, meta_len); |
5903 | } |
5904 | |
5905 | skb->mac_header += VLAN_HLEN; |
5906 | return skb; |
5907 | } |
5908 | |
5909 | struct sk_buff *skb_vlan_untag(struct sk_buff *skb) |
5910 | { |
5911 | struct vlan_hdr *vhdr; |
5912 | u16 vlan_tci; |
5913 | |
5914 | if (unlikely(skb_vlan_tag_present(skb))) { |
5915 | /* vlan_tci is already set-up so leave this for another time */ |
5916 | return skb; |
5917 | } |
5918 | |
5919 | skb = skb_share_check(skb, GFP_ATOMIC); |
5920 | if (unlikely(!skb)) |
5921 | goto err_free; |
5922 | /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ |
5923 | if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) |
5924 | goto err_free; |
5925 | |
5926 | vhdr = (struct vlan_hdr *)skb->data; |
5927 | vlan_tci = ntohs(vhdr->h_vlan_TCI); |
5928 | __vlan_hwaccel_put_tag(skb, vlan_proto: skb->protocol, vlan_tci); |
5929 | |
5930 | skb_pull_rcsum(skb, VLAN_HLEN); |
5931 | vlan_set_encap_proto(skb, vhdr); |
5932 | |
5933 | skb = skb_reorder_vlan_header(skb); |
5934 | if (unlikely(!skb)) |
5935 | goto err_free; |
5936 | |
5937 | skb_reset_network_header(skb); |
5938 | if (!skb_transport_header_was_set(skb)) |
5939 | skb_reset_transport_header(skb); |
5940 | skb_reset_mac_len(skb); |
5941 | |
5942 | return skb; |
5943 | |
5944 | err_free: |
5945 | kfree_skb(skb); |
5946 | return NULL; |
5947 | } |
5948 | EXPORT_SYMBOL(skb_vlan_untag); |
5949 | |
5950 | int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) |
5951 | { |
5952 | if (!pskb_may_pull(skb, len: write_len)) |
5953 | return -ENOMEM; |
5954 | |
5955 | if (!skb_cloned(skb) || skb_clone_writable(skb, len: write_len)) |
5956 | return 0; |
5957 | |
5958 | return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); |
5959 | } |
5960 | EXPORT_SYMBOL(skb_ensure_writable); |
5961 | |
5962 | /* remove VLAN header from packet and update csum accordingly. |
5963 | * expects a non skb_vlan_tag_present skb with a vlan tag payload |
5964 | */ |
5965 | int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) |
5966 | { |
5967 | int offset = skb->data - skb_mac_header(skb); |
5968 | int err; |
5969 | |
5970 | if (WARN_ONCE(offset, |
5971 | "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n" , |
5972 | offset)) { |
5973 | return -EINVAL; |
5974 | } |
5975 | |
5976 | err = skb_ensure_writable(skb, VLAN_ETH_HLEN); |
5977 | if (unlikely(err)) |
5978 | return err; |
5979 | |
5980 | skb_postpull_rcsum(skb, start: skb->data + (2 * ETH_ALEN), VLAN_HLEN); |
5981 | |
5982 | vlan_remove_tag(skb, vlan_tci); |
5983 | |
5984 | skb->mac_header += VLAN_HLEN; |
5985 | |
5986 | if (skb_network_offset(skb) < ETH_HLEN) |
5987 | skb_set_network_header(skb, ETH_HLEN); |
5988 | |
5989 | skb_reset_mac_len(skb); |
5990 | |
5991 | return err; |
5992 | } |
5993 | EXPORT_SYMBOL(__skb_vlan_pop); |
5994 | |
5995 | /* Pop a vlan tag either from hwaccel or from payload. |
5996 | * Expects skb->data at mac header. |
5997 | */ |
5998 | int skb_vlan_pop(struct sk_buff *skb) |
5999 | { |
6000 | u16 vlan_tci; |
6001 | __be16 vlan_proto; |
6002 | int err; |
6003 | |
6004 | if (likely(skb_vlan_tag_present(skb))) { |
6005 | __vlan_hwaccel_clear_tag(skb); |
6006 | } else { |
6007 | if (unlikely(!eth_type_vlan(skb->protocol))) |
6008 | return 0; |
6009 | |
6010 | err = __skb_vlan_pop(skb, &vlan_tci); |
6011 | if (err) |
6012 | return err; |
6013 | } |
6014 | /* move next vlan tag to hw accel tag */ |
6015 | if (likely(!eth_type_vlan(skb->protocol))) |
6016 | return 0; |
6017 | |
6018 | vlan_proto = skb->protocol; |
6019 | err = __skb_vlan_pop(skb, &vlan_tci); |
6020 | if (unlikely(err)) |
6021 | return err; |
6022 | |
6023 | __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); |
6024 | return 0; |
6025 | } |
6026 | EXPORT_SYMBOL(skb_vlan_pop); |
6027 | |
6028 | /* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). |
6029 | * Expects skb->data at mac header. |
6030 | */ |
6031 | int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) |
6032 | { |
6033 | if (skb_vlan_tag_present(skb)) { |
6034 | int offset = skb->data - skb_mac_header(skb); |
6035 | int err; |
6036 | |
6037 | if (WARN_ONCE(offset, |
6038 | "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n" , |
6039 | offset)) { |
6040 | return -EINVAL; |
6041 | } |
6042 | |
6043 | err = __vlan_insert_tag(skb, vlan_proto: skb->vlan_proto, |
6044 | skb_vlan_tag_get(skb)); |
6045 | if (err) |
6046 | return err; |
6047 | |
6048 | skb->protocol = skb->vlan_proto; |
6049 | skb->mac_len += VLAN_HLEN; |
6050 | |
6051 | skb_postpush_rcsum(skb, start: skb->data + (2 * ETH_ALEN), VLAN_HLEN); |
6052 | } |
6053 | __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); |
6054 | return 0; |
6055 | } |
6056 | EXPORT_SYMBOL(skb_vlan_push); |
6057 | |
6058 | /** |
6059 | * skb_eth_pop() - Drop the Ethernet header at the head of a packet |
6060 | * |
6061 | * @skb: Socket buffer to modify |
6062 | * |
6063 | * Drop the Ethernet header of @skb. |
6064 | * |
6065 | * Expects that skb->data points to the mac header and that no VLAN tags are |
6066 | * present. |
6067 | * |
6068 | * Returns 0 on success, -errno otherwise. |
6069 | */ |
6070 | int skb_eth_pop(struct sk_buff *skb) |
6071 | { |
6072 | if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || |
6073 | skb_network_offset(skb) < ETH_HLEN) |
6074 | return -EPROTO; |
6075 | |
6076 | skb_pull_rcsum(skb, ETH_HLEN); |
6077 | skb_reset_mac_header(skb); |
6078 | skb_reset_mac_len(skb); |
6079 | |
6080 | return 0; |
6081 | } |
6082 | EXPORT_SYMBOL(skb_eth_pop); |
6083 | |
6084 | /** |
6085 | * skb_eth_push() - Add a new Ethernet header at the head of a packet |
6086 | * |
6087 | * @skb: Socket buffer to modify |
6088 | * @dst: Destination MAC address of the new header |
6089 | * @src: Source MAC address of the new header |
6090 | * |
6091 | * Prepend @skb with a new Ethernet header. |
6092 | * |
6093 | * Expects that skb->data points to the mac header, which must be empty. |
6094 | * |
6095 | * Returns 0 on success, -errno otherwise. |
6096 | */ |
6097 | int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, |
6098 | const unsigned char *src) |
6099 | { |
6100 | struct ethhdr *eth; |
6101 | int err; |
6102 | |
6103 | if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) |
6104 | return -EPROTO; |
6105 | |
6106 | err = skb_cow_head(skb, headroom: sizeof(*eth)); |
6107 | if (err < 0) |
6108 | return err; |
6109 | |
6110 | skb_push(skb, sizeof(*eth)); |
6111 | skb_reset_mac_header(skb); |
6112 | skb_reset_mac_len(skb); |
6113 | |
6114 | eth = eth_hdr(skb); |
6115 | ether_addr_copy(dst: eth->h_dest, src: dst); |
6116 | ether_addr_copy(dst: eth->h_source, src); |
6117 | eth->h_proto = skb->protocol; |
6118 | |
6119 | skb_postpush_rcsum(skb, start: eth, len: sizeof(*eth)); |
6120 | |
6121 | return 0; |
6122 | } |
6123 | EXPORT_SYMBOL(skb_eth_push); |
6124 | |
6125 | /* Update the ethertype of hdr and the skb csum value if required. */ |
6126 | static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, |
6127 | __be16 ethertype) |
6128 | { |
6129 | if (skb->ip_summed == CHECKSUM_COMPLETE) { |
6130 | __be16 diff[] = { ~hdr->h_proto, ethertype }; |
6131 | |
6132 | skb->csum = csum_partial(buff: (char *)diff, len: sizeof(diff), sum: skb->csum); |
6133 | } |
6134 | |
6135 | hdr->h_proto = ethertype; |
6136 | } |
6137 | |
6138 | /** |
6139 | * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of |
6140 | * the packet |
6141 | * |
6142 | * @skb: buffer |
6143 | * @mpls_lse: MPLS label stack entry to push |
6144 | * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) |
6145 | * @mac_len: length of the MAC header |
6146 | * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is |
6147 | * ethernet |
6148 | * |
6149 | * Expects skb->data at mac header. |
6150 | * |
6151 | * Returns 0 on success, -errno otherwise. |
6152 | */ |
6153 | int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, |
6154 | int mac_len, bool ethernet) |
6155 | { |
6156 | struct mpls_shim_hdr *lse; |
6157 | int err; |
6158 | |
6159 | if (unlikely(!eth_p_mpls(mpls_proto))) |
6160 | return -EINVAL; |
6161 | |
6162 | /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ |
6163 | if (skb->encapsulation) |
6164 | return -EINVAL; |
6165 | |
6166 | err = skb_cow_head(skb, MPLS_HLEN); |
6167 | if (unlikely(err)) |
6168 | return err; |
6169 | |
6170 | if (!skb->inner_protocol) { |
6171 | skb_set_inner_network_header(skb, offset: skb_network_offset(skb)); |
6172 | skb_set_inner_protocol(skb, protocol: skb->protocol); |
6173 | } |
6174 | |
6175 | skb_push(skb, MPLS_HLEN); |
6176 | memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), |
6177 | mac_len); |
6178 | skb_reset_mac_header(skb); |
6179 | skb_set_network_header(skb, offset: mac_len); |
6180 | skb_reset_mac_len(skb); |
6181 | |
6182 | lse = mpls_hdr(skb); |
6183 | lse->label_stack_entry = mpls_lse; |
6184 | skb_postpush_rcsum(skb, start: lse, MPLS_HLEN); |
6185 | |
6186 | if (ethernet && mac_len >= ETH_HLEN) |
6187 | skb_mod_eth_type(skb, hdr: eth_hdr(skb), ethertype: mpls_proto); |
6188 | skb->protocol = mpls_proto; |
6189 | |
6190 | return 0; |
6191 | } |
6192 | EXPORT_SYMBOL_GPL(skb_mpls_push); |
6193 | |
6194 | /** |
6195 | * skb_mpls_pop() - pop the outermost MPLS header |
6196 | * |
6197 | * @skb: buffer |
6198 | * @next_proto: ethertype of header after popped MPLS header |
6199 | * @mac_len: length of the MAC header |
6200 | * @ethernet: flag to indicate if the packet is ethernet |
6201 | * |
6202 | * Expects skb->data at mac header. |
6203 | * |
6204 | * Returns 0 on success, -errno otherwise. |
6205 | */ |
6206 | int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, |
6207 | bool ethernet) |
6208 | { |
6209 | int err; |
6210 | |
6211 | if (unlikely(!eth_p_mpls(skb->protocol))) |
6212 | return 0; |
6213 | |
6214 | err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); |
6215 | if (unlikely(err)) |
6216 | return err; |
6217 | |
6218 | skb_postpull_rcsum(skb, start: mpls_hdr(skb), MPLS_HLEN); |
6219 | memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), |
6220 | mac_len); |
6221 | |
6222 | __skb_pull(skb, MPLS_HLEN); |
6223 | skb_reset_mac_header(skb); |
6224 | skb_set_network_header(skb, offset: mac_len); |
6225 | |
6226 | if (ethernet && mac_len >= ETH_HLEN) { |
6227 | struct ethhdr *hdr; |
6228 | |
6229 | /* use mpls_hdr() to get ethertype to account for VLANs. */ |
6230 | hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); |
6231 | skb_mod_eth_type(skb, hdr, ethertype: next_proto); |
6232 | } |
6233 | skb->protocol = next_proto; |
6234 | |
6235 | return 0; |
6236 | } |
6237 | EXPORT_SYMBOL_GPL(skb_mpls_pop); |
6238 | |
6239 | /** |
6240 | * skb_mpls_update_lse() - modify outermost MPLS header and update csum |
6241 | * |
6242 | * @skb: buffer |
6243 | * @mpls_lse: new MPLS label stack entry to update to |
6244 | * |
6245 | * Expects skb->data at mac header. |
6246 | * |
6247 | * Returns 0 on success, -errno otherwise. |
6248 | */ |
6249 | int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) |
6250 | { |
6251 | int err; |
6252 | |
6253 | if (unlikely(!eth_p_mpls(skb->protocol))) |
6254 | return -EINVAL; |
6255 | |
6256 | err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); |
6257 | if (unlikely(err)) |
6258 | return err; |
6259 | |
6260 | if (skb->ip_summed == CHECKSUM_COMPLETE) { |
6261 | __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; |
6262 | |
6263 | skb->csum = csum_partial(buff: (char *)diff, len: sizeof(diff), sum: skb->csum); |
6264 | } |
6265 | |
6266 | mpls_hdr(skb)->label_stack_entry = mpls_lse; |
6267 | |
6268 | return 0; |
6269 | } |
6270 | EXPORT_SYMBOL_GPL(skb_mpls_update_lse); |
6271 | |
6272 | /** |
6273 | * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header |
6274 | * |
6275 | * @skb: buffer |
6276 | * |
6277 | * Expects skb->data at mac header. |
6278 | * |
6279 | * Returns 0 on success, -errno otherwise. |
6280 | */ |
6281 | int skb_mpls_dec_ttl(struct sk_buff *skb) |
6282 | { |
6283 | u32 lse; |
6284 | u8 ttl; |
6285 | |
6286 | if (unlikely(!eth_p_mpls(skb->protocol))) |
6287 | return -EINVAL; |
6288 | |
6289 | if (!pskb_may_pull(skb, len: skb_network_offset(skb) + MPLS_HLEN)) |
6290 | return -ENOMEM; |
6291 | |
6292 | lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); |
6293 | ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; |
6294 | if (!--ttl) |
6295 | return -EINVAL; |
6296 | |
6297 | lse &= ~MPLS_LS_TTL_MASK; |
6298 | lse |= ttl << MPLS_LS_TTL_SHIFT; |
6299 | |
6300 | return skb_mpls_update_lse(skb, cpu_to_be32(lse)); |
6301 | } |
6302 | EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); |
6303 | |
6304 | /** |
6305 | * alloc_skb_with_frags - allocate skb with page frags |
6306 | * |
6307 | * @header_len: size of linear part |
6308 | * @data_len: needed length in frags |
6309 | * @order: max page order desired. |
6310 | * @errcode: pointer to error code if any |
6311 | * @gfp_mask: allocation mask |
6312 | * |
6313 | * This can be used to allocate a paged skb, given a maximal order for frags. |
6314 | */ |
6315 | struct sk_buff *alloc_skb_with_frags(unsigned long , |
6316 | unsigned long data_len, |
6317 | int order, |
6318 | int *errcode, |
6319 | gfp_t gfp_mask) |
6320 | { |
6321 | unsigned long chunk; |
6322 | struct sk_buff *skb; |
6323 | struct page *page; |
6324 | int nr_frags = 0; |
6325 | |
6326 | *errcode = -EMSGSIZE; |
6327 | if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) |
6328 | return NULL; |
6329 | |
6330 | *errcode = -ENOBUFS; |
6331 | skb = alloc_skb(size: header_len, priority: gfp_mask); |
6332 | if (!skb) |
6333 | return NULL; |
6334 | |
6335 | while (data_len) { |
6336 | if (nr_frags == MAX_SKB_FRAGS - 1) |
6337 | goto failure; |
6338 | while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) |
6339 | order--; |
6340 | |
6341 | if (order) { |
6342 | page = alloc_pages(gfp: (gfp_mask & ~__GFP_DIRECT_RECLAIM) | |
6343 | __GFP_COMP | |
6344 | __GFP_NOWARN, |
6345 | order); |
6346 | if (!page) { |
6347 | order--; |
6348 | continue; |
6349 | } |
6350 | } else { |
6351 | page = alloc_page(gfp_mask); |
6352 | if (!page) |
6353 | goto failure; |
6354 | } |
6355 | chunk = min_t(unsigned long, data_len, |
6356 | PAGE_SIZE << order); |
6357 | skb_fill_page_desc(skb, i: nr_frags, page, off: 0, size: chunk); |
6358 | nr_frags++; |
6359 | skb->truesize += (PAGE_SIZE << order); |
6360 | data_len -= chunk; |
6361 | } |
6362 | return skb; |
6363 | |
6364 | failure: |
6365 | kfree_skb(skb); |
6366 | return NULL; |
6367 | } |
6368 | EXPORT_SYMBOL(alloc_skb_with_frags); |
6369 | |
6370 | /* carve out the first off bytes from skb when off < headlen */ |
6371 | static int (struct sk_buff *skb, const u32 off, |
6372 | const int headlen, gfp_t gfp_mask) |
6373 | { |
6374 | int i; |
6375 | unsigned int size = skb_end_offset(skb); |
6376 | int new_hlen = headlen - off; |
6377 | u8 *data; |
6378 | |
6379 | if (skb_pfmemalloc(skb)) |
6380 | gfp_mask |= __GFP_MEMALLOC; |
6381 | |
6382 | data = kmalloc_reserve(size: &size, flags: gfp_mask, NUMA_NO_NODE, NULL); |
6383 | if (!data) |
6384 | return -ENOMEM; |
6385 | size = SKB_WITH_OVERHEAD(size); |
6386 | |
6387 | /* Copy real data, and all frags */ |
6388 | skb_copy_from_linear_data_offset(skb, offset: off, to: data, len: new_hlen); |
6389 | skb->len -= off; |
6390 | |
6391 | memcpy((struct skb_shared_info *)(data + size), |
6392 | skb_shinfo(skb), |
6393 | offsetof(struct skb_shared_info, |
6394 | frags[skb_shinfo(skb)->nr_frags])); |
6395 | if (skb_cloned(skb)) { |
6396 | /* drop the old head gracefully */ |
6397 | if (skb_orphan_frags(skb, gfp_mask)) { |
6398 | skb_kfree_head(head: data, end_offset: size); |
6399 | return -ENOMEM; |
6400 | } |
6401 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) |
6402 | skb_frag_ref(skb, f: i); |
6403 | if (skb_has_frag_list(skb)) |
6404 | skb_clone_fraglist(skb); |
6405 | skb_release_data(skb, reason: SKB_CONSUMED, napi_safe: false); |
6406 | } else { |
6407 | /* we can reuse existing recount- all we did was |
6408 | * relocate values |
6409 | */ |
6410 | skb_free_head(skb, napi_safe: false); |
6411 | } |
6412 | |
6413 | skb->head = data; |
6414 | skb->data = data; |
6415 | skb->head_frag = 0; |
6416 | skb_set_end_offset(skb, offset: size); |
6417 | skb_set_tail_pointer(skb, offset: skb_headlen(skb)); |
6418 | skb_headers_offset_update(skb, 0); |
6419 | skb->cloned = 0; |
6420 | skb->hdr_len = 0; |
6421 | skb->nohdr = 0; |
6422 | atomic_set(v: &skb_shinfo(skb)->dataref, i: 1); |
6423 | |
6424 | return 0; |
6425 | } |
6426 | |
6427 | static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); |
6428 | |
6429 | /* carve out the first eat bytes from skb's frag_list. May recurse into |
6430 | * pskb_carve() |
6431 | */ |
6432 | static int pskb_carve_frag_list(struct sk_buff *skb, |
6433 | struct skb_shared_info *shinfo, int eat, |
6434 | gfp_t gfp_mask) |
6435 | { |
6436 | struct sk_buff *list = shinfo->frag_list; |
6437 | struct sk_buff *clone = NULL; |
6438 | struct sk_buff *insp = NULL; |
6439 | |
6440 | do { |
6441 | if (!list) { |
6442 | pr_err("Not enough bytes to eat. Want %d\n" , eat); |
6443 | return -EFAULT; |
6444 | } |
6445 | if (list->len <= eat) { |
6446 | /* Eaten as whole. */ |
6447 | eat -= list->len; |
6448 | list = list->next; |
6449 | insp = list; |
6450 | } else { |
6451 | /* Eaten partially. */ |
6452 | if (skb_shared(skb: list)) { |
6453 | clone = skb_clone(list, gfp_mask); |
6454 | if (!clone) |
6455 | return -ENOMEM; |
6456 | insp = list->next; |
6457 | list = clone; |
6458 | } else { |
6459 | /* This may be pulled without problems. */ |
6460 | insp = list; |
6461 | } |
6462 | if (pskb_carve(skb: list, off: eat, gfp: gfp_mask) < 0) { |
6463 | kfree_skb(skb: clone); |
6464 | return -ENOMEM; |
6465 | } |
6466 | break; |
6467 | } |
6468 | } while (eat); |
6469 | |
6470 | /* Free pulled out fragments. */ |
6471 | while ((list = shinfo->frag_list) != insp) { |
6472 | shinfo->frag_list = list->next; |
6473 | consume_skb(list); |
6474 | } |
6475 | /* And insert new clone at head. */ |
6476 | if (clone) { |
6477 | clone->next = list; |
6478 | shinfo->frag_list = clone; |
6479 | } |
6480 | return 0; |
6481 | } |
6482 | |
6483 | /* carve off first len bytes from skb. Split line (off) is in the |
6484 | * non-linear part of skb |
6485 | */ |
6486 | static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, |
6487 | int pos, gfp_t gfp_mask) |
6488 | { |
6489 | int i, k = 0; |
6490 | unsigned int size = skb_end_offset(skb); |
6491 | u8 *data; |
6492 | const int nfrags = skb_shinfo(skb)->nr_frags; |
6493 | struct skb_shared_info *shinfo; |
6494 | |
6495 | if (skb_pfmemalloc(skb)) |
6496 | gfp_mask |= __GFP_MEMALLOC; |
6497 | |
6498 | data = kmalloc_reserve(size: &size, flags: gfp_mask, NUMA_NO_NODE, NULL); |
6499 | if (!data) |
6500 | return -ENOMEM; |
6501 | size = SKB_WITH_OVERHEAD(size); |
6502 | |
6503 | memcpy((struct skb_shared_info *)(data + size), |
6504 | skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); |
6505 | if (skb_orphan_frags(skb, gfp_mask)) { |
6506 | skb_kfree_head(head: data, end_offset: size); |
6507 | return -ENOMEM; |
6508 | } |
6509 | shinfo = (struct skb_shared_info *)(data + size); |
6510 | for (i = 0; i < nfrags; i++) { |
6511 | int fsize = skb_frag_size(frag: &skb_shinfo(skb)->frags[i]); |
6512 | |
6513 | if (pos + fsize > off) { |
6514 | shinfo->frags[k] = skb_shinfo(skb)->frags[i]; |
6515 | |
6516 | if (pos < off) { |
6517 | /* Split frag. |
6518 | * We have two variants in this case: |
6519 | * 1. Move all the frag to the second |
6520 | * part, if it is possible. F.e. |
6521 | * this approach is mandatory for TUX, |
6522 | * where splitting is expensive. |
6523 | * 2. Split is accurately. We make this. |
6524 | */ |
6525 | skb_frag_off_add(frag: &shinfo->frags[0], delta: off - pos); |
6526 | skb_frag_size_sub(frag: &shinfo->frags[0], delta: off - pos); |
6527 | } |
6528 | skb_frag_ref(skb, f: i); |
6529 | k++; |
6530 | } |
6531 | pos += fsize; |
6532 | } |
6533 | shinfo->nr_frags = k; |
6534 | if (skb_has_frag_list(skb)) |
6535 | skb_clone_fraglist(skb); |
6536 | |
6537 | /* split line is in frag list */ |
6538 | if (k == 0 && pskb_carve_frag_list(skb, shinfo, eat: off - pos, gfp_mask)) { |
6539 | /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ |
6540 | if (skb_has_frag_list(skb)) |
6541 | kfree_skb_list(skb_shinfo(skb)->frag_list); |
6542 | skb_kfree_head(head: data, end_offset: size); |
6543 | return -ENOMEM; |
6544 | } |
6545 | skb_release_data(skb, reason: SKB_CONSUMED, napi_safe: false); |
6546 | |
6547 | skb->head = data; |
6548 | skb->head_frag = 0; |
6549 | skb->data = data; |
6550 | skb_set_end_offset(skb, offset: size); |
6551 | skb_reset_tail_pointer(skb); |
6552 | skb_headers_offset_update(skb, 0); |
6553 | skb->cloned = 0; |
6554 | skb->hdr_len = 0; |
6555 | skb->nohdr = 0; |
6556 | skb->len -= off; |
6557 | skb->data_len = skb->len; |
6558 | atomic_set(v: &skb_shinfo(skb)->dataref, i: 1); |
6559 | return 0; |
6560 | } |
6561 | |
6562 | /* remove len bytes from the beginning of the skb */ |
6563 | static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) |
6564 | { |
6565 | int headlen = skb_headlen(skb); |
6566 | |
6567 | if (len < headlen) |
6568 | return pskb_carve_inside_header(skb, off: len, headlen, gfp_mask: gfp); |
6569 | else |
6570 | return pskb_carve_inside_nonlinear(skb, off: len, pos: headlen, gfp_mask: gfp); |
6571 | } |
6572 | |
6573 | /* Extract to_copy bytes starting at off from skb, and return this in |
6574 | * a new skb |
6575 | */ |
6576 | struct sk_buff *(struct sk_buff *skb, int off, |
6577 | int to_copy, gfp_t gfp) |
6578 | { |
6579 | struct sk_buff *clone = skb_clone(skb, gfp); |
6580 | |
6581 | if (!clone) |
6582 | return NULL; |
6583 | |
6584 | if (pskb_carve(skb: clone, len: off, gfp) < 0 || |
6585 | pskb_trim(skb: clone, len: to_copy)) { |
6586 | kfree_skb(skb: clone); |
6587 | return NULL; |
6588 | } |
6589 | return clone; |
6590 | } |
6591 | EXPORT_SYMBOL(pskb_extract); |
6592 | |
6593 | /** |
6594 | * skb_condense - try to get rid of fragments/frag_list if possible |
6595 | * @skb: buffer |
6596 | * |
6597 | * Can be used to save memory before skb is added to a busy queue. |
6598 | * If packet has bytes in frags and enough tail room in skb->head, |
6599 | * pull all of them, so that we can free the frags right now and adjust |
6600 | * truesize. |
6601 | * Notes: |
6602 | * We do not reallocate skb->head thus can not fail. |
6603 | * Caller must re-evaluate skb->truesize if needed. |
6604 | */ |
6605 | void skb_condense(struct sk_buff *skb) |
6606 | { |
6607 | if (skb->data_len) { |
6608 | if (skb->data_len > skb->end - skb->tail || |
6609 | skb_cloned(skb)) |
6610 | return; |
6611 | |
6612 | /* Nice, we can free page frag(s) right now */ |
6613 | __pskb_pull_tail(skb, skb->data_len); |
6614 | } |
6615 | /* At this point, skb->truesize might be over estimated, |
6616 | * because skb had a fragment, and fragments do not tell |
6617 | * their truesize. |
6618 | * When we pulled its content into skb->head, fragment |
6619 | * was freed, but __pskb_pull_tail() could not possibly |
6620 | * adjust skb->truesize, not knowing the frag truesize. |
6621 | */ |
6622 | skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); |
6623 | } |
6624 | EXPORT_SYMBOL(skb_condense); |
6625 | |
6626 | #ifdef CONFIG_SKB_EXTENSIONS |
6627 | static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) |
6628 | { |
6629 | return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); |
6630 | } |
6631 | |
6632 | /** |
6633 | * __skb_ext_alloc - allocate a new skb extensions storage |
6634 | * |
6635 | * @flags: See kmalloc(). |
6636 | * |
6637 | * Returns the newly allocated pointer. The pointer can later attached to a |
6638 | * skb via __skb_ext_set(). |
6639 | * Note: caller must handle the skb_ext as an opaque data. |
6640 | */ |
6641 | struct skb_ext *__skb_ext_alloc(gfp_t flags) |
6642 | { |
6643 | struct skb_ext *new = kmem_cache_alloc(cachep: skbuff_ext_cache, flags); |
6644 | |
6645 | if (new) { |
6646 | memset(new->offset, 0, sizeof(new->offset)); |
6647 | refcount_set(r: &new->refcnt, n: 1); |
6648 | } |
6649 | |
6650 | return new; |
6651 | } |
6652 | |
6653 | static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, |
6654 | unsigned int old_active) |
6655 | { |
6656 | struct skb_ext *new; |
6657 | |
6658 | if (refcount_read(r: &old->refcnt) == 1) |
6659 | return old; |
6660 | |
6661 | new = kmem_cache_alloc(cachep: skbuff_ext_cache, GFP_ATOMIC); |
6662 | if (!new) |
6663 | return NULL; |
6664 | |
6665 | memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); |
6666 | refcount_set(r: &new->refcnt, n: 1); |
6667 | |
6668 | #ifdef CONFIG_XFRM |
6669 | if (old_active & (1 << SKB_EXT_SEC_PATH)) { |
6670 | struct sec_path *sp = skb_ext_get_ptr(ext: old, id: SKB_EXT_SEC_PATH); |
6671 | unsigned int i; |
6672 | |
6673 | for (i = 0; i < sp->len; i++) |
6674 | xfrm_state_hold(x: sp->xvec[i]); |
6675 | } |
6676 | #endif |
6677 | __skb_ext_put(ext: old); |
6678 | return new; |
6679 | } |
6680 | |
6681 | /** |
6682 | * __skb_ext_set - attach the specified extension storage to this skb |
6683 | * @skb: buffer |
6684 | * @id: extension id |
6685 | * @ext: extension storage previously allocated via __skb_ext_alloc() |
6686 | * |
6687 | * Existing extensions, if any, are cleared. |
6688 | * |
6689 | * Returns the pointer to the extension. |
6690 | */ |
6691 | void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, |
6692 | struct skb_ext *ext) |
6693 | { |
6694 | unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); |
6695 | |
6696 | skb_ext_put(skb); |
6697 | newlen = newoff + skb_ext_type_len[id]; |
6698 | ext->chunks = newlen; |
6699 | ext->offset[id] = newoff; |
6700 | skb->extensions = ext; |
6701 | skb->active_extensions = 1 << id; |
6702 | return skb_ext_get_ptr(ext, id); |
6703 | } |
6704 | |
6705 | /** |
6706 | * skb_ext_add - allocate space for given extension, COW if needed |
6707 | * @skb: buffer |
6708 | * @id: extension to allocate space for |
6709 | * |
6710 | * Allocates enough space for the given extension. |
6711 | * If the extension is already present, a pointer to that extension |
6712 | * is returned. |
6713 | * |
6714 | * If the skb was cloned, COW applies and the returned memory can be |
6715 | * modified without changing the extension space of clones buffers. |
6716 | * |
6717 | * Returns pointer to the extension or NULL on allocation failure. |
6718 | */ |
6719 | void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) |
6720 | { |
6721 | struct skb_ext *new, *old = NULL; |
6722 | unsigned int newlen, newoff; |
6723 | |
6724 | if (skb->active_extensions) { |
6725 | old = skb->extensions; |
6726 | |
6727 | new = skb_ext_maybe_cow(old, old_active: skb->active_extensions); |
6728 | if (!new) |
6729 | return NULL; |
6730 | |
6731 | if (__skb_ext_exist(ext: new, i: id)) |
6732 | goto set_active; |
6733 | |
6734 | newoff = new->chunks; |
6735 | } else { |
6736 | newoff = SKB_EXT_CHUNKSIZEOF(*new); |
6737 | |
6738 | new = __skb_ext_alloc(GFP_ATOMIC); |
6739 | if (!new) |
6740 | return NULL; |
6741 | } |
6742 | |
6743 | newlen = newoff + skb_ext_type_len[id]; |
6744 | new->chunks = newlen; |
6745 | new->offset[id] = newoff; |
6746 | set_active: |
6747 | skb->slow_gro = 1; |
6748 | skb->extensions = new; |
6749 | skb->active_extensions |= 1 << id; |
6750 | return skb_ext_get_ptr(ext: new, id); |
6751 | } |
6752 | EXPORT_SYMBOL(skb_ext_add); |
6753 | |
#ifdef CONFIG_XFRM
/* Call xfrm_state_put() on each of the first sp->len entries of sp->xvec. */
static void skb_ext_put_sp(struct sec_path *sp)
{
	unsigned int idx = 0;

	while (idx < sp->len) {
		xfrm_state_put(sp->xvec[idx]);
		idx++;
	}
}
#endif
6763 | |
#ifdef CONFIG_MCTP_FLOWS
/* Drop the flow's key via mctp_key_unref(); a flow without a key is a no-op. */
static void skb_ext_put_mctp(struct mctp_flow *flow)
{
	if (!flow->key)
		return;

	mctp_key_unref(flow->key);
}
#endif
6771 | |
6772 | void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) |
6773 | { |
6774 | struct skb_ext *ext = skb->extensions; |
6775 | |
6776 | skb->active_extensions &= ~(1 << id); |
6777 | if (skb->active_extensions == 0) { |
6778 | skb->extensions = NULL; |
6779 | __skb_ext_put(ext); |
6780 | #ifdef CONFIG_XFRM |
6781 | } else if (id == SKB_EXT_SEC_PATH && |
6782 | refcount_read(r: &ext->refcnt) == 1) { |
6783 | struct sec_path *sp = skb_ext_get_ptr(ext, id: SKB_EXT_SEC_PATH); |
6784 | |
6785 | skb_ext_put_sp(sp); |
6786 | sp->len = 0; |
6787 | #endif |
6788 | } |
6789 | } |
6790 | EXPORT_SYMBOL(__skb_ext_del); |
6791 | |
6792 | void __skb_ext_put(struct skb_ext *ext) |
6793 | { |
6794 | /* If this is last clone, nothing can increment |
6795 | * it after check passes. Avoids one atomic op. |
6796 | */ |
6797 | if (refcount_read(r: &ext->refcnt) == 1) |
6798 | goto free_now; |
6799 | |
6800 | if (!refcount_dec_and_test(r: &ext->refcnt)) |
6801 | return; |
6802 | free_now: |
6803 | #ifdef CONFIG_XFRM |
6804 | if (__skb_ext_exist(ext, i: SKB_EXT_SEC_PATH)) |
6805 | skb_ext_put_sp(sp: skb_ext_get_ptr(ext, id: SKB_EXT_SEC_PATH)); |
6806 | #endif |
6807 | #ifdef CONFIG_MCTP_FLOWS |
6808 | if (__skb_ext_exist(ext, i: SKB_EXT_MCTP)) |
6809 | skb_ext_put_mctp(flow: skb_ext_get_ptr(ext, id: SKB_EXT_MCTP)); |
6810 | #endif |
6811 | |
6812 | kmem_cache_free(s: skbuff_ext_cache, objp: ext); |
6813 | } |
6814 | EXPORT_SYMBOL(__skb_ext_put); |
6815 | #endif /* CONFIG_SKB_EXTENSIONS */ |
6816 | |
6817 | /** |
6818 | * skb_attempt_defer_free - queue skb for remote freeing |
6819 | * @skb: buffer |
6820 | * |
6821 | * Put @skb in a per-cpu list, using the cpu which |
6822 | * allocated the skb/pages to reduce false sharing |
6823 | * and memory zone spinlock contention. |
6824 | */ |
6825 | void skb_attempt_defer_free(struct sk_buff *skb) |
6826 | { |
6827 | int cpu = skb->alloc_cpu; |
6828 | struct softnet_data *sd; |
6829 | unsigned int defer_max; |
6830 | bool kick; |
6831 | |
6832 | if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || |
6833 | !cpu_online(cpu) || |
6834 | cpu == raw_smp_processor_id()) { |
6835 | nodefer: __kfree_skb(skb); |
6836 | return; |
6837 | } |
6838 | |
6839 | DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); |
6840 | DEBUG_NET_WARN_ON_ONCE(skb->destructor); |
6841 | |
6842 | sd = &per_cpu(softnet_data, cpu); |
6843 | defer_max = READ_ONCE(sysctl_skb_defer_max); |
6844 | if (READ_ONCE(sd->defer_count) >= defer_max) |
6845 | goto nodefer; |
6846 | |
6847 | spin_lock_bh(lock: &sd->defer_lock); |
6848 | /* Send an IPI every time queue reaches half capacity. */ |
6849 | kick = sd->defer_count == (defer_max >> 1); |
6850 | /* Paired with the READ_ONCE() few lines above */ |
6851 | WRITE_ONCE(sd->defer_count, sd->defer_count + 1); |
6852 | |
6853 | skb->next = sd->defer_list; |
6854 | /* Paired with READ_ONCE() in skb_defer_free_flush() */ |
6855 | WRITE_ONCE(sd->defer_list, skb); |
6856 | spin_unlock_bh(lock: &sd->defer_lock); |
6857 | |
6858 | /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU |
6859 | * if we are unlucky enough (this seems very unlikely). |
6860 | */ |
6861 | if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) |
6862 | smp_call_function_single_async(cpu, csd: &sd->defer_csd); |
6863 | } |
6864 | |
6865 | static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, |
6866 | size_t offset, size_t len) |
6867 | { |
6868 | const char *kaddr; |
6869 | __wsum csum; |
6870 | |
6871 | kaddr = kmap_local_page(page); |
6872 | csum = csum_partial(buff: kaddr + offset, len, sum: 0); |
6873 | kunmap_local(kaddr); |
6874 | skb->csum = csum_block_add(csum: skb->csum, csum2: csum, offset: skb->len); |
6875 | } |
6876 | |
6877 | /** |
6878 | * skb_splice_from_iter - Splice (or copy) pages to skbuff |
6879 | * @skb: The buffer to add pages to |
6880 | * @iter: Iterator representing the pages to be added |
6881 | * @maxsize: Maximum amount of pages to be added |
6882 | * @gfp: Allocation flags |
6883 | * |
6884 | * This is a common helper function for supporting MSG_SPLICE_PAGES. It |
6885 | * extracts pages from an iterator and adds them to the socket buffer if |
6886 | * possible, copying them to fragments if not possible (such as if they're slab |
6887 | * pages). |
6888 | * |
6889 | * Returns the amount of data spliced/copied or -EMSGSIZE if there's |
6890 | * insufficient space in the buffer to transfer anything. |
6891 | */ |
6892 | ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, |
6893 | ssize_t maxsize, gfp_t gfp) |
6894 | { |
6895 | size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); |
6896 | struct page *pages[8], **ppages = pages; |
6897 | ssize_t spliced = 0, ret = 0; |
6898 | unsigned int i; |
6899 | |
6900 | while (iter->count > 0) { |
6901 | ssize_t space, nr, len; |
6902 | size_t off; |
6903 | |
6904 | ret = -EMSGSIZE; |
6905 | space = frag_limit - skb_shinfo(skb)->nr_frags; |
6906 | if (space < 0) |
6907 | break; |
6908 | |
6909 | /* We might be able to coalesce without increasing nr_frags */ |
6910 | nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); |
6911 | |
6912 | len = iov_iter_extract_pages(i: iter, pages: &ppages, maxsize, maxpages: nr, extraction_flags: 0, offset0: &off); |
6913 | if (len <= 0) { |
6914 | ret = len ?: -EIO; |
6915 | break; |
6916 | } |
6917 | |
6918 | i = 0; |
6919 | do { |
6920 | struct page *page = pages[i++]; |
6921 | size_t part = min_t(size_t, PAGE_SIZE - off, len); |
6922 | |
6923 | ret = -EIO; |
6924 | if (WARN_ON_ONCE(!sendpage_ok(page))) |
6925 | goto out; |
6926 | |
6927 | ret = skb_append_pagefrags(skb, page, off, part, |
6928 | frag_limit); |
6929 | if (ret < 0) { |
6930 | iov_iter_revert(i: iter, bytes: len); |
6931 | goto out; |
6932 | } |
6933 | |
6934 | if (skb->ip_summed == CHECKSUM_NONE) |
6935 | skb_splice_csum_page(skb, page, offset: off, len: part); |
6936 | |
6937 | off = 0; |
6938 | spliced += part; |
6939 | maxsize -= part; |
6940 | len -= part; |
6941 | } while (len > 0); |
6942 | |
6943 | if (maxsize <= 0) |
6944 | break; |
6945 | } |
6946 | |
6947 | out: |
6948 | skb_len_add(skb, delta: spliced); |
6949 | return spliced ?: ret; |
6950 | } |
6951 | EXPORT_SYMBOL(skb_splice_from_iter); |
6952 | |
6953 | static __always_inline |
6954 | size_t memcpy_from_iter_csum(void *iter_from, size_t progress, |
6955 | size_t len, void *to, void *priv2) |
6956 | { |
6957 | __wsum *csum = priv2; |
6958 | __wsum next = csum_partial_copy_nocheck(src: iter_from, dst: to + progress, len); |
6959 | |
6960 | *csum = csum_block_add(csum: *csum, csum2: next, offset: progress); |
6961 | return 0; |
6962 | } |
6963 | |
6964 | static __always_inline |
6965 | size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress, |
6966 | size_t len, void *to, void *priv2) |
6967 | { |
6968 | __wsum next, *csum = priv2; |
6969 | |
6970 | next = csum_and_copy_from_user(src: iter_from, dst: to + progress, len); |
6971 | *csum = csum_block_add(csum: *csum, csum2: next, offset: progress); |
6972 | return next ? 0 : len; |
6973 | } |
6974 | |
6975 | bool csum_and_copy_from_iter_full(void *addr, size_t bytes, |
6976 | __wsum *csum, struct iov_iter *i) |
6977 | { |
6978 | size_t copied; |
6979 | |
6980 | if (WARN_ON_ONCE(!i->data_source)) |
6981 | return false; |
6982 | copied = iterate_and_advance2(iter: i, len: bytes, priv: addr, priv2: csum, |
6983 | ustep: copy_from_user_iter_csum, |
6984 | step: memcpy_from_iter_csum); |
6985 | if (likely(copied == bytes)) |
6986 | return true; |
6987 | iov_iter_revert(i, bytes: copied); |
6988 | return false; |
6989 | } |
6990 | EXPORT_SYMBOL(csum_and_copy_from_iter_full); |
6991 | |