Diffstat (limited to 'net/ipv4/inet_fragment.c')
| -rw-r--r-- | net/ipv4/inet_fragment.c | 790 |
1 file changed, 501 insertions, 289 deletions
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 96e95e83cc61..025895eb6ec5 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -1,11 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * inet fragments management * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Authors: Pavel Emelyanov <xemul@openvz.org> * Started as consolidation of ipv4/ip_fragment.c, * ipv6/reassembly. and ipv6 nf conntrack reassembly @@ -20,16 +16,70 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/slab.h> +#include <linux/rhashtable.h> #include <net/sock.h> #include <net/inet_frag.h> #include <net/inet_ecn.h> +#include <net/ip.h> +#include <net/ipv6.h> + +#include "../core/sock_destructor.h" + +/* Use skb->cb to track consecutive/adjacent fragments coming at + * the end of the queue. Nodes in the rb-tree queue will + * contain "runs" of one or more adjacent fragments. + * + * Invariants: + * - next_frag is NULL at the tail of a "run"; + * - the head of a "run" has the sum of all fragment lengths in frag_run_len. + */ +struct ipfrag_skb_cb { + union { + struct inet_skb_parm h4; + struct inet6_skb_parm h6; + }; + struct sk_buff *next_frag; + int frag_run_len; + int ip_defrag_offset; +}; -#define INETFRAGS_EVICT_BUCKETS 128 -#define INETFRAGS_EVICT_MAX 512 +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) -/* don't rebuild inetfrag table with new secret more often than this */ -#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ) +static void fragcb_clear(struct sk_buff *skb) +{ + RB_CLEAR_NODE(&skb->rbnode); + FRAG_CB(skb)->next_frag = NULL; + FRAG_CB(skb)->frag_run_len = skb->len; +} + +/* Append skb to the last "run". */ +static void fragrun_append_to_last(struct inet_frag_queue *q, + struct sk_buff *skb) +{ + fragcb_clear(skb); + + FRAG_CB(q->last_run_head)->frag_run_len += skb->len; + FRAG_CB(q->fragments_tail)->next_frag = skb; + q->fragments_tail = skb; +} + +/* Create a new "run" with the skb. */ +static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); + fragcb_clear(skb); + + if (q->last_run_head) + rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, + &q->last_run_head->rbnode.rb_right); + else + rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + + q->fragments_tail = skb; + q->last_run_head = skb; +} /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. 
@@ -52,388 +102,550 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); -static unsigned int -inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) +int inet_frags_init(struct inet_frags *f) { - return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); -} + f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, + NULL); + if (!f->frags_cachep) + return -ENOMEM; -static bool inet_frag_may_rebuild(struct inet_frags *f) -{ - return time_after(jiffies, - f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL); + refcount_set(&f->refcnt, 1); + init_completion(&f->completion); + return 0; } +EXPORT_SYMBOL(inet_frags_init); -static void inet_frag_secret_rebuild(struct inet_frags *f) +void inet_frags_fini(struct inet_frags *f) { - int i; - - write_seqlock_bh(&f->rnd_seqlock); - - if (!inet_frag_may_rebuild(f)) - goto out; - - get_random_bytes(&f->rnd, sizeof(u32)); - - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - struct hlist_node *n; + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); - hb = &f->hash[i]; - spin_lock(&hb->chain_lock); + wait_for_completion(&f->completion); - hlist_for_each_entry_safe(q, n, &hb->chain, list) { - unsigned int hval = inet_frag_hashfn(f, q); - - if (hval != i) { - struct inet_frag_bucket *hb_dest; - - hlist_del(&q->list); - - /* Relink to new hash chain. */ - hb_dest = &f->hash[hval]; - - /* This is the only place where we take - * another chain_lock while already holding - * one. As this will not run concurrently, - * we cannot deadlock on hb_dest lock below, if its - * already locked it will be released soon since - * other caller cannot be waiting for hb lock - * that we've taken above. - */ - spin_lock_nested(&hb_dest->chain_lock, - SINGLE_DEPTH_NESTING); - hlist_add_head(&q->list, &hb_dest->chain); - spin_unlock(&hb_dest->chain_lock); - } - } - spin_unlock(&hb->chain_lock); - } - - f->rebuild = false; - f->last_rebuild_jiffies = jiffies; -out: - write_sequnlock_bh(&f->rnd_seqlock); -} - -static bool inet_fragq_should_evict(const struct inet_frag_queue *q) -{ - return q->net->low_thresh == 0 || - frag_mem_limit(q->net) >= q->net->low_thresh; + kmem_cache_destroy(f->frags_cachep); + f->frags_cachep = NULL; } +EXPORT_SYMBOL(inet_frags_fini); -static unsigned int -inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) +/* called from rhashtable_free_and_destroy() at netns_frags dismantle */ +static void inet_frags_free_cb(void *ptr, void *arg) { - struct inet_frag_queue *fq; - struct hlist_node *n; - unsigned int evicted = 0; - HLIST_HEAD(expired); - - spin_lock(&hb->chain_lock); + struct inet_frag_queue *fq = ptr; + int count; - hlist_for_each_entry_safe(fq, n, &hb->chain, list) { - if (!inet_fragq_should_evict(fq)) - continue; + count = timer_delete_sync(&fq->timer) ? 
1 : 0; - if (!del_timer(&fq->timer)) - continue; - - hlist_add_head(&fq->list_evictor, &expired); - ++evicted; + spin_lock_bh(&fq->lock); + fq->flags |= INET_FRAG_DROP; + if (!(fq->flags & INET_FRAG_COMPLETE)) { + fq->flags |= INET_FRAG_COMPLETE; + count++; + } else if (fq->flags & INET_FRAG_HASH_DEAD) { + count++; } + spin_unlock_bh(&fq->lock); - spin_unlock(&hb->chain_lock); - - hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire((unsigned long) fq); - - return evicted; + inet_frag_putn(fq, count); } -static void inet_frag_worker(struct work_struct *work) +static LLIST_HEAD(fqdir_free_list); + +static void fqdir_free_fn(struct work_struct *work) { - unsigned int budget = INETFRAGS_EVICT_BUCKETS; - unsigned int i, evicted = 0; + struct llist_node *kill_list; + struct fqdir *fqdir, *tmp; struct inet_frags *f; - f = container_of(work, struct inet_frags, frags_work); + /* Atomically snapshot the list of fqdirs to free */ + kill_list = llist_del_all(&fqdir_free_list); - BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); + /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) + * have completed, since they need to dereference fqdir. + * Would it not be nice to have kfree_rcu_barrier() ? :) + */ + rcu_barrier(); - local_bh_disable(); + llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) { + f = fqdir->f; + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); - for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { - evicted += inet_evict_bucket(f, &f->hash[i]); - i = (i + 1) & (INETFRAGS_HASHSZ - 1); - if (evicted > INETFRAGS_EVICT_MAX) - break; + kfree(fqdir); } - - f->next_bucket = i; - - local_bh_enable(); - - if (f->rebuild && inet_frag_may_rebuild(f)) - inet_frag_secret_rebuild(f); } -static void inet_frag_schedule_worker(struct inet_frags *f) -{ - if (unlikely(!work_pending(&f->frags_work))) - schedule_work(&f->frags_work); -} +static DECLARE_DELAYED_WORK(fqdir_free_work, fqdir_free_fn); -int inet_frags_init(struct inet_frags *f) +static void fqdir_work_fn(struct work_struct *work) { - int i; + struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); - INIT_WORK(&f->frags_work, inet_frag_worker); + rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb = &f->hash[i]; + if (llist_add(&fqdir->free_list, &fqdir_free_list)) + queue_delayed_work(system_percpu_wq, &fqdir_free_work, HZ); +} - spin_lock_init(&hb->chain_lock); - INIT_HLIST_HEAD(&hb->chain); - } +int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) +{ + struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); + int res; - seqlock_init(&f->rnd_seqlock); - f->last_rebuild_jiffies = 0; - f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, - NULL); - if (!f->frags_cachep) + if (!fqdir) return -ENOMEM; - + fqdir->f = f; + fqdir->net = net; + res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params); + if (res < 0) { + kfree(fqdir); + return res; + } + refcount_inc(&f->refcnt); + *fqdirp = fqdir; return 0; } -EXPORT_SYMBOL(inet_frags_init); +EXPORT_SYMBOL(fqdir_init); -void inet_frags_fini(struct inet_frags *f) -{ - cancel_work_sync(&f->frags_work); - kmem_cache_destroy(f->frags_cachep); -} -EXPORT_SYMBOL(inet_frags_fini); +static struct workqueue_struct *inet_frag_wq; -void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) +static int __init inet_frag_wq_init(void) { - unsigned int seq; - 
int i; - - nf->low_thresh = 0; - -evict_again: - local_bh_disable(); - seq = read_seqbegin(&f->rnd_seqlock); - - for (i = 0; i < INETFRAGS_HASHSZ ; i++) - inet_evict_bucket(f, &f->hash[i]); - - local_bh_enable(); - cond_resched(); + inet_frag_wq = create_workqueue("inet_frag_wq"); + if (!inet_frag_wq) + panic("Could not create inet frag workq"); + return 0; +} - if (read_seqretry(&f->rnd_seqlock, seq) || - percpu_counter_sum(&nf->mem)) - goto evict_again; +pure_initcall(inet_frag_wq_init); - percpu_counter_destroy(&nf->mem); +void fqdir_exit(struct fqdir *fqdir) +{ + INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); + queue_work(inet_frag_wq, &fqdir->destroy_work); } -EXPORT_SYMBOL(inet_frags_exit_net); +EXPORT_SYMBOL(fqdir_exit); -static struct inet_frag_bucket * -get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f) -__acquires(hb->chain_lock) +void inet_frag_kill(struct inet_frag_queue *fq, int *refs) { - struct inet_frag_bucket *hb; - unsigned int seq, hash; - - restart: - seq = read_seqbegin(&f->rnd_seqlock); - - hash = inet_frag_hashfn(f, fq); - hb = &f->hash[hash]; + if (timer_delete(&fq->timer)) + (*refs)++; - spin_lock(&hb->chain_lock); - if (read_seqretry(&f->rnd_seqlock, seq)) { - spin_unlock(&hb->chain_lock); - goto restart; + if (!(fq->flags & INET_FRAG_COMPLETE)) { + struct fqdir *fqdir = fq->fqdir; + + fq->flags |= INET_FRAG_COMPLETE; + rcu_read_lock(); + /* The RCU read lock provides a memory barrier + * guaranteeing that if fqdir->dead is false then + * the hash table destruction will not start until + * after we unlock. Paired with fqdir_pre_exit(). + */ + if (!READ_ONCE(fqdir->dead)) { + rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, + fqdir->f->rhash_params); + (*refs)++; + } else { + fq->flags |= INET_FRAG_HASH_DEAD; + } + rcu_read_unlock(); } - - return hb; } +EXPORT_SYMBOL(inet_frag_kill); -static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +static void inet_frag_destroy_rcu(struct rcu_head *head) { - struct inet_frag_bucket *hb; + struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, + rcu); + struct inet_frags *f = q->fqdir->f; - hb = get_frag_bucket_locked(fq, f); - hlist_del(&fq->list); - fq->flags |= INET_FRAG_COMPLETE; - spin_unlock(&hb->chain_lock); + if (f->destructor) + f->destructor(q); + kmem_cache_free(f->frags_cachep, q); } -void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) +unsigned int inet_frag_rbtree_purge(struct rb_root *root, + enum skb_drop_reason reason) { - if (del_timer(&fq->timer)) - refcount_dec(&fq->refcnt); + struct rb_node *p = rb_first(root); + unsigned int sum = 0; - if (!(fq->flags & INET_FRAG_COMPLETE)) { - fq_unlink(fq, f); - refcount_dec(&fq->refcnt); + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); + + p = rb_next(p); + rb_erase(&skb->rbnode, root); + while (skb) { + struct sk_buff *next = FRAG_CB(skb)->next_frag; + + sum += skb->truesize; + kfree_skb_reason(skb, reason); + skb = next; + } } + return sum; } -EXPORT_SYMBOL(inet_frag_kill); +EXPORT_SYMBOL(inet_frag_rbtree_purge); -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) +void inet_frag_destroy(struct inet_frag_queue *q) { - struct sk_buff *fp; - struct netns_frags *nf; unsigned int sum, sum_truesize = 0; + enum skb_drop_reason reason; + struct inet_frags *f; + struct fqdir *fqdir; WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); - WARN_ON(del_timer(&q->timer) != 0); + reason = (q->flags & INET_FRAG_DROP) ? 
+ SKB_DROP_REASON_FRAG_REASM_TIMEOUT : + SKB_CONSUMED; + WARN_ON(timer_delete(&q->timer) != 0); /* Release all fragment data. */ - fp = q->fragments; - nf = q->net; - while (fp) { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - kfree_skb(fp); - fp = xp; - } + fqdir = q->fqdir; + f = fqdir->f; + sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason); sum = sum_truesize + f->qsize; - if (f->destructor) - f->destructor(q); - kmem_cache_free(f->frags_cachep, q); + call_rcu(&q->rcu, inet_frag_destroy_rcu); - sub_frag_mem_limit(nf, sum); + sub_frag_mem_limit(fqdir, sum); } EXPORT_SYMBOL(inet_frag_destroy); -static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, - struct inet_frag_queue *qp_in, - struct inet_frags *f, - void *arg) -{ - struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); - struct inet_frag_queue *qp; - -#ifdef CONFIG_SMP - /* With SMP race we have to recheck hash table, because - * such entry could have been created on other cpu before - * we acquired hash bucket lock. - */ - hlist_for_each_entry(qp, &hb->chain, list) { - if (qp->net == nf && f->match(qp, arg)) { - refcount_inc(&qp->refcnt); - spin_unlock(&hb->chain_lock); - qp_in->flags |= INET_FRAG_COMPLETE; - inet_frag_put(qp_in, f); - return qp; - } - } -#endif - qp = qp_in; - if (!mod_timer(&qp->timer, jiffies + nf->timeout)) - refcount_inc(&qp->refcnt); - - refcount_inc(&qp->refcnt); - hlist_add_head(&qp->list, &hb->chain); - - spin_unlock(&hb->chain_lock); - - return qp; -} - -static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, +static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, struct inet_frags *f, void *arg) { struct inet_frag_queue *q; - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { - inet_frag_schedule_worker(f); - return NULL; - } - q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (!q) return NULL; - q->net = nf; + q->fqdir = fqdir; f->constructor(q, arg); - add_frag_mem_limit(nf, f->qsize); + add_frag_mem_limit(fqdir, f->qsize); - setup_timer(&q->timer, f->frag_expire, (unsigned long)q); + timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); - refcount_set(&q->refcnt, 1); + /* One reference for the timer, one for the hash table. */ + refcount_set(&q->refcnt, 2); return q; } -static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - struct inet_frags *f, - void *arg) +static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, + void *arg, + struct inet_frag_queue **prev) { + struct inet_frags *f = fqdir->f; struct inet_frag_queue *q; - q = inet_frag_alloc(nf, f, arg); - if (!q) + q = inet_frag_alloc(fqdir, f, arg); + if (!q) { + *prev = ERR_PTR(-ENOMEM); + return NULL; + } + mod_timer(&q->timer, jiffies + fqdir->timeout); + + *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, + &q->node, f->rhash_params); + if (*prev) { + /* We could not insert in the hash table, + * we need to cancel what inet_frag_alloc() + * anticipated. + */ + int refs = 1; + + q->flags |= INET_FRAG_COMPLETE; + inet_frag_kill(q, &refs); + inet_frag_putn(q, refs); + return NULL; + } + return q; +} + +struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) +{ + /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). 
*/ + long high_thresh = READ_ONCE(fqdir->high_thresh); + struct inet_frag_queue *fq = NULL, *prev; + + if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) return NULL; - return inet_frag_intern(nf, q, f, arg); + prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); + if (!prev) + fq = inet_frag_create(fqdir, key, &prev); + if (!IS_ERR_OR_NULL(prev)) + fq = prev; + return fq; } +EXPORT_SYMBOL(inet_frag_find); -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, - unsigned int hash) +int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, + int offset, int end) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - int depth = 0; + struct sk_buff *last = q->fragments_tail; + + /* RFC5722, Section 4, amended by Errata ID : 3089 + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments) MUST be silently discarded. + * + * Duplicates, however, should be ignored (i.e. skb dropped, but the + * queue/fragments kept for later reassembly). + */ + if (!last) + fragrun_create(q, skb); /* First fragment. */ + else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) { + /* This is the common case: skb goes to the end. */ + /* Detect and discard overlaps. */ + if (offset < FRAG_CB(last)->ip_defrag_offset + last->len) + return IPFRAG_OVERLAP; + if (offset == FRAG_CB(last)->ip_defrag_offset + last->len) + fragrun_append_to_last(q, skb); + else + fragrun_create(q, skb); + } else { + /* Binary search. Note that skb can become the first fragment, + * but not the last (covered above). + */ + struct rb_node **rbn, *parent; + + rbn = &q->rb_fragments.rb_node; + do { + struct sk_buff *curr; + int curr_run_end; + + parent = *rbn; + curr = rb_to_skb(parent); + curr_run_end = FRAG_CB(curr)->ip_defrag_offset + + FRAG_CB(curr)->frag_run_len; + if (end <= FRAG_CB(curr)->ip_defrag_offset) + rbn = &parent->rb_left; + else if (offset >= curr_run_end) + rbn = &parent->rb_right; + else if (offset >= FRAG_CB(curr)->ip_defrag_offset && + end <= curr_run_end) + return IPFRAG_DUP; + else + return IPFRAG_OVERLAP; + } while (*rbn); + /* Here we have parent properly set, and rbn pointing to + * one of its NULL left/right children. Insert skb. + */ + fragcb_clear(skb); + rb_link_node(&skb->rbnode, parent, rbn); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + } - if (frag_mem_limit(nf) > nf->low_thresh) - inet_frag_schedule_worker(f); + FRAG_CB(skb)->ip_defrag_offset = offset; - hash &= (INETFRAGS_HASHSZ - 1); - hb = &f->hash[hash]; + return IPFRAG_OK; +} +EXPORT_SYMBOL(inet_frag_queue_insert); - spin_lock(&hb->chain_lock); - hlist_for_each_entry(q, &hb->chain, list) { - if (q->net == nf && f->match(q, key)) { - refcount_inc(&q->refcnt); - spin_unlock(&hb->chain_lock); - return q; +void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, + struct sk_buff *parent) +{ + struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); + void (*destructor)(struct sk_buff *); + unsigned int orig_truesize = 0; + struct sk_buff **nextp = NULL; + struct sock *sk = skb->sk; + int delta; + + if (sk && is_skb_wmem(skb)) { + /* TX: skb->sk might have been passed as argument to + * dst->output and must remain valid until tx completes. + * + * Move sk to reassembled skb and fix up wmem accounting. 
+ */ + orig_truesize = skb->truesize; + destructor = skb->destructor; + } + + if (head != skb) { + fp = skb_clone(skb, GFP_ATOMIC); + if (!fp) { + head = skb; + goto out_restore_sk; + } + FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; + if (RB_EMPTY_NODE(&skb->rbnode)) + FRAG_CB(parent)->next_frag = fp; + else + rb_replace_node(&skb->rbnode, &fp->rbnode, + &q->rb_fragments); + if (q->fragments_tail == skb) + q->fragments_tail = fp; + + if (orig_truesize) { + /* prevent skb_morph from releasing sk */ + skb->sk = NULL; + skb->destructor = NULL; } - depth++; + skb_morph(skb, head); + FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; + rb_replace_node(&head->rbnode, &skb->rbnode, + &q->rb_fragments); + consume_skb(head); + head = skb; } - spin_unlock(&hb->chain_lock); + WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0); - if (depth <= INETFRAGS_MAXDEPTH) - return inet_frag_create(nf, f, key); + delta = -head->truesize; - if (inet_frag_may_rebuild(f)) { - if (!f->rebuild) - f->rebuild = true; - inet_frag_schedule_worker(f); + /* Head of list must not be cloned. */ + if (skb_unclone(head, GFP_ATOMIC)) + goto out_restore_sk; + + delta += head->truesize; + if (delta) + add_frag_mem_limit(q->fqdir, delta); + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. + */ + if (skb_has_frag_list(head)) { + struct sk_buff *clone; + int i, plen = 0; + + clone = alloc_skb(0, GFP_ATOMIC); + if (!clone) + goto out_restore_sk; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_frag_list_init(head); + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); + clone->data_len = head->data_len - plen; + clone->len = clone->data_len; + head->truesize += clone->truesize; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + add_frag_mem_limit(q->fqdir, clone->truesize); + skb_shinfo(head)->frag_list = clone; + nextp = &clone->next; + } else { + nextp = &skb_shinfo(head)->frag_list; } - return ERR_PTR(-ENOBUFS); +out_restore_sk: + if (orig_truesize) { + int ts_delta = head->truesize - orig_truesize; + + /* if this reassembled skb is fragmented later, + * fraglist skbs will get skb->sk assigned from head->sk, + * and each frag skb will be released via sock_wfree. + * + * Update sk_wmem_alloc. + */ + head->sk = sk; + head->destructor = destructor; + refcount_add(ts_delta, &sk->sk_wmem_alloc); + } + + return nextp; } -EXPORT_SYMBOL(inet_frag_find); +EXPORT_SYMBOL(inet_frag_reasm_prepare); + +void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, + void *reasm_data, bool try_coalesce) +{ + struct sock *sk = is_skb_wmem(head) ? head->sk : NULL; + const unsigned int head_truesize = head->truesize; + struct sk_buff **nextp = reasm_data; + struct rb_node *rbn; + struct sk_buff *fp; + int sum_truesize; + + skb_push(head, head->data - skb_network_header(head)); + + /* Traverse the tree in order, to build frag_list. */ + fp = FRAG_CB(head)->next_frag; + rbn = rb_next(&head->rbnode); + rb_erase(&head->rbnode, &q->rb_fragments); + + sum_truesize = head->truesize; + while (rbn || fp) { + /* fp points to the next sk_buff in the current run; + * rbn points to the next run. + */ + /* Go through the current run. 
*/ + while (fp) { + struct sk_buff *next_frag = FRAG_CB(fp)->next_frag; + bool stolen; + int delta; + + sum_truesize += fp->truesize; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + + if (try_coalesce && skb_try_coalesce(head, fp, &stolen, + &delta)) { + kfree_skb_partial(fp, stolen); + } else { + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); + fp->sk = NULL; + + head->data_len += fp->len; + head->len += fp->len; + head->truesize += fp->truesize; + + *nextp = fp; + nextp = &fp->next; + } + + fp = next_frag; + } + /* Move to the next run. */ + if (rbn) { + struct rb_node *rbnext = rb_next(rbn); + + fp = rb_to_skb(rbn); + rb_erase(rbn, &q->rb_fragments); + rbn = rbnext; + } + } + sub_frag_mem_limit(q->fqdir, sum_truesize); + + *nextp = NULL; + skb_mark_not_on_list(head); + head->prev = NULL; + head->tstamp = q->stamp; + head->tstamp_type = q->tstamp_type; + + if (sk) + refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); +} +EXPORT_SYMBOL(inet_frag_reasm_finish); -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, - const char *prefix) +struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) { - static const char msg[] = "inet_frag_find: Fragment hash bucket" - " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) - ". Dropping fragment.\n"; + struct sk_buff *head, *skb; + + head = skb_rb_first(&q->rb_fragments); + if (!head) + return NULL; + skb = FRAG_CB(head)->next_frag; + if (skb) + rb_replace_node(&head->rbnode, &skb->rbnode, + &q->rb_fragments); + else + rb_erase(&head->rbnode, &q->rb_fragments); + memset(&head->rbnode, 0, sizeof(head->rbnode)); + barrier(); + + if (head == q->fragments_tail) + q->fragments_tail = NULL; + + sub_frag_mem_limit(q->fqdir, head->truesize); - if (PTR_ERR(q) == -ENOBUFS) - net_dbg_ratelimited("%s%s", prefix, msg); + return head; } -EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); +EXPORT_SYMBOL(inet_frag_pull_head); |
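
The diff above leans on a handful of reusable patterns. The stand-alone sketches below illustrate them in plain user-space C; every struct, function and macro name in the sketches (fake_skb, my_cb, frags_like, and so on) is made up for illustration and is not a kernel API.

First, the FRAG_CB()/fragcb_clear() idiom: per-fragment bookkeeping is overlaid on the fixed-size skb->cb[] scratch area, with a compile-time check that it fits (the BUILD_BUG_ON() in fragrun_create()). A minimal sketch, assuming a 48-byte scratch area like sk_buff's:

#include <stdio.h>

struct fake_skb {
	char cb[48];                 /* stands in for sk_buff::cb[] */
	unsigned int len;
};

struct my_cb {
	struct fake_skb *next_frag;  /* next buffer in the same run */
	int frag_run_len;            /* total run length (head only) */
	int ip_defrag_offset;
};

#define MY_CB(skb) ((struct my_cb *)((skb)->cb))

/* Fails to compile if the private state outgrows the scratch area,
 * mirroring the BUILD_BUG_ON() in fragrun_create().
 */
_Static_assert(sizeof(struct my_cb) <= sizeof(((struct fake_skb *)0)->cb),
	       "control block too large");

static void cb_clear(struct fake_skb *skb)
{
	MY_CB(skb)->next_frag = NULL;
	MY_CB(skb)->frag_run_len = skb->len;
}

int main(void)
{
	struct fake_skb a = { .len = 100 };

	cb_clear(&a);
	printf("run length starts at %d\n", MY_CB(&a)->frag_run_len);
	return 0;
}

The invariant stated in the comment block near the top of the diff follows directly: only the head of a run keeps an accurate frag_run_len, and fragrun_append_to_last() grows it while chaining the new buffer through next_frag.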
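
inet_frags_init()/inet_frags_fini() replace the old eviction worker teardown with a reference count plus a completion: the owner holds one reference, each fqdir takes another, and whoever drops the last one signals the completion that inet_frags_fini() waits on. A sketch of that shape, using C11 atomics and a pthread condition variable in place of refcount_t and struct completion:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct frags_like {
	atomic_int refcnt;
	pthread_mutex_t lock;
	pthread_cond_t done_cond;
	bool done;
};

static void frags_init(struct frags_like *f)
{
	atomic_store(&f->refcnt, 1);        /* the owner's reference */
	pthread_mutex_init(&f->lock, NULL);
	pthread_cond_init(&f->done_cond, NULL);
	f->done = false;
}

static void frags_get(struct frags_like *f)
{
	atomic_fetch_add(&f->refcnt, 1);    /* e.g. one per fqdir */
}

static void frags_put(struct frags_like *f)
{
	if (atomic_fetch_sub(&f->refcnt, 1) == 1) {
		/* Last reference gone: signal the waiter. */
		pthread_mutex_lock(&f->lock);
		f->done = true;
		pthread_cond_signal(&f->done_cond);
		pthread_mutex_unlock(&f->lock);
	}
}

/* Like inet_frags_fini(): drop the owner's reference, then block until
 * every outstanding user has released theirs.
 */
static void frags_fini(struct frags_like *f)
{
	frags_put(f);
	pthread_mutex_lock(&f->lock);
	while (!f->done)
		pthread_cond_wait(&f->done_cond, &f->lock);
	pthread_mutex_unlock(&f->lock);
}

In the diff the matching release sites are fqdir_free_fn() and inet_frags_fini() itself, both of which call complete() only when refcount_dec_and_test() returns true.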
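
inet_frag_kill() no longer drops references itself; callers pass an int *refs, the function counts how many references should go away (timer, hash table), and the caller releases them in a single inet_frag_putn() call once it is safe to do so. inet_frag_putn() is not defined in this file's diff, so the helper below is only an assumed shape of a "put n references at once" primitive:

#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int refcnt;
};

static void obj_destroy(struct obj *o)
{
	free(o);
}

/* Drop n references in one atomic operation; destroy on zero. */
static void obj_putn(struct obj *o, int n)
{
	if (n && atomic_fetch_sub(&o->refcnt, n) == n)
		obj_destroy(o);
}

Batching the drops keeps the final free out of the timer and hash-table handling, as in inet_frags_free_cb() above, where the accumulated count is only applied after fq->lock has been released.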
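
The lookup path no longer takes per-bucket spinlocks: inet_frag_find() queries the rhashtable directly and only falls back to inet_frag_create(), which allocates a queue and then calls rhashtable_lookup_get_insert_key(). If that insert reports an existing entry, the freshly allocated queue lost the race, is marked complete and released, and the winner is used instead. The sketch below collapses this onto a single mutex-protected list purely to show the lost-race handling; it is not how the rhashtable behaves internally:

#include <pthread.h>
#include <stdlib.h>

struct entry {
	int key;
	int refcnt;
	struct entry *next;
};

static struct entry *table;
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct entry *lookup_locked(int key)
{
	struct entry *e;

	for (e = table; e; e = e->next)
		if (e->key == key)
			return e;
	return NULL;
}

static struct entry *find_or_create(int key)
{
	struct entry *e, *n = malloc(sizeof(*n));

	if (!n)
		return NULL;
	n->key = key;
	n->refcnt = 1;

	pthread_mutex_lock(&table_lock);
	e = lookup_locked(key);
	if (e) {
		/* Lost the race: take a reference on the winner and
		 * cancel what the allocation anticipated.
		 */
		e->refcnt++;
		pthread_mutex_unlock(&table_lock);
		free(n);
		return e;
	}
	n->next = table;
	table = n;
	pthread_mutex_unlock(&table_lock);
	return n;
}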
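
inet_frag_queue_insert() walks the rb-tree of runs to classify a new fragment against what is already queued: fragments beyond the tail extend or start a run, exact containment is a duplicate that is silently ignored, and any partial intersection is an overlap that (per RFC 5722 for IPv6) makes the caller drop the whole queue. The comparison reduces to a pure function over byte ranges; the verdict names below are local to this example:

/* New fragment covers [offset, end); an existing run covers
 * [run_start, run_end).
 */
enum frag_verdict { FRAG_LEFT, FRAG_RIGHT, FRAG_DUP, FRAG_OVERLAP };

enum frag_verdict classify(int offset, int end, int run_start, int run_end)
{
	if (end <= run_start)
		return FRAG_LEFT;	/* keep searching the left subtree */
	if (offset >= run_end)
		return FRAG_RIGHT;	/* keep searching the right subtree */
	if (offset >= run_start && end <= run_end)
		return FRAG_DUP;	/* fully covered: ignore the skb */
	return FRAG_OVERLAP;		/* partial overlap: drop the queue */
}

This mirrors the four branches of the do/while loop in inet_frag_queue_insert() above.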
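
When the head is CHECKSUM_COMPLETE, inet_frag_reasm_finish() folds each fragment's checksum into the head with csum_add(), i.e. one's-complement addition with the carry wrapped back in. A stand-alone equivalent over 32-bit accumulators:

#include <stdint.h>

/* One's-complement add: if the 32-bit sum wrapped, fold the carry
 * back into the low bits, as the kernel's csum_add() does.
 */
uint32_t csum_add32(uint32_t csum, uint32_t addend)
{
	uint32_t res = csum + addend;

	return res + (res < addend);
}

Folding the per-fragment checksums this way lets the reassembled packet keep CHECKSUM_COMPLETE; when the fragments' ip_summed values disagree, the code above instead degrades the head to CHECKSUM_NONE.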
