Diffstat (limited to 'net/ipv4/inet_fragment.c')
-rw-r--r--   net/ipv4/inet_fragment.c   | 756
1 file changed, 522 insertions(+), 234 deletions(-)
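
The diff below replaces the old hash-bucket/LRU fragment cache with an rhashtable-backed per-netns fqdir and a per-queue rb-tree of fragment "runs": adjacent fragments are chained through next_frag, only the head of a run carries the total run length in frag_run_len, and the rb-tree stores one node per run. As a rough userspace-only sketch of that bookkeeping (the struct and function names and the tiny test in main() are illustrative assumptions, not kernel API), appending a fragment either extends the last run or starts a new one:

/* Minimal sketch of the "run" bookkeeping described by ipfrag_skb_cb below:
 * a run is a chain of adjacent fragments; only the head's frag_run_len is
 * meaningful and the tail's next pointer is NULL. The rb-tree that the
 * kernel links new run heads into is omitted here.
 */
#include <assert.h>
#include <stdio.h>

struct frag {
	int offset;          /* start offset of this fragment */
	int len;             /* payload length of this fragment */
	int frag_run_len;    /* valid on the head of a run only */
	struct frag *next;   /* next fragment in the same run, NULL at tail */
};

struct queue {
	struct frag *last_run_head;  /* head of the most recent run */
	struct frag *tail;           /* last fragment appended overall */
};

/* Append a fragment that arrives at the end of the queue: extend the
 * current run if it is adjacent, otherwise start a new run.
 */
static void queue_append(struct queue *q, struct frag *f)
{
	f->next = NULL;
	f->frag_run_len = f->len;

	if (q->tail && f->offset == q->tail->offset + q->tail->len) {
		/* adjacent: grow the current run */
		q->last_run_head->frag_run_len += f->len;
		q->tail->next = f;
	} else {
		/* gap (or first fragment): start a new run */
		q->last_run_head = f;
	}
	q->tail = f;
}

int main(void)
{
	struct queue q = { 0 };
	struct frag a = { .offset = 0,    .len = 1000 };
	struct frag b = { .offset = 1000, .len = 1000 };  /* adjacent to a */
	struct frag c = { .offset = 3000, .len = 500  };  /* gap: new run  */

	queue_append(&q, &a);
	queue_append(&q, &b);
	queue_append(&q, &c);

	assert(a.frag_run_len == 2000);  /* run covering a and b */
	assert(q.last_run_head == &c);   /* second run starts at c */
	printf("runs ok\n");
	return 0;
}
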
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c5313a9c019b..025895eb6ec5 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -1,11 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * inet fragments management * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Authors: Pavel Emelyanov <xemul@openvz.org> * Started as consolidation of ipv4/ip_fragment.c, * ipv6/reassembly. and ipv6 nf conntrack reassembly @@ -20,10 +16,70 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/slab.h> +#include <linux/rhashtable.h> #include <net/sock.h> #include <net/inet_frag.h> #include <net/inet_ecn.h> +#include <net/ip.h> +#include <net/ipv6.h> + +#include "../core/sock_destructor.h" + +/* Use skb->cb to track consecutive/adjacent fragments coming at + * the end of the queue. Nodes in the rb-tree queue will + * contain "runs" of one or more adjacent fragments. + * + * Invariants: + * - next_frag is NULL at the tail of a "run"; + * - the head of a "run" has the sum of all fragment lengths in frag_run_len. + */ +struct ipfrag_skb_cb { + union { + struct inet_skb_parm h4; + struct inet6_skb_parm h6; + }; + struct sk_buff *next_frag; + int frag_run_len; + int ip_defrag_offset; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) + +static void fragcb_clear(struct sk_buff *skb) +{ + RB_CLEAR_NODE(&skb->rbnode); + FRAG_CB(skb)->next_frag = NULL; + FRAG_CB(skb)->frag_run_len = skb->len; +} + +/* Append skb to the last "run". */ +static void fragrun_append_to_last(struct inet_frag_queue *q, + struct sk_buff *skb) +{ + fragcb_clear(skb); + + FRAG_CB(q->last_run_head)->frag_run_len += skb->len; + FRAG_CB(q->fragments_tail)->next_frag = skb; + q->fragments_tail = skb; +} + +/* Create a new "run" with the skb. */ +static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); + fragcb_clear(skb); + + if (q->last_run_head) + rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, + &q->last_run_head->rbnode.rb_right); + else + rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + + q->fragments_tail = skb; + q->last_run_head = skb; +} /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. 
@@ -46,318 +102,550 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); -static void inet_frag_secret_rebuild(unsigned long dummy) +int inet_frags_init(struct inet_frags *f) { - struct inet_frags *f = (struct inet_frags *)dummy; - unsigned long now = jiffies; - int i; - - /* Per bucket lock NOT needed here, due to write lock protection */ - write_lock(&f->lock); - - get_random_bytes(&f->rnd, sizeof(u32)); - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - struct hlist_node *n; + f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, + NULL); + if (!f->frags_cachep) + return -ENOMEM; + + refcount_set(&f->refcnt, 1); + init_completion(&f->completion); + return 0; +} +EXPORT_SYMBOL(inet_frags_init); - hb = &f->hash[i]; - hlist_for_each_entry_safe(q, n, &hb->chain, list) { - unsigned int hval = f->hashfn(q); +void inet_frags_fini(struct inet_frags *f) +{ + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); - if (hval != i) { - struct inet_frag_bucket *hb_dest; + wait_for_completion(&f->completion); - hlist_del(&q->list); + kmem_cache_destroy(f->frags_cachep); + f->frags_cachep = NULL; +} +EXPORT_SYMBOL(inet_frags_fini); - /* Relink to new hash chain. */ - hb_dest = &f->hash[hval]; - hlist_add_head(&q->list, &hb_dest->chain); - } - } +/* called from rhashtable_free_and_destroy() at netns_frags dismantle */ +static void inet_frags_free_cb(void *ptr, void *arg) +{ + struct inet_frag_queue *fq = ptr; + int count; + + count = timer_delete_sync(&fq->timer) ? 1 : 0; + + spin_lock_bh(&fq->lock); + fq->flags |= INET_FRAG_DROP; + if (!(fq->flags & INET_FRAG_COMPLETE)) { + fq->flags |= INET_FRAG_COMPLETE; + count++; + } else if (fq->flags & INET_FRAG_HASH_DEAD) { + count++; } - write_unlock(&f->lock); + spin_unlock_bh(&fq->lock); - mod_timer(&f->secret_timer, now + f->secret_interval); + inet_frag_putn(fq, count); } -void inet_frags_init(struct inet_frags *f) +static LLIST_HEAD(fqdir_free_list); + +static void fqdir_free_fn(struct work_struct *work) { - int i; + struct llist_node *kill_list; + struct fqdir *fqdir, *tmp; + struct inet_frags *f; - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb = &f->hash[i]; + /* Atomically snapshot the list of fqdirs to free */ + kill_list = llist_del_all(&fqdir_free_list); - spin_lock_init(&hb->chain_lock); - INIT_HLIST_HEAD(&hb->chain); - } - rwlock_init(&f->lock); + /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) + * have completed, since they need to dereference fqdir. + * Would it not be nice to have kfree_rcu_barrier() ? 
:) + */ + rcu_barrier(); - f->rnd = (u32) ((totalram_pages ^ (totalram_pages >> 7)) ^ - (jiffies ^ (jiffies >> 6))); + llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) { + f = fqdir->f; + if (refcount_dec_and_test(&f->refcnt)) + complete(&f->completion); - setup_timer(&f->secret_timer, inet_frag_secret_rebuild, - (unsigned long)f); - f->secret_timer.expires = jiffies + f->secret_interval; - add_timer(&f->secret_timer); + kfree(fqdir); + } } -EXPORT_SYMBOL(inet_frags_init); -void inet_frags_init_net(struct netns_frags *nf) +static DECLARE_DELAYED_WORK(fqdir_free_work, fqdir_free_fn); + +static void fqdir_work_fn(struct work_struct *work) { - nf->nqueues = 0; - init_frag_mem_limit(nf); - INIT_LIST_HEAD(&nf->lru_list); - spin_lock_init(&nf->lru_lock); + struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); + + rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); + + if (llist_add(&fqdir->free_list, &fqdir_free_list)) + queue_delayed_work(system_percpu_wq, &fqdir_free_work, HZ); } -EXPORT_SYMBOL(inet_frags_init_net); -void inet_frags_fini(struct inet_frags *f) +int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) { - del_timer(&f->secret_timer); + struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); + int res; + + if (!fqdir) + return -ENOMEM; + fqdir->f = f; + fqdir->net = net; + res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params); + if (res < 0) { + kfree(fqdir); + return res; + } + refcount_inc(&f->refcnt); + *fqdirp = fqdir; + return 0; } -EXPORT_SYMBOL(inet_frags_fini); +EXPORT_SYMBOL(fqdir_init); + +static struct workqueue_struct *inet_frag_wq; -void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) +static int __init inet_frag_wq_init(void) { - nf->low_thresh = 0; + inet_frag_wq = create_workqueue("inet_frag_wq"); + if (!inet_frag_wq) + panic("Could not create inet frag workq"); + return 0; +} - local_bh_disable(); - inet_frag_evictor(nf, f, true); - local_bh_enable(); +pure_initcall(inet_frag_wq_init); - percpu_counter_destroy(&nf->mem); +void fqdir_exit(struct fqdir *fqdir) +{ + INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); + queue_work(inet_frag_wq, &fqdir->destroy_work); } -EXPORT_SYMBOL(inet_frags_exit_net); +EXPORT_SYMBOL(fqdir_exit); -static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +void inet_frag_kill(struct inet_frag_queue *fq, int *refs) { - struct inet_frag_bucket *hb; - unsigned int hash; - - read_lock(&f->lock); - hash = f->hashfn(fq); - hb = &f->hash[hash]; - - spin_lock(&hb->chain_lock); - hlist_del(&fq->list); - spin_unlock(&hb->chain_lock); - - read_unlock(&f->lock); - inet_frag_lru_del(fq); + if (timer_delete(&fq->timer)) + (*refs)++; + + if (!(fq->flags & INET_FRAG_COMPLETE)) { + struct fqdir *fqdir = fq->fqdir; + + fq->flags |= INET_FRAG_COMPLETE; + rcu_read_lock(); + /* The RCU read lock provides a memory barrier + * guaranteeing that if fqdir->dead is false then + * the hash table destruction will not start until + * after we unlock. Paired with fqdir_pre_exit(). 
+ */ + if (!READ_ONCE(fqdir->dead)) { + rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, + fqdir->f->rhash_params); + (*refs)++; + } else { + fq->flags |= INET_FRAG_HASH_DEAD; + } + rcu_read_unlock(); + } } +EXPORT_SYMBOL(inet_frag_kill); -void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) +static void inet_frag_destroy_rcu(struct rcu_head *head) { - if (del_timer(&fq->timer)) - atomic_dec(&fq->refcnt); + struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, + rcu); + struct inet_frags *f = q->fqdir->f; - if (!(fq->last_in & INET_FRAG_COMPLETE)) { - fq_unlink(fq, f); - atomic_dec(&fq->refcnt); - fq->last_in |= INET_FRAG_COMPLETE; - } + if (f->destructor) + f->destructor(q); + kmem_cache_free(f->frags_cachep, q); } -EXPORT_SYMBOL(inet_frag_kill); -static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, - struct sk_buff *skb) +unsigned int inet_frag_rbtree_purge(struct rb_root *root, + enum skb_drop_reason reason) { - if (f->skb_free) - f->skb_free(skb); - kfree_skb(skb); + struct rb_node *p = rb_first(root); + unsigned int sum = 0; + + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); + + p = rb_next(p); + rb_erase(&skb->rbnode, root); + while (skb) { + struct sk_buff *next = FRAG_CB(skb)->next_frag; + + sum += skb->truesize; + kfree_skb_reason(skb, reason); + skb = next; + } + } + return sum; } +EXPORT_SYMBOL(inet_frag_rbtree_purge); -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, - int *work) +void inet_frag_destroy(struct inet_frag_queue *q) { - struct sk_buff *fp; - struct netns_frags *nf; unsigned int sum, sum_truesize = 0; + enum skb_drop_reason reason; + struct inet_frags *f; + struct fqdir *fqdir; - WARN_ON(!(q->last_in & INET_FRAG_COMPLETE)); - WARN_ON(del_timer(&q->timer) != 0); + WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); + reason = (q->flags & INET_FRAG_DROP) ? + SKB_DROP_REASON_FRAG_REASM_TIMEOUT : + SKB_CONSUMED; + WARN_ON(timer_delete(&q->timer) != 0); /* Release all fragment data. */ - fp = q->fragments; - nf = q->net; - while (fp) { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - frag_kfree_skb(nf, f, fp); - fp = xp; - } + fqdir = q->fqdir; + f = fqdir->f; + sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason); sum = sum_truesize + f->qsize; - if (work) - *work -= sum; - sub_frag_mem_limit(q, sum); - if (f->destructor) - f->destructor(q); - kfree(q); + call_rcu(&q->rcu, inet_frag_destroy_rcu); + sub_frag_mem_limit(fqdir, sum); } EXPORT_SYMBOL(inet_frag_destroy); -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) +static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, + struct inet_frags *f, + void *arg) { struct inet_frag_queue *q; - int work, evicted = 0; - - if (!force) { - if (frag_mem_limit(nf) <= nf->high_thresh) - return 0; - } - work = frag_mem_limit(nf) - nf->low_thresh; - while (work > 0) { - spin_lock(&nf->lru_lock); + q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); + if (!q) + return NULL; - if (list_empty(&nf->lru_list)) { - spin_unlock(&nf->lru_lock); - break; - } + q->fqdir = fqdir; + f->constructor(q, arg); + add_frag_mem_limit(fqdir, f->qsize); - q = list_first_entry(&nf->lru_list, - struct inet_frag_queue, lru_list); - atomic_inc(&q->refcnt); - /* Remove q from list to avoid several CPUs grabbing it */ - list_del_init(&q->lru_list); + timer_setup(&q->timer, f->frag_expire, 0); + spin_lock_init(&q->lock); + /* One reference for the timer, one for the hash table. 
*/ + refcount_set(&q->refcnt, 2); - spin_unlock(&nf->lru_lock); + return q; +} - spin_lock(&q->lock); - if (!(q->last_in & INET_FRAG_COMPLETE)) - inet_frag_kill(q, f); - spin_unlock(&q->lock); +static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, + void *arg, + struct inet_frag_queue **prev) +{ + struct inet_frags *f = fqdir->f; + struct inet_frag_queue *q; - if (atomic_dec_and_test(&q->refcnt)) - inet_frag_destroy(q, f, &work); - evicted++; + q = inet_frag_alloc(fqdir, f, arg); + if (!q) { + *prev = ERR_PTR(-ENOMEM); + return NULL; + } + mod_timer(&q->timer, jiffies + fqdir->timeout); + + *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, + &q->node, f->rhash_params); + if (*prev) { + /* We could not insert in the hash table, + * we need to cancel what inet_frag_alloc() + * anticipated. + */ + int refs = 1; + + q->flags |= INET_FRAG_COMPLETE; + inet_frag_kill(q, &refs); + inet_frag_putn(q, refs); + return NULL; } + return q; +} + +struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) +{ + /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */ + long high_thresh = READ_ONCE(fqdir->high_thresh); + struct inet_frag_queue *fq = NULL, *prev; + + if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) + return NULL; - return evicted; + prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); + if (!prev) + fq = inet_frag_create(fqdir, key, &prev); + if (!IS_ERR_OR_NULL(prev)) + fq = prev; + return fq; } -EXPORT_SYMBOL(inet_frag_evictor); +EXPORT_SYMBOL(inet_frag_find); -static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, - struct inet_frag_queue *qp_in, struct inet_frags *f, - void *arg) +int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, + int offset, int end) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *qp; - unsigned int hash; - - read_lock(&f->lock); /* Protects against hash rebuild */ - /* - * While we stayed w/o the lock other CPU could update - * the rnd seed, so we need to re-calculate the hash - * chain. Fortunatelly the qp_in can be used to get one. + struct sk_buff *last = q->fragments_tail; + + /* RFC5722, Section 4, amended by Errata ID : 3089 + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments) MUST be silently discarded. + * + * Duplicates, however, should be ignored (i.e. skb dropped, but the + * queue/fragments kept for later reassembly). */ - hash = f->hashfn(qp_in); - hb = &f->hash[hash]; - spin_lock(&hb->chain_lock); - -#ifdef CONFIG_SMP - /* With SMP race we have to recheck hash table, because - * such entry could be created on other cpu, while we - * released the hash bucket lock. - */ - hlist_for_each_entry(qp, &hb->chain, list) { - if (qp->net == nf && f->match(qp, arg)) { - atomic_inc(&qp->refcnt); - spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); - qp_in->last_in |= INET_FRAG_COMPLETE; - inet_frag_put(qp_in, f); - return qp; - } + if (!last) + fragrun_create(q, skb); /* First fragment. */ + else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) { + /* This is the common case: skb goes to the end. */ + /* Detect and discard overlaps. */ + if (offset < FRAG_CB(last)->ip_defrag_offset + last->len) + return IPFRAG_OVERLAP; + if (offset == FRAG_CB(last)->ip_defrag_offset + last->len) + fragrun_append_to_last(q, skb); + else + fragrun_create(q, skb); + } else { + /* Binary search. 
Note that skb can become the first fragment, + * but not the last (covered above). + */ + struct rb_node **rbn, *parent; + + rbn = &q->rb_fragments.rb_node; + do { + struct sk_buff *curr; + int curr_run_end; + + parent = *rbn; + curr = rb_to_skb(parent); + curr_run_end = FRAG_CB(curr)->ip_defrag_offset + + FRAG_CB(curr)->frag_run_len; + if (end <= FRAG_CB(curr)->ip_defrag_offset) + rbn = &parent->rb_left; + else if (offset >= curr_run_end) + rbn = &parent->rb_right; + else if (offset >= FRAG_CB(curr)->ip_defrag_offset && + end <= curr_run_end) + return IPFRAG_DUP; + else + return IPFRAG_OVERLAP; + } while (*rbn); + /* Here we have parent properly set, and rbn pointing to + * one of its NULL left/right children. Insert skb. + */ + fragcb_clear(skb); + rb_link_node(&skb->rbnode, parent, rbn); + rb_insert_color(&skb->rbnode, &q->rb_fragments); } -#endif - qp = qp_in; - if (!mod_timer(&qp->timer, jiffies + nf->timeout)) - atomic_inc(&qp->refcnt); - - atomic_inc(&qp->refcnt); - hlist_add_head(&qp->list, &hb->chain); - spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); - inet_frag_lru_add(nf, qp); - return qp; + + FRAG_CB(skb)->ip_defrag_offset = offset; + + return IPFRAG_OK; } +EXPORT_SYMBOL(inet_frag_queue_insert); -static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, - struct inet_frags *f, void *arg) +void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, + struct sk_buff *parent) { - struct inet_frag_queue *q; + struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); + void (*destructor)(struct sk_buff *); + unsigned int orig_truesize = 0; + struct sk_buff **nextp = NULL; + struct sock *sk = skb->sk; + int delta; + + if (sk && is_skb_wmem(skb)) { + /* TX: skb->sk might have been passed as argument to + * dst->output and must remain valid until tx completes. + * + * Move sk to reassembled skb and fix up wmem accounting. + */ + orig_truesize = skb->truesize; + destructor = skb->destructor; + } - q = kzalloc(f->qsize, GFP_ATOMIC); - if (q == NULL) - return NULL; + if (head != skb) { + fp = skb_clone(skb, GFP_ATOMIC); + if (!fp) { + head = skb; + goto out_restore_sk; + } + FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; + if (RB_EMPTY_NODE(&skb->rbnode)) + FRAG_CB(parent)->next_frag = fp; + else + rb_replace_node(&skb->rbnode, &fp->rbnode, + &q->rb_fragments); + if (q->fragments_tail == skb) + q->fragments_tail = fp; + + if (orig_truesize) { + /* prevent skb_morph from releasing sk */ + skb->sk = NULL; + skb->destructor = NULL; + } + skb_morph(skb, head); + FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; + rb_replace_node(&head->rbnode, &skb->rbnode, + &q->rb_fragments); + consume_skb(head); + head = skb; + } + WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0); - q->net = nf; - f->constructor(q, arg); - add_frag_mem_limit(q, f->qsize); + delta = -head->truesize; - setup_timer(&q->timer, f->frag_expire, (unsigned long)q); - spin_lock_init(&q->lock); - atomic_set(&q->refcnt, 1); - INIT_LIST_HEAD(&q->lru_list); + /* Head of list must not be cloned. */ + if (skb_unclone(head, GFP_ATOMIC)) + goto out_restore_sk; - return q; -} + delta += head->truesize; + if (delta) + add_frag_mem_limit(q->fqdir, delta); -static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - struct inet_frags *f, void *arg) -{ - struct inet_frag_queue *q; + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. 
+ */ + if (skb_has_frag_list(head)) { + struct sk_buff *clone; + int i, plen = 0; + + clone = alloc_skb(0, GFP_ATOMIC); + if (!clone) + goto out_restore_sk; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_frag_list_init(head); + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); + clone->data_len = head->data_len - plen; + clone->len = clone->data_len; + head->truesize += clone->truesize; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + add_frag_mem_limit(q->fqdir, clone->truesize); + skb_shinfo(head)->frag_list = clone; + nextp = &clone->next; + } else { + nextp = &skb_shinfo(head)->frag_list; + } - q = inet_frag_alloc(nf, f, arg); - if (q == NULL) - return NULL; +out_restore_sk: + if (orig_truesize) { + int ts_delta = head->truesize - orig_truesize; + + /* if this reassembled skb is fragmented later, + * fraglist skbs will get skb->sk assigned from head->sk, + * and each frag skb will be released via sock_wfree. + * + * Update sk_wmem_alloc. + */ + head->sk = sk; + head->destructor = destructor; + refcount_add(ts_delta, &sk->sk_wmem_alloc); + } - return inet_frag_intern(nf, q, f, arg); + return nextp; } +EXPORT_SYMBOL(inet_frag_reasm_prepare); -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, unsigned int hash) - __releases(&f->lock) +void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, + void *reasm_data, bool try_coalesce) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - int depth = 0; + struct sock *sk = is_skb_wmem(head) ? head->sk : NULL; + const unsigned int head_truesize = head->truesize; + struct sk_buff **nextp = reasm_data; + struct rb_node *rbn; + struct sk_buff *fp; + int sum_truesize; + + skb_push(head, head->data - skb_network_header(head)); + + /* Traverse the tree in order, to build frag_list. */ + fp = FRAG_CB(head)->next_frag; + rbn = rb_next(&head->rbnode); + rb_erase(&head->rbnode, &q->rb_fragments); + + sum_truesize = head->truesize; + while (rbn || fp) { + /* fp points to the next sk_buff in the current run; + * rbn points to the next run. + */ + /* Go through the current run. */ + while (fp) { + struct sk_buff *next_frag = FRAG_CB(fp)->next_frag; + bool stolen; + int delta; + + sum_truesize += fp->truesize; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + + if (try_coalesce && skb_try_coalesce(head, fp, &stolen, + &delta)) { + kfree_skb_partial(fp, stolen); + } else { + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); + fp->sk = NULL; + + head->data_len += fp->len; + head->len += fp->len; + head->truesize += fp->truesize; + + *nextp = fp; + nextp = &fp->next; + } - hb = &f->hash[hash]; + fp = next_frag; + } + /* Move to the next run. 
*/ + if (rbn) { + struct rb_node *rbnext = rb_next(rbn); - spin_lock(&hb->chain_lock); - hlist_for_each_entry(q, &hb->chain, list) { - if (q->net == nf && f->match(q, key)) { - atomic_inc(&q->refcnt); - spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); - return q; + fp = rb_to_skb(rbn); + rb_erase(rbn, &q->rb_fragments); + rbn = rbnext; } - depth++; } - spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); + sub_frag_mem_limit(q->fqdir, sum_truesize); - if (depth <= INETFRAGS_MAXDEPTH) - return inet_frag_create(nf, f, key); - else - return ERR_PTR(-ENOBUFS); + *nextp = NULL; + skb_mark_not_on_list(head); + head->prev = NULL; + head->tstamp = q->stamp; + head->tstamp_type = q->tstamp_type; + + if (sk) + refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); } -EXPORT_SYMBOL(inet_frag_find); +EXPORT_SYMBOL(inet_frag_reasm_finish); -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, - const char *prefix) +struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) { - static const char msg[] = "inet_frag_find: Fragment hash bucket" - " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) - ". Dropping fragment.\n"; + struct sk_buff *head, *skb; + + head = skb_rb_first(&q->rb_fragments); + if (!head) + return NULL; + skb = FRAG_CB(head)->next_frag; + if (skb) + rb_replace_node(&head->rbnode, &skb->rbnode, + &q->rb_fragments); + else + rb_erase(&head->rbnode, &q->rb_fragments); + memset(&head->rbnode, 0, sizeof(head->rbnode)); + barrier(); + + if (head == q->fragments_tail) + q->fragments_tail = NULL; + + sub_frag_mem_limit(q->fqdir, head->truesize); - if (PTR_ERR(q) == -ENOBUFS) - LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg); + return head; } -EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); +EXPORT_SYMBOL(inet_frag_pull_head); |
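
The rb-tree walk in inet_frag_queue_insert() above classifies an incoming fragment against each run it visits: disjoint ranges keep the search going, a fully covered fragment is a duplicate that is dropped on its own, and any partial overlap poisons the whole queue per RFC 5722 as amended by errata 3089. The same per-run decision can be sketched in isolation; the enum and function names below are illustrative assumptions, and only the branch logic mirrors the kernel code:

/* Userspace sketch of the classification applied against one run.
 * [run_start, run_end) is data already queued, [offset, end) is the new
 * fragment. Names are illustrative, not the kernel's.
 */
#include <assert.h>
#include <stdio.h>

enum frag_verdict { FRAG_OK, FRAG_DUP, FRAG_OVERLAP };

static enum frag_verdict classify(int run_start, int run_end,
				  int offset, int end)
{
	if (end <= run_start || offset >= run_end)
		return FRAG_OK;        /* disjoint: keep searching / insert */
	if (offset >= run_start && end <= run_end)
		return FRAG_DUP;       /* fully covered: drop this skb only */
	return FRAG_OVERLAP;           /* partial overlap: drop whole queue */
}

int main(void)
{
	/* existing run covers bytes [1000, 2000) */
	assert(classify(1000, 2000, 2000, 3000) == FRAG_OK);      /* adjacent */
	assert(classify(1000, 2000, 1200, 1800) == FRAG_DUP);     /* inside   */
	assert(classify(1000, 2000, 1500, 2500) == FRAG_OVERLAP); /* partial  */
	printf("classification ok\n");
	return 0;
}
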
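The teardown side of the patch relies on a "last put signals completion" pattern: every fqdir holds a reference on its struct inet_frags, and whichever path drops the final reference (fqdir_free_fn() or inet_frags_fini() itself) completes f->completion, which the module-exit path waits on before destroying the kmem cache. A hedged pthread sketch of that pattern follows; the names, the condvar-based completion, and the single worker thread are assumptions for illustration, not kernel code:

/* Userspace sketch of the inet_frags_fini()/fqdir_free_fn() refcounting:
 * the exit path drops its own reference, then blocks until every
 * outstanding user has dropped theirs. Completion is emulated with a
 * mutex and condition variable. Build with -pthread.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

struct frags {
	atomic_int refcnt;
	pthread_mutex_t lock;
	pthread_cond_t done_cond;
	int done;
};

static void frags_put(struct frags *f)
{
	if (atomic_fetch_sub(&f->refcnt, 1) == 1) {
		/* we dropped the last reference: signal the waiter */
		pthread_mutex_lock(&f->lock);
		f->done = 1;
		pthread_cond_signal(&f->done_cond);
		pthread_mutex_unlock(&f->lock);
	}
}

static void *worker(void *arg)
{
	struct frags *f = arg;

	usleep(10000);      /* pretend a netns is still being dismantled */
	frags_put(f);       /* drop the per-fqdir reference */
	return NULL;
}

int main(void)
{
	struct frags f = {
		.refcnt = 2,    /* one for the owner, one for the "fqdir" */
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.done_cond = PTHREAD_COND_INITIALIZER,
	};
	pthread_t tid;

	pthread_create(&tid, NULL, worker, &f);

	/* the inet_frags_fini() step: drop our reference, wait for the rest */
	frags_put(&f);
	pthread_mutex_lock(&f.lock);
	while (!f.done)
		pthread_cond_wait(&f.done_cond, &f.lock);
	pthread_mutex_unlock(&f.lock);

	pthread_join(tid, NULL);
	printf("all references dropped, safe to destroy caches\n");
	return 0;
}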
