diff options
Diffstat (limited to 'net/ipv4/ip_fragment.c')
| -rw-r--r-- | net/ipv4/ip_fragment.c | 199 |
1 files changed, 103 insertions, 96 deletions
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 486ecb0aeb87..f7012479713b 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -76,23 +76,27 @@ static u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); - struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, - frags); - struct net *net = container_of(ipv4, struct net, ipv4); - const struct frag_v4_compare_key *key = a; + struct net *net = q->fqdir->net; + struct inet_peer *p = NULL; q->key.v4 = *key; qp->ecn = 0; - qp->peer = q->net->max_dist ? - inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : - NULL; + if (q->fqdir->max_dist) { + rcu_read_lock(); + p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif); + if (p && !refcount_inc_not_zero(&p->refcnt)) + p = NULL; + rcu_read_unlock(); + } + qp->peer = p; } static void ip4_frag_free(struct inet_frag_queue *q) @@ -104,22 +108,6 @@ static void ip4_frag_free(struct inet_frag_queue *q) inet_putpeer(qp->peer); } - -/* Destruction primitives. */ - -static void ipq_put(struct ipq *ipq) -{ - inet_frag_put(&ipq->q); -} - -/* Kill ipq entry. It is not destroyed immediately, - * because caller (and someone more) holds reference count. - */ -static void ipq_kill(struct ipq *ipq) -{ - inet_frag_kill(&ipq->q); -} - static bool frag_expire_skip_icmp(u32 user) { return user == IP_DEFRAG_AF_PACKET || @@ -134,23 +122,30 @@ static bool frag_expire_skip_icmp(u32 user) */ static void ip_expire(struct timer_list *t) { - struct inet_frag_queue *frag = from_timer(frag, t, timer); + enum skb_drop_reason reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT; + struct inet_frag_queue *frag = timer_container_of(frag, t, timer); const struct iphdr *iph; struct sk_buff *head = NULL; struct net *net; struct ipq *qp; - int err; + int refs = 1; qp = container_of(frag, struct ipq, q); - net = container_of(qp->q.net, struct net, ipv4.frags); + net = qp->q.fqdir->net; rcu_read_lock(); + + /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */ + if (READ_ONCE(qp->q.fqdir->dead)) + goto out_rcu_unlock; + spin_lock(&qp->q.lock); if (qp->q.flags & INET_FRAG_COMPLETE) goto out; - ipq_kill(qp); + qp->q.flags |= INET_FRAG_DROP; + inet_frag_kill(&qp->q, &refs); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); @@ -171,14 +166,15 @@ static void ip_expire(struct timer_list *t) /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, - iph->tos, head->dev); - if (err) + reason = ip_route_input_noref(head, iph->daddr, iph->saddr, + ip4h_dscp(iph), head->dev); + if (reason) goto out; /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ + reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT; if (frag_expire_skip_icmp(qp->q.key.v4.user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; @@ -191,8 +187,8 @@ out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); - kfree_skb(head); - ipq_put(qp); + kfree_skb_reason(head, reason); + inet_frag_putn(&qp->q, refs); } /* Find the correct entry in the "incomplete datagrams" queue for @@ -211,7 +207,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, }; struct inet_frag_queue *q; - q = inet_frag_find(&net->ipv4.frags, &key); + q = inet_frag_find(net->ipv4.fqdir, &key); if (!q) return NULL; @@ -222,7 +218,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; - unsigned int max = qp->q.net->max_dist; + unsigned int max = qp->q.fqdir->max_dist; unsigned int start, end; int rc; @@ -236,12 +232,8 @@ static int ip_frag_too_far(struct ipq *qp) rc = qp->q.fragments_tail && (end - start) > max; - if (rc) { - struct net *net; - - net = container_of(qp->q.net, struct net, ipv4.frags); - __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - } + if (rc) + __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS); return rc; } @@ -250,18 +242,18 @@ static int ip_frag_reinit(struct ipq *qp) { unsigned int sum_truesize = 0; - if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { + if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) { refcount_inc(&qp->q.refcnt); return -ETIMEDOUT; } - sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments); - sub_frag_mem_limit(qp->q.net, sum_truesize); + sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments, + SKB_DROP_REASON_FRAG_TOO_FAR); + sub_frag_mem_limit(qp->q.fqdir, sum_truesize); qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; - qp->q.fragments = NULL; qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->q.last_run_head = NULL; @@ -272,23 +264,27 @@ static int ip_frag_reinit(struct ipq *qp) } /* Add new segment to existing queue. */ -static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) +static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb, int *refs) { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct net *net = qp->q.fqdir->net; int ihl, end, flags, offset; struct sk_buff *prev_tail; struct net_device *dev; unsigned int fragsize; int err = -ENOENT; + SKB_DR(reason); u8 ecn; - if (qp->q.flags & INET_FRAG_COMPLETE) + /* If reassembly is already done, @skb must be a duplicate frag. */ + if (qp->q.flags & INET_FRAG_COMPLETE) { + SKB_DR_SET(reason, DUP_FRAG); goto err; + } if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && unlikely(ip_frag_too_far(qp)) && unlikely(err = ip_frag_reinit(qp))) { - ipq_kill(qp); + inet_frag_kill(&qp->q, refs); goto err; } @@ -351,9 +347,10 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->iif = dev->ifindex; qp->q.stamp = skb->tstamp; + qp->q.tstamp_type = skb->tstamp_type; qp->q.meat += skb->len; qp->ecn |= ecn; - add_frag_mem_limit(qp->q.net, skb->truesize); + add_frag_mem_limit(qp->q.fqdir, skb->truesize); if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; @@ -371,42 +368,50 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip_frag_reasm(qp, skb, prev_tail, dev); + err = ip_frag_reasm(qp, skb, prev_tail, dev, refs); skb->_skb_refdst = orefdst; if (err) - inet_frag_kill(&qp->q); + inet_frag_kill(&qp->q, refs); return err; } skb_dst_drop(skb); + skb_orphan(skb); return -EINPROGRESS; insert_error: if (err == IPFRAG_DUP) { - kfree_skb(skb); - return -EINVAL; + SKB_DR_SET(reason, DUP_FRAG); + err = -EINVAL; + goto err; } err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: - inet_frag_kill(&qp->q); + inet_frag_kill(&qp->q, refs); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: - kfree_skb(skb); + kfree_skb_reason(skb, reason); return err; } +static bool ip_frag_coalesce_ok(const struct ipq *qp) +{ + return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER; +} + /* Build a new IP datagram from all its fragments. */ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { - struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct net *net = qp->q.fqdir->net; struct iphdr *iph; void *reasm_data; int len, err; u8 ecn; - ipq_kill(qp); + inet_frag_kill(&qp->q, refs); ecn = ip_frag_ecn_table[qp->ecn]; if (unlikely(ecn == 0xff)) { @@ -424,7 +429,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, if (len > 65535) goto out_oversize; - inet_frag_reasm_finish(&qp->q, skb, reasm_data); + inet_frag_reasm_finish(&qp->q, skb, reasm_data, + ip_frag_coalesce_ok(qp)); skb->dev = dev; IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size); @@ -451,7 +457,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, ip_send_check(iph); __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); - qp->q.fragments = NULL; qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->q.last_run_head = NULL; @@ -471,26 +476,30 @@ out_fail: /* Process an incoming IP datagram fragment. */ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { - struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; - int vif = l3mdev_master_ifindex_rcu(dev); + struct net_device *dev; struct ipq *qp; + int vif; __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); - skb_orphan(skb); /* Lookup (or create) queue header */ + rcu_read_lock(); + dev = skb->dev ? : skb_dst_dev_rcu(skb); + vif = l3mdev_master_ifindex_rcu(dev); qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { - int ret; + int ret, refs = 0; spin_lock(&qp->q.lock); - ret = ip_frag_queue(qp, skb); + ret = ip_frag_queue(qp, skb, &refs); spin_unlock(&qp->q.lock); - ipq_put(qp); + rcu_read_unlock(); + inet_frag_putn(&qp->q, refs); return ret; } + rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); @@ -546,36 +555,29 @@ static int dist_min; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", - .data = &init_net.ipv4.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", - .data = &init_net.ipv4.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ipv4.frags.high_thresh }, { .procname = "ipfrag_time", - .data = &init_net.ipv4.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ipfrag_max_dist", - .data = &init_net.ipv4.frags.max_dist, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &dist_min, }, - { } }; /* secret interval has been deprecated */ @@ -588,7 +590,6 @@ static struct ctl_table ip4_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; static int __net_init ip4_frags_ns_ctl_register(struct net *net) @@ -602,15 +603,16 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) if (!table) goto err_alloc; - table[0].data = &net->ipv4.frags.high_thresh; - table[0].extra1 = &net->ipv4.frags.low_thresh; - table[1].data = &net->ipv4.frags.low_thresh; - table[1].extra2 = &net->ipv4.frags.high_thresh; - table[2].data = &net->ipv4.frags.timeout; - table[3].data = &net->ipv4.frags.max_dist; } - - hdr = register_net_sysctl(net, "net/ipv4", table); + table[0].data = &net->ipv4.fqdir->high_thresh; + table[0].extra1 = &net->ipv4.fqdir->low_thresh; + table[1].data = &net->ipv4.fqdir->low_thresh; + table[1].extra2 = &net->ipv4.fqdir->high_thresh; + table[2].data = &net->ipv4.fqdir->timeout; + table[3].data = &net->ipv4.fqdir->max_dist; + + hdr = register_net_sysctl_sz(net, "net/ipv4", table, + ARRAY_SIZE(ip4_frags_ns_ctl_table)); if (!hdr) goto err_reg; @@ -626,7 +628,7 @@ err_alloc: static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->ipv4.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.frags_hdr); @@ -656,6 +658,9 @@ static int __net_init ipv4_frags_init_net(struct net *net) { int res; + res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net); + if (res < 0) + return res; /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -670,36 +675,38 @@ static int __net_init ipv4_frags_init_net(struct net *net) * we will prune down to 3MB, making room for approx 8 big 64K * fragments 8x128k. */ - net->ipv4.frags.high_thresh = 4 * 1024 * 1024; - net->ipv4.frags.low_thresh = 3 * 1024 * 1024; + net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024; + net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival * by TTL. */ - net->ipv4.frags.timeout = IP_FRAG_TIME; + net->ipv4.fqdir->timeout = IP_FRAG_TIME; - net->ipv4.frags.max_dist = 64; - net->ipv4.frags.f = &ip4_frags; + net->ipv4.fqdir->max_dist = 64; - res = inet_frags_init_net(&net->ipv4.frags); - if (res < 0) - return res; res = ip4_frags_ns_ctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv4.frags); + fqdir_exit(net->ipv4.fqdir); return res; } +static void __net_exit ipv4_frags_pre_exit_net(struct net *net) +{ + fqdir_pre_exit(net->ipv4.fqdir); +} + static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); - inet_frags_exit_net(&net->ipv4.frags); + fqdir_exit(net->ipv4.fqdir); } static struct pernet_operations ip4_frags_ops = { - .init = ipv4_frags_init_net, - .exit = ipv4_frags_exit_net, + .init = ipv4_frags_init_net, + .pre_exit = ipv4_frags_pre_exit_net, + .exit = ipv4_frags_exit_net, }; |
