summaryrefslogtreecommitdiff
path: root/net/ipv4/ip_fragment.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/ip_fragment.c')
-rw-r--r--net/ipv4/ip_fragment.c199
1 files changed, 103 insertions, 96 deletions
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 486ecb0aeb87..f7012479713b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -76,23 +76,27 @@ static u8 ip4_frag_ecn(u8 tos)
static struct inet_frags ip4_frags;
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev);
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs);
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);
- struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
- frags);
- struct net *net = container_of(ipv4, struct net, ipv4);
-
const struct frag_v4_compare_key *key = a;
+ struct net *net = q->fqdir->net;
+ struct inet_peer *p = NULL;
q->key.v4 = *key;
qp->ecn = 0;
- qp->peer = q->net->max_dist ?
- inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
- NULL;
+ if (q->fqdir->max_dist) {
+ rcu_read_lock();
+ p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif);
+ if (p && !refcount_inc_not_zero(&p->refcnt))
+ p = NULL;
+ rcu_read_unlock();
+ }
+ qp->peer = p;
}
static void ip4_frag_free(struct inet_frag_queue *q)
@@ -104,22 +108,6 @@ static void ip4_frag_free(struct inet_frag_queue *q)
inet_putpeer(qp->peer);
}
-
-/* Destruction primitives. */
-
-static void ipq_put(struct ipq *ipq)
-{
- inet_frag_put(&ipq->q);
-}
-
-/* Kill ipq entry. It is not destroyed immediately,
- * because caller (and someone more) holds reference count.
- */
-static void ipq_kill(struct ipq *ipq)
-{
- inet_frag_kill(&ipq->q);
-}
-
static bool frag_expire_skip_icmp(u32 user)
{
return user == IP_DEFRAG_AF_PACKET ||
@@ -134,23 +122,30 @@ static bool frag_expire_skip_icmp(u32 user)
*/
static void ip_expire(struct timer_list *t)
{
- struct inet_frag_queue *frag = from_timer(frag, t, timer);
+ enum skb_drop_reason reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT;
+ struct inet_frag_queue *frag = timer_container_of(frag, t, timer);
const struct iphdr *iph;
struct sk_buff *head = NULL;
struct net *net;
struct ipq *qp;
- int err;
+ int refs = 1;
qp = container_of(frag, struct ipq, q);
- net = container_of(qp->q.net, struct net, ipv4.frags);
+ net = qp->q.fqdir->net;
rcu_read_lock();
+
+ /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */
+ if (READ_ONCE(qp->q.fqdir->dead))
+ goto out_rcu_unlock;
+
spin_lock(&qp->q.lock);
if (qp->q.flags & INET_FRAG_COMPLETE)
goto out;
- ipq_kill(qp);
+ qp->q.flags |= INET_FRAG_DROP;
+ inet_frag_kill(&qp->q, &refs);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
@@ -171,14 +166,15 @@ static void ip_expire(struct timer_list *t)
/* skb has no dst, perform route lookup again */
iph = ip_hdr(head);
- err = ip_route_input_noref(head, iph->daddr, iph->saddr,
- iph->tos, head->dev);
- if (err)
+ reason = ip_route_input_noref(head, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), head->dev);
+ if (reason)
goto out;
/* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
+ reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT;
if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
(skb_rtable(head)->rt_type != RTN_LOCAL))
goto out;
@@ -191,8 +187,8 @@ out:
spin_unlock(&qp->q.lock);
out_rcu_unlock:
rcu_read_unlock();
- kfree_skb(head);
- ipq_put(qp);
+ kfree_skb_reason(head, reason);
+ inet_frag_putn(&qp->q, refs);
}
/* Find the correct entry in the "incomplete datagrams" queue for
@@ -211,7 +207,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
};
struct inet_frag_queue *q;
- q = inet_frag_find(&net->ipv4.frags, &key);
+ q = inet_frag_find(net->ipv4.fqdir, &key);
if (!q)
return NULL;
@@ -222,7 +218,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
static int ip_frag_too_far(struct ipq *qp)
{
struct inet_peer *peer = qp->peer;
- unsigned int max = qp->q.net->max_dist;
+ unsigned int max = qp->q.fqdir->max_dist;
unsigned int start, end;
int rc;
@@ -236,12 +232,8 @@ static int ip_frag_too_far(struct ipq *qp)
rc = qp->q.fragments_tail && (end - start) > max;
- if (rc) {
- struct net *net;
-
- net = container_of(qp->q.net, struct net, ipv4.frags);
- __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
- }
+ if (rc)
+ __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS);
return rc;
}
@@ -250,18 +242,18 @@ static int ip_frag_reinit(struct ipq *qp)
{
unsigned int sum_truesize = 0;
- if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
+ if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
refcount_inc(&qp->q.refcnt);
return -ETIMEDOUT;
}
- sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
- sub_frag_mem_limit(qp->q.net, sum_truesize);
+ sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments,
+ SKB_DROP_REASON_FRAG_TOO_FAR);
+ sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
qp->q.flags = 0;
qp->q.len = 0;
qp->q.meat = 0;
- qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
qp->q.last_run_head = NULL;
@@ -272,23 +264,27 @@ static int ip_frag_reinit(struct ipq *qp)
}
/* Add new segment to existing queue. */
-static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb, int *refs)
{
- struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct net *net = qp->q.fqdir->net;
int ihl, end, flags, offset;
struct sk_buff *prev_tail;
struct net_device *dev;
unsigned int fragsize;
int err = -ENOENT;
+ SKB_DR(reason);
u8 ecn;
- if (qp->q.flags & INET_FRAG_COMPLETE)
+ /* If reassembly is already done, @skb must be a duplicate frag. */
+ if (qp->q.flags & INET_FRAG_COMPLETE) {
+ SKB_DR_SET(reason, DUP_FRAG);
goto err;
+ }
if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
unlikely(ip_frag_too_far(qp)) &&
unlikely(err = ip_frag_reinit(qp))) {
- ipq_kill(qp);
+ inet_frag_kill(&qp->q, refs);
goto err;
}
@@ -351,9 +347,10 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
qp->iif = dev->ifindex;
qp->q.stamp = skb->tstamp;
+ qp->q.tstamp_type = skb->tstamp_type;
qp->q.meat += skb->len;
qp->ecn |= ecn;
- add_frag_mem_limit(qp->q.net, skb->truesize);
+ add_frag_mem_limit(qp->q.fqdir, skb->truesize);
if (offset == 0)
qp->q.flags |= INET_FRAG_FIRST_IN;
@@ -371,42 +368,50 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = ip_frag_reasm(qp, skb, prev_tail, dev);
+ err = ip_frag_reasm(qp, skb, prev_tail, dev, refs);
skb->_skb_refdst = orefdst;
if (err)
- inet_frag_kill(&qp->q);
+ inet_frag_kill(&qp->q, refs);
return err;
}
skb_dst_drop(skb);
+ skb_orphan(skb);
return -EINPROGRESS;
insert_error:
if (err == IPFRAG_DUP) {
- kfree_skb(skb);
- return -EINVAL;
+ SKB_DR_SET(reason, DUP_FRAG);
+ err = -EINVAL;
+ goto err;
}
err = -EINVAL;
__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
discard_qp:
- inet_frag_kill(&qp->q);
+ inet_frag_kill(&qp->q, refs);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
err:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return err;
}
+static bool ip_frag_coalesce_ok(const struct ipq *qp)
+{
+ return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER;
+}
+
/* Build a new IP datagram from all its fragments. */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev)
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs)
{
- struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct net *net = qp->q.fqdir->net;
struct iphdr *iph;
void *reasm_data;
int len, err;
u8 ecn;
- ipq_kill(qp);
+ inet_frag_kill(&qp->q, refs);
ecn = ip_frag_ecn_table[qp->ecn];
if (unlikely(ecn == 0xff)) {
@@ -424,7 +429,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
if (len > 65535)
goto out_oversize;
- inet_frag_reasm_finish(&qp->q, skb, reasm_data);
+ inet_frag_reasm_finish(&qp->q, skb, reasm_data,
+ ip_frag_coalesce_ok(qp));
skb->dev = dev;
IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@@ -451,7 +457,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
ip_send_check(iph);
__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
- qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
qp->q.last_run_head = NULL;
@@ -471,26 +476,30 @@ out_fail:
/* Process an incoming IP datagram fragment. */
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
- struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
- int vif = l3mdev_master_ifindex_rcu(dev);
+ struct net_device *dev;
struct ipq *qp;
+ int vif;
__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
- skb_orphan(skb);
/* Lookup (or create) queue header */
+ rcu_read_lock();
+ dev = skb->dev ? : skb_dst_dev_rcu(skb);
+ vif = l3mdev_master_ifindex_rcu(dev);
qp = ip_find(net, ip_hdr(skb), user, vif);
if (qp) {
- int ret;
+ int ret, refs = 0;
spin_lock(&qp->q.lock);
- ret = ip_frag_queue(qp, skb);
+ ret = ip_frag_queue(qp, skb, &refs);
spin_unlock(&qp->q.lock);
- ipq_put(qp);
+ rcu_read_unlock();
+ inet_frag_putn(&qp->q, refs);
return ret;
}
+ rcu_read_unlock();
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
kfree_skb(skb);
@@ -546,36 +555,29 @@ static int dist_min;
static struct ctl_table ip4_frags_ns_ctl_table[] = {
{
.procname = "ipfrag_high_thresh",
- .data = &init_net.ipv4.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.ipv4.frags.low_thresh
},
{
.procname = "ipfrag_low_thresh",
- .data = &init_net.ipv4.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.ipv4.frags.high_thresh
},
{
.procname = "ipfrag_time",
- .data = &init_net.ipv4.frags.timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ipfrag_max_dist",
- .data = &init_net.ipv4.frags.max_dist,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &dist_min,
},
- { }
};
/* secret interval has been deprecated */
@@ -588,7 +590,6 @@ static struct ctl_table ip4_frags_ctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { }
};
static int __net_init ip4_frags_ns_ctl_register(struct net *net)
@@ -602,15 +603,16 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
if (!table)
goto err_alloc;
- table[0].data = &net->ipv4.frags.high_thresh;
- table[0].extra1 = &net->ipv4.frags.low_thresh;
- table[1].data = &net->ipv4.frags.low_thresh;
- table[1].extra2 = &net->ipv4.frags.high_thresh;
- table[2].data = &net->ipv4.frags.timeout;
- table[3].data = &net->ipv4.frags.max_dist;
}
-
- hdr = register_net_sysctl(net, "net/ipv4", table);
+ table[0].data = &net->ipv4.fqdir->high_thresh;
+ table[0].extra1 = &net->ipv4.fqdir->low_thresh;
+ table[1].data = &net->ipv4.fqdir->low_thresh;
+ table[1].extra2 = &net->ipv4.fqdir->high_thresh;
+ table[2].data = &net->ipv4.fqdir->timeout;
+ table[3].data = &net->ipv4.fqdir->max_dist;
+
+ hdr = register_net_sysctl_sz(net, "net/ipv4", table,
+ ARRAY_SIZE(ip4_frags_ns_ctl_table));
if (!hdr)
goto err_reg;
@@ -626,7 +628,7 @@ err_alloc:
static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
table = net->ipv4.frags_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv4.frags_hdr);
@@ -656,6 +658,9 @@ static int __net_init ipv4_frags_init_net(struct net *net)
{
int res;
+ res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net);
+ if (res < 0)
+ return res;
/* Fragment cache limits.
*
* The fragment memory accounting code, (tries to) account for
@@ -670,36 +675,38 @@ static int __net_init ipv4_frags_init_net(struct net *net)
* we will prune down to 3MB, making room for approx 8 big 64K
* fragments 8x128k.
*/
- net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
- net->ipv4.frags.low_thresh = 3 * 1024 * 1024;
+ net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024;
+ net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024;
/*
* Important NOTE! Fragment queue must be destroyed before MSL expires.
* RFC791 is wrong proposing to prolongate timer each fragment arrival
* by TTL.
*/
- net->ipv4.frags.timeout = IP_FRAG_TIME;
+ net->ipv4.fqdir->timeout = IP_FRAG_TIME;
- net->ipv4.frags.max_dist = 64;
- net->ipv4.frags.f = &ip4_frags;
+ net->ipv4.fqdir->max_dist = 64;
- res = inet_frags_init_net(&net->ipv4.frags);
- if (res < 0)
- return res;
res = ip4_frags_ns_ctl_register(net);
if (res < 0)
- inet_frags_exit_net(&net->ipv4.frags);
+ fqdir_exit(net->ipv4.fqdir);
return res;
}
+static void __net_exit ipv4_frags_pre_exit_net(struct net *net)
+{
+ fqdir_pre_exit(net->ipv4.fqdir);
+}
+
static void __net_exit ipv4_frags_exit_net(struct net *net)
{
ip4_frags_ns_ctl_unregister(net);
- inet_frags_exit_net(&net->ipv4.frags);
+ fqdir_exit(net->ipv4.fqdir);
}
static struct pernet_operations ip4_frags_ops = {
- .init = ipv4_frags_init_net,
- .exit = ipv4_frags_exit_net,
+ .init = ipv4_frags_init_net,
+ .pre_exit = ipv4_frags_pre_exit_net,
+ .exit = ipv4_frags_exit_net,
};