Diffstat (limited to 'net/xfrm/xfrm_policy.c')
| Mode | Path | Lines changed |
| -rw-r--r-- | net/xfrm/xfrm_policy.c | 1167 |
1 file changed, 820 insertions, 347 deletions
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 934492bad8e0..62486f866975 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * xfrm_policy.c * @@ -27,13 +28,24 @@ #include <linux/cpu.h> #include <linux/audit.h> #include <linux/rhashtable.h> +#include <linux/if_tunnel.h> +#include <linux/icmp.h> #include <net/dst.h> #include <net/flow.h> +#include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/ip.h> +#include <net/gre.h> +#if IS_ENABLED(CONFIG_IPV6_MIP6) +#include <net/mip6.h> +#endif #ifdef CONFIG_XFRM_STATISTICS #include <net/snmp.h> #endif +#ifdef CONFIG_XFRM_ESPINTCP +#include <net/espintcp.h> +#endif +#include <net/inet_dscp.h> #include "xfrm_hash.h" @@ -98,7 +110,11 @@ struct xfrm_pol_inexact_node { * 4. saddr:any list from saddr tree * * This result set then needs to be searched for the policy with - * the lowest priority. If two results have same prio, youngest one wins. + * the lowest priority. If two candidates have the same priority, the + * struct xfrm_policy pos member with the lower number is used. + * + * This replicates previous single-list-search algorithm which would + * return first matching policy in the (ordered-by-priority) list. */ struct xfrm_pol_inexact_key { @@ -114,7 +130,7 @@ struct xfrm_pol_inexact_bin { /* list containing '*:*' policies */ struct hlist_head hhead; - seqcount_t count; + seqcount_spinlock_t count; /* tree sorted by daddr/prefix */ struct rb_root root_d; @@ -139,6 +155,21 @@ struct xfrm_pol_inexact_candidates { struct hlist_head *res[XFRM_POL_CAND_MAX]; }; +struct xfrm_flow_keys { + struct flow_dissector_key_basic basic; + struct flow_dissector_key_control control; + union { + struct flow_dissector_key_ipv4_addrs ipv4; + struct flow_dissector_key_ipv6_addrs ipv6; + } addrs; + struct flow_dissector_key_ip ip; + struct flow_dissector_key_icmp icmp; + struct flow_dissector_key_ports ports; + struct flow_dissector_key_keyid gre; +}; + +static struct flow_dissector xfrm_session_dissector __ro_after_init; + static DEFINE_SPINLOCK(xfrm_if_cb_lock); static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; @@ -147,7 +178,6 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; static struct kmem_cache *xfrm_dst_cache __ro_after_init; -static __read_mostly seqcount_t xfrm_policy_hash_generation; static struct rhashtable xfrm_policy_inexact_table; static const struct rhashtable_params xfrm_pol_inexact_params; @@ -171,8 +201,6 @@ xfrm_policy_inexact_lookup_rcu(struct net *net, static struct xfrm_policy * xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl); -static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, - struct xfrm_policy *policy); static bool xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, @@ -242,10 +270,8 @@ static const struct xfrm_if_cb *xfrm_if_get_cb(void) return rcu_dereference(xfrm_if_cb); } -struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - int family, u32 mark) +struct dst_entry *__xfrm_dst_lookup(int family, + const struct xfrm_dst_lookup_params *params) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; @@ -254,7 +280,7 @@ struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); - dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, 
mark); + dst = afinfo->dst_lookup(params); rcu_read_unlock(); @@ -263,11 +289,12 @@ struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, EXPORT_SYMBOL(__xfrm_dst_lookup); static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, - int tos, int oif, + dscp_t dscp, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, int family, u32 mark) { + struct xfrm_dst_lookup_params params; struct net *net = xs_net(x); xfrm_address_t *saddr = &x->props.saddr; xfrm_address_t *daddr = &x->id.daddr; @@ -282,7 +309,29 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, daddr = x->coaddr; } - dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark); + params.net = net; + params.saddr = saddr; + params.daddr = daddr; + params.dscp = dscp; + params.oif = oif; + params.mark = mark; + params.ipproto = x->id.proto; + if (x->encap) { + switch (x->encap->encap_type) { + case UDP_ENCAP_ESPINUDP: + params.ipproto = IPPROTO_UDP; + params.uli.ports.sport = x->encap->encap_sport; + params.uli.ports.dport = x->encap->encap_dport; + break; + case TCP_ENCAP_ESPINTCP: + params.ipproto = IPPROTO_TCP; + params.uli.ports.sport = x->encap->encap_sport; + params.uli.ports.dport = x->encap->encap_dport; + break; + } + } + + dst = __xfrm_dst_lookup(family, ¶ms); if (!IS_ERR(dst)) { if (prev_saddr != saddr) @@ -304,7 +353,7 @@ static inline unsigned long make_jiffies(long secs) static void xfrm_policy_timer(struct timer_list *t) { - struct xfrm_policy *xp = from_timer(xp, t, timer); + struct xfrm_policy *xp = timer_container_of(xp, t, timer); time64_t now = ktime_get_real_seconds(); time64_t next = TIME64_MAX; int warn = 0; @@ -327,7 +376,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.hard_use_expires_seconds) { time64_t tmo = xp->lft.hard_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) goto expired; if (tmo < next) @@ -345,7 +394,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.soft_use_expires_seconds) { time64_t tmo = xp->lft.soft_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? 
: xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; @@ -385,7 +434,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); - INIT_HLIST_NODE(&policy->bydst_inexact_list); + INIT_HLIST_HEAD(&policy->state_cache_list); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); @@ -413,9 +462,10 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) { BUG_ON(!policy->walk.dead); - if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer)) + if (timer_delete(&policy->timer) || timer_delete(&policy->polq.hold_timer)) BUG(); + xfrm_dev_policy_free(policy); call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); @@ -426,17 +476,31 @@ EXPORT_SYMBOL(xfrm_policy_destroy); static void xfrm_policy_kill(struct xfrm_policy *policy) { + struct net *net = xp_net(policy); + struct xfrm_state *x; + + xfrm_dev_policy_delete(policy); + + write_lock_bh(&policy->lock); policy->walk.dead = 1; + write_unlock_bh(&policy->lock); atomic_inc(&policy->genid); - if (del_timer(&policy->polq.hold_timer)) + if (timer_delete(&policy->polq.hold_timer)) xfrm_pol_put(policy); skb_queue_purge(&policy->polq.hold_queue); - if (del_timer(&policy->timer)) + if (timer_delete(&policy->timer)) xfrm_pol_put(policy); + /* XXX: Flush state cache */ + spin_lock_bh(&net->xfrm.xfrm_state_lock); + hlist_for_each_entry_rcu(x, &policy->state_cache_list, state_cache) { + hlist_del_init_rcu(&x->state_cache); + } + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + xfrm_pol_put(policy); } @@ -524,7 +588,7 @@ redo: __get_hash_thresh(net, pol->family, dir, &dbits, &sbits); h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, pol->family, nhashmask, dbits, sbits); - if (!entry0) { + if (!entry0 || pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { hlist_del_rcu(&pol->bydst); hlist_add_head_rcu(&pol->bydst, ndsttable + h); h0 = h; @@ -575,10 +639,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); - write_seqcount_begin(&xfrm_policy_hash_generation); - - odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, - lockdep_is_held(&net->xfrm.xfrm_policy_lock)); + write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)); @@ -589,7 +650,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst); net->xfrm.policy_bydst[dir].hmask = nhashmask; - write_seqcount_end(&xfrm_policy_hash_generation); + write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); synchronize_rcu(); @@ -597,7 +658,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head)); } -static void xfrm_byidx_resize(struct net *net, int total) +static void xfrm_byidx_resize(struct net *net) { unsigned int hmask = net->xfrm.policy_idx_hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); @@ -675,23 +736,13 @@ static void xfrm_hash_resize(struct work_struct *work) xfrm_bydst_resize(net, dir); } if (xfrm_byidx_should_resize(net, total)) - xfrm_byidx_resize(net, total); + xfrm_byidx_resize(net); mutex_unlock(&hash_resize_mutex); } -static void xfrm_hash_reset_inexact_table(struct net *net) -{ - struct xfrm_pol_inexact_bin *b; - - 
lockdep_assert_held(&net->xfrm.xfrm_policy_lock); - - list_for_each_entry(b, &net->xfrm.inexact_bins, inexact_bins) - INIT_HLIST_HEAD(&b->hhead); -} - /* Make sure *pol can be inserted into fastbin. - * Useful to check that later insert requests will be sucessful + * Useful to check that later insert requests will be successful * (provided xfrm_policy_lock is held throughout). */ static struct xfrm_pol_inexact_bin * @@ -722,7 +773,7 @@ xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) INIT_HLIST_HEAD(&bin->hhead); bin->root_d = RB_ROOT; bin->root_s = RB_ROOT; - seqcount_init(&bin->count); + seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock); prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, &bin->k, &bin->head, @@ -796,15 +847,22 @@ static int xfrm_policy_addr_delta(const xfrm_address_t *a, const xfrm_address_t *b, u8 prefixlen, u16 family) { + u32 ma, mb, mask; unsigned int pdw, pbi; int delta = 0; switch (family) { case AF_INET: - if (sizeof(long) == 4 && prefixlen == 0) - return ntohl(a->a4) - ntohl(b->a4); - return (ntohl(a->a4) & ((~0UL << (32 - prefixlen)))) - - (ntohl(b->a4) & ((~0UL << (32 - prefixlen)))); + if (prefixlen == 0) + return 0; + mask = ~0U << (32 - prefixlen); + ma = ntohl(a->a4) & mask; + mb = ntohl(b->a4) & mask; + if (ma < mb) + delta = -1; + else if (ma > mb) + delta = 1; + break; case AF_INET6: pdw = prefixlen >> 5; pbi = prefixlen & 0x1f; @@ -815,10 +873,13 @@ static int xfrm_policy_addr_delta(const xfrm_address_t *a, return delta; } if (pbi) { - u32 mask = ~0u << (32 - pbi); - - delta = (ntohl(a->a6[pdw]) & mask) - - (ntohl(b->a6[pdw]) & mask); + mask = ~0U << (32 - pbi); + ma = ntohl(a->a6[pdw]) & mask; + mb = ntohl(b->a6[pdw]) & mask; + if (ma < mb) + delta = -1; + else if (ma > mb) + delta = 1; } break; default: @@ -833,32 +894,35 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net, u16 family) { unsigned int matched_s, matched_d; - struct hlist_node *newpos = NULL; struct xfrm_policy *policy, *p; matched_s = 0; matched_d = 0; list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + struct hlist_node *newpos = NULL; bool matches_s, matches_d; - if (!policy->bydst_reinsert) + if (policy->walk.dead || !policy->bydst_reinsert) continue; WARN_ON_ONCE(policy->family != family); policy->bydst_reinsert = false; hlist_for_each_entry(p, &n->hhead, bydst) { - if (policy->priority >= p->priority) + if (policy->priority > p->priority) + newpos = &p->bydst; + else if (policy->priority == p->priority && + policy->pos > p->pos) newpos = &p->bydst; else break; } - if (newpos) - hlist_add_behind(&policy->bydst, newpos); + if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) + hlist_add_behind_rcu(&policy->bydst, newpos); else - hlist_add_head(&policy->bydst, &n->hhead); + hlist_add_head_rcu(&policy->bydst, &n->hhead); /* paranoia checks follow. 
* Check that the reinserted policy matches at least @@ -893,12 +957,13 @@ static void xfrm_policy_inexact_node_reinsert(struct net *net, struct rb_root *new, u16 family) { - struct rb_node **p, *parent = NULL; struct xfrm_pol_inexact_node *node; + struct rb_node **p, *parent; /* we should not have another subtree here */ WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root)); - +restart: + parent = NULL; p = &new->rb_node; while (*p) { u8 prefixlen; @@ -916,17 +981,19 @@ static void xfrm_policy_inexact_node_reinsert(struct net *net, } else if (delta > 0) { p = &parent->rb_right; } else { + bool same_prefixlen = node->prefixlen == n->prefixlen; struct xfrm_policy *tmp; - hlist_for_each_entry(tmp, &node->hhead, bydst) - tmp->bydst_reinsert = true; - hlist_for_each_entry(tmp, &n->hhead, bydst) + hlist_for_each_entry(tmp, &n->hhead, bydst) { tmp->bydst_reinsert = true; + hlist_del_rcu(&tmp->bydst); + } + + node->prefixlen = prefixlen; - INIT_HLIST_HEAD(&node->hhead); xfrm_policy_inexact_list_reinsert(net, node, family); - if (node->prefixlen == n->prefixlen) { + if (same_prefixlen) { kfree_rcu(n, rcu); return; } @@ -934,9 +1001,7 @@ static void xfrm_policy_inexact_node_reinsert(struct net *net, rb_erase(*p, new); kfree_rcu(n, rcu); n = node; - n->prefixlen = prefixlen; - *p = new->rb_node; - parent = NULL; + goto restart; } } @@ -965,12 +1030,11 @@ static void xfrm_policy_inexact_node_merge(struct net *net, family); } - hlist_for_each_entry(tmp, &v->hhead, bydst) - tmp->bydst_reinsert = true; - hlist_for_each_entry(tmp, &n->hhead, bydst) + hlist_for_each_entry(tmp, &v->hhead, bydst) { tmp->bydst_reinsert = true; + hlist_del_rcu(&tmp->bydst); + } - INIT_HLIST_HEAD(&n->hhead); xfrm_policy_inexact_list_reinsert(net, n, family); } @@ -1198,26 +1262,31 @@ xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) return ERR_PTR(-EEXIST); } - chain = &net->xfrm.policy_inexact[dir]; - xfrm_policy_insert_inexact_list(chain, policy); - if (delpol) __xfrm_policy_inexact_prune_bin(bin, false); return delpol; } +static bool xfrm_policy_is_dead_or_sk(const struct xfrm_policy *policy) +{ + int dir; + + if (policy->walk.dead) + return true; + + dir = xfrm_policy_id2dir(policy->index); + return dir >= XFRM_POLICY_MAX; +} + static void xfrm_hash_rebuild(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hthresh.work); - unsigned int hmask; struct xfrm_policy *pol; struct xfrm_policy *policy; struct hlist_head *chain; - struct hlist_head *odst; struct hlist_node *newpos; - int i; int dir; unsigned seq; u8 lbits4, rbits4, lbits6, rbits6; @@ -1235,6 +1304,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); + write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); /* make sure that we can insert the indirect policies again before * we start with destructive action. 
@@ -1243,10 +1313,10 @@ static void xfrm_hash_rebuild(struct work_struct *work) struct xfrm_pol_inexact_bin *bin; u8 dbits, sbits; - dir = xfrm_policy_id2dir(policy->index); - if (policy->walk.dead || dir >= XFRM_POLICY_MAX) + if (xfrm_policy_is_dead_or_sk(policy)) continue; + dir = xfrm_policy_id2dir(policy->index); if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { if (policy->family == AF_INET) { dbits = rbits4; @@ -1277,15 +1347,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) goto out_unlock; } - /* reset the bydst and inexact table in all directions */ - xfrm_hash_reset_inexact_table(net); - for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { - INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); - hmask = net->xfrm.policy_bydst[dir].hmask; - odst = net->xfrm.policy_bydst[dir].table; - for (i = hmask; i >= 0; i--) - INIT_HLIST_HEAD(odst + i); if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { /* dir out => dst = remote, src = local */ net->xfrm.policy_bydst[dir].dbits4 = rbits4; @@ -1303,16 +1365,16 @@ static void xfrm_hash_rebuild(struct work_struct *work) /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { - if (policy->walk.dead) + if (xfrm_policy_is_dead_or_sk(policy)) continue; - dir = xfrm_policy_id2dir(policy->index); - if (dir >= XFRM_POLICY_MAX) { - /* skip socket policies */ - continue; - } + + hlist_del_rcu(&policy->bydst); + newpos = NULL; + dir = xfrm_policy_id2dir(policy->index); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); + if (!chain) { void *p = xfrm_policy_inexact_insert(policy, dir, 0); @@ -1326,7 +1388,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) else break; } - if (newpos) + if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, chain); @@ -1334,6 +1396,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) out_unlock: __xfrm_policy_inexact_flush(net); + write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); @@ -1349,8 +1412,6 @@ EXPORT_SYMBOL(xfrm_policy_hash_rebuild); * of an absolute inpredictability of ordering of rules. This will not pass. 
*/ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) { - static u32 idx_generator; - for (;;) { struct hlist_head *list; struct xfrm_policy *p; @@ -1358,8 +1419,8 @@ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) int found; if (!index) { - idx = (idx_generator | dir); - idx_generator += 8; + idx = (net->xfrm.idx_generator | dir); + net->xfrm.idx_generator += 8; } else { idx = index; index = 0; @@ -1408,7 +1469,7 @@ static void xfrm_policy_requeue(struct xfrm_policy *old, spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice_init(&pq->hold_queue, &list); - if (del_timer(&pq->hold_timer)) + if (timer_delete(&pq->hold_timer)) xfrm_pol_put(old); spin_unlock_bh(&pq->hold_queue.lock); @@ -1422,19 +1483,10 @@ static void xfrm_policy_requeue(struct xfrm_policy *old, spin_unlock_bh(&pq->hold_queue.lock); } -static bool xfrm_policy_mark_match(struct xfrm_policy *policy, - struct xfrm_policy *pol) +static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark, + struct xfrm_policy *pol) { - u32 mark = policy->mark.v & policy->mark.m; - - if (policy->mark.v == pol->mark.v && policy->mark.m == pol->mark.m) - return true; - - if ((mark & pol->mark.m) == pol->mark.v && - policy->priority == pol->priority) - return true; - - return false; + return mark->v == pol->mark.v && mark->m == pol->mark.m; } static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed) @@ -1486,42 +1538,6 @@ static const struct rhashtable_params xfrm_pol_inexact_params = { .automatic_shrinking = true, }; -static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, - struct xfrm_policy *policy) -{ - struct xfrm_policy *pol, *delpol = NULL; - struct hlist_node *newpos = NULL; - int i = 0; - - hlist_for_each_entry(pol, chain, bydst_inexact_list) { - if (pol->type == policy->type && - pol->if_id == policy->if_id && - !selector_cmp(&pol->selector, &policy->selector) && - xfrm_policy_mark_match(policy, pol) && - xfrm_sec_ctx_match(pol->security, policy->security) && - !WARN_ON(delpol)) { - delpol = pol; - if (policy->priority > pol->priority) - continue; - } else if (policy->priority >= pol->priority) { - newpos = &pol->bydst_inexact_list; - continue; - } - if (delpol) - break; - } - - if (newpos) - hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos); - else - hlist_add_head_rcu(&policy->bydst_inexact_list, chain); - - hlist_for_each_entry(pol, chain, bydst_inexact_list) { - pol->pos = i; - i++; - } -} - static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl) @@ -1532,7 +1548,7 @@ static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, if (pol->type == policy->type && pol->if_id == policy->if_id && !selector_cmp(&pol->selector, &policy->selector) && - xfrm_policy_mark_match(policy, pol) && + xfrm_policy_mark_match(&policy->mark, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { if (excl) @@ -1548,9 +1564,12 @@ static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, break; } - if (newpos) + if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, &newpos->bydst); else + /* Packet offload policies enter to the head + * to speed-up lookups. 
+ */ hlist_add_head_rcu(&policy->bydst, chain); return delpol; @@ -1562,6 +1581,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) struct xfrm_policy *delpol; struct hlist_head *chain; + /* Sanitize mark before store */ + policy->mark.v &= policy->mark.m; + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); if (chain) @@ -1604,9 +1626,8 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) EXPORT_SYMBOL(xfrm_policy_insert); static struct xfrm_policy * -__xfrm_policy_bysel_ctx(struct hlist_head *chain, u32 mark, u32 if_id, - u8 type, int dir, - struct xfrm_selector *sel, +__xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark, + u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx) { struct xfrm_policy *pol; @@ -1617,7 +1638,7 @@ __xfrm_policy_bysel_ctx(struct hlist_head *chain, u32 mark, u32 if_id, hlist_for_each_entry(pol, chain, bydst) { if (pol->type == type && pol->if_id == if_id && - (mark & pol->mark.m) == pol->mark.v && + xfrm_policy_mark_match(mark, pol) && !selector_cmp(sel, &pol->selector) && xfrm_sec_ctx_match(ctx, pol->security)) return pol; @@ -1626,11 +1647,10 @@ __xfrm_policy_bysel_ctx(struct hlist_head *chain, u32 mark, u32 if_id, return NULL; } -struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, - u8 type, int dir, - struct xfrm_selector *sel, - struct xfrm_sec_ctx *ctx, int delete, - int *err) +struct xfrm_policy * +xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id, + u8 type, int dir, struct xfrm_selector *sel, + struct xfrm_sec_ctx *ctx, int delete, int *err) { struct xfrm_pol_inexact_bin *bin = NULL; struct xfrm_policy *pol, *ret = NULL; @@ -1697,9 +1717,9 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); -struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id, - u8 type, int dir, u32 id, int delete, - int *err) +struct xfrm_policy * +xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id, + u8 type, int dir, u32 id, int delete, int *err) { struct xfrm_policy *pol, *ret; struct hlist_head *chain; @@ -1714,8 +1734,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id, ret = NULL; hlist_for_each_entry(pol, chain, byidx) { if (pol->type == type && pol->index == id && - pol->if_id == if_id && - (mark & pol->mark.m) == pol->mark.v) { + pol->if_id == if_id && xfrm_policy_mark_match(mark, pol)) { xfrm_pol_hold(pol); if (delete) { *err = security_xfrm_policy_delete( @@ -1759,12 +1778,41 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) } return err; } + +static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, + struct net_device *dev, + bool task_valid) +{ + struct xfrm_policy *pol; + int err = 0; + + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + if (pol->walk.dead || + xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || + pol->xdo.dev != dev) + continue; + + err = security_xfrm_policy_delete(pol->security); + if (err) { + xfrm_audit_policy_delete(pol, 0, task_valid); + return err; + } + } + return err; +} #else static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { return 0; } + +static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, + struct net_device *dev, + bool task_valid) +{ + return 0; +} #endif int 
xfrm_policy_flush(struct net *net, u8 type, bool task_valid) @@ -1780,9 +1828,11 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) again: list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + if (pol->walk.dead) + continue; + dir = xfrm_policy_id2dir(pol->index); - if (pol->walk.dead || - dir >= XFRM_POLICY_MAX || + if (dir >= XFRM_POLICY_MAX || pol->type != type) continue; @@ -1804,6 +1854,46 @@ out: } EXPORT_SYMBOL(xfrm_policy_flush); +int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, + bool task_valid) +{ + int dir, err = 0, cnt = 0; + struct xfrm_policy *pol; + + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + + err = xfrm_dev_policy_flush_secctx_check(net, dev, task_valid); + if (err) + goto out; + +again: + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + if (pol->walk.dead) + continue; + + dir = xfrm_policy_id2dir(pol->index); + if (dir >= XFRM_POLICY_MAX || + pol->xdo.dev != dev) + continue; + + __xfrm_policy_unlink(pol, dir); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + cnt++; + xfrm_audit_policy_delete(pol, 1, task_valid); + xfrm_policy_kill(pol); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + goto again; + } + if (cnt) + __xfrm_policy_inexact_flush(net); + else + err = -ESRCH; +out: + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return err; +} +EXPORT_SYMBOL(xfrm_dev_policy_flush); + int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *data) @@ -1879,7 +1969,7 @@ EXPORT_SYMBOL(xfrm_policy_walk_done); */ static int xfrm_policy_match(const struct xfrm_policy *pol, const struct flowi *fl, - u8 type, u16 family, int dir, u32 if_id) + u8 type, u16 family, u32 if_id) { const struct xfrm_selector *sel = &pol->selector; int ret = -ESRCH; @@ -1893,14 +1983,13 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, match = xfrm_selector_match(sel, fl, family); if (match) - ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, - dir); + ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid); return ret; } static struct xfrm_pol_inexact_node * xfrm_policy_lookup_inexact_addr(const struct rb_root *r, - seqcount_t *count, + seqcount_spinlock_t *count, const xfrm_address_t *addr, u16 family) { const struct rb_node *parent; @@ -2005,7 +2094,7 @@ static struct xfrm_policy * __xfrm_policy_eval_candidates(struct hlist_head *chain, struct xfrm_policy *prefer, const struct flowi *fl, - u8 type, u16 family, int dir, u32 if_id) + u8 type, u16 family, u32 if_id) { u32 priority = prefer ? 
prefer->priority : ~0u; struct xfrm_policy *pol; @@ -2019,7 +2108,7 @@ __xfrm_policy_eval_candidates(struct hlist_head *chain, if (pol->priority > priority) break; - err = xfrm_policy_match(pol, fl, type, family, dir, if_id); + err = xfrm_policy_match(pol, fl, type, family, if_id); if (err) { if (err != -ESRCH) return ERR_PTR(err); @@ -2044,7 +2133,7 @@ static struct xfrm_policy * xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_policy *prefer, const struct flowi *fl, - u8 type, u16 family, int dir, u32 if_id) + u8 type, u16 family, u32 if_id) { struct xfrm_policy *tmp; int i; @@ -2052,8 +2141,7 @@ xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand, for (i = 0; i < ARRAY_SIZE(cand->res); i++) { tmp = __xfrm_policy_eval_candidates(cand->res[i], prefer, - fl, type, family, dir, - if_id); + fl, type, family, if_id); if (!tmp) continue; @@ -2086,13 +2174,13 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, rcu_read_lock(); retry: do { - sequence = read_seqcount_begin(&xfrm_policy_hash_generation); + sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); chain = policy_hash_direct(net, daddr, saddr, family, dir); - } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)); + } while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)); ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { - err = xfrm_policy_match(pol, fl, type, family, dir, if_id); + err = xfrm_policy_match(pol, fl, type, family, if_id); if (err) { if (err == -ESRCH) continue; @@ -2105,13 +2193,16 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, break; } } + if (ret && ret->xdo.type == XFRM_DEV_OFFLOAD_PACKET) + goto skip_inexact; + bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr, daddr)) goto skip_inexact; pol = xfrm_policy_eval_candidates(&cand, ret, fl, type, - family, dir, if_id); + family, if_id); if (pol) { ret = pol; if (IS_ERR(pol)) @@ -2119,7 +2210,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } skip_inexact: - if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) + if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)) goto retry; if (ret && !xfrm_pol_hold_rcu(ret)) @@ -2166,14 +2257,13 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, match = xfrm_selector_match(&pol->selector, fl, family); if (match) { - if ((sk->sk_mark & pol->mark.m) != pol->mark.v || + if ((READ_ONCE(sk->sk_mark) & pol->mark.m) != pol->mark.v || pol->if_id != if_id) { pol = NULL; goto out; } err = security_xfrm_policy_lookup(pol->security, - fl->flowi_secid, - dir); + fl->flowi_secid); if (!err) { if (!xfrm_pol_hold_rcu(pol)) goto again; @@ -2190,10 +2280,52 @@ out: return pol; } +static u32 xfrm_gen_pos_slow(struct net *net) +{ + struct xfrm_policy *policy; + u32 i = 0; + + /* oldest entry is last in list */ + list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + if (!xfrm_policy_is_dead_or_sk(policy)) + policy->pos = ++i; + } + + return i; +} + +static u32 xfrm_gen_pos(struct net *net) +{ + const struct xfrm_policy *policy; + u32 i = 0; + + /* most recently added policy is at the head of the list */ + list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { + if (xfrm_policy_is_dead_or_sk(policy)) + continue; + + if (policy->pos == UINT_MAX) + return 
xfrm_gen_pos_slow(net); + + i = policy->pos + 1; + break; + } + + return i; +} + static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); + switch (dir) { + case XFRM_POLICY_IN: + case XFRM_POLICY_FWD: + case XFRM_POLICY_OUT: + pol->pos = xfrm_gen_pos(net); + break; + } + list_add(&pol->walk.all, &net->xfrm.policy_all); net->xfrm.policy_count[dir]++; xfrm_pol_hold(pol); @@ -2210,7 +2342,6 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, /* Socket policies are not hashed. */ if (!hlist_unhashed(&pol->bydst)) { hlist_del_rcu(&pol->bydst); - hlist_del_init(&pol->bydst_inexact_list); hlist_del(&pol->byidx); } @@ -2336,15 +2467,15 @@ int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) } static int -xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local, - xfrm_address_t *remote, unsigned short family, u32 mark) +xfrm_get_saddr(unsigned short family, xfrm_address_t *saddr, + const struct xfrm_dst_lookup_params *params) { int err; const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; - err = afinfo->get_saddr(net, oif, local, remote, mark); + err = afinfo->get_saddr(saddr, params); rcu_read_unlock(); return err; } @@ -2369,13 +2500,19 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; if (tmpl->mode == XFRM_MODE_TUNNEL || + tmpl->mode == XFRM_MODE_IPTFS || tmpl->mode == XFRM_MODE_BEET) { remote = &tmpl->id.daddr; local = &tmpl->saddr; if (xfrm_addr_any(local, tmpl->encap_family)) { - error = xfrm_get_saddr(net, fl->flowi_oif, - &tmp, remote, - tmpl->encap_family, 0); + struct xfrm_dst_lookup_params params; + + memset(¶ms, 0, sizeof(params)); + params.net = net; + params.oif = fl->flowi_oif; + params.daddr = remote; + error = xfrm_get_saddr(tmpl->encap_family, &tmp, + ¶ms); if (error) goto fail; local = &tmp; @@ -2384,6 +2521,12 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family, policy->if_id); + if (x && x->dir && x->dir != XFRM_SA_DIR_OUT) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEDIRERROR); + xfrm_state_put(x); + error = -EINVAL; + goto fail; + } if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; @@ -2448,20 +2591,12 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, } -static int xfrm_get_tos(const struct flowi *fl, int family) +static dscp_t xfrm_get_dscp(const struct flowi *fl, int family) { - const struct xfrm_policy_afinfo *afinfo; - int tos; - - afinfo = xfrm_policy_get_afinfo(family); - if (!afinfo) - return 0; + if (family == AF_INET) + return fl->u.ip4.flowi4_dscp; - tos = afinfo->get_tos(fl); - - rcu_read_unlock(); - - return tos; + return 0; } static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) @@ -2485,12 +2620,10 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) default: BUG(); } - xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0); + xdst = dst_alloc(dst_ops, NULL, DST_OBSOLETE_NONE, 0); if (likely(xdst)) { - struct dst_entry *dst = &xdst->u.dst; - - memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst)); + memset_after(xdst, 0, u.dst); } else xdst = ERR_PTR(-ENOBUFS); @@ -2499,21 +2632,13 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) return xdst; } -static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, - int 
nfheader_len) +static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, + int nfheader_len) { - const struct xfrm_policy_afinfo *afinfo = - xfrm_policy_get_afinfo(dst->ops->family); - int err; - - if (!afinfo) - return -EINVAL; - - err = afinfo->init_path(path, dst, nfheader_len); - - rcu_read_unlock(); - - return err; + if (dst->ops->family == AF_INET6) { + path->path_cookie = rt6_get_cookie(dst_rt6_info(dst)); + path->u.rt6.rt6i_nfheader_len = nfheader_len; + } } static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, @@ -2545,10 +2670,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, const struct flowi *fl, struct dst_entry *dst) { + const struct xfrm_state_afinfo *afinfo; + const struct xfrm_mode *inner_mode; struct net *net = xp_net(policy); unsigned long now = jiffies; struct net_device *dev; - struct xfrm_mode *inner_mode; struct xfrm_dst *xdst_prev = NULL; struct xfrm_dst *xdst0 = NULL; int i = 0; @@ -2556,13 +2682,13 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, int header_len = 0; int nfheader_len = 0; int trailer_len = 0; - int tos; int family = policy->selector.family; xfrm_address_t saddr, daddr; + dscp_t dscp; xfrm_flowi_addr_get(fl, &saddr, &daddr, family); - tos = xfrm_get_tos(fl, family); + dscp = xfrm_get_dscp(fl, family); dst_hold(dst); @@ -2594,17 +2720,24 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, goto put_states; } } else - inner_mode = xfrm[i]->inner_mode; + inner_mode = &xfrm[i]->inner_mode; xdst->route = dst; dst_copy_metrics(dst1, dst); if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { - __u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); + __u32 mark = 0; + int oif; + + if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m) + mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); + + if (xfrm[i]->xso.type != XFRM_DEV_OFFLOAD_PACKET) + family = xfrm[i]->props.family; - family = xfrm[i]->props.family; - dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, - &saddr, &daddr, family, mark); + oif = fl->flowi_oif ? 
: fl->flowi_l3mdev; + dst = xfrm_dst_lookup(xfrm[i], dscp, oif, &saddr, + &daddr, family, mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; @@ -2615,11 +2748,21 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, xdst->xfrm_genid = xfrm[i]->genid; dst1->obsolete = DST_OBSOLETE_FORCE_CHK; - dst1->flags |= DST_HOST; dst1->lastuse = now; dst1->input = dst_discard; - dst1->output = inner_mode->afinfo->output; + + if (xfrm[i]->mode_cbs && xfrm[i]->mode_cbs->output) { + dst1->output = xfrm[i]->mode_cbs->output; + } else { + rcu_read_lock(); + afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); + if (likely(afinfo)) + dst1->output = afinfo->output; + else + dst1->output = dst_discard_out; + rcu_read_unlock(); + } xdst_prev = xdst; @@ -2675,13 +2818,15 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, *num_xfrms = 0; return 0; } - if (IS_ERR(pols[0])) + if (IS_ERR(pols[0])) { + *num_pols = 0; return PTR_ERR(pols[0]); + } *num_xfrms = pols[0]->xfrm_nr; #ifdef CONFIG_XFRM_SUB_POLICY - if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW && + if (pols[0]->action == XFRM_POLICY_ALLOW && pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]), XFRM_POLICY_TYPE_MAIN, @@ -2691,6 +2836,7 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, if (pols[1]) { if (IS_ERR(pols[1])) { xfrm_pols_put(pols, *num_pols); + *num_pols = 0; return PTR_ERR(pols[1]); } (*num_pols)++; @@ -2752,11 +2898,12 @@ static void xfrm_policy_queue_process(struct timer_list *t) struct sk_buff *skb; struct sock *sk; struct dst_entry *dst; - struct xfrm_policy *pol = from_timer(pol, t, polq.hold_timer); + struct xfrm_policy *pol = timer_container_of(pol, t, polq.hold_timer); struct net *net = xp_net(pol); struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; struct sk_buff_head list; + __u32 skb_mark; spin_lock(&pq->hold_queue.lock); skb = skb_peek(&pq->hold_queue); @@ -2766,7 +2913,12 @@ static void xfrm_policy_queue_process(struct timer_list *t) } dst = skb_dst(skb); sk = skb->sk; - xfrm_decode_session(skb, &fl, dst->ops->family); + + /* Fixup the mark to support VTI. */ + skb_mark = skb->mark; + skb->mark = pol->mark.v; + xfrm_decode_session(net, skb, &fl, dst->ops->family); + skb->mark = skb_mark; spin_unlock(&pq->hold_queue.lock); dst_hold(xfrm_dst_path(dst)); @@ -2798,7 +2950,12 @@ static void xfrm_policy_queue_process(struct timer_list *t) while (!skb_queue_empty(&list)) { skb = __skb_dequeue(&list); - xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); + /* Fixup the mark to support VTI. 
*/ + skb_mark = skb->mark; + skb->mark = pol->mark.v; + xfrm_decode_session(net, skb, &fl, skb_dst(skb)->ops->family); + skb->mark = skb_mark; + dst_hold(xfrm_dst_path(skb_dst(skb))); dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0); if (IS_ERR(dst)) { @@ -2806,11 +2963,11 @@ static void xfrm_policy_queue_process(struct timer_list *t) continue; } - nf_reset(skb); + nf_reset_ct(skb); skb_dst_drop(skb); skb_dst_set(skb, dst); - dst_output(net, skb->sk, skb); + dst_output(net, skb_to_full_sk(skb), skb); } out: @@ -2850,7 +3007,7 @@ static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *s sched_next = jiffies + pq->timeout; - if (del_timer(&pq->hold_timer)) { + if (timer_delete(&pq->hold_timer)) { if (time_before(pq->hold_timer.expires, sched_next)) sched_next = pq->hold_timer.expires; xfrm_pol_put(pol); @@ -2894,7 +3051,7 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, dst_copy_metrics(dst1, dst); dst1->obsolete = DST_OBSOLETE_FORCE_CHK; - dst1->flags |= DST_HOST | DST_XFRM_QUEUE; + dst1->flags |= DST_XFRM_QUEUE; dst1->lastuse = jiffies; dst1->input = dst_discard; @@ -3073,8 +3230,8 @@ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, xflo.flags = flags; /* To accelerate a bit... */ - if ((dst_orig->flags & DST_NOXFRM) || - !net->xfrm.policy_count[XFRM_POLICY_OUT]) + if (!if_id && ((dst_orig->flags & DST_NOXFRM) || + !net->xfrm.policy_count[XFRM_POLICY_OUT])) goto nopol; xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id); @@ -3122,7 +3279,7 @@ no_transform: } for (i = 0; i < num_pols; i++) - pols[i]->curlft.use_time = ktime_get_real_seconds(); + WRITE_ONCE(pols[i]->curlft.use_time, ktime_get_real_seconds()); if (num_xfrms < 0) { /* Prohibit the flow */ @@ -3137,14 +3294,21 @@ no_transform: dst_release(dst); dst = dst_orig; } + ok: xfrm_pols_put(pols, drop_pols); - if (dst && dst->xfrm && - dst->xfrm->props.mode == XFRM_MODE_TUNNEL) + if (dst->xfrm && + (dst->xfrm->props.mode == XFRM_MODE_TUNNEL || + dst->xfrm->props.mode == XFRM_MODE_IPTFS)) dst->flags |= DST_XFRM_TUNNEL; return dst; nopol: + if ((!dst_orig->dev || !(dst_orig->dev->flags & IFF_LOOPBACK)) && + net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) { + err = -EPERM; + goto error; + } if (!(flags & XFRM_LOOKUP_ICMP)) { dst = dst_orig; goto ok; @@ -3184,7 +3348,7 @@ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, flags | XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_KEEP_DST_REF); - if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE) + if (PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); if (IS_ERR(dst)) @@ -3216,7 +3380,7 @@ xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) static inline int xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, - unsigned short family) + unsigned short family, u32 if_id) { if (xfrm_state_kern(x)) return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family); @@ -3227,19 +3391,20 @@ xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, (tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) || !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) && !(x->props.mode != XFRM_MODE_TRANSPORT && - xfrm_state_addr_cmp(tmpl, x, family)); + xfrm_state_addr_cmp(tmpl, x, family)) && + (if_id == 0 || if_id == x->if_id); } /* * 0 or more than 0 is returned when validation is succeeded (either bypass - * because of optional transport mode, or next index of the mathced secpath + * because of optional 
transport mode, or next index of the matched secpath * state with the template. * -1 is returned when no matching template is found. * Otherwise "-2 - errored_index" is returned. */ static inline int xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start, - unsigned short family) + unsigned short family, u32 if_id) { int idx = start; @@ -3249,9 +3414,16 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star } else start = -1; for (; idx < sp->len; idx++) { - if (xfrm_state_ok(tmpl, sp->xvec[idx], family)) + if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id)) return ++idx; if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) { + if (idx < sp->verified_cnt) { + /* Secpath entry previously verified, consider optional and + * continue searching + */ + continue; + } + if (start == -1) start = -2-idx; break; @@ -3260,20 +3432,108 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star return start; } -int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, +static void +decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse) +{ + struct flowi4 *fl4 = &fl->u.ip4; + + memset(fl4, 0, sizeof(struct flowi4)); + + if (reverse) { + fl4->saddr = flkeys->addrs.ipv4.dst; + fl4->daddr = flkeys->addrs.ipv4.src; + fl4->fl4_sport = flkeys->ports.dst; + fl4->fl4_dport = flkeys->ports.src; + } else { + fl4->saddr = flkeys->addrs.ipv4.src; + fl4->daddr = flkeys->addrs.ipv4.dst; + fl4->fl4_sport = flkeys->ports.src; + fl4->fl4_dport = flkeys->ports.dst; + } + + switch (flkeys->basic.ip_proto) { + case IPPROTO_GRE: + fl4->fl4_gre_key = flkeys->gre.keyid; + break; + case IPPROTO_ICMP: + fl4->fl4_icmp_type = flkeys->icmp.type; + fl4->fl4_icmp_code = flkeys->icmp.code; + break; + } + + fl4->flowi4_proto = flkeys->basic.ip_proto; + fl4->flowi4_dscp = inet_dsfield_to_dscp(flkeys->ip.tos); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void +decode_session6(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse) +{ + struct flowi6 *fl6 = &fl->u.ip6; + + memset(fl6, 0, sizeof(struct flowi6)); + + if (reverse) { + fl6->saddr = flkeys->addrs.ipv6.dst; + fl6->daddr = flkeys->addrs.ipv6.src; + fl6->fl6_sport = flkeys->ports.dst; + fl6->fl6_dport = flkeys->ports.src; + } else { + fl6->saddr = flkeys->addrs.ipv6.src; + fl6->daddr = flkeys->addrs.ipv6.dst; + fl6->fl6_sport = flkeys->ports.src; + fl6->fl6_dport = flkeys->ports.dst; + } + + switch (flkeys->basic.ip_proto) { + case IPPROTO_GRE: + fl6->fl6_gre_key = flkeys->gre.keyid; + break; + case IPPROTO_ICMPV6: + fl6->fl6_icmp_type = flkeys->icmp.type; + fl6->fl6_icmp_code = flkeys->icmp.code; + break; + } + + fl6->flowi6_proto = flkeys->basic.ip_proto; +} +#endif + +int __xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse) { - const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - int err; + struct xfrm_flow_keys flkeys; - if (unlikely(afinfo == NULL)) + memset(&flkeys, 0, sizeof(flkeys)); + __skb_flow_dissect(net, skb, &xfrm_session_dissector, &flkeys, + NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_ENCAP); + + switch (family) { + case AF_INET: + decode_session4(&flkeys, fl, reverse); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + decode_session6(&flkeys, fl, reverse); + break; +#endif + default: return -EAFNOSUPPORT; + } - afinfo->decode_session(skb, fl, reverse); + fl->flowi_mark = skb->mark; + if (reverse) { + fl->flowi_oif = skb->skb_iif; + } else { + int 
oif = 0; - err = security_xfrm_decode_session(skb, &fl->flowi_secid); - rcu_read_unlock(); - return err; + if (skb_dst(skb) && skb_dst(skb)->dev) + oif = skb_dst(skb)->dev->ifindex; + + fl->flowi_oif = oif; + } + + return security_xfrm_decode_session(skb, &fl->flowi_secid); } EXPORT_SYMBOL(__xfrm_decode_session); @@ -3289,6 +3549,130 @@ static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int return 0; } +static bool icmp_err_packet(const struct flowi *fl, unsigned short family) +{ + const struct flowi4 *fl4 = &fl->u.ip4; + + if (family == AF_INET && + fl4->flowi4_proto == IPPROTO_ICMP && + (fl4->fl4_icmp_type == ICMP_DEST_UNREACH || + fl4->fl4_icmp_type == ICMP_TIME_EXCEEDED)) + return true; + +#if IS_ENABLED(CONFIG_IPV6) + if (family == AF_INET6) { + const struct flowi6 *fl6 = &fl->u.ip6; + + if (fl6->flowi6_proto == IPPROTO_ICMPV6 && + (fl6->fl6_icmp_type == ICMPV6_DEST_UNREACH || + fl6->fl6_icmp_type == ICMPV6_PKT_TOOBIG || + fl6->fl6_icmp_type == ICMPV6_TIME_EXCEED)) + return true; + } +#endif + return false; +} + +static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family, + const struct flowi *fl, struct flowi *fl1) +{ + bool ret = true; + struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); + int hl = family == AF_INET ? (sizeof(struct iphdr) + sizeof(struct icmphdr)) : + (sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr)); + + if (!newskb) + return true; + + if (!pskb_pull(newskb, hl)) + goto out; + + skb_reset_network_header(newskb); + + if (xfrm_decode_session_reverse(dev_net(skb->dev), newskb, fl1, family) < 0) + goto out; + + fl1->flowi_oif = fl->flowi_oif; + fl1->flowi_mark = fl->flowi_mark; + fl1->flowi_dscp = fl->flowi_dscp; + nf_nat_decode_session(newskb, fl1, family); + ret = false; + +out: + consume_skb(newskb); + return ret; +} + +static bool xfrm_selector_inner_icmp_match(struct sk_buff *skb, unsigned short family, + const struct xfrm_selector *sel, + const struct flowi *fl) +{ + bool ret = false; + + if (icmp_err_packet(fl, family)) { + struct flowi fl1; + + if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) + return ret; + + ret = xfrm_selector_match(sel, &fl1, family); + } + + return ret; +} + +static inline struct +xfrm_policy *xfrm_in_fwd_icmp(struct sk_buff *skb, + const struct flowi *fl, unsigned short family, + u32 if_id) +{ + struct xfrm_policy *pol = NULL; + + if (icmp_err_packet(fl, family)) { + struct flowi fl1; + struct net *net = dev_net(skb->dev); + + if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) + return pol; + + pol = xfrm_policy_lookup(net, &fl1, family, XFRM_POLICY_FWD, if_id); + if (IS_ERR(pol)) + pol = NULL; + } + + return pol; +} + +static inline struct +dst_entry *xfrm_out_fwd_icmp(struct sk_buff *skb, struct flowi *fl, + unsigned short family, struct dst_entry *dst) +{ + if (icmp_err_packet(fl, family)) { + struct net *net = dev_net(skb->dev); + struct dst_entry *dst2; + struct flowi fl1; + + if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) + return dst; + + dst_hold(dst); + + dst2 = xfrm_lookup(net, dst, &fl1, NULL, (XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_ICMP)); + + if (IS_ERR(dst2)) + return dst; + + if (dst2->xfrm) { + dst_release(dst); + dst = dst2; + } else { + dst_release(dst2); + } + } + + return dst; +} + int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { @@ -3303,23 +3687,25 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, int xerr_idx = -1; const struct xfrm_if_cb *ifcb; struct sec_path *sp; - struct xfrm_if *xi; u32 if_id = 0; 
rcu_read_lock(); ifcb = xfrm_if_get_cb(); if (ifcb) { - xi = ifcb->decode_session(skb); - if (xi) - if_id = xi->p.if_id; + struct xfrm_if_decode_session_result r; + + if (ifcb->decode_session(skb, family, &r)) { + if_id = r.if_id; + net = r.net; + } } rcu_read_unlock(); reverse = dir & ~XFRM_POLICY_MASK; dir &= XFRM_POLICY_MASK; - if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) { + if (__xfrm_decode_session(net, skb, &fl, family, reverse) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); return 0; } @@ -3333,9 +3719,17 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, for (i = sp->len - 1; i >= 0; i--) { struct xfrm_state *x = sp->xvec[i]; + int ret = 0; + if (!xfrm_selector_match(&x->sel, &fl, family)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); - return 0; + ret = 1; + if (x->props.flags & XFRM_STATE_ICMP && + xfrm_selector_inner_icmp_match(skb, family, &x->sel, &fl)) + ret = 0; + if (ret) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); + return 0; + } } } } @@ -3358,8 +3752,19 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, return 0; } + if (!pol && dir == XFRM_POLICY_FWD) + pol = xfrm_in_fwd_icmp(skb, &fl, family, if_id); + if (!pol) { - if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) { + const bool is_crypto_offload = sp && + (xfrm_input_state(skb)->xso.type == XFRM_DEV_OFFLOAD_CRYPTO); + + if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); + return 0; + } + + if (sp && secpath_has_nontransport(sp, 0, &xerr_idx) && !is_crypto_offload) { xfrm_secpath_reject(xerr_idx, skb, &fl); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; @@ -3367,7 +3772,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, return 1; } - pol->curlft.use_time = ktime_get_real_seconds(); + /* This lockless write can happen from different cpus. */ + WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds()); pols[0] = pol; npols++; @@ -3379,9 +3785,12 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, if (pols[1]) { if (IS_ERR(pols[1])) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); + xfrm_pol_put(pols[0]); return 0; } - pols[1]->curlft.use_time = ktime_get_real_seconds(); + /* This write can happen from different cpus. */ + WRITE_ONCE(pols[1]->curlft.use_time, + ktime_get_real_seconds()); npols++; } } @@ -3413,8 +3822,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, tpp[ti++] = &pols[pi]->xfrm_vec[i]; } xfrm_nr = ti; + if (npols > 1) { - xfrm_tmpl_sort(stp, tpp, xfrm_nr, family, net); + xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); tpp = stp; } @@ -3423,9 +3833,12 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, * Order is _important_. Later we will implement * some barriers, but at the moment barriers * are implied between each two transformations. + * Upon success, marks secpath entries as having been + * verified to allow them to be skipped in future policy + * checks (e.g. nested tunnels). 
 	 */
 	for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
-		k = xfrm_policy_ok(tpp[i], sp, k, family);
+		k = xfrm_policy_ok(tpp[i], sp, k, family, if_id);
 		if (k < 0) {
 			if (k < -1)
 				/* "-2 - errored_index" returned */
@@ -3441,6 +3854,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		}
 		xfrm_pols_put(pols, npols);
+		sp->verified_cnt = k;
+
 		return 1;
 	}
 	XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
@@ -3460,22 +3875,32 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
 	struct dst_entry *dst;
 	int res = 1;
 
-	if (xfrm_decode_session(skb, &fl, family) < 0) {
+	if (xfrm_decode_session(net, skb, &fl, family) < 0) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
 		return 0;
 	}
 
 	skb_dst_force(skb);
-	if (!skb_dst(skb)) {
+	dst = skb_dst(skb);
+	if (!dst) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
 		return 0;
 	}
 
-	dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE);
+	/* ignore return value from skb_dstref_steal, xfrm_lookup takes
+	 * care of dropping the refcnt if needed.
+	 */
+	skb_dstref_steal(skb);
+
+	dst = xfrm_lookup(net, dst, &fl, NULL, XFRM_LOOKUP_QUEUE);
 	if (IS_ERR(dst)) {
 		res = 0;
 		dst = NULL;
 	}
+
+	if (dst && !dst->xfrm)
+		dst = xfrm_out_fwd_icmp(skb, &fl, family, dst);
+
 	skb_dst_set(skb, dst);
 	return res;
 }
@@ -3506,7 +3931,7 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
 	 * This will force stale_bundle() to fail on any xdst bundle with
 	 * this dst linked in it.
 	 */
-	if (dst->obsolete < 0 && !stale_bundle(dst))
+	if (READ_ONCE(dst->obsolete) < 0 && !stale_bundle(dst))
 		return dst;
 
 	return NULL;
@@ -3520,7 +3945,7 @@ static int stale_bundle(struct dst_entry *dst)
 void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
 {
 	while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) {
-		dst->dev = dev_net(dev)->loopback_dev;
+		dst->dev = blackhole_netdev;
 		dev_hold(dst->dev);
 		dev_put(dev);
 	}
@@ -3532,15 +3957,10 @@ static void xfrm_link_failure(struct sk_buff *skb)
 	/* Impossible. Such dst must be popped before reaches point of failure. */
 }
 
-static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
+static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst)
 {
-	if (dst) {
-		if (dst->obsolete) {
-			dst_release(dst);
-			dst = NULL;
-		}
-	}
-	return dst;
+	if (READ_ONCE(dst->obsolete))
+		sk_dst_reset(sk);
 }
 
 static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr)
@@ -3794,10 +4214,7 @@ static int __net_init xfrm_policy_init(struct net *net)
 	int dir, err;
 
 	if (net_eq(net, &init_net)) {
-		xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
-						   sizeof(struct xfrm_dst),
-						   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-						   NULL);
+		xfrm_dst_cache = KMEM_CACHE(xfrm_dst, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
 		err = rhashtable_init(&xfrm_policy_inexact_table,
 				      &xfrm_pol_inexact_params);
 		BUG_ON(err);
@@ -3816,7 +4233,6 @@ static int __net_init xfrm_policy_init(struct net *net)
 		net->xfrm.policy_count[dir] = 0;
 		net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0;
-		INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
 
 		htab = &net->xfrm.policy_bydst[dir];
 		htab->table = xfrm_hash_alloc(sz);
@@ -3870,8 +4286,6 @@ static void xfrm_policy_fini(struct net *net)
 	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
 		struct xfrm_policy_hash *htab;
 
-		WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir]));
-
 		htab = &net->xfrm.policy_bydst[dir];
 		sz = (htab->hmask + 1) * sizeof(struct hlist_head);
 		WARN_ON(!hlist_empty(htab->table));
@@ -3895,7 +4309,11 @@ static int __net_init xfrm_net_init(struct net *net)
 	/* Initialize the per-net locks here */
 	spin_lock_init(&net->xfrm.xfrm_state_lock);
 	spin_lock_init(&net->xfrm.xfrm_policy_lock);
+	seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock);
 	mutex_init(&net->xfrm.xfrm_cfg_mutex);
+	net->xfrm.policy_default[XFRM_POLICY_IN] = XFRM_USERPOLICY_ACCEPT;
+	net->xfrm.policy_default[XFRM_POLICY_FWD] = XFRM_USERPOLICY_ACCEPT;
+	net->xfrm.policy_default[XFRM_POLICY_OUT] = XFRM_USERPOLICY_ACCEPT;
 
 	rv = xfrm_statistics_init(net);
 	if (rv < 0)
@@ -3910,8 +4328,14 @@ static int __net_init xfrm_net_init(struct net *net)
 	if (rv < 0)
 		goto out_sysctl;
 
+	rv = xfrm_nat_keepalive_net_init(net);
+	if (rv < 0)
+		goto out_nat_keepalive;
+
 	return 0;
 
+out_nat_keepalive:
+	xfrm_sysctl_fini(net);
 out_sysctl:
 	xfrm_policy_fini(net);
 out_policy:
@@ -3924,6 +4348,7 @@ out_statistics:
 
 static void __net_exit xfrm_net_exit(struct net *net)
 {
+	xfrm_nat_keepalive_net_fini(net);
 	xfrm_sysctl_fini(net);
 	xfrm_policy_fini(net);
 	xfrm_state_fini(net);
@@ -3935,15 +4360,57 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
 	.exit = xfrm_net_exit,
 };
 
+static const struct flow_dissector_key xfrm_flow_dissector_keys[] = {
+	{
+		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
+		.offset = offsetof(struct xfrm_flow_keys, control),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_BASIC,
+		.offset = offsetof(struct xfrm_flow_keys, basic),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+		.offset = offsetof(struct xfrm_flow_keys, addrs.ipv4),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+		.offset = offsetof(struct xfrm_flow_keys, addrs.ipv6),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_PORTS,
+		.offset = offsetof(struct xfrm_flow_keys, ports),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
+		.offset = offsetof(struct xfrm_flow_keys, gre),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_IP,
+		.offset = offsetof(struct xfrm_flow_keys, ip),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_ICMP,
+		.offset = offsetof(struct xfrm_flow_keys, icmp),
+	},
+};
+
 void __init xfrm_init(void)
 {
+	skb_flow_dissector_init(&xfrm_session_dissector,
+				xfrm_flow_dissector_keys,
+				ARRAY_SIZE(xfrm_flow_dissector_keys));
+
 	register_pernet_subsys(&xfrm_net_ops);
 	xfrm_dev_init();
-	seqcount_init(&xfrm_policy_hash_generation);
 	xfrm_input_init();
 
-	RCU_INIT_POINTER(xfrm_if_cb, NULL);
-	synchronize_rcu();
+#ifdef CONFIG_XFRM_ESPINTCP
+	espintcp_init();
+#endif
+
+	register_xfrm_state_bpf();
+	xfrm_nat_keepalive_init(AF_INET);
 }
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -4012,61 +4479,50 @@ EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete);
 #endif
 
 #ifdef CONFIG_XFRM_MIGRATE
-static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp,
-					const struct xfrm_selector *sel_tgt)
-{
-	if (sel_cmp->proto == IPSEC_ULPROTO_ANY) {
-		if (sel_tgt->family == sel_cmp->family &&
-		    xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr,
-				    sel_cmp->family) &&
-		    xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr,
-				    sel_cmp->family) &&
-		    sel_tgt->prefixlen_d == sel_cmp->prefixlen_d &&
-		    sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) {
-			return true;
-		}
-	} else {
-		if (memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0) {
-			return true;
-		}
-	}
-	return false;
-}
-
 static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel,
-						    u8 dir, u8 type, struct net *net)
+						    u8 dir, u8 type, struct net *net, u32 if_id)
 {
-	struct xfrm_policy *pol, *ret = NULL;
-	struct hlist_head *chain;
-	u32 priority = ~0U;
+	struct xfrm_policy *pol;
+	struct flowi fl;
 
-	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
-	chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
-	hlist_for_each_entry(pol, chain, bydst) {
-		if (xfrm_migrate_selector_match(sel, &pol->selector) &&
-		    pol->type == type) {
-			ret = pol;
-			priority = ret->priority;
-			break;
-		}
-	}
-	chain = &net->xfrm.policy_inexact[dir];
-	hlist_for_each_entry(pol, chain, bydst_inexact_list) {
-		if ((pol->priority >= priority) && ret)
-			break;
+	memset(&fl, 0, sizeof(fl));
 
-		if (xfrm_migrate_selector_match(sel, &pol->selector) &&
-		    pol->type == type) {
-			ret = pol;
+	fl.flowi_proto = sel->proto;
+
+	switch (sel->family) {
+	case AF_INET:
+		fl.u.ip4.saddr = sel->saddr.a4;
+		fl.u.ip4.daddr = sel->daddr.a4;
+		if (sel->proto == IPSEC_ULPROTO_ANY)
 			break;
-		}
+		fl.u.flowi4_oif = sel->ifindex;
+		fl.u.ip4.fl4_sport = sel->sport;
+		fl.u.ip4.fl4_dport = sel->dport;
+		break;
+	case AF_INET6:
+		fl.u.ip6.saddr = sel->saddr.in6;
+		fl.u.ip6.daddr = sel->daddr.in6;
+		if (sel->proto == IPSEC_ULPROTO_ANY)
+			break;
+		fl.u.flowi6_oif = sel->ifindex;
+		fl.u.ip6.fl4_sport = sel->sport;
+		fl.u.ip6.fl4_dport = sel->dport;
+		break;
+	default:
+		return ERR_PTR(-EAFNOSUPPORT);
 	}
-	xfrm_pol_hold(ret);
+	rcu_read_lock();
-	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	pol = xfrm_policy_lookup_bytype(net, type, &fl, sel->family, dir, if_id);
+	if (IS_ERR_OR_NULL(pol))
+		goto out_unlock;
 
-	return ret;
+	if (!xfrm_pol_hold_rcu(pol))
+		pol = NULL;
+out_unlock:
+	rcu_read_unlock();
+	return pol;
 }
 
 static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t)
@@ -4078,6 +4534,7 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm
 	switch (t->mode) {
 	case XFRM_MODE_TUNNEL:
 	case XFRM_MODE_BEET:
+	case XFRM_MODE_IPTFS:
 		if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr,
 				    m->old_family) &&
 		    xfrm_addr_equal(&t->saddr, &m->old_saddr,
@@ -4100,7 +4557,8 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm
 
 /* update endpoint address(es) of template(s) */
 static int xfrm_policy_migrate(struct xfrm_policy *pol,
-			       struct xfrm_migrate *m, int num_migrate)
+			       struct xfrm_migrate *m, int num_migrate,
+			       struct netlink_ext_ack *extack)
 {
 	struct xfrm_migrate *mp;
 	int i, j, n = 0;
@@ -4108,6 +4566,7 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 	write_lock_bh(&pol->lock);
 	if (unlikely(pol->walk.dead)) {
 		/* target policy has been deleted */
+		NL_SET_ERR_MSG(extack, "Target policy not found");
 		write_unlock_bh(&pol->lock);
 		return -ENOENT;
 	}
@@ -4118,7 +4577,8 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 			continue;
 		n++;
 		if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL &&
-		    pol->xfrm_vec[i].mode != XFRM_MODE_BEET)
+		    pol->xfrm_vec[i].mode != XFRM_MODE_BEET &&
+		    pol->xfrm_vec[i].mode != XFRM_MODE_IPTFS)
 			continue;
 		/* update endpoints */
 		memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr,
@@ -4139,17 +4599,22 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 	return 0;
 }
 
-static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate)
+static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate,
+			      struct netlink_ext_ack *extack)
 {
 	int i, j;
 
-	if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH)
+	if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) {
+		NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)");
 		return -EINVAL;
+	}
 
 	for (i = 0; i < num_migrate; i++) {
 		if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) ||
-		    xfrm_addr_any(&m[i].new_saddr, m[i].new_family))
+		    xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) {
+			NL_SET_ERR_MSG(extack, "Addresses in the MIGRATE attribute's list cannot be null");
 			return -EINVAL;
+		}
 
 		/* check if there is any duplicated entry */
 		for (j = i + 1; j < num_migrate; j++) {
@@ -4160,8 +4625,10 @@ static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate)
 			    m[i].proto == m[j].proto &&
 			    m[i].mode == m[j].mode &&
 			    m[i].reqid == m[j].reqid &&
-			    m[i].old_family == m[j].old_family)
+			    m[i].old_family == m[j].old_family) {
+				NL_SET_ERR_MSG(extack, "Entries in the MIGRATE attribute's list must be unique");
 				return -EINVAL;
+			}
 		}
 	}
 
@@ -4171,7 +4638,8 @@ static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate)
 int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
 		 struct xfrm_migrate *m, int num_migrate,
 		 struct xfrm_kmaddress *k, struct net *net,
-		 struct xfrm_encap_tmpl *encap)
+		 struct xfrm_encap_tmpl *encap, u32 if_id,
+		 struct netlink_ext_ack *extack, struct xfrm_user_offload *xuo)
 {
 	int i, err, nx_cur = 0, nx_new = 0;
 	struct xfrm_policy *pol = NULL;
@@ -4181,26 +4649,30 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
 	struct xfrm_migrate *mp;
 
 	/* Stage 0 - sanity checks */
-	if ((err = xfrm_migrate_check(m, num_migrate)) < 0)
+	err = xfrm_migrate_check(m, num_migrate, extack);
+	if (err < 0)
 		goto out;
 
 	if (dir >= XFRM_POLICY_MAX) {
+		NL_SET_ERR_MSG(extack, "Invalid policy direction");
 		err = -EINVAL;
 		goto out;
 	}
 
 	/* Stage 1 - find policy */
-	if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) {
-		err = -ENOENT;
+	pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id);
+	if (IS_ERR_OR_NULL(pol)) {
+		NL_SET_ERR_MSG(extack, "Target policy not found");
+		err = IS_ERR(pol) ? PTR_ERR(pol) : -ENOENT;
 		goto out;
 	}
 
 	/* Stage 2 - find and update state(s) */
	for (i = 0, mp = m; i < num_migrate; i++, mp++) {
-		if ((x = xfrm_migrate_state_find(mp, net))) {
+		if ((x = xfrm_migrate_state_find(mp, net, if_id))) {
 			x_cur[nx_cur] = x;
 			nx_cur++;
-			xc = xfrm_state_migrate(x, mp, encap);
+			xc = xfrm_state_migrate(x, mp, encap, net, xuo, extack);
 			if (xc) {
 				x_new[nx_new] = xc;
 				nx_new++;
@@ -4212,7 +4684,8 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
 	}
 
 	/* Stage 3 - update policy */
-	if ((err = xfrm_policy_migrate(pol, m, num_migrate)) < 0)
+	err = xfrm_policy_migrate(pol, m, num_migrate, extack);
+	if (err < 0)
 		goto restore_state;
 
 	/* Stage 4 - delete old state(s) */
