diff options
Diffstat (limited to 'net')
43 files changed, 1405 insertions, 790 deletions
diff --git a/net/atm/clip.c b/net/atm/clip.c index d795b9c5aea4..b9e67e589a7b 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -345,8 +345,8 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } rt = (struct rtable *) dst; - if (rt->rt_gateway) - daddr = &rt->rt_gateway; + if (rt->rt_gw_family == AF_INET) + daddr = &rt->rt_gw4; else daddr = &ip_hdr(skb)->daddr; n = dst_neigh_lookup(dst, daddr); diff --git a/net/bpf/Makefile b/net/bpf/Makefile index 27b2992a0692..b0ca361742e4 100644 --- a/net/bpf/Makefile +++ b/net/bpf/Makefile @@ -1 +1 @@ -obj-y := test_run.o +obj-$(CONFIG_BPF_SYSCALL) := test_run.o diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index fab142b796ef..2221573dacdb 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -123,12 +123,126 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, return data; } +static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) +{ + void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); + void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + u32 size = kattr->test.ctx_size_in; + void *data; + int err; + + if (!data_in && !data_out) + return NULL; + + data = kzalloc(max_size, GFP_USER); + if (!data) + return ERR_PTR(-ENOMEM); + + if (data_in) { + err = bpf_check_uarg_tail_zero(data_in, max_size, size); + if (err) { + kfree(data); + return ERR_PTR(err); + } + + size = min_t(u32, max_size, size); + if (copy_from_user(data, data_in, size)) { + kfree(data); + return ERR_PTR(-EFAULT); + } + } + return data; +} + +static int bpf_ctx_finish(const union bpf_attr *kattr, + union bpf_attr __user *uattr, const void *data, + u32 size) +{ + void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + int err = -EFAULT; + u32 copy_size = size; + + if (!data || !data_out) + return 0; + + if (copy_size > kattr->test.ctx_size_out) { + copy_size = kattr->test.ctx_size_out; + err = -ENOSPC; + } + + if (copy_to_user(data_out, data, copy_size)) + goto out; 
+ if (copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size))) + goto out; + if (err != -ENOSPC) + err = 0; +out: + return err; +} + +/** + * range_is_zero - test whether a buffer range is all zeroes + * @buf: buffer to check + * @from: check from this position + * @to: check up until (excluding) this position + * + * This function returns true if there is no non-zero byte + * in the buf in the range [from,to). + */ +static inline bool range_is_zero(void *buf, size_t from, size_t to) +{ + return !memchr_inv((u8 *)buf + from, 0, to - from); +} + +static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return 0; + + /* make sure the fields we don't use are zeroed */ + if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, priority))) + return -EINVAL; + + /* priority is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) + + FIELD_SIZEOF(struct __sk_buff, priority), + offsetof(struct __sk_buff, cb))) + return -EINVAL; + + /* cb is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + + FIELD_SIZEOF(struct __sk_buff, cb), + sizeof(struct __sk_buff))) + return -EINVAL; + + skb->priority = __skb->priority; + memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); + + return 0; +} + +static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return; + + __skb->priority = skb->priority; + memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); +} + int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool is_l2 = false, is_direct_pkt_access = false; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; + struct __sk_buff *ctx = NULL; u32 retval, duration; int hh_len = ETH_HLEN; struct sk_buff *skb; @@ -141,6 +255,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, 
const union bpf_attr *kattr, if (IS_ERR(data)) return PTR_ERR(data); + ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); + if (IS_ERR(ctx)) { + kfree(data); + return PTR_ERR(ctx); + } + switch (prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: @@ -158,6 +278,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, sk = kzalloc(sizeof(struct sock), GFP_USER); if (!sk) { kfree(data); + kfree(ctx); return -ENOMEM; } sock_net_set(sk, current->nsproxy->net_ns); @@ -166,6 +287,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, skb = build_skb(data, 0); if (!skb) { kfree(data); + kfree(ctx); kfree(sk); return -ENOMEM; } @@ -180,32 +302,37 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); + ret = convert___skb_to_skb(skb, ctx); + if (ret) + goto out; ret = bpf_test_run(prog, skb, repeat, &retval, &duration); - if (ret) { - kfree_skb(skb); - kfree(sk); - return ret; - } + if (ret) + goto out; if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { - kfree_skb(skb); - kfree(sk); - return -ENOMEM; + ret = -ENOMEM; + goto out; } } memset(__skb_push(skb, hh_len), 0, hh_len); } + convert_skb_to___skb(skb, ctx); size = skb->len; /* bpf program can never convert linear skb to non-linear */ if (WARN_ON_ONCE(skb_is_nonlinear(skb))) size = skb_headlen(skb); ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, ctx, + sizeof(struct __sk_buff)); +out: kfree_skb(skb); kfree(sk); + kfree(ctx); return ret; } @@ -220,6 +347,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, void *data; int ret; + if (kattr->test.ctx_in || kattr->test.ctx_out) + return -EINVAL; + data = bpf_test_init(kattr, size, 
XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0); if (IS_ERR(data)) return PTR_ERR(data); @@ -263,6 +393,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; + if (kattr->test.ctx_in || kattr->test.ctx_out) + return -EINVAL; + data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 711d7156efd8..6c6e01963aac 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -186,15 +186,19 @@ static int transmit(struct cflayer *layer, struct cfpkt *pkt) goto noxoff; if (likely(!netif_queue_stopped(caifd->netdev))) { + struct Qdisc *sch; + /* If we run with a TX queue, check if the queue is too long*/ txq = netdev_get_tx_queue(skb->dev, 0); - qlen = qdisc_qlen(rcu_dereference_bh(txq->qdisc)); - - if (likely(qlen == 0)) + sch = rcu_dereference_bh(txq->qdisc); + if (likely(qdisc_is_empty(sch))) goto noxoff; + /* can check for explicit qdisc len value only !NOLOCK, + * always set flow off otherwise + */ high = (caifd->netdev->tx_queue_len * q_high) / 100; - if (likely(qlen < high)) + if (!(sch->flags & TCQ_F_NOLOCK) && likely(sch->q.qlen < high)) goto noxoff; } diff --git a/net/core/filter.c b/net/core/filter.c index 8904e3407163..95a27fdf9a40 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2970,11 +2970,14 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ - BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2( \ + BPF_ADJ_ROOM_ENCAP_L2_MASK)) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { + u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; u16 mac_len = 0, inner_net = 0, inner_trans = 0; unsigned int 
gso_type = SKB_GSO_DODGY; @@ -3009,6 +3012,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, mac_len = skb->network_header - skb->mac_header; inner_net = skb->network_header; + if (inner_mac_len > len_diff) + return -EINVAL; inner_trans = skb->transport_header; } @@ -3017,8 +3022,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, return ret; if (encap) { - /* inner mac == inner_net on l3 encap */ - skb->inner_mac_header = inner_net; + skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; skb_set_inner_protocol(skb, skb->protocol); @@ -3032,7 +3036,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, gso_type |= SKB_GSO_GRE; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) gso_type |= SKB_GSO_IPXIP6; - else + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) gso_type |= SKB_GSO_IPXIP4; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || @@ -4639,15 +4643,26 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, return BPF_FIB_LKUP_RET_UNSUPP_LWT; dev = nhc->nhc_dev; - if (nhc->nhc_has_gw) - params->ipv4_dst = nhc->nhc_gw.ipv4; params->rt_metric = res.fi->fib_priority; /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here */ - neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); + if (likely(nhc->nhc_gw_family != AF_INET6)) { + if (nhc->nhc_gw_family) + params->ipv4_dst = nhc->nhc_gw.ipv4; + + neigh = __ipv4_neigh_lookup_noref(dev, + (__force u32)params->ipv4_dst); + } else { + struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst; + + params->family = AF_INET6; + *dst = nhc->nhc_gw.ipv6; + neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); + } + if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; @@ -4752,18 +4767,16 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (f6i->fib6_nh.fib_nh_lws) return 
BPF_FIB_LKUP_RET_UNSUPP_LWT; - if (f6i->fib6_nh.fib_nh_has_gw) + if (f6i->fib6_nh.fib_nh_gw_family) *dst = f6i->fib6_nh.fib_nh_gw6; dev = f6i->fib6_nh.fib_nh_dev; params->rt_metric = f6i->fib6_metric; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is - * not needed here. Can not use __ipv6_neigh_lookup_noref here - * because we need to get nd_tbl via the stub + * not needed here. */ - neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, - ndisc_hashfn, dst, dev); + neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index ac679f74ba47..9bf1b9ad1780 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -291,6 +291,7 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, for_each_possible_cpu(i) { const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); + qstats->qlen = 0; qstats->backlog += qcpu->backlog; qstats->drops += qcpu->drops; qstats->requeues += qcpu->requeues; @@ -306,6 +307,7 @@ void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, if (cpu) { __gnet_stats_copy_queue_cpu(qstats, cpu); } else { + qstats->qlen = q->qlen; qstats->backlog = q->backlog; qstats->drops = q->drops; qstats->requeues = q->requeues; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 7e6dcc625701..ebb5b6d21a13 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -839,7 +839,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; } else if (tb[NETNSA_NSID]) { - peer = get_net_ns_by_id(net, nla_get_u32(tb[NETNSA_NSID])); + peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID])); if (!peer) peer = ERR_PTR(-ENOENT); nla = tb[NETNSA_NSID]; diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 76338c38738a..19aa32fc1802 100644 --- a/net/dns_resolver/dns_query.c +++ 
b/net/dns_resolver/dns_query.c @@ -94,8 +94,6 @@ int dns_query(const char *type, const char *name, size_t namelen, desclen += typelen + 1; } - if (!namelen) - namelen = strnlen(name, 256); if (namelen < 3 || namelen > 255) return -EINVAL; desclen += namelen + 1; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 15f779bd26b3..d4b63f94f7be 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -558,7 +558,8 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (rt->rt_gateway.sa_family == AF_INET && addr) { unsigned int addr_type; - cfg->fc_gw = addr; + cfg->fc_gw4 = addr; + cfg->fc_gw_family = AF_INET; addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && addr_type == RTN_UNICAST) @@ -568,7 +569,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (cmd == SIOCDELRT) return 0; - if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw) + if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family) return -EINVAL; if (cfg->fc_scope == RT_SCOPE_NOWHERE) @@ -664,10 +665,55 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_DPORT] = { .type = NLA_U16 }, }; +int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + struct rtvia *via; + int alen; + + if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { + NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA"); + return -EINVAL; + } + + via = nla_data(nla); + alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); + + switch (via->rtvia_family) { + case AF_INET: + if (alen != sizeof(__be32)) { + NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA"); + return -EINVAL; + } + cfg->fc_gw_family = AF_INET; + cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); + break; + case AF_INET6: +#ifdef CONFIG_IPV6 + if (alen != sizeof(struct in6_addr)) { + NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); + return -EINVAL; + } + 
cfg->fc_gw_family = AF_INET6; + cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr); +#else + NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); + return -EINVAL; +#endif + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA"); + return -EINVAL; + } + + return 0; +} + static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_config *cfg, struct netlink_ext_ack *extack) { + bool has_gw = false, has_via = false; struct nlattr *attr; int err, remaining; struct rtmsg *rtm; @@ -708,12 +754,17 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, cfg->fc_oif = nla_get_u32(attr); break; case RTA_GATEWAY: - cfg->fc_gw = nla_get_be32(attr); + has_gw = true; + cfg->fc_gw4 = nla_get_be32(attr); + if (cfg->fc_gw4) + cfg->fc_gw_family = AF_INET; break; case RTA_VIA: - NL_SET_ERR_MSG(extack, "IPv4 does not support RTA_VIA attribute"); - err = -EINVAL; - goto errout; + has_via = true; + err = fib_gw_from_via(cfg, attr, extack); + if (err) + goto errout; + break; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; @@ -752,6 +803,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, } } + if (has_gw && has_via) { + NL_SET_ERR_MSG(extack, + "Nexthop configuration can not contain both GATEWAY and VIA"); + return -EINVAL; + } + return 0; errout: return err; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 8e0cb1687a74..779d2be2b135 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -41,6 +41,7 @@ #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/ip6_fib.h> #include <net/netlink.h> #include <net/nexthop.h> #include <net/lwtunnel.h> @@ -276,7 +277,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) for_nexthops(fi) { if (nh->fib_nh_oif != onh->fib_nh_oif || - nh->fib_nh_gw4 != onh->fib_nh_gw4 || + nh->fib_nh_gw_family != onh->fib_nh_gw_family || 
nh->fib_nh_scope != onh->fib_nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->fib_nh_weight != onh->fib_nh_weight || @@ -287,6 +288,15 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) || ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK)) return -1; + + if (nh->fib_nh_gw_family == AF_INET && + nh->fib_nh_gw4 != onh->fib_nh_gw4) + return -1; + + if (nh->fib_nh_gw_family == AF_INET6 && + ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6)) + return -1; + onh++; } endfor_nexthops(fi); return 0; @@ -447,10 +457,18 @@ static int fib_detect_death(struct fib_info *fi, int order, struct fib_info **last_resort, int *last_idx, int dflt) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); struct neighbour *n; int state = NUD_NONE; - n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].fib_nh_gw4, fi->fib_dev); + if (likely(nhc->nhc_gw_family == AF_INET)) + n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev); + else if (nhc->nhc_gw_family == AF_INET6) + n = neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6, + nhc->nhc_dev); + else + n = NULL; + if (n) { state = n->nud_state; neigh_release(n); @@ -511,10 +529,12 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, goto init_failure; nh->fib_nh_oif = cfg->fc_oif; - if (cfg->fc_gw) { - nh->fib_nh_gw4 = cfg->fc_gw; - nh->fib_nh_has_gw = 1; - } + nh->fib_nh_gw_family = cfg->fc_gw_family; + if (cfg->fc_gw_family == AF_INET) + nh->fib_nh_gw4 = cfg->fc_gw4; + else if (cfg->fc_gw_family == AF_INET6) + nh->fib_nh_gw6 = cfg->fc_gw6; + nh->fib_nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID @@ -586,11 +606,24 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { - struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - if (nla) - fib_cfg.fc_gw = nla_get_in_addr(nla); + nlav 
= nla_find(attrs, attrlen, RTA_VIA); + if (nla && nlav) { + NL_SET_ERR_MSG(extack, + "Nexthop configuration can not contain both GATEWAY and VIA"); + return -EINVAL; + } + if (nla) { + fib_cfg.fc_gw4 = nla_get_in_addr(nla); + if (fib_cfg.fc_gw4) + fib_cfg.fc_gw_family = AF_INET; + } else if (nlav) { + ret = fib_gw_from_via(&fib_cfg, nlav, extack); + if (ret) + goto errout; + } nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla) @@ -616,10 +649,16 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, "Nexthop device index does not match RTA_OIF"); goto errout; } - if (cfg->fc_gw && fi->fib_nh->fib_nh_gw4 != cfg->fc_gw) { - NL_SET_ERR_MSG(extack, - "Nexthop gateway does not match RTA_GATEWAY"); - goto errout; + if (cfg->fc_gw_family) { + if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family || + (cfg->fc_gw_family == AF_INET && + fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4) || + (cfg->fc_gw_family == AF_INET6 && + ipv6_addr_cmp(&fi->fib_nh->fib_nh_gw6, &cfg->fc_gw6))) { + NL_SET_ERR_MSG(extack, + "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA"); + goto errout; + } } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { @@ -719,7 +758,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) return 1; - if (cfg->fc_oif || cfg->fc_gw) { + if (cfg->fc_oif || cfg->fc_gw_family) { if (cfg->fc_encap) { if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap, fi->fib_nh, cfg, extack)) @@ -730,10 +769,20 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, cfg->fc_flow != fi->fib_nh->nh_tclassid) return 1; #endif - if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->fib_nh_oif) && - (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->fib_nh_gw4)) - return 0; - return 1; + if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) || + (cfg->fc_gw_family && + cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family)) + return 1; + + if (cfg->fc_gw_family == 
AF_INET && + cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4) + return 1; + + if (cfg->fc_gw_family == AF_INET6 && + ipv6_addr_cmp(&cfg->fc_gw6, &fi->fib_nh->fib_nh_gw6)) + return 1; + + return 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -754,11 +803,43 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { - struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - if (nla && nla_get_in_addr(nla) != nh->fib_nh_gw4) - return 1; + nlav = nla_find(attrs, attrlen, RTA_VIA); + if (nla && nlav) { + NL_SET_ERR_MSG(extack, + "Nexthop configuration can not contain both GATEWAY and VIA"); + return -EINVAL; + } + + if (nla) { + if (nh->fib_nh_gw_family != AF_INET || + nla_get_in_addr(nla) != nh->fib_nh_gw4) + return 1; + } else if (nlav) { + struct fib_config cfg2; + int err; + + err = fib_gw_from_via(&cfg2, nlav, extack); + if (err) + return err; + + switch (nh->fib_nh_gw_family) { + case AF_INET: + if (cfg2.fc_gw_family != AF_INET || + cfg2.fc_gw4 != nh->fib_nh_gw4) + return 1; + break; + case AF_INET6: + if (cfg2.fc_gw_family != AF_INET6 || + ipv6_addr_cmp(&cfg2.fc_gw6, + &nh->fib_nh_gw6)) + return 1; + break; + } + } + #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla && nla_get_u32(nla) != nh->nh_tclassid) @@ -812,6 +893,30 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) return true; } +static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh, + u32 table, struct netlink_ext_ack *extack) +{ + struct fib6_config cfg = { + .fc_table = table, + .fc_flags = nh->fib_nh_flags | RTF_GATEWAY, + .fc_ifindex = nh->fib_nh_oif, + .fc_gateway = nh->fib_nh_gw6, + }; + struct fib6_nh fib6_nh = {}; + int err; + + err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack); + if (!err) { + nh->fib_nh_dev = fib6_nh.fib_nh_dev; + dev_hold(nh->fib_nh_dev); + nh->fib_nh_oif = 
nh->fib_nh_dev->ifindex; + nh->fib_nh_scope = RT_SCOPE_LINK; + + ipv6_stub->fib6_nh_release(&fib6_nh); + } + + return err; +} /* * Picture @@ -856,134 +961,152 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) * | * |-> {local prefix} (terminal node) */ -static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, - struct netlink_ext_ack *extack) +static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, + u8 scope, struct netlink_ext_ack *extack) { - int err = 0; - struct net *net; struct net_device *dev; + struct fib_result res; + int err; - net = cfg->fc_nlinfo.nl_net; - if (nh->fib_nh_gw4) { - struct fib_result res; - - if (nh->fib_nh_flags & RTNH_F_ONLINK) { - unsigned int addr_type; + if (nh->fib_nh_flags & RTNH_F_ONLINK) { + unsigned int addr_type; - if (cfg->fc_scope >= RT_SCOPE_LINK) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid scope"); - return -EINVAL; - } - dev = __dev_get_by_index(net, nh->fib_nh_oif); - if (!dev) { - NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); - return -ENODEV; - } - if (!(dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, - "Nexthop device is not up"); - return -ENETDOWN; - } - addr_type = inet_addr_type_dev_table(net, dev, - nh->fib_nh_gw4); - if (addr_type != RTN_UNICAST) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway"); - return -EINVAL; - } - if (!netif_carrier_ok(dev)) - nh->fib_nh_flags |= RTNH_F_LINKDOWN; - nh->fib_nh_dev = dev; - dev_hold(dev); - nh->fib_nh_scope = RT_SCOPE_LINK; - return 0; + if (scope >= RT_SCOPE_LINK) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid scope"); + return -EINVAL; } - rcu_read_lock(); - { - struct fib_table *tbl = NULL; - struct flowi4 fl4 = { - .daddr = nh->fib_nh_gw4, - .flowi4_scope = cfg->fc_scope + 1, - .flowi4_oif = nh->fib_nh_oif, - .flowi4_iif = LOOPBACK_IFINDEX, - }; - - /* It is not necessary, but requires a bit of thinking */ - if (fl4.flowi4_scope < RT_SCOPE_LINK) - fl4.flowi4_scope = RT_SCOPE_LINK; - - if 
(cfg->fc_table) - tbl = fib_get_table(net, cfg->fc_table); - - if (tbl) - err = fib_table_lookup(tbl, &fl4, &res, - FIB_LOOKUP_IGNORE_LINKSTATE | - FIB_LOOKUP_NOREF); - - /* on error or if no table given do full lookup. This - * is needed for example when nexthops are in the local - * table rather than the given table - */ - if (!tbl || err) { - err = fib_lookup(net, &fl4, &res, - FIB_LOOKUP_IGNORE_LINKSTATE); - } - - if (err) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway"); - rcu_read_unlock(); - return err; - } + dev = __dev_get_by_index(net, nh->fib_nh_oif); + if (!dev) { + NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); + return -ENODEV; } - err = -EINVAL; - if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { - NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); - goto out; + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + return -ENETDOWN; } - nh->fib_nh_scope = res.scope; - nh->fib_nh_oif = FIB_RES_OIF(res); - nh->fib_nh_dev = dev = FIB_RES_DEV(res); - if (!dev) { - NL_SET_ERR_MSG(extack, - "No egress device for nexthop gateway"); - goto out; + addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4); + if (addr_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); + return -EINVAL; } - dev_hold(dev); if (!netif_carrier_ok(dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; - err = (dev->flags & IFF_UP) ? 
0 : -ENETDOWN; - } else { - struct in_device *in_dev; - - if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { - NL_SET_ERR_MSG(extack, - "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); - return -EINVAL; + nh->fib_nh_dev = dev; + dev_hold(dev); + nh->fib_nh_scope = RT_SCOPE_LINK; + return 0; + } + rcu_read_lock(); + { + struct fib_table *tbl = NULL; + struct flowi4 fl4 = { + .daddr = nh->fib_nh_gw4, + .flowi4_scope = scope + 1, + .flowi4_oif = nh->fib_nh_oif, + .flowi4_iif = LOOPBACK_IFINDEX, + }; + + /* It is not necessary, but requires a bit of thinking */ + if (fl4.flowi4_scope < RT_SCOPE_LINK) + fl4.flowi4_scope = RT_SCOPE_LINK; + + if (table) + tbl = fib_get_table(net, table); + + if (tbl) + err = fib_table_lookup(tbl, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE | + FIB_LOOKUP_NOREF); + + /* on error or if no table given do full lookup. This + * is needed for example when nexthops are in the local + * table rather than the given table + */ + if (!tbl || err) { + err = fib_lookup(net, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE); } - rcu_read_lock(); - err = -ENODEV; - in_dev = inetdev_by_index(net, nh->fib_nh_oif); - if (!in_dev) - goto out; - err = -ENETDOWN; - if (!(in_dev->dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); + + if (err) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; } - nh->fib_nh_dev = in_dev->dev; - dev_hold(nh->fib_nh_dev); - nh->fib_nh_scope = RT_SCOPE_HOST; - if (!netif_carrier_ok(nh->fib_nh_dev)) - nh->fib_nh_flags |= RTNH_F_LINKDOWN; - err = 0; } + + err = -EINVAL; + if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); + goto out; + } + nh->fib_nh_scope = res.scope; + nh->fib_nh_oif = FIB_RES_OIF(res); + nh->fib_nh_dev = dev = FIB_RES_DEV(res); + if (!dev) { + NL_SET_ERR_MSG(extack, + "No egress device for nexthop gateway"); + goto out; + } + dev_hold(dev); + if (!netif_carrier_ok(dev)) + 
nh->fib_nh_flags |= RTNH_F_LINKDOWN; + err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; +out: + rcu_read_unlock(); + return err; +} + +static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh, + struct netlink_ext_ack *extack) +{ + struct in_device *in_dev; + int err; + + if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { + NL_SET_ERR_MSG(extack, + "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); + return -EINVAL; + } + + rcu_read_lock(); + + err = -ENODEV; + in_dev = inetdev_by_index(net, nh->fib_nh_oif); + if (!in_dev) + goto out; + err = -ENETDOWN; + if (!(in_dev->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); + goto out; + } + + nh->fib_nh_dev = in_dev->dev; + dev_hold(nh->fib_nh_dev); + nh->fib_nh_scope = RT_SCOPE_HOST; + if (!netif_carrier_ok(nh->fib_nh_dev)) + nh->fib_nh_flags |= RTNH_F_LINKDOWN; + err = 0; out: rcu_read_unlock(); return err; } +static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, + struct netlink_ext_ack *extack) +{ + struct net *net = cfg->fc_nlinfo.nl_net; + u32 table = cfg->fc_table; + int err; + + if (nh->fib_nh_gw_family == AF_INET) + err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack); + else if (nh->fib_nh_gw_family == AF_INET6) + err = fib_check_nh_v6_gw(net, nh, table, extack); + else + err = fib_check_nh_nongw(net, nh, extack); + + return err; +} + static inline unsigned int fib_laddr_hashfn(__be32 val) { unsigned int mask = (fib_info_hash_size - 1); @@ -1204,7 +1327,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto failure; if (fib_props[cfg->fc_type].error) { - if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) { + if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Gateway, device and multipath can not be specified for this route type"); goto err_inval; @@ -1238,7 +1361,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, "Route with host scope can not have multiple nexthops"); goto 
err_inval; } - if (nh->fib_nh_gw4) { + if (nh->fib_nh_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); goto err_inval; @@ -1269,6 +1392,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg, change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); + if (nexthop_nh->fib_nh_gw_family == AF_INET6) + fi->fib_nh_is_v6 = true; } endfor_nexthops(fi) fib_rebalance(fi); @@ -1341,18 +1466,32 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, rcu_read_unlock(); } - if (nhc->nhc_has_gw) { - switch (nhc->nhc_family) { - case AF_INET: - if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) - goto nla_put_failure; - break; - case AF_INET6: - if (nla_put_in6_addr(skb, RTA_GATEWAY, - &nhc->nhc_gw.ipv6) < 0) + switch (nhc->nhc_gw_family) { + case AF_INET: + if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) + goto nla_put_failure; + break; + case AF_INET6: + /* if gateway family does not match nexthop family + * gateway is encoded as RTA_VIA + */ + if (nhc->nhc_gw_family != nhc->nhc_family) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) goto nla_put_failure; - break; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen); + } else if (nla_put_in6_addr(skb, RTA_GATEWAY, + &nhc->nhc_gw.ipv6) < 0) { + goto nla_put_failure; } + break; } *flags |= (nhc->nhc_flags & RTNH_F_ONLINK); @@ -1832,8 +1971,14 @@ static bool fib_good_nh(const struct fib_nh *nh) rcu_read_lock_bh(); - n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, - (__force u32)nh->fib_nh_gw4); + if (likely(nh->fib_nh_gw_family == AF_INET)) + n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, + (__force u32)nh->fib_nh_gw4); + else if (nh->fib_nh_gw_family == AF_INET6) + n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, + &nh->fib_nh_gw6); + else + n = NULL; if (n) state = n->nud_state; diff --git 
a/net/ipv4/fou.c b/net/ipv4/fou.c index 100e63f57ea6..b038f563baa4 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -136,7 +136,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) break; case 1: { - /* Direct encasulation of IPv4 or IPv6 */ + /* Direct encapsulation of IPv4 or IPv6 */ int prot; @@ -170,9 +170,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) /* guehdr may change after pull */ guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - hdrlen = sizeof(struct guehdr) + optlen; - - if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen)) + if (validate_gue_flags(guehdr, optlen)) goto drop; hdrlen = sizeof(struct guehdr) + optlen; @@ -1137,7 +1135,7 @@ static int gue_err(struct sk_buff *skb, u32 info) case 0: /* Full GUE header present */ break; case 1: { - /* Direct encasulation of IPv4 or IPv6 */ + /* Direct encapsulation of IPv4 or IPv6 */ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); switch (((struct iphdr *)guehdr)->version) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6ea523d71947..a175e3e7ae97 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -564,7 +564,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_gw_family) goto route_err; rcu_read_unlock(); return &rt->dst; @@ -602,7 +602,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_gw_family) goto route_err; return &rt->dst; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 00ec819f949b..06f6f280b9ff 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -123,7 
+123,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_uses_gateway) + if (opt->is_strictroute && rt->rt_gw_family) goto sr_failed; IPCB(skb)->flags |= IPSKB_FORWARDED; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fd219f7bd3ea..4b0526441476 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -259,7 +259,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; struct erspan_base_hdr *ershdr; - struct erspan_metadata *pkt_md; struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; const struct iphdr *iph; @@ -282,9 +281,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; - ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); - pkt_md = (struct erspan_metadata *)(ershdr + 1); - if (__iptunnel_pull_header(skb, len, htons(ETH_P_TEB), @@ -292,8 +288,9 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, goto drop; if (tunnel->collect_md) { + struct erspan_metadata *pkt_md, *md; struct ip_tunnel_info *info; - struct erspan_metadata *md; + unsigned char *gh; __be64 tun_id; __be16 flags; @@ -306,6 +303,14 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (!tun_dst) return PACKET_REJECT; + /* skb can be uncloned in __iptunnel_pull_header, so + * old pkt_md is no longer valid and we need to reset + * it + */ + gh = skb_network_header(skb) + + skb_network_header_len(skb); + pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len + + sizeof(*ershdr)); md = ip_tunnel_info_opts(&tun_dst->u.tun_info); md->version = ver; md2 = &md->u.md2; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 10b35328cfbc..4e42c1974ba2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -188,7 +188,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s struct 
net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; - u32 nexthop; + bool is_v6gw = false; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); @@ -218,16 +218,13 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s } rcu_read_lock_bh(); - nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); - neigh = __ipv4_neigh_lookup_noref(dev, nexthop); - if (unlikely(!neigh)) - neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int res; sock_confirm_neigh(skb, neigh); - res = neigh_output(neigh, skb); - + /* if crossing protocols, can not use the cached header */ + res = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock_bh(); return res; } @@ -472,7 +469,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family) goto no_route; /* OK, we know where to send it, allocate and build IP header. 
*/ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f3f2adf630d4..efa6a36cbfff 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -434,37 +434,46 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { + const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; - const __be32 *pkey = daddr; - const struct rtable *rt; struct neighbour *n; - rt = (const struct rtable *) dst; - if (rt->rt_gateway) - pkey = (const __be32 *) &rt->rt_gateway; - else if (skb) - pkey = &ip_hdr(skb)->daddr; + rcu_read_lock_bh(); + + if (likely(rt->rt_gw_family == AF_INET)) { + n = ip_neigh_gw4(dev, rt->rt_gw4); + } else if (rt->rt_gw_family == AF_INET6) { + n = ip_neigh_gw6(dev, &rt->rt_gw6); + } else { + __be32 pkey; + + pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr); + n = ip_neigh_gw4(dev, pkey); + } + + if (n && !refcount_inc_not_zero(&n->refcnt)) + n = NULL; - n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); - if (n) - return n; - return neigh_create(&arp_tbl, pkey, dev); + rcu_read_unlock_bh(); + + return n; } static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) { + const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; const __be32 *pkey = daddr; - const struct rtable *rt; - rt = (const struct rtable *)dst; - if (rt->rt_gateway) - pkey = (const __be32 *)&rt->rt_gateway; - else if (!daddr || + if (rt->rt_gw_family == AF_INET) { + pkey = (const __be32 *)&rt->rt_gw4; + } else if (rt->rt_gw_family == AF_INET6) { + return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6); + } else if (!daddr || (rt->rt_flags & - (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) + (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) { return; - + } __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } @@ -629,8 +638,8 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh if (fnhe->fnhe_gw) { 
rt->rt_flags |= RTCF_REDIRECTED; - rt->rt_gateway = fnhe->fnhe_gw; - rt->rt_uses_gateway = 1; + rt->rt_gw_family = AF_INET; + rt->rt_gw4 = fnhe->fnhe_gw; } } @@ -747,7 +756,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow return; } - if (rt->rt_gateway != old_gw) + if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw) return; in_dev = __in_dev_get_rcu(dev); @@ -1282,7 +1291,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = READ_ONCE(dst->dev->mtu); if (unlikely(ip_mtu_locked(dst))) { - if (rt->rt_uses_gateway && mtu > 576) + if (rt->rt_gw_family && mtu > 576) mtu = 576; } @@ -1410,8 +1419,10 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, orig = NULL; } fill_route_from_fnhe(rt, fnhe); - if (!rt->rt_gateway) - rt->rt_gateway = daddr; + if (!rt->rt_gw4) { + rt->rt_gw4 = daddr; + rt->rt_gw_family = AF_INET; + } if (do_cache) { dst_hold(&rt->dst); @@ -1535,14 +1546,20 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); - struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); + struct fib_nh *nh; - if (nh->fib_nh_gw4 && nh->fib_nh_scope == RT_SCOPE_LINK) { - rt->rt_gateway = nh->fib_nh_gw4; - rt->rt_uses_gateway = 1; + if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { + rt->rt_gw_family = nhc->nhc_gw_family; + /* only INET and INET6 are supported */ + if (likely(nhc->nhc_gw_family == AF_INET)) + rt->rt_gw4 = nhc->nhc_gw.ipv4; + else + rt->rt_gw6 = nhc->nhc_gw.ipv6; } + ip_dst_init_metrics(&rt->dst, fi->fib_metrics); + nh = container_of(nhc, struct fib_nh, nh_common); #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif @@ -1557,8 +1574,10 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, * However, if we are unsuccessful at storing this * route into the cache we really need to set it. 
*/ - if (!rt->rt_gateway) - rt->rt_gateway = daddr; + if (!rt->rt_gw4) { + rt->rt_gw_family = AF_INET; + rt->rt_gw4 = daddr; + } rt_add_uncached_list(rt); } } else @@ -1591,8 +1610,8 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; - rt->rt_gateway = 0; - rt->rt_uses_gateway = 0; + rt->rt_gw_family = 0; + rt->rt_gw4 = 0; INIT_LIST_HEAD(&rt->rt_uncached); rt->dst.output = ip_output; @@ -1734,8 +1753,9 @@ static int __mkroute_input(struct sk_buff *skb, do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && skb->protocol == htons(ETH_P_IP)) { - __be32 gw = nhc->nhc_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; + __be32 gw; + gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; if (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, gw)) IPCB(skb)->flags |= IPSKB_DOREDIRECT; @@ -2284,7 +2304,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } else { if (unlikely(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH && - !(nhc->nhc_has_gw && + !(nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK))) { do_cache = false; goto add; @@ -2594,8 +2614,11 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; - rt->rt_gateway = ort->rt_gateway; - rt->rt_uses_gateway = ort->rt_uses_gateway; + rt->rt_gw_family = ort->rt_gw_family; + if (rt->rt_gw_family == AF_INET) + rt->rt_gw4 = ort->rt_gw4; + else if (rt->rt_gw_family == AF_INET6) + rt->rt_gw6 = ort->rt_gw6; INIT_LIST_HEAD(&rt->rt_uncached); } @@ -2674,9 +2697,22 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_uses_gateway && - nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway)) + if (rt->rt_gw_family == AF_INET && + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto 
nla_put_failure; + } else if (rt->rt_gw_family == AF_INET6) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + goto nla_put_failure; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + } expires = rt->dst.expires; if (expires) { diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index d73a6d6652f6..72d19b1838ed 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -97,8 +97,11 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; - xdst->u.rt.rt_gateway = rt->rt_gateway; - xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; + xdst->u.rt.rt_gw_family = rt->rt_gw_family; + if (rt->rt_gw_family == AF_INET) + xdst->u.rt.rt_gw4 = rt->rt_gw4; + else if (rt->rt_gw_family == AF_INET6) + xdst->u.rt.rt_gw6 = rt->rt_gw6; xdst->u.rt.rt_pmtu = rt->rt_pmtu; xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2e8d1d2d8d3d..340a0f06f974 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2421,7 +2421,7 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, for_each_fib6_node_rt_rcu(fn) { if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex) continue; - if (no_gw && rt->fib6_nh.fib_nh_has_gw) + if (no_gw && rt->fib6_nh.fib_nh_gw_family) continue; if ((rt->fib6_flags & flags) != flags) continue; diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 945b66e3008f..e37e4c5871f7 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -173,6 +173,14 @@ eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, return 0; } +static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh 
*fib6_nh, + struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); + return -EAFNOSUPPORT; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, .ipv6_route_input = eafnosupport_ipv6_route_input, @@ -181,6 +189,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .fib6_lookup = eafnosupport_fib6_lookup, .fib6_multipath_select = eafnosupport_fib6_multipath_select, .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, + .fib6_nh_init = eafnosupport_fib6_nh_init, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 1789bf99c419..1dac6ea6666a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -919,6 +919,8 @@ static const struct ipv6_stub ipv6_stub_impl = { .fib6_lookup = fib6_lookup, .fib6_multipath_select = fib6_multipath_select, .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, + .fib6_nh_init = fib6_nh_init, + .fib6_nh_release = fib6_nh_release, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 8c00609a1513..46f54a5bb1f0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2304,7 +2304,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif - if (rt->fib6_nh.fib_nh_has_gw) { + if (rt->fib6_nh.fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6); } else { diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b32c95f02128..655e46b227f9 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -525,10 +525,10 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) } static int ip6erspan_rcv(struct sk_buff *skb, - struct tnl_ptk_info *tpi) + struct tnl_ptk_info *tpi, + int gre_hdr_len) { struct 
erspan_base_hdr *ershdr; - struct erspan_metadata *pkt_md; const struct ipv6hdr *ipv6h; struct erspan_md2 *md2; struct ip6_tnl *tunnel; @@ -547,18 +547,16 @@ static int ip6erspan_rcv(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; - ershdr = (struct erspan_base_hdr *)skb->data; - pkt_md = (struct erspan_metadata *)(ershdr + 1); - if (__iptunnel_pull_header(skb, len, htons(ETH_P_TEB), false, false) < 0) return PACKET_REJECT; if (tunnel->parms.collect_md) { + struct erspan_metadata *pkt_md, *md; struct metadata_dst *tun_dst; struct ip_tunnel_info *info; - struct erspan_metadata *md; + unsigned char *gh; __be64 tun_id; __be16 flags; @@ -571,6 +569,14 @@ static int ip6erspan_rcv(struct sk_buff *skb, if (!tun_dst) return PACKET_REJECT; + /* skb can be uncloned in __iptunnel_pull_header, so + * old pkt_md is no longer valid and we need to reset + * it + */ + gh = skb_network_header(skb) + + skb_network_header_len(skb); + pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len + + sizeof(*ershdr)); info = &tun_dst->u.tun_info; md = ip_tunnel_info_opts(info); md->version = ver; @@ -607,7 +613,7 @@ static int gre_rcv(struct sk_buff *skb) if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || tpi.proto == htons(ETH_P_ERSPAN2))) { - if (ip6erspan_rcv(skb, &tpi) == PACKET_RCVD) + if (ip6erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; goto out; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e51f3c648b09..adef2236abe2 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -117,7 +117,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); - ret = neigh_output(neigh, skb); + ret = neigh_output(neigh, skb, false); rcu_read_unlock_bh(); return ret; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6e89151693d0..a77c004d67fb 100644 --- a/net/ipv6/route.c +++ 
b/net/ipv6/route.c @@ -102,7 +102,8 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); -static int rt6_score_route(struct fib6_info *rt, int oif, int strict); +static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, + int strict); static size_t rt6_nlmsg_size(struct fib6_info *rt); static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, @@ -446,12 +447,13 @@ struct fib6_info *fib6_multipath_select(const struct net *net, list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, fib6_siblings) { + const struct fib6_nh *nh = &sibling->fib6_nh; int nh_upper_bound; - nh_upper_bound = atomic_read(&sibling->fib6_nh.fib_nh_upper_bound); + nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); if (fl6->mp_hash > nh_upper_bound) continue; - if (rt6_score_route(sibling, oif, strict) < 0) + if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; match = sibling; break; @@ -464,12 +466,34 @@ struct fib6_info *fib6_multipath_select(const struct net *net, * Route lookup. rcu_read_lock() should be held. 
*/ +static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, + const struct in6_addr *saddr, int oif, int flags) +{ + const struct net_device *dev; + + if (nh->fib_nh_flags & RTNH_F_DEAD) + return false; + + dev = nh->fib_nh_dev; + if (oif) { + if (dev->ifindex == oif) + return true; + } else { + if (ipv6_chk_addr(net, saddr, dev, + flags & RT6_LOOKUP_F_IFACE)) + return true; + } + + return false; +} + static inline struct fib6_info *rt6_device_match(struct net *net, struct fib6_info *rt, const struct in6_addr *saddr, int oif, int flags) { + const struct fib6_nh *nh; struct fib6_info *sprt; if (!oif && ipv6_addr_any(saddr) && @@ -477,19 +501,9 @@ static inline struct fib6_info *rt6_device_match(struct net *net, return rt; for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) { - const struct net_device *dev = sprt->fib6_nh.fib_nh_dev; - - if (sprt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) - continue; - - if (oif) { - if (dev->ifindex == oif) - return sprt; - } else { - if (ipv6_chk_addr(net, saddr, dev, - flags & RT6_LOOKUP_F_IFACE)) - return sprt; - } + nh = &sprt->fib6_nh; + if (__rt6_device_match(net, nh, saddr, oif, flags)) + return sprt; } if (oif && flags & RT6_LOOKUP_F_IFACE) @@ -517,7 +531,7 @@ static void rt6_probe_deferred(struct work_struct *w) kfree(work); } -static void rt6_probe(struct fib6_info *rt) +static void rt6_probe(struct fib6_nh *fib6_nh) { struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; @@ -533,11 +547,11 @@ static void rt6_probe(struct fib6_info *rt) * Router Reachability Probe MUST be rate-limited * to no more than one per minute. 
*/ - if (!rt || !rt->fib6_nh.fib_nh_has_gw) + if (!fib6_nh->fib_nh_gw_family) return; - nh_gw = &rt->fib6_nh.fib_nh_gw6; - dev = rt->fib6_nh.fib_nh_dev; + nh_gw = &fib6_nh->fib_nh_gw6; + dev = fib6_nh->fib_nh_dev; rcu_read_lock_bh(); idev = __in6_dev_get(dev); neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); @@ -554,13 +568,13 @@ static void rt6_probe(struct fib6_info *rt) __neigh_set_probe_once(neigh); } write_unlock(&neigh->lock); - } else if (time_after(jiffies, rt->last_probe + + } else if (time_after(jiffies, fib6_nh->last_probe + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } if (work) { - rt->last_probe = jiffies; + fib6_nh->last_probe = jiffies; INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; dev_hold(dev); @@ -572,7 +586,7 @@ out: rcu_read_unlock_bh(); } #else -static inline void rt6_probe(struct fib6_info *rt) +static inline void rt6_probe(struct fib6_nh *fib6_nh) { } #endif @@ -580,27 +594,14 @@ static inline void rt6_probe(struct fib6_info *rt) /* * Default Router Selection (RFC 2461 6.3.6) */ -static inline int rt6_check_dev(struct fib6_info *rt, int oif) -{ - const struct net_device *dev = rt->fib6_nh.fib_nh_dev; - - if (!oif || dev->ifindex == oif) - return 2; - return 0; -} - -static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) +static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh) { enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; - if (rt->fib6_flags & RTF_NONEXTHOP || - !rt->fib6_nh.fib_nh_has_gw) - return RT6_NUD_SUCCEED; - rcu_read_lock_bh(); - neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.fib_nh_dev, - &rt->fib6_nh.fib_nh_gw6); + neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev, + &fib6_nh->fib_nh_gw6); if (neigh) { read_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) @@ -621,43 +622,44 @@ static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) return ret; } -static int rt6_score_route(struct fib6_info *rt, int
oif, int strict) +static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, + int strict) { - int m; + int m = 0; + + if (!oif || nh->fib_nh_dev->ifindex == oif) + m = 2; - m = rt6_check_dev(rt, oif); if (!m && (strict & RT6_LOOKUP_F_IFACE)) return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF - m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; + m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2; #endif - if (strict & RT6_LOOKUP_F_REACHABLE) { - int n = rt6_check_neigh(rt); + if ((strict & RT6_LOOKUP_F_REACHABLE) && + !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) { + int n = rt6_check_neigh(nh); if (n < 0) return n; } return m; } -static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, - int *mpri, struct fib6_info *match, - bool *do_rr) +static bool find_match(struct fib6_nh *nh, u32 fib6_flags, + int oif, int strict, int *mpri, bool *do_rr) { - int m; bool match_do_rr = false; + bool rc = false; + int m; - if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (nh->fib_nh_flags & RTNH_F_DEAD) goto out; - if (ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev) && - rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && + if (ip6_ignore_linkdown(nh->fib_nh_dev) && + nh->fib_nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; - if (fib6_check_expired(rt)) - goto out; - - m = rt6_score_route(rt, oif, strict); + m = rt6_score_route(nh, fib6_flags, oif, strict); if (m == RT6_NUD_FAIL_DO_RR) { match_do_rr = true; m = 0; /* lowest valid score */ @@ -666,53 +668,64 @@ static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, } if (strict & RT6_LOOKUP_F_REACHABLE) - rt6_probe(rt); + rt6_probe(nh); /* note that m can be RT6_NUD_FAIL_PROBE at this point */ if (m > *mpri) { *do_rr = match_do_rr; *mpri = m; - match = rt; + rc = true; } out: - return match; + return rc; } -static struct fib6_info *find_rr_leaf(struct fib6_node *fn, - struct fib6_info *leaf, - struct 
fib6_info *rr_head, - u32 metric, int oif, int strict, - bool *do_rr) +static void __find_rr_leaf(struct fib6_info *rt_start, + struct fib6_info *nomatch, u32 metric, + struct fib6_info **match, struct fib6_info **cont, + int oif, int strict, bool *do_rr, int *mpri) { - struct fib6_info *rt, *match, *cont; - int mpri = -1; + struct fib6_info *rt; - match = NULL; - cont = NULL; - for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) { - if (rt->fib6_metric != metric) { - cont = rt; - break; + for (rt = rt_start; + rt && rt != nomatch; + rt = rcu_dereference(rt->fib6_next)) { + struct fib6_nh *nh; + + if (cont && rt->fib6_metric != metric) { + *cont = rt; + return; } - match = find_match(rt, oif, strict, &mpri, match, do_rr); + if (fib6_check_expired(rt)) + continue; + + nh = &rt->fib6_nh; + if (find_match(nh, rt->fib6_flags, oif, strict, mpri, do_rr)) + *match = rt; } +} - for (rt = leaf; rt && rt != rr_head; - rt = rcu_dereference(rt->fib6_next)) { - if (rt->fib6_metric != metric) { - cont = rt; - break; - } +static struct fib6_info *find_rr_leaf(struct fib6_node *fn, + struct fib6_info *leaf, + struct fib6_info *rr_head, + u32 metric, int oif, int strict, + bool *do_rr) +{ + struct fib6_info *match = NULL, *cont = NULL; + int mpri = -1; - match = find_match(rt, oif, strict, &mpri, match, do_rr); - } + __find_rr_leaf(rr_head, NULL, metric, &match, &cont, + oif, strict, do_rr, &mpri); + + __find_rr_leaf(leaf, rr_head, metric, &match, &cont, + oif, strict, do_rr, &mpri); if (match || !cont) return match; - for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next)) - match = find_match(rt, oif, strict, &mpri, match, do_rr); + __find_rr_leaf(cont, NULL, metric, &match, NULL, + oif, strict, do_rr, &mpri); return match; } @@ -769,7 +782,7 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) { - return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_has_gw; + return 
(rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_gw_family; } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -975,7 +988,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) rt->rt6i_dst = ort->fib6_dst; rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; rt->rt6i_flags = ort->fib6_flags; - if (ort->fib6_nh.fib_nh_has_gw) { + if (ort->fib6_nh.fib_nh_gw_family) { rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6; rt->rt6i_flags |= RTF_GATEWAY; } @@ -1061,36 +1074,37 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: f6i = rcu_dereference(fn->leaf); - if (!f6i) { + if (!f6i) f6i = net->ipv6.fib6_null_entry; - } else { + else f6i = rt6_device_match(net, f6i, &fl6->saddr, fl6->flowi6_oif, flags); - if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) - f6i = fib6_multipath_select(net, f6i, fl6, - fl6->flowi6_oif, skb, - flags); - } + if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; - } - trace_fib6_table_lookup(net, f6i, table, fl6); + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + goto out; + } + if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) + f6i = fib6_multipath_select(net, f6i, fl6, fl6->flowi6_oif, skb, + flags); /* Search through exception table */ rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); if (rt) { if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); - } else if (f6i == net->ipv6.fib6_null_entry) { - rt = net->ipv6.ip6_null_entry; - dst_hold(&rt->dst); } else { rt = ip6_create_rt_rcu(f6i); } +out: + trace_fib6_table_lookup(net, f6i, table, fl6); + rcu_read_unlock(); return rt; @@ -1841,9 +1855,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, rcu_read_lock(); f6i = fib6_table_lookup(net, table, oif, fl6, strict); - if (f6i->fib6_nsiblings) - f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); - if (f6i == net->ipv6.fib6_null_entry) { rt = 
net->ipv6.ip6_null_entry; rcu_read_unlock(); @@ -1851,6 +1862,9 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, return rt; } + if (f6i->fib6_nsiblings) + f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); + /*Search through exception table */ rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); if (rt) { @@ -1860,7 +1874,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, rcu_read_unlock(); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && - !f6i->fib6_nh.fib_nh_has_gw)) { + !f6i->fib6_nh.fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. It is for the special case where * the daddr in the skb during the neighbor look-up is different @@ -2393,6 +2407,35 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, NULL); } +static bool ip6_redirect_nh_match(struct fib6_info *f6i, + struct fib6_nh *nh, + struct flowi6 *fl6, + const struct in6_addr *gw, + struct rt6_info **ret) +{ + if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || + fl6->flowi6_oif != nh->fib_nh_dev->ifindex) + return false; + + /* rt_cache's gateway might be different from its 'parent' + * in the case of an ip redirect. + * So we keep searching in the exception table if the gateway + * is different. 
+ */ + if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { + struct rt6_info *rt_cache; + + rt_cache = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + if (rt_cache && + ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { + *ret = rt_cache; + return true; + } + return false; + } + return true; +} + /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; @@ -2406,7 +2449,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; - struct rt6_info *ret = NULL, *rt_cache; + struct rt6_info *ret = NULL; struct fib6_info *rt; struct fib6_node *fn; @@ -2424,34 +2467,13 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) - continue; if (fib6_check_expired(rt)) continue; if (rt->fib6_flags & RTF_REJECT) break; - if (!rt->fib6_nh.fib_nh_has_gw) - continue; - if (fl6->flowi6_oif != rt->fib6_nh.fib_nh_dev->ifindex) - continue; - /* rt_cache's gateway might be different from its 'parent' - * in the case of an ip redirect. - * So we keep searching in the exception table if the gateway - * is different. 
- */ - if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.fib_nh_gw6)) { - rt_cache = rt6_find_cached_rt(rt, - &fl6->daddr, - &fl6->saddr); - if (rt_cache && - ipv6_addr_equal(&rdfl->gateway, - &rt_cache->rt6i_gateway)) { - ret = rt_cache; - break; - } - continue; - } - break; + if (ip6_redirect_nh_match(rt, &rt->fib6_nh, fl6, + &rdfl->gateway, &ret)) + goto out; } if (!rt) @@ -2964,7 +2986,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, goto out; fib6_nh->fib_nh_gw6 = cfg->fc_gateway; - fib6_nh->fib_nh_has_gw = 1; + fib6_nh->fib_nh_gw_family = AF_INET6; } err = -ENODEV; @@ -3476,7 +3498,7 @@ static struct fib6_info *rt6_get_route_info(struct net *net, if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) continue; if (!(rt->fib6_flags & RTF_ROUTEINFO) || - !rt->fib6_nh.fib_nh_has_gw) + !rt->fib6_nh.fib_nh_gw_family) continue; if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) continue; @@ -3807,7 +3829,7 @@ static int fib6_clean_tohost(struct fib6_info *rt, void *arg) struct in6_addr *gateway = (struct in6_addr *)arg; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - rt->fib6_nh.fib_nh_has_gw && + rt->fib6_nh.fib_nh_gw_family && ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { return -1; } diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index f3a8557494d6..2619c2fbea93 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -137,10 +137,14 @@ static int mpls_xmit(struct sk_buff *skb) mpls_stats_inc_outucastpkts(out_dev, skb); - if (rt) - err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, - skb); - else if (rt6) { + if (rt) { + if (rt->rt_gw_family == AF_INET) + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gw4, + skb); + else if (rt->rt_gw_family == AF_INET6) + err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt->rt_gw6, + skb); + } else if (rt6) { if (ipv6_addr_v4mapped(&rt6->rt6i_gateway)) { /* 6PE (RFC 4798) */ err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt6->rt6i_gateway.s6_addr32[3], diff 
--git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index ddfc52ac1f9b..c0d323b58e73 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -312,6 +312,10 @@ static void nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe, create_info = (struct nci_hci_create_pipe_resp *)skb->data; dest_gate = create_info->dest_gate; new_pipe = create_info->pipe; + if (new_pipe >= NCI_HCI_MAX_PIPES) { + status = NCI_HCI_ANY_E_NOK; + goto exit; + } /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id @@ -336,6 +340,10 @@ static void nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe, goto exit; } delete_info = (struct nci_hci_delete_pipe_noti *)skb->data; + if (delete_info->pipe >= NCI_HCI_MAX_PIPES) { + status = NCI_HCI_ANY_E_NOK; + goto exit; + } ndev->hci_dev->pipes[delete_info->pipe].gate = NCI_HCI_INVALID_GATE; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 2763176e369c..4b5585358699 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -336,8 +336,7 @@ static void fl_mask_free_work(struct work_struct *work) fl_mask_free(mask); } -static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask, - bool async) +static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask) { if (!refcount_dec_and_test(&mask->refcnt)) return false; @@ -348,10 +347,7 @@ static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask, list_del_rcu(&mask->list); spin_unlock(&head->masks_lock); - if (async) - tcf_queue_work(&mask->rwork, fl_mask_free_work); - else - fl_mask_free(mask); + tcf_queue_work(&mask->rwork, fl_mask_free_work); return true; } @@ -538,7 +534,6 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, struct netlink_ext_ack *extack) { struct cls_fl_head *head = fl_head_dereference(tp); - bool async = tcf_exts_get_net(&f->exts); *last = false; @@ -555,7 +550,7 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, 
list_del_rcu(&f->list); spin_unlock(&tp->lock); - *last = fl_mask_put(head, f->mask, async); + *last = fl_mask_put(head, f->mask); if (!tc_skip_hw(f->flags)) fl_hw_destroy_filter(tp, f, rtnl_held, extack); tcf_unbind_filter(tp, &f->res); @@ -1466,9 +1461,9 @@ static int fl_ht_insert_unique(struct cls_fl_filter *fnew, struct fl_flow_mask *mask = fnew->mask; int err; - err = rhashtable_insert_fast(&mask->ht, - &fnew->ht_node, - mask->filter_ht_params); + err = rhashtable_lookup_insert_fast(&mask->ht, + &fnew->ht_node, + mask->filter_ht_params); if (err) { *in_ht = false; /* It is okay if filter with same key exists when @@ -1605,11 +1600,10 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, spin_unlock(&tp->lock); - fl_mask_put(head, fold->mask, true); + fl_mask_put(head, fold->mask); if (!tc_skip_hw(fold->flags)) fl_hw_destroy_filter(tp, fold, rtnl_held, NULL); tcf_unbind_filter(tp, &fold->res); - tcf_exts_get_net(&fold->exts); /* Caller holds reference to fold, so refcnt is always > 0 * after this. */ @@ -1657,8 +1651,9 @@ errout_ht: rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node, fnew->mask->filter_ht_params); errout_mask: - fl_mask_put(head, fnew->mask, true); + fl_mask_put(head, fnew->mask); errout: + tcf_exts_get_net(&fnew->exts); tcf_queue_work(&fnew->rwork, fl_destroy_filter_work); errout_tb: kfree(tb); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index fb8f138b9776..c126b9f78d6e 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -998,6 +998,19 @@ static void notify_and_destroy(struct net *net, struct sk_buff *skb, qdisc_put(old); } +static void qdisc_clear_nolock(struct Qdisc *sch) +{ + sch->flags &= ~TCQ_F_NOLOCK; + if (!(sch->flags & TCQ_F_CPUSTATS)) + return; + + free_percpu(sch->cpu_bstats); + free_percpu(sch->cpu_qstats); + sch->cpu_bstats = NULL; + sch->cpu_qstats = NULL; + sch->flags &= ~TCQ_F_CPUSTATS; +} + /* Graft qdisc "new" to class "classid" of qdisc "parent" or * to device "dev". 
* @@ -1076,7 +1089,7 @@ skip: /* Only support running class lockless if parent is lockless */ if (new && (new->flags & TCQ_F_NOLOCK) && parent && !(parent->flags & TCQ_F_NOLOCK)) - new->flags &= ~TCQ_F_NOLOCK; + qdisc_clear_nolock(new); if (!cops || !cops->graft) return -EOPNOTSUPP; diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index c6a502933fe7..f68fd7a0e038 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -61,16 +61,20 @@ #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> +#include <net/netevent.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> +static LIST_HEAD(cbs_list); +static DEFINE_SPINLOCK(cbs_list_lock); + #define BYTES_PER_KBIT (1000LL / 8) struct cbs_sched_data { bool offload; int queue; - s64 port_rate; /* in bytes/s */ + atomic64_t port_rate; /* in bytes/s */ s64 last; /* timestamp in ns */ s64 credits; /* in bytes */ s32 locredit; /* in bytes */ @@ -82,6 +86,7 @@ struct cbs_sched_data { struct sk_buff **to_free); struct sk_buff *(*dequeue)(struct Qdisc *sch); struct Qdisc *qdisc; + struct list_head cbs_list; }; static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -181,6 +186,11 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) s64 credits; int len; + if (atomic64_read(&q->port_rate) == -1) { + WARN_ONCE(1, "cbs: dequeue() called with unknown port rate."); + return NULL; + } + if (q->credits < 0) { credits = timediff_to_credits(now - q->last, q->idleslope); @@ -207,7 +217,8 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) /* As sendslope is a negative number, this will decrease the * amount of q->credits. 
*/ - credits = credits_from_len(len, q->sendslope, q->port_rate); + credits = credits_from_len(len, q->sendslope, + atomic64_read(&q->port_rate)); credits += q->credits; q->credits = max_t(s64, credits, q->locredit); @@ -294,6 +305,50 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, return 0; } +static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q) +{ + struct ethtool_link_ksettings ecmd; + int port_rate = -1; + + if (!__ethtool_get_link_ksettings(dev, &ecmd) && + ecmd.base.speed != SPEED_UNKNOWN) + port_rate = ecmd.base.speed * 1000 * BYTES_PER_KBIT; + + atomic64_set(&q->port_rate, port_rate); + netdev_dbg(dev, "cbs: set %s's port_rate to: %lld, linkspeed: %d\n", + dev->name, (long long)atomic64_read(&q->port_rate), + ecmd.base.speed); +} + +static int cbs_dev_notifier(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct cbs_sched_data *q; + struct net_device *qdev; + bool found = false; + + ASSERT_RTNL(); + + if (event != NETDEV_UP && event != NETDEV_CHANGE) + return NOTIFY_DONE; + + spin_lock(&cbs_list_lock); + list_for_each_entry(q, &cbs_list, cbs_list) { + qdev = qdisc_dev(q->qdisc); + if (qdev == dev) { + found = true; + break; + } + } + spin_unlock(&cbs_list_lock); + + if (found) + cbs_set_port_rate(dev, q); + + return NOTIFY_DONE; +} + static int cbs_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -315,16 +370,7 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt, qopt = nla_data(tb[TCA_CBS_PARMS]); if (!qopt->offload) { - struct ethtool_link_ksettings ecmd; - s64 link_speed; - - if (!__ethtool_get_link_ksettings(dev, &ecmd)) - link_speed = ecmd.base.speed; - else - link_speed = SPEED_1000; - - q->port_rate = link_speed * 1000 * BYTES_PER_KBIT; - + cbs_set_port_rate(dev, q); cbs_disable_offload(dev, q); } else { err = cbs_enable_offload(dev, q, qopt, extack); @@ -347,6 
+393,7 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + int err; if (!opt) { NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory"); @@ -367,7 +414,17 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, qdisc_watchdog_init(&q->watchdog, sch); - return cbs_change(sch, opt, extack); + err = cbs_change(sch, opt, extack); + if (err) + return err; + + if (!q->offload) { + spin_lock(&cbs_list_lock); + list_add(&q->cbs_list, &cbs_list); + spin_unlock(&cbs_list_lock); + } + + return 0; } static void cbs_destroy(struct Qdisc *sch) @@ -375,8 +432,11 @@ static void cbs_destroy(struct Qdisc *sch) struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - qdisc_watchdog_cancel(&q->watchdog); + spin_lock(&cbs_list_lock); + list_del(&q->cbs_list); + spin_unlock(&cbs_list_lock); + qdisc_watchdog_cancel(&q->watchdog); cbs_disable_offload(dev, q); if (q->qdisc) @@ -487,14 +547,24 @@ static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; +static struct notifier_block cbs_device_notifier = { + .notifier_call = cbs_dev_notifier, +}; + static int __init cbs_module_init(void) { + int err = register_netdevice_notifier(&cbs_device_notifier); + + if (err) + return err; + return register_qdisc(&cbs_qdisc_ops); } static void __exit cbs_module_exit(void) { unregister_qdisc(&cbs_qdisc_ops); + unregister_netdevice_notifier(&cbs_device_notifier); } module_init(cbs_module_init) module_exit(cbs_module_exit) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 81356ef38d1d..848aab3693bd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -68,7 +68,7 @@ static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) skb = __skb_dequeue(&q->skb_bad_txq); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); - qdisc_qstats_atomic_qlen_dec(q); + 
qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; @@ -108,7 +108,7 @@ static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_inc(q, skb); - qdisc_qstats_atomic_qlen_inc(q); + qdisc_qstats_cpu_qlen_inc(q); } else { qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; @@ -118,52 +118,36 @@ static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, spin_unlock(lock); } -static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - while (skb) { - struct sk_buff *next = skb->next; - - __skb_queue_tail(&q->gso_skb, skb); - q->qstats.requeues++; - qdisc_qstats_backlog_inc(q, skb); - q->q.qlen++; /* it's still part of the queue */ + spinlock_t *lock = NULL; - skb = next; + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); } - __netif_schedule(q); - - return 0; -} -static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q) -{ - spinlock_t *lock = qdisc_lock(q); - - spin_lock(lock); while (skb) { struct sk_buff *next = skb->next; __skb_queue_tail(&q->gso_skb, skb); - qdisc_qstats_cpu_requeues_inc(q); - qdisc_qstats_cpu_backlog_inc(q, skb); - qdisc_qstats_atomic_qlen_inc(q); + /* it's still part of the queue */ + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_requeues_inc(q); + qdisc_qstats_cpu_backlog_inc(q, skb); + qdisc_qstats_cpu_qlen_inc(q); + } else { + q->qstats.requeues++; + qdisc_qstats_backlog_inc(q, skb); + q->q.qlen++; + } skb = next; } - spin_unlock(lock); - + if (lock) + spin_unlock(lock); __netif_schedule(q); - - return 0; -} - -static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) -{ - if (q->flags & TCQ_F_NOLOCK) - return dev_requeue_skb_locked(skb, q); - else - return __dev_requeue_skb(skb, q); } static void try_bulk_dequeue_skb(struct Qdisc *q, @@ -252,7 +236,7 @@ static struct sk_buff *dequeue_skb(struct 
Qdisc *q, bool *validate, skb = __skb_dequeue(&q->gso_skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); - qdisc_qstats_atomic_qlen_dec(q); + qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; @@ -645,11 +629,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, if (unlikely(err)) return qdisc_drop_cpu(skb, qdisc, to_free); - qdisc_qstats_atomic_qlen_inc(qdisc); - /* Note: skb can not be used after skb_array_produce(), - * so we better not use qdisc_qstats_cpu_backlog_inc() - */ - this_cpu_add(qdisc->cpu_qstats->backlog, pkt_len); + qdisc_update_stats_at_enqueue(qdisc, pkt_len); return NET_XMIT_SUCCESS; } @@ -668,9 +648,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) skb = __skb_array_consume(q); } if (likely(skb)) { - qdisc_qstats_cpu_backlog_dec(qdisc, skb); - qdisc_bstats_cpu_update(qdisc, skb); - qdisc_qstats_atomic_qlen_dec(qdisc); + qdisc_update_stats_at_dequeue(qdisc, skb); } else { qdisc->empty = true; } @@ -716,6 +694,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc) struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i); q->backlog = 0; + q->qlen = 0; } } diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index c7041999eb5d..1b0fb80162e6 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -20,6 +20,9 @@ #include <net/pkt_cls.h> #include <net/sch_generic.h> +static LIST_HEAD(taprio_list); +static DEFINE_SPINLOCK(taprio_list_lock); + #define TAPRIO_ALL_GATES_OPEN -1 struct sched_entry { @@ -42,9 +45,9 @@ struct taprio_sched { struct Qdisc *root; s64 base_time; int clockid; - int picos_per_byte; /* Using picoseconds because for 10Gbps+ - * speeds it's sub-nanoseconds per byte - */ + atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ + * speeds it's sub-nanoseconds per byte + */ size_t num_entries; /* Protects the update side of the RCU protected current_entry */ @@ -53,6 +56,7 @@ struct taprio_sched { 
struct list_head entries; ktime_t (*get_time)(void); struct hrtimer advance_timer; + struct list_head taprio_list; }; static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -117,7 +121,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) static inline int length_to_duration(struct taprio_sched *q, int len) { - return (len * q->picos_per_byte) / 1000; + return (len * atomic64_read(&q->picos_per_byte)) / 1000; } static struct sk_buff *taprio_dequeue(struct Qdisc *sch) @@ -129,6 +133,11 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) u32 gate_mask; int i; + if (atomic64_read(&q->picos_per_byte) == -1) { + WARN_ONCE(1, "taprio: dequeue() called with unknown picos per byte."); + return NULL; + } + rcu_read_lock(); entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't @@ -233,7 +242,7 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer) next->close_time = close_time; atomic_set(&next->budget, - (next->interval * 1000) / q->picos_per_byte); + (next->interval * 1000) / atomic64_read(&q->picos_per_byte)); first_run: rcu_assign_pointer(q->current_entry, next); @@ -567,7 +576,8 @@ static void taprio_start_sched(struct Qdisc *sch, ktime_t start) first->close_time = ktime_add_ns(start, first->interval); atomic_set(&first->budget, - (first->interval * 1000) / q->picos_per_byte); + (first->interval * 1000) / + atomic64_read(&q->picos_per_byte)); rcu_assign_pointer(q->current_entry, NULL); spin_unlock_irqrestore(&q->current_entry_lock, flags); @@ -575,6 +585,52 @@ static void taprio_start_sched(struct Qdisc *sch, ktime_t start) hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); } +static void taprio_set_picos_per_byte(struct net_device *dev, + struct taprio_sched *q) +{ + struct ethtool_link_ksettings ecmd; + int picos_per_byte = -1; + + if (!__ethtool_get_link_ksettings(dev, &ecmd) && + ecmd.base.speed != SPEED_UNKNOWN) + picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8, + 
ecmd.base.speed * 1000 * 1000); + + atomic64_set(&q->picos_per_byte, picos_per_byte); + netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", + dev->name, (long long)atomic64_read(&q->picos_per_byte), + ecmd.base.speed); +} + +static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net_device *qdev; + struct taprio_sched *q; + bool found = false; + + ASSERT_RTNL(); + + if (event != NETDEV_UP && event != NETDEV_CHANGE) + return NOTIFY_DONE; + + spin_lock(&taprio_list_lock); + list_for_each_entry(q, &taprio_list, taprio_list) { + qdev = qdisc_dev(q->root); + if (qdev == dev) { + found = true; + break; + } + } + spin_unlock(&taprio_list_lock); + + if (found) + taprio_set_picos_per_byte(dev, q); + + return NOTIFY_DONE; +} + static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -582,9 +638,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; - struct ethtool_link_ksettings ecmd; int i, err, size; - s64 link_speed; ktime_t start; err = nla_parse_nested(tb, TCA_TAPRIO_ATTR_MAX, opt, @@ -657,14 +711,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, mqprio->prio_tc_map[i]); } - if (!__ethtool_get_link_ksettings(dev, &ecmd)) - link_speed = ecmd.base.speed; - else - link_speed = SPEED_1000; - - q->picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8, - link_speed * 1000 * 1000); - + taprio_set_picos_per_byte(dev, q); start = taprio_get_start_time(sch); if (!start) return 0; @@ -681,6 +728,10 @@ static void taprio_destroy(struct Qdisc *sch) struct sched_entry *entry, *n; unsigned int i; + spin_lock(&taprio_list_lock); + list_del(&q->taprio_list); + spin_unlock(&taprio_list_lock); + hrtimer_cancel(&q->advance_timer); if (q->qdiscs) { @@ -735,6 
+786,10 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, if (!opt) return -EINVAL; + spin_lock(&taprio_list_lock); + list_add(&q->taprio_list, &taprio_list); + spin_unlock(&taprio_list_lock); + return taprio_change(sch, opt, extack); } @@ -947,14 +1002,24 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; +static struct notifier_block taprio_device_notifier = { + .notifier_call = taprio_dev_notifier, +}; + static int __init taprio_module_init(void) { + int err = register_netdevice_notifier(&taprio_device_notifier); + + if (err) + return err; + return register_qdisc(&taprio_qdisc_ops); } static void __exit taprio_module_exit(void) { unregister_qdisc(&taprio_qdisc_ops); + unregister_netdevice_notifier(&taprio_device_notifier); } module_init(taprio_module_init); diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 102c6fefe38c..25e0b7e5189c 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -484,14 +484,15 @@ static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq, } static int sctp_enqueue_event(struct sctp_ulpq *ulpq, - struct sctp_ulpevent *event) + struct sk_buff_head *skb_list) { - struct sk_buff *skb = sctp_event2skb(event); struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); - struct sk_buff_head *skb_list; + struct sctp_ulpevent *event; + struct sk_buff *skb; - skb_list = (struct sk_buff_head *)skb->prev; + skb = __skb_peek(skb_list); + event = sctp_skb2event(skb); if (sk->sk_shutdown & RCV_SHUTDOWN && (sk->sk_shutdown & SEND_SHUTDOWN || @@ -858,19 +859,24 @@ static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq, if (!(event->msg_flags & SCTP_DATA_UNORDERED)) { event = sctp_intl_reasm(ulpq, event); - if (event && event->msg_flags & MSG_EOR) { + if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); - event = sctp_intl_order(ulpq, event); + if (event->msg_flags & MSG_EOR) + event = 
sctp_intl_order(ulpq, event); } } else { event = sctp_intl_reasm_uo(ulpq, event); + if (event) { + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + } } if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0; - sctp_enqueue_event(ulpq, event); + sctp_enqueue_event(ulpq, &temp); } return event_eor; @@ -944,20 +950,27 @@ out: static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *event; + struct sk_buff_head temp; if (!skb_queue_empty(&ulpq->reasm)) { do { event = sctp_intl_retrieve_first(ulpq); - if (event) - sctp_enqueue_event(ulpq, event); + if (event) { + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + sctp_enqueue_event(ulpq, &temp); + } } while (event); } if (!skb_queue_empty(&ulpq->reasm_uo)) { do { event = sctp_intl_retrieve_first_uo(ulpq); - if (event) - sctp_enqueue_event(ulpq, event); + if (event) { + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + sctp_enqueue_event(ulpq, &temp); + } } while (event); } } @@ -1059,7 +1072,7 @@ static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) if (event) { sctp_intl_retrieve_ordered(ulpq, event); - sctp_enqueue_event(ulpq, event); + sctp_enqueue_event(ulpq, &temp); } } @@ -1298,6 +1311,15 @@ static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) ntohl(skip->mid), skip->flags); } +static int do_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) +{ + struct sk_buff_head temp; + + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + return sctp_ulpq_tail_event(ulpq, &temp); +} + static struct sctp_stream_interleave sctp_stream_interleave_0 = { .data_chunk_len = sizeof(struct sctp_data_chunk), .ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk), @@ -1306,7 +1328,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = { .assign_number = sctp_chunk_assign_ssn, .validate_data = sctp_validate_data, 
.ulpevent_data = sctp_ulpq_tail_data, - .enqueue_event = sctp_ulpq_tail_event, + .enqueue_event = do_ulpq_tail_event, .renege_events = sctp_ulpq_renege, .start_pd = sctp_ulpq_partial_delivery, .abort_pd = sctp_ulpq_abort_pd, @@ -1317,6 +1339,16 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = { .handle_ftsn = sctp_handle_fwdtsn, }; +static int do_sctp_enqueue_event(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff_head temp; + + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + return sctp_enqueue_event(ulpq, &temp); +} + static struct sctp_stream_interleave sctp_stream_interleave_1 = { .data_chunk_len = sizeof(struct sctp_idata_chunk), .ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk), @@ -1325,7 +1357,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = { .assign_number = sctp_chunk_assign_mid, .validate_data = sctp_validate_idata, .ulpevent_data = sctp_ulpevent_idata, - .enqueue_event = sctp_enqueue_event, + .enqueue_event = do_sctp_enqueue_event, .renege_events = sctp_renege_events, .start_pd = sctp_intl_start_pd, .abort_pd = sctp_intl_abort_pd, diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 5dde92101743..7cdc3623fa35 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -116,12 +116,13 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, event = sctp_ulpq_reasm(ulpq, event); /* Do ordering if needed. */ - if ((event) && (event->msg_flags & MSG_EOR)) { + if (event) { /* Create a temporary list to collect chunks on. */ skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); - event = sctp_ulpq_order(ulpq, event); + if (event->msg_flags & MSG_EOR) + event = sctp_ulpq_order(ulpq, event); } /* Send event to the ULP. 'event' is the sctp_ulpevent for @@ -129,7 +130,7 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, */ if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 
1 : 0; - sctp_ulpq_tail_event(ulpq, event); + sctp_ulpq_tail_event(ulpq, &temp); } return event_eor; @@ -193,18 +194,17 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq) return sctp_clear_pd(ulpq->asoc->base.sk, ulpq->asoc); } -/* If the SKB of 'event' is on a list, it is the first such member - * of that list. - */ -int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) +int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); - struct sk_buff_head *queue, *skb_list; - struct sk_buff *skb = sctp_event2skb(event); + struct sctp_ulpevent *event; + struct sk_buff_head *queue; + struct sk_buff *skb; int clear_pd = 0; - skb_list = (struct sk_buff_head *) skb->prev; + skb = __skb_peek(skb_list); + event = sctp_skb2event(skb); /* If the socket is just going to throw this away, do not * even try to deliver it. @@ -257,13 +257,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) } } - /* If we are harvesting multiple skbs they will be - * collected on a list. - */ - if (skb_list) - skb_queue_splice_tail_init(skb_list, queue); - else - __skb_queue_tail(queue, skb); + skb_queue_splice_tail_init(skb_list, queue); /* Did we just complete partial delivery and need to get * rolling again? Move pending data to the receive @@ -738,25 +732,25 @@ void sctp_ulpq_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 fwd_tsn) static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq) { struct sctp_ulpevent *event = NULL; - struct sk_buff_head temp; if (skb_queue_empty(&ulpq->reasm)) return; while ((event = sctp_ulpq_retrieve_reassembled(ulpq)) != NULL) { - /* Do ordering if needed. 
*/ - if ((event) && (event->msg_flags & MSG_EOR)) { - skb_queue_head_init(&temp); - __skb_queue_tail(&temp, sctp_event2skb(event)); + struct sk_buff_head temp; + + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + /* Do ordering if needed. */ + if (event->msg_flags & MSG_EOR) event = sctp_ulpq_order(ulpq, event); - } /* Send event to the ULP. 'event' is the * sctp_ulpevent for very first SKB on the temp' list. */ if (event) - sctp_ulpq_tail_event(ulpq, event); + sctp_ulpq_tail_event(ulpq, &temp); } } @@ -956,7 +950,7 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) if (event) { /* see if we have more ordered that we can deliver */ sctp_ulpq_retrieve_ordered(ulpq, event); - sctp_ulpq_tail_event(ulpq, event); + sctp_ulpq_tail_event(ulpq, &temp); } } @@ -1082,7 +1076,11 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, event = sctp_ulpq_retrieve_first(ulpq); /* Send event to the ULP. */ if (event) { - sctp_ulpq_tail_event(ulpq, event); + struct sk_buff_head temp; + + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + sctp_ulpq_tail_event(ulpq, &temp); sctp_ulpq_set_pd(ulpq); return; } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 77ef53596d18..e066899de72d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -134,11 +134,9 @@ static int smc_release(struct socket *sock) smc = smc_sk(sk); /* cleanup for a dangling non-blocking connect */ - if (smc->connect_info && sk->sk_state == SMC_INIT) + if (smc->connect_nonblock && sk->sk_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); flush_work(&smc->connect_work); - kfree(smc->connect_info); - smc->connect_info = NULL; if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires @@ -452,6 +450,7 @@ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; 
if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; @@ -491,46 +490,41 @@ static int smc_connect_abort(struct smc_sock *smc, int reason_code, mutex_unlock(&smc_client_lgr_pending); smc_conn_free(&smc->conn); + smc->connect_nonblock = 0; return reason_code; } /* check if there is a rdma device available for this connection. */ /* called for connect and listen */ -static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, - u8 *ibport, unsigned short vlan_id, u8 gid[]) +static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini) { - int reason_code = 0; - /* PNET table look up: search active ib_device and port * within same PNETID that also contains the ethernet device * used for the internal TCP socket */ - smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id, - gid); - if (!(*ibdev)) - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - - return reason_code; + smc_pnet_find_roce_resource(smc->clcsock->sk, ini); + if (!ini->ib_dev) + return SMC_CLC_DECL_NOSMCRDEV; + return 0; } /* check if there is an ISM device available for this connection. 
*/ /* called for connect and listen */ -static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev) +static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini) { /* Find ISM device with same PNETID as connecting interface */ - smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev); - if (!(*ismdev)) - return SMC_CLC_DECL_CNFERR; /* configuration error */ + smc_pnet_find_ism_resource(smc->clcsock->sk, ini); + if (!ini->ism_dev) + return SMC_CLC_DECL_NOSMCDDEV; return 0; } /* Check for VLAN ID and register it on ISM device just for CLC handshake */ static int smc_connect_ism_vlan_setup(struct smc_sock *smc, - struct smcd_dev *ismdev, - unsigned short vlan_id) + struct smc_init_info *ini) { - if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id)) - return SMC_CLC_DECL_CNFERR; + if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) + return SMC_CLC_DECL_ISMVLANERR; return 0; } @@ -538,12 +532,11 @@ static int smc_connect_ism_vlan_setup(struct smc_sock *smc, * used, the VLAN ID will be registered again during the connection setup. 
*/ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, - struct smcd_dev *ismdev, - unsigned short vlan_id) + struct smc_init_info *ini) { if (!is_smcd) return 0; - if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id)) + if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id)) return SMC_CLC_DECL_CNFERR; return 0; } @@ -551,13 +544,12 @@ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, /* CLC handshake during connect */ static int smc_connect_clc(struct smc_sock *smc, int smc_type, struct smc_clc_msg_accept_confirm *aclc, - struct smc_ib_device *ibdev, u8 ibport, - u8 gid[], struct smcd_dev *ismdev) + struct smc_init_info *ini) { int rc = 0; /* do inband token exchange */ - rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev); + rc = smc_clc_send_proposal(smc, smc_type, ini); if (rc) return rc; /* receive SMC Accept CLC message */ @@ -568,23 +560,19 @@ static int smc_connect_clc(struct smc_sock *smc, int smc_type, /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, - struct smc_ib_device *ibdev, u8 ibport) + struct smc_init_info *ini) { - int local_contact = SMC_FIRST_CONTACT; struct smc_link *link; int reason_code = 0; + ini->is_smcd = false; + ini->ib_lcl = &aclc->lcl; + ini->ib_clcqpn = ntoh24(aclc->qpn); + ini->srv_first_contact = aclc->hdr.flag; + mutex_lock(&smc_client_lgr_pending); - local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev, - ibport, ntoh24(aclc->qpn), &aclc->lcl, - NULL, 0); - if (local_contact < 0) { - if (local_contact == -ENOMEM) - reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - else if (local_contact == -ENOLINK) - reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. 
error */ - else - reason_code = SMC_CLC_DECL_INTERR; /* other error */ + reason_code = smc_conn_create(smc, ini); + if (reason_code) { mutex_unlock(&smc_client_lgr_pending); return reason_code; } @@ -594,45 +582,48 @@ static int smc_connect_rdma(struct smc_sock *smc, /* create send buffer and rmb */ if (smc_buf_create(smc, false)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, + ini->cln_first_contact); - if (local_contact == SMC_FIRST_CONTACT) + if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, aclc); if (smc_rmb_rtoken_handling(&smc->conn, aclc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, - local_contact); + ini->cln_first_contact); smc_close_init(smc); smc_rx_init(smc); - if (local_contact == SMC_FIRST_CONTACT) { + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { if (smc_ib_ready_link(link)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, - local_contact); + ini->cln_first_contact); } else { if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, - local_contact); + ini->cln_first_contact); } smc_rmb_sync_sg_for_device(&smc->conn); reason_code = smc_clc_send_confirm(smc); if (reason_code) - return smc_connect_abort(smc, reason_code, local_contact); + return smc_connect_abort(smc, reason_code, + ini->cln_first_contact); smc_tx_init(smc); - if (local_contact == SMC_FIRST_CONTACT) { + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ reason_code = smc_clnt_conf_first_link(smc); if (reason_code) return smc_connect_abort(smc, reason_code, - local_contact); + ini->cln_first_contact); } mutex_unlock(&smc_client_lgr_pending); smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; @@ -642,23 +633,26 @@ static int smc_connect_rdma(struct smc_sock *smc, /* setup for ISM connection of client */ 
static int smc_connect_ism(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, - struct smcd_dev *ismdev) + struct smc_init_info *ini) { - int local_contact = SMC_FIRST_CONTACT; int rc = 0; + ini->is_smcd = true; + ini->ism_gid = aclc->gid; + ini->srv_first_contact = aclc->hdr.flag; + /* there is only one lgr role for SMC-D; use server lock */ mutex_lock(&smc_server_lgr_pending); - local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, 0, - NULL, ismdev, aclc->gid); - if (local_contact < 0) { + rc = smc_conn_create(smc, ini); + if (rc) { mutex_unlock(&smc_server_lgr_pending); - return SMC_CLC_DECL_MEM; + return rc; } /* Create send and receive buffers */ if (smc_buf_create(smc, true)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, + ini->cln_first_contact); smc_conn_save_peer_info(smc, aclc); smc_close_init(smc); @@ -667,10 +661,11 @@ static int smc_connect_ism(struct smc_sock *smc, rc = smc_clc_send_confirm(smc); if (rc) - return smc_connect_abort(smc, rc, local_contact); + return smc_connect_abort(smc, rc, ini->cln_first_contact); mutex_unlock(&smc_server_lgr_pending); smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; @@ -682,13 +677,9 @@ static int __smc_connect(struct smc_sock *smc) { bool ism_supported = false, rdma_supported = false; struct smc_clc_msg_accept_confirm aclc; - struct smc_ib_device *ibdev; - struct smcd_dev *ismdev; - u8 gid[SMC_GID_SIZE]; - unsigned short vlan; + struct smc_init_info ini = {0}; int smc_type; int rc = 0; - u8 ibport; sock_hold(&smc->sk); /* sock put in passive closing */ @@ -703,20 +694,21 @@ static int __smc_connect(struct smc_sock *smc) if (using_ipsec(smc)) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC); - /* check for VLAN ID */ - if (smc_vlan_by_tcpsk(smc->clcsock, &vlan)) - return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); + 
/* get vlan id from IP device */ + if (smc_vlan_by_tcpsk(smc->clcsock, &ini)) + return smc_connect_decline_fallback(smc, + SMC_CLC_DECL_GETVLANERR); /* check if there is an ism device available */ - if (!smc_check_ism(smc, &ismdev) && - !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) { + if (!smc_find_ism_device(smc, &ini) && + !smc_connect_ism_vlan_setup(smc, &ini)) { /* ISM is supported for this connection */ ism_supported = true; smc_type = SMC_TYPE_D; } /* check if there is a rdma device available */ - if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) { + if (!smc_find_rdma_device(smc, &ini)) { /* RDMA is supported for this connection */ rdma_supported = true; if (ism_supported) @@ -730,25 +722,25 @@ static int __smc_connect(struct smc_sock *smc) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV); /* perform CLC handshake */ - rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev); + rc = smc_connect_clc(smc, smc_type, &aclc, &ini); if (rc) { - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); + smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); return smc_connect_decline_fallback(smc, rc); } /* depending on previous steps, connect using rdma or ism */ if (rdma_supported && aclc.hdr.path == SMC_TYPE_R) - rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); + rc = smc_connect_rdma(smc, &aclc, &ini); else if (ism_supported && aclc.hdr.path == SMC_TYPE_D) - rc = smc_connect_ism(smc, &aclc, ismdev); + rc = smc_connect_ism(smc, &aclc, &ini); else rc = SMC_CLC_DECL_MODEUNSUPP; if (rc) { - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); + smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); return smc_connect_decline_fallback(smc, rc); } - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); + smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); return 0; } @@ -756,17 +748,30 @@ static void smc_connect_work(struct work_struct *work) { struct smc_sock *smc = container_of(work, struct 
smc_sock, connect_work); - int rc; + long timeo = smc->sk.sk_sndtimeo; + int rc = 0; - lock_sock(&smc->sk); - rc = kernel_connect(smc->clcsock, &smc->connect_info->addr, - smc->connect_info->alen, smc->connect_info->flags); + if (!timeo) + timeo = MAX_SCHEDULE_TIMEOUT; + lock_sock(smc->clcsock->sk); if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; - goto out; - } - if (rc < 0) { - smc->sk.sk_err = -rc; + } else if ((1 << smc->clcsock->sk->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo); + if ((rc == -EPIPE) && + ((1 << smc->clcsock->sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))) + rc = 0; + } + release_sock(smc->clcsock->sk); + lock_sock(&smc->sk); + if (rc != 0 || smc->sk.sk_err) { + smc->sk.sk_state = SMC_CLOSED; + if (rc == -EPIPE || rc == -EAGAIN) + smc->sk.sk_err = EPIPE; + else if (signal_pending(current)) + smc->sk.sk_err = -sock_intr_errno(timeo); goto out; } @@ -779,8 +784,6 @@ out: smc->sk.sk_state_change(&smc->sk); else smc->sk.sk_write_space(&smc->sk); - kfree(smc->connect_info); - smc->connect_info = NULL; release_sock(&smc->sk); } @@ -813,26 +816,18 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; + if (smc->connect_nonblock) { + rc = -EALREADY; + goto out; + } + rc = kernel_connect(smc->clcsock, addr, alen, flags); + if (rc && rc != -EINPROGRESS) + goto out; if (flags & O_NONBLOCK) { - if (smc->connect_info) { - rc = -EALREADY; - goto out; - } - smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL); - if (!smc->connect_info) { - rc = -ENOMEM; - goto out; - } - smc->connect_info->alen = alen; - smc->connect_info->flags = flags ^ O_NONBLOCK; - memcpy(&smc->connect_info->addr, addr, alen); - schedule_work(&smc->connect_work); + if (schedule_work(&smc->connect_work)) + smc->connect_nonblock = 1; rc = -EINPROGRESS; } else { - rc = kernel_connect(smc->clcsock, 
addr, alen, flags); - if (rc) - goto out; - rc = __smc_connect(smc); if (rc < 0) goto out; @@ -1099,7 +1094,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, } /* listen worker: check prefixes */ -static int smc_listen_rdma_check(struct smc_sock *new_smc, +static int smc_listen_prfx_check(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc) { struct smc_clc_msg_proposal_prefix *pclc_prfx; @@ -1107,25 +1102,21 @@ static int smc_listen_rdma_check(struct smc_sock *new_smc, pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (smc_clc_prfx_match(newclcsock, pclc_prfx)) - return SMC_CLC_DECL_CNFERR; + return SMC_CLC_DECL_DIFFPREFIX; return 0; } /* listen worker: initialize connection and buffers */ static int smc_listen_rdma_init(struct smc_sock *new_smc, - struct smc_clc_msg_proposal *pclc, - struct smc_ib_device *ibdev, u8 ibport, - int *local_contact) + struct smc_init_info *ini) { + int rc; + /* allocate connection / link group */ - *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, 0, - &pclc->lcl, NULL, 0); - if (*local_contact < 0) { - if (*local_contact == -ENOMEM) - return SMC_CLC_DECL_MEM;/* insufficient memory*/ - return SMC_CLC_DECL_INTERR; /* other error */ - } + rc = smc_conn_create(new_smc, ini); + if (rc) + return rc; /* create send buffer and rmb */ if (smc_buf_create(new_smc, false)) @@ -1137,33 +1128,30 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, /* listen worker: initialize connection and buffers for SMC-D */ static int smc_listen_ism_init(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, - struct smcd_dev *ismdev, - int *local_contact) + struct smc_init_info *ini) { struct smc_clc_msg_smcd *pclc_smcd; + int rc; pclc_smcd = smc_get_clc_msg_smcd(pclc); - *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, 0, NULL, - ismdev, pclc_smcd->gid); - if (*local_contact < 0) { - if (*local_contact == -ENOMEM) - return SMC_CLC_DECL_MEM;/* insufficient memory*/ - return 
SMC_CLC_DECL_INTERR; /* other error */ - } + ini->ism_gid = pclc_smcd->gid; + rc = smc_conn_create(new_smc, ini); + if (rc) + return rc; /* Check if peer can be reached via ISM device */ if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid, new_smc->conn.lgr->vlan_id, new_smc->conn.lgr->smcd)) { - if (*local_contact == SMC_FIRST_CONTACT) + if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_lgr_forget(new_smc->conn.lgr); smc_conn_free(&new_smc->conn); - return SMC_CLC_DECL_CNFERR; + return SMC_CLC_DECL_SMCDNOTALK; } /* Create send and receive buffers */ if (smc_buf_create(new_smc, true)) { - if (*local_contact == SMC_FIRST_CONTACT) + if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_lgr_forget(new_smc->conn.lgr); smc_conn_free(&new_smc->conn); return SMC_CLC_DECL_MEM; @@ -1227,15 +1215,10 @@ static void smc_listen_work(struct work_struct *work) struct socket *newclcsock = new_smc->clcsock; struct smc_clc_msg_accept_confirm cclc; struct smc_clc_msg_proposal *pclc; - struct smc_ib_device *ibdev; + struct smc_init_info ini = {0}; bool ism_supported = false; - struct smcd_dev *ismdev; u8 buf[SMC_CLC_MAX_LEN]; - int local_contact = 0; - unsigned short vlan; - int reason_code = 0; int rc = 0; - u8 ibport; if (new_smc->use_fallback) { smc_listen_out_connected(new_smc); @@ -1254,17 +1237,26 @@ static void smc_listen_work(struct work_struct *work) * wait for and receive SMC Proposal CLC message */ pclc = (struct smc_clc_msg_proposal *)&buf; - reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, - SMC_CLC_PROPOSAL, CLC_WAIT_TIME); - if (reason_code) { - smc_listen_decline(new_smc, reason_code, 0); - return; - } + rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, + SMC_CLC_PROPOSAL, CLC_WAIT_TIME); + if (rc) + goto out_decl; /* IPSec connections opt out of SMC-R optimizations */ if (using_ipsec(new_smc)) { - smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0); - return; + rc = SMC_CLC_DECL_IPSEC; + goto out_decl; + } + + /* check for matching IP prefix and subnet 
length */ + rc = smc_listen_prfx_check(new_smc, pclc); + if (rc) + goto out_decl; + + /* get vlan id from IP device */ + if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) { + rc = SMC_CLC_DECL_GETVLANERR; + goto out_decl; } mutex_lock(&smc_server_lgr_pending); @@ -1273,59 +1265,73 @@ static void smc_listen_work(struct work_struct *work) smc_tx_init(new_smc); /* check if ISM is available */ - if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) && - !smc_check_ism(new_smc, &ismdev) && - !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) { - ism_supported = true; + if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) { + ini.is_smcd = true; /* prepare ISM check */ + rc = smc_find_ism_device(new_smc, &ini); + if (!rc) + rc = smc_listen_ism_init(new_smc, pclc, &ini); + if (!rc) + ism_supported = true; + else if (pclc->hdr.path == SMC_TYPE_D) + goto out_unlock; /* skip RDMA and decline */ } /* check if RDMA is available */ - if (!ism_supported && - ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) || - smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) || - smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) || - smc_listen_rdma_check(new_smc, pclc) || - smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, - &local_contact) || - smc_listen_rdma_reg(new_smc, local_contact))) { - /* SMC not supported, decline */ - mutex_unlock(&smc_server_lgr_pending); - smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP, - local_contact); - return; + if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */ + /* prepare RDMA check; keep vlan_id determined above */ + ini = (struct smc_init_info){ .vlan_id = ini.vlan_id }; + ini.is_smcd = false; + ini.ib_lcl = &pclc->lcl; + rc = smc_find_rdma_device(new_smc, &ini); + if (rc) { + /* no RDMA device found */ + if (pclc->hdr.path == SMC_TYPE_B) + /* neither ISM nor RDMA device found */ + rc = SMC_CLC_DECL_NOSMCDEV; + goto out_unlock; + } + rc = smc_listen_rdma_init(new_smc, &ini); + if (rc) + goto out_unlock; + rc = smc_listen_rdma_reg(new_smc, 
ini.cln_first_contact); + if (rc) + goto out_unlock; } /* send SMC Accept CLC message */ - rc = smc_clc_send_accept(new_smc, local_contact); - if (rc) { - mutex_unlock(&smc_server_lgr_pending); - smc_listen_decline(new_smc, rc, local_contact); - return; - } + rc = smc_clc_send_accept(new_smc, ini.cln_first_contact); + if (rc) + goto out_unlock; /* SMC-D does not need this lock any more */ if (ism_supported) mutex_unlock(&smc_server_lgr_pending); /* receive SMC Confirm CLC message */ - reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), - SMC_CLC_CONFIRM, CLC_WAIT_TIME); - if (reason_code) { + rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), + SMC_CLC_CONFIRM, CLC_WAIT_TIME); + if (rc) { if (!ism_supported) - mutex_unlock(&smc_server_lgr_pending); - smc_listen_decline(new_smc, reason_code, local_contact); - return; + goto out_unlock; + goto out_decl; } /* finish worker */ if (!ism_supported) { - rc = smc_listen_rdma_finish(new_smc, &cclc, local_contact); + rc = smc_listen_rdma_finish(new_smc, &cclc, + ini.cln_first_contact); mutex_unlock(&smc_server_lgr_pending); if (rc) return; } smc_conn_save_peer_info(new_smc, &cclc); smc_listen_out_connected(new_smc); + return; + +out_unlock: + mutex_unlock(&smc_server_lgr_pending); +out_decl: + smc_listen_decline(new_smc, rc, ini.cln_first_contact); } static void smc_tcp_listen_work(struct work_struct *work) @@ -1571,8 +1577,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; struct smc_sock *smc; + __poll_t mask = 0; if (!sk) return EPOLLNVAL; @@ -1582,8 +1588,6 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, /* delegate to CLC child sock */ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; - if (sk->sk_err) - mask |= EPOLLERR; } else { if (sk->sk_state != SMC_CLOSED) sock_poll_wait(file, sock, wait); @@ -1594,9 +1598,14 @@ static __poll_t smc_poll(struct file 
*file, struct socket *sock, mask |= EPOLLHUP; if (sk->sk_state == SMC_LISTEN) { /* woken up by sk_data_ready in smc_listen_work() */ - mask = smc_accept_poll(sk); + mask |= smc_accept_poll(sk); + } else if (smc->use_fallback) { /* as result of connect_work()*/ + mask |= smc->clcsock->ops->poll(file, smc->clcsock, + wait); + sk->sk_err = smc->clcsock->sk->sk_err; } else { - if (atomic_read(&smc->conn.sndbuf_space) || + if ((sk->sk_state != SMC_INIT && + atomic_read(&smc->conn.sndbuf_space)) || sk->sk_shutdown & SEND_SHUTDOWN) { mask |= EPOLLOUT | EPOLLWRNORM; } else { diff --git a/net/smc/smc.h b/net/smc/smc.h index adbdf195eb08..878313f8d6c1 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -190,18 +190,11 @@ struct smc_connection { u64 peer_token; /* SMC-D token of peer */ }; -struct smc_connect_info { - int flags; - int alen; - struct sockaddr addr; -}; - struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ - struct smc_connect_info *connect_info; /* connect address & flags */ struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ @@ -219,6 +212,10 @@ struct smc_sock { /* smc sock container */ * started, waiting for unsent * data to be sent */ + u8 connect_nonblock : 1; + /* non-blocking connect in + * flight + */ struct mutex clcsock_release_lock; /* protects clcsock of a listen * socket diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index d53fd588d1f5..745afd82f281 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -385,8 +385,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) /* send CLC PROPOSAL message across internal TCP socket */ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, - struct smc_ib_device *ibdev, u8 ibport, 
u8 gid[], - struct smcd_dev *ismdev) + struct smc_init_info *ini) { struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; struct smc_clc_msg_proposal_prefix pclc_prfx; @@ -416,8 +415,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, /* add SMC-R specifics */ memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&pclc.lcl.gid, gid, SMC_GID_SIZE); - memcpy(&pclc.lcl.mac, &ibdev->mac[ibport - 1], ETH_ALEN); + memcpy(&pclc.lcl.gid, ini->ib_gid, SMC_GID_SIZE); + memcpy(&pclc.lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1], + ETH_ALEN); pclc.iparea_offset = htons(0); } if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { @@ -425,7 +425,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, memset(&pclc_smcd, 0, sizeof(pclc_smcd)); plen += sizeof(pclc_smcd); pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET); - pclc_smcd.gid = ismdev->local_gid; + pclc_smcd.gid = ini->ism_dev->local_gid; } pclc.hdr.length = htons(plen); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 24658e8c0de4..ca209272e5fa 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -34,16 +34,22 @@ #define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */ #define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */ #define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */ -#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found */ +#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */ +#define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */ +#define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */ +#define SMC_CLC_DECL_SMCDNOTALK 0x03030003 /* SMC-D dev can't talk to peer */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ +#define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP 
prefix / subnet mismatch */ +#define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ +#define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ -#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */ -#define SMC_CLC_DECL_ERR_RTOK 0x99990001 /* rtoken handling failed */ -#define SMC_CLC_DECL_ERR_RDYLNK 0x99990002 /* ib ready link failed */ -#define SMC_CLC_DECL_ERR_REGRMB 0x99990003 /* reg rmb failed */ +#define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ +#define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ +#define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ +#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ struct smc_clc_msg_hdr { /* header1 of clc messages */ u8 eyecatcher[4]; /* eye catcher */ @@ -179,6 +185,7 @@ smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop) } struct smcd_dev; +struct smc_init_info; int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop); @@ -186,8 +193,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type, unsigned long timeout); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, - struct smc_ib_device *smcibdev, u8 ibport, u8 gid[], - struct smcd_dev *ismdev); + struct smc_init_info *ini); int smc_clc_send_confirm(struct smc_sock *smc); int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 53a17cfa61af..2d2850adc2a3 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -195,10 +195,7 @@ static void smc_lgr_free_work(struct work_struct *work) } /* create a new SMC link group */ -static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, - struct 
smc_ib_device *smcibdev, u8 ibport, - char *peer_systemid, unsigned short vlan_id, - struct smcd_dev *smcismdev, u64 peer_gid) +static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; struct smc_link *lnk; @@ -206,20 +203,21 @@ static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, int rc = 0; int i; - if (is_smcd && vlan_id) { - rc = smc_ism_get_vlan(smcismdev, vlan_id); - if (rc) + if (ini->is_smcd && ini->vlan_id) { + if (smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) { + rc = SMC_CLC_DECL_ISMVLANERR; goto out; + } } lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { - rc = -ENOMEM; + rc = SMC_CLC_DECL_MEM; goto out; } - lgr->is_smcd = is_smcd; + lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; - lgr->vlan_id = vlan_id; + lgr->vlan_id = ini->vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); @@ -231,29 +229,32 @@ static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); lgr->conns_all = RB_ROOT; - if (is_smcd) { + if (ini->is_smcd) { /* SMC-D specific settings */ - lgr->peer_gid = peer_gid; - lgr->smcd = smcismdev; + lgr->peer_gid = ini->ism_gid; + lgr->smcd = ini->ism_dev; } else { /* SMC-R specific settings */ lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; - memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); + memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, + SMC_SYSTEMID_LEN); lnk = &lgr->lnk[SMC_SINGLE_LINK]; /* initialize link */ lnk->state = SMC_LNK_ACTIVATING; lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = smcibdev; - lnk->ibport = ibport; - lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; - if (!smcibdev->initialized) - smc_ib_setup_per_ibdev(smcibdev); + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + lnk->path_mtu = + ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + if (!ini->ib_dev->initialized) + smc_ib_setup_per_ibdev(ini->ib_dev); get_random_bytes(rndvec, sizeof(rndvec)); lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, - vlan_id, lnk->gid, &lnk->sgid_index); + ini->vlan_id, lnk->gid, + &lnk->sgid_index); if (rc) goto free_lgr; rc = smc_llc_link_init(lnk); @@ -289,6 +290,12 @@ clear_llc_lnk: free_lgr: kfree(lgr); out: + if (rc < 0) { + if (rc == -ENOMEM) + rc = SMC_CLC_DECL_MEM; + else + rc = SMC_CLC_DECL_INTERR; + } return rc; } @@ -528,13 +535,13 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) /* Determine vlan of internal TCP socket. 
* @vlan_id: address to store the determined vlan id into */ -int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) +int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(clcsock->sk); struct net_device *ndev; int i, nest_lvl, rc = 0; - *vlan_id = 0; + ini->vlan_id = 0; if (!dst) { rc = -ENOTCONN; goto out; @@ -546,7 +553,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) ndev = dst->dev; if (is_vlan_dev(ndev)) { - *vlan_id = vlan_dev_vlan_id(ndev); + ini->vlan_id = vlan_dev_vlan_id(ndev); goto out_rel; } @@ -560,7 +567,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) lower = lower->next; ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower); if (is_vlan_dev(ndev)) { - *vlan_id = vlan_dev_vlan_id(ndev); + ini->vlan_id = vlan_dev_vlan_id(ndev); break; } } @@ -594,24 +601,16 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, } /* create a new SMC connection (and a new link group if necessary) */ -int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, - struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn, - struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, - u64 peer_gid) +int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; - int local_contact = SMC_FIRST_CONTACT; struct smc_link_group *lgr; - unsigned short vlan_id; enum smc_lgr_role role; int rc = 0; + ini->cln_first_contact = SMC_FIRST_CONTACT; role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; - rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id); - if (rc) - return rc; - - if ((role == SMC_CLNT) && srv_first_contact) + if (role == SMC_CLNT && ini->srv_first_contact) /* create new link group as well */ goto create; @@ -619,14 +618,15 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry(lgr, &smc_lgr_list.list, list) { write_lock_bh(&lgr->conns_lock); - if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) : - smcr_lgr_match(lgr, lcl, role, clcqpn)) && + if ((ini->is_smcd ? + smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) : + smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) && !lgr->sync_err && - lgr->vlan_id == vlan_id && + lgr->vlan_id == ini->vlan_id && (role == SMC_CLNT || lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { /* link group found */ - local_contact = SMC_REUSE_CONTACT; + ini->cln_first_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; smc_lgr_register_conn(conn); /* add smc conn to lgr */ if (delayed_work_pending(&lgr->free_work)) @@ -638,19 +638,18 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, } spin_unlock_bh(&smc_lgr_list.lock); - if (role == SMC_CLNT && !srv_first_contact && - (local_contact == SMC_FIRST_CONTACT)) { + if (role == SMC_CLNT && !ini->srv_first_contact && + ini->cln_first_contact == SMC_FIRST_CONTACT) { /* Server reuses a link group, but Client wants to start * a new one * send out_of_sync decline, reason synchr. 
error */ - return -ENOLINK; + return SMC_CLC_DECL_SYNCERR; } create: - if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport, - lcl->id_for_peer, vlan_id, smcd, peer_gid); + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + rc = smc_lgr_create(smc, ini); if (rc) goto out; smc_lgr_register_conn(conn); /* add smc conn to lgr */ @@ -658,7 +657,7 @@ create: conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; - if (is_smcd) { + if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ } @@ -667,7 +666,7 @@ create: #endif out: - return rc ? rc : local_contact; + return rc; } /* convert the RMB size into the compressed notation - minimum 16K. diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8806d2afa6ed..c00ac61dc129 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -229,6 +229,24 @@ struct smc_link_group { }; }; +struct smc_clc_msg_local; + +struct smc_init_info { + u8 is_smcd; + unsigned short vlan_id; + int srv_first_contact; + int cln_first_contact; + /* SMC-R */ + struct smc_clc_msg_local *ib_lcl; + struct smc_ib_device *ib_dev; + u8 ib_gid[SMC_GID_SIZE]; + u8 ib_port; + u32 ib_clcqpn; + /* SMC-D */ + u64 ism_gid; + struct smcd_dev *ism_dev; +}; + /* Find the connection associated with the given alert token in the link group. * To use rbtrees we have to implement our own search core. 
* Requires @conns_lock @@ -281,13 +299,10 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); void smc_rmb_sync_sg_for_device(struct smc_connection *conn); -int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id); +int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); -int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, - struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn, - struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, - u64 peer_gid); +int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); void smcd_conn_free(struct smc_connection *conn); void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); void smc_core_exit(void); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 3cdf81cf97a3..2b246b94a3af 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -26,6 +26,7 @@ #include "smc_pnet.h" #include "smc_ib.h" #include "smc_ism.h" +#include "smc_core.h" #define SMC_ASCII_BLANK 32 @@ -755,8 +756,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, * IB device and port */ static void smc_pnet_find_rdma_dev(struct net_device *netdev, - struct smc_ib_device **smcibdev, - u8 *ibport, unsigned short vlan_id, u8 gid[]) + struct smc_init_info *ini) { struct smc_ib_device *ibdev; @@ -776,10 +776,10 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && - !smc_ib_determine_gid(ibdev, i, vlan_id, gid, - NULL)) { - *smcibdev = ibdev; - *ibport = i; + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; break; } } @@ -794,9 +794,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, * If nothing found, try to use 
handshake device */ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, - struct smc_ib_device **smcibdev, - u8 *ibport, unsigned short vlan_id, - u8 gid[]) + struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smc_ib_device *ibdev; @@ -806,7 +804,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { - smc_pnet_find_rdma_dev(ndev, smcibdev, ibport, vlan_id, gid); + smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } @@ -817,10 +815,10 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, continue; if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && smc_ib_port_active(ibdev, i) && - !smc_ib_determine_gid(ibdev, i, vlan_id, gid, - NULL)) { - *smcibdev = ibdev; - *ibport = i; + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; goto out; } } @@ -830,7 +828,7 @@ out: } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, - struct smcd_dev **smcismdev) + struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smcd_dev *ismdev; @@ -844,7 +842,7 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, spin_lock(&smcd_dev_list.lock); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { if (smc_pnet_match(ismdev->pnetid, ndev_pnetid)) { - *smcismdev = ismdev; + ini->ism_dev = ismdev; break; } } @@ -855,21 +853,18 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, * determine ib_device and port belonging to used internal TCP socket * ethernet interface. 
*/ -void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport, - unsigned short vlan_id, u8 gid[]) +void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); - *smcibdev = NULL; - *ibport = 0; - + ini->ib_dev = NULL; + ini->ib_port = 0; if (!dst) goto out; if (!dst->dev) goto out_rel; - smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport, vlan_id, gid); + smc_pnet_find_roce_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); @@ -877,17 +872,17 @@ out: return; } -void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev) +void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); - *smcismdev = NULL; + ini->ism_dev = NULL; if (!dst) goto out; if (!dst->dev) goto out_rel; - smc_pnet_find_ism_by_pnetid(dst->dev, smcismdev); + smc_pnet_find_ism_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 5eac42fb45d0..4564e4d69c2e 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -18,6 +18,7 @@ struct smc_ib_device; struct smcd_dev; +struct smc_init_info; /** * struct smc_pnettable - SMC PNET table anchor @@ -43,9 +44,7 @@ int smc_pnet_init(void) __init; int smc_pnet_net_init(struct net *net); void smc_pnet_exit(void); void smc_pnet_net_exit(struct net *net); -void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport, - unsigned short vlan_id, u8 gid[]); -void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev); +void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); +void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); #endif diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 860dcfb95ee4..68a0885b9319 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -299,7 +299,7 @@ static int 
__strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, break; } - /* Positive extra indicates ore bytes than needed for the + /* Positive extra indicates more bytes than needed for the * message */ diff --git a/net/tipc/node.c b/net/tipc/node.c index 3469b5d4ed32..7478e2d4ec02 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -375,14 +375,15 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ - write_lock_bh(&n->lock); + tipc_node_write_lock(n); n->capabilities = capabilities; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { l = n->links[bearer_id].link; if (l) tipc_link_update_caps(l, capabilities); } - write_unlock_bh(&n->lock); + tipc_node_write_unlock_fast(n); + /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { |