Diffstat (limited to 'net')
72 files changed, 1132 insertions, 637 deletions
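
Two patterns recur in the 9p hunks that open this diff: mount options emitted via seq_printf() gain a leading comma so concatenated options in /proc/mounts stay parseable, and waits for server replies move from wait_event_interruptible() to wait_event_killable(), so only a fatal signal (e.g. SIGKILL) can abandon an in-flight request. A minimal sketch of the killable-wait pattern follows; demo_req and REQ_STATUS_RCVD's value are illustrative stand-ins, not the real struct p9_req_t:

#include <linux/wait.h>
#include <linux/errno.h>

#define REQ_STATUS_RCVD	4	/* illustrative status value */

struct demo_req {
	wait_queue_head_t wq;	/* woken when the reply arrives */
	int status;
};

static int demo_wait_for_reply(struct demo_req *req)
{
	int err;

	/*
	 * wait_event_killable() sleeps until the condition holds or a
	 * fatal signal is pending; unlike wait_event_interruptible(),
	 * a mere SIGINT no longer aborts a request that the server may
	 * already be processing.
	 */
	err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD);
	if (err)	/* -ERESTARTSYS: fatal signal arrived first */
		return err;
	return 0;
}
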
diff --git a/net/9p/client.c b/net/9p/client.c index 4674235b0d9b..b433aff5ff13 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -82,7 +82,7 @@ int p9_show_client_options(struct seq_file *m, struct p9_client *clnt) { if (clnt->msize != 8192) seq_printf(m, ",msize=%u", clnt->msize); - seq_printf(m, "trans=%s", clnt->trans_mod->name); + seq_printf(m, ",trans=%s", clnt->trans_mod->name); switch (clnt->proto_version) { case p9_proto_legacy: @@ -773,8 +773,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) } again: /* Wait for the response */ - err = wait_event_interruptible(*req->wq, - req->status >= REQ_STATUS_RCVD); + err = wait_event_killable(*req->wq, req->status >= REQ_STATUS_RCVD); /* * Make sure our req is coherent with regard to updates in other diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 903a190319b9..985046ae4231 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -724,12 +724,12 @@ static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt) { if (clnt->trans_mod == &p9_tcp_trans) { if (clnt->trans_opts.tcp.port != P9_PORT) - seq_printf(m, "port=%u", clnt->trans_opts.tcp.port); + seq_printf(m, ",port=%u", clnt->trans_opts.tcp.port); } else if (clnt->trans_mod == &p9_fd_trans) { if (clnt->trans_opts.fd.rfd != ~0) - seq_printf(m, "rfd=%u", clnt->trans_opts.fd.rfd); + seq_printf(m, ",rfd=%u", clnt->trans_opts.fd.rfd); if (clnt->trans_opts.fd.wfd != ~0) - seq_printf(m, "wfd=%u", clnt->trans_opts.fd.wfd); + seq_printf(m, ",wfd=%u", clnt->trans_opts.fd.wfd); } return 0; } diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index f24b25c25106..f3a4efcf1456 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -286,8 +286,8 @@ req_retry: if (err == -ENOSPC) { chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); - err = wait_event_interruptible(*chan->vc_wq, - chan->ring_bufs_avail); + err = wait_event_killable(*chan->vc_wq, + chan->ring_bufs_avail); if (err == -ERESTARTSYS) return err; @@ -327,7 +327,7 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, * Other zc request to finish here */ if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { - err = wait_event_interruptible(vp_wq, + err = wait_event_killable(vp_wq, (atomic_read(&vp_pinned) < chan->p9_max_pages)); if (err == -ERESTARTSYS) return err; @@ -471,8 +471,8 @@ req_retry_pinned: if (err == -ENOSPC) { chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); - err = wait_event_interruptible(*chan->vc_wq, - chan->ring_bufs_avail); + err = wait_event_killable(*chan->vc_wq, + chan->ring_bufs_avail); if (err == -ERESTARTSYS) goto err_out; @@ -489,8 +489,7 @@ req_retry_pinned: virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); - err = wait_event_interruptible(*req->wq, - req->status >= REQ_STATUS_RCVD); + err = wait_event_killable(*req->wq, req->status >= REQ_STATUS_RCVD); /* * Non kernel buffers are pinned, unpin them */ diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 6ad3e043c617..325c56043007 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -156,8 +156,8 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) ring = &priv->rings[num]; again: - while (wait_event_interruptible(ring->wq, - p9_xen_write_todo(ring, size)) != 0) + while (wait_event_killable(ring->wq, + p9_xen_write_todo(ring, size)) != 0) ; spin_lock_irqsave(&ring->lock, flags); diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c index 
67bb1f11e613..9a5850f264ed 100644 --- a/net/ceph/ceph_hash.c +++ b/net/ceph/ceph_hash.c @@ -47,28 +47,38 @@ unsigned int ceph_str_hash_rjenkins(const char *str, unsigned int length) /* handle the last 11 bytes */ c = c + length; - switch (len) { /* all the case statements fall through */ + switch (len) { case 11: c = c + ((__u32)k[10] << 24); + /* fall through */ case 10: c = c + ((__u32)k[9] << 16); + /* fall through */ case 9: c = c + ((__u32)k[8] << 8); /* the first byte of c is reserved for the length */ + /* fall through */ case 8: b = b + ((__u32)k[7] << 24); + /* fall through */ case 7: b = b + ((__u32)k[6] << 16); + /* fall through */ case 6: b = b + ((__u32)k[5] << 8); + /* fall through */ case 5: b = b + k[4]; + /* fall through */ case 4: a = a + ((__u32)k[3] << 24); + /* fall through */ case 3: a = a + ((__u32)k[2] << 16); + /* fall through */ case 2: a = a + ((__u32)k[1] << 8); + /* fall through */ case 1: a = a + k[0]; /* case 0: nothing left to add */ diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 489610ac1cdd..bf9d079cbafd 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -37,7 +37,9 @@ static int set_secret(struct ceph_crypto_key *key, void *buf) return -ENOTSUPP; } - WARN_ON(!key->len); + if (!key->len) + return -EINVAL; + key->key = kmemdup(buf, key->len, GFP_NOIO); if (!key->key) { ret = -ENOMEM; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index ad93342c90d7..8a4d3758030b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -430,6 +430,7 @@ static void ceph_sock_state_change(struct sock *sk) switch (sk->sk_state) { case TCP_CLOSE: dout("%s TCP_CLOSE\n", __func__); + /* fall through */ case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 9ae1bab8c05d..1547107f4854 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1279,9 +1279,10 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, /* * Older OSDs don't set reply tid even if the orignal - * request had a non-zero tid. Workaround this weirdness - * by falling through to the allocate case. + * request had a non-zero tid. Work around this weirdness + * by allocating a new message. */ + /* fall through */ case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index ee43bc13221c..a3d0adc828e6 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -25,9 +25,9 @@ struct page **ceph_get_direct_page_vector(const void __user *data, return ERR_PTR(-ENOMEM); while (got < num_pages) { - rc = get_user_pages_unlocked( + rc = get_user_pages_fast( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), - num_pages - got, pages + got, write_page ? 
FOLL_WRITE : 0); + num_pages - got, write_page, pages + got); if (rc < 0) break; BUG_ON(rc == 0); diff --git a/net/core/dev.c b/net/core/dev.c index 8ee29f4f5fa9..07ed21d64f92 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2746,7 +2746,8 @@ EXPORT_SYMBOL(skb_mac_gso_segment); static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL; + return skb->ip_summed != CHECKSUM_PARTIAL && + skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } @@ -7139,13 +7140,17 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; - if (bpf_op == ops->ndo_bpf) - prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, - dev); - else - prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + bpf_op == ops->ndo_bpf); if (IS_ERR(prog)) return PTR_ERR(prog); + + if (!(flags & XDP_FLAGS_HW_MODE) && + bpf_prog_is_dev_bound(prog->aux)) { + NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported"); + bpf_prog_put(prog); + return -EINVAL; + } } err = dev_xdp_install(dev, bpf_op, extack, flags, prog); diff --git a/net/core/filter.c b/net/core/filter.c index 1afa17935954..6a85e67fafce 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1646,9 +1646,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM_OR_NULL, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ce4aa827be05..f00499a46927 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1223,9 +1223,10 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { - bool fixedid = false, gso_partial, encap; + bool udpfrag = false, fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; + unsigned int offset = 0; struct iphdr *iph; int proto, tot_len; int nhoff; @@ -1260,6 +1261,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); if (!skb->encapsulation || encap) { + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); /* fixed ID is invalid if DF bit is not set */ @@ -1279,7 +1281,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); - if (skb_is_gso(skb)) { + if (udpfrag) { + iph->frag_off = htons(offset >> 3); + if (skb->next) + iph->frag_off |= htons(IP_MF); + offset += skb->len - nhoff - ihl; + tot_len = skb->len - nhoff; + } else if (skb_is_gso(skb)) { if (!fixedid) { iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3b427757b1f8..43b69af242e1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -651,9 +651,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; struct rtable *rt; + u32 genid, hval; unsigned int i; int depth; - u32 hval = fnhe_hashfun(daddr); + + genid = fnhe_genid(dev_net(nh->nh_dev)); + hval = fnhe_hashfun(daddr); 
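
To make the route.c exception handling easier to follow, here is a hedged, simplified sketch (demo_fnhe and demo_update_fnhe are illustrative names, not the kernel code) of the update flow once this hunk and the one below are applied: the generation id is sampled once before taking the lock, an existing entry with a stale genid is revalidated, and the expiry is refreshed even when only the gateway changed:

#include <linux/types.h>
#include <linux/kernel.h>

struct demo_fnhe {
	u32 genid;
	__be32 gw;
	u32 pmtu;
	unsigned long expires;
};

static void demo_update_fnhe(struct demo_fnhe *fnhe, u32 genid, __be32 gw,
			     u32 pmtu, unsigned long expires)
{
	/* A genid bump elsewhere invalidated this entry; revalidate it. */
	if (fnhe->genid != genid)
		fnhe->genid = genid;
	if (gw)
		fnhe->gw = gw;
	if (pmtu)
		fnhe->pmtu = pmtu;
	/* Refresh the expiry unconditionally, not only on PMTU updates. */
	fnhe->expires = max(1UL, expires);
}
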
spin_lock_bh(&fnhe_lock); @@ -676,12 +679,13 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, } if (fnhe) { + if (fnhe->fnhe_genid != genid) + fnhe->fnhe_genid = genid; if (gw) fnhe->fnhe_gw = gw; - if (pmtu) { + if (pmtu) fnhe->fnhe_pmtu = pmtu; - fnhe->fnhe_expires = max(1UL, expires); - } + fnhe->fnhe_expires = max(1UL, expires); /* Update all cached dsts too */ rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) @@ -700,7 +704,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_next = hash->chain; rcu_assign_pointer(hash->chain, fnhe); } - fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev)); + fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; fnhe->fnhe_pmtu = pmtu; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f844c06c0676..734cfc8ff76e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2964,7 +2964,7 @@ void tcp_rearm_rto(struct sock *sk) /* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ static void tcp_set_xmit_timer(struct sock *sk) { - if (!tcp_schedule_loss_probe(sk)) + if (!tcp_schedule_loss_probe(sk, true)) tcp_rearm_rto(sk); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 540b7d92cc70..a4d214c7b506 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2391,7 +2391,7 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) - tcp_schedule_loss_probe(sk); + tcp_schedule_loss_probe(sk, false); is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); tcp_cwnd_validate(sk, is_cwnd_limited); return false; @@ -2399,7 +2399,7 @@ repair: return !tp->packets_out && !tcp_write_queue_empty(sk); } -bool tcp_schedule_loss_probe(struct sock *sk) +bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2440,7 +2440,9 @@ bool tcp_schedule_loss_probe(struct sock *sk) } /* If the RTO formula yields an earlier time, then use that time. */ - rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */ + rto_delta_us = advancing_rto ? + jiffies_to_usecs(inet_csk(sk)->icsk_rto) : + tcp_rto_delta_us(sk); /* How far in future is RTO? */ if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e360d55be555..01801b77bd0d 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -187,16 +187,57 @@ out_unlock: } EXPORT_SYMBOL(skb_udp_tunnel_segment); -static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + __wsum csum; + struct udphdr *uh; + struct iphdr *iph; if (skb->encapsulation && (skb_shinfo(skb)->gso_type & - (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { segs = skb_udp_tunnel_segment(skb, features, false); + goto out; + } + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + /* Do software UFO. Complete and fill in the UDP checksum as + * HW cannot do checksum of UDP packets sent as multiple + * IP fragments. 
+ */ + uh = udp_hdr(skb); + iph = ip_hdr(skb); + + uh->check = 0; + csum = skb_checksum(skb, 0, skb->len, 0); + uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + + /* Fragment the skb. IP headers of the fragments are updated in + * inet_gso_segment() + */ + segs = skb_segment(skb, features); +out: return segs; } @@ -330,7 +371,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv4_offload = { .callbacks = { - .gso_segment = udp4_tunnel_segment, + .gso_segment = udp4_ufo_fragment, .gro_receive = udp4_gro_receive, .gro_complete = udp4_gro_complete, }, diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b90bad7a4e56..4cfd8e0696fe 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -460,7 +460,7 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) &ipv6h->saddr, &ipv6h->daddr, tpi->key, tpi->proto); if (tunnel) { - ip6_tnl_rcv(tunnel, skb, tpi, NULL, false); + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); return PACKET_RCVD; } diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 4a7e5ffa5108..4fe7c90962dd 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -31,6 +31,37 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, return id; } +/* This function exists only for tap drivers that must support broken + * clients requesting UFO without specifying an IPv6 fragment ID. + * + * This is similar to ipv6_select_ident() but we use an independent hash + * seed to limit information leakage. + * + * The network header must be set before calling this. 
+ */ +__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) +{ + static u32 ip6_proxy_idents_hashrnd __read_mostly; + struct in6_addr buf[2]; + struct in6_addr *addrs; + u32 id; + + addrs = skb_header_pointer(skb, + skb_network_offset(skb) + + offsetof(struct ipv6hdr, saddr), + sizeof(buf), buf); + if (!addrs) + return 0; + + net_get_random_once(&ip6_proxy_idents_hashrnd, + sizeof(ip6_proxy_idents_hashrnd)); + + id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, + &addrs[1], &addrs[0]); + return htonl(id); +} +EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); + __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 05eb7bc36156..7a8d1500d374 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -472,6 +472,11 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, &match->rt6i_siblings, rt6i_siblings) { route_choosen--; if (route_choosen == 0) { + struct inet6_dev *idev = sibling->rt6i_idev; + + if (!netif_carrier_ok(sibling->dst.dev) && + idev->cnf.ignore_routes_with_linkdown) + break; if (rt6_score_route(sibling, oif, strict) < 0) break; match = sibling; @@ -1019,7 +1024,7 @@ static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) { struct net_device *dev = rt->dst.dev; - if (rt->rt6i_flags & RTF_LOCAL) { + if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the * device if it is a master device, the master device if * device is enslaved, and the loopback as the default diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 455fd4e39333..a0f89ad76f9d 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -17,15 +17,94 @@ #include <net/ip6_checksum.h> #include "ip6_offload.h" -static struct sk_buff *udp6_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + unsigned int unfrag_ip6hlen, unfrag_len; + struct frag_hdr *fptr; + u8 *packet_start, *prevhdr; + u8 nexthdr; + u8 frag_hdr_sz = sizeof(struct frag_hdr); + __wsum csum; + int tnl_hlen; + int err; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) segs = skb_udp_tunnel_segment(skb, features, true); + else { + const struct ipv6hdr *ipv6h; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot + * do checksum of UDP packets sent as multiple IP fragments. + */ + + uh = udp_hdr(skb); + ipv6h = ipv6_hdr(skb); + + uh->check = 0; + csum = skb_checksum(skb, 0, skb->len, 0); + uh->check = udp_v6_check(skb->len, &ipv6h->saddr, + &ipv6h->daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + + /* Check if there is enough headroom to insert fragment header. 
*/ + tnl_hlen = skb_tnl_header_len(skb); + if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) { + if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) + goto out; + } + + /* Find the unfragmentable header and shift it left by frag_hdr_sz + * bytes to insert fragment header. + */ + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + return ERR_PTR(err); + unfrag_ip6hlen = err; + nexthdr = *prevhdr; + *prevhdr = NEXTHDR_FRAGMENT; + unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + + unfrag_ip6hlen + tnl_hlen; + packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; + memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); + + SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; + skb->mac_header -= frag_hdr_sz; + skb->network_header -= frag_hdr_sz; + + fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); + fptr->nexthdr = nexthdr; + fptr->reserved = 0; + fptr->identification = ipv6_proxy_select_ident(dev_net(skb->dev), skb); + + /* Fragment the skb. ipv6 header and the remaining fields of the + * fragment header are updated in ipv6_gso_segment() + */ + segs = skb_segment(skb, features); + } +out: return segs; } @@ -75,7 +154,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv6_offload = { .callbacks = { - .gso_segment = udp6_tunnel_segment, + .gso_segment = udp6_ufo_fragment, .gro_receive = udp6_gro_receive, .gro_complete = udp6_gro_complete, }, diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 88cc1ae935ea..d444752dbf40 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -151,21 +151,17 @@ EXPORT_SYMBOL(ieee80211_stop_rx_ba_session); * After accepting the AddBA Request we activated a timer, * resetting it after each frame that arrives from the originator. 
*/ -static void sta_rx_agg_session_timer_expired(unsigned long data) +static void sta_rx_agg_session_timer_expired(struct timer_list *t) { - /* not an elegant detour, but there is no choice as the timer passes - * only one argument, and various sta_info are needed here, so init - * flow in sta_info_create gives the TID as data, while the timer_to_id - * array gives the sta through container_of */ - u8 *ptid = (u8 *)data; - u8 *timer_to_id = ptid - *ptid; - struct sta_info *sta = container_of(timer_to_id, struct sta_info, - timer_to_tid[0]); + struct tid_ampdu_rx *tid_rx_timer = + from_timer(tid_rx_timer, t, session_timer); + struct sta_info *sta = tid_rx_timer->sta; + u8 tid = tid_rx_timer->tid; struct tid_ampdu_rx *tid_rx; unsigned long timeout; rcu_read_lock(); - tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[*ptid]); + tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); if (!tid_rx) { rcu_read_unlock(); return; @@ -180,21 +176,18 @@ static void sta_rx_agg_session_timer_expired(unsigned long data) rcu_read_unlock(); ht_dbg(sta->sdata, "RX session timer expired on %pM tid %d\n", - sta->sta.addr, (u16)*ptid); + sta->sta.addr, tid); - set_bit(*ptid, sta->ampdu_mlme.tid_rx_timer_expired); + set_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired); ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work); } -static void sta_rx_agg_reorder_timer_expired(unsigned long data) +static void sta_rx_agg_reorder_timer_expired(struct timer_list *t) { - u8 *ptid = (u8 *)data; - u8 *timer_to_id = ptid - *ptid; - struct sta_info *sta = container_of(timer_to_id, struct sta_info, - timer_to_tid[0]); + struct tid_ampdu_rx *tid_rx = from_timer(tid_rx, t, reorder_timer); rcu_read_lock(); - ieee80211_release_reorder_timeout(sta, *ptid); + ieee80211_release_reorder_timeout(tid_rx->sta, tid_rx->tid); rcu_read_unlock(); } @@ -356,14 +349,12 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, spin_lock_init(&tid_agg_rx->reorder_lock); /* rx timer */ - setup_deferrable_timer(&tid_agg_rx->session_timer, - sta_rx_agg_session_timer_expired, - (unsigned long)&sta->timer_to_tid[tid]); + timer_setup(&tid_agg_rx->session_timer, + sta_rx_agg_session_timer_expired, TIMER_DEFERRABLE); /* rx reorder timer */ - setup_timer(&tid_agg_rx->reorder_timer, - sta_rx_agg_reorder_timer_expired, - (unsigned long)&sta->timer_to_tid[tid]); + timer_setup(&tid_agg_rx->reorder_timer, + sta_rx_agg_reorder_timer_expired, 0); /* prepare reordering buffer */ tid_agg_rx->reorder_buf = @@ -399,6 +390,8 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, tid_agg_rx->auto_seq = auto_seq; tid_agg_rx->started = false; tid_agg_rx->reorder_buf_filtered = 0; + tid_agg_rx->tid = tid; + tid_agg_rx->sta = sta; status = WLAN_STATUS_SUCCESS; /* activate it for RX */ diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index bef516ec47f9..5f8ab5be369f 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -330,6 +330,11 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, spin_lock_bh(&sta->lock); + /* free struct pending for start, if present */ + tid_tx = sta->ampdu_mlme.tid_start_tx[tid]; + kfree(tid_tx); + sta->ampdu_mlme.tid_start_tx[tid] = NULL; + tid_tx = rcu_dereference_protected_tid_tx(sta, tid); if (!tid_tx) { spin_unlock_bh(&sta->lock); @@ -422,15 +427,12 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, * add Block Ack response will arrive from the recipient. * If this timer expires sta_addba_resp_timer_expired will be executed. 
*/ -static void sta_addba_resp_timer_expired(unsigned long data) +static void sta_addba_resp_timer_expired(struct timer_list *t) { - /* not an elegant detour, but there is no choice as the timer passes - * only one argument, and both sta_info and TID are needed, so init - * flow in sta_info_create gives the TID as data, while the timer_to_id - * array gives the sta through container_of */ - u16 tid = *(u8 *)data; - struct sta_info *sta = container_of((void *)data, - struct sta_info, timer_to_tid[tid]); + struct tid_ampdu_tx *tid_tx_timer = + from_timer(tid_tx_timer, t, addba_resp_timer); + struct sta_info *sta = tid_tx_timer->sta; + u8 tid = tid_tx_timer->tid; struct tid_ampdu_tx *tid_tx; /* check if the TID waits for addBA response */ @@ -525,21 +527,17 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) * After accepting the AddBA Response we activated a timer, * resetting it after each frame that we send. */ -static void sta_tx_agg_session_timer_expired(unsigned long data) +static void sta_tx_agg_session_timer_expired(struct timer_list *t) { - /* not an elegant detour, but there is no choice as the timer passes - * only one argument, and various sta_info are needed here, so init - * flow in sta_info_create gives the TID as data, while the timer_to_id - * array gives the sta through container_of */ - u8 *ptid = (u8 *)data; - u8 *timer_to_id = ptid - *ptid; - struct sta_info *sta = container_of(timer_to_id, struct sta_info, - timer_to_tid[0]); + struct tid_ampdu_tx *tid_tx_timer = + from_timer(tid_tx_timer, t, session_timer); + struct sta_info *sta = tid_tx_timer->sta; + u8 tid = tid_tx_timer->tid; struct tid_ampdu_tx *tid_tx; unsigned long timeout; rcu_read_lock(); - tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[*ptid]); + tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx || test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) { rcu_read_unlock(); return; @@ -555,9 +553,9 @@ static void sta_tx_agg_session_timer_expired(unsigned long data) rcu_read_unlock(); ht_dbg(sta->sdata, "tx session timer expired on %pM tid %d\n", - sta->sta.addr, (u16)*ptid); + sta->sta.addr, tid); - ieee80211_stop_tx_ba_session(&sta->sta, *ptid); + ieee80211_stop_tx_ba_session(&sta->sta, tid); } int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, @@ -670,16 +668,15 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, __set_bit(HT_AGG_STATE_WANT_START, &tid_tx->state); tid_tx->timeout = timeout; + tid_tx->sta = sta; + tid_tx->tid = tid; /* response timer */ - setup_timer(&tid_tx->addba_resp_timer, - sta_addba_resp_timer_expired, - (unsigned long)&sta->timer_to_tid[tid]); + timer_setup(&tid_tx->addba_resp_timer, sta_addba_resp_timer_expired, 0); /* tx timer */ - setup_deferrable_timer(&tid_tx->session_timer, - sta_tx_agg_session_timer_expired, - (unsigned long)&sta->timer_to_tid[tid]); + timer_setup(&tid_tx->session_timer, + sta_tx_agg_session_timer_expired, TIMER_DEFERRABLE); /* assign a dialog token */ sta->ampdu_mlme.dialog_token_allocator++; diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index e9c6aa3ed05b..db07e0de9a03 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1711,10 +1711,10 @@ void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata) sdata_unlock(sdata); } -static void ieee80211_ibss_timer(unsigned long data) +static void ieee80211_ibss_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.ibss.timer); 
ieee80211_queue_work(&sdata->local->hw, &sdata->work); } @@ -1723,8 +1723,7 @@ void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; - setup_timer(&ifibss->timer, ieee80211_ibss_timer, - (unsigned long) sdata); + timer_setup(&ifibss->timer, ieee80211_ibss_timer, 0); INIT_LIST_HEAD(&ifibss->incomplete_stations); spin_lock_init(&ifibss->incomplete_lock); INIT_WORK(&ifibss->csa_connection_drop_work, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 68f874e73561..885d00b41911 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1057,6 +1057,7 @@ struct tpt_led_trigger { const struct ieee80211_tpt_blink *blink_table; unsigned int blink_table_len; struct timer_list timer; + struct ieee80211_local *local; unsigned long prev_traffic; unsigned long tx_bytes, rx_bytes; unsigned int active, want; @@ -1932,7 +1933,7 @@ static inline int ieee80211_ac_from_tid(int tid) void ieee80211_dynamic_ps_enable_work(struct work_struct *work); void ieee80211_dynamic_ps_disable_work(struct work_struct *work); -void ieee80211_dynamic_ps_timer(unsigned long data); +void ieee80211_dynamic_ps_timer(struct timer_list *t); void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool powersave); diff --git a/net/mac80211/led.c b/net/mac80211/led.c index 0505845b7ab8..ba0b507ea691 100644 --- a/net/mac80211/led.c +++ b/net/mac80211/led.c @@ -248,10 +248,10 @@ static unsigned long tpt_trig_traffic(struct ieee80211_local *local, return DIV_ROUND_UP(delta, 1024 / 8); } -static void tpt_trig_timer(unsigned long data) +static void tpt_trig_timer(struct timer_list *t) { - struct ieee80211_local *local = (void *)data; - struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; + struct tpt_led_trigger *tpt_trig = from_timer(tpt_trig, t, timer); + struct ieee80211_local *local = tpt_trig->local; struct led_classdev *led_cdev; unsigned long on, off, tpt; int i; @@ -306,8 +306,9 @@ __ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, tpt_trig->blink_table = blink_table; tpt_trig->blink_table_len = blink_table_len; tpt_trig->want = flags; + tpt_trig->local = local; - setup_timer(&tpt_trig->timer, tpt_trig_timer, (unsigned long)local); + timer_setup(&tpt_trig->timer, tpt_trig_timer, 0); local->tpt_led_trigger = tpt_trig; @@ -326,7 +327,7 @@ static void ieee80211_start_tpt_led_trig(struct ieee80211_local *local) tpt_trig_traffic(local, tpt_trig); tpt_trig->running = true; - tpt_trig_timer((unsigned long)local); + tpt_trig_timer(&tpt_trig->timer); mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 8aa1f5b6a051..e054a2fd8d38 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -633,8 +633,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, ieee80211_dynamic_ps_enable_work); INIT_WORK(&local->dynamic_ps_disable_work, ieee80211_dynamic_ps_disable_work); - setup_timer(&local->dynamic_ps_timer, - ieee80211_dynamic_ps_timer, (unsigned long) local); + timer_setup(&local->dynamic_ps_timer, ieee80211_dynamic_ps_timer, 0); INIT_WORK(&local->sched_scan_stopped_work, ieee80211_sched_scan_stopped_work); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 7a76c4a6df30..5e27364e10ac 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -37,9 +37,10 @@ void ieee80211s_stop(void) kmem_cache_destroy(rm_cache); } -static void ieee80211_mesh_housekeeping_timer(unsigned long data) 
+static void ieee80211_mesh_housekeeping_timer(struct timer_list *t) { - struct ieee80211_sub_if_data *sdata = (void *) data; + struct ieee80211_sub_if_data *sdata = + from_timer(sdata, t, u.mesh.housekeeping_timer); struct ieee80211_local *local = sdata->local; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; @@ -528,18 +529,18 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata, return 0; } -static void ieee80211_mesh_path_timer(unsigned long data) +static void ieee80211_mesh_path_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.mesh.mesh_path_timer); ieee80211_queue_work(&sdata->local->hw, &sdata->work); } -static void ieee80211_mesh_path_root_timer(unsigned long data) +static void ieee80211_mesh_path_root_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.mesh.mesh_path_root_timer); struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; set_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags); @@ -1442,9 +1443,8 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; static u8 zero_addr[ETH_ALEN] = {}; - setup_timer(&ifmsh->housekeeping_timer, - ieee80211_mesh_housekeeping_timer, - (unsigned long) sdata); + timer_setup(&ifmsh->housekeeping_timer, + ieee80211_mesh_housekeeping_timer, 0); ifmsh->accepting_plinks = true; atomic_set(&ifmsh->mpaths, 0); @@ -1458,12 +1458,9 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) mesh_pathtbl_init(sdata); - setup_timer(&ifmsh->mesh_path_timer, - ieee80211_mesh_path_timer, - (unsigned long) sdata); - setup_timer(&ifmsh->mesh_path_root_timer, - ieee80211_mesh_path_root_timer, - (unsigned long) sdata); + timer_setup(&ifmsh->mesh_path_timer, ieee80211_mesh_path_timer, 0); + timer_setup(&ifmsh->mesh_path_root_timer, + ieee80211_mesh_path_root_timer, 0); INIT_LIST_HEAD(&ifmsh->preq_queue.list); skb_queue_head_init(&ifmsh->ps.bc_buf); spin_lock_init(&ifmsh->mesh_preq_queue_lock); diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 465b7853edc0..ee56f18cad3f 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -296,7 +296,7 @@ void mesh_path_tx_pending(struct mesh_path *mpath); int mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata); void mesh_pathtbl_unregister(struct ieee80211_sub_if_data *sdata); int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr); -void mesh_path_timer(unsigned long data); +void mesh_path_timer(struct timer_list *t); void mesh_path_flush_by_nexthop(struct sta_info *sta); void mesh_path_discard_frame(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 146ec6c0f12f..4f7826d7b47c 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1194,9 +1194,9 @@ endlookup: return err; } -void mesh_path_timer(unsigned long data) +void mesh_path_timer(struct timer_list *t) { - struct mesh_path *mpath = (void *) data; + struct mesh_path *mpath = from_timer(mpath, t, timer); struct ieee80211_sub_if_data *sdata = mpath->sdata; int ret; diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 97269caafecd..86c8dfef56a4 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -399,8 +399,7 @@ struct mesh_path *mesh_path_new(struct ieee80211_sub_if_data *sdata, skb_queue_head_init(&new_mpath->frame_queue); new_mpath->exp_time = jiffies; 
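
The setup_timer() to timer_setup() conversions running through these mac80211 hunks all follow one pattern: the callback now receives the struct timer_list itself and recovers its container with from_timer(), instead of decoding an unsigned long cookie; where both a station and a TID are needed, the containing struct (e.g. tid_ampdu_tx) simply gains sta/tid back-pointers. A minimal self-contained sketch of the pattern, using an illustrative demo_session rather than any kernel structure:

#include <linux/timer.h>
#include <linux/jiffies.h>
#include <linux/printk.h>

struct demo_session {
	struct timer_list session_timer;	/* timer embedded in its owner */
	int tid;
};

/* New-style callback: gets the timer, derives the owning object. */
static void demo_session_expired(struct timer_list *t)
{
	struct demo_session *s = from_timer(s, t, session_timer);

	pr_info("session on tid %d expired\n", s->tid);
}

static void demo_session_init(struct demo_session *s)
{
	/*
	 * Replaces setup_timer(&s->session_timer, cb, (unsigned long)s);
	 * passing TIMER_DEFERRABLE as the flags argument gives the old
	 * setup_deferrable_timer() behaviour.
	 */
	timer_setup(&s->session_timer, demo_session_expired, 0);
	mod_timer(&s->session_timer, jiffies + HZ);
}
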
spin_lock_init(&new_mpath->state_lock); - setup_timer(&new_mpath->timer, mesh_path_timer, - (unsigned long) new_mpath); + timer_setup(&new_mpath->timer, mesh_path_timer, 0); return new_mpath; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index e4ededa1909d..04460440d731 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1066,10 +1066,10 @@ void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success) } EXPORT_SYMBOL(ieee80211_chswitch_done); -static void ieee80211_chswitch_timer(unsigned long data) +static void ieee80211_chswitch_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.mgd.chswitch_timer); ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.chswitch_work); } @@ -1577,9 +1577,9 @@ void ieee80211_dynamic_ps_enable_work(struct work_struct *work) } } -void ieee80211_dynamic_ps_timer(unsigned long data) +void ieee80211_dynamic_ps_timer(struct timer_list *t) { - struct ieee80211_local *local = (void *) data; + struct ieee80211_local *local = from_timer(local, t, dynamic_ps_timer); ieee80211_queue_work(&local->hw, &local->dynamic_ps_enable_work); } @@ -3711,10 +3711,10 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, sdata_unlock(sdata); } -static void ieee80211_sta_timer(unsigned long data) +static void ieee80211_sta_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.mgd.timer); ieee80211_queue_work(&sdata->local->hw, &sdata->work); } @@ -3991,10 +3991,10 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) sdata_unlock(sdata); } -static void ieee80211_sta_bcn_mon_timer(unsigned long data) +static void ieee80211_sta_bcn_mon_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.mgd.bcn_mon_timer); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (sdata->vif.csa_active && !ifmgd->csa_waiting_bcn) @@ -4005,10 +4005,10 @@ static void ieee80211_sta_bcn_mon_timer(unsigned long data) &sdata->u.mgd.beacon_connection_loss_work); } -static void ieee80211_sta_conn_mon_timer(unsigned long data) +static void ieee80211_sta_conn_mon_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - (struct ieee80211_sub_if_data *) data; + from_timer(sdata, t, u.mgd.conn_mon_timer); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; @@ -4139,14 +4139,10 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) INIT_WORK(&ifmgd->request_smps_work, ieee80211_request_smps_mgd_work); INIT_DELAYED_WORK(&ifmgd->tdls_peer_del_work, ieee80211_tdls_peer_del_work); - setup_timer(&ifmgd->timer, ieee80211_sta_timer, - (unsigned long) sdata); - setup_timer(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer, - (unsigned long) sdata); - setup_timer(&ifmgd->conn_mon_timer, ieee80211_sta_conn_mon_timer, - (unsigned long) sdata); - setup_timer(&ifmgd->chswitch_timer, ieee80211_chswitch_timer, - (unsigned long) sdata); + timer_setup(&ifmgd->timer, ieee80211_sta_timer, 0); + timer_setup(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer, 0); + timer_setup(&ifmgd->conn_mon_timer, ieee80211_sta_conn_mon_timer, 0); + timer_setup(&ifmgd->chswitch_timer, ieee80211_chswitch_timer, 0); INIT_DELAYED_WORK(&ifmgd->tx_tspec_wk, ieee80211_sta_handle_tspec_ac_params_wk); diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c index 88e6ebbbe24f..d351dc1162be 
100644 --- a/net/mac80211/ocb.c +++ b/net/mac80211/ocb.c @@ -150,9 +150,10 @@ void ieee80211_ocb_work(struct ieee80211_sub_if_data *sdata) sdata_unlock(sdata); } -static void ieee80211_ocb_housekeeping_timer(unsigned long data) +static void ieee80211_ocb_housekeeping_timer(struct timer_list *t) { - struct ieee80211_sub_if_data *sdata = (void *)data; + struct ieee80211_sub_if_data *sdata = + from_timer(sdata, t, u.ocb.housekeeping_timer); struct ieee80211_local *local = sdata->local; struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; @@ -165,9 +166,8 @@ void ieee80211_ocb_setup_sdata(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; - setup_timer(&ifocb->housekeeping_timer, - ieee80211_ocb_housekeeping_timer, - (unsigned long)sdata); + timer_setup(&ifocb->housekeeping_timer, + ieee80211_ocb_housekeeping_timer, 0); INIT_LIST_HEAD(&ifocb->incomplete_stations); spin_lock_init(&ifocb->incomplete_lock); } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index a3060e55122c..0c5627f8a104 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -379,14 +379,6 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, if (sta_prepare_rate_control(local, sta, gfp)) goto free_txq; - for (i = 0; i < IEEE80211_NUM_TIDS; i++) { - /* - * timer_to_tid must be initialized with identity mapping - * to enable session_timer's data differentiation. See - * sta_rx_agg_session_timer_expired for usage. - */ - sta->timer_to_tid[i] = i; - } for (i = 0; i < IEEE80211_NUM_ACS; i++) { skb_queue_head_init(&sta->ps_tx_buf[i]); skb_queue_head_init(&sta->tx_filtered[i]); @@ -1064,9 +1056,9 @@ int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, return ret; } -static void sta_info_cleanup(unsigned long data) +static void sta_info_cleanup(struct timer_list *t) { - struct ieee80211_local *local = (struct ieee80211_local *) data; + struct ieee80211_local *local = from_timer(local, t, sta_cleanup); struct sta_info *sta; bool timer_needed = false; @@ -1098,8 +1090,7 @@ int sta_info_init(struct ieee80211_local *local) mutex_init(&local->sta_mtx); INIT_LIST_HEAD(&local->sta_list); - setup_timer(&local->sta_cleanup, sta_info_cleanup, - (unsigned long)local); + timer_setup(&local->sta_cleanup, sta_info_cleanup, 0); return 0; } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 5c54acd10562..cd53619435b6 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -126,6 +126,8 @@ enum ieee80211_agg_stop_reason { AGG_STOP_DESTROY_STA, }; +struct sta_info; + /** * struct tid_ampdu_tx - TID aggregation information (Tx). 
* @@ -133,8 +135,10 @@ enum ieee80211_agg_stop_reason { * @session_timer: check if we keep Tx-ing on the TID (by timeout value) * @addba_resp_timer: timer for peer's response to addba request * @pending: pending frames queue -- use sta's spinlock to protect + * @sta: station we are attached to * @dialog_token: dialog token for aggregation session * @timeout: session timeout value to be filled in ADDBA requests + * @tid: TID number * @state: session state (see above) * @last_tx: jiffies of last tx activity * @stop_initiator: initiator of a session stop @@ -158,6 +162,7 @@ struct tid_ampdu_tx { struct timer_list session_timer; struct timer_list addba_resp_timer; struct sk_buff_head pending; + struct sta_info *sta; unsigned long state; unsigned long last_tx; u16 timeout; @@ -169,6 +174,7 @@ struct tid_ampdu_tx { u16 failed_bar_ssn; bool bar_pending; bool amsdu; + u8 tid; }; /** @@ -181,12 +187,14 @@ struct tid_ampdu_tx { * @reorder_time: jiffies when skb was added * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value) * @reorder_timer: releases expired frames from the reorder buffer. + * @sta: station we are attached to * @last_rx: jiffies of last rx activity * @head_seq_num: head sequence number in reordering buffer. * @stored_mpdu_num: number of MPDUs in reordering buffer * @ssn: Starting Sequence Number expected to be aggregated. * @buf_size: buffer size for incoming A-MPDUs * @timeout: reset timer value (in TUs). + * @tid: TID number * @rcu_head: RCU head used for freeing this struct * @reorder_lock: serializes access to reorder buffer, see below. * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and @@ -208,6 +216,7 @@ struct tid_ampdu_rx { u64 reorder_buf_filtered; struct sk_buff_head *reorder_buf; unsigned long *reorder_time; + struct sta_info *sta; struct timer_list session_timer; struct timer_list reorder_timer; unsigned long last_rx; @@ -216,6 +225,7 @@ struct tid_ampdu_rx { u16 ssn; u16 buf_size; u16 timeout; + u8 tid; u8 auto_seq:1, removed:1, started:1; @@ -447,7 +457,6 @@ struct ieee80211_sta_rx_stats { * plus one for non-QoS frames) * @tid_seq: per-TID sequence numbers for sending to this STA * @ampdu_mlme: A-MPDU state machine state - * @timer_to_tid: identity mapping to ID timers * @mesh: mesh STA information * @debugfs_dir: debug filesystem directory dentry * @dead: set to true when sta is unlinked @@ -554,7 +563,6 @@ struct sta_info { * Aggregation information, locked with lock. 
*/ struct sta_ampdu_mlme ampdu_mlme; - u8 timer_to_tid[IEEE80211_NUM_TIDS]; #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfs_dir; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 6e0adfefb9ed..59c08997bfdf 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -533,6 +533,7 @@ nla_put_failure: return -1; } +#if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS) static size_t ctnetlink_proto_size(const struct nf_conn *ct) { const struct nf_conntrack_l3proto *l3proto; @@ -552,6 +553,7 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct) return len + len4; } +#endif static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) { diff --git a/net/netlabel/netlabel_addrlist.h b/net/netlabel/netlabel_addrlist.h index d0f38bc9af6d..ac709f0f197b 100644 --- a/net/netlabel/netlabel_addrlist.h +++ b/net/netlabel/netlabel_addrlist.h @@ -87,7 +87,7 @@ static inline struct netlbl_af4list *__af4list_valid_rcu(struct list_head *s, struct list_head *i = s; struct netlbl_af4list *n = __af4list_entry(s); while (i != h && !n->valid) { - i = rcu_dereference(i->next); + i = rcu_dereference(list_next_rcu(i)); n = __af4list_entry(i); } return n; @@ -154,7 +154,7 @@ static inline struct netlbl_af6list *__af6list_valid_rcu(struct list_head *s, struct list_head *i = s; struct netlbl_af6list *n = __af6list_entry(s); while (i != h && !n->valid) { - i = rcu_dereference(i->next); + i = rcu_dereference(list_next_rcu(i)); n = __af6list_entry(i); } return n; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index f6359c277212..c0b83dc9d993 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -75,7 +75,7 @@ static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, if (!hdr) return -EMSGSIZE; - genl_dump_check_consistent(cb, hdr, &nfc_genl_family); + genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, target->supported_protocols) || @@ -603,7 +603,7 @@ static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, return -EMSGSIZE; if (cb) - genl_dump_check_consistent(cb, hdr, &nfc_genl_family); + genl_dump_check_consistent(cb, hdr); if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; @@ -1356,7 +1356,7 @@ static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev, goto nla_put_failure; if (cb) - genl_dump_check_consistent(cb, hdr, &nfc_genl_family); + genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) || diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 0dab33fb9844..99cfafc2a139 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -308,6 +308,8 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { + unsigned short gso_type = skb_shinfo(skb)->gso_type; + struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; @@ -318,9 +320,21 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, if (segs == NULL) return -EINVAL; + if (gso_type & SKB_GSO_UDP) { + /* The initial flow key extracted by ovs_flow_key_extract() + * in this case is for a first fragment, so we need to + * properly mark later fragments. + */ + later_key = *key; + later_key.ip.frag = OVS_FRAG_TYPE_LATER; + } + /* Queue all of the segments. 
*/ skb = segs; do { + if (gso_type & SKB_GSO_UDP && skb != segs) + key = &later_key; + err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 864ddb1e3642..dbe2379329c5 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -631,7 +631,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_LATER; return 0; } - if (nh->frag_off & htons(IP_MF)) + if (nh->frag_off & htons(IP_MF) || + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST; else key->ip.frag = OVS_FRAG_TYPE_NONE; @@ -747,6 +748,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) if (key->ip.frag == OVS_FRAG_TYPE_LATER) return 0; + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.frag = OVS_FRAG_TYPE_FIRST; + /* Transport layer. */ if (key->ip.proto == NEXTHDR_TCP) { if (tcphdr_ok(skb)) { diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 1c40caadcff9..d836f998117b 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -229,6 +229,9 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl, const struct iphdr *iph; u16 ul; + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + return 1; + /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, @@ -282,6 +285,9 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl, const struct ipv6hdr *ip6h; u16 ul; + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + return 1; + /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ab255b421781..7d97f612c9b9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -205,13 +205,14 @@ static void tcf_chain_head_change(struct tcf_chain *chain, static void tcf_chain_flush(struct tcf_chain *chain) { - struct tcf_proto *tp; + struct tcf_proto *tp = rtnl_dereference(chain->filter_chain); tcf_chain_head_change(chain, NULL); - while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { + while (tp) { RCU_INIT_POINTER(chain->filter_chain, tp->next); - tcf_chain_put(chain); tcf_proto_destroy(tp); + tp = rtnl_dereference(chain->filter_chain); + tcf_chain_put(chain); } } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index fb680dafac5a..a9f3e317055c 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -382,15 +382,13 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, { struct bpf_prog *fp; char *name = NULL; + bool skip_sw; u32 bpf_fd; bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); + skip_sw = gen_flags & TCA_CLS_FLAGS_SKIP_SW; - if (gen_flags & TCA_CLS_FLAGS_SKIP_SW) - fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, - qdisc_dev(tp->q)); - else - fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); + fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, skip_sw); if (IS_ERR(fp)) return PTR_ERR(fp); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index a6dfa86c0201..3b18085e3b10 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -807,9 +807,10 @@ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, addr->v6.sin6_flowinfo = 0; addr->v6.sin6_port = sh->source; addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; - if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) { + if 
(ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb); - } + else + addr->v6.sin6_scope_id = 0; } *addr_len = sctp_v6_addr_to_user(sctp_sk(skb->sk), addr); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 514465b03829..9bf575f2e8ed 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3594,8 +3594,8 @@ struct sctp_chunk *sctp_make_strreset_req( __u16 stream_num, __be16 *stream_list, bool out, bool in) { + __u16 stream_len = stream_num * sizeof(__u16); struct sctp_strreset_outreq outreq; - __u16 stream_len = stream_num * 2; struct sctp_strreset_inreq inreq; struct sctp_chunk *retval; __u16 outlen, inlen; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b029757bea03..3204a9b29407 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -84,8 +84,8 @@ /* Forward declarations for internal helper functions. */ static int sctp_writeable(struct sock *sk); static void sctp_wfree(struct sk_buff *skb); -static int sctp_wait_for_sndbuf(struct sctp_association *, long *timeo_p, - size_t msg_len); +static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, + size_t msg_len, struct sock **orig_sk); static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); @@ -1970,9 +1970,16 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); - if (err) + /* sk can be changed by peel off when waiting for buf. */ + err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len, &sk); + if (err) { + if (err == -ESRCH) { + /* asoc is already dead. 
*/ + new_asoc = NULL; + err = -EPIPE; + } goto out_free; + } } /* If an address is passed with the sendto/sendmsg call, it is used @@ -3133,9 +3140,9 @@ static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsign */ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen) { + struct sctp_sock *sp = sctp_sk(sk); struct sctp_assoc_value params; struct sctp_association *asoc; - struct sctp_sock *sp = sctp_sk(sk); int val; if (optlen == sizeof(int)) { @@ -3151,26 +3158,35 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned if (copy_from_user(¶ms, optval, optlen)) return -EFAULT; val = params.assoc_value; - } else + } else { return -EINVAL; + } - if ((val != 0) && ((val < 8) || (val > SCTP_MAX_CHUNK_LEN))) - return -EINVAL; + if (val) { + int min_len, max_len; - asoc = sctp_id2assoc(sk, params.assoc_id); - if (!asoc && params.assoc_id && sctp_style(sk, UDP)) - return -EINVAL; + min_len = SCTP_DEFAULT_MINSEGMENT - sp->pf->af->net_header_len; + min_len -= sizeof(struct sctphdr) + + sizeof(struct sctp_data_chunk); + + max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk); + if (val < min_len || val > max_len) + return -EINVAL; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); if (asoc) { if (val == 0) { - val = asoc->pathmtu; - val -= sp->pf->af->net_header_len; + val = asoc->pathmtu - sp->pf->af->net_header_len; val -= sizeof(struct sctphdr) + - sizeof(struct sctp_data_chunk); + sizeof(struct sctp_data_chunk); } asoc->user_frag = val; asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu); } else { + if (params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; sp->user_frag = val; } @@ -5015,12 +5031,6 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) if (!asoc) return -EINVAL; - /* If there is a thread waiting on more sndbuf space for - * sending on this asoc, it cannot be peeled. - */ - if (waitqueue_active(&asoc->wait)) - return -EBUSY; - /* An association cannot be branched off from an already peeled-off * socket, nor is this supported for tcp style sockets. */ @@ -7989,7 +7999,7 @@ void sctp_sock_rfree(struct sk_buff *skb) /* Helper function to wait for space in the sndbuf. */ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len) + size_t msg_len, struct sock **orig_sk) { struct sock *sk = asoc->base.sk; int err = 0; @@ -8006,10 +8016,11 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, for (;;) { prepare_to_wait_exclusive(&asoc->wait, &wait, TASK_INTERRUPTIBLE); + if (asoc->base.dead) + goto do_dead; if (!*timeo_p) goto do_nonblock; - if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING || - asoc->base.dead) + if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING) goto do_error; if (signal_pending(current)) goto do_interrupted; @@ -8022,11 +8033,17 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); + if (sk != asoc->base.sk) { + release_sock(sk); + sk = asoc->base.sk; + lock_sock(sk); + } *timeo_p = current_timeo; } out: + *orig_sk = sk; finish_wait(&asoc->wait, &wait); /* Release the association's refcnt. 
*/ @@ -8034,6 +8051,10 @@ out: return err; +do_dead: + err = -ESRCH; + goto out; + do_error: err = -EPIPE; goto out; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index b8c8cabb1a58..a11db21dc8a0 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -282,15 +282,31 @@ int sctp_send_reset_streams(struct sctp_association *asoc, str_nums = params->srs_number_streams; str_list = params->srs_stream_list; - if (out && str_nums) - for (i = 0; i < str_nums; i++) - if (str_list[i] >= stream->outcnt) - goto out; + if (str_nums) { + int param_len = 0; - if (in && str_nums) - for (i = 0; i < str_nums; i++) - if (str_list[i] >= stream->incnt) - goto out; + if (out) { + for (i = 0; i < str_nums; i++) + if (str_list[i] >= stream->outcnt) + goto out; + + param_len = str_nums * sizeof(__u16) + + sizeof(struct sctp_strreset_outreq); + } + + if (in) { + for (i = 0; i < str_nums; i++) + if (str_list[i] >= stream->incnt) + goto out; + + param_len += str_nums * sizeof(__u16) + + sizeof(struct sctp_strreset_inreq); + } + + if (param_len > SCTP_MAX_CHUNK_LEN - + sizeof(struct sctp_reconf_chunk)) + goto out; + } nstr_list = kcalloc(str_nums, sizeof(__be16), GFP_KERNEL); if (!nstr_list) { diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2578fbd95664..94f21116dac5 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -562,7 +562,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) { struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; - struct smc_buf_desc *buf_desc = NULL; + struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct list_head *buf_list; int bufsize, bufsize_short; int sk_buf_size; @@ -575,7 +575,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) /* use socket send buffer size (w/o overhead) as start value */ sk_buf_size = smc->sk.sk_sndbuf / 2; - for (bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2); + for (bufsize_short = smc_compress_bufsize(sk_buf_size); bufsize_short >= 0; bufsize_short--) { if (is_rmb) { diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 7b1ee5a0b03c..73165e9ca5bf 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -855,11 +855,13 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g return stat; if (integ_len > buf->len) return stat; - if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) - BUG(); + if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) { + WARN_ON_ONCE(1); + return stat; + } /* copy out mic... 
*/ if (read_u32_from_xdr_buf(buf, integ_len, &mic.len)) - BUG(); + return stat; if (mic.len > RPC_MAX_AUTH_SIZE) return stat; mic.data = kmalloc(mic.len, GFP_KERNEL); @@ -1611,8 +1613,10 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) BUG_ON(integ_len % 4); *p++ = htonl(integ_len); *p++ = htonl(gc->gc_seq); - if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len)) - BUG(); + if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len)) { + WARN_ON_ONCE(1); + goto out_err; + } if (resbuf->tail[0].iov_base == NULL) { if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE) goto out_err; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2ad827db2704..a801da812f86 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1491,7 +1491,6 @@ rpc_restart_call(struct rpc_task *task) } EXPORT_SYMBOL_GPL(rpc_restart_call); -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) const char *rpc_proc_name(const struct rpc_task *task) { @@ -1505,7 +1504,6 @@ const char } else return "no proc"; } -#endif /* * 0. Initial state @@ -1519,6 +1517,7 @@ call_start(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; int idx = task->tk_msg.rpc_proc->p_statidx; + trace_rpc_request(task); dprintk("RPC: %5u call_start %s%d proc %s (%s)\n", task->tk_pid, clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task), @@ -1586,6 +1585,7 @@ call_reserveresult(struct rpc_task *task) switch (status) { case -ENOMEM: rpc_delay(task, HZ >> 2); + /* fall through */ case -EAGAIN: /* woken up; retry */ task->tk_action = call_retry_reserve; return; @@ -1647,10 +1647,13 @@ call_refreshresult(struct rpc_task *task) /* Use rate-limiting and a max number of retries if refresh * had status 0 but failed to update the cred. */ + /* fall through */ case -ETIMEDOUT: rpc_delay(task, 3*HZ); + /* fall through */ case -EAGAIN: status = -EACCES; + /* fall through */ case -EKEYEXPIRED: if (!task->tk_cred_retry) break; @@ -1911,6 +1914,7 @@ call_connect_status(struct rpc_task *task) task->tk_action = call_bind; return; } + /* fall through */ case -ECONNRESET: case -ECONNABORTED: case -ENETUNREACH: @@ -1924,6 +1928,7 @@ call_connect_status(struct rpc_task *task) break; /* retry with existing socket, after a delay */ rpc_delay(task, 3*HZ); + /* fall through */ case -EAGAIN: /* Check for timeouts before looping back to call_bind */ case -ETIMEDOUT: @@ -2025,6 +2030,7 @@ call_transmit_status(struct rpc_task *task) rpc_exit(task, task->tk_status); break; } + /* fall through */ case -ECONNRESET: case -ECONNABORTED: case -EADDRINUSE: @@ -2145,6 +2151,7 @@ call_status(struct rpc_task *task) * were a timeout. 
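 * (the case falls through to -ETIMEDOUT below; the new
 * "fall through" annotations make such deliberate drops
 * explicit for gcc's -Wimplicit-fallthrough checking).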
*/ rpc_delay(task, 3*HZ); + /* fall through */ case -ETIMEDOUT: task->tk_action = call_timeout; break; @@ -2152,14 +2159,17 @@ call_status(struct rpc_task *task) case -ECONNRESET: case -ECONNABORTED: rpc_force_rebind(clnt); + /* fall through */ case -EADDRINUSE: rpc_delay(task, 3*HZ); + /* fall through */ case -EPIPE: case -ENOTCONN: task->tk_action = call_bind; break; case -ENOBUFS: rpc_delay(task, HZ>>2); + /* fall through */ case -EAGAIN: task->tk_action = call_transmit; break; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 61a504fb1ae2..7803f3b6aa53 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1410,8 +1410,8 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return PTR_ERR(gssd_dentry); } - dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n", - net, NET_NAME(net)); + dprintk("RPC: sending pipefs MOUNT notification for net %x%s\n", + net->ns.inum, NET_NAME(net)); mutex_lock(&sn->pipefs_sb_lock); sn->pipefs_sb = sb; err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list, @@ -1462,8 +1462,8 @@ static void rpc_kill_sb(struct super_block *sb) goto out; } sn->pipefs_sb = NULL; - dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n", - net, NET_NAME(net)); + dprintk("RPC: sending pipefs UMOUNT notification for net %x%s\n", + net->ns.inum, NET_NAME(net)); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index ea0676f199c8..c526f8fb37c9 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -216,9 +216,9 @@ static void rpcb_set_local(struct net *net, struct rpc_clnt *clnt, smp_wmb(); sn->rpcb_users = 1; dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: " - "%p, rpcb_local_clnt4: %p) for net %p%s\n", - sn->rpcb_local_clnt, sn->rpcb_local_clnt4, - net, (net == &init_net) ? " (init_net)" : ""); + "%p, rpcb_local_clnt4: %p) for net %x%s\n", + sn->rpcb_local_clnt, sn->rpcb_local_clnt4, + net->ns.inum, (net == &init_net) ? 
" (init_net)" : ""); } /* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 5dea47eb31bb..b1b49edd7c4d 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -274,10 +274,9 @@ static inline void rpc_task_set_debuginfo(struct rpc_task *task) static void rpc_set_active(struct rpc_task *task) { - trace_rpc_task_begin(task->tk_client, task, NULL); - rpc_task_set_debuginfo(task); set_bit(RPC_TASK_ACTIVE, &task->tk_runstate); + trace_rpc_task_begin(task->tk_client, task, NULL); } /* diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index c73de181467a..56f9eff74150 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -65,10 +65,13 @@ err_proc: static __net_exit void sunrpc_exit_net(struct net *net) { + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + rpc_pipefs_exit_net(net); unix_gid_cache_destroy(net); ip_map_cache_destroy(net); rpc_proc_exit(net); + WARN_ON_ONCE(!list_empty(&sn->all_clients)); } static struct pernet_operations sunrpc_net_ops = { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 71de77bd4423..e8e0831229cf 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -250,9 +250,9 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new) svc_xprt_received(new); } -int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name, - struct net *net, const int family, - const unsigned short port, int flags) +static int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name, + struct net *net, const int family, + const unsigned short port, int flags) { struct svc_xprt_class *xcl; @@ -380,7 +380,6 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) struct svc_pool *pool; struct svc_rqst *rqstp = NULL; int cpu; - bool queued = false; if (!svc_xprt_has_something_to_do(xprt)) goto out; @@ -401,58 +400,25 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) atomic_long_inc(&pool->sp_stats.packets); -redo_search: + dprintk("svc: transport %p put into queue\n", xprt); + spin_lock_bh(&pool->sp_lock); + list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); + pool->sp_stats.sockets_queued++; + spin_unlock_bh(&pool->sp_lock); + /* find a thread for this xprt */ rcu_read_lock(); list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { - /* Do a lockless check first */ - if (test_bit(RQ_BUSY, &rqstp->rq_flags)) + if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) continue; - - /* - * Once the xprt has been queued, it can only be dequeued by - * the task that intends to service it. All we can do at that - * point is to try to wake this thread back up so that it can - * do so. - */ - if (!queued) { - spin_lock_bh(&rqstp->rq_lock); - if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) { - /* already busy, move on... */ - spin_unlock_bh(&rqstp->rq_lock); - continue; - } - - /* this one will do */ - rqstp->rq_xprt = xprt; - svc_xprt_get(xprt); - spin_unlock_bh(&rqstp->rq_lock); - } - rcu_read_unlock(); - atomic_long_inc(&pool->sp_stats.threads_woken); wake_up_process(rqstp->rq_task); - put_cpu(); - goto out; - } - rcu_read_unlock(); - - /* - * We didn't find an idle thread to use, so we need to queue the xprt. - * Do so and then search again. If we find one, we can't hook this one - * up to it directly but we can wake the thread up in the hopes that it - * will pick it up once it searches for a xprt to service. 
- */ - if (!queued) { - queued = true; - dprintk("svc: transport %p put into queue\n", xprt); - spin_lock_bh(&pool->sp_lock); - list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); - pool->sp_stats.sockets_queued++; - spin_unlock_bh(&pool->sp_lock); - goto redo_search; + goto out_unlock; } + set_bit(SP_CONGESTED, &pool->sp_flags); rqstp = NULL; +out_unlock: + rcu_read_unlock(); put_cpu(); out: trace_svc_xprt_do_enqueue(xprt, rqstp); @@ -721,38 +687,25 @@ rqst_should_sleep(struct svc_rqst *rqstp) static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) { - struct svc_xprt *xprt; struct svc_pool *pool = rqstp->rq_pool; long time_left = 0; /* rq_xprt should be clear on entry */ WARN_ON_ONCE(rqstp->rq_xprt); - /* Normally we will wait up to 5 seconds for any required - * cache information to be provided. - */ - rqstp->rq_chandle.thread_wait = 5*HZ; - - xprt = svc_xprt_dequeue(pool); - if (xprt) { - rqstp->rq_xprt = xprt; - - /* As there is a shortage of threads and this request - * had to be queued, don't allow the thread to wait so - * long for cache updates. - */ - rqstp->rq_chandle.thread_wait = 1*HZ; - clear_bit(SP_TASK_PENDING, &pool->sp_flags); - return xprt; - } + rqstp->rq_xprt = svc_xprt_dequeue(pool); + if (rqstp->rq_xprt) + goto out_found; /* * We have to be able to interrupt this wait * to bring down the daemons ... */ set_current_state(TASK_INTERRUPTIBLE); + smp_mb__before_atomic(); + clear_bit(SP_CONGESTED, &pool->sp_flags); clear_bit(RQ_BUSY, &rqstp->rq_flags); - smp_mb(); + smp_mb__after_atomic(); if (likely(rqst_should_sleep(rqstp))) time_left = schedule_timeout(timeout); @@ -761,13 +714,11 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) try_to_freeze(); - spin_lock_bh(&rqstp->rq_lock); set_bit(RQ_BUSY, &rqstp->rq_flags); - spin_unlock_bh(&rqstp->rq_lock); - - xprt = rqstp->rq_xprt; - if (xprt != NULL) - return xprt; + smp_mb__after_atomic(); + rqstp->rq_xprt = svc_xprt_dequeue(pool); + if (rqstp->rq_xprt) + goto out_found; if (!time_left) atomic_long_inc(&pool->sp_stats.threads_timedout); @@ -775,6 +726,15 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) if (signalled() || kthread_should_stop()) return ERR_PTR(-EINTR); return ERR_PTR(-EAGAIN); +out_found: + /* Normally we will wait up to 5 seconds for any required + * cache information to be provided. 
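+ * If SP_CONGESTED is still set, the pool had no idle thread
+ * when this transport was queued, so do not let this thread
+ * wait as long for cache updates.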
+ */ + if (!test_bit(SP_CONGESTED, &pool->sp_flags)) + rqstp->rq_chandle.thread_wait = 5*HZ; + else + rqstp->rq_chandle.thread_wait = 1*HZ; + return rqstp->rq_xprt; } static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 6160d17a31c4..333b9d697ae5 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1139,6 +1139,7 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) case -EAGAIN: xprt_add_backlog(xprt, task); dprintk("RPC: waiting for request slot\n"); + /* fall through */ default: task->tk_status = -EAGAIN; } diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 823a781ec89c..8b818bb3518a 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -43,7 +43,7 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, req = rpcrdma_create_req(r_xprt); if (IS_ERR(req)) return PTR_ERR(req); - req->rl_backchannel = true; + __set_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags); rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, GFP_KERNEL); @@ -223,8 +223,8 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) *p++ = xdr_zero; *p = xdr_zero; - if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN, - &rqst->rq_snd_buf, rpcrdma_noch)) + if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, + &rqst->rq_snd_buf, rpcrdma_noch)) return -EIO; return 0; } diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index fa759dd2b0f3..29fc84c7ff98 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -306,28 +306,9 @@ out_reset: } } -/* Use a slow, safe mechanism to invalidate all memory regions - * that were registered for "req". - */ -static void -fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - bool sync) -{ - struct rpcrdma_mw *mw; - - while (!list_empty(&req->rl_registered)) { - mw = rpcrdma_pop_mw(&req->rl_registered); - if (sync) - fmr_op_recover_mr(mw); - else - rpcrdma_defer_mr_recovery(mw); - } -} - const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_unmap_sync = fmr_op_unmap_sync, - .ro_unmap_safe = fmr_op_unmap_safe, .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 35d7517ef0e6..773e66e10a15 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -420,7 +420,6 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ; - rpcrdma_set_signaled(&r_xprt->rx_ep, &reg_wr->wr); rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr); if (rc) goto out_senderr; @@ -508,12 +507,6 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) f->fr_cqe.done = frwr_wc_localinv_wake; reinit_completion(&f->fr_linv_done); - /* Initialize CQ count, since there is always a signaled - * WR being posted here. The new cqcount depends on how - * many SQEs are about to be consumed. - */ - rpcrdma_init_cqcount(&r_xprt->rx_ep, count); - /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us * unless ri_id->qp is a valid pointer. @@ -546,7 +539,6 @@ reset_mrs: /* Find and reset the MRs in the LOCAL_INV WRs that did not * get posted.
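 * ib_post_send() hands back the first rejected WR in bad_wr,
 * so walking that chain finds every frmr whose LOCAL_INV never
 * reached the Send Queue.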
*/ - rpcrdma_init_cqcount(&r_xprt->rx_ep, -count); while (bad_wr) { f = container_of(bad_wr, struct rpcrdma_frmr, fr_invwr); @@ -559,28 +551,9 @@ reset_mrs: goto unmap; } -/* Use a slow, safe mechanism to invalidate all memory regions - * that were registered for "req". - */ -static void -frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - bool sync) -{ - struct rpcrdma_mw *mw; - - while (!list_empty(&req->rl_registered)) { - mw = rpcrdma_pop_mw(&req->rl_registered); - if (sync) - frwr_op_recover_mr(mw); - else - rpcrdma_defer_mr_recovery(mw); - } -} - const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, .ro_unmap_sync = frwr_op_unmap_sync, - .ro_unmap_safe = frwr_op_unmap_safe, .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index f1889f4d4803..ed34dc0f144c 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -75,11 +76,11 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) /* Maximum Read list size */ maxsegs += 2; /* segment for head and tail buffers */ - size = maxsegs * sizeof(struct rpcrdma_read_chunk); + size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); /* Minimal Read chunk size */ size += sizeof(__be32); /* segment count */ - size += sizeof(struct rpcrdma_segment); + size += rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ dprintk("RPC: %s: max call header size = %u\n", @@ -102,7 +103,7 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) /* Maximum Write list size */ maxsegs += 2; /* segment for head and tail buffers */ size = sizeof(__be32); /* segment count */ - size += maxsegs * sizeof(struct rpcrdma_segment); + size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ dprintk("RPC: %s: max reply header size = %u\n", @@ -511,27 +512,60 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return 0; } -/* Prepare the RPC-over-RDMA header SGE. +/** + * rpcrdma_unmap_sendctx - DMA-unmap Send buffers + * @sc: sendctx containing SGEs to unmap + * + */ +void +rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) +{ + struct rpcrdma_ia *ia = &sc->sc_xprt->rx_ia; + struct ib_sge *sge; + unsigned int count; + + dprintk("RPC: %s: unmapping %u sges for sc=%p\n", + __func__, sc->sc_unmap_count, sc); + + /* The first two SGEs contain the transport header and + * the inline buffer. These are always left mapped so + * they can be cheaply re-used. + */ + sge = &sc->sc_sges[2]; + for (count = sc->sc_unmap_count; count; ++sge, --count) + ib_dma_unmap_page(ia->ri_device, + sge->addr, sge->length, DMA_TO_DEVICE); + + if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) { + smp_mb__after_atomic(); + wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES); + } +} + +/* Prepare an SGE for the RPC-over-RDMA transport header. 
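+ * sc_sges[0] always carries the transport header; its regbuf
+ * stays DMA-mapped, so repeated Sends need only the
+ * sync-for-device below.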
*/ static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, u32 len) { + struct rpcrdma_sendctx *sc = req->rl_sendctx; struct rpcrdma_regbuf *rb = req->rl_rdmabuf; - struct ib_sge *sge = &req->rl_send_sge[0]; + struct ib_sge *sge = sc->sc_sges; - if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) { - if (!__rpcrdma_dma_map_regbuf(ia, rb)) - return false; - sge->addr = rdmab_addr(rb); - sge->lkey = rdmab_lkey(rb); - } + if (!rpcrdma_dma_map_regbuf(ia, rb)) + goto out_regbuf; + sge->addr = rdmab_addr(rb); sge->length = len; + sge->lkey = rdmab_lkey(rb); ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, DMA_TO_DEVICE); - req->rl_send_wr.num_sge++; + sc->sc_wr.num_sge++; return true; + +out_regbuf: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + return false; } /* Prepare the Send SGEs. The head and tail iovec, and each entry @@ -541,10 +575,11 @@ static bool rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) { + struct rpcrdma_sendctx *sc = req->rl_sendctx; unsigned int sge_no, page_base, len, remaining; struct rpcrdma_regbuf *rb = req->rl_sendbuf; struct ib_device *device = ia->ri_device; - struct ib_sge *sge = req->rl_send_sge; + struct ib_sge *sge = sc->sc_sges; u32 lkey = ia->ri_pd->local_dma_lkey; struct page *page, **ppages; @@ -552,7 +587,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, * DMA-mapped. Sync the content that has changed. */ if (!rpcrdma_dma_map_regbuf(ia, rb)) - return false; + goto out_regbuf; sge_no = 1; sge[sge_no].addr = rdmab_addr(rb); sge[sge_no].length = xdr->head[0].iov_len; @@ -607,7 +642,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, sge[sge_no].length = len; sge[sge_no].lkey = lkey; - req->rl_mapped_sges++; + sc->sc_unmap_count++; ppages++; remaining -= len; page_base = 0; @@ -633,56 +668,61 @@ map_tail: goto out_mapping_err; sge[sge_no].length = len; sge[sge_no].lkey = lkey; - req->rl_mapped_sges++; + sc->sc_unmap_count++; } out: - req->rl_send_wr.num_sge = sge_no + 1; + sc->sc_wr.num_sge += sge_no; + if (sc->sc_unmap_count) + __set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); return true; +out_regbuf: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + return false; + out_mapping_overflow: + rpcrdma_unmap_sendctx(sc); pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); return false; out_mapping_err: + rpcrdma_unmap_sendctx(sc); pr_err("rpcrdma: Send mapping error\n"); return false; } -bool -rpcrdma_prepare_send_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, - u32 hdrlen, struct xdr_buf *xdr, - enum rpcrdma_chunktype rtype) +/** + * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR + * @r_xprt: controlling transport + * @req: context of RPC Call being marshalled + * @hdrlen: size of transport header, in bytes + * @xdr: xdr_buf containing RPC Call + * @rtype: chunk type being encoded + * + * Returns 0 on success; otherwise a negative errno is returned. 
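+ *
+ * %-ENOBUFS indicates no sendctx is available: the Send Queue
+ * is backed up, and the caller should retry after a Send
+ * completion. %-EIO indicates an SGE could not be DMA mapped.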
+ */ +int +rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 hdrlen, + struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) { - req->rl_send_wr.num_sge = 0; - req->rl_mapped_sges = 0; - - if (!rpcrdma_prepare_hdr_sge(ia, req, hdrlen)) - goto out_map; + req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf); + if (!req->rl_sendctx) + return -ENOBUFS; + req->rl_sendctx->sc_wr.num_sge = 0; + req->rl_sendctx->sc_unmap_count = 0; + req->rl_sendctx->sc_req = req; + __clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); + + if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen)) + return -EIO; if (rtype != rpcrdma_areadch) - if (!rpcrdma_prepare_msg_sges(ia, req, xdr, rtype)) - goto out_map; - - return true; - -out_map: - pr_err("rpcrdma: failed to DMA map a Send buffer\n"); - return false; -} - -void -rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req) -{ - struct ib_device *device = ia->ri_device; - struct ib_sge *sge; - int count; + if (!rpcrdma_prepare_msg_sges(&r_xprt->rx_ia, req, xdr, rtype)) + return -EIO; - sge = &req->rl_send_sge[2]; - for (count = req->rl_mapped_sges; count--; sge++) - ib_dma_unmap_page(device, sge->addr, sge->length, - DMA_TO_DEVICE); - req->rl_mapped_sges = 0; + return 0; } /** @@ -833,12 +873,10 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) transfertypes[rtype], transfertypes[wtype], xdr_stream_pos(xdr)); - if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, - xdr_stream_pos(xdr), - &rqst->rq_snd_buf, rtype)) { - ret = -EIO; + ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr), + &rqst->rq_snd_buf, rtype); + if (ret) goto out_err; - } return 0; out_err: @@ -970,14 +1008,13 @@ rpcrdma_mark_remote_invalidation(struct list_head *mws, * straightforward to check the RPC header's direction field. */ static bool -rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, - __be32 xid, __be32 proc) +rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) #if defined(CONFIG_SUNRPC_BACKCHANNEL) { struct xdr_stream *xdr = &rep->rr_stream; __be32 *p; - if (proc != rdma_msg) + if (rep->rr_proc != rdma_msg) return false; /* Peek at stream contents without advancing. */ @@ -992,7 +1029,7 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, return false; /* RPC header */ - if (*p++ != xid) + if (*p++ != rep->rr_xid) return false; if (*p != cpu_to_be32(RPC_CALL)) return false; @@ -1212,105 +1249,170 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, return -EREMOTEIO; } +/* Perform XID lookup, reconstruction of the RPC reply, and + * RPC completion while holding the transport lock to ensure + * the rep, rqst, and rq_task pointers remain stable. 
+ */ +void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) +{ + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpc_rqst *rqst = rep->rr_rqst; + unsigned long cwnd; + int status; + + xprt->reestablish_timeout = 0; + + switch (rep->rr_proc) { + case rdma_msg: + status = rpcrdma_decode_msg(r_xprt, rep, rqst); + break; + case rdma_nomsg: + status = rpcrdma_decode_nomsg(r_xprt, rep); + break; + case rdma_error: + status = rpcrdma_decode_error(r_xprt, rep, rqst); + break; + default: + status = -EIO; + } + if (status < 0) + goto out_badheader; + +out: + spin_lock(&xprt->recv_lock); + cwnd = xprt->cwnd; + xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(rqst->rq_task); + + xprt_complete_rqst(rqst->rq_task, status); + xprt_unpin_rqst(rqst); + spin_unlock(&xprt->recv_lock); + return; + +/* If the incoming reply terminated a pending RPC, the next + * RPC call will post a replacement receive buffer as it is + * being marshaled. + */ +out_badheader: + dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", + rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc)); + r_xprt->rx_stats.bad_reply_count++; + status = -EIO; + goto out; +} + +void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + /* Invalidate and unmap the data payloads before waking + * the waiting application. This guarantees the memory + * regions are properly fenced from the server before the + * application accesses the data. It also ensures proper + * send flow control: waking the next RPC waits until this + * RPC has relinquished all its Send Queue entries. + */ + if (!list_empty(&req->rl_registered)) + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, + &req->rl_registered); + + /* Ensure that any DMA mapped pages associated with + * the Send of the RPC Call have been unmapped before + * allowing the RPC to complete. This protects argument + * memory not controlled by the RPC client from being + * re-used before we're done with it. + */ + if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { + r_xprt->rx_stats.reply_waits_for_send++; + out_of_line_wait_on_bit(&req->rl_flags, + RPCRDMA_REQ_F_TX_RESOURCES, + bit_wait, + TASK_UNINTERRUPTIBLE); + } +} + +/* Reply handling runs in the poll worker thread. Anything that + * might wait is deferred to a separate workqueue. + */ +void rpcrdma_deferred_completion(struct work_struct *work) +{ + struct rpcrdma_rep *rep = + container_of(work, struct rpcrdma_rep, rr_work); + struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst); + + rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); + rpcrdma_release_rqst(rep->rr_rxprt, req); + rpcrdma_complete_rqst(rep); +} + /* Process received RPC/RDMA messages. * * Errors must result in the RPC task either being awakened, or * allowed to timeout, to discover the errors at that time. 
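 * Replies that need no MR invalidation and are not waiting on
 * Send resources complete right here in the CQ poll context;
 * all others are handed off to rr_work so this handler never
 * sleeps.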
*/ -void -rpcrdma_reply_handler(struct work_struct *work) +void rpcrdma_reply_handler(struct rpcrdma_rep *rep) { - struct rpcrdma_rep *rep = - container_of(work, struct rpcrdma_rep, rr_work); struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct xdr_stream *xdr = &rep->rr_stream; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_req *req; struct rpc_rqst *rqst; - __be32 *p, xid, vers, proc; - unsigned long cwnd; - int status; + u32 credits; + __be32 *p; dprintk("RPC: %s: incoming rep %p\n", __func__, rep); if (rep->rr_hdrbuf.head[0].iov_len == 0) goto out_badstatus; - xdr_init_decode(xdr, &rep->rr_hdrbuf, + xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, rep->rr_hdrbuf.head[0].iov_base); /* Fixed transport header fields */ - p = xdr_inline_decode(xdr, 4 * sizeof(*p)); + p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p)); if (unlikely(!p)) goto out_shortreply; - xid = *p++; - vers = *p++; - p++; /* credits */ - proc = *p++; + rep->rr_xid = *p++; + rep->rr_vers = *p++; + credits = be32_to_cpu(*p++); + rep->rr_proc = *p++; + + if (rep->rr_vers != rpcrdma_version) + goto out_badversion; - if (rpcrdma_is_bcall(r_xprt, rep, xid, proc)) + if (rpcrdma_is_bcall(r_xprt, rep)) return; /* Match incoming rpcrdma_rep to an rpcrdma_req to * get context for handling any incoming chunks. */ spin_lock(&xprt->recv_lock); - rqst = xprt_lookup_rqst(xprt, xid); + rqst = xprt_lookup_rqst(xprt, rep->rr_xid); if (!rqst) goto out_norqst; xprt_pin_rqst(rqst); + + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > buf->rb_max_requests) + credits = buf->rb_max_requests; + buf->rb_credits = credits; + spin_unlock(&xprt->recv_lock); + req = rpcr_to_rdmar(rqst); req->rl_reply = rep; + rep->rr_rqst = rqst; + clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", - __func__, rep, req, be32_to_cpu(xid)); - - /* Invalidate and unmap the data payloads before waking the - * waiting application. This guarantees the memory regions - * are properly fenced from the server before the application - * accesses the data. It also ensures proper send flow control: - * waking the next RPC waits until this RPC has relinquished - * all its Send Queue entries. 
- */ - if (!list_empty(&req->rl_registered)) { - rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, - &req->rl_registered); - } - - xprt->reestablish_timeout = 0; - if (vers != rpcrdma_version) - goto out_badversion; + __func__, rep, req, be32_to_cpu(rep->rr_xid)); - switch (proc) { - case rdma_msg: - status = rpcrdma_decode_msg(r_xprt, rep, rqst); - break; - case rdma_nomsg: - status = rpcrdma_decode_nomsg(r_xprt, rep); - break; - case rdma_error: - status = rpcrdma_decode_error(r_xprt, rep, rqst); - break; - default: - status = -EIO; - } - if (status < 0) - goto out_badheader; - -out: - spin_lock(&xprt->recv_lock); - cwnd = xprt->cwnd; - xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; - if (xprt->cwnd > cwnd) - xprt_release_rqst_cong(rqst->rq_task); - - xprt_complete_rqst(rqst->rq_task, status); - xprt_unpin_rqst(rqst); - spin_unlock(&xprt->recv_lock); - dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", - __func__, xprt, rqst, status); + if (list_empty(&req->rl_registered) && + !test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) + rpcrdma_complete_rqst(rep); + else + queue_work(rpcrdma_receive_wq, &rep->rr_work); return; out_badstatus: @@ -1321,37 +1423,22 @@ out_badstatus: } return; -/* If the incoming reply terminated a pending RPC, the next - * RPC call will post a replacement receive buffer as it is - * being marshaled. - */ out_badversion: dprintk("RPC: %s: invalid version %d\n", - __func__, be32_to_cpu(vers)); - status = -EIO; - r_xprt->rx_stats.bad_reply_count++; - goto out; - -out_badheader: - dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", - rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc)); - r_xprt->rx_stats.bad_reply_count++; - status = -EIO; - goto out; + __func__, be32_to_cpu(rep->rr_vers)); + goto repost; -/* The req was still available, but by the time the recv_lock - * was acquired, the rqst and task had been released. Thus the RPC - * has already been terminated. +/* The RPC transaction has already been terminated, or the header + * is corrupt. */ out_norqst: spin_unlock(&xprt->recv_lock); dprintk("RPC: %s: no match for incoming xid 0x%08x\n", - __func__, be32_to_cpu(xid)); + __func__, be32_to_cpu(rep->rr_xid)); goto repost; out_shortreply: dprintk("RPC: %s: short/invalid reply\n", __func__); - goto repost; /* If no pending RPC transaction was matched, post a replacement * receive buffer before returning. diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 992594b7cc6b..af7893501e40 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -133,6 +133,10 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, if (ret) goto out_err; + /* Bump page refcnt so Send completion doesn't release + * the rq_buffer before all retransmits are complete. 
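+ * The matching put_page() is in xprt_rdma_bc_free(), once
+ * the backchannel RPC is done with the buffer.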
+ */ + get_page(virt_to_page(rqst->rq_buffer)); ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0); if (ret) goto out_unmap; @@ -165,7 +169,6 @@ xprt_rdma_bc_allocate(struct rpc_task *task) return -EINVAL; } - /* svc_rdma_sendto releases this page */ page = alloc_page(RPCRDMA_DEF_GFP); if (!page) return -ENOMEM; @@ -184,6 +187,7 @@ xprt_rdma_bc_free(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; + put_page(virt_to_page(rqst->rq_buffer)); kfree(rqst->rq_rbuffer); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 5caf8e722a11..46ec069150d5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -290,6 +290,7 @@ static void qp_event_handler(struct ib_event *event, void *context) ib_event_msg(event->event), event->event, event->element.qp); set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); break; } } @@ -322,8 +323,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); if (test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) goto out; - svc_xprt_enqueue(&xprt->sc_xprt); - goto out; + goto out_enqueue; flushed: if (wc->status != IB_WC_WR_FLUSH_ERR) @@ -333,6 +333,8 @@ flushed: set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); svc_rdma_put_context(ctxt, 1); +out_enqueue: + svc_xprt_enqueue(&xprt->sc_xprt); out: svc_xprt_put(&xprt->sc_xprt); } @@ -358,6 +360,7 @@ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) if (unlikely(wc->status != IB_WC_SUCCESS)) { set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_xprt_enqueue(&xprt->sc_xprt); if (wc->status != IB_WC_WR_FLUSH_ERR) pr_err("svcrdma: Send: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), @@ -569,8 +572,10 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, case RDMA_CM_EVENT_DEVICE_REMOVAL: dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n", xprt, cma_id); - if (xprt) + if (xprt) { set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_xprt_enqueue(&xprt->sc_xprt); + } break; default: diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index c84e2b644e13..646c24494ea7 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -678,16 +679,14 @@ xprt_rdma_free(struct rpc_task *task) struct rpc_rqst *rqst = task->tk_rqstp; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - if (req->rl_backchannel) + if (test_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags)) return; dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - if (!list_empty(&req->rl_registered)) - ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task)); - rpcrdma_unmap_sges(ia, req); + if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags)) + rpcrdma_release_rqst(r_xprt, req); rpcrdma_buffer_put(req); } @@ -728,7 +727,8 @@ xprt_rdma_send_request(struct rpc_task *task) /* On retransmit, remove any previously registered chunks */ if (unlikely(!list_empty(&req->rl_registered))) - r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, + &req->rl_registered); rc = rpcrdma_marshal_req(r_xprt, rqst); if (rc < 0) @@ -742,6 +742,7 @@ xprt_rdma_send_request(struct rpc_task *task) goto drop_connection; req->rl_connect_cookie = xprt->connect_cookie; + set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) goto drop_connection; @@ -789,11 +790,13 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.failed_marshal_count, r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); - seq_printf(seq, "%lu %lu %lu %lu\n", + seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n", r_xprt->rx_stats.mrs_recovered, r_xprt->rx_stats.mrs_orphaned, r_xprt->rx_stats.mrs_allocated, - r_xprt->rx_stats.local_inv_needed); + r_xprt->rx_stats.local_inv_needed, + r_xprt->rx_stats.empty_sendctx_q, + r_xprt->rx_stats.reply_waits_for_send); } static int diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 11a1fbf7e59e..710b3f77db82 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -49,9 +50,10 @@ #include <linux/interrupt.h> #include <linux/slab.h> -#include <linux/prefetch.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/svc_rdma.h> + +#include <asm-generic/barrier.h> #include <asm/bitops.h> #include <rdma/ib_cm.h> @@ -73,7 +75,7 @@ static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt); static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf); static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); -static struct workqueue_struct *rpcrdma_receive_wq __read_mostly; +struct workqueue_struct *rpcrdma_receive_wq __read_mostly; int rpcrdma_alloc_wq(void) @@ -126,30 +128,17 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_sendctx *sc = + container_of(cqe, struct rpcrdma_sendctx, sc_cqe); + /* WARNING: Only wr_cqe and status are reliable at this point */ if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) pr_err("rpcrdma: Send: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), wc->status, wc->vendor_err); -} - -/* Perform basic sanity checking to avoid using garbage - * to update the credit grant value. 
- */ -static void -rpcrdma_update_granted_credits(struct rpcrdma_rep *rep) -{ - struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf; - __be32 *p = rep->rr_rdmabuf->rg_base; - u32 credits; - credits = be32_to_cpup(p + 2); - if (credits == 0) - credits = 1; /* don't deadlock */ - else if (credits > buffer->rb_max_requests) - credits = buffer->rb_max_requests; - - atomic_set(&buffer->rb_credits, credits); + rpcrdma_sendctx_put_locked(sc); } /** @@ -181,11 +170,8 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rdmab_addr(rep->rr_rdmabuf), wc->byte_len, DMA_FROM_DEVICE); - if (wc->byte_len >= RPCRDMA_HDRLEN_ERR) - rpcrdma_update_granted_credits(rep); - out_schedule: - queue_work(rpcrdma_receive_wq, &rep->rr_work); + rpcrdma_reply_handler(rep); return; out_fail: @@ -295,7 +281,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DISCONNECTED: connstate = -ECONNABORTED; connected: - atomic_set(&xprt->rx_buf.rb_credits, 1); + xprt->rx_buf.rb_credits = 1; ep->rep_connected = connstate; rpcrdma_conn_func(ep); wake_up_all(&ep->rep_connect_wait); @@ -564,16 +550,15 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_recv_sge); /* set trigger for requesting send completion */ - ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; - if (ep->rep_cqinit <= 2) - ep->rep_cqinit = 0; /* always signal? */ - rpcrdma_init_cqcount(ep, 0); + ep->rep_send_batch = min_t(unsigned int, RPCRDMA_MAX_SEND_BATCH, + cdata->max_requests >> 2); + ep->rep_send_count = ep->rep_send_batch; init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); sendcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_send_wr + 1, - 0, IB_POLL_SOFTIRQ); + 1, IB_POLL_WORKQUEUE); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); dprintk("RPC: %s: failed to create send CQ: %i\n", @@ -583,7 +568,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, recvcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_recv_wr + 1, - 0, IB_POLL_SOFTIRQ); + 0, IB_POLL_WORKQUEUE); if (IS_ERR(recvcq)) { rc = PTR_ERR(recvcq); dprintk("RPC: %s: failed to create recv CQ: %i\n", @@ -846,6 +831,168 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ib_drain_qp(ia->ri_id->qp); } +/* Fixed-size circular FIFO queue. This implementation is wait-free and + * lock-free. + * + * Consumer is the code path that posts Sends. This path dequeues a + * sendctx for use by a Send operation. Multiple consumer threads + * are serialized by the RPC transport lock, which allows only one + * ->send_request call at a time. + * + * Producer is the code path that handles Send completions. This path + * enqueues a sendctx that has been completed. Multiple producer + * threads are serialized by the ib_poll_cq() function. + */ + +/* rpcrdma_sendctxs_destroy() assumes caller has already quiesced + * queue activity, and ib_drain_qp has flushed all remaining Send + * requests. 
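+ * With the producer and consumer both idle, the ctx array can
+ * be torn down without any serialization.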
+ */ +static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf) +{ + unsigned long i; + + for (i = 0; i <= buf->rb_sc_last; i++) + kfree(buf->rb_sc_ctxs[i]); + kfree(buf->rb_sc_ctxs); +} + +static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia) +{ + struct rpcrdma_sendctx *sc; + + sc = kzalloc(sizeof(*sc) + + ia->ri_max_send_sges * sizeof(struct ib_sge), + GFP_KERNEL); + if (!sc) + return NULL; + + sc->sc_wr.wr_cqe = &sc->sc_cqe; + sc->sc_wr.sg_list = sc->sc_sges; + sc->sc_wr.opcode = IB_WR_SEND; + sc->sc_cqe.done = rpcrdma_wc_send; + return sc; +} + +static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_sendctx *sc; + unsigned long i; + + /* Maximum number of concurrent outstanding Send WRs. Capping + * the circular queue size stops Send Queue overflow by causing + * the ->send_request call to fail temporarily before too many + * Sends are posted. + */ + i = buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; + dprintk("RPC: %s: allocating %lu send_ctxs\n", __func__, i); + buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); + if (!buf->rb_sc_ctxs) + return -ENOMEM; + + buf->rb_sc_last = i - 1; + for (i = 0; i <= buf->rb_sc_last; i++) { + sc = rpcrdma_sendctx_create(&r_xprt->rx_ia); + if (!sc) + goto out_destroy; + + sc->sc_xprt = r_xprt; + buf->rb_sc_ctxs[i] = sc; + } + + return 0; + +out_destroy: + rpcrdma_sendctxs_destroy(buf); + return -ENOMEM; +} + +/* The sendctx queue is not guaranteed to have a size that is a + * power of two, thus the helpers in circ_buf.h cannot be used. + * The other option is to use modulus (%), which can be expensive. + */ +static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf, + unsigned long item) +{ + return likely(item < buf->rb_sc_last) ? item + 1 : 0; +} + +/** + * rpcrdma_sendctx_get_locked - Acquire a send context + * @buf: transport buffers from which to acquire an unused context + * + * Returns pointer to a free send completion context; or NULL if + * the queue is empty. + * + * Usage: Called to acquire an SGE array before preparing a Send WR. + * + * The caller serializes calls to this function (per rpcrdma_buffer), + * and provides an effective memory barrier that flushes the new value + * of rb_sc_head. + */ +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_xprt *r_xprt; + struct rpcrdma_sendctx *sc; + unsigned long next_head; + + next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head); + + if (next_head == READ_ONCE(buf->rb_sc_tail)) + goto out_emptyq; + + /* ORDER: item must be accessed _before_ head is updated */ + sc = buf->rb_sc_ctxs[next_head]; + + /* Releasing the lock in the caller acts as a memory + * barrier that flushes rb_sc_head. + */ + buf->rb_sc_head = next_head; + + return sc; + +out_emptyq: + /* The queue is "empty" if there have not been enough Send + * completions recently. This is a sign the Send Queue is + * backing up. Cause the caller to pause and try again. + */ + dprintk("RPC: %s: empty sendctx queue\n", __func__); + r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); + r_xprt->rx_stats.empty_sendctx_q++; + return NULL; +} + +/** + * rpcrdma_sendctx_put_locked - Release a send context + * @sc: send context to release + * + * Usage: Called from Send completion to return a sendctxt + * to the queue. + * + * The caller serializes calls to this function (per rpcrdma_buffer). 
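+ *
+ * Since only every rep_send_batch-th Send is signaled, one
+ * completion may retire several sendctxs: the walk below unmaps
+ * everything from rb_sc_tail up to @sc. For example, with
+ * rb_sc_last == 3 the indices cycle 0->1->2->3->0, and the ring
+ * is exhausted when the slot after rb_sc_head is still owned
+ * by rb_sc_tail.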
+ */ +void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) +{ + struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf; + unsigned long next_tail; + + /* Unmap SGEs of previously completed by unsignaled + * Sends by walking up the queue until @sc is found. + */ + next_tail = buf->rb_sc_tail; + do { + next_tail = rpcrdma_sendctx_next(buf, next_tail); + + /* ORDER: item must be accessed _before_ tail is updated */ + rpcrdma_unmap_sendctx(buf->rb_sc_ctxs[next_tail]); + + } while (buf->rb_sc_ctxs[next_tail] != sc); + + /* Paired with READ_ONCE */ + smp_store_release(&buf->rb_sc_tail, next_tail); +} + static void rpcrdma_mr_recovery_worker(struct work_struct *work) { @@ -941,13 +1088,8 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) spin_lock(&buffer->rb_reqslock); list_add(&req->rl_all, &buffer->rb_allreqs); spin_unlock(&buffer->rb_reqslock); - req->rl_cqe.done = rpcrdma_wc_send; req->rl_buffer = &r_xprt->rx_buf; INIT_LIST_HEAD(&req->rl_registered); - req->rl_send_wr.next = NULL; - req->rl_send_wr.wr_cqe = &req->rl_cqe; - req->rl_send_wr.sg_list = req->rl_send_sge; - req->rl_send_wr.opcode = IB_WR_SEND; return req; } @@ -974,7 +1116,7 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) rep->rr_cqe.done = rpcrdma_wc_receive; rep->rr_rxprt = r_xprt; - INIT_WORK(&rep->rr_work, rpcrdma_reply_handler); + INIT_WORK(&rep->rr_work, rpcrdma_deferred_completion); rep->rr_recv_wr.next = NULL; rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; @@ -995,7 +1137,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; - atomic_set(&buf->rb_credits, 1); spin_lock_init(&buf->rb_mwlock); spin_lock_init(&buf->rb_lock); spin_lock_init(&buf->rb_recovery_lock); @@ -1022,7 +1163,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) rc = PTR_ERR(req); goto out; } - req->rl_backchannel = false; list_add(&req->rl_list, &buf->rb_send_bufs); } @@ -1040,6 +1180,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) list_add(&rep->rr_list, &buf->rb_recv_bufs); } + rc = rpcrdma_sendctxs_create(r_xprt); + if (rc) + goto out; + return 0; out: rpcrdma_buffer_destroy(buf); @@ -1116,6 +1260,8 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) cancel_delayed_work_sync(&buf->rb_recovery_worker); cancel_delayed_work_sync(&buf->rb_refresh_worker); + rpcrdma_sendctxs_destroy(buf); + while (!list_empty(&buf->rb_recv_bufs)) { struct rpcrdma_rep *rep; @@ -1231,7 +1377,6 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) struct rpcrdma_buffer *buffers = req->rl_buffer; struct rpcrdma_rep *rep = req->rl_reply; - req->rl_send_wr.num_sge = 0; req->rl_reply = NULL; spin_lock(&buffers->rb_lock); @@ -1363,7 +1508,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_req *req) { - struct ib_send_wr *send_wr = &req->rl_send_wr; + struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr; struct ib_send_wr *send_wr_fail; int rc; @@ -1377,7 +1522,14 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, dprintk("RPC: %s: posting %d s/g entries\n", __func__, send_wr->num_sge); - rpcrdma_set_signaled(ep, send_wr); + if (!ep->rep_send_count || + test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { + send_wr->send_flags |= IB_SEND_SIGNALED; + ep->rep_send_count = ep->rep_send_batch; + } else { + send_wr->send_flags &= ~IB_SEND_SIGNALED; + --ep->rep_send_count; + } rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); if (rc) goto out_postsend_err; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h 
b/net/sunrpc/xprtrdma/xprt_rdma.h index e26a97d2f922..51686d9eac5f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -1,4 +1,5 @@ /* + * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -93,8 +94,8 @@ enum { */ struct rpcrdma_ep { - atomic_t rep_cqcount; - int rep_cqinit; + unsigned int rep_send_count; + unsigned int rep_send_batch; int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; @@ -104,25 +105,6 @@ struct rpcrdma_ep { struct delayed_work rep_connect_worker; }; -static inline void -rpcrdma_init_cqcount(struct rpcrdma_ep *ep, int count) -{ - atomic_set(&ep->rep_cqcount, ep->rep_cqinit - count); -} - -/* To update send queue accounting, provider must take a - * send completion every now and then. - */ -static inline void -rpcrdma_set_signaled(struct rpcrdma_ep *ep, struct ib_send_wr *send_wr) -{ - send_wr->send_flags = 0; - if (unlikely(atomic_sub_return(1, &ep->rep_cqcount) <= 0)) { - rpcrdma_init_cqcount(ep, 0); - send_wr->send_flags = IB_SEND_SIGNALED; - } -} - /* Pre-allocate extra Work Requests for handling backward receives * and sends. This is a fixed value because the Work Queues are * allocated when the forward channel is set up. @@ -164,12 +146,6 @@ rdmab_lkey(struct rpcrdma_regbuf *rb) return rb->rg_iov.lkey; } -static inline struct rpcrdma_msg * -rdmab_to_msg(struct rpcrdma_regbuf *rb) -{ - return (struct rpcrdma_msg *)rb->rg_base; -} - static inline struct ib_device * rdmab_device(struct rpcrdma_regbuf *rb) { @@ -202,22 +178,24 @@ enum { }; /* - * struct rpcrdma_rep -- this structure encapsulates state required to recv - * and complete a reply, asychronously. It needs several pieces of - * state: - * o recv buffer (posted to provider) - * o ib_sge (also donated to provider) - * o status of reply (length, success or not) - * o bookkeeping state to get run by reply handler (list, etc) + * struct rpcrdma_rep -- this structure encapsulates state required + * to receive and complete an RPC Reply, asychronously. It needs + * several pieces of state: * - * These are allocated during initialization, per-transport instance. + * o receive buffer and ib_sge (donated to provider) + * o status of receive (success or not, length, inv rkey) + * o bookkeeping state to get run by reply handler (XDR stream) * - * N of these are associated with a transport instance, and stored in - * struct rpcrdma_buffer. N is the max number of outstanding requests. + * These structures are allocated during transport initialization. + * N of these are associated with a transport instance, managed by + * struct rpcrdma_buffer. N is the max number of outstanding RPCs. 
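+ *
+ * rr_xid, rr_vers and rr_proc cache the fixed transport header
+ * fields, so a reply can be matched to its rqst and completed
+ * without re-parsing the header.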
*/ struct rpcrdma_rep { struct ib_cqe rr_cqe; + __be32 rr_xid; + __be32 rr_vers; + __be32 rr_proc; int rr_wc_flags; u32 rr_inv_rkey; struct rpcrdma_regbuf *rr_rdmabuf; @@ -225,10 +203,34 @@ struct rpcrdma_rep { struct work_struct rr_work; struct xdr_buf rr_hdrbuf; struct xdr_stream rr_stream; + struct rpc_rqst *rr_rqst; struct list_head rr_list; struct ib_recv_wr rr_recv_wr; }; +/* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes + */ +struct rpcrdma_req; +struct rpcrdma_xprt; +struct rpcrdma_sendctx { + struct ib_send_wr sc_wr; + struct ib_cqe sc_cqe; + struct rpcrdma_xprt *sc_xprt; + struct rpcrdma_req *sc_req; + unsigned int sc_unmap_count; + struct ib_sge sc_sges[]; +}; + +/* Limit the number of SGEs that can be unmapped during one + * Send completion. This caps the amount of work a single + * completion can do before returning to the provider. + * + * Setting this to zero disables Send completion batching. + */ +enum { + RPCRDMA_MAX_SEND_BATCH = 7, +}; + /* * struct rpcrdma_mw - external memory region metadata * @@ -340,26 +342,30 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_list; - unsigned int rl_mapped_sges; unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; struct xdr_stream rl_stream; struct xdr_buf rl_hdrbuf; - struct ib_send_wr rl_send_wr; - struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES]; + struct rpcrdma_sendctx *rl_sendctx; struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */ struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */ - struct ib_cqe rl_cqe; struct list_head rl_all; - bool rl_backchannel; + unsigned long rl_flags; struct list_head rl_registered; /* registered segments */ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; +/* rl_flags */ +enum { + RPCRDMA_REQ_F_BACKCHANNEL = 0, + RPCRDMA_REQ_F_PENDING, + RPCRDMA_REQ_F_TX_RESOURCES, +}; + static inline void rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) { @@ -399,12 +405,17 @@ struct rpcrdma_buffer { struct list_head rb_mws; struct list_head rb_all; + unsigned long rb_sc_head; + unsigned long rb_sc_tail; + unsigned long rb_sc_last; + struct rpcrdma_sendctx **rb_sc_ctxs; + spinlock_t rb_lock; /* protect buf lists */ int rb_send_count, rb_recv_count; struct list_head rb_send_bufs; struct list_head rb_recv_bufs; u32 rb_max_requests; - atomic_t rb_credits; /* most recent credit grant */ + u32 rb_credits; /* most recent credit grant */ u32 rb_bc_srv_max_requests; spinlock_t rb_reqslock; /* protect rb_allreqs */ @@ -453,10 +464,12 @@ struct rpcrdma_stats { unsigned long mrs_recovered; unsigned long mrs_orphaned; unsigned long mrs_allocated; + unsigned long empty_sendctx_q; /* accessed when receiving a reply */ unsigned long long total_rdma_reply; unsigned long long fixup_copy_count; + unsigned long reply_waits_for_send; unsigned long local_inv_needed; unsigned long nomsg_call_count; unsigned long bcall_count; @@ -473,8 +486,6 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mw **); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct list_head *); - void (*ro_unmap_safe)(struct rpcrdma_xprt *, - struct rpcrdma_req *, bool); void (*ro_recover_mr)(struct rpcrdma_mw *); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, @@ -532,6 +543,8 @@ void rpcrdma_ia_close(struct rpcrdma_ia *); bool frwr_is_supported(struct rpcrdma_ia *); bool fmr_is_supported(struct rpcrdma_ia *); +extern struct workqueue_struct *rpcrdma_receive_wq; + /* * Endpoint calls - 
xprtrdma/verbs.c */ @@ -554,6 +567,8 @@ struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *); void rpcrdma_destroy_req(struct rpcrdma_req *); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); +void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); @@ -610,12 +625,18 @@ enum rpcrdma_chunktype { rpcrdma_replych }; -bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *, - u32, struct xdr_buf *, enum rpcrdma_chunktype); -void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); +int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 hdrlen, + struct xdr_buf *xdr, + enum rpcrdma_chunktype rtype); +void rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); -void rpcrdma_reply_handler(struct work_struct *work); +void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); +void rpcrdma_reply_handler(struct rpcrdma_rep *rep); +void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req); +void rpcrdma_deferred_completion(struct work_struct *work); static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) { diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4dad5da388d6..9cc850c2719e 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -552,6 +552,7 @@ static int xs_local_send_request(struct rpc_task *task) default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); + /* fall through */ case -EPIPE: xs_close(xprt); status = -ENOTCONN; @@ -1611,6 +1612,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->connect_cookie++; clear_bit(XPRT_CONNECTED, &xprt->state); xs_tcp_force_close(xprt); + /* fall through */ case TCP_CLOSING: /* * If the server closed down the connection, make sure that @@ -2368,6 +2370,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) switch (ret) { case 0: xs_set_srcport(transport, sock); + /* fall through */ case -EINPROGRESS: /* SYN_SENT! */ if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) @@ -2419,6 +2422,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) default: printk("%s: connect returned unhandled error %d\n", __func__, status); + /* fall through */ case -EADDRNOTAVAIL: /* We're probably in TIME_WAIT. 
Get rid of existing socket, * and retry diff --git a/net/tipc/group.c b/net/tipc/group.c index 7821085a7dd8..12777cac638a 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -539,8 +539,8 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, tipc_group_proto_xmit(grp, m, GRP_ACK_MSG, xmitq); if (leave) { - tipc_group_delete_member(grp, m); __skb_queue_purge(defq); + tipc_group_delete_member(grp, m); break; } if (!update) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 1649d456e22d..b0d07b35909d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -174,7 +174,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (fragid == LAST_FRAGMENT) { TIPC_SKB_CB(head)->validated = false; - if (unlikely(!tipc_msg_validate(head))) + if (unlikely(!tipc_msg_validate(&head))) goto err; *buf = head; TIPC_SKB_CB(head)->tail = NULL; @@ -201,11 +201,21 @@ err: * TIPC will ignore the excess, under the assumption that it is optional info * introduced by a later release of the protocol. */ -bool tipc_msg_validate(struct sk_buff *skb) +bool tipc_msg_validate(struct sk_buff **_skb) { - struct tipc_msg *msg; + struct sk_buff *skb = *_skb; + struct tipc_msg *hdr; int msz, hsz; + /* Ensure that flow control ratio condition is satisfied */ + if (unlikely(skb->truesize / buf_roundup_len(skb) > 4)) { + skb = skb_copy(skb, GFP_ATOMIC); + if (!skb) + return false; + kfree_skb(*_skb); + *_skb = skb; + } + if (unlikely(TIPC_SKB_CB(skb)->validated)) return true; if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE))) @@ -217,11 +227,11 @@ bool tipc_msg_validate(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, hsz))) return false; - msg = buf_msg(skb); - if (unlikely(msg_version(msg) != TIPC_VERSION)) + hdr = buf_msg(skb); + if (unlikely(msg_version(hdr) != TIPC_VERSION)) return false; - msz = msg_size(msg); + msz = msg_size(hdr); if (unlikely(msz < hsz)) return false; if (unlikely((msz - hsz) > TIPC_MAX_USER_MSG_SIZE)) @@ -411,7 +421,7 @@ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) skb_pull(*iskb, offset); imsz = msg_size(buf_msg(*iskb)); skb_trim(*iskb, imsz); - if (unlikely(!tipc_msg_validate(*iskb))) + if (unlikely(!tipc_msg_validate(iskb))) goto none; *pos += align(imsz); return true; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index bf8f57ccc70c..3e4384c222f7 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -926,7 +926,7 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) } struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); -bool tipc_msg_validate(struct sk_buff *skb); +bool tipc_msg_validate(struct sk_buff **_skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, struct sk_buff_head *xmitq); @@ -954,6 +954,11 @@ static inline u16 buf_seqno(struct sk_buff *skb) return msg_seqno(buf_msg(skb)); } +static inline int buf_roundup_len(struct sk_buff *skb) +{ + return (skb->len / 1024 + 1) * 1024; +} + /* tipc_skb_peek(): peek and reserve first buffer in list * @list: list to be peeked in * Returns pointer to first buffer in list, if any diff --git a/net/tipc/node.c b/net/tipc/node.c index 009a81631280..507017fe0f1b 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1539,7 +1539,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) __skb_queue_head_init(&xmitq); /* Ensure message is well-formed before touching the header */ - if (unlikely(!tipc_msg_validate(skb))) + if (unlikely(!tipc_msg_validate(&skb))) goto 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index bb16f1ec766e..b1ac23ca20c8 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -2605,10 +2605,32 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 			goto nla_put_failure;
 	}
 
-	if (wdev->ssid_len) {
-		if (nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid))
+	wdev_lock(wdev);
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_AP:
+		if (wdev->ssid_len &&
+		    nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid))
 			goto nla_put_failure;
+		break;
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
+	case NL80211_IFTYPE_ADHOC: {
+		const u8 *ssid_ie;
+		if (!wdev->current_bss)
+			break;
+		ssid_ie = ieee80211_bss_get_ie(&wdev->current_bss->pub,
+					       WLAN_EID_SSID);
+		if (!ssid_ie)
+			break;
+		if (nla_put(msg, NL80211_ATTR_SSID, ssid_ie[1], ssid_ie + 2))
+			goto nla_put_failure;
+		break;
+	}
+	default:
+		/* nothing */
+		break;
 	}
+	wdev_unlock(wdev);
 
 	genlmsg_end(msg, hdr);
 	return 0;
@@ -6291,7 +6313,7 @@ static int nl80211_send_regdom(struct sk_buff *msg, struct netlink_callback *cb,
 	if (!hdr)
 		return -1;
 
-	genl_dump_check_consistent(cb, hdr, &nl80211_fam);
+	genl_dump_check_consistent(cb, hdr);
 
 	if (nl80211_put_regdom(regdom, msg))
 		goto nla_put_failure;
@@ -7722,7 +7744,7 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 	if (!hdr)
 		return -1;
 
-	genl_dump_check_consistent(cb, hdr, &nl80211_fam);
+	genl_dump_check_consistent(cb, hdr);
 
 	if (nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->bss_generation))
 		goto nla_put_failure;
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 3871998059de..78e71b0390be 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -3644,27 +3644,14 @@ void regulatory_propagate_dfs_state(struct wiphy *wiphy,
 	}
 }
 
-int __init regulatory_init(void)
+static int __init regulatory_init_db(void)
 {
-	int err = 0;
+	int err;
 
 	err = load_builtin_regdb_keys();
 	if (err)
 		return err;
 
-	reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0);
-	if (IS_ERR(reg_pdev))
-		return PTR_ERR(reg_pdev);
-
-	spin_lock_init(&reg_requests_lock);
-	spin_lock_init(&reg_pending_beacons_lock);
-	spin_lock_init(&reg_indoor_lock);
-
-	rcu_assign_pointer(cfg80211_regdomain, cfg80211_world_regdom);
-
-	user_alpha2[0] = '9';
-	user_alpha2[1] = '7';
-
 	/* We always try to get an update for the static regdomain */
 	err = regulatory_hint_core(cfg80211_world_regdom->alpha2);
 	if (err) {
@@ -3692,6 +3679,31 @@ int __init regulatory_init(void)
 
 	return 0;
 }
+
+#ifndef MODULE
+late_initcall(regulatory_init_db);
+#endif
+
+int __init regulatory_init(void)
+{
+	reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0);
+	if (IS_ERR(reg_pdev))
+		return PTR_ERR(reg_pdev);
+
+	spin_lock_init(&reg_requests_lock);
+	spin_lock_init(&reg_pending_beacons_lock);
+	spin_lock_init(&reg_indoor_lock);
+
+	rcu_assign_pointer(cfg80211_regdomain, cfg80211_world_regdom);
+
+	user_alpha2[0] = '9';
+	user_alpha2[1] = '7';
+
+#ifdef MODULE
+	return regulatory_init_db();
+#else
+	return 0;
+#endif
+}
 
 void regulatory_exit(void)
 {
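Note on the reg.c rework above: regulatory bring-up is split into a cheap phase (device registration, lock and default setup) that runs at the normal init point, and a keys/database phase that needs the filesystem and is therefore deferred to late_initcall() when cfg80211 is built in; a modular cfg80211 loads late anyway and has no initcall ordering, so it runs the second phase inline. A kernel-side sketch of the same two-phase pattern, under hypothetical feature_init()/feature_init_db() names with placeholder bodies, not the cfg80211 code:

#include <linux/init.h>
#include <linux/module.h>

static int __init feature_init_db(void)
{
	/* heavy part: e.g. load keys/firmware from the filesystem */
	return 0;
}

#ifndef MODULE
/* built in: defer until late in boot, when the rootfs is available */
late_initcall(feature_init_db);
#endif

int __init feature_init(void)
{
	/* cheap part: register devices, init locks, set defaults */

#ifdef MODULE
	/* modular load already happens late; run phase two inline */
	return feature_init_db();
#else
	return 0;	/* phase two runs via late_initcall() */
#endif
}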
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 2f57722f5d03..9542975eb2f9 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1305,6 +1305,7 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
 		newp->xfrm_nr = old->xfrm_nr;
 		newp->index = old->index;
 		newp->type = old->type;
+		newp->family = old->family;
 		memcpy(newp->xfrm_vec, old->xfrm_vec,
 		       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
 		spin_lock_bh(&net->xfrm.xfrm_policy_lock);
@@ -1360,29 +1361,36 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
 	struct net *net = xp_net(policy);
 	int nx;
 	int i, error;
+	xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
+	xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
 	xfrm_address_t tmp;
 
 	for (nx = 0, i = 0; i < policy->xfrm_nr; i++) {
 		struct xfrm_state *x;
-		xfrm_address_t *local;
-		xfrm_address_t *remote;
+		xfrm_address_t *remote = daddr;
+		xfrm_address_t *local = saddr;
 		struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];
 
-		remote = &tmpl->id.daddr;
-		local = &tmpl->saddr;
-		if (xfrm_addr_any(local, tmpl->encap_family)) {
-			error = xfrm_get_saddr(net, fl->flowi_oif,
-					       &tmp, remote,
-					       tmpl->encap_family, 0);
-			if (error)
-				goto fail;
-			local = &tmp;
+		if (tmpl->mode == XFRM_MODE_TUNNEL ||
+		    tmpl->mode == XFRM_MODE_BEET) {
+			remote = &tmpl->id.daddr;
+			local = &tmpl->saddr;
+			if (xfrm_addr_any(local, tmpl->encap_family)) {
+				error = xfrm_get_saddr(net, fl->flowi_oif,
+						       &tmp, remote,
+						       tmpl->encap_family, 0);
+				if (error)
+					goto fail;
+				local = &tmp;
+			}
 		}
 
 		x = xfrm_state_find(remote, local, fl, tmpl, policy,
				    &error, family);
 		if (x && x->km.state == XFRM_STATE_VALID) {
 			xfrm[nx++] = x;
+			daddr = remote;
+			saddr = local;
 			continue;
 		}
 		if (x) {
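Note on the xfrm_tmpl_resolve_one() change above: transport-mode templates now inherit their endpoint addresses from the flow (or from the previously resolved state in the bundle), and only tunnel/BEET-mode templates substitute the endpoints configured in the template; after each successful lookup the resolved addresses re-anchor the search for the next template. A simplified userspace model of that address-selection walk follows; the types and sample addresses are illustrative stand-ins, not the kernel's xfrm structures.

#include <stdio.h>

enum mode { MODE_TRANSPORT, MODE_TUNNEL };

struct tmpl {
	enum mode mode;
	const char *daddr;	/* only meaningful for tunnel mode */
	const char *saddr;
};

int main(void)
{
	/* flow addresses, as xfrm_flowi_daddr()/xfrm_flowi_saddr() extract them */
	const char *daddr = "10.0.0.2", *saddr = "10.0.0.1";

	struct tmpl chain[] = {
		{ MODE_TRANSPORT, NULL, NULL },		   /* transport hop: no own addresses */
		{ MODE_TUNNEL, "192.0.2.2", "192.0.2.1" }, /* tunnel hop: template endpoints */
	};

	for (size_t i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
		/* default: keep the flow's (or last resolved) addresses */
		const char *remote = daddr, *local = saddr;

		/* only tunnel-style templates override the endpoints */
		if (chain[i].mode == MODE_TUNNEL) {
			remote = chain[i].daddr;
			local = chain[i].saddr;
		}
		printf("tmpl %zu: %s -> %s\n", i, local, remote);

		/* a resolved state re-anchors the addresses for the next template */
		daddr = remote;
		saddr = local;
	}
	return 0;
}

Run, this prints the transport hop using the flow's own addresses and the tunnel hop using the template's, matching the order the kernel walks the template vector.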