diff options
Diffstat (limited to 'net/ipv4/ip_output.c')
-rw-r--r-- | net/ipv4/ip_output.c | 107 |
1 files changed, 62 insertions, 45 deletions
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b06f678b03a1..a2705d454fd6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -75,8 +75,8 @@ #include <net/checksum.h> #include <net/gso.h> #include <net/inetpeer.h> -#include <net/inet_ecn.h> #include <net/lwtunnel.h> +#include <net/inet_dscp.h> #include <linux/bpf-cgroup.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> @@ -198,7 +198,7 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = (struct rtable *)dst; + struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; @@ -475,26 +475,18 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, goto packet_routed; /* Make sure we can route this packet. */ - rt = (struct rtable *)__sk_dst_check(sk, 0); + rt = dst_rtable(__sk_dst_check(sk, 0)); if (!rt) { - __be32 daddr; + inet_sk_init_flowi4(inet, fl4); - /* Use correct destination address if we have options. */ - daddr = inet->inet_daddr; - if (inet_opt && inet_opt->opt.srr) - daddr = inet_opt->opt.faddr; + /* sctp_v4_xmit() uses its own DSCP value */ + fl4->flowi4_tos = tos & INET_DSCP_MASK; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ - rt = ip_route_output_ports(net, fl4, sk, - daddr, inet->inet_saddr, - inet->inet_dport, - inet->inet_sport, - sk->sk_protocol, - RT_CONN_FLAGS_TOS(sk, tos), - sk->sk_bound_dev_if); + rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; sk_setup_caps(sk, &rt->dst); @@ -764,7 +756,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, { struct iphdr *iph; struct sk_buff *skb2; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; @@ -856,7 +848,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, } } - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, skb); if (!err) @@ -912,7 +904,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - skb_set_delivery_time(skb2, tstamp, mono_delivery_time); + skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, skb2); if (err) goto fail; @@ -971,9 +963,9 @@ static int __ip_append_data(struct sock *sk, bool zc = false; unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; - struct rtable *rt = (struct rtable *)cork->dst; + struct rtable *rt = dst_rtable(cork->dst); + bool paged, hold_tskey = false, extra_uref = false; unsigned int wmem_alloc_delta = 0; - bool paged, extra_uref = false; u32 tskey = 0; skb = skb_peek_tail(queue); @@ -982,10 +974,6 @@ static int __ip_append_data(struct sock *sk, mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; paged = !!cork->gso_size; - if (cork->tx_flags & SKBTX_ANY_TSTAMP && - READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) - tskey = atomic_inc_return(&sk->sk_tskey) - 1; - hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); @@ -1026,7 +1014,8 @@ static int __ip_append_data(struct sock *sk, uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), + false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ @@ -1052,6 +1041,16 @@ static int __ip_append_data(struct sock *sk, cork->length += length; + if (cork->tx_flags & SKBTX_ANY_TSTAMP && + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { + if (cork->flags & IPCORK_TS_OPT_ID) { + tskey = cork->ts_opt_id; + } else { + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + hold_tskey = true; + } + } + /* So, what's going on in the loop below? * * We use calculated fragment length to generate chained skb, @@ -1162,7 +1161,10 @@ alloc_new_skb: /* [!] NOTE: copy will be negative if pagedlen>0 * because then the equation reduces to -fraggap. */ - if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + if (copy > 0 && + INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, data + transhdrlen, offset, + copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; @@ -1206,8 +1208,9 @@ alloc_new_skb: unsigned int off; off = skb->len; - if (getfrag(from, skb_put(skb, copy), - offset, copy, off, skb) < 0) { + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; @@ -1245,7 +1248,8 @@ alloc_new_skb: get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); - if (getfrag(from, + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; @@ -1274,6 +1278,8 @@ error: cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); + if (hold_tskey) + atomic_dec(&sk->sk_tskey); return err; } @@ -1287,6 +1293,12 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, if (unlikely(!rt)) return -EFAULT; + cork->fragsize = ip_sk_use_pmtu(sk) ? + dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); + + if (!inetdev_valid_mtu(cork->fragsize)) + return -ENETUNREACH; + /* * setup for corking. */ @@ -1303,12 +1315,6 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->addr = ipc->addr; } - cork->fragsize = ip_sk_use_pmtu(sk) ? - dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); - - if (!inetdev_valid_mtu(cork->fragsize)) - return -ENETUNREACH; - cork->gso_size = ipc->gso_size; cork->dst = &rt->dst; @@ -1319,10 +1325,14 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->ttl = ipc->ttl; cork->tos = ipc->tos; cork->mark = ipc->sockc.mark; - cork->priority = ipc->priority; + cork->priority = ipc->sockc.priority; cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; - sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags); + sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags); + if (ipc->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { + cork->flags |= IPCORK_TS_OPT_ID; + cork->ts_opt_id = ipc->sockc.ts_opt_id; + } return 0; } @@ -1387,7 +1397,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options *opt = NULL; - struct rtable *rt = (struct rtable *)cork->dst; + struct rtable *rt = dst_rtable(cork->dst); struct iphdr *iph; u8 pmtudisc, ttl; __be16 df = 0; @@ -1452,9 +1462,12 @@ struct sk_buff *__ip_make_skb(struct sock *sk, ip_options_build(skb, opt, cork->addr, rt); } - skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority); + skb->priority = cork->priority; skb->mark = cork->mark; - skb->tstamp = cork->transmit_time; + if (sk_is_tcp(sk)) + skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC); + else + skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid); /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount @@ -1470,7 +1483,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, * by icmp_hdr(skb)->type. */ if (sk->sk_type == SOCK_RAW && - !inet_test_bit(HDRINCL, sk)) + !(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH)) icmp_type = fl4->fl4_icmp_type; else icmp_type = icmp_hdr(skb)->type; @@ -1580,7 +1593,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, * Generic function to send a packet as reply to another packet. * Used to send some TCP resets/acks so far. */ -void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, + struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, @@ -1615,7 +1629,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark, - RT_TOS(arg->tos), + arg->tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, @@ -1626,7 +1640,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, if (IS_ERR(rt)) return; - inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK; + inet_sk(sk)->tos = arg->tos; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; @@ -1646,7 +1660,10 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - nskb->mono_delivery_time = !!transmit_time; + if (orig_sk) + skb_set_owner_edemux(nskb, (struct sock *)orig_sk); + if (transmit_time) + nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4); ip_push_pending_frames(sk, &fl4); |