summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2020-06-13 16:27:13 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2020-06-13 16:27:13 -0700
commit 96144c58abe7ff767e754b5b80995f7b8846d49b (patch)
tree 7fcc47090ced9be71fa35cbf5e00d0160b04a2d1 /net/ipv4
parent f82e7b57b5fc48199e2f26ffafe2f96f7338ad3d (diff)
parent bc139119a1708ae3db1ebb379630f286e28d06e8 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Pull networking fixes from David Miller: 1) Fix cfg80211 deadlock, from Johannes Berg. 2) RXRPC fails to send notifications, from David Howells. 3) MPTCP RM_ADDR parsing has an off by one pointer error, fix from Geliang Tang. 4) Fix crash when using MSG_PEEK with sockmap, from Anny Hu. 5) The ucc_geth driver needs __netdev_watchdog_up exported, from Valentin Longchamp. 6) Fix hashtable memory leak in dccp, from Wang Hai. 7) Fix how nexthops are marked as FDB nexthops, from David Ahern. 8) Fix mptcp races between shutdown and recvmsg, from Paolo Abeni. 9) Fix crashes in tipc_disc_rcv(), from Tuong Lien. 10) Fix link speed reporting in iavf driver, from Brett Creeley. 11) When a channel is used for XSK and then reused again later for XSK, we forget to clear out the relevant data structures in mlx5 which causes all kinds of problems. Fix from Maxim Mikityanskiy. 12) Fix memory leak in genetlink, from Cong Wang. 13) Disallow sockmap attachments to UDP sockets, it simply won't work. From Lorenz Bauer. 
* git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net: (83 commits) net: ethernet: ti: ale: fix allmulti for nu type ale net: ethernet: ti: am65-cpsw-nuss: fix ale parameters init net: atm: Remove the error message according to the atomic context bpf: Undo internal BPF_PROBE_MEM in BPF insns dump libbpf: Support pre-initializing .bss global variables tools/bpftool: Fix skeleton codegen bpf: Fix memlock accounting for sock_hash bpf: sockmap: Don't attach programs to UDP sockets bpf: tcp: Recv() should return 0 when the peer socket is closed ibmvnic: Flush existing work items before device removal genetlink: clean up family attributes allocations net: ipa: header pad field only valid for AP->modem endpoint net: ipa: program upper nibbles of sequencer type net: ipa: fix modem LAN RX endpoint id net: ipa: program metadata mask differently ionic: add pcie_print_link_status rxrpc: Fix race between incoming ACK parser and retransmitter net/mlx5: E-Switch, Fix some error pointer dereferences net/mlx5: Don't fail driver on failure to create debugfs net/mlx5e: CT: Fix ipv6 nat header rewrite actions ...
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/nexthop.c82
-rw-r--r--net/ipv4/tcp.c70
-rw-r--r--net/ipv4/tcp_bpf.c6
3 files changed, 118 insertions, 40 deletions
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 400a9f89ebdb..cc8049b100b2 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -247,12 +247,11 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
if (nla_put_u32(skb, NHA_ID, nh->id))
goto nla_put_failure;
- if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB))
- goto nla_put_failure;
-
if (nh->is_group) {
struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+ if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
+ goto nla_put_failure;
if (nla_put_nh_group(skb, nhg))
goto nla_put_failure;
goto out;
@@ -264,7 +263,10 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
if (nla_put_flag(skb, NHA_BLACKHOLE))
goto nla_put_failure;
goto out;
- } else if (!nh->is_fdb_nh) {
+ } else if (nhi->fdb_nh) {
+ if (nla_put_flag(skb, NHA_FDB))
+ goto nla_put_failure;
+ } else {
const struct net_device *dev;
dev = nhi->fib_nhc.nhc_dev;
@@ -385,7 +387,7 @@ errout:
}
static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
- struct netlink_ext_ack *extack)
+ bool *is_fdb, struct netlink_ext_ack *extack)
{
if (nh->is_group) {
struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
@@ -398,6 +400,7 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
"Multipath group can not be a nexthop within a group");
return false;
}
+ *is_fdb = nhg->fdb_nh;
} else {
struct nh_info *nhi = rtnl_dereference(nh->nh_info);
@@ -406,6 +409,7 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
"Blackhole nexthop can not be used in a group with more than 1 path");
return false;
}
+ *is_fdb = nhi->fdb_nh;
}
return true;
@@ -416,12 +420,13 @@ static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
{
struct nh_info *nhi;
- if (!nh->is_fdb_nh) {
+ nhi = rtnl_dereference(nh->nh_info);
+
+ if (!nhi->fdb_nh) {
NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
return -EINVAL;
}
- nhi = rtnl_dereference(nh->nh_info);
if (*nh_family == AF_UNSPEC) {
*nh_family = nhi->family;
} else if (*nh_family != nhi->family) {
@@ -473,19 +478,20 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
nhg = nla_data(tb[NHA_GROUP]);
for (i = 0; i < len; ++i) {
struct nexthop *nh;
+ bool is_fdb_nh;
nh = nexthop_find_by_id(net, nhg[i].id);
if (!nh) {
NL_SET_ERR_MSG(extack, "Invalid nexthop id");
return -EINVAL;
}
- if (!valid_group_nh(nh, len, extack))
+ if (!valid_group_nh(nh, len, &is_fdb_nh, extack))
return -EINVAL;
if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack))
return -EINVAL;
- if (!nhg_fdb && nh->is_fdb_nh) {
+ if (!nhg_fdb && is_fdb_nh) {
NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
return -EINVAL;
}
@@ -553,13 +559,13 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
if (hash > atomic_read(&nhge->upper_bound))
continue;
- if (nhge->nh->is_fdb_nh)
+ nhi = rcu_dereference(nhge->nh->nh_info);
+ if (nhi->fdb_nh)
return nhge->nh;
/* nexthops always check if it is good and does
* not rely on a sysctl for this behavior
*/
- nhi = rcu_dereference(nhge->nh->nh_info);
switch (nhi->family) {
case AF_INET:
if (ipv4_good_nh(&nhi->fib_nh))
@@ -624,11 +630,7 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
struct nh_info *nhi;
-
- if (nh->is_fdb_nh) {
- NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
- return -EINVAL;
- }
+ bool is_fdb_nh;
/* fib6_src is unique to a fib6_info and limits the ability to cache
* routes in fib6_nh within a nexthop that is potentially shared
@@ -645,10 +647,17 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
nhg = rtnl_dereference(nh->nh_grp);
if (nhg->has_v4)
goto no_v4_nh;
+ is_fdb_nh = nhg->fdb_nh;
} else {
nhi = rtnl_dereference(nh->nh_info);
if (nhi->family == AF_INET)
goto no_v4_nh;
+ is_fdb_nh = nhi->fdb_nh;
+ }
+
+ if (is_fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+ return -EINVAL;
}
return 0;
@@ -677,12 +686,9 @@ static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
return fib6_check_nexthop(new, NULL, extack);
}
-static int nexthop_check_scope(struct nexthop *nh, u8 scope,
+static int nexthop_check_scope(struct nh_info *nhi, u8 scope,
struct netlink_ext_ack *extack)
{
- struct nh_info *nhi;
-
- nhi = rtnl_dereference(nh->nh_info);
if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
NL_SET_ERR_MSG(extack,
"Route with host scope can not have a gateway");
@@ -704,29 +710,38 @@ static int nexthop_check_scope(struct nexthop *nh, u8 scope,
int fib_check_nexthop(struct nexthop *nh, u8 scope,
struct netlink_ext_ack *extack)
{
+ struct nh_info *nhi;
int err = 0;
- if (nh->is_fdb_nh) {
- NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
- err = -EINVAL;
- goto out;
- }
-
if (nh->is_group) {
struct nh_group *nhg;
+ nhg = rtnl_dereference(nh->nh_grp);
+ if (nhg->fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+ err = -EINVAL;
+ goto out;
+ }
+
if (scope == RT_SCOPE_HOST) {
NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
err = -EINVAL;
goto out;
}
- nhg = rtnl_dereference(nh->nh_grp);
/* all nexthops in a group have the same scope */
- err = nexthop_check_scope(nhg->nh_entries[0].nh, scope, extack);
+ nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info);
+ err = nexthop_check_scope(nhi, scope, extack);
} else {
- err = nexthop_check_scope(nh, scope, extack);
+ nhi = rtnl_dereference(nh->nh_info);
+ if (nhi->fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+ err = -EINVAL;
+ goto out;
+ }
+ err = nexthop_check_scope(nhi, scope, extack);
}
+
out:
return err;
}
@@ -787,6 +802,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
newg->has_v4 = nhg->has_v4;
newg->mpath = nhg->mpath;
+ newg->fdb_nh = nhg->fdb_nh;
newg->num_nh = nhg->num_nh;
/* copy old entries to new except the one getting removed */
@@ -1216,7 +1232,7 @@ static struct nexthop *nexthop_create_group(struct net *net,
}
if (cfg->nh_fdb)
- nh->is_fdb_nh = 1;
+ nhg->fdb_nh = 1;
rcu_assign_pointer(nh->nh_grp, nhg);
@@ -1255,7 +1271,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
goto out;
}
- if (nh->is_fdb_nh)
+ if (nhi->fdb_nh)
goto out;
/* sets nh_dev if successful */
@@ -1326,7 +1342,7 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
if (cfg->nh_fdb)
- nh->is_fdb_nh = 1;
+ nhi->fdb_nh = 1;
if (cfg->nh_blackhole) {
nhi->reject_nh = 1;
@@ -1349,7 +1365,7 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
}
/* add the entry to the device based hash */
- if (!nh->is_fdb_nh)
+ if (!nhi->fdb_nh)
nexthop_devhash_add(net, nhi);
rcu_assign_pointer(nh->nh_info, nhi);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 27716e4932bc..810cc164f795 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1742,14 +1742,48 @@ int tcp_mmap(struct file *file, struct socket *sock,
}
EXPORT_SYMBOL(tcp_mmap);
+static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
+ struct page **pages,
+ unsigned long pages_to_map,
+ unsigned long *insert_addr,
+ u32 *length_with_pending,
+ u32 *seq,
+ struct tcp_zerocopy_receive *zc)
+{
+ unsigned long pages_remaining = pages_to_map;
+ int bytes_mapped;
+ int ret;
+
+ ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
+ bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
+ /* Even if vm_insert_pages fails, it may have partially succeeded in
+ * mapping (some but not all of the pages).
+ */
+ *seq += bytes_mapped;
+ *insert_addr += bytes_mapped;
+ if (ret) {
+ /* But if vm_insert_pages did fail, we have to unroll some state
+ * we speculatively touched before.
+ */
+ const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+ *length_with_pending -= bytes_not_mapped;
+ zc->recv_skip_hint += bytes_not_mapped;
+ }
+ return ret;
+}
+
static int tcp_zerocopy_receive(struct sock *sk,
struct tcp_zerocopy_receive *zc)
{
unsigned long address = (unsigned long)zc->address;
u32 length = 0, seq, offset, zap_len;
+ #define PAGE_BATCH_SIZE 8
+ struct page *pages[PAGE_BATCH_SIZE];
const skb_frag_t *frags = NULL;
struct vm_area_struct *vma;
struct sk_buff *skb = NULL;
+ unsigned long pg_idx = 0;
+ unsigned long curr_addr;
struct tcp_sock *tp;
int inq;
int ret;
@@ -1762,6 +1796,8 @@ static int tcp_zerocopy_receive(struct sock *sk,
sock_rps_record_flow(sk);
+ tp = tcp_sk(sk);
+
mmap_read_lock(current->mm);
vma = find_vma(current->mm, address);
@@ -1771,7 +1807,6 @@ static int tcp_zerocopy_receive(struct sock *sk,
}
zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
- tp = tcp_sk(sk);
seq = tp->copied_seq;
inq = tcp_inq(sk);
zc->length = min_t(u32, zc->length, inq);
@@ -1783,8 +1818,20 @@ static int tcp_zerocopy_receive(struct sock *sk,
zc->recv_skip_hint = zc->length;
}
ret = 0;
+ curr_addr = address;
while (length + PAGE_SIZE <= zc->length) {
if (zc->recv_skip_hint < PAGE_SIZE) {
+ /* If we're here, finish the current batch. */
+ if (pg_idx) {
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+ pg_idx,
+ &curr_addr,
+ &length,
+ &seq, zc);
+ if (ret)
+ goto out;
+ pg_idx = 0;
+ }
if (skb) {
if (zc->recv_skip_hint > 0)
break;
@@ -1793,7 +1840,6 @@ static int tcp_zerocopy_receive(struct sock *sk,
} else {
skb = tcp_recv_skb(sk, seq, &offset);
}
-
zc->recv_skip_hint = skb->len - offset;
offset -= skb_headlen(skb);
if ((int)offset < 0 || skb_has_frag_list(skb))
@@ -1817,14 +1863,24 @@ static int tcp_zerocopy_receive(struct sock *sk,
zc->recv_skip_hint -= remaining;
break;
}
- ret = vm_insert_page(vma, address + length,
- skb_frag_page(frags));
- if (ret)
- break;
+ pages[pg_idx] = skb_frag_page(frags);
+ pg_idx++;
length += PAGE_SIZE;
- seq += PAGE_SIZE;
zc->recv_skip_hint -= PAGE_SIZE;
frags++;
+ if (pg_idx == PAGE_BATCH_SIZE) {
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
+ &curr_addr, &length,
+ &seq, zc);
+ if (ret)
+ goto out;
+ pg_idx = 0;
+ }
+ }
+ if (pg_idx) {
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
+ &curr_addr, &length, &seq,
+ zc);
}
out:
mmap_read_unlock(current->mm);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 629aaa9a1eb9..7aa68f4aae6c 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -64,6 +64,9 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
} while (i != msg_rx->sg.end);
if (unlikely(peek)) {
+ if (msg_rx == list_last_entry(&psock->ingress_msg,
+ struct sk_msg, list))
+ break;
msg_rx = list_next_entry(msg_rx, list);
continue;
}
@@ -242,6 +245,9 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
DEFINE_WAIT_FUNC(wait, woken_wake_function);
int ret = 0;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ return 1;
+
if (!timeo)
return ret;