From 1f7dd3e5a6e4f093017fff12232572ee1aa4639b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Dec 2015 10:18:21 -0500 Subject: cgroup: fix handling of multi-destination migration from subtree_control enabling Consider the following v2 hierarchy. P0 (+memory) --- P1 (-memory) --- A \- B P0 has memory enabled in its subtree_control while P1 doesn't. If both A and B contain processes, they would belong to the memory css of P1. Now if memory is enabled on P1's subtree_control, memory csses should be created on both A and B and A's processes should be moved to the former and B's processes the latter. IOW, enabling controllers can cause atomic migrations into different csses. The core cgroup migration logic has been updated accordingly but the controller migration methods haven't and still assume that all tasks migrate to a single target css; furthermore, the methods were fed the css in which subtree_control was updated which is the parent of the target csses. pids controller depends on the migration methods to move charges and this made the controller attribute charges to the wrong csses often triggering the following warning by driving a counter negative. WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40() Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29 ... ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000 ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00 ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8 Call Trace: [] dump_stack+0x4e/0x82 [] warn_slowpath_common+0x82/0xc0 [] warn_slowpath_null+0x1a/0x20 [] pids_cancel.constprop.6+0x31/0x40 [] pids_can_attach+0x6d/0xf0 [] cgroup_taskset_migrate+0x6c/0x330 [] cgroup_migrate+0xf5/0x190 [] cgroup_attach_task+0x176/0x200 [] __cgroup_procs_write+0x2ad/0x460 [] cgroup_procs_write+0x14/0x20 [] cgroup_file_write+0x35/0x1c0 [] kernfs_fop_write+0x141/0x190 [] __vfs_write+0x28/0xe0 [] vfs_write+0xac/0x1a0 [] SyS_write+0x49/0xb0 [] entry_SYSCALL_64_fastpath+0x12/0x76 This patch fixes the bug by removing @css parameter from the three migration methods, ->can_attach, ->cancel_attach() and ->attach() and updating cgroup_taskset iteration helpers also return the destination css in addition to the task being migrated. All controllers are updated accordingly. * Controllers which don't care whether there are one or multiple target csses can be converted trivially. cpu, io, freezer, perf, netclassid and netprio fall in this category. * cpuset's current implementation assumes that there's single source and destination and thus doesn't support v2 hierarchy already. The only change made by this patchset is how that single destination css is obtained. * memory migration path already doesn't do anything on v2. How the single destination css is obtained is updated and the prep stage of mem_cgroup_can_attach() is reordered to accomodate the change. * pids is the only controller which was affected by this bug. It now correctly handles multi-destination migrations and no longer causes counter underflow from incorrect accounting. Signed-off-by: Tejun Heo Reported-and-tested-by: Daniel Wagner Cc: Aleksa Sarai --- net/core/netclassid_cgroup.c | 11 ++++++----- net/core/netprio_cgroup.c | 9 +++++---- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 6441f47b1a8f..81cb3c72efe8 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -67,14 +67,15 @@ static int update_classid(const void *v, struct file *file, unsigned n) return 0; } -static void cgrp_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cgrp_attach(struct cgroup_taskset *tset) { - struct cgroup_cls_state *cs = css_cls_state(css); - void *v = (void *)(unsigned long)cs->classid; struct task_struct *p; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup_cls_state *cs = css_cls_state(css); + void *v = (void *)(unsigned long)cs->classid; - cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_classid, v); task_unlock(p); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index cbd0a199bf52..40fd09fe06ae 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -218,13 +218,14 @@ static int update_netprio(const void *v, struct file *file, unsigned n) return 0; } -static void net_prio_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup_taskset *tset) { struct task_struct *p; - void *v = (void *)(unsigned long)css->cgroup->id; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + void *v = (void *)(unsigned long)css->cgroup->id; - cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); -- cgit From 01ce63c90170283a9855d1db4fe81934dddce648 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 4 Dec 2015 15:14:04 -0200 Subject: sctp: update the netstamp_needed counter when copying sockets Dmitry Vyukov reported that SCTP was triggering a WARN on socket destroy related to disabling sock timestamp. When SCTP accepts an association or peel one off, it copies sock flags but forgot to call net_enable_timestamp() if a packet timestamping flag was copied, leading to extra calls to net_disable_timestamp() whenever such clones were closed. The fix is to call net_enable_timestamp() whenever we copy a sock with that flag on, like tcp does. Reported-by: Dmitry Vyukov Signed-off-by: Marcelo Ricardo Leitner Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/core/sock.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index e31dfcee1729..d01c8f42dbb2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -433,8 +433,6 @@ static bool sock_needs_netstamp(const struct sock *sk) } } -#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) - static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { -- cgit From d188ba86dd07a72ebebfa22fe9cb0b0572e57740 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Dec 2015 07:22:02 -0800 Subject: xfrm: add rcu protection to sk->sk_policy[] XFRM can deal with SYNACK messages, sent while listener socket is not locked. We add proper rcu protection to __xfrm_sk_clone_policy() and xfrm_sk_policy_lookup() This might serve as the first step to remove xfrm.xfrm_policy_lock use in fast path. Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer") Signed-off-by: Eric Dumazet Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index d01c8f42dbb2..765be835b06c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1550,7 +1550,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ is_charged = sk_filter_charge(newsk, filter); - if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) { + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; -- cgit From f654861569872d10dcb79d9d7ca219b316f94ff0 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Mon, 14 Dec 2015 17:44:10 -0500 Subject: skbuff: Fix offset error in skb_reorder_vlan_header skb_reorder_vlan_header is called after the vlan header has been pulled. As a result the offset of the begining of the mac header has been incrased by 4 bytes (VLAN_HLEN). When moving the mac addresses, include this incrase in the offset calcualation so that the mac addresses are copied correctly. Fixes: a6e18ff1117 (vlan: Fix untag operations of stacked vlans with REORDER_HEADER off) CC: Nicolas Dichtel CC: Patrick McHardy Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 152b9c70e252..5cc43d37a4a2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4268,7 +4268,7 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) return NULL; } - memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len, + memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN, 2 * ETH_ALEN); skb->mac_header += VLAN_HLEN; return skb; -- cgit From ac5cc977991d2dce85fc734a6c71ddb33f6fe3c1 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 16 Dec 2015 23:39:04 -0800 Subject: net: check both type and procotol for tcp sockets Dmitry reported the following out-of-bound access: Call Trace: [] __asan_report_load4_noabort+0x3e/0x40 mm/kasan/report.c:294 [] sock_setsockopt+0x1284/0x13d0 net/core/sock.c:880 [< inline >] SYSC_setsockopt net/socket.c:1746 [] SyS_setsockopt+0x1fe/0x240 net/socket.c:1729 [] entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 This is because we mistake a raw socket as a tcp socket. We should check both sk->sk_type and sk->sk_protocol to ensure it is a tcp socket. Willem points out __skb_complete_tx_timestamp() needs to fix as well. Reported-by: Dmitry Vyukov Cc: Willem de Bruijn Cc: Eric Dumazet Signed-off-by: Cong Wang Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 ++- net/core/sock.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5cc43d37a4a2..b2df375ec9c2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3643,7 +3643,8 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->ee.ee_info = tstype; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; - if (sk->sk_protocol == IPPROTO_TCP) + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) serr->ee.ee_data -= sk->sk_tskey; } diff --git a/net/core/sock.c b/net/core/sock.c index 765be835b06c..0d91f7dca751 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -872,7 +872,8 @@ set_rcvbuf: if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { - if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) { if (sk->sk_state != TCP_ESTABLISHED) { ret = -EINVAL; break; -- cgit