From ed2b82c03dc187018307c7c6bf9299705f3db383 Mon Sep 17 00:00:00 2001
From: Mauricio Vasquez B <mauricio.vasquez@polito.it>
Date: Fri, 29 Jun 2018 14:48:20 +0200
Subject: bpf: hash map: decrement counter on error

Decrement the number of elements in the map in case the allocation
of a new node fails.

Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements")
Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/hashtab.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3ca2198a6d22..513d9dfcf4ee 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -747,13 +747,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 				 * old element will be freed immediately.
 				 * Otherwise return an error
 				 */
-				atomic_dec(&htab->count);
-				return ERR_PTR(-E2BIG);
+				l_new = ERR_PTR(-E2BIG);
+				goto dec_count;
 			}
 		l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
 				     htab->map.numa_node);
-		if (!l_new)
-			return ERR_PTR(-ENOMEM);
+		if (!l_new) {
+			l_new = ERR_PTR(-ENOMEM);
+			goto dec_count;
+		}
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -766,7 +768,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 						  GFP_ATOMIC | __GFP_NOWARN);
 			if (!pptr) {
 				kfree(l_new);
-				return ERR_PTR(-ENOMEM);
+				l_new = ERR_PTR(-ENOMEM);
+				goto dec_count;
 			}
 		}
 
@@ -780,6 +783,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 
 	l_new->hash = hash;
 	return l_new;
+dec_count:
+	atomic_dec(&htab->count);
+	return l_new;
 }
 
 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
-- 
cgit 


From 547b3aa451ae2739585547db9fbdee11a43ff999 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 5 Jul 2018 08:05:56 -0700
Subject: bpf: sockmap, error path can not release psock in multi-map case

The current code, in the error path of sock_hash_ctx_update_elem,
checks if the sock has a psock in the user data and if so decrements
the reference count of the psock. However, if the error happens early
in the error path we may have never incremented the psock reference
count and if the psock exists because the sock is in another map then
we may inadvertently decrement the reference count.

Fix this by making the error path only call smap_release_sock if the
error happens after the increment.

Reported-by: syzbot+d464d2c20c717ef5a6a8@syzkaller.appspotmail.com
Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/sockmap.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index cf7b6a6dbd1f..3847a7ce7dae 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1896,7 +1896,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 		e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
 		if (!e) {
 			err = -ENOMEM;
-			goto out_progs;
+			goto out_free;
 		}
 	}
 
@@ -2342,7 +2342,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	if (err)
 		goto err;
 
-	/* bpf_map_update_elem() can be called in_irq() */
+	/* psock is valid here because otherwise above *ctx_update_elem would
+	 * have thrown an error. It is safe to skip error check.
+	 */
+	psock = smap_psock_sk(sock);
 	raw_spin_lock_bh(&b->lock);
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 	if (l_old && map_flags == BPF_NOEXIST) {
@@ -2360,12 +2363,6 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		goto bucket_err;
 	}
 
-	psock = smap_psock_sk(sock);
-	if (unlikely(!psock)) {
-		err = -EINVAL;
-		goto bucket_err;
-	}
-
 	rcu_assign_pointer(e->hash_link, l_new);
 	rcu_assign_pointer(e->htab,
 			   container_of(map, struct bpf_htab, map));
@@ -2388,12 +2385,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	raw_spin_unlock_bh(&b->lock);
 	return 0;
 bucket_err:
+	smap_release_sock(psock, sock);
 	raw_spin_unlock_bh(&b->lock);
 err:
 	kfree(e);
-	psock = smap_psock_sk(sock);
-	if (psock)
-		smap_release_sock(psock, sock);
 	return err;
 }
 
-- 
cgit 


From 1d1ef005dbc6de673c62cbd2562290ada3090463 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 5 Jul 2018 08:06:01 -0700
Subject: bpf: sockmap, hash table is RCU so readers do not need locks

This removes locking from readers of RCU hash table. Its not
necessary.

Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/sockmap.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 3847a7ce7dae..00fb2e328d1b 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -2467,10 +2467,8 @@ struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	raw_spin_lock_bh(&b->lock);
 	l = lookup_elem_raw(head, hash, key, key_size);
 	sk = l ? l->sk : NULL;
-	raw_spin_unlock_bh(&b->lock);
 	return sk;
 }
 
-- 
cgit 


From 99ba2b5aba24e022683a7db63204f9e306fe7ab9 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 5 Jul 2018 08:50:04 -0700
Subject: bpf: sockhash, disallow bpf_tcp_close and update in parallel

After latest lock updates there is no longer anything preventing a
close and recvmsg call running in parallel. Additionally, we can
race update with close if we close a socket and simultaneously update
if via the BPF userspace API (note the cgroup ops are already run
with sock_lock held).

To resolve this take sock_lock in close and update paths.

Reported-by: syzbot+b680e42077a0d7c9a0c4@syzkaller.appspotmail.com
Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/sockmap.c | 15 +++++++++++++++
 kernel/bpf/syscall.c |  4 +++-
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 00fb2e328d1b..9c67e96fe336 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -312,10 +312,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 	struct smap_psock *psock;
 	struct sock *osk;
 
+	lock_sock(sk);
 	rcu_read_lock();
 	psock = smap_psock_sk(sk);
 	if (unlikely(!psock)) {
 		rcu_read_unlock();
+		release_sock(sk);
 		return sk->sk_prot->close(sk, timeout);
 	}
 
@@ -371,6 +373,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 		e = psock_map_pop(sk, psock);
 	}
 	rcu_read_unlock();
+	release_sock(sk);
 	close_fun(sk, timeout);
 }
 
@@ -2069,7 +2072,13 @@ static int sock_map_update_elem(struct bpf_map *map,
 		return -EOPNOTSUPP;
 	}
 
+	lock_sock(skops.sk);
+	preempt_disable();
+	rcu_read_lock();
 	err = sock_map_ctx_update_elem(&skops, map, key, flags);
+	rcu_read_unlock();
+	preempt_enable();
+	release_sock(skops.sk);
 	fput(socket->file);
 	return err;
 }
@@ -2410,7 +2419,13 @@ static int sock_hash_update_elem(struct bpf_map *map,
 		return -EINVAL;
 	}
 
+	lock_sock(skops.sk);
+	preempt_disable();
+	rcu_read_lock();
 	err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+	rcu_read_unlock();
+	preempt_enable();
+	release_sock(skops.sk);
 	fput(socket->file);
 	return err;
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d10ecd78105f..a31a1ba0f8ea 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -735,7 +735,9 @@ static int map_update_elem(union bpf_attr *attr)
 	if (bpf_map_is_dev_bound(map)) {
 		err = bpf_map_offload_update_elem(map, key, value, attr->flags);
 		goto out;
-	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+		   map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
 		goto out;
 	}
-- 
cgit 


From 7ebc14d507b4b55105da8d1a1eda323381529cc7 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 5 Jul 2018 08:50:10 -0700
Subject: bpf: sockmap, consume_skb in close path

Currently, when a sock is closed and the bpf_tcp_close() callback is
used we remove memory but do not free the skb. Call consume_skb() if
the skb is attached to the buffer.

Reported-by: syzbot+d464d2c20c717ef5a6a8@syzkaller.appspotmail.com
Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/sockmap.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 9c67e96fe336..dfc8a8a07c1f 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -571,7 +571,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
 	while (sg[i].length) {
 		free += sg[i].length;
 		sk_mem_uncharge(sk, sg[i].length);
-		put_page(sg_page(&sg[i]));
+		if (!md->skb)
+			put_page(sg_page(&sg[i]));
 		sg[i].length = 0;
 		sg[i].page_link = 0;
 		sg[i].offset = 0;
@@ -580,6 +581,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
 		if (i == MAX_SKB_FRAGS)
 			i = 0;
 	}
+	if (md->skb)
+		consume_skb(md->skb);
 
 	return free;
 }
-- 
cgit 


From 0ea488ff8d23c93da383fcf424825c298b13b1fb Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 5 Jul 2018 08:50:15 -0700
Subject: bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb

In commit

  'bpf: bpf_compute_data uses incorrect cb structure' (8108a7751512)

we added the routine bpf_compute_data_end_sk_skb() to compute the
correct data_end values, but this has since been lost. In kernel
v4.14 this was correct and the above patch was applied in it
entirety. Then when v4.14 was merged into v4.15-rc1 net-next tree
we lost the piece that renamed bpf_compute_data_pointers to the
new function bpf_compute_data_end_sk_skb. This was done here,

e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net")

When it conflicted with the following rename patch,

6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers")

Finally, after a refactor I thought even the function
bpf_compute_data_end_sk_skb() was no longer needed and it was
erroneously removed.

However, we never reverted the sk_skb_convert_ctx_access() usage of
tcp_skb_cb which had been committed and survived the merge conflict.
Here we fix this by adding back the helper and *_data_end_sk_skb()
usage. Using the bpf_skc_data_end mapping is not correct because it
expects a qdisc_skb_cb object but at the sock layer this is not the
case. Even though it happens to work here because we don't overwrite
any data in-use at the socket layer and the cb structure is cleared
later this has potential to create some subtle issues. But, even
more concretely the filter.c access check uses tcp_skb_cb.

And by some act of chance though,

struct bpf_skb_data_end {
        struct qdisc_skb_cb        qdisc_cb;             /*     0    28 */

        /* XXX 4 bytes hole, try to pack */

        void *                     data_meta;            /*    32     8 */
        void *                     data_end;             /*    40     8 */

        /* size: 48, cachelines: 1, members: 3 */
        /* sum members: 44, holes: 1, sum holes: 4 */
        /* last cacheline: 48 bytes */
};

and then tcp_skb_cb,

struct tcp_skb_cb {
	[...]
                struct {
                        __u32      flags;                /*    24     4 */
                        struct sock * sk_redir;          /*    32     8 */
                        void *     data_end;             /*    40     8 */
                } bpf;                                   /*          24 */
        };

So when we use offset_of() to track down the byte offset we get 40 in
either case and everything continues to work. Fix this mess and use
correct structures its unclear how long this might actually work for
until someone moves the structs around.

Reported-by: Martin KaFai Lau <kafai@fb.com>
Fixes: e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net")
Fixes: 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/sockmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index dfc8a8a07c1f..98fb7938beea 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1236,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 	 */
 	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
 	skb->sk = psock->sock;
-	bpf_compute_data_pointers(skb);
+	bpf_compute_data_end_sk_skb(skb);
 	preempt_disable();
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	preempt_enable();
@@ -1491,7 +1491,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
 	 * any socket yet.
 	 */
 	skb->sk = psock->sock;
-	bpf_compute_data_pointers(skb);
+	bpf_compute_data_end_sk_skb(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 	rcu_read_unlock();
-- 
cgit 


From d8d7218ad842e18fc6976b87c08ed749e8d56313 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Fri, 6 Jul 2018 11:49:00 +0900
Subject: xdp: XDP_REDIRECT should check IFF_UP and MTU

Otherwise we end up with attempting to send packets from down devices
or to send oversized packets, which may cause unexpected driver/device
behaviour. Generic XDP has already done this check, so reuse the logic
in native XDP.

Fixes: 814abfabef3c ("xdp: add bpf_redirect helper function")
Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/devmap.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 642c97f6d1b8..d361fc1e3bf3 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -334,10 +334,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 {
 	struct net_device *dev = dst->dev;
 	struct xdp_frame *xdpf;
+	int err;
 
 	if (!dev->netdev_ops->ndo_xdp_xmit)
 		return -EOPNOTSUPP;
 
+	err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+	if (unlikely(err))
+		return err;
+
 	xdpf = convert_to_xdp_frame(xdp);
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
@@ -350,7 +355,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 {
 	int err;
 
-	err = __xdp_generic_ok_fwd_dev(skb, dst->dev);
+	err = xdp_ok_fwd_dev(dst->dev, skb->len);
 	if (unlikely(err))
 		return err;
 	skb->dev = dst->dev;
-- 
cgit 


From b65f370d0671c4980ffe866c41e327b88893245c Mon Sep 17 00:00:00 2001
From: Okash Khawaja <osk@fb.com>
Date: Tue, 10 Jul 2018 14:33:07 -0700
Subject: bpf: btf: Fix bitfield extraction for big endian

When extracting bitfield from a number, btf_int_bits_seq_show() builds
a mask and accesses least significant byte of the number in a way
specific to little-endian. This patch fixes that by checking endianness
of the machine and then shifting left and right the unneeded bits.

Thanks to Martin Lau for the help in navigating potential pitfalls when
dealing with endianess and for the final solution.

Fixes: b00b8daec828 ("bpf: btf: Add pretty print capability for data with BTF type info")
Signed-off-by: Okash Khawaja <osk@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2d49d18b793a..e016ac3afa24 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -991,16 +991,13 @@ static void btf_int_bits_seq_show(const struct btf *btf,
 				  void *data, u8 bits_offset,
 				  struct seq_file *m)
 {
+	u16 left_shift_bits, right_shift_bits;
 	u32 int_data = btf_type_int(t);
 	u16 nr_bits = BTF_INT_BITS(int_data);
 	u16 total_bits_offset;
 	u16 nr_copy_bytes;
 	u16 nr_copy_bits;
-	u8 nr_upper_bits;
-	union {
-		u64 u64_num;
-		u8  u8_nums[8];
-	} print_num;
+	u64 print_num;
 
 	total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
 	data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
@@ -1008,21 +1005,20 @@ static void btf_int_bits_seq_show(const struct btf *btf,
 	nr_copy_bits = nr_bits + bits_offset;
 	nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
 
-	print_num.u64_num = 0;
-	memcpy(&print_num.u64_num, data, nr_copy_bytes);
+	print_num = 0;
+	memcpy(&print_num, data, nr_copy_bytes);
 
-	/* Ditch the higher order bits */
-	nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits);
-	if (nr_upper_bits) {
-		/* We need to mask out some bits of the upper byte. */
-		u8 mask = (1 << nr_upper_bits) - 1;
+#ifdef __BIG_ENDIAN_BITFIELD
+	left_shift_bits = bits_offset;
+#else
+	left_shift_bits = BITS_PER_U64 - nr_copy_bits;
+#endif
+	right_shift_bits = BITS_PER_U64 - nr_bits;
 
-		print_num.u8_nums[nr_copy_bytes - 1] &= mask;
-	}
-
-	print_num.u64_num >>= bits_offset;
+	print_num <<= left_shift_bits;
+	print_num >>= right_shift_bits;
 
-	seq_printf(m, "0x%llx", print_num.u64_num);
+	seq_printf(m, "0x%llx", print_num);
 }
 
 static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
-- 
cgit 


From c7a897843224a92209f306c984975b704969b89d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 12 Jul 2018 21:44:28 +0200
Subject: bpf: don't leave partial mangled prog in jit_subprogs error path

syzkaller managed to trigger the following bug through fault injection:

  [...]
  [  141.043668] verifier bug. No program starts at insn 3
  [  141.044648] WARNING: CPU: 3 PID: 4072 at kernel/bpf/verifier.c:1613
                 get_callee_stack_depth kernel/bpf/verifier.c:1612 [inline]
  [  141.044648] WARNING: CPU: 3 PID: 4072 at kernel/bpf/verifier.c:1613
                 fixup_call_args kernel/bpf/verifier.c:5587 [inline]
  [  141.044648] WARNING: CPU: 3 PID: 4072 at kernel/bpf/verifier.c:1613
                 bpf_check+0x525e/0x5e60 kernel/bpf/verifier.c:5952
  [  141.047355] CPU: 3 PID: 4072 Comm: a.out Not tainted 4.18.0-rc4+ #51
  [  141.048446] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),BIOS 1.10.2-1 04/01/2014
  [  141.049877] Call Trace:
  [  141.050324]  __dump_stack lib/dump_stack.c:77 [inline]
  [  141.050324]  dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
  [  141.050950]  ? dump_stack_print_info.cold.2+0x52/0x52 lib/dump_stack.c:60
  [  141.051837]  panic+0x238/0x4e7 kernel/panic.c:184
  [  141.052386]  ? add_taint.cold.5+0x16/0x16 kernel/panic.c:385
  [  141.053101]  ? __warn.cold.8+0x148/0x1ba kernel/panic.c:537
  [  141.053814]  ? __warn.cold.8+0x117/0x1ba kernel/panic.c:530
  [  141.054506]  ? get_callee_stack_depth kernel/bpf/verifier.c:1612 [inline]
  [  141.054506]  ? fixup_call_args kernel/bpf/verifier.c:5587 [inline]
  [  141.054506]  ? bpf_check+0x525e/0x5e60 kernel/bpf/verifier.c:5952
  [  141.055163]  __warn.cold.8+0x163/0x1ba kernel/panic.c:538
  [  141.055820]  ? get_callee_stack_depth kernel/bpf/verifier.c:1612 [inline]
  [  141.055820]  ? fixup_call_args kernel/bpf/verifier.c:5587 [inline]
  [  141.055820]  ? bpf_check+0x525e/0x5e60 kernel/bpf/verifier.c:5952
  [...]

What happens in jit_subprogs() is that kcalloc() for the subprog func
buffer is failing with NULL where we then bail out. Latter is a plain
return -ENOMEM, and this is definitely not okay since earlier in the
loop we are walking all subprogs and temporarily rewrite insn->off to
remember the subprog id as well as insn->imm to temporarily point the
call to __bpf_call_base + 1 for the initial JIT pass. Thus, bailing
out in such state and handing this over to the interpreter is troublesome
since later/subsequent e.g. find_subprog() lookups are based on wrong
insn->imm.

Therefore, once we hit this point, we need to jump to out_free path
where we undo all changes from earlier loop, so that interpreter can
work on unmodified insn->{off,imm}.

Another point is that should find_subprog() fail in jit_subprogs() due
to a verifier bug, then we also should not simply defer the program to
the interpreter since also here we did partial modifications. Instead
we should just bail out entirely and return an error to the user who is
trying to load the program.

Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs")
Reported-by: syzbot+7d427828b2ea6e592804@syzkaller.appspotmail.com
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9e2bf834f13a..63aaac52a265 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5430,6 +5430,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		if (insn->code != (BPF_JMP | BPF_CALL) ||
 		    insn->src_reg != BPF_PSEUDO_CALL)
 			continue;
+		/* Upon error here we cannot fall back to interpreter but
+		 * need a hard reject of the program. Thus -EFAULT is
+		 * propagated in any case.
+		 */
 		subprog = find_subprog(env, i + insn->imm + 1);
 		if (subprog < 0) {
 			WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
@@ -5450,7 +5454,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 
 	func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
 	if (!func)
-		return -ENOMEM;
+		goto out_undo_insn;
 
 	for (i = 0; i < env->subprog_cnt; i++) {
 		subprog_start = subprog_end;
@@ -5515,7 +5519,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		tmp = bpf_int_jit_compile(func[i]);
 		if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
 			verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
-			err = -EFAULT;
+			err = -ENOTSUPP;
 			goto out_free;
 		}
 		cond_resched();
@@ -5552,6 +5556,7 @@ out_free:
 		if (func[i])
 			bpf_jit_free(func[i]);
 	kfree(func);
+out_undo_insn:
 	/* cleanup main prog to be interpreted */
 	prog->jit_requested = 0;
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
@@ -5578,6 +5583,8 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 		err = jit_subprogs(env);
 		if (err == 0)
 			return 0;
+		if (err == -EFAULT)
+			return err;
 	}
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 	for (i = 0; i < prog->len; i++, insn++) {
-- 
cgit 


From 9fb8d5dc4b649dd190e1af4ead670753e71bf907 Mon Sep 17 00:00:00 2001
From: "Isaac J. Manjarres" <isaacm@codeaurora.org>
Date: Tue, 3 Jul 2018 15:02:14 -0700
Subject: stop_machine: Disable preemption when waking two stopper threads

When cpu_stop_queue_two_works() begins to wake the stopper threads, it does
so without preemption disabled, which leads to the following race
condition:

The source CPU calls cpu_stop_queue_two_works(), with cpu1 as the source
CPU, and cpu2 as the destination CPU. When adding the stopper threads to
the wake queue used in this function, the source CPU stopper thread is
added first, and the destination CPU stopper thread is added last.

When wake_up_q() is invoked to wake the stopper threads, the threads are
woken up in the order that they are queued in, so the source CPU's stopper
thread is woken up first, and it preempts the thread running on the source
CPU.

The stopper thread will then execute on the source CPU, disable preemption,
and begin executing multi_cpu_stop(), and wait for an ack from the
destination CPU's stopper thread, with preemption still disabled. Since the
worker thread that woke up the stopper thread on the source CPU is affine
to the source CPU, and preemption is disabled on the source CPU, that
thread will never run to dequeue the destination CPU's stopper thread from
the wake queue, and thus, the destination CPU's stopper thread will never
run, causing the source CPU's stopper thread to wait forever, and stall.

Disable preemption when waking the stopper threads in
cpu_stop_queue_two_works().

Fixes: 0b26351b910f ("stop_machine, sched: Fix migrate_swap() vs. active_balance() deadlock")
Co-Developed-by: Prasad Sodagudi <psodagud@codeaurora.org>
Signed-off-by: Prasad Sodagudi <psodagud@codeaurora.org>
Co-Developed-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
Signed-off-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
Signed-off-by: Isaac J. Manjarres <isaacm@codeaurora.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: peterz@infradead.org
Cc: matt@codeblueprint.co.uk
Cc: bigeasy@linutronix.de
Cc: gregkh@linuxfoundation.org
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/1530655334-4601-1-git-send-email-isaacm@codeaurora.org
---
 kernel/stop_machine.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index f89014a2c238..1ff523dae6e2 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -270,7 +270,11 @@ unlock:
 		goto retry;
 	}
 
-	wake_up_q(&wakeq);
+	if (!err) {
+		preempt_disable();
+		wake_up_q(&wakeq);
+		preempt_enable();
+	}
 
 	return err;
 }
-- 
cgit 


From e117cb52bdb4d376b711bee34af6434c9e314b3b Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@redhat.com>
Date: Wed, 11 Jul 2018 09:29:48 +0200
Subject: sched/deadline: Fix switched_from_dl() warning

Mark noticed that syzkaller is able to reliably trigger the following warning:

  dl_rq->running_bw > dl_rq->this_bw
  WARNING: CPU: 1 PID: 153 at kernel/sched/deadline.c:124 switched_from_dl+0x454/0x608
  Kernel panic - not syncing: panic_on_warn set ...

  CPU: 1 PID: 153 Comm: syz-executor253 Not tainted 4.18.0-rc3+ #29
  Hardware name: linux,dummy-virt (DT)
  Call trace:
   dump_backtrace+0x0/0x458
   show_stack+0x20/0x30
   dump_stack+0x180/0x250
   panic+0x2dc/0x4ec
   __warn_printk+0x0/0x150
   report_bug+0x228/0x2d8
   bug_handler+0xa0/0x1a0
   brk_handler+0x2f0/0x568
   do_debug_exception+0x1bc/0x5d0
   el1_dbg+0x18/0x78
   switched_from_dl+0x454/0x608
   __sched_setscheduler+0x8cc/0x2018
   sys_sched_setattr+0x340/0x758
   el0_svc_naked+0x30/0x34

syzkaller reproducer runs a bunch of threads that constantly switch
between DEADLINE and NORMAL classes while interacting through futexes.

The splat above is caused by the fact that if a DEADLINE task is setattr
back to NORMAL while in non_contending state (blocked on a futex -
inactive timer armed), its contribution to running_bw is not removed
before sub_rq_bw() gets called (!task_on_rq_queued() branch) and the
latter sees running_bw > this_bw.

Fix it by removing a task contribution from running_bw if the task is
not queued and in non_contending state while switched to a different
class.

Reported-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reviewed-by: Luca Abeni <luca.abeni@santannapisa.it>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: claudio@evidence.eu.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/20180711072948.27061-1-juri.lelli@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fbfc3f1d368a..10c7b51c0d1f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2290,8 +2290,17 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	if (task_on_rq_queued(p) && p->dl.dl_runtime)
 		task_non_contending(p);
 
-	if (!task_on_rq_queued(p))
+	if (!task_on_rq_queued(p)) {
+		/*
+		 * Inactive timer is armed. However, p is leaving DEADLINE and
+		 * might migrate away from this rq while continuing to run on
+		 * some other class. We need to remove its contribution from
+		 * this rq running_bw now, or sub_rq_bw (below) will complain.
+		 */
+		if (p->dl.dl_non_contending)
+			sub_running_bw(&p->dl, &rq->dl);
 		sub_rq_bw(&p->dl, &rq->dl);
+	}
 
 	/*
 	 * We cannot use inactive_task_timer() to invoke sub_running_bw()
-- 
cgit 


From 3c53776e29f81719efcf8f7a6e30cdf753bee94d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 8 Jan 2018 11:51:04 -0800
Subject: Mark HI and TASKLET softirq synchronous

Way back in 4.9, we committed 4cd13c21b207 ("softirq: Let ksoftirqd do
its job"), and ever since we've had small nagging issues with it.  For
example, we've had:

  1ff688209e2e ("watchdog: core: make sure the watchdog_worker is not deferred")
  8d5755b3f77b ("watchdog: softdog: fire watchdog even if softirqs do not get to run")
  217f69743681 ("net: busy-poll: allow preemption in sk_busy_loop()")

all of which worked around some of the effects of that commit.

The DVB people have also complained that the commit causes excessive USB
URB latencies, which seems to be due to the USB code using tasklets to
schedule USB traffic.  This seems to be an issue mainly when already
living on the edge, but waiting for ksoftirqd to handle it really does
seem to cause excessive latencies.

Now Hanna Hawa reports that this issue isn't just limited to USB URB and
DVB, but also causes timeout problems for the Marvell SoC team:

 "I'm facing kernel panic issue while running raid 5 on sata disks
  connected to Macchiatobin (Marvell community board with Armada-8040
  SoC with 4 ARMv8 cores of CA72) Raid 5 built with Marvell DMA engine
  and async_tx mechanism (ASYNC_TX_DMA [=y]); the DMA driver (mv_xor_v2)
  uses a tasklet to clean the done descriptors from the queue"

The latency problem causes a panic:

  mv_xor_v2 f0400000.xor: dma_sync_wait: timeout!
  Kernel panic - not syncing: async_tx_quiesce: DMA error waiting for transaction

We've discussed simply just reverting the original commit entirely, and
also much more involved solutions (with per-softirq threads etc).  This
patch is intentionally stupid and fairly limited, because the issue
still remains, and the other solutions either got sidetracked or had
other issues.

We should probably also consider the timer softirqs to be synchronous
and not be delayed to ksoftirqd (since they were the issue with the
earlier watchdog problems), but that should be done as a separate patch.
This does only the tasklet cases.

Reported-and-tested-by: Hanna Hawa <hannah@marvell.com>
Reported-and-tested-by: Josef Griebichler <griebichler.josef@gmx.at>
Reported-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/softirq.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 900dcfee542c..75ffc1d1a2e0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -79,12 +79,16 @@ static void wakeup_softirqd(void)
 
 /*
  * If ksoftirqd is scheduled, we do not want to process pending softirqs
- * right now. Let ksoftirqd handle this at its own rate, to get fairness.
+ * right now. Let ksoftirqd handle this at its own rate, to get fairness,
+ * unless we're doing some of the synchronous softirqs.
  */
-static bool ksoftirqd_running(void)
+#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))
+static bool ksoftirqd_running(unsigned long pending)
 {
 	struct task_struct *tsk = __this_cpu_read(ksoftirqd);
 
+	if (pending & SOFTIRQ_NOW_MASK)
+		return false;
 	return tsk && (tsk->state == TASK_RUNNING);
 }
 
@@ -328,7 +332,7 @@ asmlinkage __visible void do_softirq(void)
 
 	pending = local_softirq_pending();
 
-	if (pending && !ksoftirqd_running())
+	if (pending && !ksoftirqd_running(pending))
 		do_softirq_own_stack();
 
 	local_irq_restore(flags);
@@ -355,7 +359,7 @@ void irq_enter(void)
 
 static inline void invoke_softirq(void)
 {
-	if (ksoftirqd_running())
+	if (ksoftirqd_running(local_softirq_pending()))
 		return;
 
 	if (!force_irqthreads) {
-- 
cgit 


From 36fc3c8c282c01ad1570bd864de52f128d731b75 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 19 Jul 2018 22:14:31 -0700
Subject: bpf: btf: Clean up BTF_INT_BITS() in uapi btf.h

This patch shrinks the BTF_INT_BITS() mask.  The current
btf_int_check_meta() ensures the nr_bits of an integer
cannot exceed 64.  Hence, it is mostly an uapi cleanup.

The actual btf usage (i.e. seq_show()) is also modified
to use u8 instead of u16.  The verification (e.g. btf_int_check_meta())
path stays as is to deal with invalid BTF situation.

Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e016ac3afa24..9704934252b3 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -450,7 +450,7 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
  */
 static bool btf_type_int_is_regular(const struct btf_type *t)
 {
-	u16 nr_bits, nr_bytes;
+	u8 nr_bits, nr_bytes;
 	u32 int_data;
 
 	int_data = btf_type_int(t);
@@ -993,12 +993,16 @@ static void btf_int_bits_seq_show(const struct btf *btf,
 {
 	u16 left_shift_bits, right_shift_bits;
 	u32 int_data = btf_type_int(t);
-	u16 nr_bits = BTF_INT_BITS(int_data);
-	u16 total_bits_offset;
-	u16 nr_copy_bytes;
-	u16 nr_copy_bits;
+	u8 nr_bits = BTF_INT_BITS(int_data);
+	u8 total_bits_offset;
+	u8 nr_copy_bytes;
+	u8 nr_copy_bits;
 	u64 print_num;
 
+	/*
+	 * bits_offset is at most 7.
+	 * BTF_INT_OFFSET() cannot exceed 64 bits.
+	 */
 	total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
 	data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
 	bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
@@ -1028,7 +1032,7 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
 	u32 int_data = btf_type_int(t);
 	u8 encoding = BTF_INT_ENCODING(int_data);
 	bool sign = encoding & BTF_INT_SIGNED;
-	u32 nr_bits = BTF_INT_BITS(int_data);
+	u8 nr_bits = BTF_INT_BITS(int_data);
 
 	if (bits_offset || BTF_INT_OFFSET(int_data) ||
 	    BITS_PER_BYTE_MASKED(nr_bits)) {
-- 
cgit 


From 3928d4f5ee37cdc523894f6e549e6aae521d8980 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 21 Jul 2018 13:48:51 -0700
Subject: mm: use helper functions for allocating and freeing vm_area structs

The vm_area_struct is one of the most fundamental memory management
objects, but the management of it is entirely open-coded evertwhere,
ranging from allocation and freeing (using kmem_cache_[z]alloc and
kmem_cache_free) to initializing all the fields.

We want to unify this in order to end up having some unified
initialization of the vmas, and the first step to this is to at least
have basic allocation functions.

Right now those functions are literally just wrappers around the
kmem_cache_*() calls.  This is a purely mechanical conversion:

    # new vma:
    kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL) -> vm_area_alloc()

    # copy old vma
    kmem_cache_alloc(vm_area_cachep, GFP_KERNEL) -> vm_area_dup(old)

    # free vma
    kmem_cache_free(vm_area_cachep, vma) -> vm_area_free(vma)

to the point where the old vma passed in to the vm_area_dup() function
isn't even used yet (because I've left all the old manual initialization
alone).

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 9440d61b925c..0e23deb5acfc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -303,11 +303,26 @@ struct kmem_cache *files_cachep;
 struct kmem_cache *fs_cachep;
 
 /* SLAB cache for vm_area_struct structures */
-struct kmem_cache *vm_area_cachep;
+static struct kmem_cache *vm_area_cachep;
 
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
+struct vm_area_struct *vm_area_alloc(void)
+{
+	return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+}
+
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
+{
+	return kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+}
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
 	void *stack = task_stack_page(tsk);
@@ -455,7 +470,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 				goto fail_nomem;
 			charge = len;
 		}
-		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		tmp = vm_area_dup(mpnt);
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
@@ -539,7 +554,7 @@ fail_uprobe_end:
 fail_nomem_anon_vma_fork:
 	mpol_put(vma_policy(tmp));
 fail_nomem_policy:
-	kmem_cache_free(vm_area_cachep, tmp);
+	vm_area_free(tmp);
 fail_nomem:
 	retval = -ENOMEM;
 	vm_unacct_memory(charge);
-- 
cgit 


From 95faf6992df468f617edb788da8c21c6eed0dfa7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 21 Jul 2018 14:48:45 -0700
Subject: mm: make vm_area_dup() actually copy the old vma data

.. and re-initialize th eanon_vma_chain head.

This removes some boiler-plate from the users, and also makes it clear
why it didn't need use the 'zalloc()' version.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 0e23deb5acfc..67253e41bfb0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -315,7 +315,13 @@ struct vm_area_struct *vm_area_alloc(void)
 
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
-	return kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+
+	if (new) {
+		*new = *orig;
+		INIT_LIST_HEAD(&new->anon_vma_chain);
+	}
+	return new;
 }
 
 void vm_area_free(struct vm_area_struct *vma)
@@ -473,8 +479,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		tmp = vm_area_dup(mpnt);
 		if (!tmp)
 			goto fail_nomem;
-		*tmp = *mpnt;
-		INIT_LIST_HEAD(&tmp->anon_vma_chain);
 		retval = vma_dup_policy(mpnt, tmp);
 		if (retval)
 			goto fail_nomem_policy;
-- 
cgit 


From 490fc053865c9cc40f1085ef8a5504f5341f79d2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 21 Jul 2018 15:24:03 -0700
Subject: mm: make vm_area_alloc() initialize core fields

Like vm_area_dup(), it initializes the anon_vma_chain head, and the
basic mm pointer.

The rest of the fields end up being different for different users,
although the plan is to also initialize the 'vm_ops' field to a dummy
entry.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 67253e41bfb0..a191c05e757d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -308,9 +308,15 @@ static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-struct vm_area_struct *vm_area_alloc(void)
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
-	return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+
+	if (vma) {
+		vma->vm_mm = mm;
+		INIT_LIST_HEAD(&vma->anon_vma_chain);
+	}
+	return vma;
 }
 
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
-- 
cgit 


From 1863c387259b629e4ebfb255495f67cd06aa229b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 24 Jul 2018 19:13:31 -0400
Subject: tracing: Fix double free of event_trigger_data

Running the following:

 # cd /sys/kernel/debug/tracing
 # echo 500000 > buffer_size_kb
[ Or some other number that takes up most of memory ]
 # echo snapshot > events/sched/sched_switch/trigger

Triggers the following bug:

 ------------[ cut here ]------------
 kernel BUG at mm/slub.c:296!
 invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC PTI
 CPU: 6 PID: 6878 Comm: bash Not tainted 4.18.0-rc6-test+ #1066
 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016
 RIP: 0010:kfree+0x16c/0x180
 Code: 05 41 0f b6 72 51 5b 5d 41 5c 4c 89 d7 e9 ac b3 f8 ff 48 89 d9 48 89 da 41 b8 01 00 00 00 5b 5d 41 5c 4c 89 d6 e9 f4 f3 ff ff <0f> 0b 0f 0b 48 8b 3d d9 d8 f9 00 e9 c1 fe ff ff 0f 1f 40 00 0f 1f
 RSP: 0018:ffffb654436d3d88 EFLAGS: 00010246
 RAX: ffff91a9d50f3d80 RBX: ffff91a9d50f3d80 RCX: ffff91a9d50f3d80
 RDX: 00000000000006a4 RSI: ffff91a9de5a60e0 RDI: ffff91a9d9803500
 RBP: ffffffff8d267c80 R08: 00000000000260e0 R09: ffffffff8c1a56be
 R10: fffff0d404543cc0 R11: 0000000000000389 R12: ffffffff8c1a56be
 R13: ffff91a9d9930e18 R14: ffff91a98c0c2890 R15: ffffffff8d267d00
 FS:  00007f363ea64700(0000) GS:ffff91a9de580000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 000055c1cacc8e10 CR3: 00000000d9b46003 CR4: 00000000001606e0
 Call Trace:
  event_trigger_callback+0xee/0x1d0
  event_trigger_write+0xfc/0x1a0
  __vfs_write+0x33/0x190
  ? handle_mm_fault+0x115/0x230
  ? _cond_resched+0x16/0x40
  vfs_write+0xb0/0x190
  ksys_write+0x52/0xc0
  do_syscall_64+0x5a/0x160
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
 RIP: 0033:0x7f363e16ab50
 Code: 73 01 c3 48 8b 0d 38 83 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 79 db 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 1e e3 01 00 48 89 04 24
 RSP: 002b:00007fff9a4c6378 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
 RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 00007f363e16ab50
 RDX: 0000000000000009 RSI: 000055c1cacc8e10 RDI: 0000000000000001
 RBP: 000055c1cacc8e10 R08: 00007f363e435740 R09: 00007f363ea64700
 R10: 0000000000000073 R11: 0000000000000246 R12: 0000000000000009
 R13: 0000000000000001 R14: 00007f363e4345e0 R15: 00007f363e4303c0
 Modules linked in: ip6table_filter ip6_tables snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_seq snd_seq_device i915 snd_pcm snd_timer i2c_i801 snd soundcore i2c_algo_bit drm_kms_helper
86_pkg_temp_thermal video kvm_intel kvm irqbypass wmi e1000e
 ---[ end trace d301afa879ddfa25 ]---

The cause is because the register_snapshot_trigger() call failed to
allocate the snapshot buffer, and then called unregister_trigger()
which freed the data that was passed to it. Then on return to the
function that called register_snapshot_trigger(), as it sees it
failed to register, it frees the trigger_data again and causes
a double free.

By calling event_trigger_init() on the trigger_data (which only ups
the reference counter for it), and then event_trigger_free() afterward,
the trigger_data would not get freed by the registering trigger function
as it would only up and lower the ref count for it. If the register
trigger function fails, then the event_trigger_free() called after it
will free the trigger data normally.

Link: http://lkml.kernel.org/r/20180724191331.738eb819@gandalf.local.home

Cc: stable@vger.kerne.org
Fixes: 93e31ffbf417 ("tracing: Add 'snapshot' event trigger command")
Reported-by: Masami Hiramatsu <mhiramat@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_trigger.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d18249683682..d18ec0e58be2 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -679,6 +679,8 @@ event_trigger_callback(struct event_command *cmd_ops,
 		goto out_free;
 
  out_reg:
+	/* Up the trigger_data count to make sure reg doesn't free it on failure */
+	event_trigger_init(trigger_ops, trigger_data);
 	ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
 	/*
 	 * The above returns on success the # of functions enabled,
@@ -686,11 +688,13 @@ event_trigger_callback(struct event_command *cmd_ops,
 	 * Consider no functions a failure too.
 	 */
 	if (!ret) {
+		cmd_ops->unreg(glob, trigger_ops, trigger_data, file);
 		ret = -ENOENT;
-		goto out_free;
-	} else if (ret < 0)
-		goto out_free;
-	ret = 0;
+	} else if (ret > 0)
+		ret = 0;
+
+	/* Down the counter of trigger_data or free it if not used anymore */
+	event_trigger_free(trigger_ops, trigger_data);
  out:
 	return ret;
 
-- 
cgit 


From 73c8d8945505acdcbae137c2e00a1232e0be709f Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Sat, 14 Jul 2018 01:28:15 +0900
Subject: ring_buffer: tracing: Inherit the tracing setting to next ring buffer

Maintain the tracing on/off setting of the ring_buffer when switching
to the trace buffer snapshot.

Taking a snapshot is done by swapping the backup ring buffer
(max_tr_buffer). But since the tracing on/off setting is defined
by the ring buffer, when swapping it, the tracing on/off setting
can also be changed. This causes a strange result like below:

  /sys/kernel/debug/tracing # cat tracing_on
  1
  /sys/kernel/debug/tracing # echo 0 > tracing_on
  /sys/kernel/debug/tracing # cat tracing_on
  0
  /sys/kernel/debug/tracing # echo 1 > snapshot
  /sys/kernel/debug/tracing # cat tracing_on
  1
  /sys/kernel/debug/tracing # echo 1 > snapshot
  /sys/kernel/debug/tracing # cat tracing_on
  0

We don't touch tracing_on, but snapshot changes tracing_on
setting each time. This is an anomaly, because user doesn't know
that each "ring_buffer" stores its own tracing-enable state and
the snapshot is done by swapping ring buffers.

Link: http://lkml.kernel.org/r/153149929558.11274.11730609978254724394.stgit@devbox

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tom Zanussi <tom.zanussi@linux.intel.com>
Cc: Hiraku Toyooka <hiraku.toyooka@cybertrust.co.jp>
Cc: stable@vger.kernel.org
Fixes: debdd57f5145 ("tracing: Make a snapshot feature available from userspace")
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
[ Updated commit log and comment in the code ]
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 16 ++++++++++++++++
 kernel/trace/trace.c       |  6 ++++++
 2 files changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6a46af21765c..0b0b688ea166 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3226,6 +3226,22 @@ int ring_buffer_record_is_on(struct ring_buffer *buffer)
 	return !atomic_read(&buffer->record_disabled);
 }
 
+/**
+ * ring_buffer_record_is_set_on - return true if the ring buffer is set writable
+ * @buffer: The ring buffer to see if write is set enabled
+ *
+ * Returns true if the ring buffer is set writable by ring_buffer_record_on().
+ * Note that this does NOT mean it is in a writable state.
+ *
+ * It may return true when the ring buffer has been disabled by
+ * ring_buffer_record_disable(), as that is a temporary disabling of
+ * the ring buffer.
+ */
+int ring_buffer_record_is_set_on(struct ring_buffer *buffer)
+{
+	return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF);
+}
+
 /**
  * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
  * @buffer: The ring buffer to stop writes to.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 87cf25171fb8..823687997b01 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1373,6 +1373,12 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	arch_spin_lock(&tr->max_lock);
 
+	/* Inherit the recordable setting from trace_buffer */
+	if (ring_buffer_record_is_set_on(tr->trace_buffer.buffer))
+		ring_buffer_record_on(tr->max_buffer.buffer);
+	else
+		ring_buffer_record_off(tr->max_buffer.buffer);
+
 	swap(tr->trace_buffer.buffer, tr->max_buffer.buffer);
 
 	__update_max_tr(tr, tsk, cpu);
-- 
cgit 


From 57ea2a34adf40f3a6e88409aafcf803b8945619a Mon Sep 17 00:00:00 2001
From: Artem Savkov <asavkov@redhat.com>
Date: Wed, 25 Jul 2018 16:20:38 +0200
Subject: tracing/kprobes: Fix trace_probe flags on enable_trace_kprobe()
 failure

If enable_trace_kprobe fails to enable the probe in enable_k(ret)probe
it returns an error, but does not unset the tp flags it set previously.
This results in a probe being considered enabled and failures like being
unable to remove the probe through kprobe_events file since probes_open()
expects every probe to be disabled.

Link: http://lkml.kernel.org/r/20180725102826.8300-1-asavkov@redhat.com
Link: http://lkml.kernel.org/r/20180725142038.4765-1-asavkov@redhat.com

Cc: Ingo Molnar <mingo@redhat.com>
Cc: stable@vger.kernel.org
Fixes: 41a7dd420c57 ("tracing/kprobes: Support ftrace_event_file base multibuffer")
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Artem Savkov <asavkov@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 21f718472942..27ace4513c43 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -400,11 +400,10 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
 static int
 enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
 {
+	struct event_file_link *link;
 	int ret = 0;
 
 	if (file) {
-		struct event_file_link *link;
-
 		link = kmalloc(sizeof(*link), GFP_KERNEL);
 		if (!link) {
 			ret = -ENOMEM;
@@ -424,6 +423,16 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
 		else
 			ret = enable_kprobe(&tk->rp.kp);
 	}
+
+	if (ret) {
+		if (file) {
+			list_del_rcu(&link->list);
+			kfree(link);
+			tk->tp.flags &= ~TP_FLAG_TRACE;
+		} else {
+			tk->tp.flags &= ~TP_FLAG_PROFILE;
+		}
+	}
  out:
 	return ret;
 }
-- 
cgit 


From 15cc78644d0075e76d59476a4467e7143860f660 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 25 Jul 2018 16:02:06 -0400
Subject: tracing: Fix possible double free in event_enable_trigger_func()

There was a case that triggered a double free in event_trigger_callback()
due to the called reg() function freeing the trigger_data and then it
getting freed again by the error return by the caller. The solution there
was to up the trigger_data ref count.

Code inspection found that event_enable_trigger_func() has the same issue,
but is not as easy to trigger (requires harder to trigger failures). It
needs to be solved slightly different as it needs more to clean up when the
reg() function fails.

Link: http://lkml.kernel.org/r/20180725124008.7008e586@gandalf.local.home

Cc: stable@vger.kernel.org
Fixes: 7862ad1846e99 ("tracing: Add 'enable_event' and 'disable_event' event trigger commands")
Reivewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_trigger.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d18ec0e58be2..5dea177cef53 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1420,6 +1420,9 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
 		goto out;
 	}
 
+	/* Up the trigger_data count to make sure nothing frees it on failure */
+	event_trigger_init(trigger_ops, trigger_data);
+
 	if (trigger) {
 		number = strsep(&trigger, ":");
 
@@ -1470,6 +1473,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
 		goto out_disable;
 	/* Just return zero, not the number of enabled functions */
 	ret = 0;
+	event_trigger_free(trigger_ops, trigger_data);
  out:
 	return ret;
 
@@ -1480,7 +1484,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
  out_free:
 	if (cmd_ops->set_filter)
 		cmd_ops->set_filter(NULL, trigger_data, NULL);
-	kfree(trigger_data);
+	event_trigger_free(trigger_ops, trigger_data);
 	kfree(enable_data);
 	goto out;
 }
-- 
cgit 


From 2519c1bbe38d7acacc9aacba303ca6f97482ed53 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 25 Jul 2018 22:28:56 -0400
Subject: tracing: Quiet gcc warning about maybe unused link variable

Commit 57ea2a34adf4 ("tracing/kprobes: Fix trace_probe flags on
enable_trace_kprobe() failure") added an if statement that depends on another
if statement that gcc doesn't see will initialize the "link" variable and
gives the warning:

 "warning: 'link' may be used uninitialized in this function"

It is really a false positive, but to quiet the warning, and also to make
sure that it never actually is used uninitialized, initialize the "link"
variable to NULL and add an if (!WARN_ON_ONCE(!link)) where the compiler
thinks it could be used uninitialized.

Cc: stable@vger.kernel.org
Fixes: 57ea2a34adf4 ("tracing/kprobes: Fix trace_probe flags on enable_trace_kprobe() failure")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 27ace4513c43..6b71860f3998 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -400,7 +400,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
 static int
 enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
 {
-	struct event_file_link *link;
+	struct event_file_link *link = NULL;
 	int ret = 0;
 
 	if (file) {
@@ -426,7 +426,9 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
 
 	if (ret) {
 		if (file) {
-			list_del_rcu(&link->list);
+			/* Notice the if is true on not WARN() */
+			if (!WARN_ON_ONCE(!link))
+				list_del_rcu(&link->list);
 			kfree(link);
 			tk->tp.flags &= ~TP_FLAG_TRACE;
 		} else {
-- 
cgit 


From 3e536e222f2930534c252c1cc7ae799c725c5ff9 Mon Sep 17 00:00:00 2001
From: Snild Dolkow <snild@sony.com>
Date: Thu, 26 Jul 2018 09:15:39 +0200
Subject: kthread, tracing: Don't expose half-written comm when creating
 kthreads

There is a window for racing when printing directly to task->comm,
allowing other threads to see a non-terminated string. The vsnprintf
function fills the buffer, counts the truncated chars, then finally
writes the \0 at the end.

	creator                     other
	vsnprintf:
	  fill (not terminated)
	  count the rest            trace_sched_waking(p):
	  ...                         memcpy(comm, p->comm, TASK_COMM_LEN)
	  write \0

The consequences depend on how 'other' uses the string. In our case,
it was copied into the tracing system's saved cmdlines, a buffer of
adjacent TASK_COMM_LEN-byte buffers (note the 'n' where 0 should be):

	crash-arm64> x/1024s savedcmd->saved_cmdlines | grep 'evenk'
	0xffffffd5b3818640:     "irq/497-pwr_evenkworker/u16:12"

...and a strcpy out of there would cause stack corruption:

	[224761.522292] Kernel panic - not syncing: stack-protector:
	    Kernel stack is corrupted in: ffffff9bf9783c78

	crash-arm64> kbt | grep 'comm\|trace_print_context'
	#6  0xffffff9bf9783c78 in trace_print_context+0x18c(+396)
	      comm (char [16]) =  "irq/497-pwr_even"

	crash-arm64> rd 0xffffffd4d0e17d14 8
	ffffffd4d0e17d14:  2f71726900000000 5f7277702d373934   ....irq/497-pwr_
	ffffffd4d0e17d24:  726f776b6e657665 3a3631752f72656b   evenkworker/u16:
	ffffffd4d0e17d34:  f9780248ff003231 cede60e0ffffff9b   12..H.x......`..
	ffffffd4d0e17d44:  cede60c8ffffffd4 00000fffffffffd4   .....`..........

The workaround in e09e28671 (use strlcpy in __trace_find_cmdline) was
likely needed because of this same bug.

Solved by vsnprintf:ing to a local buffer, then using set_task_comm().
This way, there won't be a window where comm is not terminated.

Link: http://lkml.kernel.org/r/20180726071539.188015-1-snild@sony.com

Cc: stable@vger.kernel.org
Fixes: bc0c38d139ec7 ("ftrace: latency tracer infrastructure")
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Snild Dolkow <snild@sony.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/kthread.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 750cb8082694..486dedbd9af5 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -325,8 +325,14 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	task = create->result;
 	if (!IS_ERR(task)) {
 		static const struct sched_param param = { .sched_priority = 0 };
+		char name[TASK_COMM_LEN];
 
-		vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
+		/*
+		 * task is already visible to other tasks, so updating
+		 * COMM must be protected.
+		 */
+		vsnprintf(name, sizeof(name), namefmt, args);
+		set_task_comm(task, name);
 		/*
 		 * root may have changed our (kthreadd's) priority or CPU mask.
 		 * The kernel thread should not inherit these properties.
-- 
cgit 


From 15d36fecd0bdc7510b70a0e5ec6671140b3fce0c Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 26 Jul 2018 16:37:15 -0700
Subject: mm: disallow mappings that conflict for devm_memremap_pages()

When pmem namespaces created are smaller than section size, this can
cause an issue during removal and gpf was observed:

  general protection fault: 0000 1 SMP PTI
  CPU: 36 PID: 3941 Comm: ndctl Tainted: G W 4.14.28-1.el7uek.x86_64 #2
  task: ffff88acda150000 task.stack: ffffc900233a4000
  RIP: 0010:__put_page+0x56/0x79
  Call Trace:
    devm_memremap_pages_release+0x155/0x23a
    release_nodes+0x21e/0x260
    devres_release_all+0x3c/0x48
    device_release_driver_internal+0x15c/0x207
    device_release_driver+0x12/0x14
    unbind_store+0xba/0xd8
    drv_attr_store+0x27/0x31
    sysfs_kf_write+0x3f/0x46
    kernfs_fop_write+0x10f/0x18b
    __vfs_write+0x3a/0x16d
    vfs_write+0xb2/0x1a1
    SyS_write+0x55/0xb9
    do_syscall_64+0x79/0x1ae
    entry_SYSCALL_64_after_hwframe+0x3d/0x0

Add code to check whether we have a mapping already in the same section
and prevent additional mappings from being created if that is the case.

Link: http://lkml.kernel.org/r/152909478401.50143.312364396244072931.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Robert Elliott <elliott@hpe.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/memremap.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 5857267a4af5..a734b1747466 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -176,10 +176,27 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	unsigned long pfn, pgoff, order;
 	pgprot_t pgprot = PAGE_KERNEL;
 	int error, nid, is_ram;
+	struct dev_pagemap *conflict_pgmap;
 
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
 		- align_start;
+	align_end = align_start + align_size - 1;
+
+	conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_start), NULL);
+	if (conflict_pgmap) {
+		dev_WARN(dev, "Conflicting mapping in same section\n");
+		put_dev_pagemap(conflict_pgmap);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
+	if (conflict_pgmap) {
+		dev_WARN(dev, "Conflicting mapping in same section\n");
+		put_dev_pagemap(conflict_pgmap);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	is_ram = region_intersects(align_start, align_size,
 		IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
 
@@ -199,7 +216,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 
 	mutex_lock(&pgmap_lock);
 	error = 0;
-	align_end = align_start + align_size - 1;
 
 	foreach_order_pgoff(res, order, pgoff) {
 		error = __radix_tree_insert(&pgmap_radix,
-- 
cgit 


From 31c5bda3a656089f01963d290a40ccda181f816e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 26 Jul 2018 16:37:22 -0700
Subject: mm: fix exports that inadvertently make put_page() EXPORT_SYMBOL_GPL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit e76384884344 ("mm: introduce MEMORY_DEVICE_FS_DAX and
CONFIG_DEV_PAGEMAP_OPS") added two EXPORT_SYMBOL_GPL() symbols, but
these symbols are required by the inlined put_page(), thus accidentally
making put_page() a GPL export only.  This breaks OpenAFS (at least).

Mark them EXPORT_SYMBOL() instead.

Link: http://lkml.kernel.org/r/153128611970.2928.11310692420711601254.stgit@dwillia2-desk3.amr.corp.intel.com
Fixes: e76384884344 ("mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Joe Gorse <jhgorse@gmail.com>
Reported-by: John Hubbard <jhubbard@nvidia.com>
Tested-by: Joe Gorse <jhgorse@gmail.com>
Tested-by: John Hubbard <jhubbard@nvidia.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Mark Vitale <mvitale@sinenomine.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/memremap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index a734b1747466..38283363da06 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -321,7 +321,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
 
 #ifdef CONFIG_DEV_PAGEMAP_OPS
 DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
-EXPORT_SYMBOL_GPL(devmap_managed_key);
+EXPORT_SYMBOL(devmap_managed_key);
 static atomic_t devmap_enable;
 
 /*
@@ -362,5 +362,5 @@ void __put_devmap_managed_page(struct page *page)
 	} else if (!count)
 		__put_page(page);
 }
-EXPORT_SYMBOL_GPL(__put_devmap_managed_page);
+EXPORT_SYMBOL(__put_devmap_managed_page);
 #endif /* CONFIG_DEV_PAGEMAP_OPS */
-- 
cgit 


From 027232da7c7c1c7f04383f93bd798e475dde5285 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 26 Jul 2018 16:37:25 -0700
Subject: mm: introduce vma_init()

Not all VMAs allocated with vm_area_alloc().  Some of them allocated on
stack or in data segment.

The new helper can be use to initialize VMA properly regardless where it
was allocated.

Link: http://lkml.kernel.org/r/20180724121139.62570-2-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index a191c05e757d..1b27babc4c78 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -312,10 +312,8 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 
-	if (vma) {
-		vma->vm_mm = mm;
-		INIT_LIST_HEAD(&vma->anon_vma_chain);
-	}
+	if (vma)
+		vma_init(vma, mm);
 	return vma;
 }
 
-- 
cgit