Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says: ==================== pull-request: bpf-next 2019-04-22 The following pull-request contains BPF updates for your *net-next* tree. The main changes are: 1) allow stack/queue helpers from more bpf program types, from Alban. 2) allow parallel verification of root bpf programs, from Alexei. 3) introduce bpf sysctl hook for trusted root cases, from Andrey. 4) recognize var/datasec in btf deduplication, from Andrii. 5) cpumap performance optimizations, from Jesper. 6) verifier prep for alu32 optimization, from Jiong. 7) libbpf xsk cleanup, from Magnus. 8) other various fixes and cleanups. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
author: David S. Miller <davem@davemloft.net> 2019-04-22 21:35:55 -0700
committer: David S. Miller <davem@davemloft.net> 2019-04-22 21:35:55 -0700
commit: 2843ba2ec75948e274d2c4f0a9390980e68a6461 (patch)
tree: b2fde70246956609ac43ad62026a4fdb48e33fbc /net
parent: be659b8d3c79afc54e087ebf8d849685d7b0d395 (diff)
parent: f79b464fd6b56b6256de43bc4c7d5968c0e52968 (diff)
3 files changed, 108 insertions, 25 deletions
diff --git a/net/core/filter.c b/net/core/filter.c
index 1644a16afcec..fa8fb0548217 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3069,6 +3069,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
 {
 	int ret;
 
+	if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO)
+		return -EINVAL;
+
 	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
 		/* udp gso_size delineates datagrams, only allow if fixed */
 		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
@@ -4434,8 +4437,7 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
 	if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
 		return -EINVAL;
 
-	if (val)
-		tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+	tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
 
 	return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a083e188374f..e89be6282693 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -258,6 +258,33 @@ nodata:
 }
 EXPORT_SYMBOL(__alloc_skb);
 
+/* Caller must provide SKB that is memset cleared */
+static struct sk_buff *__build_skb_around(struct sk_buff *skb,
+					  void *data, unsigned int frag_size)
+{
+	struct skb_shared_info *shinfo;
+	unsigned int size = frag_size ? : ksize(data);
+
+	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	/* Assumes caller memset cleared SKB */
+	skb->truesize = SKB_TRUESIZE(size);
+	refcount_set(&skb->users, 1);
+	skb->head = data;
+	skb->data = data;
+	skb_reset_tail_pointer(skb);
+	skb->end = skb->tail + size;
+	skb->mac_header = (typeof(skb->mac_header))~0U;
+	skb->transport_header = (typeof(skb->transport_header))~0U;
+
+	/* make sure we initialize shinfo sequentially */
+	shinfo = skb_shinfo(skb);
+	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+	atomic_set(&shinfo->dataref, 1);
+
+	return skb;
+}
+
 /**
  * __build_skb - build a network buffer
  * @data: data buffer provided by caller
@@ -279,32 +306,15 @@ EXPORT_SYMBOL(__alloc_skb);
  */
 struct sk_buff *__build_skb(void *data, unsigned int frag_size)
 {
-	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
-	unsigned int size = frag_size ? : ksize(data);
 
 	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
-	if (!skb)
+	if (unlikely(!skb))
 		return NULL;
 
-	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-
 	memset(skb, 0, offsetof(struct sk_buff, tail));
-	skb->truesize = SKB_TRUESIZE(size);
-	refcount_set(&skb->users, 1);
-	skb->head = data;
-	skb->data = data;
-	skb_reset_tail_pointer(skb);
-	skb->end = skb->tail + size;
-	skb->mac_header = (typeof(skb->mac_header))~0U;
-	skb->transport_header = (typeof(skb->transport_header))~0U;
 
-	/* make sure we initialize shinfo sequentially */
-	shinfo = skb_shinfo(skb);
-	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
-	atomic_set(&shinfo->dataref, 1);
-
-	return skb;
+	return __build_skb_around(skb, data, frag_size);
 }
 
 /* build_skb() is wrapper over __build_skb(), that specifically
@@ -325,6 +335,29 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
 }
 EXPORT_SYMBOL(build_skb);
 
+/**
+ * build_skb_around - build a network buffer around provided skb
+ * @skb: sk_buff provide by caller, must be memset cleared
+ * @data: data buffer provided by caller
+ * @frag_size: size of data, or 0 if head was kmalloced
+ */
+struct sk_buff *build_skb_around(struct sk_buff *skb,
+				 void *data, unsigned int frag_size)
+{
+	if (unlikely(!skb))
+		return NULL;
+
+	skb = __build_skb_around(skb, data, frag_size);
+
+	if (skb && frag_size) {
+		skb->head_frag = 1;
+		if (page_is_pfmemalloc(virt_to_head_page(data)))
+			skb->pfmemalloc = 1;
+	}
+	return skb;
+}
+EXPORT_SYMBOL(build_skb_around);
+
 #define NAPI_SKB_CACHE_SIZE	64
 
 struct napi_alloc_cache {
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 610c0bdc0c2b..88b9ae24658d 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -43,6 +43,48 @@ struct xsk_queue {
 	u64 invalid_descs;
 };
 
+/* The structure of the shared state of the rings are the same as the
+ * ring buffer in kernel/events/ring_buffer.c. For the Rx and completion
+ * ring, the kernel is the producer and user space is the consumer. For
+ * the Tx and fill rings, the kernel is the consumer and user space is
+ * the producer.
+ *
+ * producer                         consumer
+ *
+ * if (LOAD ->consumer) {           LOAD ->producer
+ *                    (A)           smp_rmb()       (C)
+ *    STORE $data                   LOAD $data
+ *    smp_wmb()       (B)           smp_mb()        (D)
+ *    STORE ->producer              STORE ->consumer
+ * }
+ *
+ * (A) pairs with (D), and (B) pairs with (C).
+ *
+ * Starting with (B), it protects the data from being written after
+ * the producer pointer. If this barrier was missing, the consumer
+ * could observe the producer pointer being set and thus load the data
+ * before the producer has written the new data. The consumer would in
+ * this case load the old data.
+ *
+ * (C) protects the consumer from speculatively loading the data before
+ * the producer pointer actually has been read. If we do not have this
+ * barrier, some architectures could load old data as speculative loads
+ * are not discarded as the CPU does not know there is a dependency
+ * between ->producer and data.
+ *
+ * (A) is a control dependency that separates the load of ->consumer
+ * from the stores of $data. In case ->consumer indicates there is no
+ * room in the buffer to store $data we do not. So no barrier is needed.
+ *
+ * (D) protects the load of the data to be observed to happen after the
+ * store of the consumer pointer. If we did not have this memory
+ * barrier, the producer could observe the consumer pointer being set
+ * and overwrite the data with a new value before the consumer got the
+ * chance to read the old value. The consumer would thus miss reading
+ * the old entry and very likely read the new entry twice, once right
+ * now and again after circling through the ring.
+ */
+
 /* Common functions operating for both RXTX and umem queues */
 
 static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
@@ -106,6 +148,7 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
 static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
 {
 	if (q->cons_tail == q->cons_head) {
+		smp_mb(); /* D, matches A */
 		WRITE_ONCE(q->ring->consumer, q->cons_tail);
 		q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
 
@@ -128,10 +171,11 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
 	if (xskq_nb_free(q, q->prod_tail, 1) == 0)
 		return -ENOSPC;
 
+	/* A, matches D */
 	ring->desc[q->prod_tail++ & q->ring_mask] = addr;
 
 	/* Order producer and data */
-	smp_wmb();
+	smp_wmb(); /* B, matches C */
 
 	WRITE_ONCE(q->ring->producer, q->prod_tail);
 	return 0;
@@ -144,6 +188,7 @@ static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr)
 	if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0)
 		return -ENOSPC;
 
+	/* A, matches D */
 	ring->desc[q->prod_head++ & q->ring_mask] = addr;
 	return 0;
 }
@@ -152,7 +197,7 @@ static inline void xskq_produce_flush_addr_n(struct xsk_queue *q,
 					     u32 nb_entries)
 {
 	/* Order producer and data */
-	smp_wmb();
+	smp_wmb(); /* B, matches C */
 
 	q->prod_tail += nb_entries;
 	WRITE_ONCE(q->ring->producer, q->prod_tail);
@@ -163,6 +208,7 @@ static inline int xskq_reserve_addr(struct xsk_queue *q)
 	if (xskq_nb_free(q, q->prod_head, 1) == 0)
 		return -ENOSPC;
 
+	/* A, matches D */
 	q->prod_head++;
 	return 0;
 }
@@ -204,11 +250,12 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
 					      struct xdp_desc *desc)
 {
 	if (q->cons_tail == q->cons_head) {
+		smp_mb(); /* D, matches A */
 		WRITE_ONCE(q->ring->consumer, q->cons_tail);
 		q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
 
 		/* Order consumer and data */
-		smp_rmb();
+		smp_rmb(); /* C, matches B */
 	}
 
 	return xskq_validate_desc(q, desc);
@@ -228,6 +275,7 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q,
 	if (xskq_nb_free(q, q->prod_head, 1) == 0)
 		return -ENOSPC;
 
+	/* A, matches D */
 	idx = (q->prod_head++) & q->ring_mask;
 	ring->desc[idx].addr = addr;
 	ring->desc[idx].len = len;
@@ -238,7 +286,7 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q,
 static inline void xskq_produce_flush_desc(struct xsk_queue *q)
 {
 	/* Order producer and data */
-	smp_wmb();
+	smp_wmb(); /* B, matches C */
 
 	q->prod_tail = q->prod_head,
 	WRITE_ONCE(q->ring->producer, q->prod_tail);
author	David S. Miller <davem@davemloft.net>	2019-04-22 21:35:55 -0700
committer	David S. Miller <davem@davemloft.net>	2019-04-22 21:35:55 -0700
commit	2843ba2ec75948e274d2c4f0a9390980e68a6461 (patch)
tree	b2fde70246956609ac43ad62026a4fdb48e33fbc /net
parent	be659b8d3c79afc54e087ebf8d849685d7b0d395 (diff)
parent	f79b464fd6b56b6256de43bc4c7d5968c0e52968 (diff)