From 7c15d9bb8231f998ae7dc0b72415f5215459f7fb Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 19 Jul 2016 15:00:04 -0700 Subject: mm: Add is_migrate_cma_page Code such as hardened user copy[1] needs a way to tell if a page is CMA or not. Add is_migrate_cma_page in a similar way to is_migrate_isolate_page. [1]http://article.gmane.org/gmane.linux.kernel.mm/155238 Signed-off-by: Laura Abbott Signed-off-by: Kees Cook --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 02069c23486d..c8478b29f070 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -68,8 +68,10 @@ extern char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) +# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false +# define is_migrate_cma_page(_page) false #endif #define for_each_migratetype_order(order, type) \ -- cgit From 0f60a8efe4005ab5e65ce000724b04d4ca04a199 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jul 2016 16:19:48 -0700 Subject: mm: Implement stack frame object validation This creates per-architecture function arch_within_stack_frames() that should validate if a given object is contained by a kernel stack frame. Initial implementation is on x86. This is based on code from PaX. Signed-off-by: Kees Cook --- include/linux/thread_info.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index b4c2a485b28a..3d5c80b4391d 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -146,6 +146,15 @@ static inline bool test_and_clear_restore_sigmask(void) #error "no set_restore_sigmask() provided and default one won't work" #endif +#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ + return 0; +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ -- cgit From f5509cc18daa7f82bcc553be70df2117c8eedc16 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 7 Jun 2016 11:05:33 -0700 Subject: mm: Hardened usercopy This is the start of porting PAX_USERCOPY into the mainline kernel. This is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The work is based on code by PaX Team and Brad Spengler, and an earlier port from Casey Schaufler. Additional non-slab page tests are from Rik van Riel. 
This patch contains the logic for validating several conditions when performing copy_to_user() and copy_from_user() on the kernel object being copied to/from: - address range doesn't wrap around - address range isn't NULL or zero-allocated (with a non-zero copy size) - if on the slab allocator: - copy size must be less than or equal to the object size (when the check is implemented in the allocator, which appears in subsequent patches) - otherwise, object must not span page allocations (excepting Reserved and CMA ranges) - if on the stack: - object must not extend before/after the current process stack - object must be contained by a valid stack frame (when there is arch/build support for identifying stack frames) - object must not overlap with kernel text Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Tested-by: Michael Ellerman --- include/linux/slab.h | 12 ++++++++++++ include/linux/thread_info.h | 15 +++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index aeb3e6d00a66..96a16a3fb7cb 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -155,6 +155,18 @@ void kfree(const void *); void kzfree(const void *); size_t ksize(const void *); +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page); +#else +static inline const char *__check_heap_object(const void *ptr, + unsigned long n, + struct page *page) +{ + return NULL; +} +#endif + /* * Some archs want to perform DMA into kmalloc caches and need a guaranteed * alignment larger than the alignment of a 64-bit integer. diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 3d5c80b4391d..f24b99eac969 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -155,6 +155,21 @@ static inline int arch_within_stack_frames(const void * const stack, } #endif +#ifdef CONFIG_HARDENED_USERCOPY +extern void __check_object_size(const void *ptr, unsigned long n, + bool to_user); + +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ + __check_object_size(ptr, n, to_user); +} +#else +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ } +#endif /* CONFIG_HARDENED_USERCOPY */ + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ -- cgit From 2439ca0402091badb24415e1b073ba12b34ba423 Mon Sep 17 00:00:00 2001 From: Maxim Altshul Date: Thu, 4 Aug 2016 15:43:04 +0300 Subject: mac80211: Add ieee80211_hw pointer to get_expected_throughput The variable is added to allow the driver easy access to its own hw->priv when the op is invoked. This fixes a crash in wlcore because it was relying on a station pointer that wasn't initialized yet. It's the wrong way to fix the crash, but it solves the problem for now and it does make sense to have the hw pointer here.
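For illustration, a driver-side implementation under the new op signature could look roughly like this. This is a minimal sketch only; drv_priv, the ready flag and drv_sta_throughput() are hypothetical names, not taken from this patch:

/* Sketch of a driver callback under the new signature: the driver can
 * now reach its private state through hw->priv instead of deriving it
 * from the (possibly not yet initialized) station pointer.
 */
static u32 drv_get_expected_throughput(struct ieee80211_hw *hw,
				       struct ieee80211_sta *sta)
{
	struct drv_priv *priv = hw->priv;	/* hypothetical private struct */

	if (!priv->ready)			/* hypothetical readiness flag */
		return 0;
	return drv_sta_throughput(priv, sta);	/* hypothetical helper */
}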
Signed-off-by: Maxim Altshul [rewrite commit message, fix indentation] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b4faadbb4e01..cca510a585c3 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3620,7 +3620,8 @@ struct ieee80211_ops { int (*join_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*leave_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); - u32 (*get_expected_throughput)(struct ieee80211_sta *sta); + u32 (*get_expected_throughput)(struct ieee80211_hw *hw, + struct ieee80211_sta *sta); int (*get_txpower)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int *dbm); -- cgit From 34b58355ad1d9987267f071265a7de6c8e00662a Mon Sep 17 00:00:00 2001 From: Michel Dänzer Date: Fri, 5 Aug 2016 18:36:10 +0900 Subject: drm/ttm: Wait for a BO to become idle before unbinding it from GTT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes hangs under memory pressure, e.g. running the piglit test tex3d-maxsize concurrently with other tests. Fixes: 17d33bc9d6ef ("drm/ttm: drop waiting for idle in ttm_bo_evict.") Reviewed-by: Christian König Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher --- include/drm/ttm/ttm_bo_driver.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index 4348d6d5877a..99c6d01d24f2 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -962,6 +962,7 @@ void ttm_mem_io_free(struct ttm_bo_device *bdev, * * @bo: A pointer to a struct ttm_buffer_object. * @evict: 1: This is an eviction. Don't try to pipeline. + * @interruptible: Sleep interruptible if waiting. * @no_wait_gpu: Return immediately if the GPU is busy. * @new_mem: struct ttm_mem_reg indicating where to move. * @@ -976,7 +977,7 @@ void ttm_mem_io_free(struct ttm_bo_device *bdev, */ extern int ttm_bo_move_ttm(struct ttm_buffer_object *bo, - bool evict, bool no_wait_gpu, + bool evict, bool interruptible, bool no_wait_gpu, struct ttm_mem_reg *new_mem); /** -- cgit From 3851f1cdb2b8d507b10395fc110d4c37d6121285 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 4 Aug 2016 00:08:45 -0400 Subject: SUNRPC: Limit the reconnect backoff timer to the max RPC message timeout ...and ensure that we propagate it to new transports on the same client. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 5e3e1b63dbb3..a16070dd03ee 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -218,7 +218,8 @@ struct rpc_xprt { struct work_struct task_cleanup; struct timer_list timer; unsigned long last_used, - idle_timeout; + idle_timeout, + max_reconnect_timeout; /* * Send stuff -- cgit From 8d480326c3d6921ff5f1cc988c993bd572248deb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 5 Aug 2016 19:03:31 -0400 Subject: NFSv4: Cap the transport reconnection timer at 1/2 lease period We don't want to miss a lease period renewal due to the TCP connection failing to reconnect in a timely fashion. To ensure this doesn't happen, cap the reconnection timer so that we retry the connection attempt at least every 1/2 lease period. 
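As an illustration of how the new helper is meant to be used, the NFS client could cap the transport's reconnect timer like this (a minimal sketch, assuming the timeout is expressed in jiffies; nfs4_cap_xprt_reconnect and lease_secs are illustrative names, not part of this patch):

/* Sketch: retry the connection attempt at least every 1/2 lease period
 * so a lease renewal is never starved by reconnect backoff.
 */
static void nfs4_cap_xprt_reconnect(struct rpc_clnt *clnt,
				    unsigned long lease_secs)
{
	rpc_cap_max_reconnect_timeout(clnt, lease_secs * HZ / 2);
}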
Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index b6810c92b8bb..5c02b0691587 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -195,6 +195,8 @@ int rpc_clnt_add_xprt(struct rpc_clnt *, struct xprt_create *, struct rpc_xprt *, void *), void *data); +void rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, + unsigned long timeo); const char *rpc_proc_name(const struct rpc_task *task); #endif /* __KERNEL__ */ -- cgit From 372ee16386bbf6dc5eeb0387e1ede963debba82a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 3 Aug 2016 14:11:40 +0100 Subject: rxrpc: Fix races between skb free, ACK generation and replying Inside the kafs filesystem it is possible to occasionally have a call processed and terminated before we've had a chance to check whether we need to clean up the rx queue for that call because afs_send_simple_reply() ends the call when it is done, but this is done in a workqueue item that might happen to run to completion before afs_deliver_to_call() completes. Further, it is possible for rxrpc_kernel_send_data() to be called to send a reply before the last request-phase data skb is released. The rxrpc skb destructor is where the ACK processing is done and the call state is advanced upon release of the last skb. ACK generation is also deferred to a work item because it's possible that the skb destructor is not called in a context where kernel_sendmsg() can be invoked. To this end, the following changes are made: (1) rxrpc_kernel_data_consumed() is added. This should be called whenever an skb is emptied so as to crank the ACK and call states. This does not release the skb, however. rxrpc_kernel_free_skb() must now be called to achieve that. These together replace rxrpc_kernel_data_delivered(). (2) rxrpc_kernel_data_consumed() is wrapped by afs_data_consumed(). This makes afs_deliver_to_call() easier to work with, as the skb can simply be discarded unconditionally here without trying to work out what the return value of the ->deliver() function means. The ->deliver() functions can, via afs_data_complete(), afs_transfer_reply() and afs_extract_data(), mark that an skb has been consumed (thereby cranking the state) without the need to conditionally free the skb to make sure the state is correct on an incoming call for when the call processor tries to send the reply. (3) rxrpc_recvmsg() now has to call rxrpc_kernel_data_consumed() when it has finished with a packet and MSG_PEEK isn't set. (4) rxrpc_packet_destructor() no longer calls rxrpc_hard_ACK_data(). Because of this, we no longer need to clear the destructor and put the call before we free the skb in cases where we don't want the ACK/call state to be cranked. (5) The ->deliver() call-type callbacks are made to return -EAGAIN rather than 0 if they expect more data (afs_extract_data() returns -EAGAIN to the delivery function already), and the caller is now responsible for producing an abort if that was the last packet. (6) There are many bits of unmarshalling code where: ret = afs_extract_data(call, skb, last, ...); switch (ret) { case 0: break; case -EAGAIN: return 0; default: return ret; } is to be found. As -EAGAIN can now be passed back to the caller, we now just return if ret < 0: ret = afs_extract_data(call, skb, last, ...); if (ret < 0) return ret; (7) Checks for trailing data and empty final data packets have been consolidated as afs_data_complete().
So: if (skb->len > 0) return -EBADMSG; if (!last) return 0; becomes: ret = afs_data_complete(call, skb, last); if (ret < 0) return ret; (8) afs_transfer_reply() now checks the amount of data it has against the amount of data desired and the amount of data in the skb and returns an error to induce an abort if we don't get exactly what we want. Without these changes, the following oops can occasionally be observed, particularly if some printks are inserted into the delivery path: general protection fault: 0000 [#1] SMP Modules linked in: kafs(E) af_rxrpc(E) [last unloaded: af_rxrpc] CPU: 0 PID: 1305 Comm: kworker/u8:3 Tainted: G E 4.7.0-fsdevel+ #1303 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 Workqueue: kafsd afs_async_workfn [kafs] task: ffff88040be041c0 ti: ffff88040c070000 task.ti: ffff88040c070000 RIP: 0010:[] [] __lock_acquire+0xcf/0x15a1 RSP: 0018:ffff88040c073bc0 EFLAGS: 00010002 RAX: 6b6b6b6b6b6b6b6b RBX: 0000000000000000 RCX: ffff88040d29a710 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88040d29a710 RBP: ffff88040c073c70 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: ffff88040be041c0 R15: ffffffff814c928f FS: 0000000000000000(0000) GS:ffff88041fa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa4595f4750 CR3: 0000000001c14000 CR4: 00000000001406f0 Stack: 0000000000000006 000000000be04930 0000000000000000 ffff880400000000 ffff880400000000 ffffffff8108f847 ffff88040be041c0 ffffffff81050446 ffff8803fc08a920 ffff8803fc08a958 ffff88040be041c0 ffff88040c073c38 Call Trace: [] ? mark_held_locks+0x5e/0x74 [] ? __local_bh_enable_ip+0x9b/0xa1 [] ? trace_hardirqs_on_caller+0x16d/0x189 [] lock_acquire+0x122/0x1b6 [] ? lock_acquire+0x122/0x1b6 [] ? skb_dequeue+0x18/0x61 [] _raw_spin_lock_irqsave+0x35/0x49 [] ? skb_dequeue+0x18/0x61 [] skb_dequeue+0x18/0x61 [] afs_deliver_to_call+0x344/0x39d [kafs] [] afs_process_async_call+0x4c/0xd5 [kafs] [] afs_async_workfn+0xe/0x10 [kafs] [] process_one_work+0x29d/0x57c [] worker_thread+0x24a/0x385 [] ? rescuer_thread+0x2d0/0x2d0 [] kthread+0xf3/0xfb [] ret_from_fork+0x1f/0x40 [] ? kthread_create_on_node+0x1cf/0x1cf Signed-off-by: David Howells Signed-off-by: David S. Miller --- include/net/af_rxrpc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index ac1bc3c49fbd..7b0f88699b25 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -40,12 +40,12 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, unsigned long, gfp_t); int rxrpc_kernel_send_data(struct rxrpc_call *, struct msghdr *, size_t); +void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); void rxrpc_kernel_abort_call(struct rxrpc_call *, u32); void rxrpc_kernel_end_call(struct rxrpc_call *); bool rxrpc_kernel_is_data_last(struct sk_buff *); u32 rxrpc_kernel_get_abort_code(struct sk_buff *); int rxrpc_kernel_get_error_number(struct sk_buff *); -void rxrpc_kernel_data_delivered(struct sk_buff *); void rxrpc_kernel_free_skb(struct sk_buff *); struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long); int rxrpc_kernel_reject_call(struct socket *); -- cgit From 2c86943c20e375b0fe562af0626f2e5461d8d203 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 24 Jul 2016 19:53:19 +0200 Subject: netfilter: nf_tables: s/MFT_REG32_01/NFT_REG32_01 MFT_REG32_01 is a typo, rename this to NFT_REG32_01. 
Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 01751faccaf8..c674ba2563b7 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -24,7 +24,7 @@ enum nft_registers { __NFT_REG_MAX, NFT_REG32_00 = 8, - MFT_REG32_01, + NFT_REG32_01, NFT_REG32_02, NFT_REG32_03, NFT_REG32_04, -- cgit From dca3f53c02e325bb19dfc05b1571b9a706226fea Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 4 Aug 2016 12:11:55 +0200 Subject: sctp: Export struct sctp_info to userspace This is required to correctly interpret INET_DIAG_INFO messages exported by sctp_diag module. Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- include/linux/sctp.h | 64 ----------------------------------------------- include/uapi/linux/sctp.h | 64 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 64 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index de1f64318fc4..fcb4c3646173 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -705,70 +705,6 @@ typedef struct sctp_auth_chunk { sctp_authhdr_t auth_hdr; } __packed sctp_auth_chunk_t; -struct sctp_info { - __u32 sctpi_tag; - __u32 sctpi_state; - __u32 sctpi_rwnd; - __u16 sctpi_unackdata; - __u16 sctpi_penddata; - __u16 sctpi_instrms; - __u16 sctpi_outstrms; - __u32 sctpi_fragmentation_point; - __u32 sctpi_inqueue; - __u32 sctpi_outqueue; - __u32 sctpi_overall_error; - __u32 sctpi_max_burst; - __u32 sctpi_maxseg; - __u32 sctpi_peer_rwnd; - __u32 sctpi_peer_tag; - __u8 sctpi_peer_capable; - __u8 sctpi_peer_sack; - __u16 __reserved1; - - /* assoc status info */ - __u64 sctpi_isacks; - __u64 sctpi_osacks; - __u64 sctpi_opackets; - __u64 sctpi_ipackets; - __u64 sctpi_rtxchunks; - __u64 sctpi_outofseqtsns; - __u64 sctpi_idupchunks; - __u64 sctpi_gapcnt; - __u64 sctpi_ouodchunks; - __u64 sctpi_iuodchunks; - __u64 sctpi_oodchunks; - __u64 sctpi_iodchunks; - __u64 sctpi_octrlchunks; - __u64 sctpi_ictrlchunks; - - /* primary transport info */ - struct sockaddr_storage sctpi_p_address; - __s32 sctpi_p_state; - __u32 sctpi_p_cwnd; - __u32 sctpi_p_srtt; - __u32 sctpi_p_rto; - __u32 sctpi_p_hbinterval; - __u32 sctpi_p_pathmaxrxt; - __u32 sctpi_p_sackdelay; - __u32 sctpi_p_sackfreq; - __u32 sctpi_p_ssthresh; - __u32 sctpi_p_partial_bytes_acked; - __u32 sctpi_p_flight_size; - __u16 sctpi_p_error; - __u16 __reserved2; - - /* sctp sock info */ - __u32 sctpi_s_autoclose; - __u32 sctpi_s_adaptation_ind; - __u32 sctpi_s_pd_point; - __u8 sctpi_s_nodelay; - __u8 sctpi_s_disable_fragments; - __u8 sctpi_s_v4mapped; - __u8 sctpi_s_frag_interleave; - __u32 sctpi_s_type; - __u32 __reserved3; -}; - struct sctp_infox { struct sctp_info *sctpinfo; struct sctp_association *asoc; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index d304f4c9792c..a406adcc0793 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -944,4 +944,68 @@ struct sctp_default_prinfo { __u16 pr_policy; }; +struct sctp_info { + __u32 sctpi_tag; + __u32 sctpi_state; + __u32 sctpi_rwnd; + __u16 sctpi_unackdata; + __u16 sctpi_penddata; + __u16 sctpi_instrms; + __u16 sctpi_outstrms; + __u32 sctpi_fragmentation_point; + __u32 sctpi_inqueue; + __u32 sctpi_outqueue; + __u32 sctpi_overall_error; + __u32 sctpi_max_burst; + __u32 sctpi_maxseg; + __u32 sctpi_peer_rwnd; 
+ __u32 sctpi_peer_tag; + __u8 sctpi_peer_capable; + __u8 sctpi_peer_sack; + __u16 __reserved1; + + /* assoc status info */ + __u64 sctpi_isacks; + __u64 sctpi_osacks; + __u64 sctpi_opackets; + __u64 sctpi_ipackets; + __u64 sctpi_rtxchunks; + __u64 sctpi_outofseqtsns; + __u64 sctpi_idupchunks; + __u64 sctpi_gapcnt; + __u64 sctpi_ouodchunks; + __u64 sctpi_iuodchunks; + __u64 sctpi_oodchunks; + __u64 sctpi_iodchunks; + __u64 sctpi_octrlchunks; + __u64 sctpi_ictrlchunks; + + /* primary transport info */ + struct sockaddr_storage sctpi_p_address; + __s32 sctpi_p_state; + __u32 sctpi_p_cwnd; + __u32 sctpi_p_srtt; + __u32 sctpi_p_rto; + __u32 sctpi_p_hbinterval; + __u32 sctpi_p_pathmaxrxt; + __u32 sctpi_p_sackdelay; + __u32 sctpi_p_sackfreq; + __u32 sctpi_p_ssthresh; + __u32 sctpi_p_partial_bytes_acked; + __u32 sctpi_p_flight_size; + __u16 sctpi_p_error; + __u16 __reserved2; + + /* sctp sock info */ + __u32 sctpi_s_autoclose; + __u32 sctpi_s_adaptation_ind; + __u32 sctpi_s_pd_point; + __u8 sctpi_s_nodelay; + __u8 sctpi_s_disable_fragments; + __u8 sctpi_s_v4mapped; + __u8 sctpi_s_frag_interleave; + __u32 sctpi_s_type; + __u32 __reserved3; +}; + #endif /* _UAPI_SCTP_H */ -- cgit From 1bd4403d86a1c06cb6cc9ac87664a0c9d3413d51 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Aug 2016 13:02:01 -0700 Subject: unsafe_[get|put]_user: change interface to use a error target label When I initially added the unsafe_[get|put]_user() helpers in commit 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses"), I made the mistake of modeling the interface on our traditional __[get|put]_user() functions, which return zero on success, or -EFAULT on failure. That interface is fairly easy to use, but it's actually fairly nasty for good code generation, since it essentially forces the caller to check the error value for each access. In particular, since the error handling is already internally implemented with an exception handler, and we already use "asm goto" for various other things, we could fairly easily make the error cases just jump directly to an error label instead, and avoid the need for explicit checking after each operation. So switch the interface to pass in an error label, rather than checking the error value in the caller. Best do it now before we start growing more users (the signal handling code in particular would be a good place to use the new interface). So rather than if (unsafe_get_user(x, ptr)) ... handle error .. the interface is now unsafe_get_user(x, ptr, label); where an error during the user mode fetch will now just cause a jump to 'label' in the caller. Right now the actual _implementation_ of this all still ends up being a "if (err) goto label", and does not take advantage of any exception label tricks, but for "unsafe_put_user()" in particular it should be fairly straightforward to convert to using the exception table model. Note that "unsafe_get_user()" is much harder to convert to a clever exception table model, because current versions of gcc do not allow the use of "asm goto" (for the exception) with output values (for the actual value to be fetched). But that is hopefully not a limitation in the long term. 
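Putting the pieces together, a caller of the new interface looks roughly like this (a sketch assembled from the semantics described above; read_two_words and the efault label are illustrative, and the caller is assumed to have already checked access_ok()):

/* Sketch: batched user accesses with the label-based interface. A
 * fault in either access jumps straight to efault, so there is no
 * per-access return-value check.
 */
static long read_two_words(const int __user *uptr, int *a, int *b)
{
	user_access_begin();
	unsafe_get_user(*a, uptr, efault);
	unsafe_get_user(*b, uptr + 1, efault);
	user_access_end();
	return 0;

efault:
	user_access_end();
	return -EFAULT;
}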
[ Also note that it might be a good idea to switch unsafe_get_user() to actually _return_ the value it fetches from user space, but this commit only changes the error handling semantics ] Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 349557825428..f30c187ed785 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -114,8 +114,8 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #ifndef user_access_begin #define user_access_begin() do { } while (0) #define user_access_end() do { } while (0) -#define unsafe_get_user(x, ptr) __get_user(x, ptr) -#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) +#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) #endif #endif /* __LINUX_UACCESS_H__ */ -- cgit From 479ffcccefd7b04442b0e949f8779b0612e8c75f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 5 Aug 2016 00:11:12 +0200 Subject: bpf: fix checksum fixups on bpf_skb_store_bytes bpf_skb_store_bytes() invocations above L2 header need BPF_F_RECOMPUTE_CSUM flag for updates, so that CHECKSUM_COMPLETE will be fixed up along the way. Where we ran into an issue with bpf_skb_store_bytes() is when we did a single-byte update on the IPv6 hoplimit despite using BPF_F_RECOMPUTE_CSUM flag; simple ping via ICMPv6 triggered a hw csum failure as a result. The underlying issue has been tracked down to a buffer alignment issue. Meaning, that csum_partial() computations via skb_postpull_rcsum() and skb_postpush_rcsum() pair invoked had a wrong result since they operated on an odd address for the hoplimit, while other computations were done on an even address. This mix doesn't work as-is with skb_postpull_rcsum(), skb_postpush_rcsum() pair as it always expects at least half-word alignment of input buffers, which is normally the case. Thus, instead of these helpers using csum_sub() and (implicitly) csum_add(), we need to use csum_block_sub(), csum_block_add(), respectively. For unaligned offsets, they rotate the sum to align it to a half-word boundary again, otherwise they work the same as csum_sub() and csum_add(). Adding __skb_postpull_rcsum(), __skb_postpush_rcsum() variants that take the offset as an input and adapting bpf_skb_store_bytes() to them fixes the hw csum failures again. The skb_postpull_rcsum(), skb_postpush_rcsum() helpers use a 0 constant for offset so that the compiler optimizes the offset & 1 test away and generates the same code as with csum_sub()/_add(). Fixes: 608cd71a9c7c ("tc: bpf: generalize pedit action") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 52 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6f0b3e0adc73..0f665cb26b50 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2847,6 +2847,18 @@ static inline int skb_linearize_cow(struct sk_buff *skb) __skb_linearize(skb) : 0; } +static __always_inline void +__skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len, + unsigned int off) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_block_sub(skb->csum, + csum_partial(start, len, 0), off); + else if (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_start_offset(skb) < 0) + skb->ip_summed = CHECKSUM_NONE; +} + /** * skb_postpull_rcsum - update checksum for received skb after pull * @skb: buffer to update @@ -2857,36 +2869,38 @@ static inline int skb_linearize_cow(struct sk_buff *skb) * update the CHECKSUM_COMPLETE checksum, or set ip_summed to * CHECKSUM_NONE so that it can be recomputed from scratch. */ - static inline void skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_sub(skb->csum, csum_partial(start, len, 0)); - else if (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_start_offset(skb) < 0) - skb->ip_summed = CHECKSUM_NONE; + __skb_postpull_rcsum(skb, start, len, 0); } -unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len); +static __always_inline void +__skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len, + unsigned int off) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_block_add(skb->csum, + csum_partial(start, len, 0), off); +} +/** + * skb_postpush_rcsum - update checksum for received skb after push + * @skb: buffer to update + * @start: start of data after push + * @len: length of data pushed + * + * After doing a push on a received packet, you need to call this to + * update the CHECKSUM_COMPLETE checksum. + */ static inline void skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { - /* For performing the reverse operation to skb_postpull_rcsum(), - * we can instead of ... - * - * skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); - * - * ... just use this equivalent version here to save a few - * instructions. Feeding csum of 0 in csum_partial() and later - * on adding skb->csum is equivalent to feed skb->csum in the - * first place. - */ - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_partial(start, len, skb->csum); + __skb_postpush_rcsum(skb, start, len, 0); } +unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len); + /** * skb_push_rcsum - push skb and update receive checksum * @skb: buffer to update -- cgit From 59bcb7972fc5d53a621ee6b2c3cf1654cebb3dc5 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Mon, 8 Aug 2016 21:57:42 -0400 Subject: qed: Add dcbx app support for IEEE Selection Field. MFW now supports the Selection field for IEEE mode. Add driver changes to use the newer MFW masks to read/write the port-id value. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S. 
Miller --- include/linux/qed/qed_if.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index b1e3c57c7117..d6c4177df7cb 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -70,8 +70,16 @@ struct qed_dbcx_pfc_params { u8 max_tc; }; +enum qed_dcbx_sf_ieee_type { + QED_DCBX_SF_IEEE_ETHTYPE, + QED_DCBX_SF_IEEE_TCP_PORT, + QED_DCBX_SF_IEEE_UDP_PORT, + QED_DCBX_SF_IEEE_TCP_UDP_PORT +}; + struct qed_app_entry { bool ethtype; + enum qed_dcbx_sf_ieee_type sf_ieee; bool enabled; u8 prio; u16 proto_id; -- cgit From cbd74e1bc8129efb9908f130a8a6e60fd95d2106 Mon Sep 17 00:00:00 2001 From: Philippe Bergheaud Date: Fri, 5 Aug 2016 14:02:00 +0200 Subject: cxl: Use fixed width predefined types in data structure. This patch fixes a regression introduced by commit b810253bd934 ("cxl: Add mechanism for delivering AFU driver specific events"). It changes the type u8 to __u8 in the uapi header cxl.h, because the former is a kernel internal type, and may not be defined in userland build environments, in particular when cross-compiling libcxl on x86_64 linux machines (RHEL6.7 and Ubuntu 16.04). This patch also changes the size of the field data_size, and makes it constant, to support 32-bit userland applications running on big-endian ppc64 kernels transparently. mpe: This is an ABI change, however the ABI was only added during the 4.8 merge window so has never been part of a released kernel - therefore we give ourselves permission to change it. Fixes: b810253bd934 ("cxl: Add mechanism for delivering AFU driver specific events") Signed-off-by: Philippe Bergheaud Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman --- include/uapi/misc/cxl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/misc/cxl.h b/include/uapi/misc/cxl.h index cbae529b7ce0..180d526a55c3 100644 --- a/include/uapi/misc/cxl.h +++ b/include/uapi/misc/cxl.h @@ -136,8 +136,8 @@ struct cxl_event_afu_driver_reserved { * * Of course the contents will be ABI, but that's up the AFU driver. */ - size_t data_size; - u8 data[]; + __u32 data_size; + __u8 data[]; }; struct cxl_event { -- cgit From f3b0946d629c8bfbd3e5f038e30cb9c711a35f10 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 13 Jul 2016 17:18:33 +0100 Subject: genirq/msi: Make sure PCI MSIs are activated early Bharat Kumar Gogada reported issues with the generic MSI code, where the end-point ended up with garbage in its MSI configuration (both for the vector and the message). It turns out that the two MSI paths in the kernel are doing slightly different things: generic MSI: disable MSI -> allocate MSI -> enable MSI -> setup EP PCI MSI: disable MSI -> allocate MSI -> setup EP -> enable MSI And it turns out that end-points are allowed to latch the content of the MSI configuration registers as soon as MSIs are enabled. In Bharat's case, the end-point ends up using whatever was there already, which is not what you want. In order to make things converge, we introduce a new MSI domain flag (MSI_FLAG_ACTIVATE_EARLY) that is unconditionally set for PCI/MSI. When set, this flag forces the programming of the end-point as soon as the MSIs are allocated. A consequence of this is that we have an extra activate in irq_startup, but that should be without much consequence. tglx: - Several people reported a VMWare regression with PCI/MSI-X passthrough. It turns out that the patch also cures that issue. 
- We need to have a look at the MSI disable interrupt path, where we write the msg to all zeros without disabling MSI in the PCI device. Is that correct? Fixes: 52f518a3a7c2 "x86/MSI: Use hierarchical irqdomains to manage MSI interrupts" Reported-and-tested-by: Bharat Kumar Gogada Reported-and-tested-by: Foster Snowhill Reported-by: Matthias Prager Reported-by: Jason Taylor Signed-off-by: Marc Zyngier Acked-by: Bjorn Helgaas Cc: linux-pci@vger.kernel.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1468426713-31431-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/msi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/msi.h b/include/linux/msi.h index 4f0bfe5912b2..e8c81fbd5f9c 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -270,6 +270,8 @@ enum { MSI_FLAG_MULTI_PCI_MSI = (1 << 2), /* Support PCI MSIX interrupts */ MSI_FLAG_PCI_MSIX = (1 << 3), + /* Needs early activate, required for PCI */ + MSI_FLAG_ACTIVATE_EARLY = (1 << 4), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, -- cgit From 28ad55578b8a76390d966b09da8c7fa3644f5140 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 5 Aug 2016 13:52:09 +0100 Subject: virtio-vsock: fix include guard typo Signed-off-by: Stefan Hajnoczi Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_vsock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h index 6b011c19b50f..1d57ed3d84d2 100644 --- a/include/uapi/linux/virtio_vsock.h +++ b/include/uapi/linux/virtio_vsock.h @@ -32,7 +32,7 @@ */ #ifndef _UAPI_LINUX_VIRTIO_VSOCK_H -#define _UAPI_LINUX_VIRTIO_VOSCK_H +#define _UAPI_LINUX_VIRTIO_VSOCK_H #include #include -- cgit From c87edb36118664f1fa275107c1138b6f47793240 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 5 Aug 2016 12:41:52 -0400 Subject: tracing: Fix tick_stop tracepoint symbols for user export The symbols used in the tick_stop tracepoint were not being converted properly into integers in the tick_stop format file. Instead we had this: print fmt: "success=%d dependency=%s", REC->success, __print_symbolic(REC->dependency, { 0, "NONE" }, { (1 << TICK_DEP_BIT_POSIX_TIMER), "POSIX_TIMER" }, { (1 << TICK_DEP_BIT_PERF_EVENTS), "PERF_EVENTS" }, { (1 << TICK_DEP_BIT_SCHED), "SCHED" }, { (1 << TICK_DEP_BIT_CLOCK_UNSTABLE), "CLOCK_UNSTABLE" }) User space tools have no idea how to parse "TICK_DEP_BIT_SCHED" or the other symbols used to do the bit shifting. The reason is that the conversion was done using the TICK_DEP_MASK_* symbols, which are just macros that convert to the BIT shift itself (with the exception of NONE, which was converted properly, because it doesn't use bits, and is defined as zero). The TICK_DEP_BIT_* symbols need to be denoted by TRACE_DEFINE_ENUM() in order to have them properly converted for user space tools to parse this event.
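For reference, the general pattern is the following (a generic sketch, not the hunk below): TRACE_DEFINE_ENUM() records the numeric value of a symbol so that the format string exported to user space contains the number instead of an unresolvable name. MY_STATE_RUNNING and show_my_state() are hypothetical:

/* Sketch: exporting a symbol used in a tracepoint format string.
 * Without the TRACE_DEFINE_ENUM() line, user space parsers would see
 * the bare symbol name in the format file and could not resolve it.
 */
TRACE_DEFINE_ENUM(MY_STATE_RUNNING);

#define show_my_state(val) \
	__print_symbolic(val, { MY_STATE_RUNNING, "RUNNING" })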
Cc: stable@vger.kernel.org Cc: Frederic Weisbecker Fixes: e6e6cc22e067 ("nohz: Use enum code for tick stop failure tracing message") Reported-by: Luiz Capitulino Tested-by: Luiz Capitulino Signed-off-by: Steven Rostedt --- include/trace/events/timer.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 51440131d337..28c5da6fdfac 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -330,24 +330,32 @@ TRACE_EVENT(itimer_expire, #ifdef CONFIG_NO_HZ_COMMON #define TICK_DEP_NAMES \ - tick_dep_name(NONE) \ + tick_dep_mask_name(NONE) \ tick_dep_name(POSIX_TIMER) \ tick_dep_name(PERF_EVENTS) \ tick_dep_name(SCHED) \ tick_dep_name_end(CLOCK_UNSTABLE) #undef tick_dep_name +#undef tick_dep_mask_name #undef tick_dep_name_end -#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); -#define tick_dep_name_end(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); +/* The MASK will convert to their bits and they need to be processed too */ +#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ + TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); +#define tick_dep_name_end(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ + TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); +/* NONE only has a mask defined for it */ +#define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); TICK_DEP_NAMES #undef tick_dep_name +#undef tick_dep_mask_name #undef tick_dep_name_end #define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, +#define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, #define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep } #define show_tick_dep_name(val) \ -- cgit From a0cba2179ea4c1820fce2ee046b6ed90ecc56196 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 9 Aug 2016 10:48:18 -0700 Subject: Revert "printk: create pr_ functions" This reverts commit 874f9c7da9a4acbc1b9e12ca722579fb50e4d142. Geert Uytterhoeven reports: "This change seems to have an (unintendent?) side-effect. Before, pr_*() calls without a trailing newline characters would be printed with a newline character appended, both on the console and in the output of the dmesg command. After this commit, no new line character is appended, and the output of the next pr_*() call of the same type may be appended, like in: - Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000 - Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM) + Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM)" Joe Perches says: "No, that is not intentional. The newline handling code inside vprintk_emit is a bit involved and for now I suggest a revert until this has all the same behavior as earlier" Reported-by: Geert Uytterhoeven Requested-by: Joe Perches Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 48 +++++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/printk.h b/include/linux/printk.h index 8dc155dab3ed..696a56be7d3e 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -266,39 +266,21 @@ extern asmlinkage void dump_stack(void) __cold; * and other debug macros are compiled out unless either DEBUG is defined * or CONFIG_DYNAMIC_DEBUG is set. 
*/ - -#ifdef CONFIG_PRINTK - -asmlinkage __printf(1, 2) __cold void __pr_emerg(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_alert(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_crit(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_err(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_warn(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_notice(const char *fmt, ...); -asmlinkage __printf(1, 2) __cold void __pr_info(const char *fmt, ...); - -#define pr_emerg(fmt, ...) __pr_emerg(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_alert(fmt, ...) __pr_alert(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_crit(fmt, ...) __pr_crit(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_err(fmt, ...) __pr_err(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warn(fmt, ...) __pr_warn(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_notice(fmt, ...) __pr_notice(pr_fmt(fmt), ##__VA_ARGS__) -#define pr_info(fmt, ...) __pr_info(pr_fmt(fmt), ##__VA_ARGS__) - -#else - -#define pr_emerg(fmt, ...) printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) -#define pr_alert(fmt, ...) printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_crit(fmt, ...) printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) -#define pr_err(fmt, ...) printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) -#define pr_warn(fmt, ...) printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) -#define pr_notice(fmt, ...) printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) -#define pr_info(fmt, ...) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) - -#endif - -#define pr_warning pr_warn - +#define pr_emerg(fmt, ...) \ + printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +#define pr_alert(fmt, ...) \ + printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_crit(fmt, ...) \ + printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +#define pr_err(fmt, ...) \ + printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warning(fmt, ...) \ + printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +#define pr_warn pr_warning +#define pr_notice(fmt, ...) \ + printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +#define pr_info(fmt, ...) \ + printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /* * Like KERN_CONT, pr_cont() should only be used when continuing * a line with no newline ('\n') enclosed. Otherwise it defaults -- cgit From db4a835601b73cf8d6cd8986381d966b8e13d2d9 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Tue, 2 Aug 2016 00:48:12 -0700 Subject: perf/core: Set cgroup in CPU contexts for new cgroup events There's a perf stat bug easy to observe on a machine with only one cgroup: $ perf stat -e cycles -I 1000 -C 0 -G / # time counts unit events 1.000161699 cycles / 2.000355591 cycles / 3.000565154 cycles / 4.000951350 cycles / We'd expect some output there. The underlying problem is that there is an optimization in perf_cgroup_sched_{in,out}() that skips the switch of cgroup events if the old and new cgroups in a task switch are the same. This optimization interacts with the current code in two ways that cause a CPU context's cgroup (cpuctx->cgrp) to be NULL even if a cgroup event matches the current task. These are: 1. On creation of the first cgroup event in a CPU: In current code, cpuctx->cgrp is only set in perf_cgroup_sched_in, but due to the aforesaid optimization, perf_cgroup_sched_in will not run until the next cgroup switches in that CPU. This may happen late or never happen, depending on the system's number of cgroups, CPU load, etc. 2. On deletion of the last cgroup event in a cpuctx: In list_del_event, cpuctx->cgrp is set NULL.
Any new cgroup event will not be sched in because cpuctx->cgrp == NULL until a cgroup switch occurs and perf_cgroup_sched_in is executed (updating cpuctx->cgrp). This patch fixes both problems by setting cpuctx->cgrp in list_add_event, mirroring what list_del_event does when removing a cgroup event from CPU context, as introduced in: commit 68cacd29167b ("perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx()") With this patch, cpuctx->cgrp is always set/cleared when installing/removing the first/last cgroup event in/from the CPU context. With cpuctx->cgrp correctly set, event_filter_match works as intended when events are sched in/out. After the fix, the output is as expected: $ perf stat -e cycles -I 1000 -a -G / # time counts unit events 1.004699159 627342882 cycles / 2.007397156 615272690 cycles / 3.010019057 616726074 cycles / Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Turner Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Cc: Vince Weaver Link: http://lkml.kernel.org/r/1470124092-113192-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8ed4326164cc..2b6b43cc0dd5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -743,7 +743,9 @@ struct perf_event_context { u64 parent_gen; u64 generation; int pin_count; +#ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ +#endif void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; }; @@ -769,7 +771,9 @@ struct perf_cpu_context { unsigned int hrtimer_active; struct pmu *unique_pmu; +#ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; +#endif }; struct perf_output_handle { -- cgit From 2db34e8bf9a22f4e38b29deccee57457bc0e7d74 Mon Sep 17 00:00:00 2001 From: pan xinhui Date: Mon, 18 Jul 2016 17:47:39 +0800 Subject: locking/qrwlock: Fix write unlock bug on big endian systems This patch aims to get rid of the endianness dependency in queued_write_unlock(). We want to set __qrwlock->wmode to NULL, however the address is not &lock->cnts on big endian machines. That causes queued_write_unlock() to write NULL to the wrong field of __qrwlock. So implement __qrwlock_write_byte() which returns the correct __qrwlock->wmode address. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hpe.com Cc: arnd@arndb.de Cc: boqun.feng@gmail.com Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1468835259-4486-1-git-send-email-xinhui.pan@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/asm-generic/qrwlock.h | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h index 54a8e65e18b6..7d026bf27713 100644 --- a/include/asm-generic/qrwlock.h +++ b/include/asm-generic/qrwlock.h @@ -25,7 +25,20 @@ #include /* - * Writer states & reader shift and bias
+ * + * | +0 | +1 | +2 | +3 | + * ----+----+----+----+----+ + * LE | 78 | 56 | 34 | 12 | 0x12345678 + * ----+----+----+----+----+ + * | wr | rd | + * +----+----+----+----+ + * + * ----+----+----+----+----+ + * BE | 12 | 34 | 56 | 78 | 0x12345678 + * ----+----+----+----+----+ + * | rd | wr | + * +----+----+----+----+ */ #define _QW_WAITING 1 /* A writer is waiting */ #define _QW_LOCKED 0xff /* A writer holds the lock */ @@ -133,13 +146,23 @@ static inline void queued_read_unlock(struct qrwlock *lock) (void)atomic_sub_return_release(_QR_BIAS, &lock->cnts); } +/** + * __qrwlock_write_byte - retrieve the write byte address of a queue rwlock + * @lock : Pointer to queue rwlock structure + * Return: the write byte address of a queue rwlock + */ +static inline u8 *__qrwlock_write_byte(struct qrwlock *lock) +{ + return (u8 *)lock + 3 * IS_BUILTIN(CONFIG_CPU_BIG_ENDIAN); +} + /** * queued_write_unlock - release write lock of a queue rwlock * @lock : Pointer to queue rwlock structure */ static inline void queued_write_unlock(struct qrwlock *lock) { - smp_store_release((u8 *)&lock->cnts, 0); + smp_store_release(__qrwlock_write_byte(lock), 0); } /* -- cgit From 1ea049b2de5d803374fdbf43add23c8d1c518e7b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 11 Aug 2016 10:15:56 +0200 Subject: bvec: avoid variable shadowing warning Due to the (indirect) nesting of min(..., min(...)), sparse will show a variable shadowing warning whenever bvec.h is included. Avoid that by assigning the inner min() to a temporary variable first. Signed-off-by: Johannes Berg Signed-off-by: Jens Axboe --- include/linux/bvec.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 701b64a3b7c5..89b65b82d98f 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -74,7 +74,8 @@ static inline void bvec_iter_advance(const struct bio_vec *bv, "Attempted to advance past end of bvec iter\n"); while (bytes) { - unsigned len = min(bytes, bvec_iter_len(bv, *iter)); + unsigned iter_len = bvec_iter_len(bv, *iter); + unsigned len = min(bytes, iter_len); bytes -= len; iter->bi_size -= len; -- cgit From 023e9fddc3616b005c3753fc1bb6526388cd7a30 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 9 Aug 2016 19:13:00 +0200 Subject: KVM: PPC: Move xics_debugfs_init out of create MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As we are about to hold the kvm->lock during the create operation on KVM devices, we should move the call to xics_debugfs_init into its own function, since holding a mutex over extended amounts of time might not be a good idea. Introduce an init operation on the kvm_device_ops struct which cannot fail and call this, if configured, after the device has been created. Signed-off-by: Christoffer Dall Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- include/linux/kvm_host.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 01e908ac4a39..d3c9b82812c3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1115,6 +1115,12 @@ struct kvm_device_ops { const char *name; int (*create)(struct kvm_device *dev, u32 type); + /* + * init is called after create if create is successful and is called + * outside of holding kvm->lock. + */ + void (*init)(struct kvm_device *dev); + /* * Destroy is responsible for freeing dev. 
* -- cgit From a28ebea2adc4a2bef5989a5a181ec238f59fbcad Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 9 Aug 2016 19:13:01 +0200 Subject: KVM: Protect device ops->create and list_add with kvm->lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM devices were manipulating list data structures without any form of synchronization, and some implementations of the create operations also suffered from a lack of synchronization. Now when we've split the xics create operation into create and init, we can hold the kvm->lock mutex while calling the create operation and when manipulating the devices list. The error path in the generic code gets slightly ugly because we have to take the mutex again and delete the device from the list, but holding the mutex during anon_inode_getfd or releasing/locking the mutex in the common non-error path seemed wrong. Signed-off-by: Christoffer Dall Reviewed-by: Paolo Bonzini Acked-by: Christian Borntraeger Signed-off-by: Radim Krčmář --- include/linux/kvm_host.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d3c9b82812c3..9c28b4d4c90b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1113,6 +1113,12 @@ struct kvm_device { /* create, destroy, and name are mandatory */ struct kvm_device_ops { const char *name; + + /* + * create is called holding kvm->lock and any operations not suitable + * to do while holding the lock should be deferred to init (see + * below). + */ int (*create)(struct kvm_device *dev, u32 type); /* -- cgit From 747ea55e4f78fd980350c39570a986b8c1c3e4aa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 12 Aug 2016 22:17:17 +0200 Subject: bpf: fix bpf_skb_in_cgroup helper naming While hashing out BPF's current_task_under_cgroup helper bits, it came to discussion that the skb_in_cgroup helper name was suboptimally chosen. Tejun says: So, I think in_cgroup should mean that the object is in that particular cgroup while under_cgroup in the subhierarchy of that cgroup. Let's rename the other subhierarchy test to under too. I think that'd be a lot less confusing going forward. [...] It's more intuitive and gives us the room to implement the real "in" test if ever necessary in the future. Since this touches uapi bits, we need to change this as long as v4.8 is not yet officially released. Thus, change the helper enum and rename related bits. 
Fixes: 4a482f34afcc ("cgroup: bpf: Add bpf_skb_in_cgroup_proto") Reference: http://patchwork.ozlabs.org/patch/658500/ Suggested-by: Sargun Dhillon Suggested-by: Tejun Heo Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index da218fec6056..9e5fc168c8a3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -339,7 +339,7 @@ enum bpf_func_id { BPF_FUNC_skb_change_type, /** - * bpf_skb_in_cgroup(skb, map, index) - Check cgroup2 membership of skb + * bpf_skb_under_cgroup(skb, map, index) - Check cgroup2 membership of skb * @skb: pointer to skb * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type * @index: index of the cgroup in the bpf_map @@ -348,7 +348,7 @@ enum bpf_func_id { * == 1 skb succeeded the cgroup2 descendant test * < 0 error */ - BPF_FUNC_skb_in_cgroup, + BPF_FUNC_skb_under_cgroup, /** * bpf_get_hash_recalc(skb) -- cgit From c15c0ab12fd62f2b19181d05c62d24bc9fa55a42 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 12 Aug 2016 07:48:21 +0200 Subject: ipv6: suppress sparse warnings in IP6_ECN_set_ce() Pass the correct type __wsum to csum_sub() and csum_add(). This doesn't really change anything since __wsum really *is* __be32, but removes the address space warnings from sparse. Cc: Eric Dumazet Fixes: 34ae6a1aa054 ("ipv6: update skb->csum when CE mark is propagated") Signed-off-by: Johannes Berg Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_ecn.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index 0dc0a51da38f..dce2d586d9ce 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -128,7 +128,8 @@ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph) to = from | htonl(INET_ECN_CE << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(csum_sub(skb->csum, from), to); + skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), + (__force __wsum)to); return 1; } -- cgit From 952fcfd08c8109951622579d0ae7b9cd6cafd688 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 12 Aug 2016 16:10:33 +0200 Subject: net: remove type_check from dev_get_nest_level() The idea for type_check in dev_get_nest_level() was to count the number of nested devices of the same type (currently, only macvlan or vlan devices). This prevented the false positive lockdep warning on configurations such as: eth0 <--- macvlan0 <--- vlan0 <--- macvlan1 However, this doesn't prevent a warning on a configuration such as: eth0 <--- macvlan0 <--- vlan0 eth1 <--- vlan1 <--- macvlan1 In this case, all the locks end up with a nesting subclass of 1, so lockdep thinks that there is still a deadlock: - in the first case we have (macvlan_netdev_addr_lock_key, 1) and then take (vlan_netdev_xmit_lock_key, 1) - in the second case, we have (vlan_netdev_xmit_lock_key, 1) and then take (macvlan_netdev_addr_lock_key, 1) By removing the linktype check in dev_get_nest_level() and always incrementing the nesting depth, lockdep considers this configuration valid. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 076df5360ba5..3a788bf0affd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3891,8 +3891,7 @@ void netdev_default_l2upper_neigh_destroy(struct net_device *dev, extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; void netdev_rss_key_fill(void *buffer, size_t len); -int dev_get_nest_level(struct net_device *dev, - bool (*type_check)(const struct net_device *dev)); +int dev_get_nest_level(struct net_device *dev); int skb_checksum_help(struct sk_buff *skb); struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); -- cgit From 3d7b33209201cbfa090d614db993571ca3c6b090 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 15 Aug 2016 13:06:24 +0200 Subject: gre: set inner_protocol on xmit Ensure that the inner_protocol is set on transmit so that GSO segmentation, which relies on that field, works correctly. This is achieved by setting the inner_protocol in gre_build_header rather than each caller of that function. It ensures that the inner_protocol is set when gre_fb_xmit() is used to transmit GRE which was not previously the case. I have observed this is not the case when OvS transmits GRE using lwtunnel metadata (which it always does). Fixes: 38720352412a ("gre: Use inner_proto to obtain inner header protocol") Cc: Pravin Shelar Acked-by: Alexander Duyck Signed-off-by: Simon Horman Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/gre.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/gre.h b/include/net/gre.h index 7a54a31d1d4c..73ea256eb7d7 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -104,6 +104,7 @@ static inline void gre_build_header(struct sk_buff *skb, int hdr_len, skb_push(skb, hdr_len); + skb_set_inner_protocol(skb, proto); skb_reset_transport_header(skb); greh = (struct gre_base_hdr *)skb->data; greh->flags = gre_tnl_flags_to_gre_flags(flags); -- cgit From 0c23c3e705691cfb99c94f2760df2b456fe45194 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sat, 13 Aug 2016 22:34:58 -0700 Subject: net_sched: fix a typo in tc_for_each_action() It is harmless because all users pass 'a' to this macro. Fixes: 00175aec941e ("net/sched: Macro instead of CONFIG_NET_CLS_ACT ifdef") Cc: Amir Vadai Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 41e6a24a44b9..f53ee9d315c1 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -193,7 +193,7 @@ int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); (list_empty(&(_exts)->actions)) #define tc_for_each_action(_a, _exts) \ - list_for_each_entry(a, &(_exts)->actions, list) + list_for_each_entry(_a, &(_exts)->actions, list) #define tc_single_action(_exts) \ (list_is_singular(&(_exts)->actions)) -- cgit From 2734437ef3c2943090d0914bf91caa6b30451615 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sat, 13 Aug 2016 22:34:59 -0700 Subject: net_sched: move tc offload macros to pkt_cls.h struct tcf_exts belongs to filters, should not be visible to plain tc actions. Cc: Ido Schimmel Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
From 2734437ef3c2943090d0914bf91caa6b30451615 Mon Sep 17 00:00:00 2001
From: WANG Cong
Date: Sat, 13 Aug 2016 22:34:59 -0700
Subject: net_sched: move tc offload macros to pkt_cls.h

struct tcf_exts belongs to filters, so it should not be visible to
plain tc actions.

Cc: Ido Schimmel
Signed-off-by: Cong Wang
Acked-by: Jamal Hadi Salim
Signed-off-by: David S. Miller
---
 include/net/act_api.h | 19 +++----------------
 include/net/pkt_cls.h | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index f53ee9d315c1..870332ff61eb 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -189,30 +189,17 @@ int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);

-#define tc_no_actions(_exts) \
-	(list_empty(&(_exts)->actions))
-
-#define tc_for_each_action(_a, _exts) \
-	list_for_each_entry(_a, &(_exts)->actions, list)
-
-#define tc_single_action(_exts) \
-	(list_is_singular(&(_exts)->actions))
+#endif /* CONFIG_NET_CLS_ACT */

 static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes,
 					   u64 packets, u64 lastuse)
 {
+#ifdef CONFIG_NET_CLS_ACT
 	if (!a->ops->stats_update)
 		return;

 	a->ops->stats_update(a, bytes, packets, lastuse);
+#endif
 }

-#else /* CONFIG_NET_CLS_ACT */
-
-#define tc_no_actions(_exts) true
-#define tc_for_each_action(_a, _exts) while ((void)(_a), 0)
-#define tc_single_action(_exts) false
-#define tcf_action_stats_update(a, bytes, packets, lastuse)
-
-#endif /* CONFIG_NET_CLS_ACT */
 #endif

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 6f8d65342d3a..00dd5c4c1d0a 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -130,6 +130,25 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts,
 	return 0;
 }

+#ifdef CONFIG_NET_CLS_ACT
+
+#define tc_no_actions(_exts) \
+	(list_empty(&(_exts)->actions))
+
+#define tc_for_each_action(_a, _exts) \
+	list_for_each_entry(_a, &(_exts)->actions, list)
+
+#define tc_single_action(_exts) \
+	(list_is_singular(&(_exts)->actions))
+
+#else /* CONFIG_NET_CLS_ACT */
+
+#define tc_no_actions(_exts) true
+#define tc_for_each_action(_a, _exts) while ((void)(_a), 0)
+#define tc_single_action(_exts) false
+
+#endif /* CONFIG_NET_CLS_ACT */
+
 int tcf_exts_validate(struct net *net, struct tcf_proto *tp,
 		      struct nlattr **tb, struct nlattr *rate_tlv,
 		      struct tcf_exts *exts, bool ovr);

-- cgit
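Beyond moving the macros, the act_api.h hunk above also folds the stats-update stub into a single always-defined inline whose body is guarded by the config symbol, instead of an empty statement macro in the disabled case. A hedged sketch of that general pattern, using hypothetical names (my_action, CONFIG_MY_FEATURE):

	typedef unsigned long long u64;

	struct my_action;

	struct my_action_ops {
		void (*stats_update)(struct my_action *a, u64 bytes,
				     u64 packets, u64 lastuse);
	};

	struct my_action {
		const struct my_action_ops *ops;
	};

	/* Old style, when the feature is compiled out:
	 *
	 *	#define my_action_stats_update(a, bytes, packets, lastuse)
	 *
	 * That stub neither evaluates nor type-checks its arguments, so
	 * breakage in the disabled configuration goes unnoticed.
	 *
	 * New style: the function always exists, every configuration
	 * type-checks its callers, and the compiler discards the body
	 * when the feature is off.
	 */
	static inline void my_action_stats_update(struct my_action *a, u64 bytes,
						  u64 packets, u64 lastuse)
	{
	#ifdef CONFIG_MY_FEATURE
		if (!a->ops->stats_update)
			return;

		a->ops->stats_update(a, bytes, packets, lastuse);
	#endif
	}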
From 22dc13c837c33207548c8ee5116b64e2930a6e23 Mon Sep 17 00:00:00 2001
From: WANG Cong
Date: Sat, 13 Aug 2016 22:35:00 -0700
Subject: net_sched: convert tcf_exts from list to pointer array

As pointed out by Jamal, an action could be shared by multiple filters,
so we can't use a list to chain them any more after we get rid of the
original tc_action. Instead, we can just save pointers to these actions
in tcf_exts, since they are refcounted, so convert the list to an array
of pointers.

The "ugly" part is that the action API still accepts a list as a
parameter, so I just introduce a helper function to convert the array
of pointers to a list, instead of relying on the C99 feature to iterate
the array.

Fixes: a85a970af265 ("net_sched: move tc_action into tcf_common")
Reported-by: Jamal Hadi Salim
Cc: Jamal Hadi Salim
Signed-off-by: Cong Wang
Acked-by: Jamal Hadi Salim
Signed-off-by: David S. Miller
---
 include/net/act_api.h |  4 ++--
 include/net/pkt_cls.h | 40 ++++++++++++++++++++++++++--------------
 2 files changed, 28 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 870332ff61eb..82f3c912a5b1 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -176,8 +176,8 @@ int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops);
 int tcf_unregister_action(struct tc_action_ops *a,
 			  struct pernet_operations *ops);
 int tcf_action_destroy(struct list_head *actions, int bind);
-int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
-		    struct tcf_result *res);
+int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
+		    int nr_actions, struct tcf_result *res);
 int tcf_action_init(struct net *net, struct nlattr *nla,
 		    struct nlattr *est, char *n, int ovr,
 		    int bind, struct list_head *);

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 00dd5c4c1d0a..c99508d426cc 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -59,7 +59,8 @@ tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r)
 struct tcf_exts {
 #ifdef CONFIG_NET_CLS_ACT
 	__u32	type; /* for backward compat(TCA_OLD_COMPAT) */
-	struct list_head actions;
+	int nr_actions;
+	struct tc_action **actions;
 #endif
 	/* Map to export classifier specific extension TLV types to the
 	 * generic extensions API. Unsupported extensions must be set to 0.
@@ -72,7 +73,10 @@ static inline void tcf_exts_init(struct tcf_exts *exts, int action, int police)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	exts->type = 0;
-	INIT_LIST_HEAD(&exts->actions);
+	exts->nr_actions = 0;
+	exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *),
+				GFP_KERNEL);
+	WARN_ON(!exts->actions); /* TODO: propagate the error to callers */
 #endif
 	exts->action = action;
 	exts->police = police;
@@ -89,7 +93,7 @@ static inline int
 tcf_exts_is_predicative(struct tcf_exts *exts)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	return !list_empty(&exts->actions);
+	return exts->nr_actions;
 #else
 	return 0;
 #endif
@@ -108,6 +112,20 @@ tcf_exts_is_available(struct tcf_exts *exts)
 	return tcf_exts_is_predicative(exts);
 }

+static inline void tcf_exts_to_list(const struct tcf_exts *exts,
+				    struct list_head *actions)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	int i;
+
+	for (i = 0; i < exts->nr_actions; i++) {
+		struct tc_action *a = exts->actions[i];
+
+		list_add(&a->list, actions);
+	}
+#endif
+}
+
 /**
  * tcf_exts_exec - execute tc filter extensions
  * @skb: socket buffer
@@ -124,27 +142,21 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts,
 	      struct tcf_result *res)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	if (!list_empty(&exts->actions))
-		return tcf_action_exec(skb, &exts->actions, res);
+	if (exts->nr_actions)
+		return tcf_action_exec(skb, exts->actions, exts->nr_actions,
+				       res);
 #endif
 	return 0;
 }

 #ifdef CONFIG_NET_CLS_ACT

-#define tc_no_actions(_exts) \
-	(list_empty(&(_exts)->actions))
-
-#define tc_for_each_action(_a, _exts) \
-	list_for_each_entry(_a, &(_exts)->actions, list)
-
-#define tc_single_action(_exts) \
-	(list_is_singular(&(_exts)->actions))
+#define tc_no_actions(_exts)  ((_exts)->nr_actions == 0)
+#define tc_single_action(_exts) ((_exts)->nr_actions == 1)

 #else /* CONFIG_NET_CLS_ACT */

 #define tc_no_actions(_exts) true
-#define tc_for_each_action(_a, _exts) while ((void)(_a), 0)
 #define tc_single_action(_exts) false

 #endif /* CONFIG_NET_CLS_ACT */

-- cgit
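Callers that still need the list-based action API can bridge the two representations with the new tcf_exts_to_list() helper. A sketch of the intended usage follows; example_dump_exts() is invented for illustration and assumes the list-based tcf_action_dump() declared in act_api.h:

	/* Sketch only: re-chain the refcounted action pointers onto a
	 * temporary on-stack list so the unchanged list-based API can
	 * walk them.
	 */
	static int example_dump_exts(struct sk_buff *skb, struct tcf_exts *exts)
	{
		LIST_HEAD(actions);	/* empty on-stack list head */

		tcf_exts_to_list(exts, &actions);
		return tcf_action_dump(skb, &actions, 0, 0);
	}

Note the design trade-off the commit message alludes to: iterating the array directly at every call site would need a loop variable per site, so a one-shot conversion to a list keeps the existing list-based entry points untouched.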
From 112dc0c8069e5554e0ad29c58228f1e6ca49e13d Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Thu, 11 Aug 2016 11:50:22 +0200
Subject: locking/barriers: Suppress sparse warnings in lockless_dereference()

After Peter's commit:

  331b6d8c7afc ("locking/barriers: Validate lockless_dereference() is used on a pointer type")

... we get a lot of sparse warnings (one for every rcu_dereference, and
more) since the expression here is assigning to the wrong address space.

Instead of validating that 'p' is a pointer this way, make it fail
compilation when it's not by using sizeof(*(p)). This will not cause
any sparse warnings (tested, likely since the address space is
irrelevant for sizeof), and will fail compilation when 'p' isn't a
pointer type.

Tested-by: Paul E. McKenney
Signed-off-by: Johannes Berg
Signed-off-by: Peter Zijlstra (Intel)
Cc: Andrew Morton
Cc: Chris Wilson
Cc: Daniel Vetter
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Fixes: 331b6d8c7afc ("locking/barriers: Validate lockless_dereference() is used on a pointer type")
Link: http://lkml.kernel.org/r/1470909022-687-2-git-send-email-johannes@sipsolutions.net
Signed-off-by: Ingo Molnar
---
 include/linux/compiler.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 1bb954842725..436aa4e42221 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -527,13 +527,13 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * object's lifetime is managed by something other than RCU. That
  * "something other" might be reference counting or simple immortality.
  *
- * The seemingly unused void * variable is to validate @p is indeed a pointer
- * type. All pointer types silently cast to void *.
+ * The seemingly unused size_t variable is to validate @p is indeed a pointer
+ * type by making sure it can be dereferenced.
  */
 #define lockless_dereference(p) \
 ({ \
 	typeof(p) _________p1 = READ_ONCE(p); \
-	__maybe_unused const void * const _________p2 = _________p1; \
+	size_t __maybe_unused __size_of_ptr = sizeof(*(p)); \
 	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
 	(_________p1); \
 })

-- cgit
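The sizeof(*(p)) trick stands on its own outside the kernel. Below is a minimal standalone model of just the type validation -- READ_ONCE() and the dependency barrier are deliberately omitted, and the macro name is invented -- showing how it rejects non-pointer arguments at compile time (it uses GNU C statement expressions, as the kernel macro does):

	#include <stddef.h>
	#include <stdio.h>

	/* sizeof(*(p)) is ill-formed unless 'p' can be dereferenced,
	 * i.e. is a pointer (or an array), so non-pointers fail to build.
	 */
	#define deref_validated(p) \
	({ \
		__typeof__(p) _p1 = (p); \
		size_t _sz __attribute__((unused)) = sizeof(*(p)); \
		_p1; \
	})

	int main(void)
	{
		int x = 5;
		int *px = &x;

		printf("%d\n", *deref_validated(px));	/* OK: px is a pointer */
		/* deref_validated(x); -- fails to build:
		 * "invalid type argument of unary '*' (have 'int')"
		 */
		return 0;
	}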